mirror of https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00

use urlcanon library for canonicalization, surtification, scope match rules

This commit is contained in:
parent 479f0f7e09
commit 12fb9eaa15
@@ -9,7 +9,7 @@ install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
 - pip install $TRAVIS_BUILD_DIR pytest
 script:
-- DISPLAY=:1 py.test -v -s tests
+- DISPLAY=:1 py.test -v tests
 after_failure:
 - sudo cat /var/log/upstart/warcprox.log
 - sudo cat /var/log/upstart/brozzler-worker.log
@@ -50,31 +50,18 @@ class ReachedLimit(Exception):
     def __str__(self):
         return self.__repr__()
 
-def fixup(url, hash_strip=False):
-    '''
-    Does rudimentary canonicalization, such as converting IDN to punycode.
-    '''
-    import surt
-    hurl = surt.handyurl.parse(url)
-    if hash_strip:
-        hurl.hash = None
-    # handyurl.parse() already lowercases the scheme via urlsplit
-    if hurl.host:
-        hurl.host = hurl.host.encode('idna').decode('ascii').lower()
-    return hurl.getURLString()
-
 # monkey-patch log level TRACE
 TRACE = 5
-import logging as _logging
+import logging
 def _logging_trace(msg, *args, **kwargs):
-    _logging.root.trace(msg, *args, **kwargs)
+    logging.root.trace(msg, *args, **kwargs)
 def _logger_trace(self, msg, *args, **kwargs):
     if self.isEnabledFor(TRACE):
         self._log(TRACE, msg, args, **kwargs)
-_logging.trace = _logging_trace
-_logging.Logger.trace = _logger_trace
-_logging._levelToName[TRACE] = 'TRACE'
-_logging._nameToLevel['TRACE'] = TRACE
+logging.trace = _logging_trace
+logging.Logger.trace = _logger_trace
+logging._levelToName[TRACE] = 'TRACE'
+logging._nameToLevel['TRACE'] = TRACE
 
 _behaviors = None
 def behaviors():
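
Note: this hunk drops brozzler.fixup() in favor of urlcanon (see later hunks) and renames the logging alias. For reference, a minimal sketch (not part of this commit) of using the monkey-patched TRACE level; the logger name is an assumption:

    import logging
    import brozzler  # importing brozzler installs the TRACE monkey-patch

    logging.basicConfig(level=brozzler.TRACE)
    logging.getLogger('example').trace(
            'logs at level TRACE (5), one notch below DEBUG (10)')
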
@@ -158,6 +145,14 @@ def jinja2_environment():
         _jinja2_env.filters['json'] = json.dumps
     return _jinja2_env
 
+import urlcanon
+def _remove_query(url):
+    url.question_mark = b''
+    url.query = b''
+# XXX chop off path after last slash??
+site_surt_canon = urlcanon.Canonicalizer(
+        urlcanon.semantic.steps + [_remove_query])
+
 from brozzler.site import Page, Site
 from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
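
Note: site_surt_canon composes urlcanon's semantic steps with the custom _remove_query step above, so a seed's query string never narrows the site scope. A minimal sketch of the effect (example URL and SURT are assumptions, not output from this commit):

    import brozzler

    url = brozzler.site_surt_canon('http://example.com/page?session=123')
    # the _remove_query step ran last, so the query is gone before
    # surt-ification, e.g. url.surt() == b'http://(com,example,)/page'
    assert url.query == b''
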
@@ -166,3 +161,6 @@ from brozzler.browser import Browser, BrowserPool, BrowsingException
 from brozzler.job import new_job, new_site, Job
 from brozzler.cli import suggest_default_chrome_exe
 
+__all__ = ['Page', 'Site', 'BrozzlerWorker', 'is_permitted_by_robots',
+        'RethinkDbFrontier', 'Browser', 'BrowserPool', 'BrowsingException',
+        'new_job', 'new_site', 'Job']
@@ -29,7 +29,6 @@ from requests.structures import CaseInsensitiveDict
 import datetime
 import base64
 from brozzler.chrome import Chrome
-import surt
 import socket
 
 class BrowsingException(Exception):
@@ -541,7 +541,7 @@ def brozzler_list_captures():
     Handy utility for looking up entries in the rethinkdb "captures" table by
     url or sha1.
     '''
-    import surt
+    import urlcanon
 
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(sys.argv[0]),
@@ -579,9 +579,7 @@ def brozzler_list_captures():
         logging.debug('querying rethinkdb: %s', reql)
         results = reql.run()
     else:
-        key = surt.surt(
-                args.url_or_sha1, trailing_comma=True, host_massage=False,
-                with_scheme=True)
+        key = urlcanon.semantic(args.url_or_sha1).surt().decode('ascii')
         abbr_start_key = key[:150]
         if args.prefix:
             # surt is necessarily ascii and \x7f is the last ascii character
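
Note: urlcanon's ParsedUrl.surt() returns bytes and, judging by the options the old call passed, its defaults subsume surt.surt(url, trailing_comma=True, host_massage=False, with_scheme=True). A minimal sketch of the replacement (example URL and SURT are assumptions):

    import urlcanon

    key = urlcanon.semantic('http://example.com/a?b=c').surt().decode('ascii')
    # e.g. 'http://(com,example,)/a?b=c' -- scheme and trailing comma included
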
@@ -23,6 +23,7 @@ import time
 import datetime
 import rethinkdb as r
 import doublethink
+import urlcanon
 
 class UnexpectedDbResult(Exception):
     pass
@@ -261,18 +262,21 @@ class RethinkDbFrontier:
 
     def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
         if site.remember_outlinks:
-            parent_page.outlinks = {"accepted":[],"blocked":[],"rejected":[]}
+            decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
         counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
         for url in outlinks or []:
-            u = brozzler.site.Url(url)
-            if site.is_in_scope(u, parent_page=parent_page):
+            url_for_scoping = urlcanon.semantic(url)
+            url_for_crawling = urlcanon.whatwg(url)
+            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
                 if brozzler.is_permitted_by_robots(site, url):
-                    if not u.surt.startswith(site.scope["surt"]):
+                    if not url_for_scoping.surt().startswith(
+                            site.scope["surt"].encode("utf-8")):
                         hops_off_surt = parent_page.hops_off_surt + 1
                     else:
                         hops_off_surt = 0
                     new_child_page = brozzler.Page(self.rr, {
-                        'url': url, 'site_id': site.id, 'job_id': site.job_id,
+                        'url': str(url_for_crawling),
+                        'site_id': site.id, 'job_id': site.job_id,
                         'hops_from_seed': parent_page.hops_from_seed+1,
                         'via_page_id': parent_page.id,
                         'hops_off_surt': hops_off_surt})
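
Note: the key idea in this hunk is the split between two canonical forms: urlcanon.semantic is the aggressive canonicalization used only for scope decisions, while urlcanon.whatwg approximates what a browser would request, and its string form is what gets stored on the new Page. A minimal sketch (example URL and SURT are assumptions):

    import urlcanon

    raw = 'HTTP://EXAMPLE.com/a/../b?x=y'
    url_for_scoping = urlcanon.semantic(raw)   # for surt/scope checks
    url_for_crawling = urlcanon.whatwg(raw)    # for the page record

    in_scope = url_for_scoping.surt().startswith(b'http://(com,example,)/')
    page_url = str(url_for_crawling)           # e.g. 'http://example.com/b?x=y'
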
@@ -286,17 +290,20 @@ class RethinkDbFrontier:
                     new_child_page.save()
                     counts["added"] += 1
                     if site.remember_outlinks:
-                        parent_page.outlinks["accepted"].append(url)
+                        decisions["accepted"].add(str(url_for_crawling))
                 else:
                     counts["blocked"] += 1
                     if site.remember_outlinks:
-                        parent_page.outlinks["blocked"].append(url)
+                        decisions["blocked"].add(str(url_for_crawling))
             else:
                 counts["rejected"] += 1
                 if site.remember_outlinks:
-                    parent_page.outlinks["rejected"].append(url)
+                    decisions["rejected"].add(str(url_for_crawling))
 
         if site.remember_outlinks:
+            parent_page.outlinks = {}
+            for k in decisions:
+                parent_page.outlinks[k] = list(decisions[k])
             parent_page.save()
 
         self.logger.info(
@@ -36,7 +36,7 @@ except ImportError as e:
     sys.exit(1)
 import doublethink
 import rethinkdb as r
-import surt
+import urlcanon
 import json
 import brozzler
 import argparse
@@ -116,9 +116,7 @@ class TheGoodUrlCanonicalizer(object):
 
     def __call__(self, url):
         try:
-            key = surt.surt(
-                    url, trailing_comma=True, host_massage=False,
-                    with_scheme=True)
+            key = urlcanon.semantic(url).surt().decode('ascii')
             # logging.debug('%s -> %s', url, key)
             return key
         except Exception as e:
brozzler/site.py (209 lines changed)
@@ -16,7 +16,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 
-import surt
+import urlcanon
 import json
 import logging
 import brozzler
@@ -25,69 +25,10 @@ import time
 import doublethink
 import datetime
 import re
-import ipaddress
 
 _EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
         tzinfo=doublethink.UTC)
 
-class Url:
-    def __init__(self, url):
-        self.url = url
-        self._surt = None
-        self._host = None
-
-    @property
-    def surt(self):
-        if not self._surt:
-            try:
-                hurl = surt.handyurl.parse(self.url)
-                surt.GoogleURLCanonicalizer.canonicalize(hurl)
-                hurl.query = None
-                hurl.hash = None
-                # XXX chop off path after last slash??
-                self._surt = hurl.getURLString(surt=True, trailing_comma=True)
-            except Exception as e:
-                logging.warn('problem surting %s - %s', repr(self.url), e)
-        return self._surt
-
-    @property
-    def host(self):
-        if not self._host:
-            self._host = surt.handyurl.parse(self.url).host
-        return self._host
-
-    def matches_ip_or_domain(self, ip_or_domain):
-        """
-        Returns true if
-        - ip_or_domain is an ip address and self.host is the same ip address
-        - ip_or_domain is a domain and self.host is the same domain
-        - ip_or_domain is a domain and self.host is a subdomain of it
-        """
-        if not self.host:
-            return False
-
-        if ip_or_domain == self.host:
-            return True
-
-        # if either ip_or_domain or self.host are ip addresses, and they're not
-        # identical (previous check), not a match
-        try:
-            ipaddress.ip_address(ip_or_domain)
-            return False
-        except:
-            pass
-        try:
-            ipaddress.ip_address(self.host)
-            return False
-        except:
-            pass
-
-        # if we get here, we're looking at two hostnames
-        domain_parts = ip_or_domain.encode("idna").decode("ascii").lower().split(".")
-        host_parts = self.host.encode("idna").decode("ascii").lower().split(".")
-
-        return host_parts[-len(domain_parts):] == domain_parts
-
 class Site(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = 'sites'
@@ -105,8 +46,9 @@ class Site(doublethink.Document):
             self.last_claimed = _EPOCH_UTC
         if not "scope" in self:
             self.scope = {}
-        if not "surt" in self.scope:
-            self.scope["surt"] = Url(self.seed).surt
+        if not "surt" in self.scope and self.seed:
+            self.scope["surt"] = brozzler.site_surt_canon(
+                    self.seed).surt().decode('ascii')
 
         if not "starts_and_stops" in self:
             if self.get("start_time"):   # backward compatibility
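
Note: a site's default scope surt now comes from brozzler.site_surt_canon (defined earlier in this diff), and the new `and self.seed` guard lets a Site be constructed without a seed. A minimal sketch (seed URL and resulting surt are assumptions):

    import brozzler

    site = brozzler.Site(None, {'seed': 'http://example.com/foo/?bar=baz'})
    # the query string is dropped by site_surt_canon, so e.g.
    # site.scope['surt'] == 'http://(com,example,)/foo/'
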
@@ -135,7 +77,7 @@ class Site(doublethink.Document):
         return dt
 
     def note_seed_redirect(self, url):
-        new_scope_surt = Url(url).surt
+        new_scope_surt = brozzler.site_surt_canon(url).surt().decode("ascii")
         if not new_scope_surt.startswith(self.scope["surt"]):
             self.logger.info("changing site scope surt from {} to {}".format(
                 self.scope["surt"], new_scope_surt))
@@ -149,149 +91,50 @@ class Site(doublethink.Document):
         return hdrs
 
     def is_in_scope(self, url, parent_page=None):
-        if not isinstance(url, Url):
-            u = Url(url)
-        else:
-            u = url
+        if not isinstance(url, urlcanon.ParsedUrl):
+            url = urlcanon.semantic(url)
+        if parent_page:
+            parent_url = urlcanon.semantic(parent_page.url)
 
         might_accept = False
-        if not u.surt:
-            return False
-        elif not u.surt.startswith("http://") and not u.surt.startswith("https://"):
+        if not url.scheme in (b'http', b'https'):
             # XXX doesn't belong here maybe (where? worker ignores unknown
             # schemes?)
             return False
         elif (parent_page and "max_hops" in self.scope
                 and parent_page.hops_from_seed >= self.scope["max_hops"]):
             pass
-        elif u.surt.startswith(self.scope["surt"]):
+        elif url.surt().startswith(self.scope["surt"].encode("utf-8")):
             might_accept = True
         elif parent_page and parent_page.hops_off_surt < self.scope.get(
                 "max_hops_off_surt", 0):
             might_accept = True
         elif "accepts" in self.scope:
-            for rule in self.scope["accepts"]:
-                if self._scope_rule_applies(rule, u, parent_page):
+            for accept_rule in self.scope["accepts"]:
+                rule = urlcanon.MatchRule(**accept_rule)
+                if rule.applies(url, parent_url):
                     might_accept = True
                     break
 
         if might_accept:
             if "blocks" in self.scope:
-                for rule in self.scope["blocks"]:
-                    if self._scope_rule_applies(rule, u, parent_page):
+                for block_rule in self.scope["blocks"]:
+                    rule = urlcanon.MatchRule(**block_rule)
+                    if rule.applies(url, parent_url):
                         return False
             return True
         else:
             return False
 
-    def _normalize_rule(self, rule):
-        """
-        Normalizes a scope rule.
-
-        A scope rule is considered deprecated if it contains a `url_match` and
-        `value`. This method converts such scope rules to the preferred style
-        and returns the new rule. If `rule` is not a deprecated-style rule,
-        returns it unchanged.
-        """
-        if "url_match" in rule and "value" in rule:
-            new_rule = dict(rule)
-            url_match = new_rule.pop("url_match")
-            if url_match == "REGEX_MATCH":
-                new_rule["regex"] = new_rule.pop("value")
-            elif url_match == "SURT_MATCH":
-                new_rule["surt"] = new_rule.pop("value")
-            elif url_match == "STRING_MATCH":
-                new_rule["substring"] = new_rule.pop("value")
-            else:
-                raise Exception("invalid scope rule")
-            return new_rule
-        else:
-            return rule
-
-    def _scope_rule_applies(self, rule, url, parent_page=None):
-        """
-        Examples of valid rules expressed as yaml.
-
-        - domain: bad.domain.com
-
-        # preferred:
-        - domain: monkey.org
-          substring: bar
-
-        # deprecated version of the same:
-        - domain: monkey.org
-          url_match: STRING_MATCH
-          value: bar
-
-        # preferred:
-        - surt: http://(com,woop,)/fuh/
-
-        # deprecated version of the same:
-        - url_match: SURT_MATCH
-          value: http://(com,woop,)/fuh/
-
-        # preferred:
-        - regex: ^https?://(www.)?youtube.com/watch?.*$
-          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
-
-        # deprecated version of the same:
-        - url_match: REGEX_MATCH
-          value: ^https?://(www.)?youtube.com/watch?.*$
-          parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
-        """
-        if not isinstance(url, Url):
-            u = Url(url)
-        else:
-            u = url
-
-        try:
-            rewl = self._normalize_rule(rule)
-        except Exception as e:
-            self.logger.error(
-                    "problem normalizing scope rule %s - %s", rule, e)
-            return False
-
-        invalid_keys = rewl.keys() - {
-                "domain", "surt", "substring", "regex", "parent_url_regex"}
-        if invalid_keys:
-            self.logger.error(
-                    "invalid keys %s in scope rule %s", invalid_keys, rule)
-            return False
-
-        if "domain" in rewl and not u.matches_ip_or_domain(rewl["domain"]):
-            return False
-        if "surt" in rewl and not u.surt.startswith(rewl["surt"]):
-            return False
-        if "substring" in rewl and not u.url.find(rewl["substring"]) >= 0:
-            return False
-        if "regex" in rewl:
-            try:
-                if not re.fullmatch(rewl["regex"], u.url):
-                    return False
-            except Exception as e:
-                self.logger.error(
-                        "caught exception matching against regex %s - %s",
-                        rewl["regex"], e)
-                return False
-        if "parent_url_regex" in rewl:
-            if not parent_page:
-                return False
-            pu = Url(parent_page.url)
-            try:
-                if not re.fullmatch(rule["parent_url_regex"], pu.url):
-                    return False
-            except Exception as e:
-                self.logger.error(
-                        "caught exception matching against regex %s - %s",
-                        rule["parent_url_regex"], e)
-                return False
-
-        return True
-
 class Page(doublethink.Document):
     logger = logging.getLogger(__module__ + "." + __qualname__)
     table = "pages"
 
+    @staticmethod
+    def compute_id(site_id, url):
+        digest_this = "site_id:%s,url:%s" % (site_id, url)
+        return hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
+
     def populate_defaults(self):
         if not "hops_from_seed" in self:
             self.hops_from_seed = 0
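
Note: scope rules are now handed to urlcanon.MatchRule instead of the deleted _normalize_rule/_scope_rule_applies pair. The rule keys are the same (domain, surt, substring, regex, parent_url_regex), and the unchanged url_match/value yaml in the tests hunk below suggests MatchRule also accepts the deprecated style. A minimal sketch (rule and URL are assumptions):

    import urlcanon

    rule = urlcanon.MatchRule(domain='monkey.org', substring='bar')
    url = urlcanon.semantic('http://monkey.org/abc/bar/baz')
    # second argument is the parent url, only consulted for parent_url_regex
    assert rule.applies(url, None)
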
@@ -306,8 +149,7 @@ class Page(doublethink.Document):
         if not "priority" in self:
             self.priority = self._calc_priority()
         if not "id" in self:
-            digest_this = "site_id:%s,url:%s" % (self.site_id, self.url)
-            self.id = hashlib.sha1(digest_this.encode("utf-8")).hexdigest()
+            self.id = self.compute_id(self.site_id, self.url)
 
     def __str__(self):
         return 'Page({"id":"%s","url":"%s",...})' % (self.id, self.url)
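
Note: moving the digest into the Page.compute_id staticmethod means callers can derive a page's primary key without instantiating a Page. A minimal sketch (the site id and URL are hypothetical):

    import brozzler

    page_id = brozzler.Page.compute_id('some-site-id', 'http://example.com/')
    # sha1 hex digest of "site_id:%s,url:%s", stable for a given site and url
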
@@ -327,7 +169,6 @@ class Page(doublethink.Document):
         if not self.url:
             return None
         if self._canon_hurl is None:
-            self._canon_hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
-        return self._canon_hurl.geturl()
+            self._canon_hurl = urlcanon.semantic(self.url)
+        return str(self._canon_hurl)
 
@@ -33,6 +33,7 @@ import collections
 import requests
 import doublethink
 import tempfile
+import urlcanon
 
 class ExtraHeaderAdder(urllib.request.BaseHandler):
     def __init__(self, extra_headers):
@@ -208,7 +209,7 @@ class BrozzlerWorker:
                     "with youtube-dl json for %s", page)
             self._warcprox_write_record(
                     warcprox_address=self._proxy(site),
-                    url="youtube-dl:%s" % brozzler.fixup(page.url),
+                    url="youtube-dl:%s" % str(urlcanon.semantic(page.url)),
                     warc_type="metadata",
                     content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                     payload=info_json.encode("utf-8"),
@@ -245,7 +246,7 @@ class BrozzlerWorker:
         def _on_screenshot(screenshot_png):
             if on_screenshot:
                 on_screenshot(screenshot_png)
-            elif self._proxy(site) and self._enable_warcprox_features(site):
+            if self._proxy(site) and self._enable_warcprox_features(site):
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to %s with "
                         "screenshot for %s", self._proxy(site), page)
@@ -253,13 +254,13 @@ class BrozzlerWorker:
                         screenshot_png)
                 self._warcprox_write_record(
                         warcprox_address=self._proxy(site),
-                        url="screenshot:%s" % brozzler.fixup(page.url, True),
+                        url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                         payload=screenshot_jpeg,
                         extra_headers=site.extra_headers())
                 self._warcprox_write_record(
                         warcprox_address=self._proxy(site),
-                        url="thumbnail:%s" % brozzler.fixup(page.url, True),
+                        url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                         warc_type="resource", content_type="image/jpeg",
                         payload=thumbnail_jpeg,
                         extra_headers=site.extra_headers())
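
Note: with brozzler.fixup() gone, all three WARCPROX_WRITE_RECORD urls (youtube-dl:, screenshot:, thumbnail:) are built from the same urlcanon form. A minimal sketch (the page URL is an assumption):

    import urlcanon

    page_url = 'HTTP://EXAMPLE.com/./page'
    record_url = 'screenshot:%s' % str(urlcanon.semantic(page_url))
    # e.g. 'screenshot:http://example.com/page'
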
setup.py (11 lines changed)
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev202',
+        version='1.1b9.dev203',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -68,7 +68,7 @@ setuptools.setup(
             'requests',
             'websocket-client!=0.39.0',
             'pillow==3.3.0',
-            'surt>=0.3.0',
+            'urlcanon>=0.1.dev16',
             'doublethink>=0.2.0.dev71',
             'rethinkdb>=2.3,<2.4',
             'cerberus==1.0.1',
@@ -76,7 +76,12 @@ setuptools.setup(
         ],
         extras_require={
             'dashboard': ['flask>=0.11', 'gunicorn'],
-            'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'],
+            'easy': [
+                'warcprox>=2.1b1.dev57',
+                'pywb',
+                'flask>=0.11',
+                'gunicorn'
+            ],
         },
         zip_safe=False,
         classifiers=[
@@ -75,7 +75,6 @@ blocks:
 - domain: twitter.com
   url_match: REGEX_MATCH
   value: ^.*lang=(?!en).*$
-- bad_thing: bad rule should be ignored
 ''')
 
     site = brozzler.Site(None, {
@@ -16,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
         echo
 
 vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
-vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v -s /brozzler/tests $@"
+vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@"