Merge branch 'master' into karl

* master:
  bump up heartbeat interval (see comment)
  back to dev version
  version 1.3 (messed up 1.2)
  setuptools wants README not readme
  back to dev version number
  version 1.2
  bump dev version after merge
  is test_time_limit failing because of timing?
  fix bug in test, add another one
  treat any error fetching robots.txt as "allow all"
  update instagram behavior
Noah Levitt 2018-07-23 23:28:42 +00:00
commit a7fb7bcc37
10 changed files with 134 additions and 54 deletions

View file

@@ -1,7 +1,7 @@
 .. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
     :target: https://travis-ci.org/internetarchive/brozzler
 
-.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
+.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
     :width: 60px
 
 |logo| brozzler

View file

@@ -27,7 +27,7 @@
 default_parameters:
   actions:
     - selector: a.coreSpriteDismissLarge
-    - selector: div._mck9w a
+    - selector: a>div[role='button']
      firstMatchOnly: true
     - selector: a.coreSpriteRightPaginationArrow
      repeatSameElement: true

View file

@@ -24,7 +24,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[dashboard]".\nSee readme.rst for more information.',
+            'brozzler[dashboard]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink

View file

@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import argparse

View file

@@ -31,7 +31,7 @@ try:
 except ImportError as e:
     logging.critical(
             '%s: %s\n\nYou might need to run "pip install '
-            'brozzler[easy]".\nSee readme.rst for more information.',
+            'brozzler[easy]".\nSee README.rst for more information.',
             type(e).__name__, e)
     sys.exit(1)
 import doublethink
@@ -270,7 +270,7 @@ Run pywb like so:
 
     $ PYWB_CONFIG_FILE=pywb.yml brozzler-wayback
 
-See readme.rst for more information.
+See README.rst for more information.
 '''
 
 # copied and pasted from cdxdomainspecific.py, only changes are commented as

View file

@@ -46,20 +46,21 @@ def _reppy_rules_getitem(self, agent):
         return self.agents.get('*')
 reppy.parser.Rules.__getitem__ = _reppy_rules_getitem
 
+class _SessionRaiseOn420(requests.Session):
+    timeout = 60
+    def get(self, url, *args, **kwargs):
+        res = super().get(url, timeout=self.timeout, *args, **kwargs)
+        if res.status_code == 420 and 'warcprox-meta' in res.headers:
+            raise brozzler.ReachedLimit(
+                    warcprox_meta=json.loads(res.headers['warcprox-meta']),
+                    http_payload=res.text)
+        else:
+            return res
+
 _robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
 def _robots_cache(site, proxy=None):
-    class SessionRaiseOn420(requests.Session):
-        def get(self, url, *args, **kwargs):
-            res = super().get(url, *args, **kwargs)
-            if res.status_code == 420 and 'warcprox-meta' in res.headers:
-                raise brozzler.ReachedLimit(
-                        warcprox_meta=json.loads(res.headers['warcprox-meta']),
-                        http_payload=res.text)
-            else:
-                return res
-
     if not site.id in _robots_caches:
-        req_sesh = SessionRaiseOn420()
+        req_sesh = _SessionRaiseOn420()
         req_sesh.verify = False  # ignore cert errors
         if proxy:
             proxie = "http://%s" % proxy
@@ -68,7 +69,8 @@ def _robots_cache(site, proxy=None):
         req_sesh.headers.update(site.extra_headers())
         if site.user_agent:
             req_sesh.headers['User-Agent'] = site.user_agent
-        _robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+        _robots_caches[site.id] = reppy.cache.RobotsCache(
+                session=req_sesh, disallow_forbidden=False)
     return _robots_caches[site.id]
@@ -76,13 +78,9 @@ def is_permitted_by_robots(site, url, proxy=None):
     '''
     Checks if `url` is permitted by robots.txt.
 
-    In case of problems fetching robots.txt, different things can happen.
-    Reppy (the robots.txt parsing library) handles some exceptions internally
-    and applies an appropriate policy. It bubbles up other exceptions. Of
-    these, there are two kinds that this function raises for the caller to
-    handle, described below. Yet other types of exceptions are caught, and the
-    fetch is retried up to 10 times. In this case, after the 10th failure, the
-    function returns `False` (i.e. forbidden by robots).
+    Treats any kind of error fetching robots.txt as "allow all". See
+    http://builds.archive.org/javadoc/heritrix-3.x-snapshot/org/archive/modules/net/CrawlServer.html#updateRobots(org.archive.modules.CrawlURI)
+    for some background on that policy.
 
     Returns:
         bool: `True` if `site.ignore_robots` is set, or if `url` is permitted
@@ -95,29 +93,21 @@ def is_permitted_by_robots(site, url, proxy=None):
     if site.ignore_robots:
         return True
 
-    tries_left = 10
-    while True:
-        try:
-            result = _robots_cache(site, proxy).allowed(
-                    url, site.user_agent or "brozzler")
-            return result
-        except Exception as e:
-            if isinstance(e, reppy.exceptions.ServerError) and isinstance(
-                    e.args[0], brozzler.ReachedLimit):
-                raise e.args[0]
-            elif hasattr(e, 'args') and isinstance(
-                    e.args[0], requests.exceptions.ProxyError):
-                # reppy has wrapped an exception that we want to bubble up
-                raise brozzler.ProxyError(e)
-            else:
-                if tries_left > 0:
-                    logging.warn(
-                            "caught exception fetching robots.txt (%r tries "
-                            "left) for %r: %r", tries_left, url, e)
-                    tries_left -= 1
-                else:
-                    logging.error(
-                            "caught exception fetching robots.txt (0 tries "
-                            "left) for %r: %r", url, e, exc_info=True)
-                    return False
+    try:
+        result = _robots_cache(site, proxy).allowed(
+                url, site.user_agent or "brozzler")
+        return result
+    except Exception as e:
+        if isinstance(e, reppy.exceptions.ServerError) and isinstance(
+                e.args[0], brozzler.ReachedLimit):
+            raise e.args[0]
+        elif hasattr(e, 'args') and isinstance(
+                e.args[0], requests.exceptions.ProxyError):
+            # reppy has wrapped an exception that we want to bubble up
+            raise brozzler.ProxyError(e)
+        else:
+            logging.warn(
+                    "returning true (permitted) after problem fetching "
+                    "robots.txt for %r: %r", url, e)
+            return True

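A hedged reading of what this hunk means for a caller of brozzler.is_permitted_by_robots (the seed URL below is illustrative, not from the diff): warcprox limits and proxy failures still surface as exceptions, while any other problem fetching robots.txt now comes back as "permitted" instead of going through the old retry-then-deny loop.

    import brozzler

    site = brozzler.Site(None, {'seed': 'http://example.com/'})
    try:
        permitted = brozzler.is_permitted_by_robots(site, 'http://example.com/')
    except brozzler.ReachedLimit:
        pass  # warcprox reported a limit via http 420; stop crawling this site
    except brozzler.ProxyError:
        pass  # the proxy itself failed; better to retry later than to skip
    else:
        # False now means robots.txt was actually fetched and disallows the url;
        # fetch problems (timeouts, dns failures, 5xx, etc.) yield True instead
        print('permitted:', permitted)
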
View file

@@ -113,7 +113,11 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
 class BrozzlerWorker:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
-    HEARTBEAT_INTERVAL = 20.0
+    # 3⅓ min heartbeat interval => 10 min ttl
+    # This is kind of a long time, because `frontier.claim_sites()`, which runs
+    # in the same thread as the heartbeats, can take a while on a busy brozzler
+    # cluster with slow rethinkdb.
+    HEARTBEAT_INTERVAL = 200.0
     SITE_SESSION_MINUTES = 15
 
     def __init__(
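
The arithmetic behind the new comment, as a hedged reading rather than code from this commit: the numbers only line up if the worker advertises a ttl of three heartbeat intervals to the service registry, which would also explain why the old 20-second interval was fragile, since its implied one-minute ttl could expire during a single slow claim_sites() call.

    # hedged sketch: the factor of 3 is inferred from
    # "3⅓ min heartbeat interval => 10 min ttl"
    HEARTBEAT_INTERVAL = 200.0        # seconds, i.e. 3 minutes 20 seconds
    ttl = 3 * HEARTBEAT_INTERVAL      # 600.0 seconds = 10 minutes
    old_ttl = 3 * 20.0                # the old value implied only a 60 second ttl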

View file

@@ -32,12 +32,12 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b13.dev291',
+        version='1.4.dev295',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
         author_email='nlevitt@archive.org',
-        long_description=open('readme.rst', mode='rb').read().decode('UTF-8'),
+        long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
         license='Apache License 2.0',
         packages=['brozzler', 'brozzler.dashboard'],
         package_data={

View file

@@ -769,7 +769,7 @@ def test_time_limit(httpd):
     rr = doublethink.Rethinker('localhost', db='brozzler')
     frontier = brozzler.RethinkDbFrontier(rr)
 
-    # create a new job with three sites that could be crawled forever
+    # create a new job with one seed that could be crawled forever
     job_conf = {'seeds': [{
         'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
         'time_limit': 20}]}
@@ -789,6 +789,10 @@ def test_time_limit(httpd):
     assert sites[0].status == 'FINISHED_TIME_LIMIT'
 
     # all sites finished so job should be finished too
+    start = time.time()
     job.refresh()
+    while not job.status == 'FINISHED' and time.time() - start < 10:
+        time.sleep(0.5)
+        job.refresh()
     assert job.status == 'FINISHED'

View file

@@ -32,6 +32,7 @@ import uuid
 import socket
 import time
 import sys
+import threading
 
 logging.basicConfig(
         stream=sys.stderr, level=logging.INFO, format=(
@@ -67,6 +68,87 @@ def test_robots(httpd):
     site = brozzler.Site(None, {'seed':url,'user_agent':'im/a bAdBOt/uh huh'})
     assert not brozzler.is_permitted_by_robots(site, url)
 
+def test_robots_http_statuses():
+    for status in (
+            200, 204, 400, 401, 402, 403, 404, 405,
+            500, 501, 502, 503, 504, 505):
+        class Handler(http.server.BaseHTTPRequestHandler):
+            def do_GET(self):
+                response = (('HTTP/1.1 %s Meaningless message\r\n'
+                             + 'Content-length: 0\r\n'
+                             + '\r\n') % status).encode('utf-8')
+                self.connection.sendall(response)
+                # self.send_response(status)
+                # self.end_headers()
+        httpd = http.server.HTTPServer(('localhost', 0), Handler)
+        httpd_thread = threading.Thread(
+                name='httpd', target=httpd.serve_forever)
+        httpd_thread.start()
+        try:
+            url = 'http://localhost:%s/' % httpd.server_port
+            site = brozzler.Site(None, {'seed': url})
+            assert brozzler.is_permitted_by_robots(site, url)
+        finally:
+            httpd.shutdown()
+            httpd.server_close()
+            httpd_thread.join()
+
+def test_robots_empty_response():
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            self.connection.shutdown(socket.SHUT_RDWR)
+            self.connection.close()
+
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_socket_timeout():
+    stop_hanging = threading.Event()
+    class Handler(http.server.BaseHTTPRequestHandler):
+        def do_GET(self):
+            stop_hanging.wait(60)
+            self.connection.sendall(
+                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')
+
+    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout
+    httpd = http.server.HTTPServer(('localhost', 0), Handler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+    try:
+        url = 'http://localhost:%s/' % httpd.server_port
+        site = brozzler.Site(None, {'seed': url})
+        brozzler.robots._SessionRaiseOn420.timeout = 2
+        assert brozzler.is_permitted_by_robots(site, url)
+    finally:
+        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
+        stop_hanging.set()
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+
+def test_robots_dns_failure():
+    # .invalid. is guaranteed nonexistent per rfc 6761
+    url = 'http://whatever.invalid./'
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
+def test_robots_connection_failure():
+    url = 'http://localhost:4/'  # nobody listens on port 4
+    site = brozzler.Site(None, {'seed': url})
+    assert brozzler.is_permitted_by_robots(site, url)
+
 def test_scoping():
     test_scope = yaml.load('''
 max_hops: 100