fix bug with seed redirects where scope change was applied too late to affect scoping of outlinks from the seed (with automated tests)

This commit is contained in:
Noah Levitt 2017-03-06 15:13:40 -08:00
parent 40bbbb3524
commit 242ff51ec7
5 changed files with 68 additions and 4 deletions

View File

@ -352,14 +352,15 @@ class BrozzlerWorker:
not brozzler.is_permitted_by_robots(site, page.url)):
logging.warn("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
outlinks = self.brozzle_page(browser, site, page)
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(
site, page, outlinks)
if browser.is_running():
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
self._frontier.completed_page(site, page)
page = None
except brozzler.ShutdownRequested:
self.logger.info("shutdown requested")

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev198',
version='1.1b9.dev199',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -0,0 +1,10 @@
<!--
  Test fixture: a minimal page whose only outlink is the relative link to
  page2.html.
  NOTE(review): presumably served as the redirect destination for the
  seed-redirect test; the file's path is not visible here, so confirm.
-->
<html>
<head>
<title>a page with a link</title>
</head>
<body>
<p>this is a page with a link</p>
<a href="page2.html">page2.html</a>
</body>
</html>

View File

@ -0,0 +1,9 @@
<!--
  Test fixture: a minimal page that contains no links at all.
  NOTE(review): presumably the page2.html that the linking fixture page
  points to; the file's path is not visible here, so confirm.
-->
<html>
<head>
<title>a dead end page</title>
</head>
<body>
<p>this page is a dead end</p>
</body>
</html>

View File

@ -39,11 +39,22 @@ def stop_service(service):
@pytest.fixture(scope='module')
def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler):
    """Static-file handler that also serves one hard-coded redirect.

    A GET for exactly ``/site5/redirect/`` is answered with a body-less
    ``303 See other`` whose ``Location`` is ``/site5/destination/``. Every
    other path is handled by ``SimpleHTTPRequestHandler`` unchanged.
    """

    def do_GET(self):
        """Send the 303 for the redirect path, else defer to the parent."""
        if self.path != '/site5/redirect/':
            super().do_GET()
            return

        self.send_response(303, 'See other')
        self.send_header('Connection', 'close')
        # Header values are text; the response carries no body, so there is
        # nothing to write after the headers.
        self.send_header('Content-Length', '0')
        self.send_header('Location', '/site5/destination/')
        self.end_headers()
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
httpd = http.server.HTTPServer(
('localhost', 0), http.server.SimpleHTTPRequestHandler)
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start()
@ -330,3 +341,36 @@ def test_login(httpd):
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
def test_seed_redirect(httpd):
    """Check that a redirecting seed re-scopes the site before outlinks.

    The seed ``/site5/redirect/`` is answered by the ``httpd`` fixture with
    a 303 to ``/site5/destination/``. After the crawl finishes, the test
    expects exactly two pages (the seed and ``page2.html`` under the
    destination) and a site scope surt that points at the destination
    rather than at the original seed path.
    """
    # Label captures with this test's own name (was mislabelled 'test_login').
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    # before crawling, the scope is derived from the seed url itself
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly; poll for up to 300 seconds
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port