diff --git a/brozzler/worker.py b/brozzler/worker.py
index 0ab56bf..f9c5233 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -352,14 +352,15 @@ class BrozzlerWorker:
                         not brozzler.is_permitted_by_robots(site, page.url)):
                     logging.warn("page %s is blocked by robots.txt", page.url)
                     page.blocked_by_robots = True
+                    self._frontier.completed_page(site, page)
                 else:
                     outlinks = self.brozzle_page(browser, site, page)
+                    self._frontier.completed_page(site, page)
                     self._frontier.scope_and_schedule_outlinks(
                             site, page, outlinks)
                     if browser.is_running():
                         site.cookie_db = browser.chrome.persist_and_read_cookie_db()
 
-                self._frontier.completed_page(site, page)
                 page = None
         except brozzler.ShutdownRequested:
             self.logger.info("shutdown requested")
diff --git a/setup.py b/setup.py
index 82d4f32..71408b6 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev198',
+        version='1.1b9.dev199',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/htdocs/site5/destination/index.html b/tests/htdocs/site5/destination/index.html
new file mode 100644
index 0000000..7b47d8f
--- /dev/null
+++ b/tests/htdocs/site5/destination/index.html
@@ -0,0 +1,10 @@
+<html>
+<head>
+<title>site5 destination</title>
+</head>
+<body>
+<h1>site5 destination</h1>
+this is a page with a link
+<a href="page2.html">page2.html</a>
+</body>
+</html>
diff --git a/tests/htdocs/site5/destination/page2.html b/tests/htdocs/site5/destination/page2.html
new file mode 100644
index 0000000..212a348
--- /dev/null
+++ b/tests/htdocs/site5/destination/page2.html
@@ -0,0 +1,9 @@
+<html>
+<head>
+<title>page2</title>
+</head>
+<body>
+<h1>page2</h1>
+this page is a dead end
+</body>
+</html>
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index 5953090..6a9ddd7 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -39,11 +39,22 @@ def stop_service(service):
 
 @pytest.fixture(scope='module')
 def httpd(request):
+    class RequestHandler(http.server.SimpleHTTPRequestHandler):
+        def do_GET(self):
+            if self.path == '/site5/redirect/':
+                self.send_response(303, 'See other')
+                self.send_header('Connection', 'close')
+                self.send_header('Content-Length', 0)
+                self.send_header('Location', '/site5/destination/')
+                self.end_headers()
+                self.wfile.write(b'')
+            else:
+                super().do_GET()
+
     # SimpleHTTPRequestHandler always uses CWD so we have to chdir
     os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
 
-    httpd = http.server.HTTPServer(
-            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
     httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
     httpd_thread.start()
 
@@ -330,3 +341,36 @@ def test_login(httpd):
     assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
     assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
 
+def test_seed_redirect(httpd):
+    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
+    rr = doublethink.Rethinker('localhost', db='brozzler')
+    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
+    site = brozzler.Site(rr, {
+        'seed': seed_url,
+        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
+        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}}})
+    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port
+
+    frontier = brozzler.RethinkDbFrontier(rr)
+    brozzler.new_site(frontier, site)
+    assert site.id
+
+    # the site should be brozzled fairly quickly
+    start = time.time()
+    while site.status != 'FINISHED' and time.time() - start < 300:
+        time.sleep(0.5)
+        site.refresh()
+    assert site.status == 'FINISHED'
+
+    # take a look at the pages table
+    pages = list(frontier.site_pages(site.id))
+    assert len(pages) == 2
+    pages.sort(key=lambda page: page.hops_from_seed)
+    assert pages[0].hops_from_seed == 0
+    assert pages[0].url == seed_url
+    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
+    assert pages[1].hops_from_seed == 1
+    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port
+
+    # check that scope has been updated properly
+    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port