mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
fix bug with seed redirects where scope change was applied too late to affect scoping of outlinks from the seed (with automated tests)
This commit is contained in:
parent
40bbbb3524
commit
242ff51ec7
@ -352,14 +352,15 @@ class BrozzlerWorker:
|
|||||||
not brozzler.is_permitted_by_robots(site, page.url)):
|
not brozzler.is_permitted_by_robots(site, page.url)):
|
||||||
logging.warn("page %s is blocked by robots.txt", page.url)
|
logging.warn("page %s is blocked by robots.txt", page.url)
|
||||||
page.blocked_by_robots = True
|
page.blocked_by_robots = True
|
||||||
|
self._frontier.completed_page(site, page)
|
||||||
else:
|
else:
|
||||||
outlinks = self.brozzle_page(browser, site, page)
|
outlinks = self.brozzle_page(browser, site, page)
|
||||||
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(
|
self._frontier.scope_and_schedule_outlinks(
|
||||||
site, page, outlinks)
|
site, page, outlinks)
|
||||||
if browser.is_running():
|
if browser.is_running():
|
||||||
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
|
site.cookie_db = browser.chrome.persist_and_read_cookie_db()
|
||||||
|
|
||||||
self._frontier.completed_page(site, page)
|
|
||||||
page = None
|
page = None
|
||||||
except brozzler.ShutdownRequested:
|
except brozzler.ShutdownRequested:
|
||||||
self.logger.info("shutdown requested")
|
self.logger.info("shutdown requested")
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev198',
|
version='1.1b9.dev199',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
10
tests/htdocs/site5/destination/index.html
Normal file
10
tests/htdocs/site5/destination/index.html
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>a page with a link</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>this is a page with a link</p>
|
||||||
|
<a href="page2.html">page2.html</a>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
9
tests/htdocs/site5/destination/page2.html
Normal file
9
tests/htdocs/site5/destination/page2.html
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>a dead end page</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<p>this page is a dead end</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
|
@ -39,11 +39,22 @@ def stop_service(service):
|
|||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
|
class RequestHandler(http.server.SimpleHTTPRequestHandler):
    """Static file handler with one hard-wired redirect for the tests.

    A GET for ``/site5/redirect/`` is answered with a bodiless
    ``303 See other`` pointing at ``/site5/destination/``. Every other
    path is served by ``SimpleHTTPRequestHandler`` as usual.
    """

    def do_GET(self):
        """Serve the redirect for the magic path, else defer to the parent."""
        if self.path == '/site5/redirect/':
            self.send_response(303, 'See other')
            self.send_header('Connection', 'close')
            # header values are strings; the response carries no body, so
            # nothing is written to wfile after the headers
            self.send_header('Content-Length', '0')
            self.send_header('Location', '/site5/destination/')
            self.end_headers()
        else:
            super().do_GET()
|
||||||
|
|
||||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
||||||
|
|
||||||
httpd = http.server.HTTPServer(
|
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
|
||||||
('localhost', 0), http.server.SimpleHTTPRequestHandler)
|
|
||||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||||
httpd_thread.start()
|
httpd_thread.start()
|
||||||
|
|
||||||
@ -330,3 +341,36 @@ def test_login(httpd):
|
|||||||
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
||||||
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
||||||
|
|
||||||
|
def test_seed_redirect(httpd):
    """Check that a redirecting seed rescopes the site before outlinks are scoped.

    The seed URL ``/site5/redirect/`` is answered by the ``httpd`` fixture
    with a 303 to ``/site5/destination/``, whose index page links to
    ``page2.html``. After the crawl finishes, the pages table should hold
    the seed (with its ``redirect_url`` recorded) plus ``page2.html`` one
    hop away, and the site's scope surt should point at the destination.
    """
    # label captures with this test's own name, not test_login's
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'proxy': 'localhost:8000', 'enable_warcprox_features': True,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}}})
    # before brozzling, the scope is derived from the seed url itself
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
|
||||||
|
Loading…
x
Reference in New Issue
Block a user