diff --git a/.travis.yml b/.travis.yml
index d8c6e86..8db8e2c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,13 +1,15 @@
 language: python
 python:
 - 3.4
+- 3.5
+- 3.6
 sudo: required
 dist: trusty
 before_install:
 - sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
-- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.1b1.dev71' pytest
+- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.1b1.dev87' pytest
 script:
 - DISPLAY=:1 py.test -v tests
 after_failure:
diff --git a/brozzler/browser.py b/brozzler/browser.py
index 4628e74..988f8b2 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -429,11 +429,18 @@ class Browser:
         self.websock_thread.on_response = on_response
         try:
             with brozzler.thread_accept_exceptions():
-                self.navigate_to_page(
-                        page_url, extra_headers=extra_headers,
-                        user_agent=user_agent, timeout=300)
+                self.configure_browser(
+                        extra_headers=extra_headers,
+                        user_agent=user_agent)
+                self.navigate_to_page(page_url, timeout=300)
             if password:
                 self.try_login(username, password, timeout=300)
+                # if login redirected us, return to page_url
+                if page_url != self.url().split('#')[0]:
+                    self.logger.debug(
+                            'login navigated away from %s; returning!',
+                            page_url)
+                    self.navigate_to_page(page_url, timeout=300)
             if on_screenshot:
                 jpeg_bytes = self.screenshot()
                 on_screenshot(jpeg_bytes)
@@ -479,8 +486,7 @@ class Browser:
         # run behavior again with short timeout?
         # retrieve outlinks again and append to list?
 
-    def navigate_to_page(
-            self, page_url, extra_headers=None, user_agent=None, timeout=300):
+    def configure_browser(self, extra_headers=None, user_agent=None):
         headers = extra_headers or {}
         headers['Accept-Encoding'] = 'identity'
         self.send_to_chrome(
@@ -492,7 +498,7 @@ class Browser:
                 method='Network.setUserAgentOverride',
                 params={'userAgent': user_agent})
 
-        # navigate to the page!
+    def navigate_to_page(self, page_url, timeout=300):
         self.logger.info('navigating to page %s', page_url)
         self.websock_thread.got_page_load_event = None
         self.send_to_chrome(method='Page.navigate', params={'url': page_url})
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 00ad46e..8adaab1 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -268,52 +268,95 @@ class RethinkDbFrontier:
                 {"start":doublethink.utcnow(), "stop":None})
         site.save()
 
-    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
-        decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
-        counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
+    def _scope_and_enforce_robots(self, site, parent_page, outlinks):
+        '''
+        Returns tuple (
+            set of in scope urls (uncanonicalized) accepted by robots policy,
+            set of in scope urls (canonicalized) blocked by robots policy,
+            set of out-of-scope urls (canonicalized)).
+        '''
+        in_scope = set()
+        blocked = set()
+        out_of_scope = set()
         for url in outlinks or []:
+            url_for_scoping = urlcanon.semantic(url)
+            url_for_crawling = urlcanon.whatwg(url)
+            urlcanon.canon.remove_fragment(url_for_crawling)
+            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
+                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
+                    in_scope.add(url)
+                else:
+                    blocked.add(str(url_for_crawling))
+            else:
+                out_of_scope.add(str(url_for_crawling))
+        return in_scope, blocked, out_of_scope
+
+    def _build_fresh_pages(self, site, parent_page, urls):
+        '''
+        Returns a dict of page_id => brozzler.Page.
+        '''
+        pages = {}
+        for url in urls:
             url_for_scoping = urlcanon.semantic(url)
             url_for_crawling = urlcanon.whatwg(url)
             hashtag = (url_for_crawling.hash_sign
                     + url_for_crawling.fragment).decode('utf-8')
             urlcanon.canon.remove_fragment(url_for_crawling)
-            if site.is_in_scope(url_for_scoping, parent_page=parent_page):
-                if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
-                    if not url_for_scoping.surt().startswith(
-                            site.scope["surt"].encode("utf-8")):
-                        hops_off_surt = parent_page.hops_off_surt + 1
-                    else:
-                        hops_off_surt = 0
-                    new_child_page = brozzler.Page(self.rr, {
-                        'url': str(url_for_crawling),
-                        'site_id': site.id, 'job_id': site.job_id,
-                        'hops_from_seed': parent_page.hops_from_seed+1,
-                        'via_page_id': parent_page.id,
-                        'hops_off_surt': hops_off_surt})
-                    existing_child_page = brozzler.Page.load(
-                            self.rr, new_child_page.id)
-                    if existing_child_page:
-                        existing_child_page.priority += new_child_page.priority
-                        if hashtag and existing_child_page.hashtags:
-                            hashtags = set(existing_child_page.hashtags)
-                            hashtags.add(hashtag)
-                            existing_child_page.hashtags = list(hashtags)
-                        elif hashtag:
-                            existing_child_page.hashtags = [hashtag]
-                        existing_child_page.save()
-                        counts["updated"] += 1
-                    else:
-                        if hashtag:
-                            new_child_page.hashtags = [hashtag,]
-                        new_child_page.save()
-                        counts["added"] += 1
-                    decisions["accepted"].add(str(url_for_crawling))
-                else:
-                    counts["blocked"] += 1
-                    decisions["blocked"].add(str(url_for_crawling))
+            if not url_for_scoping.surt().startswith(
+                    site.scope['surt'].encode('utf-8')):
+                hops_off_surt = parent_page.hops_off_surt + 1
             else:
-                counts["rejected"] += 1
-                decisions["rejected"].add(str(url_for_crawling))
+                hops_off_surt = 0
+            page = brozzler.Page(self.rr, {
+                'url': str(url_for_crawling),
+                'site_id': site.id,
+                'job_id': site.job_id,
+                'hops_from_seed': parent_page.hops_from_seed + 1,
+                'via_page_id': parent_page.id,
+                'hops_off_surt': hops_off_surt,
+                'hashtags': []})
+            if page.id in pages:
+                pages[page.id].priority += page.priority
+                page = pages[page.id]
+            else:
+                pages[page.id] = page
+            if hashtag:
+                page.hashtags = list(set(page.hashtags + [hashtag]))
+        return pages
+
+    def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
+        decisions = {'accepted':set(),'blocked':set(),'rejected':set()}
+        counts = {'added':0,'updated':0,'rejected':0,'blocked':0}
+
+        in_scope, blocked, out_of_scope = self._scope_and_enforce_robots(
+                site, parent_page, outlinks)
+        decisions['blocked'] = blocked
+        decisions['rejected'] = out_of_scope
+        counts['blocked'] += len(blocked)
+        counts['rejected'] += len(out_of_scope)
+
+        fresh_pages = self._build_fresh_pages(site, parent_page, in_scope)
+
+        # get existing pages from rethinkdb
+        results = self.rr.table('pages').get_all(*fresh_pages.keys()).run()
+        pages = {doc['id']: brozzler.Page(self.rr, doc) for doc in results}
+
+        # build list of pages to save, consisting of new pages, and existing
+        # pages updated with higher priority and new hashtags
+        for fresh_page in fresh_pages.values():
+            decisions['accepted'].add(fresh_page.url)
+            if fresh_page.id in pages:
+                page = pages[fresh_page.id]
+                page.hashtags = list(set((page.hashtags or [])
+                        + fresh_page.hashtags))
+                page.priority += fresh_page.priority
+                counts['updated'] += 1
+            else:
+                pages[fresh_page.id] = fresh_page
+                counts['added'] += 1
+
+        result = self.rr.table('pages').insert(
+                pages.values(), conflict='replace').run()
 
         parent_page.outlinks = {}
         for k in decisions:
@@ -321,10 +364,10 @@ class RethinkDbFrontier:
         parent_page.save()
 
         self.logger.info(
-                "%s new links added, %s existing links updated, %s links "
-                "rejected, %s links blocked by robots from %s",
-                counts["added"], counts["updated"], counts["rejected"],
-                counts["blocked"], parent_page)
+                '%s new links added, %s existing links updated, %s links '
+                'rejected, %s links blocked by robots from %s',
+                counts['added'], counts['updated'], counts['rejected'],
+                counts['blocked'], parent_page)
 
     def reached_limit(self, site, e):
         self.logger.info("reached_limit site=%s e=%s", site, e)
diff --git a/setup.py b/setup.py
index c45eaba..c09705e 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b11.dev250',
+        version='1.1b11.dev252',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -78,7 +78,7 @@ setuptools.setup(
         extras_require={
             'dashboard': ['flask>=0.11', 'gunicorn'],
             'easy': [
-                'warcprox>=2.1b1.dev86',
+                'warcprox>=2.1b1.dev87',
                 'pywb',
                 'flask>=0.11',
                 'gunicorn'
@@ -91,6 +91,7 @@ setuptools.setup(
             'License :: OSI Approved :: Apache Software License',
             'Programming Language :: Python :: 3.4',
             'Programming Language :: Python :: 3.5',
+            'Programming Language :: Python :: 3.6',
             'Topic :: Internet :: WWW/HTTP',
             'Topic :: System :: Archiving',
         ])
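
The browser.py change splits the old navigate_to_page() into configure_browser() (extra headers and user agent, sent to Chrome once) and navigate_to_page() (the Page.navigate call proper), so the login handler can re-navigate without re-sending configuration. It also adds a guard after try_login(): if logging in navigated the browser away, brozzler returns to page_url. The comparison strips the fragment from the current location, since fragments are client-side only and page_url arrives with its fragment already removed. A minimal sketch of that comparison, with moved_away() as a hypothetical stand-in for the inline check (Browser.url() supplies current_url in the real code):

def moved_away(page_url, current_url):
    # ignore the fragment: landing on page_url#section still counts as page_url
    return page_url != current_url.split('#')[0]

assert not moved_away('http://example.com/a', 'http://example.com/a#top')
assert moved_away('http://example.com/a', 'http://example.com/login')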
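
In frontier.py, each outlink is canonicalized two ways: urlcanon.semantic() yields the aggressive form used for SURT scope matching, while urlcanon.whatwg() yields the browser-like form used for the robots check and actually enqueued, with its fragment stripped (the fragment is captured separately as a page hashtag before removal). A short sketch of that double canonicalization, assuming the urlcanon package exactly as the diff uses it:

import urlcanon

url = 'HTTP://Example.COM:80/a/./b/../c?q=1#frag'
url_for_scoping = urlcanon.semantic(url)    # aggressive canon, for scope rules
url_for_crawling = urlcanon.whatwg(url)     # browser-style canon, for fetching
urlcanon.canon.remove_fragment(url_for_crawling)
print(url_for_scoping.surt())   # SURT byte string used for prefix matching
print(str(url_for_crawling))    # the url that would go into the pages table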
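
The heart of the frontier.py change is batching: instead of a brozzler.Page.load() plus save() round trip per outlink, there is now one get_all() read covering every candidate page id and one insert() with conflict='replace', which RethinkDB treats as an upsert. A minimal sketch of that pattern against the plain rethinkdb driver; the 'pages' table and the priority/hashtags merge mirror the diff, while upsert_pages() and the connection parameters are hypothetical:

import rethinkdb as r

def upsert_pages(conn, fresh_pages):
    # fresh_pages: dict of page id => page document (plain dicts here)
    if not fresh_pages:
        return
    # one read: fetch all pages that already exist, in a single query
    existing = {
            doc['id']: doc
            for doc in r.table('pages').get_all(*fresh_pages.keys()).run(conn)}
    # merge in memory: bump priority and union hashtags on collisions
    for page_id, fresh in fresh_pages.items():
        if page_id in existing:
            page = existing[page_id]
            page['priority'] += fresh['priority']
            page['hashtags'] = list(
                    set(page.get('hashtags') or []) | set(fresh['hashtags']))
        else:
            existing[page_id] = fresh
    # one write: conflict='replace' turns the insert into an upsert
    r.table('pages').insert(
            list(existing.values()), conflict='replace').run(conn)

# usage, with hypothetical connection parameters:
# conn = r.connect(host='localhost', port=28015, db='brozzler')
# upsert_pages(conn, {'abc123': {'id': 'abc123', 'url': 'http://example.com/',
#                                'priority': 1, 'hashtags': []}})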