mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-14 09:15:54 -04:00
Merge branch 'master' into qa
* master:
  catch exception from rethinkdb when unregistering from the service registry at shutdown
  correctly handle site with no pages (which means the seed was blocked by robots.txt) in frontier.seed_page
This commit is contained in:
commit
7dd841cae2
3 changed files with 9 additions and 2 deletions
|
@@ -313,6 +313,8 @@ class RethinkDbFrontier:
|
||||||
if len(pages) > 1:
|
if len(pages) > 1:
|
||||||
self.logger.warn(
|
self.logger.warn(
|
||||||
"more than one seed page for site_id %s ?", site_id)
|
"more than one seed page for site_id %s ?", site_id)
|
||||||
|
if len(pages) < 1:
|
||||||
|
return None
|
||||||
return brozzler.Page(**pages[0])
|
return brozzler.Page(**pages[0])
|
||||||
|
|
||||||
def site_pages(self, site_id, unbrozzled_only=False):
|
def site_pages(self, site_id, unbrozzled_only=False):
|
||||||
|
|
|
@@ -360,7 +360,12 @@ class BrozzlerWorker:
|
||||||
self.logger.critical("thread exiting due to unexpected exception", exc_info=True)
|
self.logger.critical("thread exiting due to unexpected exception", exc_info=True)
|
||||||
finally:
|
finally:
|
||||||
if self._service_registry and hasattr(self, "status_info"):
|
if self._service_registry and hasattr(self, "status_info"):
|
||||||
self._service_registry.unregister(self.status_info["id"])
|
try:
|
||||||
|
self._service_registry.unregister(self.status_info["id"])
|
||||||
|
except:
|
||||||
|
self.logger.error(
|
||||||
|
"failed to unregister from service registry",
|
||||||
|
exc_info=True)
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
th = threading.Thread(target=self.run, name="BrozzlerWorker")
|
th = threading.Thread(target=self.run, name="BrozzlerWorker")
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -20,7 +20,7 @@ import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.1.dev9',
|
version='1.1.dev10',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue