mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-23 06:04:47 -04:00
Merge branch 'master' into qa
* master:
  - implement timeout to work around issue where sometimes we receive no result message after requesting scroll to top
  - avoid "AttributeError: 'ExtractorError' object has no attribute 'code'" checking for 430 (soft limit) from youtube-dl
  - set Browser._chrome_instance=None if _chrome_instance.start() throws exception, to avoid endless loop after one failure
  - fix case where rethinkdb page already has claimed=True
  - undo accidentally committed change to browser startup timeout, and remove now misleading comment about browser ports (see https://github.com/internetarchive/brozzler/pull/3)
  - fix bug preventing brozzler-new-site from working, add note about brozzler-new-site in readme
  - --trace level logging
This commit is contained in:
commit
d82feb14da
7 changed files with 60 additions and 20 deletions
|
@ -70,6 +70,14 @@ must be specified, everything else is optional.
|
||||||
scope:
|
scope:
|
||||||
surt: http://(org,example,
|
surt: http://(org,example,
|
||||||
|
|
||||||
|
Submit a Site to Crawl Without Configuring a Job
|
||||||
|
------------------------------------------------
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
brozzler-new-site --proxy=localhost:8000 --enable-warcprox-features \
|
||||||
|
--time-limit=600 http://example.com/
|
||||||
|
|
||||||
Brozzler Web Console
|
Brozzler Web Console
|
||||||
--------------------
|
--------------------
|
||||||
|
|
||||||
|
|
|
@ -64,6 +64,9 @@ class BaseDictable:
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
||||||
|
|
||||||
|
# logging level more fine-grained than logging.DEBUG==10
|
||||||
|
TRACE = 5
|
||||||
|
|
||||||
from brozzler.site import Page, Site
|
from brozzler.site import Page, Site
|
||||||
from brozzler.worker import BrozzlerWorker
|
from brozzler.worker import BrozzlerWorker
|
||||||
from brozzler.robots import is_permitted_by_robots
|
from brozzler.robots import is_permitted_by_robots
|
||||||
|
|
|
@ -55,8 +55,6 @@ class BrowserPool:
|
||||||
|
|
||||||
self._lock = threading.Lock()
|
self._lock = threading.Lock()
|
||||||
|
|
||||||
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
|
|
||||||
|
|
||||||
def acquire(self):
|
def acquire(self):
|
||||||
"""
|
"""
|
||||||
Returns browser from pool if available, raises NoBrowsersAvailable
|
Returns browser from pool if available, raises NoBrowsersAvailable
|
||||||
|
@ -133,13 +131,18 @@ class Browser:
|
||||||
# these can raise exceptions
|
# these can raise exceptions
|
||||||
self.chrome_port = self._find_available_port()
|
self.chrome_port = self._find_available_port()
|
||||||
self._work_dir = tempfile.TemporaryDirectory()
|
self._work_dir = tempfile.TemporaryDirectory()
|
||||||
self._chrome_instance = Chrome(port=self.chrome_port,
|
self._chrome_instance = Chrome(
|
||||||
executable=self.chrome_exe,
|
port=self.chrome_port, executable=self.chrome_exe,
|
||||||
user_home_dir=self._work_dir.name,
|
user_home_dir=self._work_dir.name,
|
||||||
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
|
user_data_dir=os.sep.join([
|
||||||
|
self._work_dir.name, "chrome-user-data"]),
|
||||||
ignore_cert_errors=self.ignore_cert_errors,
|
ignore_cert_errors=self.ignore_cert_errors,
|
||||||
proxy=proxy or self.proxy)
|
proxy=proxy or self.proxy)
|
||||||
self._websocket_url = self._chrome_instance.start()
|
try:
|
||||||
|
self._websocket_url = self._chrome_instance.start()
|
||||||
|
except:
|
||||||
|
self._chrome_instance = None
|
||||||
|
raise
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
try:
|
try:
|
||||||
|
@ -201,6 +204,7 @@ class Browser:
|
||||||
self.behavior_parameters = behavior_parameters
|
self.behavior_parameters = behavior_parameters
|
||||||
|
|
||||||
self._waiting_on_scroll_to_top_msg_id = None
|
self._waiting_on_scroll_to_top_msg_id = None
|
||||||
|
self._waiting_on_scroll_to_top_start = None
|
||||||
self._waiting_on_screenshot_msg_id = None
|
self._waiting_on_screenshot_msg_id = None
|
||||||
self._waiting_on_document_url_msg_id = None
|
self._waiting_on_document_url_msg_id = None
|
||||||
self._waiting_on_outlinks_msg_id = None
|
self._waiting_on_outlinks_msg_id = None
|
||||||
|
@ -267,7 +271,19 @@ class Browser:
|
||||||
self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome(
|
self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome(
|
||||||
method="Runtime.evaluate",
|
method="Runtime.evaluate",
|
||||||
params={"expression":"window.scrollTo(0, 0);"})
|
params={"expression":"window.scrollTo(0, 0);"})
|
||||||
|
self._waiting_on_scroll_to_top_start = time.time()
|
||||||
return False
|
return False
|
||||||
|
elif (self._waiting_on_scroll_to_top_msg_id
|
||||||
|
and time.time() - self._waiting_on_scroll_to_top_start > 30):
|
||||||
|
# chromium bug? occasionally we get no scroll-to-top result message
|
||||||
|
self.logger.warn(
|
||||||
|
"timed out after %.1fs waiting for scroll-to-top result "
|
||||||
|
"message, requesting screenshot now",
|
||||||
|
time.time() - self._waiting_on_scroll_to_top_start)
|
||||||
|
self._waiting_on_scroll_to_top_msg_id = None
|
||||||
|
self._waiting_on_scroll_to_top_start = None
|
||||||
|
self._waiting_on_screenshot_msg_id = self.send_to_chrome(
|
||||||
|
method="Page.captureScreenshot")
|
||||||
elif not self._has_screenshot and (
|
elif not self._has_screenshot and (
|
||||||
self._waiting_on_scroll_to_top_msg_id
|
self._waiting_on_scroll_to_top_msg_id
|
||||||
or self._waiting_on_screenshot_msg_id):
|
or self._waiting_on_screenshot_msg_id):
|
||||||
|
@ -406,8 +422,10 @@ compileOutlinks(window).join(' ');
|
||||||
self._has_screenshot = True
|
self._has_screenshot = True
|
||||||
self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url))
|
self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url))
|
||||||
elif message["id"] == self._waiting_on_scroll_to_top_msg_id:
|
elif message["id"] == self._waiting_on_scroll_to_top_msg_id:
|
||||||
self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot")
|
|
||||||
self._waiting_on_scroll_to_top_msg_id = None
|
self._waiting_on_scroll_to_top_msg_id = None
|
||||||
|
self._waiting_on_scroll_to_top_start = None
|
||||||
|
self._waiting_on_screenshot_msg_id = self.send_to_chrome(
|
||||||
|
method="Page.captureScreenshot")
|
||||||
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
||||||
self.logger.debug("got outlinks message=%s", message)
|
self.logger.debug("got outlinks message=%s", message)
|
||||||
self._outlinks = frozenset(
|
self._outlinks = frozenset(
|
||||||
|
|
|
@ -41,6 +41,9 @@ def _add_common_options(arg_parser):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-v', '--verbose', dest='log_level',
|
'-v', '--verbose', dest='log_level',
|
||||||
action='store_const', default=logging.INFO, const=logging.DEBUG)
|
action='store_const', default=logging.INFO, const=logging.DEBUG)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--trace', dest='log_level',
|
||||||
|
action='store_const', default=logging.INFO, const=brozzler.TRACE)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--version', action='version',
|
'--version', action='version',
|
||||||
version='brozzler %s - %s' % (
|
version='brozzler %s - %s' % (
|
||||||
|
|
|
@ -197,12 +197,12 @@ class RethinkDbFrontier:
|
||||||
1).update({
|
1).update({
|
||||||
"claimed":True,
|
"claimed":True,
|
||||||
"last_claimed_by":worker_id},
|
"last_claimed_by":worker_id},
|
||||||
return_changes=True).run()
|
return_changes="always").run()
|
||||||
self._vet_result(result, replaced=[0,1])
|
self._vet_result(result, unchanged=[0,1], replaced=[0,1])
|
||||||
if result["replaced"] == 1:
|
if result["unchanged"] == 0 and result["replaced"] == 0:
|
||||||
return brozzler.Page(**result["changes"][0]["new_val"])
|
|
||||||
else:
|
|
||||||
raise brozzler.NothingToClaim
|
raise brozzler.NothingToClaim
|
||||||
|
else:
|
||||||
|
return brozzler.Page(**result["changes"][0]["new_val"])
|
||||||
|
|
||||||
def has_outstanding_pages(self, site):
|
def has_outstanding_pages(self, site):
|
||||||
results_iter = self.r.table("pages").between(
|
results_iter = self.r.table("pages").between(
|
||||||
|
@ -233,6 +233,8 @@ class RethinkDbFrontier:
|
||||||
yield brozzler.Job(**result)
|
yield brozzler.Job(**result)
|
||||||
|
|
||||||
def job(self, id):
|
def job(self, id):
|
||||||
|
if id is None:
|
||||||
|
return None
|
||||||
result = self.r.table("jobs").get(id).run()
|
result = self.r.table("jobs").get(id).run()
|
||||||
if result:
|
if result:
|
||||||
return brozzler.Job(**result)
|
return brozzler.Job(**result)
|
||||||
|
|
|
@ -225,6 +225,7 @@ class BrozzlerWorker:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||||
|
and hasattr(e.exc_info[1], 'code')
|
||||||
and e.exc_info[1].code == 430):
|
and e.exc_info[1].code == 430):
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
'youtube-dl got %s %s processing %s',
|
'youtube-dl got %s %s processing %s',
|
||||||
|
@ -288,9 +289,8 @@ class BrozzlerWorker:
|
||||||
while (not self._shutdown_requested.is_set()
|
while (not self._shutdown_requested.is_set()
|
||||||
and time.time() - start < 7 * 60):
|
and time.time() - start < 7 * 60):
|
||||||
self._frontier.honor_stop_request(site.job_id)
|
self._frontier.honor_stop_request(site.job_id)
|
||||||
page = self._frontier.claim_page(site,
|
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||||
"{}:{}".format(
|
socket.gethostname(), browser.chrome_port))
|
||||||
socket.gethostname(), browser.chrome_port))
|
|
||||||
outlinks = self.brozzle_page(browser, site, page)
|
outlinks = self.brozzle_page(browser, site, page)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
||||||
|
@ -326,8 +326,9 @@ class BrozzlerWorker:
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.status_info = self._service_registry.heartbeat(status_info)
|
self.status_info = self._service_registry.heartbeat(status_info)
|
||||||
self.logger.debug(
|
self.logger.log(
|
||||||
"status in service registry: %s", self.status_info)
|
brozzler.TRACE, "status in service registry: %s",
|
||||||
|
self.status_info)
|
||||||
except rethinkdb.ReqlError as e:
|
except rethinkdb.ReqlError as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"failed to send heartbeat and update service registry "
|
"failed to send heartbeat and update service registry "
|
||||||
|
@ -337,7 +338,11 @@ class BrozzlerWorker:
|
||||||
try:
|
try:
|
||||||
latest_state = None
|
latest_state = None
|
||||||
while not self._shutdown_requested.is_set():
|
while not self._shutdown_requested.is_set():
|
||||||
if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
if self._service_registry and (
|
||||||
|
not hasattr(self, "status_info")
|
||||||
|
or (rethinkstuff.utcnow() -
|
||||||
|
self.status_info["last_heartbeat"]).total_seconds()
|
||||||
|
> self.HEARTBEAT_INTERVAL):
|
||||||
self._service_heartbeat()
|
self._service_heartbeat()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -357,7 +362,8 @@ class BrozzlerWorker:
|
||||||
raise
|
raise
|
||||||
except brozzler.browser.NoBrowsersAvailable:
|
except brozzler.browser.NoBrowsersAvailable:
|
||||||
if latest_state != "browsers-busy":
|
if latest_state != "browsers-busy":
|
||||||
self.logger.info("all %s browsers are busy", self._max_browsers)
|
self.logger.info(
|
||||||
|
"all %s browsers are busy", self._max_browsers)
|
||||||
latest_state = "browsers-busy"
|
latest_state = "browsers-busy"
|
||||||
except brozzler.NothingToClaim:
|
except brozzler.NothingToClaim:
|
||||||
if latest_state != "no-unclaimed-sites":
|
if latest_state != "no-unclaimed-sites":
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -21,7 +21,7 @@ import setuptools
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev31',
|
version='1.1.dev38',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue