Merge branch 'master' into qa

* master:
  implement timeout to work around issue where sometimes we receive no result message after requesting scroll to top
  avoid "AttributeError: 'ExtractorError' object has no attribute 'code'" when checking for 430 (soft limit) from youtube-dl
  set Browser._chrome_instance=None if _chrome_instance.start() throws exception, to avoid endless loop after one failure
  fix case where rethinkdb page already has claimed=True
  undo accidentally committed change to browser startup timeout, and remove now misleading comment about browser ports (see https://github.com/internetarchive/brozzler/pull/3)
  fix bug preventing brozzler-new-site from working, add note about brozzler-new-site in readme
  --trace level logging
This commit is contained in:
Noah Levitt 2016-06-30 11:46:31 -05:00
commit d82feb14da
7 changed files with 60 additions and 20 deletions

View File

@ -70,6 +70,14 @@ must be specified, everything else is optional.
scope:
surt: http://(org,example,
Submit a Site to Crawl Without Configuring a Job
------------------------------------------------
::

    brozzler-new-site --proxy=localhost:8000 --enable-warcprox-features \
        --time-limit=600 http://example.com/

Brozzler Web Console
--------------------

View File

@ -64,6 +64,9 @@ class BaseDictable:
def __repr__(self):
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
# logging level more fine-grained than logging.DEBUG==10
TRACE = 5
from brozzler.site import Page, Site
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots

View File

@ -55,8 +55,6 @@ class BrowserPool:
self._lock = threading.Lock()
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
def acquire(self):
"""
Returns browser from pool if available, raises NoBrowsersAvailable
@ -133,13 +131,18 @@ class Browser:
# these can raise exceptions
self.chrome_port = self._find_available_port()
self._work_dir = tempfile.TemporaryDirectory()
self._chrome_instance = Chrome(port=self.chrome_port,
executable=self.chrome_exe,
self._chrome_instance = Chrome(
port=self.chrome_port, executable=self.chrome_exe,
user_home_dir=self._work_dir.name,
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
user_data_dir=os.sep.join([
self._work_dir.name, "chrome-user-data"]),
ignore_cert_errors=self.ignore_cert_errors,
proxy=proxy or self.proxy)
self._websocket_url = self._chrome_instance.start()
try:
self._websocket_url = self._chrome_instance.start()
except:
self._chrome_instance = None
raise
def stop(self):
try:
@ -201,6 +204,7 @@ class Browser:
self.behavior_parameters = behavior_parameters
self._waiting_on_scroll_to_top_msg_id = None
self._waiting_on_scroll_to_top_start = None
self._waiting_on_screenshot_msg_id = None
self._waiting_on_document_url_msg_id = None
self._waiting_on_outlinks_msg_id = None
@ -267,7 +271,19 @@ class Browser:
self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome(
method="Runtime.evaluate",
params={"expression":"window.scrollTo(0, 0);"})
self._waiting_on_scroll_to_top_start = time.time()
return False
elif (self._waiting_on_scroll_to_top_msg_id
and time.time() - self._waiting_on_scroll_to_top_start > 30):
# chromium bug? occasionally we get no scroll-to-top result message
self.logger.warn(
"timed out after %.1fs waiting for scroll-to-top result "
"message, requesting screenshot now",
time.time() - self._waiting_on_scroll_to_top_start)
self._waiting_on_scroll_to_top_msg_id = None
self._waiting_on_scroll_to_top_start = None
self._waiting_on_screenshot_msg_id = self.send_to_chrome(
method="Page.captureScreenshot")
elif not self._has_screenshot and (
self._waiting_on_scroll_to_top_msg_id
or self._waiting_on_screenshot_msg_id):
@ -406,8 +422,10 @@ compileOutlinks(window).join(' ');
self._has_screenshot = True
self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url))
elif message["id"] == self._waiting_on_scroll_to_top_msg_id:
self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot")
self._waiting_on_scroll_to_top_msg_id = None
self._waiting_on_scroll_to_top_start = None
self._waiting_on_screenshot_msg_id = self.send_to_chrome(
method="Page.captureScreenshot")
elif message["id"] == self._waiting_on_outlinks_msg_id:
self.logger.debug("got outlinks message=%s", message)
self._outlinks = frozenset(

View File

@ -41,6 +41,9 @@ def _add_common_options(arg_parser):
arg_parser.add_argument(
'-v', '--verbose', dest='log_level',
action='store_const', default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument(
'--trace', dest='log_level',
action='store_const', default=logging.INFO, const=brozzler.TRACE)
arg_parser.add_argument(
'--version', action='version',
version='brozzler %s - %s' % (

View File

@ -197,12 +197,12 @@ class RethinkDbFrontier:
1).update({
"claimed":True,
"last_claimed_by":worker_id},
return_changes=True).run()
self._vet_result(result, replaced=[0,1])
if result["replaced"] == 1:
return brozzler.Page(**result["changes"][0]["new_val"])
else:
return_changes="always").run()
self._vet_result(result, unchanged=[0,1], replaced=[0,1])
if result["unchanged"] == 0 and result["replaced"] == 0:
raise brozzler.NothingToClaim
else:
return brozzler.Page(**result["changes"][0]["new_val"])
def has_outstanding_pages(self, site):
results_iter = self.r.table("pages").between(
@ -233,6 +233,8 @@ class RethinkDbFrontier:
yield brozzler.Job(**result)
def job(self, id):
if id is None:
return None
result = self.r.table("jobs").get(id).run()
if result:
return brozzler.Job(**result)

View File

@ -225,6 +225,7 @@ class BrozzlerWorker:
raise
except Exception as e:
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
and hasattr(e.exc_info[1], 'code')
and e.exc_info[1].code == 430):
self.logger.info(
'youtube-dl got %s %s processing %s',
@ -288,9 +289,8 @@ class BrozzlerWorker:
while (not self._shutdown_requested.is_set()
and time.time() - start < 7 * 60):
self._frontier.honor_stop_request(site.job_id)
page = self._frontier.claim_page(site,
"{}:{}".format(
socket.gethostname(), browser.chrome_port))
page = self._frontier.claim_page(site, "%s:%s" % (
socket.gethostname(), browser.chrome_port))
outlinks = self.brozzle_page(browser, site, page)
self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
@ -326,8 +326,9 @@ class BrozzlerWorker:
try:
self.status_info = self._service_registry.heartbeat(status_info)
self.logger.debug(
"status in service registry: %s", self.status_info)
self.logger.log(
brozzler.TRACE, "status in service registry: %s",
self.status_info)
except rethinkdb.ReqlError as e:
self.logger.error(
"failed to send heartbeat and update service registry "
@ -337,7 +338,11 @@ class BrozzlerWorker:
try:
latest_state = None
while not self._shutdown_requested.is_set():
if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
if self._service_registry and (
not hasattr(self, "status_info")
or (rethinkstuff.utcnow() -
self.status_info["last_heartbeat"]).total_seconds()
> self.HEARTBEAT_INTERVAL):
self._service_heartbeat()
try:
@ -357,7 +362,8 @@ class BrozzlerWorker:
raise
except brozzler.browser.NoBrowsersAvailable:
if latest_state != "browsers-busy":
self.logger.info("all %s browsers are busy", self._max_browsers)
self.logger.info(
"all %s browsers are busy", self._max_browsers)
latest_state = "browsers-busy"
except brozzler.NothingToClaim:
if latest_state != "no-unclaimed-sites":

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup(
name='brozzler',
version='1.1.dev31',
version='1.1.dev38',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',