mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
* master:
  - implement timeout to work around issue where sometimes we receive no result message after requesting scroll to top
  - avoid "AttributeError: 'ExtractorError' object has no attribute 'code'" checking for 430 (soft limit) from youtube-dl
  - set Browser._chrome_instance=None if _chrome_instance.start() throws exception, to avoid endless loop after one failure
  - fix case where rethinkdb page already has claimed=True
  - undo accidentally committed change to browser startup timeout, and remove now misleading comment about browser ports (see https://github.com/internetarchive/brozzler/pull/3)
  - fix bug preventing brozzler-new-site from working, add note about brozzler-new-site in readme
  - --trace level logging
This commit is contained in:
commit
d82feb14da
@ -70,6 +70,14 @@ must be specified, everything else is optional.
|
||||
scope:
|
||||
surt: http://(org,example,
|
||||
|
||||
Submit a Site to Crawl Without Configuring a Job
|
||||
------------------------------------------------
|
||||
|
||||
::
|
||||
|
||||
brozzler-new-site --proxy=localhost:8000 --enable-warcprox-features \
|
||||
--time-limit=600 http://example.com/
|
||||
|
||||
Brozzler Web Console
|
||||
--------------------
|
||||
|
||||
|
@ -64,6 +64,9 @@ class BaseDictable:
|
||||
def __repr__(self):
|
||||
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
||||
|
||||
# logging level more fine-grained than logging.DEBUG==10
|
||||
TRACE = 5
|
||||
|
||||
from brozzler.site import Page, Site
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
|
@ -55,8 +55,6 @@ class BrowserPool:
|
||||
|
||||
self._lock = threading.Lock()
|
||||
|
||||
self.logger.info("browser ports: {}".format([browser.chrome_port for browser in self._available]))
|
||||
|
||||
def acquire(self):
|
||||
"""
|
||||
Returns browser from pool if available, raises NoBrowsersAvailable
|
||||
@ -133,13 +131,18 @@ class Browser:
|
||||
# these can raise exceptions
|
||||
self.chrome_port = self._find_available_port()
|
||||
self._work_dir = tempfile.TemporaryDirectory()
|
||||
self._chrome_instance = Chrome(port=self.chrome_port,
|
||||
executable=self.chrome_exe,
|
||||
self._chrome_instance = Chrome(
|
||||
port=self.chrome_port, executable=self.chrome_exe,
|
||||
user_home_dir=self._work_dir.name,
|
||||
user_data_dir=os.sep.join([self._work_dir.name, "chrome-user-data"]),
|
||||
user_data_dir=os.sep.join([
|
||||
self._work_dir.name, "chrome-user-data"]),
|
||||
ignore_cert_errors=self.ignore_cert_errors,
|
||||
proxy=proxy or self.proxy)
|
||||
self._websocket_url = self._chrome_instance.start()
|
||||
try:
|
||||
self._websocket_url = self._chrome_instance.start()
|
||||
except:
|
||||
self._chrome_instance = None
|
||||
raise
|
||||
|
||||
def stop(self):
|
||||
try:
|
||||
@ -201,6 +204,7 @@ class Browser:
|
||||
self.behavior_parameters = behavior_parameters
|
||||
|
||||
self._waiting_on_scroll_to_top_msg_id = None
|
||||
self._waiting_on_scroll_to_top_start = None
|
||||
self._waiting_on_screenshot_msg_id = None
|
||||
self._waiting_on_document_url_msg_id = None
|
||||
self._waiting_on_outlinks_msg_id = None
|
||||
@ -267,7 +271,19 @@ class Browser:
|
||||
self._waiting_on_scroll_to_top_msg_id = self.send_to_chrome(
|
||||
method="Runtime.evaluate",
|
||||
params={"expression":"window.scrollTo(0, 0);"})
|
||||
self._waiting_on_scroll_to_top_start = time.time()
|
||||
return False
|
||||
elif (self._waiting_on_scroll_to_top_msg_id
|
||||
and time.time() - self._waiting_on_scroll_to_top_start > 30):
|
||||
# chromium bug? occasionally we get no scroll-to-top result message
|
||||
self.logger.warn(
|
||||
"timed out after %.1fs waiting for scroll-to-top result "
|
||||
"message, requesting screenshot now",
|
||||
time.time() - self._waiting_on_scroll_to_top_start)
|
||||
self._waiting_on_scroll_to_top_msg_id = None
|
||||
self._waiting_on_scroll_to_top_start = None
|
||||
self._waiting_on_screenshot_msg_id = self.send_to_chrome(
|
||||
method="Page.captureScreenshot")
|
||||
elif not self._has_screenshot and (
|
||||
self._waiting_on_scroll_to_top_msg_id
|
||||
or self._waiting_on_screenshot_msg_id):
|
||||
@ -406,8 +422,10 @@ compileOutlinks(window).join(' ');
|
||||
self._has_screenshot = True
|
||||
self.logger.info("got screenshot, moving on to getting outlinks url={}".format(self.url))
|
||||
elif message["id"] == self._waiting_on_scroll_to_top_msg_id:
|
||||
self._waiting_on_screenshot_msg_id = self.send_to_chrome(method="Page.captureScreenshot")
|
||||
self._waiting_on_scroll_to_top_msg_id = None
|
||||
self._waiting_on_scroll_to_top_start = None
|
||||
self._waiting_on_screenshot_msg_id = self.send_to_chrome(
|
||||
method="Page.captureScreenshot")
|
||||
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
||||
self.logger.debug("got outlinks message=%s", message)
|
||||
self._outlinks = frozenset(
|
||||
|
@ -41,6 +41,9 @@ def _add_common_options(arg_parser):
|
||||
arg_parser.add_argument(
|
||||
'-v', '--verbose', dest='log_level',
|
||||
action='store_const', default=logging.INFO, const=logging.DEBUG)
|
||||
arg_parser.add_argument(
|
||||
'--trace', dest='log_level',
|
||||
action='store_const', default=logging.INFO, const=brozzler.TRACE)
|
||||
arg_parser.add_argument(
|
||||
'--version', action='version',
|
||||
version='brozzler %s - %s' % (
|
||||
|
@ -197,12 +197,12 @@ class RethinkDbFrontier:
|
||||
1).update({
|
||||
"claimed":True,
|
||||
"last_claimed_by":worker_id},
|
||||
return_changes=True).run()
|
||||
self._vet_result(result, replaced=[0,1])
|
||||
if result["replaced"] == 1:
|
||||
return brozzler.Page(**result["changes"][0]["new_val"])
|
||||
else:
|
||||
return_changes="always").run()
|
||||
self._vet_result(result, unchanged=[0,1], replaced=[0,1])
|
||||
if result["unchanged"] == 0 and result["replaced"] == 0:
|
||||
raise brozzler.NothingToClaim
|
||||
else:
|
||||
return brozzler.Page(**result["changes"][0]["new_val"])
|
||||
|
||||
def has_outstanding_pages(self, site):
|
||||
results_iter = self.r.table("pages").between(
|
||||
@ -233,6 +233,8 @@ class RethinkDbFrontier:
|
||||
yield brozzler.Job(**result)
|
||||
|
||||
def job(self, id):
|
||||
if id is None:
|
||||
return None
|
||||
result = self.r.table("jobs").get(id).run()
|
||||
if result:
|
||||
return brozzler.Job(**result)
|
||||
|
@ -225,6 +225,7 @@ class BrozzlerWorker:
|
||||
raise
|
||||
except Exception as e:
|
||||
if (hasattr(e, 'exc_info') and len(e.exc_info) >= 2
|
||||
and hasattr(e.exc_info[1], 'code')
|
||||
and e.exc_info[1].code == 430):
|
||||
self.logger.info(
|
||||
'youtube-dl got %s %s processing %s',
|
||||
@ -288,9 +289,8 @@ class BrozzlerWorker:
|
||||
while (not self._shutdown_requested.is_set()
|
||||
and time.time() - start < 7 * 60):
|
||||
self._frontier.honor_stop_request(site.job_id)
|
||||
page = self._frontier.claim_page(site,
|
||||
"{}:{}".format(
|
||||
socket.gethostname(), browser.chrome_port))
|
||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||
socket.gethostname(), browser.chrome_port))
|
||||
outlinks = self.brozzle_page(browser, site, page)
|
||||
self._frontier.completed_page(site, page)
|
||||
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
||||
@ -326,8 +326,9 @@ class BrozzlerWorker:
|
||||
|
||||
try:
|
||||
self.status_info = self._service_registry.heartbeat(status_info)
|
||||
self.logger.debug(
|
||||
"status in service registry: %s", self.status_info)
|
||||
self.logger.log(
|
||||
brozzler.TRACE, "status in service registry: %s",
|
||||
self.status_info)
|
||||
except rethinkdb.ReqlError as e:
|
||||
self.logger.error(
|
||||
"failed to send heartbeat and update service registry "
|
||||
@ -337,7 +338,11 @@ class BrozzlerWorker:
|
||||
try:
|
||||
latest_state = None
|
||||
while not self._shutdown_requested.is_set():
|
||||
if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
||||
if self._service_registry and (
|
||||
not hasattr(self, "status_info")
|
||||
or (rethinkstuff.utcnow() -
|
||||
self.status_info["last_heartbeat"]).total_seconds()
|
||||
> self.HEARTBEAT_INTERVAL):
|
||||
self._service_heartbeat()
|
||||
|
||||
try:
|
||||
@ -357,7 +362,8 @@ class BrozzlerWorker:
|
||||
raise
|
||||
except brozzler.browser.NoBrowsersAvailable:
|
||||
if latest_state != "browsers-busy":
|
||||
self.logger.info("all %s browsers are busy", self._max_browsers)
|
||||
self.logger.info(
|
||||
"all %s browsers are busy", self._max_browsers)
|
||||
latest_state = "browsers-busy"
|
||||
except brozzler.NothingToClaim:
|
||||
if latest_state != "no-unclaimed-sites":
|
||||
|
Loading…
x
Reference in New Issue
Block a user