diff --git a/.travis.yml b/.travis.yml index 3d745c6..bbdbadf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml - pip install $TRAVIS_BUILD_DIR pytest script: -- py.test -v -s tests +- DISPLAY=:1 py.test -v -s tests after_failure: - sudo cat /var/log/upstart/warcprox.log - sudo cat /var/log/upstart/brozzler-worker.log diff --git a/brozzler/__init__.py b/brozzler/__init__.py index af6ec3d..9e5d5fc 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -44,7 +44,8 @@ class ReachedLimit(Exception): self.http_payload = http_payload def __repr__(self): - return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload)) + return "ReachedLimit(warcprox_meta=%s,http_payload=%s)" % ( + repr(self.warcprox_meta), repr(self.http_payload)) def __str__(self): return self.__repr__() @@ -85,23 +86,7 @@ def behaviors(): behaviors_yaml = os.path.join( os.path.dirname(__file__), 'behaviors.yaml') with open(behaviors_yaml) as fin: - conf = yaml.load(fin) - _behaviors = conf['behaviors'] - - for behavior in _behaviors: - if 'behavior_js' in behavior: - behavior_js = os.path.join( - os.path.dirname(__file__), 'behaviors.d', - behavior['behavior_js']) - with open(behavior_js, encoding='utf-8') as fin: - behavior['script'] = fin.read() - elif 'behavior_js_template' in behavior: - behavior_js_template = os.path.join( - os.path.dirname(__file__), 'behaviors.d', - behavior['behavior_js_template']) - with open(behavior_js_template, encoding='utf-8') as fin: - behavior['template'] = string.Template(fin.read()) - + _behaviors = yaml.load(fin) return _behaviors def behavior_script(url, template_parameters=None): @@ -111,22 +96,18 @@ def behavior_script(url, template_parameters=None): import re, logging for behavior in behaviors(): if re.match(behavior['url_regex'], url): - if 'behavior_js' in behavior: - logging.info( - 'using behavior %s for %s', - behavior['behavior_js'], url) - return behavior['script'] - elif 'behavior_js_template' in behavior: - parameters = dict() - if 'default_parameters' in behavior: - parameters.update(behavior['default_parameters']) - if template_parameters: - parameters.update(template_parameters) - script = behavior['template'].safe_substitute(parameters) - logging.info( - 'using template=%s populated with parameters=%s for %s', - repr(behavior['behavior_js_template']), parameters, url) - return script + parameters = dict() + if 'default_parameters' in behavior: + parameters.update(behavior['default_parameters']) + if template_parameters: + parameters.update(template_parameters) + template = jinja2_environment().get_template( + behavior['behavior_js_template']) + script = template.render(parameters) + logging.info( + 'using template=%s populated with parameters=%s for %s', + repr(behavior['behavior_js_template']), parameters, url) + return script return None def thread_raise(thread, exctype): @@ -169,10 +150,21 @@ def sleep(duration): break time.sleep(min(duration - elapsed, 0.5)) +_jinja2_env = None +def jinja2_environment(): + global _jinja2_env + if not _jinja2_env: + import jinja2, json + _jinja2_env = jinja2.Environment( + loader=jinja2.PackageLoader('brozzler', 'js-templates')) + _jinja2_env.filters['json'] = json.dumps + return _jinja2_env + from brozzler.site import Page, Site from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots from brozzler.frontier import RethinkDbFrontier -from brozzler.browser import Browser, BrowserPool +from brozzler.browser import Browser, BrowserPool, BrowsingException from brozzler.job import new_job, new_site, Job +from brozzler.cli import suggest_default_chrome_exe diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 0da72cc..5bcd945 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -17,95 +17,98 @@ # # first matched behavior is used, so order matters here -behaviors: - - - url_regex: '^https?://(?:www\.)?facebook\.com/.*$' - behavior_js_template: facebook.js.template - # default_parameters: - # parameter_username: jdoe@example.com - # parameter_password: abcd1234 - request_idle_timeout_sec: 30 - - - url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' - behavior_js: marquette_edu.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?vimeo\.com/.*$' - behavior_js: vimeo.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$' - behavior_js: psu24.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?instagram\.com/.*$' - behavior_js: instagram.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$' - behavior_js_template: noguchi.js.template - request_idle_timeout_sec: 10 - - - url_regex: '^https?://catalogue\.noguchi\.org/index.php/Search/Index/search/.*/target/ca_.*$' - behavior_js_template: noguchi.js.template - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' - behavior_js_template: huffpostslides.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: img.img-responsive - request_idle_timeout_sec: 10 - - # acalog https://webarchive.jira.com/browse/ARI-3775 - url_regex: '^https?://.*[?&]catoid=[^?]*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: a[onclick] - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-3956 - url_regex: '^https?://(?:www\.)?usask.ca/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: a[id='feature-next'] - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/AITFIVE-451 - url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: button.sc-button-play, button.playButton - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/AITFIVE-463 - url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: button.playButton.medium - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-4690 - url_regex: '^https?://(?:www\.)?youtube.com/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: span.load-more-text - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-4725 - url_regex: '^https?://(?:www\.)?moma.org/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: button[data-more-results-bottom-button] - click_until_hard_timeout: True - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-4692 - url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' - behavior_js: fec_gov.js - request_idle_timeout_sec: 10 - - url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' - behavior_js_template: mouseovers.js.template - default_parameters: - mouseover_css_selector: .menu-item a - request_idle_timeout_sec: 10 - - # default fallback behavior - url_regex: '^.*$' - request_idle_timeout_sec: 10 - behavior_js: default.js +- + url_regex: '^https?://(?:www\.)?facebook\.com/.*$' + behavior_js_template: facebook.js + request_idle_timeout_sec: 30 +- + url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' + behavior_js_template: marquette_edu.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?vimeo\.com/.*$' + behavior_js_template: vimeo.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$' + behavior_js_template: psu24.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?instagram\.com/.*$' + behavior_js_template: instagram.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$' + behavior_js_template: noguchi.js.template + request_idle_timeout_sec: 10 +- + url_regex: '^https?://catalogue\.noguchi\.org/index.php/Search/Index/search/.*/target/ca_.*$' + behavior_js_template: noguchi.js.template + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' + behavior_js_template: huffpostslides.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: img.img-responsive + click_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # acalog https://webarchive.jira.com/browse/ARI-3775 + url_regex: '^https?://.*[?&]catoid=[^?]*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: a[onclick] + click_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-3956 + url_regex: '^https?://(?:www\.)?usask.ca/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: a[id='feature-next'] + click_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/AITFIVE-451 + url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: button.sc-button-play, button.playButton + click_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/AITFIVE-463 + url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: button.playButton.medium + click_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-4690 + url_regex: '^https?://(?:www\.)?youtube.com/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: span.load-more-text + click_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-4725 + url_regex: '^https?://(?:www\.)?moma.org/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: button[data-more-results-bottom-button] + click_until_hard_timeout: True + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-4692 + url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' + behavior_js_template: fec_gov.js + request_idle_timeout_sec: 10 +- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' + behavior_js_template: mouseovers.js.j2 + default_parameters: + mouseover_css_selector: .menu-item a + mouseover_until_hard_timeout: False + request_idle_timeout_sec: 10 +- # default fallback behavior + url_regex: '^.*$' + request_idle_timeout_sec: 10 + behavior_js_template: default.js diff --git a/brozzler/browser.py b/brozzler/browser.py index 114d212..2e5a011 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -102,6 +102,108 @@ class BrowserPool: def num_in_use(self): return len(self._in_use) +class WebsockReceiverThread(threading.Thread): + logger = logging.getLogger(__module__ + '.' + __qualname__) + + def __init__(self, websock, name=None, daemon=True): + super().__init__(name=name, daemon=daemon) + + self.websock = websock + + self.calling_thread = threading.current_thread() + + self.websock.on_open = self._on_open + self.websock.on_message = self._on_message + self.websock.on_error = self._on_error + self.websock.on_close = self._on_close + + self.is_open = False + self.got_page_load_event = None + + self._result_messages = {} + + def expect_result(self, msg_id): + self._result_messages[msg_id] = None + + def received_result(self, msg_id): + return bool(self._result_messages.get(msg_id)) + + def pop_result(self, msg_id): + return self._result_messages.pop(msg_id) + + def _on_close(self, websock): + pass + # self.logger.info('GOODBYE GOODBYE WEBSOCKET') + + def _on_open(self, websock): + self.is_open = True + + def _on_error(self, websock, e): + ''' + Raises BrowsingException in the thread that created this instance. + ''' + if isinstance(e, ( + websocket.WebSocketConnectionClosedException, + ConnectionResetError)): + self.logger.error('websocket closed, did chrome die?') + else: + self.logger.error( + 'exception from websocket receiver thread', + exc_info=1) + brozzler.thread_raise(self.calling_thread, BrowsingException) + + def run(self): + self.websock.run_forever() + + def _on_message(self, websock, message): + try: + self._handle_message(websock, message) + except: + self.logger.error( + 'uncaught exception in _handle_message message=%s', + message, exc_info=True) + + def _debugger_paused(self, message): + # we hit the breakpoint set in start(), get rid of google analytics + self.logger.debug('debugger paused! message=%s', message) + scriptId = message['params']['callFrames'][0]['location']['scriptId'] + + # replace script + self.websock.send( + json.dumps(dict( + id=0, method='Debugger.setScriptSource', + params={'scriptId': scriptId, + 'scriptSource': 'console.log("google analytics is no more!");'}))) + + # resume execution + self.websock.send(json.dumps(dict(id=0, method='Debugger.resume'))) + + def _handle_message(self, websock, json_message): + message = json.loads(json_message) + if 'method' in message: + if message['method'] == 'Page.loadEventFired': + self.got_page_load_event = datetime.datetime.utcnow() + elif message['method'] == 'Debugger.paused': + self._debugger_paused(message) + elif message["method"] == "Inspector.targetCrashed": + self.logger.error( + '''chrome tab went "aw snap" or "he's dead jim"!''') + brozzler.thread_raise(self.calling_thread, BrowsingException) + elif message['method'] == 'Console.messageAdded': + self.logger.debug( + '%s console.%s %s', self.websock.url, + message['params']['message']['level'], + message['params']['message']['text']) + # else: + # self.logger.debug("%s %s", message["method"], json_message) + elif 'result' in message: + if message['id'] in self._result_messages: + self._result_messages[message['id']] = message + # else: + # self.logger.debug("%s", json_message) + # else: + # self.logger.debug("%s", json_message) + class Browser: ''' Manages an instance of Chrome for browsing pages. @@ -116,9 +218,11 @@ class Browser: **kwargs: arguments for Chrome(...) ''' self.chrome = Chrome(**kwargs) - self.websocket_url = None + self.websock_url = None + self.websock = None + self.websock_thread = None self.is_browsing = False - self._browser_controller = None + self._command_id = Counter() def __enter__(self): self.start() @@ -127,6 +231,31 @@ class Browser: def __exit__(self, *args): self.stop() + def _wait_for(self, callback, timeout=None): + ''' + Spins until callback() returns truthy. + ''' + start = time.time() + while True: + if callback(): + return + elapsed = time.time() - start + if timeout and elapsed > timeout: + raise BrowsingTimeout( + 'timed out after %.1fs waiting for: %s' % ( + elapsed, callback)) + brozzler.sleep(0.5) + + def send_to_chrome(self, suppress_logging=False, **kwargs): + msg_id = next(self._command_id) + kwargs['id'] = msg_id + msg = json.dumps(kwargs) + logging.log( + brozzler.TRACE if suppress_logging else logging.DEBUG, + 'sending message to %s: %s', self.websock, msg) + self.websock.send(msg) + return msg_id + def start(self, **kwargs): ''' Starts chrome if it's not running. @@ -135,29 +264,75 @@ class Browser: **kwargs: arguments for self.chrome.start(...) ''' if not self.is_running(): - self.websocket_url = self.chrome.start(**kwargs) - self._browser_controller = BrowserController(self.websocket_url) - self._browser_controller.start() + self.websock_url = self.chrome.start(**kwargs) + self.websock = websocket.WebSocketApp(self.websock_url) + thread_name = 'WebsockThread:{}-{:%Y%m%d%H%M%S}'.format( + surt.handyurl.parse(self.websock_url).port, + datetime.datetime.utcnow()) + self.websock_thread = WebsockReceiverThread(self.websock) + self.websock_thread.start() + + self._wait_for(lambda: self.websock_thread.is_open, timeout=10) + + # tell browser to send us messages we're interested in + self.send_to_chrome(method='Network.enable') + self.send_to_chrome(method='Page.enable') + self.send_to_chrome(method='Console.enable') + self.send_to_chrome(method='Debugger.enable') + self.send_to_chrome(method='Runtime.enable') + + # disable google analytics, see _handle_message() where breakpoint + # is caught Debugger.paused + self.send_to_chrome( + method='Debugger.setBreakpointByUrl', + params={ + 'lineNumber': 1, + 'urlRegex': 'https?://www.google-analytics.com/analytics.js'}) def stop(self): ''' Stops chrome if it's running. ''' try: - if self._browser_controller: - self._browser_controller.stop() - self.websocket_url = None + if (self.websock and self.websock.sock + and self.websock.sock.connected): + self.logger.info('shutting down websocket connection') + try: + self.websock.close() + except BaseException as e: + self.logger.error( + 'exception closing websocket %s - %s', + self.websock, e) + self.chrome.stop() + + if self.websock_thread and ( + self.websock_thread != threading.current_thread()): + self.websock_thread.join(timeout=30) + if self.websock_thread.is_alive(): + self.logger.error( + '%s still alive 30 seconds after closing %s, will ' + 'forcefully nudge it again', self.websock_thread, + self.websock) + self.websock.keep_running = False + self.websock_thread.join(timeout=30) + if self.websock_thread.is_alive(): + self.logger.critical( + '%s still alive 60 seconds after closing %s', + self.websock_thread, self.websock) + + self.websock_url = None except: self.logger.error('problem stopping', exc_info=True) def is_running(self): - return self.websocket_url is not None + return self.websock_url is not None def browse_page( self, page_url, ignore_cert_errors=False, extra_headers=None, user_agent=None, behavior_parameters=None, - on_request=None, on_response=None, on_screenshot=None): + on_request=None, on_response=None, on_screenshot=None, + username=None, password=None): ''' Browses page in browser. @@ -201,24 +376,26 @@ class Browser: raise BrowsingException('browser is already busy browsing a page') self.is_browsing = True try: - self._browser_controller.navigate_to_page(page_url, timeout=300) - ## if login_credentials: - ## self._browser_controller.try_login(login_credentials) (5 min?) + self.navigate_to_page( + page_url, extra_headers=extra_headers, + user_agent=user_agent, timeout=300) + if password: + self.try_login(username, password, timeout=300) behavior_script = brozzler.behavior_script( page_url, behavior_parameters) - self._browser_controller.run_behavior(behavior_script, timeout=900) + self.run_behavior(behavior_script, timeout=900) if on_screenshot: - self._browser_controller.scroll_to_top() - jpeg_bytes = self._browser_controller.screenshot() + self.scroll_to_top() + jpeg_bytes = self.screenshot() on_screenshot(jpeg_bytes) - outlinks = self._browser_controller.extract_outlinks() + outlinks = self.extract_outlinks() ## for each hashtag not already visited: ## navigate_to_hashtag (nothing to wait for so no timeout?) ## if on_screenshot; ## take screenshot (30 sec) ## run behavior (3 min) ## outlinks += retrieve_outlinks (60 sec) - final_page_url = self._browser_controller.url() + final_page_url = self.url() return final_page_url, outlinks except websocket.WebSocketConnectionClosedException as e: self.logger.error('websocket closed, did chrome die?') @@ -226,183 +403,10 @@ class Browser: finally: self.is_browsing = False -class Counter: - def __init__(self): - self.next_value = 0 - def __next__(self): - try: - return self.next_value - finally: - self.next_value += 1 - def peek_next(self): - return self.next_value - -class BrowserController: - ''' - ''' - - logger = logging.getLogger(__module__ + '.' + __qualname__) - - def __init__(self, websocket_url): - self.websocket_url = websocket_url - self._command_id = Counter() - self._websock_thread = None - self._websock_open = None - self._result_messages = {} - - def _wait_for(self, callback, timeout=None): - ''' - Spins until callback() returns truthy. - ''' - start = time.time() - while True: - brozzler.sleep(0.5) - if callback(): - return - elapsed = time.time() - start - if timeout and elapsed > timeout: - raise BrowsingTimeout( - 'timed out after %.1fs waiting for: %s' % ( - elapsed, callback)) - - def __enter__(self): - self.start() - return self - - def __exit__(self, *args): - self.stop() - - def start(self): - if not self._websock_thread: - calling_thread = threading.current_thread() - - def on_open(websock): - self._websock_open = datetime.datetime.utcnow() - def on_error(websock, e): - ''' - Raises BrowsingException in the thread that called start() - ''' - if isinstance(e, websocket.WebSocketConnectionClosedException): - self.logger.error('websocket closed, did chrome die?') - else: - self.logger.error( - 'exception from websocket receiver thread', - exc_info=1) - brozzler.thread_raise(calling_thread, BrowsingException) - - # open websocket, start thread that receives messages - self._websock = websocket.WebSocketApp( - self.websocket_url, on_open=on_open, - on_message=self._on_message, on_error=on_error) - thread_name = 'WebsockThread:{}-{:%Y%m%d%H%M%S}'.format( - surt.handyurl.parse(self.websocket_url).port, - datetime.datetime.utcnow()) - self._websock_thread = threading.Thread( - target=self._websock.run_forever, name=thread_name, - daemon=True) - self._websock_thread.start() - self._wait_for(lambda: self._websock_open, timeout=10) - - # tell browser to send messages we're interested in - self.send_to_chrome(method='Network.enable') - self.send_to_chrome(method='Page.enable') - self.send_to_chrome(method='Console.enable') - self.send_to_chrome(method='Debugger.enable') - self.send_to_chrome(method='Runtime.enable') - - # disable google analytics, see _handle_message() where breakpoint - # is caught Debugger.paused - self.send_to_chrome( - method='Debugger.setBreakpointByUrl', - params={ - 'lineNumber': 1, - 'urlRegex': 'https?://www.google-analytics.com/analytics.js'}) - - def stop(self, *args): - if self._websock_thread: - if (self._websock and self._websock.sock - and self._websock.sock.connected): - self.logger.info('shutting down websocket connection') - try: - self._websock.close() - except BaseException as e: - self.logger.error( - 'exception closing websocket %s - %s', - self._websock, e) - - if self._websock_thread != threading.current_thread(): - self._websock_thread.join(timeout=30) - if self._websock_thread.is_alive(): - self.logger.error( - '%s still alive 30 seconds after closing %s, will ' - 'forcefully nudge it again', self._websock_thread, - self._websock) - self._websock.keep_running = False - self._websock_thread.join(timeout=30) - if self._websock_thread.is_alive(): - self.logger.critical( - '%s still alive 60 seconds after closing %s', - self._websock_thread, self._websock) - - def _on_message(self, websock, message): - try: - self._handle_message(websock, message) - except: - self.logger.error( - 'uncaught exception in _handle_message message=%s', - message, exc_info=True) - - def _handle_message(self, websock, json_message): - message = json.loads(json_message) - if 'method' in message: - if message['method'] == 'Page.loadEventFired': - self._got_page_load_event = datetime.datetime.utcnow() - elif message['method'] == 'Debugger.paused': - self._debugger_paused(message) - elif message['method'] == 'Console.messageAdded': - self.logger.debug( - '%s console.%s %s', self._websock.url, - message['params']['message']['level'], - message['params']['message']['text']) - # else: - # self.logger.debug("%s %s", message["method"], json_message) - elif 'result' in message: - if message['id'] in self._result_messages: - self._result_messages[message['id']] = message - # else: - # self.logger.debug("%s", json_message) - # else: - # self.logger.debug("%s", json_message) - - def _debugger_paused(self, message): - # we hit the breakpoint set in start(), get rid of google analytics - self.logger.debug('debugger paused! message=%s', message) - scriptId = message['params']['callFrames'][0]['location']['scriptId'] - - # replace script - self.send_to_chrome( - method='Debugger.setScriptSource', - params={'scriptId': scriptId, - 'scriptSource': 'console.log("google analytics is no more!");'}) - - # resume execution - self.send_to_chrome(method='Debugger.resume') - - def send_to_chrome(self, suppress_logging=False, **kwargs): - msg_id = next(self._command_id) - kwargs['id'] = msg_id - msg = json.dumps(kwargs) - if not suppress_logging: - self.logger.debug('sending message to %s: %s', self._websock, msg) - self._websock.send(msg) - return msg_id - def navigate_to_page( self, page_url, extra_headers=None, user_agent=None, timeout=300): - ''' - ''' headers = extra_headers or {} - headers['Accept-Encoding'] = 'gzip, deflate' + headers['Accept-Encoding'] = 'identity' self.send_to_chrome( method='Network.setExtraHTTPHeaders', params={'headers': headers}) @@ -414,73 +418,62 @@ class BrowserController: # navigate to the page! self.logger.info('navigating to page %s', page_url) - self._got_page_load_event = None + self.websock_thread.got_page_load_event = None self.send_to_chrome(method='Page.navigate', params={'url': page_url}) - self._wait_for(lambda: self._got_page_load_event, timeout=timeout) + self._wait_for( + lambda: self.websock_thread.got_page_load_event, + timeout=timeout) - OUTLINKS_JS = r''' -var __brzl_framesDone = new Set(); -var __brzl_compileOutlinks = function(frame) { - __brzl_framesDone.add(frame); - if (frame && frame.document) { - var outlinks = Array.prototype.slice.call( - frame.document.querySelectorAll('a[href]')); - for (var i = 0; i < frame.frames.length; i++) { - if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { - outlinks = outlinks.concat( - __brzl_compileOutlinks(frame.frames[i])); - } - } - } - return outlinks; -} -__brzl_compileOutlinks(window).join('\n'); -''' def extract_outlinks(self, timeout=60): self.logger.info('extracting outlinks') - self._result_messages[self._command_id.peek_next()] = None + self.websock_thread.expect_result(self._command_id.peek()) + js = brozzler.jinja2_environment().get_template( + 'extract-outlinks.js').render() msg_id = self.send_to_chrome( - method='Runtime.evaluate', - params={'expression': self.OUTLINKS_JS}) + method='Runtime.evaluate', params={'expression': js}) self._wait_for( - lambda: self._result_messages.get(msg_id), timeout=timeout) - message = self._result_messages.pop(msg_id) + lambda: self.websock_thread.received_result(msg_id), + timeout=timeout) + message = self.websock_thread.pop_result(msg_id) if message['result']['result']['value']: return frozenset(message['result']['result']['value'].split('\n')) else: - self._outlinks = frozenset() + return frozenset() def screenshot(self, timeout=30): self.logger.info('taking screenshot') - self._result_messages[self._command_id.peek_next()] = None + self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome(method='Page.captureScreenshot') self._wait_for( - lambda: self._result_messages.get(msg_id), timeout=timeout) - message = self._result_messages.pop(msg_id) + lambda: self.websock_thread.received_result(msg_id), + timeout=timeout) + message = self.websock_thread.pop_result(msg_id) jpeg_bytes = base64.b64decode(message['result']['data']) return jpeg_bytes def scroll_to_top(self, timeout=30): self.logger.info('scrolling to top') - self._result_messages[self._command_id.peek_next()] = None + self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( method='Runtime.evaluate', params={'expression': 'window.scrollTo(0, 0);'}) self._wait_for( - lambda: self._result_messages.get(msg_id), timeout=timeout) - self._result_messages.pop(msg_id) + lambda: self.websock_thread.received_result(msg_id), + timeout=timeout) + self.websock_thread.pop_result(msg_id) def url(self, timeout=30): ''' Returns value of document.URL from the browser. ''' - self._result_messages[self._command_id.peek_next()] = None + self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( method='Runtime.evaluate', params={'expression': 'document.URL'}) self._wait_for( - lambda: self._result_messages.get(msg_id), timeout=timeout) - message = self._result_messages.pop(msg_id) + lambda: self.websock_thread.received_result(msg_id), + timeout=timeout) + message = self.websock_thread.pop_result(msg_id) return message['result']['result']['value'] def run_behavior(self, behavior_script, timeout=900): @@ -498,14 +491,15 @@ __brzl_compileOutlinks(window).join('\n'); brozzler.sleep(7) - self._result_messages[self._command_id.peek_next()] = None + self.websock_thread.expect_result(self._command_id.peek()) msg_id = self.send_to_chrome( method='Runtime.evaluate', suppress_logging=True, params={'expression': 'umbraBehaviorFinished()'}) try: self._wait_for( - lambda: self._result_messages.get(msg_id), timeout=5) - msg = self._result_messages.get(msg_id) + lambda: self.websock_thread.received_result(msg_id), + timeout=5) + msg = self.websock_thread.pop_result(msg_id) if (msg and 'result' in msg and not ('wasThrown' in msg['result'] and msg['result']['wasThrown']) @@ -517,4 +511,63 @@ __brzl_compileOutlinks(window).join('\n'); except BrowsingTimeout: pass + def try_login(self, username, password, timeout=300): + try_login_js = brozzler.jinja2_environment().get_template( + 'try-login.js.j2').render( + username=username, password=password) + + self.websock_thread.got_page_load_event = None + self.send_to_chrome( + method='Runtime.evaluate', suppress_logging=True, + params={'expression': try_login_js}) + + # wait for tryLogin to finish trying (should be very very quick) + start = time.time() + while True: + self.websock_thread.expect_result(self._command_id.peek()) + msg_id = self.send_to_chrome( + method='Runtime.evaluate', + params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'}) + try: + self._wait_for( + lambda: self.websock_thread.received_result(msg_id), + timeout=5) + msg = self.websock_thread.pop_result(msg_id) + if (msg and 'result' in msg + and 'result' in msg['result']): + result = msg['result']['result']['value'] + if result == 'login-form-not-found': + # we're done + return + elif result in ('submitted-form', 'maybe-submitted-form'): + # wait for page load event below + self.logger.info( + 'submitted a login form, waiting for another ' + 'page load event') + break + # else try again to get __brzl_tryLoginState + + except BrowsingTimeout: + pass + + if time.time() - start > 30: + raise BrowsingException( + 'timed out trying to check if tryLogin finished') + + # if we get here, we submitted a form, now we wait for another page + # load event + self._wait_for( + lambda: self.websock_thread.got_page_load_event, + timeout=timeout) + +class Counter: + def __init__(self): + self.next_value = 0 + def __next__(self): + try: + return self.next_value + finally: + self.next_value += 1 + def peek(self): + return self.next_value diff --git a/brozzler/cli.py b/brozzler/cli.py index 7de298e..02bca24 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -126,6 +126,12 @@ def brozzle_page(): 'json blob of parameters to populate the javascript behavior ' 'template, e.g. {"parameter_username":"x",' '"parameter_password":"y"}')) + arg_parser.add_argument( + '--username', dest='username', default=None, + help='use this username to try to log in if a login form is found') + arg_parser.add_argument( + '--password', dest='password', default=None, + help='use this password to try to log in if a login form is found') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') @@ -145,7 +151,8 @@ def brozzle_page(): site = brozzler.Site( id=-1, seed=args.url, proxy=args.proxy, enable_warcprox_features=args.enable_warcprox_features, - behavior_parameters=behavior_parameters) + behavior_parameters=behavior_parameters, username=args.username, + password=args.password) page = brozzler.Page(url=args.url, site_id=site.id) worker = brozzler.BrozzlerWorker(frontier=None) @@ -230,6 +237,12 @@ def brozzler_new_site(): 'json blob of parameters to populate the javascript behavior ' 'template, e.g. {"parameter_username":"x",' '"parameter_password":"y"}')) + arg_parser.add_argument( + '--username', dest='username', default=None, + help='use this username to try to log in if a login form is found') + arg_parser.add_argument( + '--password', dest='password', default=None, + help='use this password to try to log in if a login form is found') _add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) @@ -243,7 +256,8 @@ def brozzler_new_site(): warcprox_meta=json.loads( args.warcprox_meta) if args.warcprox_meta else None, behavior_parameters=json.loads( - args.behavior_parameters) if args.behavior_parameters else None) + args.behavior_parameters) if args.behavior_parameters else None, + username=args.username, password=args.password) r = rethinkstuff.Rethinker( args.rethinkdb_servers.split(","), args.rethinkdb_db) diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index a7f3d8e..d9c1df9 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -84,4 +84,10 @@ seeds: type: url required: true + username: + type: string + + password: + type: string + <<: *multi_level_options diff --git a/brozzler/behaviors.d/clickGetPDFs.js.template b/brozzler/js-templates/clickGetPDFs.js.template similarity index 100% rename from brozzler/behaviors.d/clickGetPDFs.js.template rename to brozzler/js-templates/clickGetPDFs.js.template diff --git a/brozzler/behaviors.d/default.js b/brozzler/js-templates/default.js similarity index 100% rename from brozzler/behaviors.d/default.js rename to brozzler/js-templates/default.js diff --git a/brozzler/js-templates/extract-outlinks.js b/brozzler/js-templates/extract-outlinks.js new file mode 100644 index 0000000..3be0dfc --- /dev/null +++ b/brozzler/js-templates/extract-outlinks.js @@ -0,0 +1,16 @@ +var __brzl_framesDone = new Set(); +var __brzl_compileOutlinks = function(frame) { + __brzl_framesDone.add(frame); + if (frame && frame.document) { + var outlinks = Array.prototype.slice.call( + frame.document.querySelectorAll('a[href]')); + for (var i = 0; i < frame.frames.length; i++) { + if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { + outlinks = outlinks.concat( + __brzl_compileOutlinks(frame.frames[i])); + } + } + } + return outlinks; +} +__brzl_compileOutlinks(window).join('\n'); diff --git a/brozzler/behaviors.d/facebook.js.template b/brozzler/js-templates/facebook.js similarity index 89% rename from brozzler/behaviors.d/facebook.js.template rename to brozzler/js-templates/facebook.js index d93e127..c5c1770 100644 --- a/brozzler/behaviors.d/facebook.js.template +++ b/brozzler/js-templates/facebook.js @@ -39,8 +39,6 @@ var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"] //div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]'; var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5; -var UMBRA_FB_USER_NAME = "${parameter_username}"; -var UMBRA_FB_PASSWORD = "${parameter_password}"; var umbraAlreadyClicked = {}; var umbraAlreadyScrolledThing = {}; var umbraScrolledThingFailedScrollAttempts = {}; @@ -172,15 +170,6 @@ var umbraIntervalFunc = function() { } } -var umbraFacebookLogin = function() { - var emailInput = document.querySelector("form#login_form input#email"); - var passwordInput = document.querySelector("form#login_form input#pass"); - var loginButton = document.querySelector("form#login_form label#loginbutton > input"); - emailInput.value=UMBRA_FB_USER_NAME; - passwordInput.value=UMBRA_FB_PASSWORD; - loginButton.click(); -} - // If we haven't had anything to do (scrolled, clicked, etc) in this amount of // time, then we consider ourselves finished with the page. var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10; @@ -202,11 +191,4 @@ if (document.querySelector("div.captcha_interstitial") != null) { // found a cap console.log("captcha found for " + location.href); } -if (document.getElementById("login_form") == null || UMBRA_FB_USER_NAME.indexOf("parameter")>0 || UMBRA_FB_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters - console.log("missing #login_form or login credentials; maybe already logged in for " + location.href); - var umbraIntervalId = setInterval(umbraIntervalFunc, 200); -} -else {//login - console.log("#login_form and credentials found for " + location.href); - umbraFacebookLogin(); -} +var umbraIntervalId = setInterval(umbraIntervalFunc, 200); diff --git a/brozzler/behaviors.d/fec_gov.js b/brozzler/js-templates/fec_gov.js similarity index 100% rename from brozzler/behaviors.d/fec_gov.js rename to brozzler/js-templates/fec_gov.js diff --git a/brozzler/behaviors.d/huffpostslides.js b/brozzler/js-templates/huffpostslides.js similarity index 100% rename from brozzler/behaviors.d/huffpostslides.js rename to brozzler/js-templates/huffpostslides.js diff --git a/brozzler/behaviors.d/instagram.js b/brozzler/js-templates/instagram.js similarity index 100% rename from brozzler/behaviors.d/instagram.js rename to brozzler/js-templates/instagram.js diff --git a/brozzler/behaviors.d/marquette_edu.js b/brozzler/js-templates/marquette_edu.js similarity index 100% rename from brozzler/behaviors.d/marquette_edu.js rename to brozzler/js-templates/marquette_edu.js diff --git a/brozzler/behaviors.d/mouseovers.js.template b/brozzler/js-templates/mouseovers.js.j2 similarity index 93% rename from brozzler/behaviors.d/mouseovers.js.template rename to brozzler/js-templates/mouseovers.js.j2 index f4d6173..8521387 100644 --- a/brozzler/behaviors.d/mouseovers.js.template +++ b/brozzler/js-templates/mouseovers.js.j2 @@ -26,11 +26,8 @@ var umbraBehavior = { var mouseoveredSomething = false; var somethingLeftBelow = false; var somethingLeftAbove = false; - var cssSelector = "${mouseover_css_selector}"; - var mouseoverUntilTimeout = "${mouseover_until_hard_timeout}"; - - //handle Python to JavaScript boolean conversion - mouseoverUntilTimeout == "True" ? mouseoverUntilTimeout = true : mouseoverUntilTimeout = false; + var cssSelector = {{mouseover_css_selector|json}}; + var mouseoverUntilTimeout = {{mouseover_until_hard_timeout|json}}; var iframes = document.querySelectorAll("iframe"); var documents = Array(iframes.length + 1); diff --git a/brozzler/behaviors.d/noguchi.js.template b/brozzler/js-templates/noguchi.js.template similarity index 100% rename from brozzler/behaviors.d/noguchi.js.template rename to brozzler/js-templates/noguchi.js.template diff --git a/brozzler/behaviors.d/psu24.js b/brozzler/js-templates/psu24.js similarity index 100% rename from brozzler/behaviors.d/psu24.js rename to brozzler/js-templates/psu24.js diff --git a/brozzler/behaviors.d/simpleclicks.js.template b/brozzler/js-templates/simpleclicks.js.j2 similarity index 94% rename from brozzler/behaviors.d/simpleclicks.js.template rename to brozzler/js-templates/simpleclicks.js.j2 index 282c096..95f0008 100644 --- a/brozzler/behaviors.d/simpleclicks.js.template +++ b/brozzler/js-templates/simpleclicks.js.j2 @@ -26,11 +26,8 @@ var umbraBehavior = { var clickedSomething = false; var somethingLeftBelow = false; var somethingLeftAbove = false; - var cssSelector = "${click_css_selector}"; - var clickUntilTimeout = "${click_until_hard_timeout}"; - - //handle Python to JavaScript boolean conversion - clickUntilTimeout == "True" ? clickUntilTimeout = true : clickUntilTimeout = false; + var cssSelector = {{click_css_selector|json}}; + var clickUntilTimeout = {{click_until_hard_timeout|json}}; var iframes = document.querySelectorAll("iframe"); var documents = Array(iframes.length + 1); diff --git a/brozzler/js-templates/try-login.js.j2 b/brozzler/js-templates/try-login.js.j2 new file mode 100644 index 0000000..e6bbfa3 --- /dev/null +++ b/brozzler/js-templates/try-login.js.j2 @@ -0,0 +1,53 @@ +var __brzl_tryLoginState = 'trying'; + +var __brzl_tryLogin = function() { + for (var i = 0; i < document.forms.length; i++) { + var form = document.forms[i]; + if (form.method != 'post') { + continue; + } + var usernameField, passwordField; + for (var j = 0; j < form.elements.length; j++) { + var field = form.elements[j]; + if (field.type == 'text' || field.type == 'email') { + if (!usernameField) { + usernameField = field; + } else { + usernameField = undefined; + break; + } + } else if (field.type == 'password') { + if (!passwordField) { + passwordField = field; + } else { + passwordField = undefined; + break; + } + } else if (field.type == 'textarea') { + usernameField = undefined; + passwordField = undefined; + break; + } + } + if (usernameField && passwordField) { + usernameField.value = {{username|json}}; + passwordField.value = {{password|json}}; + console.log('submitting username=' + usernameField.value + + ' password=*** to detected login form'); + try { + form.submit(); + } catch (e) { + // "If a form control (such as a submit button) has a name or + // id of 'submit' it will mask the form's submit method." -MDN + // http://stackoverflow.com/a/2000021 + var pseudoForm = document.createElement('form'); + pseudoForm.submit.apply(form); + } + __brzl_tryLoginState = 'submitted-form'; + return; + } + } + __brzl_tryLoginState = 'login-form-not-found'; +}; + +__brzl_tryLogin(); diff --git a/brozzler/behaviors.d/vimeo.js b/brozzler/js-templates/vimeo.js similarity index 100% rename from brozzler/behaviors.d/vimeo.js rename to brozzler/js-templates/vimeo.js diff --git a/brozzler/site.py b/brozzler/site.py index 8ff692a..d0c0f48 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -95,7 +95,8 @@ class Site(brozzler.BaseDictable): status="ACTIVE", claimed=False, start_time=None, last_disclaimed=_EPOCH_UTC, last_claimed_by=None, last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, - cookie_db=None, user_agent=None, behavior_parameters=None): + cookie_db=None, user_agent=None, behavior_parameters=None, + username=None, password=None): self.seed = seed self.id = id @@ -117,6 +118,8 @@ class Site(brozzler.BaseDictable): self.cookie_db = cookie_db self.user_agent = user_agent self.behavior_parameters = behavior_parameters + self.username = username + self.password = password self.scope = scope or {} if not "surt" in self.scope: diff --git a/brozzler/worker.py b/brozzler/worker.py index 49626ad..90a6442 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -277,6 +277,7 @@ class BrozzlerWorker: final_page_url, outlinks = browser.browse_page( page.url, extra_headers=site.extra_headers(), behavior_parameters=site.behavior_parameters, + username=site.username, password=site.password, user_agent=site.user_agent, on_screenshot=_on_screenshot) if final_page_url != page.url: diff --git a/setup.py b/setup.py index f4f6b10..618c0c8 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev142', + version='1.1b9.dev154', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -41,7 +41,8 @@ setuptools.setup( license='Apache License 2.0', packages=['brozzler', 'brozzler.dashboard'], package_data={ - 'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'], + 'brozzler': [ + 'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'], 'brozzler.dashboard': find_package_data('brozzler.dashboard'), }, entry_points={ @@ -69,6 +70,7 @@ setuptools.setup( 'rethinkdb>=2.3,<2.4', 'psutil==4.3.0', 'cerberus==1.0.1', + 'jinja2', ], extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], @@ -80,6 +82,7 @@ setuptools.setup( 'Environment :: Console', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', 'Topic :: Internet :: WWW/HTTP', 'Topic :: System :: Archiving', ]) diff --git a/tests/htdocs/file1.txt b/tests/htdocs/site1/file1.txt similarity index 100% rename from tests/htdocs/file1.txt rename to tests/htdocs/site1/file1.txt diff --git a/tests/htdocs/site2/login.html b/tests/htdocs/site2/login.html new file mode 100644 index 0000000..d2d9236 --- /dev/null +++ b/tests/htdocs/site2/login.html @@ -0,0 +1,102 @@ + +
+