Merge branch 'master' into qa

* master:
  need $DISPLAY set for test_brozzling.py
  restore handling of "aw snap" or "he's dead jim"
  add seed username/password parameters to job config schema
  loosen the find_available_port test slightly, since it seems to be not 100% predictable for reasons i haven't investigated
  convert mouseovers and simpleclicks to jinja2
  remove obsolete facebook login code
  convert behaviors to jinja2, move them to new subdir js-templates, along with javascript previously stored as a string in browser.py
  add hack for submitting a login form containing an element with name or id "submit", which masks the form submit() method
  how did i miss this file?
  forgot to git add new test data
  detect <input type="email"> as potential username field for login
  generalized support for login doing automatic detection of login form on a page
  yet more refactoring of browser.py, clearer separation of purpose, Browser class manages browsing, sends most of the messages to chrome, WebsockReceiverThread handles messages that come back from chrome
  bump version number in setup.py
  major refactoring of browsing code to make it easier to add functionality
  back to dev version number
  i dub thee 1.1b8
Noah Levitt 2016-12-21 18:11:56 -08:00
commit 422a5ad726
29 changed files with 710 additions and 422 deletions
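For orientation (not part of the commit): a minimal sketch of how the jinja2-based behavior templating added here is meant to be used. brozzler.behavior_script() picks the first behaviors.yaml entry whose url_regex matches, renders its js-templates template via jinja2_environment() (the json filter turns Python values into JavaScript literals), and returns the script. The URL and parameters below are illustrative.

```python
import brozzler

# illustrative URL and parameters; the youtube entry in behaviors.yaml maps to
# simpleclicks.js.j2 with default_parameters click_css_selector: span.load-more-text
script = brozzler.behavior_script(
        'https://www.youtube.com/user/example',
        template_parameters={'click_css_selector': 'span.load-more-text'})

# Browser.run_behavior() then sends the rendered script to chrome via
# Runtime.evaluate
```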

View File

@ -9,7 +9,7 @@ install:
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
- pip install $TRAVIS_BUILD_DIR pytest
script:
- py.test -v -s tests
- DISPLAY=:1 py.test -v -s tests
after_failure:
- sudo cat /var/log/upstart/warcprox.log
- sudo cat /var/log/upstart/brozzler-worker.log

View File

@ -44,7 +44,8 @@ class ReachedLimit(Exception):
self.http_payload = http_payload
def __repr__(self):
return "ReachedLimit(warcprox_meta={},http_payload={})".format(repr(self.warcprox_meta), repr(self.http_payload))
return "ReachedLimit(warcprox_meta=%s,http_payload=%s)" % (
repr(self.warcprox_meta), repr(self.http_payload))
def __str__(self):
return self.__repr__()
@ -85,23 +86,7 @@ def behaviors():
behaviors_yaml = os.path.join(
os.path.dirname(__file__), 'behaviors.yaml')
with open(behaviors_yaml) as fin:
conf = yaml.load(fin)
_behaviors = conf['behaviors']
for behavior in _behaviors:
if 'behavior_js' in behavior:
behavior_js = os.path.join(
os.path.dirname(__file__), 'behaviors.d',
behavior['behavior_js'])
with open(behavior_js, encoding='utf-8') as fin:
behavior['script'] = fin.read()
elif 'behavior_js_template' in behavior:
behavior_js_template = os.path.join(
os.path.dirname(__file__), 'behaviors.d',
behavior['behavior_js_template'])
with open(behavior_js_template, encoding='utf-8') as fin:
behavior['template'] = string.Template(fin.read())
_behaviors = yaml.load(fin)
return _behaviors
def behavior_script(url, template_parameters=None):
@ -111,22 +96,18 @@ def behavior_script(url, template_parameters=None):
import re, logging
for behavior in behaviors():
if re.match(behavior['url_regex'], url):
if 'behavior_js' in behavior:
logging.info(
'using behavior %s for %s',
behavior['behavior_js'], url)
return behavior['script']
elif 'behavior_js_template' in behavior:
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if template_parameters:
parameters.update(template_parameters)
script = behavior['template'].safe_substitute(parameters)
logging.info(
'using template=%s populated with parameters=%s for %s',
repr(behavior['behavior_js_template']), parameters, url)
return script
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if template_parameters:
parameters.update(template_parameters)
template = jinja2_environment().get_template(
behavior['behavior_js_template'])
script = template.render(parameters)
logging.info(
'using template=%s populated with parameters=%s for %s',
repr(behavior['behavior_js_template']), parameters, url)
return script
return None
def thread_raise(thread, exctype):
@ -169,10 +150,21 @@ def sleep(duration):
break
time.sleep(min(duration - elapsed, 0.5))
_jinja2_env = None
def jinja2_environment():
global _jinja2_env
if not _jinja2_env:
import jinja2, json
_jinja2_env = jinja2.Environment(
loader=jinja2.PackageLoader('brozzler', 'js-templates'))
_jinja2_env.filters['json'] = json.dumps
return _jinja2_env
from brozzler.site import Page, Site
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier
from brozzler.browser import Browser, BrowserPool
from brozzler.browser import Browser, BrowserPool, BrowsingException
from brozzler.job import new_job, new_site, Job
from brozzler.cli import suggest_default_chrome_exe

View File

@ -17,95 +17,98 @@
#
# first matched behavior is used, so order matters here
behaviors:
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js_template: facebook.js.template
# default_parameters:
# parameter_username: jdoe@example.com
# parameter_password: abcd1234
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
behavior_js: marquette_edu.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
behavior_js: vimeo.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
behavior_js: psu24.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js: instagram.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$'
behavior_js_template: noguchi.js.template
request_idle_timeout_sec: 10
-
url_regex: '^https?://catalogue\.noguchi\.org/index.php/Search/Index/search/.*/target/ca_.*$'
behavior_js_template: noguchi.js.template
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
behavior_js_template: huffpostslides.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: img.img-responsive
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[onclick]
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[id='feature-next']
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.playButton.medium
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: span.load-more-text
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4692
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
behavior_js: fec_gov.js
request_idle_timeout_sec: 10
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
behavior_js_template: mouseovers.js.template
default_parameters:
mouseover_css_selector: .menu-item a
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10
behavior_js: default.js
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js_template: facebook.js
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
behavior_js_template: marquette_edu.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
behavior_js_template: vimeo.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
behavior_js_template: psu24.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js_template: instagram.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://catalogue\.noguchi\.org/index.php/LoginReg/form$'
behavior_js_template: noguchi.js.template
request_idle_timeout_sec: 10
-
url_regex: '^https?://catalogue\.noguchi\.org/index.php/Search/Index/search/.*/target/ca_.*$'
behavior_js_template: noguchi.js.template
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
behavior_js_template: huffpostslides.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: img.img-responsive
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: a[onclick]
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: a[id='feature-next']
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: button.playButton.medium
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: span.load-more-text
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4692
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
behavior_js_template: fec_gov.js
request_idle_timeout_sec: 10
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
behavior_js_template: mouseovers.js.j2
default_parameters:
mouseover_css_selector: .menu-item a
mouseover_until_hard_timeout: False
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10
behavior_js_template: default.js

View File

@ -102,6 +102,108 @@ class BrowserPool:
def num_in_use(self):
return len(self._in_use)
class WebsockReceiverThread(threading.Thread):
logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__(self, websock, name=None, daemon=True):
super().__init__(name=name, daemon=daemon)
self.websock = websock
self.calling_thread = threading.current_thread()
self.websock.on_open = self._on_open
self.websock.on_message = self._on_message
self.websock.on_error = self._on_error
self.websock.on_close = self._on_close
self.is_open = False
self.got_page_load_event = None
self._result_messages = {}
def expect_result(self, msg_id):
self._result_messages[msg_id] = None
def received_result(self, msg_id):
return bool(self._result_messages.get(msg_id))
def pop_result(self, msg_id):
return self._result_messages.pop(msg_id)
def _on_close(self, websock):
pass
# self.logger.info('GOODBYE GOODBYE WEBSOCKET')
def _on_open(self, websock):
self.is_open = True
def _on_error(self, websock, e):
'''
Raises BrowsingException in the thread that created this instance.
'''
if isinstance(e, (
websocket.WebSocketConnectionClosedException,
ConnectionResetError)):
self.logger.error('websocket closed, did chrome die?')
else:
self.logger.error(
'exception from websocket receiver thread',
exc_info=1)
brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self):
self.websock.run_forever()
def _on_message(self, websock, message):
try:
self._handle_message(websock, message)
except:
self.logger.error(
'uncaught exception in _handle_message message=%s',
message, exc_info=True)
def _debugger_paused(self, message):
# we hit the breakpoint set in start(), get rid of google analytics
self.logger.debug('debugger paused! message=%s', message)
scriptId = message['params']['callFrames'][0]['location']['scriptId']
# replace script
self.websock.send(
json.dumps(dict(
id=0, method='Debugger.setScriptSource',
params={'scriptId': scriptId,
'scriptSource': 'console.log("google analytics is no more!");'})))
# resume execution
self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
def _handle_message(self, websock, json_message):
message = json.loads(json_message)
if 'method' in message:
if message['method'] == 'Page.loadEventFired':
self.got_page_load_event = datetime.datetime.utcnow()
elif message['method'] == 'Debugger.paused':
self._debugger_paused(message)
elif message["method"] == "Inspector.targetCrashed":
self.logger.error(
'''chrome tab went "aw snap" or "he's dead jim"!''')
brozzler.thread_raise(self.calling_thread, BrowsingException)
elif message['method'] == 'Console.messageAdded':
self.logger.debug(
'%s console.%s %s', self.websock.url,
message['params']['message']['level'],
message['params']['message']['text'])
# else:
# self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message:
if message['id'] in self._result_messages:
self._result_messages[message['id']] = message
# else:
# self.logger.debug("%s", json_message)
# else:
# self.logger.debug("%s", json_message)
class Browser:
'''
Manages an instance of Chrome for browsing pages.
@ -116,9 +218,11 @@ class Browser:
**kwargs: arguments for Chrome(...)
'''
self.chrome = Chrome(**kwargs)
self.websocket_url = None
self.websock_url = None
self.websock = None
self.websock_thread = None
self.is_browsing = False
self._browser_controller = None
self._command_id = Counter()
def __enter__(self):
self.start()
@ -127,6 +231,31 @@ class Browser:
def __exit__(self, *args):
self.stop()
def _wait_for(self, callback, timeout=None):
'''
Spins until callback() returns truthy.
'''
start = time.time()
while True:
if callback():
return
elapsed = time.time() - start
if timeout and elapsed > timeout:
raise BrowsingTimeout(
'timed out after %.1fs waiting for: %s' % (
elapsed, callback))
brozzler.sleep(0.5)
def send_to_chrome(self, suppress_logging=False, **kwargs):
msg_id = next(self._command_id)
kwargs['id'] = msg_id
msg = json.dumps(kwargs)
logging.log(
brozzler.TRACE if suppress_logging else logging.DEBUG,
'sending message to %s: %s', self.websock, msg)
self.websock.send(msg)
return msg_id
def start(self, **kwargs):
'''
Starts chrome if it's not running.
@ -135,29 +264,75 @@ class Browser:
**kwargs: arguments for self.chrome.start(...)
'''
if not self.is_running():
self.websocket_url = self.chrome.start(**kwargs)
self._browser_controller = BrowserController(self.websocket_url)
self._browser_controller.start()
self.websock_url = self.chrome.start(**kwargs)
self.websock = websocket.WebSocketApp(self.websock_url)
thread_name = 'WebsockThread:{}-{:%Y%m%d%H%M%S}'.format(
surt.handyurl.parse(self.websock_url).port,
datetime.datetime.utcnow())
self.websock_thread = WebsockReceiverThread(self.websock, name=thread_name)
self.websock_thread.start()
self._wait_for(lambda: self.websock_thread.is_open, timeout=10)
# tell browser to send us messages we're interested in
self.send_to_chrome(method='Network.enable')
self.send_to_chrome(method='Page.enable')
self.send_to_chrome(method='Console.enable')
self.send_to_chrome(method='Debugger.enable')
self.send_to_chrome(method='Runtime.enable')
# disable google analytics; see _handle_message(), where the
# Debugger.paused breakpoint is handled
self.send_to_chrome(
method='Debugger.setBreakpointByUrl',
params={
'lineNumber': 1,
'urlRegex': 'https?://www.google-analytics.com/analytics.js'})
def stop(self):
'''
Stops chrome if it's running.
'''
try:
if self._browser_controller:
self._browser_controller.stop()
self.websocket_url = None
if (self.websock and self.websock.sock
and self.websock.sock.connected):
self.logger.info('shutting down websocket connection')
try:
self.websock.close()
except BaseException as e:
self.logger.error(
'exception closing websocket %s - %s',
self.websock, e)
self.chrome.stop()
if self.websock_thread and (
self.websock_thread != threading.current_thread()):
self.websock_thread.join(timeout=30)
if self.websock_thread.is_alive():
self.logger.error(
'%s still alive 30 seconds after closing %s, will '
'forcefully nudge it again', self.websock_thread,
self.websock)
self.websock.keep_running = False
self.websock_thread.join(timeout=30)
if self.websock_thread.is_alive():
self.logger.critical(
'%s still alive 60 seconds after closing %s',
self.websock_thread, self.websock)
self.websock_url = None
except:
self.logger.error('problem stopping', exc_info=True)
def is_running(self):
return self.websocket_url is not None
return self.websock_url is not None
def browse_page(
self, page_url, ignore_cert_errors=False, extra_headers=None,
user_agent=None, behavior_parameters=None,
on_request=None, on_response=None, on_screenshot=None):
on_request=None, on_response=None, on_screenshot=None,
username=None, password=None):
'''
Browses page in browser.
@ -201,24 +376,26 @@ class Browser:
raise BrowsingException('browser is already busy browsing a page')
self.is_browsing = True
try:
self._browser_controller.navigate_to_page(page_url, timeout=300)
## if login_credentials:
## self._browser_controller.try_login(login_credentials) (5 min?)
self.navigate_to_page(
page_url, extra_headers=extra_headers,
user_agent=user_agent, timeout=300)
if password:
self.try_login(username, password, timeout=300)
behavior_script = brozzler.behavior_script(
page_url, behavior_parameters)
self._browser_controller.run_behavior(behavior_script, timeout=900)
self.run_behavior(behavior_script, timeout=900)
if on_screenshot:
self._browser_controller.scroll_to_top()
jpeg_bytes = self._browser_controller.screenshot()
self.scroll_to_top()
jpeg_bytes = self.screenshot()
on_screenshot(jpeg_bytes)
outlinks = self._browser_controller.extract_outlinks()
outlinks = self.extract_outlinks()
## for each hashtag not already visited:
## navigate_to_hashtag (nothing to wait for so no timeout?)
## if on_screenshot;
## take screenshot (30 sec)
## run behavior (3 min)
## outlinks += retrieve_outlinks (60 sec)
final_page_url = self._browser_controller.url()
final_page_url = self.url()
return final_page_url, outlinks
except websocket.WebSocketConnectionClosedException as e:
self.logger.error('websocket closed, did chrome die?')
@ -226,183 +403,10 @@ class Browser:
finally:
self.is_browsing = False
class Counter:
def __init__(self):
self.next_value = 0
def __next__(self):
try:
return self.next_value
finally:
self.next_value += 1
def peek_next(self):
return self.next_value
class BrowserController:
'''
'''
logger = logging.getLogger(__module__ + '.' + __qualname__)
def __init__(self, websocket_url):
self.websocket_url = websocket_url
self._command_id = Counter()
self._websock_thread = None
self._websock_open = None
self._result_messages = {}
def _wait_for(self, callback, timeout=None):
'''
Spins until callback() returns truthy.
'''
start = time.time()
while True:
brozzler.sleep(0.5)
if callback():
return
elapsed = time.time() - start
if timeout and elapsed > timeout:
raise BrowsingTimeout(
'timed out after %.1fs waiting for: %s' % (
elapsed, callback))
def __enter__(self):
self.start()
return self
def __exit__(self, *args):
self.stop()
def start(self):
if not self._websock_thread:
calling_thread = threading.current_thread()
def on_open(websock):
self._websock_open = datetime.datetime.utcnow()
def on_error(websock, e):
'''
Raises BrowsingException in the thread that called start()
'''
if isinstance(e, websocket.WebSocketConnectionClosedException):
self.logger.error('websocket closed, did chrome die?')
else:
self.logger.error(
'exception from websocket receiver thread',
exc_info=1)
brozzler.thread_raise(calling_thread, BrowsingException)
# open websocket, start thread that receives messages
self._websock = websocket.WebSocketApp(
self.websocket_url, on_open=on_open,
on_message=self._on_message, on_error=on_error)
thread_name = 'WebsockThread:{}-{:%Y%m%d%H%M%S}'.format(
surt.handyurl.parse(self.websocket_url).port,
datetime.datetime.utcnow())
self._websock_thread = threading.Thread(
target=self._websock.run_forever, name=thread_name,
daemon=True)
self._websock_thread.start()
self._wait_for(lambda: self._websock_open, timeout=10)
# tell browser to send messages we're interested in
self.send_to_chrome(method='Network.enable')
self.send_to_chrome(method='Page.enable')
self.send_to_chrome(method='Console.enable')
self.send_to_chrome(method='Debugger.enable')
self.send_to_chrome(method='Runtime.enable')
# disable google analytics, see _handle_message() where breakpoint
# is caught Debugger.paused
self.send_to_chrome(
method='Debugger.setBreakpointByUrl',
params={
'lineNumber': 1,
'urlRegex': 'https?://www.google-analytics.com/analytics.js'})
def stop(self, *args):
if self._websock_thread:
if (self._websock and self._websock.sock
and self._websock.sock.connected):
self.logger.info('shutting down websocket connection')
try:
self._websock.close()
except BaseException as e:
self.logger.error(
'exception closing websocket %s - %s',
self._websock, e)
if self._websock_thread != threading.current_thread():
self._websock_thread.join(timeout=30)
if self._websock_thread.is_alive():
self.logger.error(
'%s still alive 30 seconds after closing %s, will '
'forcefully nudge it again', self._websock_thread,
self._websock)
self._websock.keep_running = False
self._websock_thread.join(timeout=30)
if self._websock_thread.is_alive():
self.logger.critical(
'%s still alive 60 seconds after closing %s',
self._websock_thread, self._websock)
def _on_message(self, websock, message):
try:
self._handle_message(websock, message)
except:
self.logger.error(
'uncaught exception in _handle_message message=%s',
message, exc_info=True)
def _handle_message(self, websock, json_message):
message = json.loads(json_message)
if 'method' in message:
if message['method'] == 'Page.loadEventFired':
self._got_page_load_event = datetime.datetime.utcnow()
elif message['method'] == 'Debugger.paused':
self._debugger_paused(message)
elif message['method'] == 'Console.messageAdded':
self.logger.debug(
'%s console.%s %s', self._websock.url,
message['params']['message']['level'],
message['params']['message']['text'])
# else:
# self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message:
if message['id'] in self._result_messages:
self._result_messages[message['id']] = message
# else:
# self.logger.debug("%s", json_message)
# else:
# self.logger.debug("%s", json_message)
def _debugger_paused(self, message):
# we hit the breakpoint set in start(), get rid of google analytics
self.logger.debug('debugger paused! message=%s', message)
scriptId = message['params']['callFrames'][0]['location']['scriptId']
# replace script
self.send_to_chrome(
method='Debugger.setScriptSource',
params={'scriptId': scriptId,
'scriptSource': 'console.log("google analytics is no more!");'})
# resume execution
self.send_to_chrome(method='Debugger.resume')
def send_to_chrome(self, suppress_logging=False, **kwargs):
msg_id = next(self._command_id)
kwargs['id'] = msg_id
msg = json.dumps(kwargs)
if not suppress_logging:
self.logger.debug('sending message to %s: %s', self._websock, msg)
self._websock.send(msg)
return msg_id
def navigate_to_page(
self, page_url, extra_headers=None, user_agent=None, timeout=300):
'''
'''
headers = extra_headers or {}
headers['Accept-Encoding'] = 'gzip, deflate'
headers['Accept-Encoding'] = 'identity'
self.send_to_chrome(
method='Network.setExtraHTTPHeaders',
params={'headers': headers})
@ -414,73 +418,62 @@ class BrowserController:
# navigate to the page!
self.logger.info('navigating to page %s', page_url)
self._got_page_load_event = None
self.websock_thread.got_page_load_event = None
self.send_to_chrome(method='Page.navigate', params={'url': page_url})
self._wait_for(lambda: self._got_page_load_event, timeout=timeout)
self._wait_for(
lambda: self.websock_thread.got_page_load_event,
timeout=timeout)
OUTLINKS_JS = r'''
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
__brzl_framesDone.add(frame);
if (frame && frame.document) {
var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) {
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
outlinks = outlinks.concat(
__brzl_compileOutlinks(frame.frames[i]));
}
}
}
return outlinks;
}
__brzl_compileOutlinks(window).join('\n');
'''
def extract_outlinks(self, timeout=60):
self.logger.info('extracting outlinks')
self._result_messages[self._command_id.peek_next()] = None
self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template(
'extract-outlinks.js').render()
msg_id = self.send_to_chrome(
method='Runtime.evaluate',
params={'expression': self.OUTLINKS_JS})
method='Runtime.evaluate', params={'expression': js})
self._wait_for(
lambda: self._result_messages.get(msg_id), timeout=timeout)
message = self._result_messages.pop(msg_id)
lambda: self.websock_thread.received_result(msg_id),
timeout=timeout)
message = self.websock_thread.pop_result(msg_id)
if message['result']['result']['value']:
return frozenset(message['result']['result']['value'].split('\n'))
else:
self._outlinks = frozenset()
return frozenset()
def screenshot(self, timeout=30):
self.logger.info('taking screenshot')
self._result_messages[self._command_id.peek_next()] = None
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(method='Page.captureScreenshot')
self._wait_for(
lambda: self._result_messages.get(msg_id), timeout=timeout)
message = self._result_messages.pop(msg_id)
lambda: self.websock_thread.received_result(msg_id),
timeout=timeout)
message = self.websock_thread.pop_result(msg_id)
jpeg_bytes = base64.b64decode(message['result']['data'])
return jpeg_bytes
def scroll_to_top(self, timeout=30):
self.logger.info('scrolling to top')
self._result_messages[self._command_id.peek_next()] = None
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(
method='Runtime.evaluate',
params={'expression': 'window.scrollTo(0, 0);'})
self._wait_for(
lambda: self._result_messages.get(msg_id), timeout=timeout)
self._result_messages.pop(msg_id)
lambda: self.websock_thread.received_result(msg_id),
timeout=timeout)
self.websock_thread.pop_result(msg_id)
def url(self, timeout=30):
'''
Returns value of document.URL from the browser.
'''
self._result_messages[self._command_id.peek_next()] = None
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(
method='Runtime.evaluate',
params={'expression': 'document.URL'})
self._wait_for(
lambda: self._result_messages.get(msg_id), timeout=timeout)
message = self._result_messages.pop(msg_id)
lambda: self.websock_thread.received_result(msg_id),
timeout=timeout)
message = self.websock_thread.pop_result(msg_id)
return message['result']['result']['value']
def run_behavior(self, behavior_script, timeout=900):
@ -498,14 +491,15 @@ __brzl_compileOutlinks(window).join('\n');
brozzler.sleep(7)
self._result_messages[self._command_id.peek_next()] = None
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(
method='Runtime.evaluate', suppress_logging=True,
params={'expression': 'umbraBehaviorFinished()'})
try:
self._wait_for(
lambda: self._result_messages.get(msg_id), timeout=5)
msg = self._result_messages.get(msg_id)
lambda: self.websock_thread.received_result(msg_id),
timeout=5)
msg = self.websock_thread.pop_result(msg_id)
if (msg and 'result' in msg
and not ('wasThrown' in msg['result']
and msg['result']['wasThrown'])
@ -517,4 +511,63 @@ __brzl_compileOutlinks(window).join('\n');
except BrowsingTimeout:
pass
def try_login(self, username, password, timeout=300):
try_login_js = brozzler.jinja2_environment().get_template(
'try-login.js.j2').render(
username=username, password=password)
self.websock_thread.got_page_load_event = None
self.send_to_chrome(
method='Runtime.evaluate', suppress_logging=True,
params={'expression': try_login_js})
# wait for tryLogin to finish trying (should be very very quick)
start = time.time()
while True:
self.websock_thread.expect_result(self._command_id.peek())
msg_id = self.send_to_chrome(
method='Runtime.evaluate',
params={'expression': 'try { __brzl_tryLoginState } catch (e) { "maybe-submitted-form" }'})
try:
self._wait_for(
lambda: self.websock_thread.received_result(msg_id),
timeout=5)
msg = self.websock_thread.pop_result(msg_id)
if (msg and 'result' in msg
and 'result' in msg['result']):
result = msg['result']['result']['value']
if result == 'login-form-not-found':
# we're done
return
elif result in ('submitted-form', 'maybe-submitted-form'):
# wait for page load event below
self.logger.info(
'submitted a login form, waiting for another '
'page load event')
break
# else try again to get __brzl_tryLoginState
except BrowsingTimeout:
pass
if time.time() - start > 30:
raise BrowsingException(
'timed out trying to check if tryLogin finished')
# if we get here, we submitted a form, now we wait for another page
# load event
self._wait_for(
lambda: self.websock_thread.got_page_load_event,
timeout=timeout)
class Counter:
def __init__(self):
self.next_value = 0
def __next__(self):
try:
return self.next_value
finally:
self.next_value += 1
def peek(self):
return self.next_value

View File

@ -126,6 +126,12 @@ def brozzle_page():
'json blob of parameters to populate the javascript behavior '
'template, e.g. {"parameter_username":"x",'
'"parameter_password":"y"}'))
arg_parser.add_argument(
'--username', dest='username', default=None,
help='use this username to try to log in if a login form is found')
arg_parser.add_argument(
'--password', dest='password', default=None,
help='use this password to try to log in if a login form is found')
arg_parser.add_argument(
'--proxy', dest='proxy', default=None,
help='http proxy')
@ -145,7 +151,8 @@ def brozzle_page():
site = brozzler.Site(
id=-1, seed=args.url, proxy=args.proxy,
enable_warcprox_features=args.enable_warcprox_features,
behavior_parameters=behavior_parameters)
behavior_parameters=behavior_parameters, username=args.username,
password=args.password)
page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker(frontier=None)
@ -230,6 +237,12 @@ def brozzler_new_site():
'json blob of parameters to populate the javascript behavior '
'template, e.g. {"parameter_username":"x",'
'"parameter_password":"y"}'))
arg_parser.add_argument(
'--username', dest='username', default=None,
help='use this username to try to log in if a login form is found')
arg_parser.add_argument(
'--password', dest='password', default=None,
help='use this password to try to log in if a login form is found')
_add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
@ -243,7 +256,8 @@ def brozzler_new_site():
warcprox_meta=json.loads(
args.warcprox_meta) if args.warcprox_meta else None,
behavior_parameters=json.loads(
args.behavior_parameters) if args.behavior_parameters else None)
args.behavior_parameters) if args.behavior_parameters else None,
username=args.username, password=args.password)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)

View File

@ -84,4 +84,10 @@ seeds:
type: url
required: true
username:
type: string
password:
type: string
<<: *multi_level_options
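A hypothetical job config fragment exercising the new seed-level fields (the URL and credentials are made up; the other keys follow the existing schema):

```yaml
id: login-example          # hypothetical job id
seeds:
  - url: https://example.com/
    username: test_username
    password: test_password
```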

View File

@ -0,0 +1,16 @@
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
__brzl_framesDone.add(frame);
if (frame && frame.document) {
var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) {
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
outlinks = outlinks.concat(
__brzl_compileOutlinks(frame.frames[i]));
}
}
}
return outlinks;
}
__brzl_compileOutlinks(window).join('\n');

View File

@ -39,8 +39,6 @@ var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
var UMBRA_FB_USER_NAME = "${parameter_username}";
var UMBRA_FB_PASSWORD = "${parameter_password}";
var umbraAlreadyClicked = {};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};
@ -172,15 +170,6 @@ var umbraIntervalFunc = function() {
}
}
var umbraFacebookLogin = function() {
var emailInput = document.querySelector("form#login_form input#email");
var passwordInput = document.querySelector("form#login_form input#pass");
var loginButton = document.querySelector("form#login_form label#loginbutton > input");
emailInput.value=UMBRA_FB_USER_NAME;
passwordInput.value=UMBRA_FB_PASSWORD;
loginButton.click();
}
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
// time, then we consider ourselves finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
@ -202,11 +191,4 @@ if (document.querySelector("div.captcha_interstitial") != null) { // found a cap
console.log("captcha found for " + location.href);
}
if (document.getElementById("login_form") == null || UMBRA_FB_USER_NAME.indexOf("parameter")>0 || UMBRA_FB_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters
console.log("missing #login_form or login credentials; maybe already logged in for " + location.href);
var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
}
else {//login
console.log("#login_form and credentials found for " + location.href);
umbraFacebookLogin();
}
var umbraIntervalId = setInterval(umbraIntervalFunc, 200);

View File

@ -26,11 +26,8 @@ var umbraBehavior = {
var mouseoveredSomething = false;
var somethingLeftBelow = false;
var somethingLeftAbove = false;
var cssSelector = "${mouseover_css_selector}";
var mouseoverUntilTimeout = "${mouseover_until_hard_timeout}";
//handle Python to JavaScript boolean conversion
mouseoverUntilTimeout == "True" ? mouseoverUntilTimeout = true : mouseoverUntilTimeout = false;
var cssSelector = {{mouseover_css_selector|json}};
var mouseoverUntilTimeout = {{mouseover_until_hard_timeout|json}};
var iframes = document.querySelectorAll("iframe");
var documents = Array(iframes.length + 1);

View File

@ -26,11 +26,8 @@ var umbraBehavior = {
var clickedSomething = false;
var somethingLeftBelow = false;
var somethingLeftAbove = false;
var cssSelector = "${click_css_selector}";
var clickUntilTimeout = "${click_until_hard_timeout}";
//handle Python to JavaScript boolean conversion
clickUntilTimeout == "True" ? clickUntilTimeout = true : clickUntilTimeout = false;
var cssSelector = {{click_css_selector|json}};
var clickUntilTimeout = {{click_until_hard_timeout|json}};
var iframes = document.querySelectorAll("iframe");
var documents = Array(iframes.length + 1);

View File

@ -0,0 +1,53 @@
var __brzl_tryLoginState = 'trying';
var __brzl_tryLogin = function() {
for (var i = 0; i < document.forms.length; i++) {
var form = document.forms[i];
if (form.method != 'post') {
continue;
}
var usernameField, passwordField;
for (var j = 0; j < form.elements.length; j++) {
var field = form.elements[j];
if (field.type == 'text' || field.type == 'email') {
if (!usernameField) {
usernameField = field;
} else {
usernameField = undefined;
break;
}
} else if (field.type == 'password') {
if (!passwordField) {
passwordField = field;
} else {
passwordField = undefined;
break;
}
} else if (field.type == 'textarea') {
usernameField = undefined;
passwordField = undefined;
break;
}
}
if (usernameField && passwordField) {
usernameField.value = {{username|json}};
passwordField.value = {{password|json}};
console.log('submitting username=' + usernameField.value
+ ' password=*** to detected login form');
try {
form.submit();
} catch (e) {
// "If a form control (such as a submit button) has a name or
// id of 'submit' it will mask the form's submit method." -MDN
// http://stackoverflow.com/a/2000021
var pseudoForm = document.createElement('form');
pseudoForm.submit.apply(form);
}
__brzl_tryLoginState = 'submitted-form';
return;
}
}
__brzl_tryLoginState = 'login-form-not-found';
};
__brzl_tryLogin();

View File

@ -95,7 +95,8 @@ class Site(brozzler.BaseDictable):
status="ACTIVE", claimed=False, start_time=None,
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
cookie_db=None, user_agent=None, behavior_parameters=None):
cookie_db=None, user_agent=None, behavior_parameters=None,
username=None, password=None):
self.seed = seed
self.id = id
@ -117,6 +118,8 @@ class Site(brozzler.BaseDictable):
self.cookie_db = cookie_db
self.user_agent = user_agent
self.behavior_parameters = behavior_parameters
self.username = username
self.password = password
self.scope = scope or {}
if not "surt" in self.scope:

View File

@ -277,6 +277,7 @@ class BrozzlerWorker:
final_page_url, outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(),
behavior_parameters=site.behavior_parameters,
username=site.username, password=site.password,
user_agent=site.user_agent,
on_screenshot=_on_screenshot)
if final_page_url != page.url:

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b8.dev142',
version='1.1b9.dev154',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -41,7 +41,8 @@ setuptools.setup(
license='Apache License 2.0',
packages=['brozzler', 'brozzler.dashboard'],
package_data={
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler': [
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
},
entry_points={
@ -69,6 +70,7 @@ setuptools.setup(
'rethinkdb>=2.3,<2.4',
'psutil==4.3.0',
'cerberus==1.0.1',
'jinja2',
],
extras_require={
'dashboard': ['flask>=0.11', 'gunicorn'],
@ -80,6 +82,7 @@ setuptools.setup(
'Environment :: Console',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Topic :: Internet :: WWW/HTTP',
'Topic :: System :: Archiving',
])

View File

@ -0,0 +1,102 @@
<html>
<head>
<title>brozzler login form test</title>
</head>
<body>
<form method='GET' action='01'>
<h3>this is a form with method=GET</h3>
<div>
<label for='01-user'>username</label>
<input id='01-user' type='text' name='user'>
</div>
<div>
<label for='01-password'>password</label>
<input id='01-password' type='password' name='password'>
</div>
<div><input type='submit'></div>
</form>
<form method='POST' action='02'>
<h3>this is a form with two password fields</h3>
<div>
<label for='02-user'>username</label>
<input id='02-user' type='text' name='user'>
</div>
<div>
<label for='02-password1'>password</label>
<input id='02-password1' type='password' name='password1'>
</div>
<div>
<label for='02-password2'>retype password</label>
<input id='02-password2' type='password' name='password2'>
</div>
<div><input type='submit'></div>
</form>
<form method='POST' action='03'>
<h3>this is a form with an extra text field</h3>
<div>
<label for='03-user'>username</label>
<input id='03-user' type='text' name='user'>
</div>
<div>
<label for='03-password'>password</label>
<input id='03-password' type='password' name='password'>
</div>
<div>
<label for='03-color'>favorite color</label>
<input id='03-color' type='text' name='color'>
</div>
<div><input type='submit'></div>
</form>
<form method='POST' action='04'>
<h3>this is a form with an extra textarea field</h3>
<div>
<label for='04-user'>username</label>
<input id='04-user' type='text' name='user'>
</div>
<div>
<label for='04-password'>password</label>
<input id='04-password' type='password' name='password'>
</div>
<div>
<label for='04-color'>favorite color</label>
<textarea id='04-color' name='color' rows='5' cols='40'>you have room for a very long favorite color here, dum dum!</textarea>
</div>
<div><input type='submit'></div>
</form>
<form method='POST' action='00'>
<h3>login with this form</h3>
<div>
<label for='00-user'>username</label>
<input id='00-user' type='text' name='user'>
</div>
<div>
<label for='00-password'>password</label>
<input id='00-password' type='password' name='password'>
</div>
<div>
radio button group with none checked
<label> <input type='radio' name='00-radio-unchecked' value='milk'> milk</label>
<label> <input type='radio' name='00-radio-unchecked' value='butter'> butter</label>
</div>
<div>
radio button group with one checked
<label><input type='radio' name='00-radio-checked' value='milk'> milk </label>
<label><input type='radio' name='00-radio-checked' value='butter' checked> butter</label>
</div>
<div>
checkboxes, one checked, one not
<label> <input type='checkbox' name='00-checkbox' value='milk'> milk</label>
<label> <input type='checkbox' name='00-checkbox' value='butter' checked> butter</label>
</div>
<input type='hidden' name='00-hidden' value='i-am-a-hidden-field-value'>
<div><input type='submit' name='00-submit' value='submit'></div>
<div><button name='00-button-button' type='button' value='button-button-value' >this is a &lt;button type='button'&gt;</button></div>
<div><button name='00-button-submit' type='submit' value='button-submit-value'>this is a &lt;button type='submit'&gt;</button></div>
</form>
</body>
</html>

tests/test_brozzling.py Normal file
View File

@ -0,0 +1,27 @@
#!/usr/bin/env python
'''
test_brozzling.py - XXX explain
Copyright (C) 2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import pytest
import brozzler
def test_aw_snap_hes_dead_jim():
chrome_exe = brozzler.suggest_default_chrome_exe()
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
with pytest.raises(brozzler.BrowsingException):
browser.browse_page('chrome://crash')

View File

@ -62,14 +62,13 @@ def test_httpd(httpd):
deduplication.
'''
payload1 = content2 = None
with urllib.request.urlopen(
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload1 = response.read()
assert payload1
with urllib.request.urlopen(
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload2 = response.read()
assert payload2
@ -101,13 +100,14 @@ def test_services_up():
def test_brozzle_site(httpd):
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
seed='http://localhost:%s/site1/' % httpd.server_port,
proxy='localhost:8000', enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
page1 = 'http://localhost:%s/site1/' % httpd.server_port
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
# so we can examine rethinkdb before it does anything
try:
@ -131,19 +131,18 @@ def test_brozzle_site(httpd):
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 3
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/robots.txt' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
'http://localhost:%s/site1/' % httpd.server_port,
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert robots in captures_by_url
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
@ -153,7 +152,7 @@ def test_brozzle_site(httpd):
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_warcprox_selection(httpd):
@ -163,11 +162,12 @@ def test_warcprox_selection(httpd):
test_id = 'test_warcprox_selection-%s' % datetime.datetime.utcnow().isoformat()
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/' % httpd.server_port
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
page1 = 'http://localhost:%s/site1/' % httpd.server_port
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
seed='http://localhost:%s/site1/' % httpd.server_port,
enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
@ -199,19 +199,18 @@ def test_warcprox_selection(httpd):
# check that we got the two pages we expected
pages = list(frontier.site_pages(site.id))
assert len(pages) == 3
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port,
'http://localhost:%s/robots.txt' % httpd.server_port,
'http://localhost:%s/file1.txt' % httpd.server_port}
'http://localhost:%s/site1/' % httpd.server_port,
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = r.table('captures').filter({'test_id':test_id}).run()
captures_by_url = {
c['url']:c for c in captures if c['http_method'] != 'HEAD'}
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert robots in captures_by_url
assert page1 in captures_by_url
assert '%srobots.txt' % page1 in captures_by_url
assert page2 in captures_by_url
assert 'screenshot:%s' % page1 in captures_by_url
assert 'thumbnail:%s' % page1 in captures_by_url
@ -221,14 +220,13 @@ def test_warcprox_selection(httpd):
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
expected_payload = open(os.path.join(
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
assert requests.get(
wb_url, allow_redirects=False).content == expected_payload
os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'), 'rb').read()
assert requests.get(wb_url).content == expected_payload
def test_obey_robots(httpd):
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(
seed='http://localhost:%s/' % httpd.server_port,
seed='http://localhost:%s/site1/' % httpd.server_port,
proxy='localhost:8000', enable_warcprox_features=True,
user_agent='im a badbot', # robots.txt blocks badbot
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
@ -256,11 +254,11 @@ def test_obey_robots(httpd):
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# check that we got the two pages we expected
# check that only the one page is in rethinkdb
pages = list(frontier.site_pages(site.id))
assert len(pages) == 1
assert {page.url for page in pages} == {
'http://localhost:%s/' % httpd.server_port}
'http://localhost:%s/site1/' % httpd.server_port}
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
@ -276,3 +274,44 @@ def test_obey_robots(httpd):
os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
assert requests.get(
wb_url, allow_redirects=False).content == expected_payload
def test_login(httpd):
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(
seed='http://localhost:%s/site2/' % httpd.server_port,
proxy='localhost:8000', enable_warcprox_features=True,
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}},
username='test_username', password='test_password')
r = rethinkstuff.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
# the site should be brozzled fairly quickly
start = time.time()
while site.status != 'FINISHED' and time.time() - start < 300:
time.sleep(0.5)
site = frontier.site(site.id)
assert site.status == 'FINISHED'
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
captures = list(r.table('captures').filter(
{'test_id':test_id}).order_by('timestamp').run())
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
# there are several forms in htdocs/site2/login.html but only one
# that brozzler's heuristic should match and try to submit, and it has
# action='00', so we can check for that here
assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url
# sanity check the rest of the crawl
assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url

View File

@ -69,7 +69,7 @@ def test_find_available_port():
sock = socket.socket()
sock.bind(('localhost', 9800))
sock.listen(0)
assert x._find_available_port(9800) == 9999
assert x._find_available_port(9800) >= 9990
sock.close()
assert x._find_available_port(9800) == 9800

View File

@ -16,4 +16,4 @@ vagrant ssh -- 'status warcprox ;
echo
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests $@"
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v -s /brozzler/tests $@"