mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-21 05:14:22 -04:00
rudimentary link extraction and crawling
This commit is contained in:
parent
d8a962b29e
commit
4042f22497
3 changed files with 74 additions and 18 deletions
39
bin/crawl-url
Executable file
39
bin/crawl-url
Executable file
|
@ -0,0 +1,39 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
# vim: set sw=4 et:
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import logging
|
||||||
|
import umbra
|
||||||
|
|
||||||
|
# URLs queued for browsing. Consumed from the end, so the most recently
# queued URL is browsed first.
frontier = []

# Every URL that has ever been queued. Without this record, two pages that
# link to each other would keep re-queueing one another forever.
seen_urls = set()


def build_arg_parser():
    """Return the argparse parser describing the crawl-url command line."""
    arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
            description='crawl-url - open urls in chrome/chromium, run behaviors, and follow outlinks',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('urls', metavar='URL', nargs='+', help='seed URL(s) to crawl')
    # NOTE: --browser-wait is accepted for command-line compatibility, but
    # nothing in this script reads args.browser_wait.
    arg_parser.add_argument('-w', '--browser-wait', dest='browser_wait', default='60',
            help='seconds to wait for browser initialization')
    arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
            help='executable to use to invoke chrome')
    arg_parser.add_argument('-v', '--verbose', dest='log_level',
            action="store_const", default=logging.INFO, const=logging.DEBUG)
    arg_parser.add_argument('--version', action='version',
            version="umbra {} - {}".format(umbra.version, os.path.basename(__file__)))
    return arg_parser


def add_to_frontier(urls):
    """Queue every url in `urls` that has not been queued before.

    Used both for the seed urls and as the `on_outlinks` callback passed to
    `browse_page`. Empty strings are skipped: splitting the outlinks string
    of a page without links on " " yields [''], which is not a url.
    """
    added = 0
    for url in urls:
        if url and url not in seen_urls:
            seen_urls.add(url)
            frontier.append(url)
            added += 1
    logging.info("adding {} urls to frontier".format(added))


def main(argv=None):
    """Crawl from the seed urls until the frontier is exhausted.

    `argv` is the argument list without the program name; it defaults to
    sys.argv[1:].
    """
    args = build_arg_parser().parse_args(args=sys.argv[1:] if argv is None else argv)

    logging.basicConfig(stream=sys.stdout, level=args.log_level,
            format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')

    add_to_frontier(args.urls)

    with umbra.Browser(chrome_exe=args.chrome_exe) as browser:
        # Loop on the frontier itself instead of catching IndexError around
        # browse_page(): an IndexError raised while browsing must not be
        # mistaken for "the frontier is empty".
        while frontier:
            browser.browse_page(frontier.pop(), on_outlinks=add_to_frontier)
        logging.info("finished, frontier is empty")


if __name__ == "__main__":
    main()
|
||||||
|
|
|
@ -62,6 +62,8 @@ class Behavior:
|
||||||
self.logger.warn("no behavior to run on {}".format(self.url))
|
self.logger.warn("no behavior to run on {}".format(self.url))
|
||||||
|
|
||||||
def is_finished(self):
|
def is_finished(self):
|
||||||
|
"""Asynchronously asks behavior if it is finished, and in the mean time
|
||||||
|
returns the response from the previous such query."""
|
||||||
msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
|
msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
|
||||||
suppress_logging=True, params={"expression":"umbraBehaviorFinished()"})
|
suppress_logging=True, params={"expression":"umbraBehaviorFinished()"})
|
||||||
self.waiting_result_msg_ids.append(msg_id)
|
self.waiting_result_msg_ids.append(msg_id)
|
||||||
|
|
|
@ -14,6 +14,7 @@ import tempfile
|
||||||
import os
|
import os
|
||||||
import socket
|
import socket
|
||||||
import base64
|
import base64
|
||||||
|
import random
|
||||||
from umbra.behaviors import Behavior
|
from umbra.behaviors import Behavior
|
||||||
|
|
||||||
class BrowserPool:
|
class BrowserPool:
|
||||||
|
@ -95,21 +96,25 @@ class Browser:
|
||||||
def abort_browse_page(self):
|
def abort_browse_page(self):
|
||||||
self._abort_browse_page = True
|
self._abort_browse_page = True
|
||||||
|
|
||||||
def browse_page(self, url, on_request=None, on_screenshot=None):
|
def browse_page(self, url, on_request=None, on_screenshot=None, on_outlinks=None):
|
||||||
"""Synchronously browses a page and runs behaviors.
|
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
|
||||||
|
|
||||||
Raises BrowsingException if browsing the page fails in a non-critical
|
Raises BrowsingException if browsing the page fails in a non-critical
|
||||||
way.
|
way.
|
||||||
"""
|
"""
|
||||||
self.url = url
|
self.url = url
|
||||||
self.on_request = on_request
|
self.on_request = on_request
|
||||||
|
|
||||||
self.on_screenshot = on_screenshot
|
self.on_screenshot = on_screenshot
|
||||||
self._waiting_on_screenshot_msg_id = None
|
self._waiting_on_screenshot_msg_id = None
|
||||||
|
|
||||||
|
self.on_outlinks = on_outlinks
|
||||||
|
self._waiting_on_outlinks_msg_id = None
|
||||||
|
self._got_outlinks = False
|
||||||
|
|
||||||
self._websock = websocket.WebSocketApp(self._websocket_url,
|
self._websock = websocket.WebSocketApp(self._websocket_url,
|
||||||
on_open=self._visit_page, on_message=self._handle_message)
|
on_open=self._visit_page, on_message=self._handle_message)
|
||||||
|
|
||||||
import random
|
|
||||||
threadName = "WebsockThread{}-{}".format(self.chrome_port,
|
threadName = "WebsockThread{}-{}".format(self.chrome_port,
|
||||||
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
''.join((random.choice('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789') for _ in range(6))))
|
||||||
websock_thread = threading.Thread(target=self._websock.run_forever, name=threadName, kwargs={'ping_timeout':0.5})
|
websock_thread = threading.Thread(target=self._websock.run_forever, name=threadName, kwargs={'ping_timeout':0.5})
|
||||||
|
@ -120,19 +125,9 @@ class Browser:
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
if not self._websock or not self._websock.sock or not self._websock.sock.connected:
|
if self._browse_interval_func():
|
||||||
raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))
|
|
||||||
elif time.time() - self._start > Browser.HARD_TIMEOUT_SECONDS:
|
|
||||||
self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(Browser.HARD_TIMEOUT_SECONDS, self.url))
|
|
||||||
return
|
return
|
||||||
elif self._behavior != None and self._behavior.is_finished():
|
|
||||||
self.logger.info("finished browsing page according to behavior url={}".format(self.url))
|
|
||||||
return
|
|
||||||
elif self._abort_browse_page:
|
|
||||||
raise BrowsingException("browsing page aborted")
|
|
||||||
finally:
|
finally:
|
||||||
self.capture_screenshot()
|
|
||||||
|
|
||||||
if self._websock and self._websock.sock and self._websock.sock.connected:
|
if self._websock and self._websock.sock and self._websock.sock.connected:
|
||||||
try:
|
try:
|
||||||
self._websock.close()
|
self._websock.close()
|
||||||
|
@ -149,9 +144,24 @@ class Browser:
|
||||||
|
|
||||||
self._behavior = None
|
self._behavior = None
|
||||||
|
|
||||||
|
def _browse_interval_func(self):
    """Poll the page once and report whether browsing is finished.

    Returns True when the behavior has finished and the outlinks have been
    received, or when the hard timeout has elapsed; returns False when the
    caller should keep polling.

    Raises BrowsingException if the websocket is no longer connected or if
    an abort has been requested.
    """
    if not self._websock or not self._websock.sock or not self._websock.sock.connected:
        raise BrowsingException("websocket closed, did chrome die? {}".format(self._websocket_url))

    if self._behavior is not None and self._behavior.is_finished():
        if self._got_outlinks:
            self.logger.info("got outlinks, finished url={}".format(self.url))
            return True
        if not self._waiting_on_outlinks_msg_id:
            self.logger.info("finished browsing page according to behavior, retrieving outlinks url={}".format(self.url))
            self._waiting_on_outlinks_msg_id = self.send_to_chrome(method="Runtime.evaluate",
                    params={"expression":"Array.prototype.slice.call(document.querySelectorAll('a[href]')).join(' ')"})

    # Deliberately not part of an elif chain with the behavior check above:
    # while the outlinks reply is still outstanding, the hard timeout and an
    # abort request must still be able to end the polling loop.
    if time.time() - self._start > self.HARD_TIMEOUT_SECONDS:
        self.logger.info("finished browsing page, reached hard timeout of {} seconds url={}".format(self.HARD_TIMEOUT_SECONDS, self.url))
        return True

    if self._abort_browse_page:
        raise BrowsingException("browsing page aborted")

    return False
|
||||||
|
|
||||||
def send_to_chrome(self, suppress_logging=False, **kwargs):
|
def send_to_chrome(self, suppress_logging=False, **kwargs):
|
||||||
msg_id = next(self.command_id)
|
msg_id = next(self.command_id)
|
||||||
|
@ -196,7 +206,6 @@ class Browser:
|
||||||
elif "method" in message and message["method"] == "Debugger.paused":
|
elif "method" in message and message["method"] == "Debugger.paused":
|
||||||
# We hit the breakpoint set in visit_page. Get rid of google
|
# We hit the breakpoint set in visit_page. Get rid of google
|
||||||
# analytics script!
|
# analytics script!
|
||||||
|
|
||||||
self.logger.debug("debugger paused! message={}".format(message))
|
self.logger.debug("debugger paused! message={}".format(message))
|
||||||
scriptId = message['params']['callFrames'][0]['location']['scriptId']
|
scriptId = message['params']['callFrames'][0]['location']['scriptId']
|
||||||
|
|
||||||
|
@ -214,6 +223,12 @@ class Browser:
|
||||||
self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url))
|
self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url))
|
||||||
self._behavior = Behavior(self.url, self)
|
self._behavior = Behavior(self.url, self)
|
||||||
self._behavior.start()
|
self._behavior.start()
|
||||||
|
elif message["id"] == self._waiting_on_outlinks_msg_id:
|
||||||
|
self.logger.debug("got outlinks message={}".format(message))
|
||||||
|
self._got_outlinks = True
|
||||||
|
# {'result': {'wasThrown': False, 'result': {'value': 'https://archive-it.org/cgi-bin/dedup-test/change_every_second https://archive-it.org/cgi-bin/dedup-test/change_every_minute https://archive-it.org/cgi-bin/dedup-test/change_every_10minutes https://archive-it.org/cgi-bin/dedup-test/change_every_hour https://archive-it.org/cgi-bin/dedup-test/change_every_day https://archive-it.org/cgi-bin/dedup-test/change_every_month https://archive-it.org/cgi-bin/dedup-test/change_every_year https://archive-it.org/cgi-bin/dedup-test/change_never http://validator.w3.org/check?uri=referer', 'type': 'string'}}, 'id': 32}
|
||||||
|
if self.on_outlinks:
|
||||||
|
self.on_outlinks(frozenset(message["result"]["result"]["value"].split(" ")))
|
||||||
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
|
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
|
||||||
self._behavior.notify_of_result(message)
|
self._behavior.notify_of_result(message)
|
||||||
# elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"):
|
# elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"):
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue