diff --git a/bin/crawl-url b/bin/crawl-url
index 17c6d72..b83d202 100755
--- a/bin/crawl-url
+++ b/bin/crawl-url
@@ -6,6 +6,7 @@ import os
 import sys
 import logging
 import umbra
+import umbra.frontier
 
 arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
         description='browse-url - open urls in chrome/chromium and run behaviors',
@@ -24,16 +25,20 @@ args = arg_parser.parse_args(args=sys.argv[1:])
 logging.basicConfig(stream=sys.stdout, level=args.log_level,
         format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
 
-frontier = list(args.urls)
+frontier = umbra.frontier.Frontier()
+for url in args.urls:
+    frontier.schedule(umbra.frontier.CrawlUrl(url, priority=1000))
 
-def add_to_frontier(urls):
+def frontier_schedule(urls):
     logging.info("adding {} urls to frontier".format(len(urls)))
-    frontier.extend(urls)
+    for url in urls:
+        frontier.schedule(umbra.frontier.CrawlUrl(url))
 
 with umbra.Browser(chrome_exe=args.chrome_exe) as browser:
     try:
         while True:
-            browser.browse_page(frontier.pop(), on_outlinks=add_to_frontier)
+            crawl_url = frontier.pop()
+            browser.browse_page(crawl_url.url, on_outlinks=frontier_schedule)
     except IndexError:
         logging.info("finished, frontier is empty")
 
diff --git a/setup.py b/setup.py
index 258e962..6c78b07 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ setuptools.setup(name='umbra',
         license='Apache License 2.0',
         packages=['umbra'],
         package_data={'umbra':['behaviors.d/*.js*', 'behaviors.yaml', 'version.txt']},
-        install_requires=['kombu', 'websocket-client-py3==0.13.1', 'argparse', 'PyYAML'],
+        install_requires=['kombu', 'websocket-client-py3==0.13.1', 'argparse', 'PyYAML', 'sortedcontainers'],
         scripts=glob.glob('bin/*'),
         zip_safe=False,
         classifiers=[
diff --git a/umbra/frontier.py b/umbra/frontier.py
new file mode 100644
index 0000000..52cdfe5
--- /dev/null
+++ b/umbra/frontier.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+# vim: set sw=4 et:
+
+import logging
+import sys
+import urllib.parse
+import sortedcontainers
+
+class CrawlUrl:
+    def __init__(self, url, priority=1):
+        self.url = url
+        self.set_priority(priority)
+        self._netloc = None
+
+    def set_priority(self, priority):
+        # priority_key is both a sortable priority (higher value is higher
+        # priority) and a unique hash key
+        self.priority_key = (priority << 32) | (hash(self.url) & (2**32 - 1))
+
+    def get_priority(self):
+        return self.priority_key >> 32
+
+    @property
+    def host(self):
+        if self._netloc is None:
+            self._netloc = urllib.parse.urlsplit(self.url)[1]
+        return self._netloc
+
+class Frontier:
+    def __init__(self):
+        # {url:CrawlUrl}
+        self.urls = {}
+
+        # {host:SortedDict{priority_key:CrawlUrl}}
+        self.queues_by_host = {}
+
+    def schedule(self, crawl_url):
+        try:
+            old_priority_key = self.urls.pop(crawl_url.url).priority_key
+            old_crawl_url = self.queues_by_host[crawl_url.host].pop(old_priority_key)
+
+            # XXX very dumb calculation of new priority, probably doesn't
+            # belong here
+            crawl_url.set_priority(crawl_url.get_priority() + old_crawl_url.get_priority())
+        except KeyError:
+            pass
+
+        self.urls[crawl_url.url] = crawl_url
+        if crawl_url.host not in self.queues_by_host:
+            self.queues_by_host[crawl_url.host] = sortedcontainers.SortedDict()
+        self.queues_by_host[crawl_url.host][crawl_url.priority_key] = crawl_url
+
+    def pop(self, host=None):
+        # an empty frontier raises IndexError, which bin/crawl-url catches
+        if not self.queues_by_host:
+            raise IndexError("frontier is empty")
+        if not host or host not in self.queues_by_host:
+            # XXX should prioritize queues, this picks one at random
+            for h in self.queues_by_host:
+                host = h
+                break
+
+        # pop the url with the largest priority_key, i.e. the highest priority
+        result = self.queues_by_host[host].popitem(last=True)[1]
+        if len(self.queues_by_host[host]) == 0:
+            del self.queues_by_host[host]
+
+        result2 = self.urls.pop(result.url)
+        assert result2 is result
+
+        return result
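
For review, a rough sketch of how the new classes are meant to be driven
(mirroring bin/crawl-url; the urls here are made up for illustration):

    import umbra.frontier

    frontier = umbra.frontier.Frontier()

    # seed urls get a high priority, as in bin/crawl-url
    frontier.schedule(umbra.frontier.CrawlUrl('http://example.com/', priority=1000))

    # outlinks default to priority=1; scheduling the same url twice merges
    # the priorities (see Frontier.schedule), so this one ends up at 2
    frontier.schedule(umbra.frontier.CrawlUrl('http://example.com/about'))
    frontier.schedule(umbra.frontier.CrawlUrl('http://example.com/about'))

    try:
        while True:
            # pops the highest priority url: first the seed, then /about
            crawl_url = frontier.pop()
            print(crawl_url.host, crawl_url.url, crawl_url.get_priority())
    except IndexError:
        pass  # frontier exhausted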