diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 1a0d627..a96eefa 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -1,5 +1,6 @@
-from brozzler.browser import Browser
-from brozzler.url import CrawlUrl
+from brozzler.browser import Browser, BrowserPool
+from brozzler.site import CrawlUrl, Site
+from brozzler.hq import BrozzlerHQ
 
 def _read_version():
     import os
diff --git a/brozzler/hq.py b/brozzler/hq.py
index 64b3f00..3b29aae 100644
--- a/brozzler/hq.py
+++ b/brozzler/hq.py
@@ -1,90 +1,13 @@
 # vim: set sw=4 et:
 
-import surt
 import json
 import logging
-import urllib.robotparser
-import urllib.request
 import brozzler
 import sqlite3
 import time
 import kombu
 import kombu.simple
 
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotsFileParser(urllib.robotparser.RobotsFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotsFileParser."""
-    def __init__(self, proxy):
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL and feeds it to the parser."""
-        try:
-            request = urllib.request.Request(url)
-            if proxy:
-                request.set_proxy(proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.allow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
-
-class Site:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
-        self.seed = seed
-        self.id = id
-        self.proxy = proxy
-        self.ignore_robots = ignore_robots
-
-        if scope_surt:
-            self.scope_surt = scope_surt
-        else:
-            self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-
-        self._robots_cache = {} # {robots_url:RobotsFileParser,...}
-
-    def is_permitted_by_robots(self, url):
-        return ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
-
-    def is_in_scope(self, url):
-        try:
-            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-            return surtt.startswith(self.scope_surt)
-        except:
-            self.logger.warn("""problem parsing url "{}" """.format(url))
-            return False
-
-    def to_dict(self):
-        return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
-
-    def to_json(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))
-
-    def _robots(robots_url):
-        if not robots_url in _robots_cache:
-            robots_txt = RobotFileParser(robots_url)
-            logging.info("fetching {}".format(robots_url))
-            try:
-                robots_txt.read()
-                _robots_cache[robots_url] = robots_txt
-            except BaseException as e:
-                logger.error("problem fetching {}".format(robots_url))
-
-        return _robots_cache[robots_url]
-
 class BrozzlerHQDb:
     logger = logging.getLogger(__module__ + "." + __qualname__)
 
@@ -155,7 +78,7 @@ class BrozzlerHQDb:
                 break
             site_dict = json.loads(row[1])
             site_dict["id"] = row[0]
-            yield brozzler.hq.Site(**site_dict)
+            yield brozzler.Site(**site_dict)
 
     def update_crawl_url(self, crawl_url):
         cursor = self._conn.cursor()
@@ -199,7 +122,7 @@ class BrozzlerHQ:
     def _new_site(self):
         try:
             msg = self._new_sites_q.get(block=False)
-            new_site = brozzler.hq.Site(**msg.payload)
+            new_site = brozzler.Site(**msg.payload)
             msg.ack()
 
             self.logger.info("new site {}".format(new_site))
diff --git a/brozzler/site.py b/brozzler/site.py
new file mode 100644
index 0000000..0fb4f8b
--- /dev/null
+++ b/brozzler/site.py
@@ -0,0 +1,125 @@
+# vim: set sw=4 et:
+
+import surt
+import json
+import logging
+import urllib.robotparser
+import urllib.request
+
+def robots_url(url):
+    hurl = surt.handyurl.parse(url)
+    hurl.path = "/robots.txt"
+    hurl.query = None
+    hurl.hash = None
+    return hurl.geturl()
+
+class RobotFileParser(urllib.robotparser.RobotFileParser):
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    """Adds support for fetching robots.txt through a proxy to
+    urllib.robotparser.RobotFileParser."""
+    def __init__(self, url="", proxy=None):
+        super(RobotFileParser, self).__init__(url)
+        self.proxy = proxy
+
+    def read(self):
+        """Reads the robots.txt URL and feeds it to the parser."""
+        try:
+            request = urllib.request.Request(self.url)
+            if self.proxy:
+                request.set_proxy(self.proxy, request.type)
+            f = urllib.request.urlopen(request)
+        except urllib.error.HTTPError as err:
+            if err.code in (401, 403):
+                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
+                self.disallow_all = True
+            elif err.code >= 400:
+                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
+                self.allow_all = True
+        except BaseException as err:
+            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
+            self.disallow_all = True
+        else:
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
+
+class Site:
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
+        self.seed = seed
+        self.id = id
+        self.proxy = proxy
+        self.ignore_robots = ignore_robots
+
+        if scope_surt:
+            self.scope_surt = scope_surt
+        else:
+            self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+
+        self._robots_cache = {} # {robots_url:RobotFileParser,...}
+
+    def is_permitted_by_robots(self, url):
+        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+
+    def is_in_scope(self, url):
+        try:
+            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+            return surtt.startswith(self.scope_surt)
+        except:
+            self.logger.warn("""problem parsing url "{}" """.format(url))
+            return False
+
+    def to_dict(self):
+        return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
+
+    def _robots(self, robots_url):
+        if not robots_url in self._robots_cache:
+            robots_txt = RobotFileParser(robots_url, self.proxy)
+            self.logger.info("fetching {}".format(robots_url))
+            robots_txt.read()
+            self._robots_cache[robots_url] = robots_txt
+
+        return self._robots_cache[robots_url]
+
+class CrawlUrl:
+    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
+        self.id = id
+        self.site_id = site_id
+        self.url = url
+        self.hops_from_seed = hops_from_seed
+        self._canon_hurl = None
+        self.outlinks = outlinks
+
+    def __repr__(self):
+        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
+                self.url, self.site_id, self.hops_from_seed)
+
+    def calc_priority(self):
+        priority = 0
+        priority += max(0, 10 - self.hops_from_seed)
+        priority += max(0, 6 - self.canonical().count("/"))
+        return priority
+
+    def canonical(self):
+        if self._canon_hurl is None:
+            self._canon_hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
+        return self._canon_hurl.geturl()
+
+    def to_dict(self):
+        if self.outlinks is not None and not isinstance(self.outlinks, list):
+            outlinks = []
+            outlinks.extend(self.outlinks)
+        else:
+            outlinks = self.outlinks
+
+        return dict(id=self.id, site_id=self.site_id, url=self.url,
+                hops_from_seed=self.hops_from_seed, outlinks=outlinks)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
+
diff --git a/brozzler/url.py b/brozzler/url.py
deleted file mode 100644
index 81ae4a6..0000000
--- a/brozzler/url.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# vim: set sw=4 et:
-
-import surt
-import json
-
-class CrawlUrl:
-    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
-        self.id = id
-        self.site_id = site_id
-        self.url = url
-        self.hops_from_seed = hops_from_seed
-        self._canon_hurl = None
-        self.outlinks = outlinks
-
-    def __repr__(self):
-        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
-                self.url, self.site_id, self.hops_from_seed)
-
-    def calc_priority(self):
-        priority = 0
-        priority += max(0, 10 - self.hops_from_seed)
-        priority += max(0, 6 - self.canonical().count("/"))
-        return priority
-
-    def canonical(self):
-        if self._canon_hurl is None:
-            self._canon_hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
-        return self._canon_hurl.geturl()
-
-    def to_dict(self):
-        if self.outlinks is not None and not isinstance(self.outlinks, list):
-            outlinks = []
-            outlinks.extend(self.outlinks)
-        else:
-            outlinks = self.outlinks
-
-        return dict(id=self.id, site_id=self.site_id, url=self.url,
-                hops_from_seed=self.hops_from_seed, outlinks=outlinks)
-
-    def to_json(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))
-
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 52e383a..e913731 100755
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -150,7 +150,7 @@ class BrozzlerWorker:
                 browser = self._browser_pool.acquire()
                 try:
                     msg = q.get(block=True, timeout=0.5)
-                    site = brozzler.hq.Site(**msg.payload)
+                    site = brozzler.Site(**msg.payload)
                     msg.ack() # XXX ack only after browsing finished? kinda complicated
                     logging.info("browsing site {}".format(site))
                     th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
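
A minimal usage sketch of the relocated classes, for trying the change locally. It is not part of the patch: the example.com URLs are placeholders, the surt library is assumed to be installed, and ignore_robots=True is passed so that no robots.txt is actually fetched.

    import brozzler

    site = brozzler.Site("http://example.com/", ignore_robots=True)
    url = brozzler.CrawlUrl("http://example.com/some/page.html",
            site_id=site.id, hops_from_seed=1)

    # scope_surt is derived from the seed; the robots check short-circuits
    # because ignore_robots=True
    if site.is_in_scope(url.url) and site.is_permitted_by_robots(url.url):
        print(url.canonical())      # canonicalized url via surt's GoogleURLCanonicalizer
        print(url.calc_priority())  # hops-from-seed bonus plus path-depth bonus
        print(url.to_json())        # compact JSON serialization of the url

    # hq.py and worker.py rebuild sites from queue/db payloads the same way:
    # brozzler.Site(**json.loads(site.to_json()))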