refactor brozzler.hq.Site and brozzler.url.CrawlUrl into new brozzler.site package; fix bugs in robots.txt handling

Noah Levitt 2015-07-15 18:03:03 -07:00
parent a9c51edd84
commit f2bc7ec271
5 changed files with 131 additions and 125 deletions


@@ -1,5 +1,6 @@
-from brozzler.browser import Browser
-from brozzler.url import CrawlUrl
+from brozzler.browser import Browser, BrowserPool
+from brozzler.site import CrawlUrl, Site
+from brozzler.hq import BrozzlerHQ
 
 def _read_version():
     import os
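
With CrawlUrl and Site re-exported from the package __init__, callers can import both straight from brozzler instead of reaching into brozzler.url or brozzler.hq. A minimal sketch of the new import surface follows; the example.com URLs and the proxy address are placeholders, not part of this commit:

    # sketch: package-level imports after the refactor (URLs and proxy are illustrative)
    import brozzler

    site = brozzler.Site("http://example.com/", proxy="localhost:8000")
    url = brozzler.CrawlUrl("http://example.com/page.html", site_id=site.id, hops_from_seed=1)

    # is_permitted_by_robots() fetches and caches robots.txt, so this line hits the network
    if site.is_in_scope(url.url) and site.is_permitted_by_robots(url.url):
        print("fetch {} at priority {}".format(url.url, url.calc_priority()))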


@@ -1,90 +1,13 @@
 # vim: set sw=4 et:
-import surt
 import json
 import logging
-import urllib.robotparser
-import urllib.request
 import brozzler
 import sqlite3
 import time
 import kombu
 import kombu.simple
-
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotsFileParser(urllib.robotparser.RobotsFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotsFileParser."""
-
-    def __init__(self, proxy):
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL and feeds it to the parser."""
-        try:
-            request = urllib.request.Request(url)
-            if proxy:
-                request.set_proxy(proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.allow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
-
-class Site:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
-        self.seed = seed
-        self.id = id
-        self.proxy = proxy
-        self.ignore_robots = ignore_robots
-        if scope_surt:
-            self.scope_surt = scope_surt
-        else:
-            self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-        self._robots_cache = {}  # {robots_url:RobotsFileParser,...}
-
-    def is_permitted_by_robots(self, url):
-        return ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
-
-    def is_in_scope(self, url):
-        try:
-            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-            return surtt.startswith(self.scope_surt)
-        except:
-            self.logger.warn("""problem parsing url "{}" """.format(url))
-            return False
-
-    def to_dict(self):
-        return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
-
-    def to_json(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))
-
-    def _robots(robots_url):
-        if not robots_url in _robots_cache:
-            robots_txt = RobotFileParser(robots_url)
-            logging.info("fetching {}".format(robots_url))
-            try:
-                robots_txt.read()
-                _robots_cache[robots_url] = robots_txt
-            except BaseException as e:
-                logger.error("problem fetching {}".format(robots_url))
-        return _robots_cache[robots_url]
-
 class BrozzlerHQDb:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -155,7 +78,7 @@ class BrozzlerHQDb:
                 break
             site_dict = json.loads(row[1])
             site_dict["id"] = row[0]
-            yield brozzler.hq.Site(**site_dict)
+            yield brozzler.Site(**site_dict)
 
     def update_crawl_url(self, crawl_url):
         cursor = self._conn.cursor()
@@ -199,7 +122,7 @@ class BrozzlerHQ:
     def _new_site(self):
        try:
            msg = self._new_sites_q.get(block=False)
-            new_site = brozzler.hq.Site(**msg.payload)
+            new_site = brozzler.Site(**msg.payload)
            msg.ack()
            self.logger.info("new site {}".format(new_site))

brozzler/site.py (new file, 125 lines)

@@ -0,0 +1,125 @@
+# vim: set sw=4 et:
+
+import surt
+import json
+import logging
+import urllib.robotparser
+import urllib.request
+
+def robots_url(url):
+    hurl = surt.handyurl.parse(url)
+    hurl.path = "/robots.txt"
+    hurl.query = None
+    hurl.hash = None
+    return hurl.geturl()
+
+class RobotFileParser(urllib.robotparser.RobotFileParser):
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    """Adds support for fetching robots.txt through a proxy to
+    urllib.robotparser.RobotFileParser."""
+
+    def __init__(self, url="", proxy=None):
+        super(RobotFileParser, self).__init__(url)
+        self.proxy = proxy
+
+    def read(self):
+        """Reads the robots.txt URL and feeds it to the parser."""
+        try:
+            request = urllib.request.Request(self.url)
+            if self.proxy:
+                request.set_proxy(self.proxy, request.type)
+            f = urllib.request.urlopen(request)
+        except urllib.error.HTTPError as err:
+            if err.code in (401, 403):
+                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
+                self.disallow_all = True
+            elif err.code >= 400:
+                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
+                self.allow_all = True
+        except BaseException as err:
+            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
+            self.disallow_all = True
+        else:
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
+
+class Site:
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
+        self.seed = seed
+        self.id = id
+        self.proxy = proxy
+        self.ignore_robots = ignore_robots
+        if scope_surt:
+            self.scope_surt = scope_surt
+        else:
+            self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+        self._robots_cache = {}  # {robots_url:RobotFileParser,...}
+
+    def is_permitted_by_robots(self, url):
+        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+
+    def is_in_scope(self, url):
+        try:
+            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+            return surtt.startswith(self.scope_surt)
+        except:
+            self.logger.warn("""problem parsing url "{}" """.format(url))
+            return False
+
+    def to_dict(self):
+        return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
+
+    def _robots(self, robots_url):
+        if not robots_url in self._robots_cache:
+            robots_txt = RobotFileParser(robots_url, self.proxy)
+            self.logger.info("fetching {}".format(robots_url))
+            robots_txt.read()
+            self._robots_cache[robots_url] = robots_txt
+        return self._robots_cache[robots_url]
+
+class CrawlUrl:
+    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
+        self.id = id
+        self.site_id = site_id
+        self.url = url
+        self.hops_from_seed = hops_from_seed
+        self._canon_hurl = None
+        self.outlinks = outlinks
+
+    def __repr__(self):
+        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
+                self.url, self.site_id, self.hops_from_seed)
+
+    def calc_priority(self):
+        priority = 0
+        priority += max(0, 10 - self.hops_from_seed)
+        priority += max(0, 6 - self.canonical().count("/"))
+        return priority
+
+    def canonical(self):
+        if self._canon_hurl is None:
+            self._canon_hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
+        return self._canon_hurl.geturl()
+
+    def to_dict(self):
+        if self.outlinks is not None and not isinstance(self.outlinks, list):
+            outlinks = []
+            outlinks.extend(self.outlinks)
+        else:
+            outlinks = self.outlinks
+
+        return dict(id=self.id, site_id=self.site_id, url=self.url,
+                hops_from_seed=self.hops_from_seed, outlinks=outlinks)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
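
Compared with the version removed from hq.py, this new module is where the robots.txt fixes land: the class subclasses the correctly named urllib.robotparser.RobotFileParser and calls its constructor, read() uses self.url and self.proxy instead of undefined names, _robots() takes self and uses self._robots_cache and self.logger, and unexpected fetch errors are now caught in read() and disallow all. A rough usage sketch follows; the URLs, the proxy address, and the assumption that the canonicalized URL stays http://example.com/a/b are illustrative:

    # sketch: robots caching and URL prioritization (placeholders, not from the commit)
    from brozzler.site import CrawlUrl, Site, robots_url

    robots_url("http://example.com/a/b?c=d")                 # -> "http://example.com/robots.txt"

    site = Site("http://example.com/", proxy="localhost:8000")
    site.is_permitted_by_robots("http://example.com/a/b")    # fetches and caches robots.txt
    site.is_permitted_by_robots("http://example.com/c")      # served from site._robots_cache

    # calc_priority() = max(0, 10 - hops_from_seed) + max(0, 6 - "/" count of the canonical URL)
    CrawlUrl("http://example.com/a/b", hops_from_seed=2).calc_priority()   # 8 + 2 = 10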


@@ -1,43 +0,0 @@
-# vim: set sw=4 et:
-
-import surt
-import json
-
-class CrawlUrl:
-    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
-        self.id = id
-        self.site_id = site_id
-        self.url = url
-        self.hops_from_seed = hops_from_seed
-        self._canon_hurl = None
-        self.outlinks = outlinks
-
-    def __repr__(self):
-        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
-                self.url, self.site_id, self.hops_from_seed)
-
-    def calc_priority(self):
-        priority = 0
-        priority += max(0, 10 - self.hops_from_seed)
-        priority += max(0, 6 - self.canonical().count("/"))
-        return priority
-
-    def canonical(self):
-        if self._canon_hurl is None:
-            self._canon_hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
-        return self._canon_hurl.geturl()
-
-    def to_dict(self):
-        if self.outlinks is not None and not isinstance(self.outlinks, list):
-            outlinks = []
-            outlinks.extend(self.outlinks)
-        else:
-            outlinks = self.outlinks
-
-        return dict(id=self.id, site_id=self.site_id, url=self.url,
-                hops_from_seed=self.hops_from_seed, outlinks=outlinks)
-
-    def to_json(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))


@@ -150,7 +150,7 @@ class BrozzlerWorker:
         browser = self._browser_pool.acquire()
         try:
             msg = q.get(block=True, timeout=0.5)
-            site = brozzler.hq.Site(**msg.payload)
+            site = brozzler.Site(**msg.payload)
             msg.ack() # XXX ack only after browsing finished? kinda complicated
             logging.info("browsing site {}".format(site))
             th = threading.Thread(target=lambda: self._brozzle_site(browser, site),
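
Both the HQ and the worker rebuild sites from queue payloads with brozzler.Site(**msg.payload). If the payload is built with Site.to_dict()/to_json(), it carries only id, seed, and scope_surt, so a site survives the round trip with its scope intact but with proxy and ignore_robots falling back to defaults. A small sketch of that round trip under that assumption, with no broker involved and an illustrative seed:

    # sketch: dict/JSON round trip matching the Site(**payload) pattern above
    import json
    from brozzler.site import Site

    site = Site("http://example.com/")           # illustrative seed
    payload = json.loads(site.to_json())         # keys: id, seed, scope_surt
    copy = Site(**payload)                       # same shape as brozzler.Site(**msg.payload)

    assert copy.scope_surt == site.scope_surt
    # proxy and ignore_robots are not in to_dict(), so the copy gets their default values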