Mirror of https://github.com/internetarchive/brozzler.git (synced 2025-08-03 12:06:28 -04:00)

commit f2bc7ec271 (parent a9c51edd84)

    refactor brozzler.hq.Site and brozzler.url.CrawlUrl into new brozzler.site package; fix bugs in robots.txt handling

5 changed files with 131 additions and 125 deletions
brozzler/__init__.py
@@ -1,5 +1,6 @@
-from brozzler.browser import Browser
-from brozzler.url import CrawlUrl
+from brozzler.browser import Browser, BrowserPool
+from brozzler.site import CrawlUrl, Site
+from brozzler.hq import BrozzlerHQ
 
 def _read_version():
     import os
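With brozzler/__init__.py now re-exporting from brozzler.site, callers reach Site and CrawlUrl from the package root, which is what the hq.py and worker.py hunks below switch to. A minimal sketch (the seed URL is illustrative, not from this commit):

    import brozzler

    # Site and CrawlUrl now live in brozzler.site but are re-exported here
    site = brozzler.Site("http://example.com/")
    url = brozzler.CrawlUrl("http://example.com/page.html", hops_from_seed=1)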
brozzler/hq.py
@@ -1,90 +1,13 @@
 # vim: set sw=4 et:
 
-import surt
 import json
 import logging
-import urllib.robotparser
-import urllib.request
 import brozzler
 import sqlite3
 import time
 import kombu
 import kombu.simple
 
-def robots_url(url):
-    hurl = surt.handyurl.parse(url)
-    hurl.path = "/robots.txt"
-    hurl.query = None
-    hurl.hash = None
-    return hurl.geturl()
-
-class RobotsFileParser(urllib.robotparser.RobotsFileParser):
-    """Adds support for fetching robots.txt through a proxy to
-    urllib.robotparser.RobotsFileParser."""
-
-    def __init__(self, proxy):
-        self.proxy = proxy
-
-    def read(self):
-        """Reads the robots.txt URL and feeds it to the parser."""
-        try:
-            request = urllib.request.Request(url)
-            if proxy:
-                request.set_proxy(proxy, request.type)
-            f = urllib.request.urlopen(request)
-        except urllib.error.HTTPError as err:
-            if err.code in (401, 403):
-                self.disallow_all = True
-            elif err.code >= 400:
-                self.allow_all = True
-        else:
-            raw = f.read()
-            self.parse(raw.decode("utf-8").splitlines())
-
-class Site:
-    logger = logging.getLogger(__module__ + "." + __qualname__)
-
-    def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
-        self.seed = seed
-        self.id = id
-        self.proxy = proxy
-        self.ignore_robots = ignore_robots
-
-        if scope_surt:
-            self.scope_surt = scope_surt
-        else:
-            self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-
-        self._robots_cache = {} # {robots_url:RobotsFileParser,...}
-
-    def is_permitted_by_robots(self, url):
-        return ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
-
-    def is_in_scope(self, url):
-        try:
-            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-            return surtt.startswith(self.scope_surt)
-        except:
-            self.logger.warn("""problem parsing url "{}" """.format(url))
-            return False
-
-    def to_dict(self):
-        return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
-
-    def to_json(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))
-
-    def _robots(robots_url):
-        if not robots_url in _robots_cache:
-            robots_txt = RobotFileParser(robots_url)
-            logging.info("fetching {}".format(robots_url))
-            try:
-                robots_txt.read()
-                _robots_cache[robots_url] = robots_txt
-            except BaseException as e:
-                logger.error("problem fetching {}".format(robots_url))
-
-        return _robots_cache[robots_url]
-
 class BrozzlerHQDb:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -155,7 +78,7 @@ class BrozzlerHQDb:
                 break
             site_dict = json.loads(row[1])
             site_dict["id"] = row[0]
-            yield brozzler.hq.Site(**site_dict)
+            yield brozzler.Site(**site_dict)
 
     def update_crawl_url(self, crawl_url):
        cursor = self._conn.cursor()
@@ -199,7 +122,7 @@ class BrozzlerHQ:
     def _new_site(self):
         try:
             msg = self._new_sites_q.get(block=False)
-            new_site = brozzler.hq.Site(**msg.payload)
+            new_site = brozzler.Site(**msg.payload)
             msg.ack()
 
             self.logger.info("new site {}".format(new_site))
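The robots.txt logic removed above reappears, fixed, in brozzler/site.py below: the old version referenced the undefined names url, proxy, and _robots_cache, omitted self on _robots, and never called the superclass constructor. For orientation, robots_url() simply maps any URL to its site's robots.txt location; a sketch of the expected behavior, assuming surt's handyurl round-trips the scheme and host unchanged:

    from brozzler.site import robots_url

    # path is replaced, query and fragment are dropped
    robots_url("http://example.com/a/b?c=d#e")  # -> "http://example.com/robots.txt"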
brozzler/site.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+# vim: set sw=4 et:
+
+import surt
+import json
+import logging
+import urllib.robotparser
+import urllib.request
+
+def robots_url(url):
+    hurl = surt.handyurl.parse(url)
+    hurl.path = "/robots.txt"
+    hurl.query = None
+    hurl.hash = None
+    return hurl.geturl()
+
+class RobotFileParser(urllib.robotparser.RobotFileParser):
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    """Adds support for fetching robots.txt through a proxy to
+    urllib.robotparser.RobotFileParser."""
+
+    def __init__(self, url="", proxy=None):
+        super(RobotFileParser, self).__init__(url)
+        self.proxy = proxy
+
+    def read(self):
+        """Reads the robots.txt URL and feeds it to the parser."""
+        try:
+            request = urllib.request.Request(self.url)
+            if self.proxy:
+                request.set_proxy(self.proxy, request.type)
+            f = urllib.request.urlopen(request)
+        except urllib.error.HTTPError as err:
+            if err.code in (401, 403):
+                self.logger.info("{} returned {}, disallowing all".format(self.url, err.code))
+                self.disallow_all = True
+            elif err.code >= 400:
+                self.logger.info("{} returned {}, allowing all".format(self.url, err.code))
+                self.allow_all = True
+        except BaseException as err:
+            self.logger.error("problem fetching {}, disallowing all".format(self.url), exc_info=True)
+            self.disallow_all = True
+        else:
+            raw = f.read()
+            self.parse(raw.decode("utf-8").splitlines())
+
+class Site:
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    def __init__(self, seed, id=None, scope_surt=None, proxy=None, ignore_robots=False):
+        self.seed = seed
+        self.id = id
+        self.proxy = proxy
+        self.ignore_robots = ignore_robots
+
+        if scope_surt:
+            self.scope_surt = scope_surt
+        else:
+            self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+
+        self._robots_cache = {} # {robots_url:RobotFileParser,...}
+
+    def is_permitted_by_robots(self, url):
+        return self.ignore_robots or self._robots(robots_url(url)).can_fetch("*", url)
+
+    def is_in_scope(self, url):
+        try:
+            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+            return surtt.startswith(self.scope_surt)
+        except:
+            self.logger.warn("""problem parsing url "{}" """.format(url))
+            return False
+
+    def to_dict(self):
+        return dict(id=self.id, seed=self.seed, scope_surt=self.scope_surt)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
+
+    def _robots(self, robots_url):
+        if not robots_url in self._robots_cache:
+            robots_txt = RobotFileParser(robots_url, self.proxy)
+            self.logger.info("fetching {}".format(robots_url))
+            robots_txt.read()
+            self._robots_cache[robots_url] = robots_txt
+
+        return self._robots_cache[robots_url]
+
+class CrawlUrl:
+    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
+        self.id = id
+        self.site_id = site_id
+        self.url = url
+        self.hops_from_seed = hops_from_seed
+        self._canon_hurl = None
+        self.outlinks = outlinks
+
+    def __repr__(self):
+        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
+                self.url, self.site_id, self.hops_from_seed)
+
+    def calc_priority(self):
+        priority = 0
+        priority += max(0, 10 - self.hops_from_seed)
+        priority += max(0, 6 - self.canonical().count("/"))
+        return priority
+
+    def canonical(self):
+        if self._canon_hurl is None:
+            self._canon_hurl = surt.handyurl.parse(self.url)
+            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
+        return self._canon_hurl.geturl()
+
+    def to_dict(self):
+        if self.outlinks is not None and not isinstance(self.outlinks, list):
+            outlinks = []
+            outlinks.extend(self.outlinks)
+        else:
+            outlinks = self.outlinks
+
+        return dict(id=self.id, site_id=self.site_id, url=self.url,
+                hops_from_seed=self.hops_from_seed, outlinks=outlinks)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
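A usage sketch for the new module (the proxy address and URLs here are illustrative assumptions, not from this commit):

    from brozzler.site import Site

    site = Site("http://example.com/", proxy="localhost:8000")

    # robots.txt is fetched through the proxy once per robots URL, then cached
    site.is_permitted_by_robots("http://example.com/private/doc.html")

    # scope test: the candidate URL's SURT form must start with the seed's SURT
    site.is_in_scope("http://example.com/anywhere.html")   # True
    site.is_in_scope("http://other.example.org/")          # False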
brozzler/url.py (deleted, 43 lines)
@@ -1,43 +0,0 @@
-# vim: set sw=4 et:
-
-import surt
-import json
-
-class CrawlUrl:
-    def __init__(self, url, id=None, site_id=None, hops_from_seed=0, outlinks=None):
-        self.id = id
-        self.site_id = site_id
-        self.url = url
-        self.hops_from_seed = hops_from_seed
-        self._canon_hurl = None
-        self.outlinks = outlinks
-
-    def __repr__(self):
-        return """CrawlUrl(url="{}",site_id={},hops_from_seed={})""".format(
-                self.url, self.site_id, self.hops_from_seed)
-
-    def calc_priority(self):
-        priority = 0
-        priority += max(0, 10 - self.hops_from_seed)
-        priority += max(0, 6 - self.canonical().count("/"))
-        return priority
-
-    def canonical(self):
-        if self._canon_hurl is None:
-            self._canon_hurl = surt.handyurl.parse(self.url)
-            surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
-        return self._canon_hurl.geturl()
-
-    def to_dict(self):
-        if self.outlinks is not None and not isinstance(self.outlinks, list):
-            outlinks = []
-            outlinks.extend(self.outlinks)
-        else:
-            outlinks = self.outlinks
-
-        return dict(id=self.id, site_id=self.site_id, url=self.url,
-                hops_from_seed=self.hops_from_seed, outlinks=outlinks)
-
-    def to_json(self):
-        return json.dumps(self.to_dict(), separators=(',', ':'))
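CrawlUrl moves to brozzler.site verbatim, so its priority heuristic is unchanged: URLs closer to the seed and closer to the site root score higher. A worked sketch (URL illustrative; assumes canonicalization leaves it unchanged):

    from brozzler.site import CrawlUrl

    url = CrawlUrl("http://example.com/a/b.html", hops_from_seed=2)
    # canonical() contains 4 "/" characters, so:
    # priority = max(0, 10 - 2) + max(0, 6 - 4) = 8 + 2 = 10
    url.calc_priority()  # 10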
brozzler/worker.py
@@ -150,7 +150,7 @@ class BrozzlerWorker:
         browser = self._browser_pool.acquire()
         try:
             msg = q.get(block=True, timeout=0.5)
-            site = brozzler.hq.Site(**msg.payload)
+            site = brozzler.Site(**msg.payload)
             msg.ack() # XXX ack only after browsing finished? kinda complicated
             logging.info("browsing site {}".format(site))
             th = threading.Thread(target=lambda: self._brozzle_site(browser, site),