keep robots caches in BrozzlerHQ class, because Site instances get recreated over and over, which meant robots.txt was fetched over and over

Noah Levitt 2015-07-23 02:19:25 +00:00
parent 4dacc0b087
commit a04bf04307
2 changed files with 27 additions and 20 deletions
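
For orientation before the diff: the pattern being introduced is a per-site memo of reppy robots caches held on the long-lived hub object, so that Site objects, which get rebuilt constantly, stop refetching robots.txt. The sketch below restates that pattern in isolation; the RobotsCachePool name and structure are illustrative only and do not appear in the commit.

import requests
import reppy.cache

class RobotsCachePool:
    # Illustrative only: lazily build and reuse one reppy.cache.RobotsCache
    # per site id, configured the same way the commit configures its sessions.
    def __init__(self):
        self._caches = {}  # {site_id: reppy.cache.RobotsCache}

    def cache_for(self, site):
        if site.id not in self._caches:
            sesh = requests.Session()
            sesh.verify = False  # ignore cert errors, as in the commit
            if site.proxy:
                proxy_url = "http://{}".format(site.proxy)
                sesh.proxies = {"http": proxy_url, "https": proxy_url}
            if site.extra_headers:
                sesh.headers.update(site.extra_headers)
            self._caches[site.id] = reppy.cache.RobotsCache(session=sesh)
        return self._caches[site.id]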


@@ -7,6 +7,8 @@ import sqlite3
 import time
 import kombu
 import kombu.simple
+import reppy.cache
+import requests
 
 class BrozzlerHQDb:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -146,6 +148,29 @@ class BrozzlerHQ:
             self._db = db
         else:
             self._db = BrozzlerHQDb()
+        self._robots_caches = {}  # {site_id:reppy.cache.RobotsCache}
+
+    def _robots_cache(self, site):
+        if not site.id in self._robots_caches:
+            req_sesh = requests.Session()
+            req_sesh.verify = False  # ignore cert errors
+            if site.proxy:
+                proxie = "http://{}".format(site.proxy)
+                req_sesh.proxies = {"http":proxie,"https":proxie}
+            if site.extra_headers:
+                req_sesh.headers.update(site.extra_headers)
+            self._robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+
+        return self._robots_caches[site.id]
+
+    def is_permitted_by_robots(self, site, url):
+        if site.ignore_robots:
+            return True
+        try:
+            return self._robots_cache(site).allowed(url, "brozzler")
+        except BaseException as e:
+            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
+            return False
 
     def run(self):
         try:
@@ -184,7 +209,7 @@ class BrozzlerHQ:
             site_id = self._db.new_site(new_site)
             new_site.id = site_id
 
-            if new_site.is_permitted_by_robots(new_site.seed):
+            if self.is_permitted_by_robots(new_site, new_site.seed):
                 page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
                 self._db.schedule_page(page, priority=1000)
                 self._unclaimed_sites_q.put(new_site.to_dict())
@@ -213,7 +238,7 @@ class BrozzlerHQ:
         if parent_page.outlinks:
             for url in parent_page.outlinks:
                 if site.is_in_scope(url):
-                    if site.is_permitted_by_robots(url):
+                    if self.is_permitted_by_robots(site, url):
                         child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
                         try:
                             self._db.update_page(child_page)
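
The two hunks above are the only call sites that change, but the change is the point of the commit: the robots check now goes through the hub, whose cache outlives any individual Site object. In sketch form, assuming hq is an already-constructed BrozzlerHQ and site is a brozzler.Site with an assigned id:

# before this commit: each Site carried its own RobotsCache, so a Site
# recreated from the queue refetched robots.txt
permitted = site.is_permitted_by_robots(url)

# after this commit: the HQ owns one cache per site.id, so every Site
# object for the same site shares a single cached robots.txt
permitted = hq.is_permitted_by_robots(site, url)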


@@ -3,8 +3,6 @@
 import surt
 import json
 import logging
-import requests
-import reppy.cache
 
 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -25,15 +23,6 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
 
-        req_sesh = requests.Session()
-        req_sesh.verify = False  # ignore cert errors
-        if proxy:
-            proxie = "http://{}".format(proxy)
-            req_sesh.proxies = {"http":proxie,"https":proxie}
-        if extra_headers:
-            req_sesh.headers.update(extra_headers)
-        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
-
     def __repr__(self):
         return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
             repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
@@ -44,13 +33,6 @@ class Site:
             self.logger.info("changing site scope surt from {} to {}".format(self.scope_surt, new_scope_surt))
             self.scope_surt = new_scope_surt
 
-    def is_permitted_by_robots(self, url):
-        try:
-            return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
-        except BaseException as e:
-            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
-            return False
-
     def is_in_scope(self, url):
         try:
             surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)