mirror of https://github.com/internetarchive/brozzler.git
synced 2025-08-01 19:16:15 -04:00
keep robots caches in BrozzlerHQ class, because Site instances get recreated over and over, which meant robots.txt was fetched over and over
This commit is contained in:
parent 4dacc0b087
commit a04bf04307

2 changed files with 27 additions and 20 deletions
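As context for the diff below, here is a minimal, self-contained sketch of the pattern this commit introduces: reppy RobotsCache objects are kept in a dict keyed by site id on a long-lived object, so that recreating a Site instance (which the commit message says happens over and over) no longer throws away the cached robots.txt. The names RobotsCachePool, for_site, and SiteStub are illustrative stand-ins, not brozzler's own; only the reppy/requests calls mirror the diff.

import collections

import reppy.cache
import requests

# stand-in for brozzler.Site, which gets rebuilt repeatedly
SiteStub = collections.namedtuple("SiteStub", ["id", "proxy", "extra_headers"])

class RobotsCachePool:
    """Keeps one reppy RobotsCache per site id, as BrozzlerHQ now does."""

    def __init__(self):
        self._robots_caches = {}  # {site_id: reppy.cache.RobotsCache}

    def for_site(self, site):
        # build the per-site cache lazily, once; reuse it on later calls
        if site.id not in self._robots_caches:
            req_sesh = requests.Session()
            req_sesh.verify = False  # ignore cert errors, as in the diff
            if site.proxy:
                proxy_url = "http://{}".format(site.proxy)
                req_sesh.proxies = {"http": proxy_url, "https": proxy_url}
            if site.extra_headers:
                req_sesh.headers.update(site.extra_headers)
            self._robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
        return self._robots_caches[site.id]

pool = RobotsCachePool()
site_a = SiteStub(id=1, proxy=None, extra_headers=None)
site_b = SiteStub(id=1, proxy=None, extra_headers=None)  # "recreated" site, same id
# same underlying cache object, so robots.txt is fetched at most once per site
assert pool.for_site(site_a) is pool.for_site(site_b)

A side effect of keying the cache on the long-lived HQ object rather than on Site is that the per-site requests.Session, with its proxy settings and extra headers, is also reused across all robots checks for that site.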
@@ -7,6 +7,8 @@ import sqlite3
 import time
 import kombu
 import kombu.simple
+import reppy.cache
+import requests

 class BrozzlerHQDb:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -146,6 +148,29 @@ class BrozzlerHQ:
             self._db = db
         else:
             self._db = BrozzlerHQDb()
+        self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}
+
+    def _robots_cache(self, site):
+        if not site.id in self._robots_caches:
+            req_sesh = requests.Session()
+            req_sesh.verify = False # ignore cert errors
+            if site.proxy:
+                proxie = "http://{}".format(site.proxy)
+                req_sesh.proxies = {"http":proxie,"https":proxie}
+            if site.extra_headers:
+                req_sesh.headers.update(site.extra_headers)
+            self._robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+
+        return self._robots_caches[site.id]
+
+    def is_permitted_by_robots(self, site, url):
+        if site.ignore_robots:
+            return True
+        try:
+            return self._robots_cache(site).allowed(url, "brozzler")
+        except BaseException as e:
+            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
+            return False

     def run(self):
         try:
@@ -184,7 +209,7 @@
             site_id = self._db.new_site(new_site)
             new_site.id = site_id

-            if new_site.is_permitted_by_robots(new_site.seed):
+            if self.is_permitted_by_robots(new_site, new_site.seed):
                 page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
                 self._db.schedule_page(page, priority=1000)
                 self._unclaimed_sites_q.put(new_site.to_dict())
@@ -213,7 +238,7 @@
         if parent_page.outlinks:
             for url in parent_page.outlinks:
                 if site.is_in_scope(url):
-                    if site.is_permitted_by_robots(url):
+                    if self.is_permitted_by_robots(site, url):
                         child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
                         try:
                             self._db.update_page(child_page)

@@ -3,8 +3,6 @@
 import surt
 import json
 import logging
-import requests
-import reppy.cache

 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -25,15 +23,6 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
-
-        req_sesh = requests.Session()
-        req_sesh.verify = False # ignore cert errors
-        if proxy:
-            proxie = "http://{}".format(proxy)
-            req_sesh.proxies = {"http":proxie,"https":proxie}
-        if extra_headers:
-            req_sesh.headers.update(extra_headers)
-        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)

     def __repr__(self):
         return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
             repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
@@ -44,13 +33,6 @@ class Site:
             self.logger.info("changing site scope surt from {} to {}".format(self.scope_surt, new_scope_surt))
             self.scope_surt = new_scope_surt

-    def is_permitted_by_robots(self, url):
-        try:
-            return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
-        except BaseException as e:
-            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
-            return False
-
     def is_in_scope(self, url):
         try:
             surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)