Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-04-20 23:56:34 -04:00
keep robots caches in BrozzlerHQ class, because Site instances get recreated over and over, which meant robots.txt was fetched over and over
This commit is contained in:
parent 4dacc0b087
commit a04bf04307
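
The gist of the change: robots.txt results are now memoized on the long-lived BrozzlerHQ object, keyed by site id, instead of living on Site objects that are rebuilt from queue payloads and discarded. A minimal, self-contained sketch of that pattern, with made-up names (Coordinator, TransientSite, robots_for, fetch_count) standing in for the real classes:

# Hypothetical sketch: cache per site id on a long-lived coordinator so that
# rebuilding the transient site object keeps the cache warm.
class TransientSite:
    """Stand-in for a Site reconstructed from each queue message."""
    def __init__(self, id):
        self.id = id

class Coordinator:
    """Stand-in for the long-lived BrozzlerHQ process."""
    def __init__(self):
        self._robots_caches = {}   # {site_id: cached robots data}
        self.fetch_count = 0       # counts simulated robots.txt fetches

    def robots_for(self, site):
        if site.id not in self._robots_caches:
            self.fetch_count += 1              # would fetch robots.txt here
            self._robots_caches[site.id] = {}  # placeholder for parsed rules
        return self._robots_caches[site.id]

coord = Coordinator()
coord.robots_for(TransientSite(1))   # first time site 1 is seen: one "fetch"
coord.robots_for(TransientSite(1))   # site object rebuilt, but the cache still hits
assert coord.fetch_count == 1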
@@ -7,6 +7,8 @@ import sqlite3
 import time
 import kombu
 import kombu.simple
+import reppy.cache
+import requests

 class BrozzlerHQDb:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -146,6 +148,29 @@ class BrozzlerHQ:
             self._db = db
         else:
             self._db = BrozzlerHQDb()
+        self._robots_caches = {} # {site_id:reppy.cache.RobotsCache}
+
+    def _robots_cache(self, site):
+        if not site.id in self._robots_caches:
+            req_sesh = requests.Session()
+            req_sesh.verify = False # ignore cert errors
+            if site.proxy:
+                proxie = "http://{}".format(site.proxy)
+                req_sesh.proxies = {"http":proxie,"https":proxie}
+            if site.extra_headers:
+                req_sesh.headers.update(site.extra_headers)
+            self._robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
+
+        return self._robots_caches[site.id]
+
+    def is_permitted_by_robots(self, site, url):
+        if site.ignore_robots:
+            return True
+        try:
+            return self._robots_cache(site).allowed(url, "brozzler")
+        except BaseException as e:
+            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
+            return False

     def run(self):
         try:
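Why one RobotsCache per site is enough: reppy's cache fetches robots.txt the first time a host is queried and answers later checks for that host from memory, so keeping a single instance alive per site id avoids repeat fetches. A rough usage sketch, assuming the older reppy 0.x API this code uses and network access; example.com is just a placeholder:

import requests
import reppy.cache

req_sesh = requests.Session()                            # same per-site session setup as above
robots = reppy.cache.RobotsCache(session=req_sesh)       # one long-lived cache instance
robots.allowed("http://example.com/page1", "brozzler")   # first call for the host fetches robots.txt
robots.allowed("http://example.com/page2", "brozzler")   # same host: answered from the in-memory cache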
@@ -184,7 +209,7 @@
             site_id = self._db.new_site(new_site)
             new_site.id = site_id

-            if new_site.is_permitted_by_robots(new_site.seed):
+            if self.is_permitted_by_robots(new_site, new_site.seed):
                 page = brozzler.Page(new_site.seed, site_id=new_site.id, hops_from_seed=0)
                 self._db.schedule_page(page, priority=1000)
                 self._unclaimed_sites_q.put(new_site.to_dict())
@@ -213,7 +238,7 @@
         if parent_page.outlinks:
             for url in parent_page.outlinks:
                 if site.is_in_scope(url):
-                    if site.is_permitted_by_robots(url):
+                    if self.is_permitted_by_robots(site, url):
                         child_page = brozzler.Page(url, site_id=site.id, hops_from_seed=parent_page.hops_from_seed+1)
                         try:
                             self._db.update_page(child_page)
@@ -3,8 +3,6 @@
 import surt
 import json
 import logging
-import requests
-import reppy.cache

 class Site:
     logger = logging.getLogger(__module__ + "." + __qualname__)
@@ -25,15 +23,6 @@ class Site:
         else:
             self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)

-        req_sesh = requests.Session()
-        req_sesh.verify = False # ignore cert errors
-        if proxy:
-            proxie = "http://{}".format(proxy)
-            req_sesh.proxies = {"http":proxie,"https":proxie}
-        if extra_headers:
-            req_sesh.headers.update(extra_headers)
-        self._robots_cache = reppy.cache.RobotsCache(session=req_sesh)
-
     def __repr__(self):
         return """Site(seed={},scope_surt={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={})""".format(
                 repr(self.seed), repr(self.scope_surt), repr(self.proxy), self.enable_warcprox_features, self.ignore_robots, self.extra_headers)
@@ -44,13 +33,6 @@
             self.logger.info("changing site scope surt from {} to {}".format(self.scope_surt, new_scope_surt))
             self.scope_surt = new_scope_surt

-    def is_permitted_by_robots(self, url):
-        try:
-            return self.ignore_robots or self._robots_cache.allowed(url, "brozzler")
-        except BaseException as e:
-            self.logger.error("problem with robots.txt for {}: {}".format(url, e))
-            return False
-
     def is_in_scope(self, url):
         try:
             surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)