Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-02-24 16:49:56 -05:00.
Commit message: add missing file hq.py, improve some logging, fix a small race-condition bug.
This commit is contained in:
parent
bb3561a690
commit
610f9c8cf4
@ -173,15 +173,22 @@ class BrozzlerHQ:
|
|||||||
completed_url = umbra.CrawlUrl(**msg.payload)
|
completed_url = umbra.CrawlUrl(**msg.payload)
|
||||||
msg.ack()
|
msg.ack()
|
||||||
|
|
||||||
self.logger.info("adding outlinks from {} outlinks={}".format(completed_url, completed_url.outlinks))
|
counts = {"added":0,"updated":0,"rejected":0}
|
||||||
if completed_url.outlinks:
|
if completed_url.outlinks:
|
||||||
for url in completed_url.outlinks:
|
for url in completed_url.outlinks:
|
||||||
if site.is_in_scope(url):
|
if site.is_in_scope(url):
|
||||||
crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=completed_url.hops_from_seed+1)
|
crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=completed_url.hops_from_seed+1)
|
||||||
try:
|
try:
|
||||||
self._db.update_crawl_url(crawl_url)
|
self._db.update_crawl_url(crawl_url)
|
||||||
|
counts["updated"] += 1
|
||||||
except KeyError:
|
except KeyError:
|
||||||
self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
|
self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
|
||||||
|
counts["added"] += 1
|
||||||
|
else:
|
||||||
|
counts["rejected"] += 1
|
||||||
|
|
||||||
|
self.logger.info("{} new links added, {} existing links updated, {} links rejected from {}".format(
|
||||||
|
counts["added"], counts["updated"], counts["rejected"], completed_url))
|
||||||
except kombu.simple.Empty:
|
except kombu.simple.Empty:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -33,6 +33,7 @@ logging.basicConfig(stream=sys.stdout, level=args.log_level,
|
|||||||
|
|
||||||
browsers = set()
|
browsers = set()
|
||||||
browsers_lock = threading.Lock()
|
browsers_lock = threading.Lock()
|
||||||
|
num_browsers = 0
|
||||||
|
|
||||||
shutdown_requested = threading.Event()
|
shutdown_requested = threading.Event()
|
||||||
|
|
||||||
@ -64,7 +65,7 @@ def brozzle_site(site, chrome_port):
|
|||||||
crawl_url.outlinks = browser.browse_page(crawl_url.url)
|
crawl_url.outlinks = browser.browse_page(crawl_url.url)
|
||||||
completed(site, crawl_url)
|
completed(site, crawl_url)
|
||||||
except kombu.simple.Empty:
|
except kombu.simple.Empty:
|
||||||
# if some timeout reached, raise
|
# if some timeout reached, re-raise?
|
||||||
pass
|
pass
|
||||||
# except kombu.simple.Empty:
|
# except kombu.simple.Empty:
|
||||||
# logging.info("finished {} (queue is empty)".format(site))
|
# logging.info("finished {} (queue is empty)".format(site))
|
||||||
@ -89,13 +90,14 @@ latest_state = None
|
|||||||
chrome_port = 9200
|
chrome_port = 9200
|
||||||
try:
|
try:
|
||||||
while True:
|
while True:
|
||||||
if len(browsers) < int(args.max_browsers):
|
if num_browsers < int(args.max_browsers):
|
||||||
with kombu.Connection(args.amqp_url) as conn:
|
with kombu.Connection(args.amqp_url) as conn:
|
||||||
q = conn.SimpleQueue("brozzler.sites.unclaimed")
|
q = conn.SimpleQueue("brozzler.sites.unclaimed")
|
||||||
try:
|
try:
|
||||||
msg = q.get(block=True, timeout=0.5)
|
msg = q.get(block=True, timeout=0.5)
|
||||||
site = hq.Site(**msg.payload)
|
site = hq.Site(**msg.payload)
|
||||||
logging.info("browsing site {}".format(site))
|
logging.info("browsing site {}".format(site))
|
||||||
|
num_browsers += 1
|
||||||
msg.ack()
|
msg.ack()
|
||||||
th = threading.Thread(target=lambda: brozzle_site(site, chrome_port),
|
th = threading.Thread(target=lambda: brozzle_site(site, chrome_port),
|
||||||
name="BrowsingThread-{}".format(site.scope_surt))
|
name="BrowsingThread-{}".format(site.scope_surt))
|
||||||
|
29
umbra/hq.py
Normal file
29
umbra/hq.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# vim: set sw=4 et:
|
||||||
|
|
||||||
|
import surt
|
||||||
|
import kombu
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
class Site:
    """A crawl site: a seed URL plus the crawl scope derived from it.

    Scope is defined by the SURT-canonicalized form of the seed: any url
    whose SURT form starts with the seed's SURT is considered in scope.
    """

    logger = logging.getLogger(__module__ + "." + __qualname__)

    def __init__(self, seed, id=None):
        """
        Args:
            seed: the seed url (string) that defines this site's scope
            id: optional site identifier assigned elsewhere (e.g. by the db)
        """
        self.seed = seed
        self.id = id
        # SURT prefix that defines the crawl scope for this site
        self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)

    def is_in_scope(self, url):
        """Return True if url's SURT form falls under this site's scope_surt.

        A url that cannot be parsed/canonicalized is treated as out of
        scope: the problem is logged and False is returned rather than
        raising.
        """
        try:
            candidate_surt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
            return candidate_surt.startswith(self.scope_surt)
        except Exception:
            # was a bare "except:", which would also swallow
            # KeyboardInterrupt/SystemExit; logger.warn is the deprecated
            # alias of logger.warning
            self.logger.warning("""problem parsing url "{}" """.format(url), exc_info=True)
            return False

    def to_dict(self):
        """Return the json-serializable representation of this site."""
        return dict(id=self.id, seed=self.seed)

    def to_json(self):
        """Return this site serialized as a compact json string."""
        return json.dumps(self.to_dict(), separators=(',', ':'))
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user