add missing file hq.py, improve some logging, fix little race condition bug

Noah Levitt 2015-07-11 13:09:45 -07:00
parent bb3561a690
commit 610f9c8cf4
3 changed files with 41 additions and 3 deletions


@@ -173,15 +173,22 @@ class BrozzlerHQ:
             completed_url = umbra.CrawlUrl(**msg.payload)
             msg.ack()
-            self.logger.info("adding outlinks from {} outlinks={}".format(completed_url, completed_url.outlinks))
+            counts = {"added":0,"updated":0,"rejected":0}
             if completed_url.outlinks:
                 for url in completed_url.outlinks:
                     if site.is_in_scope(url):
                         crawl_url = umbra.CrawlUrl(url, site_id=site.id, hops_from_seed=completed_url.hops_from_seed+1)
                         try:
                             self._db.update_crawl_url(crawl_url)
+                            counts["updated"] += 1
                         except KeyError:
                             self._db.schedule_url(crawl_url, priority=crawl_url.calc_priority())
+                            counts["added"] += 1
+                    else:
+                        counts["rejected"] += 1
+            self.logger.info("{} new links added, {} existing links updated, {} links rejected from {}".format(
+                counts["added"], counts["updated"], counts["rejected"], completed_url))
         except kombu.simple.Empty:
             pass
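For reference, a toy rendering of what the new per-page summary log counts. The made-up in_scope() test and already_known set below stand in for site.is_in_scope() and for the KeyError that update_crawl_url() raises when a URL is not yet in the database, so the three counters map directly onto the added/updated/rejected branches in the hunk above.

    # toy illustration only; names in_scope/already_known are not from brozzler
    already_known = {"http://example.com/a"}

    def in_scope(url):
        return url.startswith("http://example.com/")    # stand-in scope test

    outlinks = ["http://example.com/a", "http://example.com/b", "http://elsewhere.org/c"]

    counts = {"added": 0, "updated": 0, "rejected": 0}
    for url in outlinks:
        if in_scope(url):
            if url in already_known:
                counts["updated"] += 1    # update_crawl_url() succeeded
            else:
                counts["added"] += 1      # KeyError -> schedule_url()
        else:
            counts["rejected"] += 1       # out of scope

    print("{} new links added, {} existing links updated, {} links rejected".format(
            counts["added"], counts["updated"], counts["rejected"]))
    # expected: 1 new links added, 1 existing links updated, 1 links rejected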


@@ -33,6 +33,7 @@ logging.basicConfig(stream=sys.stdout, level=args.log_level,
 browsers = set()
 browsers_lock = threading.Lock()
+num_browsers = 0
 shutdown_requested = threading.Event()
@@ -64,7 +65,7 @@ def brozzle_site(site, chrome_port):
                 crawl_url.outlinks = browser.browse_page(crawl_url.url)
                 completed(site, crawl_url)
             except kombu.simple.Empty:
-                # if some timeout reached, raise
+                # if some timeout reached, re-raise?
                 pass
 # except kombu.simple.Empty:
 #     logging.info("finished {} (queue is empty)".format(site))
@@ -89,13 +90,14 @@ latest_state = None
 chrome_port = 9200
 try:
     while True:
-        if len(browsers) < int(args.max_browsers):
+        if num_browsers < int(args.max_browsers):
            with kombu.Connection(args.amqp_url) as conn:
                q = conn.SimpleQueue("brozzler.sites.unclaimed")
                try:
                    msg = q.get(block=True, timeout=0.5)
                    site = hq.Site(**msg.payload)
                    logging.info("browsing site {}".format(site))
+                   num_browsers += 1
                    msg.ack()
                    th = threading.Thread(target=lambda: brozzle_site(site, chrome_port),
                            name="BrowsingThread-{}".format(site.scope_surt))

umbra/hq.py (new file)

@@ -0,0 +1,29 @@
+# vim: set sw=4 et:
+
+import surt
+import kombu
+import json
+import logging
+
+class Site:
+    logger = logging.getLogger(__module__ + "." + __qualname__)
+
+    def __init__(self, seed, id=None):
+        self.seed = seed
+        self.id = id
+        self.scope_surt = surt.surt(seed, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+
+    def is_in_scope(self, url):
+        try:
+            surtt = surt.surt(url, canonicalizer=surt.GoogleURLCanonicalizer, trailing_comma=True)
+            return surtt.startswith(self.scope_surt)
+        except:
+            self.logger.warn("""problem parsing url "{}" """.format(url), exc_info=True)
+            return False
+
+    def to_dict(self):
+        return dict(id=self.id, seed=self.seed)
+
+    def to_json(self):
+        return json.dumps(self.to_dict(), separators=(',', ':'))
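For context, a quick sketch of how the new Site class is used. It assumes the package layout makes it importable as umbra.hq (the brozzler command above refers to it as hq.Site), and the example URLs are made up. is_in_scope() reduces scoping to a SURT prefix test against the seed, and to_dict()/to_json() carry only seed and id, which is what lets a Site be rebuilt from an AMQP payload with hq.Site(**msg.payload).

    from umbra.hq import Site   # assumption: importable as the umbra.hq module

    site = Site("http://example.com/blog/")
    print(site.scope_surt)       # canonicalized SURT prefix for the seed (exact form depends on the surt library)

    # anything whose SURT starts with the seed's SURT is in scope
    print(site.is_in_scope("http://example.com/blog/post-1"))   # expected: True
    print(site.is_in_scope("http://elsewhere.org/"))            # expected: False

    # only seed and id survive serialization, so a Site round-trips through a message payload
    print(site.to_json())        # e.g. {"id":null,"seed":"http://example.com/blog/"}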