mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Trick to avoid crawling the same URL again too quickly
This commit is contained in:
parent
7cc777661d
commit
5f3c247e0c
@ -64,7 +64,6 @@ class CrawlUrlQueue:
|
|||||||
return len(self._urls)
|
return len(self._urls)
|
||||||
|
|
||||||
def schedule(self, crawl_url):
|
def schedule(self, crawl_url):
|
||||||
logging.info("before: self._pq={}".format(self._pq))
|
|
||||||
self.aggregate_priority += crawl_url.priority
|
self.aggregate_priority += crawl_url.priority
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -79,12 +78,13 @@ class CrawlUrlQueue:
|
|||||||
self._urls[crawl_url.surt] = crawl_url
|
self._urls[crawl_url.surt] = crawl_url
|
||||||
self._pq[crawl_url.priority_key] = crawl_url
|
self._pq[crawl_url.priority_key] = crawl_url
|
||||||
|
|
||||||
def pop(self):
|
def next_url(self):
|
||||||
res0 = self._pq.popitem(last=True)[1]
|
res0 = self._pq.popitem(last=True)[1]
|
||||||
res1 = self._urls.pop(res0.surt)
|
res1 = self._urls.pop(res0.surt)
|
||||||
assert res0 is res1
|
assert res0 is res1
|
||||||
|
|
||||||
self.aggregate_priority -= res0.priority
|
new_low_priority = CrawlUrl(res0.url, -1000)
|
||||||
|
self.schedule(new_low_priority)
|
||||||
|
|
||||||
return res0
|
return res0
|
||||||
|
|
||||||
@ -112,7 +112,7 @@ class Site:
|
|||||||
def brozzle_site(site, chrome_port):
|
def brozzle_site(site, chrome_port):
|
||||||
with umbra.Browser(chrome_port=chrome_port, chrome_exe=args.chrome_exe) as browser:
|
with umbra.Browser(chrome_port=chrome_port, chrome_exe=args.chrome_exe) as browser:
|
||||||
while True:
|
while True:
|
||||||
crawl_url = site.q.pop()
|
crawl_url = site.q.next_url()
|
||||||
outlinks = browser.browse_page(crawl_url.url)
|
outlinks = browser.browse_page(crawl_url.url)
|
||||||
site.submit(outlinks)
|
site.submit(outlinks)
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user