mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Adding support for hop path information to be stored and passed along to warcprox
This commit is contained in:
parent
4f301f4e03
commit
0f72233f3b
@ -302,6 +302,7 @@ class RethinkDbFrontier:
|
|||||||
'site_id': site.id,
|
'site_id': site.id,
|
||||||
'job_id': site.job_id,
|
'job_id': site.job_id,
|
||||||
'hops_from_seed': parent_page.hops_from_seed + 1,
|
'hops_from_seed': parent_page.hops_from_seed + 1,
|
||||||
|
'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L",
|
||||||
'via_page_id': parent_page.id,
|
'via_page_id': parent_page.id,
|
||||||
'hops_off_surt': hops_off,
|
'hops_off_surt': hops_off,
|
||||||
'hashtags': [hashtag] if hashtag else []})
|
'hashtags': [hashtag] if hashtag else []})
|
||||||
|
@ -121,7 +121,8 @@ def new_seed_page(frontier, site):
|
|||||||
page = brozzler.Page(frontier.rr, {
|
page = brozzler.Page(frontier.rr, {
|
||||||
"url": str(url), "site_id": site.get("id"),
|
"url": str(url), "site_id": site.get("id"),
|
||||||
"job_id": site.get("job_id"), "hops_from_seed": 0,
|
"job_id": site.get("job_id"), "hops_from_seed": 0,
|
||||||
"priority": 1000, "needs_robots_check": True})
|
"priority": 1000, "needs_robots_check": True,
|
||||||
|
"hop_path": None})
|
||||||
if hashtag:
|
if hashtag:
|
||||||
page.hashtags = [hashtag,]
|
page.hashtags = [hashtag,]
|
||||||
return page
|
return page
|
||||||
@ -267,11 +268,18 @@ class Site(doublethink.Document, ElapsedMixIn):
|
|||||||
self._accept_ssurt_if_not_redundant(
|
self._accept_ssurt_if_not_redundant(
|
||||||
canon_seed_redirect.ssurt().decode('ascii'))
|
canon_seed_redirect.ssurt().decode('ascii'))
|
||||||
|
|
||||||
def extra_headers(self):
|
def extra_headers(self, page=None):
|
||||||
hdrs = {}
|
hdrs = {}
|
||||||
if self.warcprox_meta:
|
if self.warcprox_meta:
|
||||||
hdrs["Warcprox-Meta"] = json.dumps(
|
if page and "hop_path" in self.warcprox_meta:
|
||||||
self.warcprox_meta, separators=(',', ':'))
|
self.warcprox_meta["hop_path"] = page.hop_path
|
||||||
|
self.warcprox_meta["hop_path_parent"] = page.url
|
||||||
|
warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
|
||||||
|
self.warcprox_meta["hop_path"] = None
|
||||||
|
del self.warcprox_meta["hop_path_parent"]
|
||||||
|
else:
|
||||||
|
warcprox_meta_json= json.dumps(self.warcprox_meta, separators=(',', ':'))
|
||||||
|
hdrs["Warcprox-Meta"] = warcprox_meta_json
|
||||||
return hdrs
|
return hdrs
|
||||||
|
|
||||||
def accept_reject_or_neither(self, url, parent_page=None):
|
def accept_reject_or_neither(self, url, parent_page=None):
|
||||||
@ -338,6 +346,8 @@ class Page(doublethink.Document):
|
|||||||
def populate_defaults(self):
|
def populate_defaults(self):
|
||||||
if not "hops_from_seed" in self:
|
if not "hops_from_seed" in self:
|
||||||
self.hops_from_seed = 0
|
self.hops_from_seed = 0
|
||||||
|
if not "hop_path" in self:
|
||||||
|
self.hop_path = None
|
||||||
if not "brozzle_count" in self:
|
if not "brozzle_count" in self:
|
||||||
self.brozzle_count = 0
|
self.brozzle_count = 0
|
||||||
if not "claimed" in self:
|
if not "claimed" in self:
|
||||||
|
@ -224,7 +224,7 @@ class BrozzlerWorker:
|
|||||||
else:
|
else:
|
||||||
if not self._already_fetched(page, ydl_fetches):
|
if not self._already_fetched(page, ydl_fetches):
|
||||||
self.logger.info('needs fetch: %s', page)
|
self.logger.info('needs fetch: %s', page)
|
||||||
self._fetch_url(site, page.url)
|
self._fetch_url(site, page=page)
|
||||||
else:
|
else:
|
||||||
self.logger.info('already fetched: %s', page)
|
self.logger.info('already fetched: %s', page)
|
||||||
|
|
||||||
@ -287,7 +287,7 @@ class BrozzlerWorker:
|
|||||||
.get('scriptURL')
|
.get('scriptURL')
|
||||||
if url and url not in sw_fetched:
|
if url and url not in sw_fetched:
|
||||||
self.logger.info('fetching service worker script %s', url)
|
self.logger.info('fetching service worker script %s', url)
|
||||||
self._fetch_url(site, url)
|
self._fetch_url(site, url=url)
|
||||||
sw_fetched.add(url)
|
sw_fetched.add(url)
|
||||||
|
|
||||||
if not browser.is_running():
|
if not browser.is_running():
|
||||||
@ -295,7 +295,7 @@ class BrozzlerWorker:
|
|||||||
proxy=self._proxy_for(site),
|
proxy=self._proxy_for(site),
|
||||||
cookie_db=site.get('cookie_db'))
|
cookie_db=site.get('cookie_db'))
|
||||||
final_page_url, outlinks = browser.browse_page(
|
final_page_url, outlinks = browser.browse_page(
|
||||||
page.url, extra_headers=site.extra_headers(),
|
page.url, extra_headers=site.extra_headers(page),
|
||||||
behavior_parameters=site.get('behavior_parameters'),
|
behavior_parameters=site.get('behavior_parameters'),
|
||||||
username=site.get('username'), password=site.get('password'),
|
username=site.get('username'), password=site.get('password'),
|
||||||
user_agent=site.get('user_agent'),
|
user_agent=site.get('user_agent'),
|
||||||
@ -316,8 +316,10 @@ class BrozzlerWorker:
|
|||||||
page.note_redirect(final_page_url)
|
page.note_redirect(final_page_url)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
|
||||||
def _fetch_url(self, site, url):
|
def _fetch_url(self, site, url=None, page=None):
|
||||||
proxies = None
|
proxies = None
|
||||||
|
if page:
|
||||||
|
url=page.url
|
||||||
if self._proxy_for(site):
|
if self._proxy_for(site):
|
||||||
proxies = {
|
proxies = {
|
||||||
'http': 'http://%s' % self._proxy_for(site),
|
'http': 'http://%s' % self._proxy_for(site),
|
||||||
@ -328,7 +330,7 @@ class BrozzlerWorker:
|
|||||||
try:
|
try:
|
||||||
# response is ignored
|
# response is ignored
|
||||||
requests.get(
|
requests.get(
|
||||||
url, proxies=proxies, headers=site.extra_headers(),
|
url, proxies=proxies, headers=site.extra_headers(page),
|
||||||
verify=False)
|
verify=False)
|
||||||
except requests.exceptions.ProxyError as e:
|
except requests.exceptions.ProxyError as e:
|
||||||
raise brozzler.ProxyError(
|
raise brozzler.ProxyError(
|
||||||
|
@ -234,7 +234,7 @@ def test_proxy_down():
|
|||||||
|
|
||||||
# raw fetch
|
# raw fetch
|
||||||
with pytest.raises(brozzler.ProxyError):
|
with pytest.raises(brozzler.ProxyError):
|
||||||
worker._fetch_url(site, page.url)
|
worker._fetch_url(site, page=page)
|
||||||
|
|
||||||
# WARCPROX_WRITE_RECORD
|
# WARCPROX_WRITE_RECORD
|
||||||
with pytest.raises(brozzler.ProxyError):
|
with pytest.raises(brozzler.ProxyError):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user