Merge pull request #243 from internetarchive/adds-hop-path-support

Adds hop path support

commit 66252e17c3
@@ -302,7 +302,9 @@ class RethinkDbFrontier:
             'site_id': site.id,
             'job_id': site.job_id,
             'hops_from_seed': parent_page.hops_from_seed + 1,
+            'hop_path': str(parent_page.hop_path if parent_page.hop_path else "") + "L",
             'via_page_id': parent_page.id,
+            'via_page_url': parent_page.url,
             'hops_off_surt': hops_off,
             'hashtags': [hashtag] if hashtag else []})
         return page
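The hunk above, from the frontier's outlink-scheduling code, records two new facts on every child page: the URL of the page it was discovered on (via_page_url) and a hop path in the style of Heritrix hop paths, one letter per hop from the seed. This change only ever appends "L", a plain link hop. A minimal sketch of how the path grows, using an illustrative helper name that is not brozzler code:

def child_hop_path(parent_hop_path):
    # Seed pages have no hop path (None, per the new_seed_page hunk
    # below), so their direct outlinks start at "L".
    return str(parent_hop_path if parent_hop_path else "") + "L"

assert child_hop_path(None) == "L"    # direct outlink of the seed
assert child_hop_path("L") == "LL"    # two link hops from the seed
assert child_hop_path("LL") == "LLL"  # three link hops from the seed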
@@ -31,6 +31,7 @@ import urlcanon
 import urllib
 import uuid
 import yaml
+from typing import Optional

 def load_schema():
     schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
@@ -119,9 +120,13 @@ def new_seed_page(frontier, site):
     hashtag = (url.hash_sign + url.fragment).decode("utf-8")
     urlcanon.canon.remove_fragment(url)
     page = brozzler.Page(frontier.rr, {
-        "url": str(url), "site_id": site.get("id"),
-        "job_id": site.get("job_id"), "hops_from_seed": 0,
-        "priority": 1000, "needs_robots_check": True})
+        "url": str(url),
+        "site_id": site.get("id"),
+        "job_id": site.get("job_id"),
+        "hops_from_seed": 0,
+        "priority": 1000,
+        "needs_robots_check": True,
+        "hop_path": None})
     if hashtag:
         page.hashtags = [hashtag,]
     return page
@@ -267,11 +272,20 @@ class Site(doublethink.Document, ElapsedMixIn):
             self._accept_ssurt_if_not_redundant(
                     canon_seed_redirect.ssurt().decode('ascii'))

-    def extra_headers(self):
+    def extra_headers(self, page: Optional["Page"] = None):
         hdrs = {}
         if self.warcprox_meta:
-            hdrs["Warcprox-Meta"] = json.dumps(
-                    self.warcprox_meta, separators=(',', ':'))
+            if page is not None:
+                self.warcprox_meta["metadata"]["hop_path"] = page.hop_path
+                self.warcprox_meta["metadata"]["brozzled_url"] = page.url
+                self.warcprox_meta["metadata"]["hop_via_url"] = page.via_page_url
+                warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
+                del self.warcprox_meta["metadata"]["hop_path"]
+                del self.warcprox_meta["metadata"]["brozzled_url"]
+                del self.warcprox_meta["metadata"]["hop_via_url"]
+            else:
+                warcprox_meta_json = json.dumps(self.warcprox_meta, separators=(',', ':'))
+            hdrs["Warcprox-Meta"] = warcprox_meta_json
         return hdrs

     def accept_reject_or_neither(self, url, parent_page=None):
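extra_headers() now optionally takes the page being brozzled. When a page is supplied, three per-page fields are written into warcprox_meta["metadata"] just long enough to serialize the Warcprox-Meta header, then deleted again so the shared, persisted site document is not permanently mutated. A standalone sketch of that serialize-then-restore pattern, with plain dicts standing in for brozzler's Site and Page documents:

import json

def warcprox_meta_header(warcprox_meta, page=None):
    # Temporarily stamp per-page fields onto the shared metadata dict.
    if page is not None:
        warcprox_meta["metadata"]["hop_path"] = page["hop_path"]
        warcprox_meta["metadata"]["brozzled_url"] = page["url"]
        warcprox_meta["metadata"]["hop_via_url"] = page["via_page_url"]
    try:
        return json.dumps(warcprox_meta, separators=(',', ':'))
    finally:
        # Restore the dict so the site document is unchanged afterwards.
        if page is not None:
            for key in ("hop_path", "brozzled_url", "hop_via_url"):
                del warcprox_meta["metadata"][key]

meta = {"metadata": {"seed": "https://example.com/"}}
page = {"hop_path": "LL", "url": "https://example.com/a/b",
        "via_page_url": "https://example.com/a"}
print(warcprox_meta_header(meta, page))
assert "hop_path" not in meta["metadata"]  # restored after serialization

The try/finally in the sketch would also restore the keys if json.dumps raised; the diff deletes them inline after a successful dump.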
@@ -338,6 +352,10 @@ class Page(doublethink.Document):
     def populate_defaults(self):
         if not "hops_from_seed" in self:
             self.hops_from_seed = 0
+        if not "hop_path" in self:
+            self.hop_path = None
+        if not "via_page_url" in self:
+            self.via_page_url = None
         if not "brozzle_count" in self:
             self.brozzle_count = 0
         if not "claimed" in self:
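populate_defaults() backfills the two new fields on Page documents loaded from RethinkDB, so records written before this change behave like freshly created ones instead of failing on attribute access. A small illustration of the same backfill idea, with plain dicts and setdefault standing in for doublethink documents:

def populate_defaults(page_doc):
    # A record from an older crawl has none of these keys; without the
    # backfill, code like site.extra_headers(page) would blow up on it.
    page_doc.setdefault("hops_from_seed", 0)
    page_doc.setdefault("hop_path", None)
    page_doc.setdefault("via_page_url", None)
    page_doc.setdefault("brozzle_count", 0)

legacy = {"url": "https://example.com/old"}  # pre-change record
populate_defaults(legacy)
assert legacy["hop_path"] is None and legacy["via_page_url"] is None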
@@ -224,7 +224,7 @@ class BrozzlerWorker:
         else:
             if not self._already_fetched(page, ydl_fetches):
                 self.logger.info('needs fetch: %s', page)
-                self._fetch_url(site, page.url)
+                self._fetch_url(site, page=page)
             else:
                 self.logger.info('already fetched: %s', page)

@@ -244,13 +244,13 @@ class BrozzlerWorker:
                 url="screenshot:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="resource", content_type="image/jpeg",
                 payload=screenshot_jpeg,
-                extra_headers=site.extra_headers())
+                extra_headers=site.extra_headers(page))
         self._warcprox_write_record(
                 warcprox_address=self._proxy_for(site),
                 url="thumbnail:%s" % str(urlcanon.semantic(page.url)),
                 warc_type="resource", content_type="image/jpeg",
                 payload=thumbnail_jpeg,
-                extra_headers=site.extra_headers())
+                extra_headers=site.extra_headers(page))

     def _on_response(chrome_msg):
         if ('params' in chrome_msg
@@ -287,7 +287,7 @@ class BrozzlerWorker:
                         .get('scriptURL')
                 if url and url not in sw_fetched:
                     self.logger.info('fetching service worker script %s', url)
-                    self._fetch_url(site, url)
+                    self._fetch_url(site, url=url)
                     sw_fetched.add(url)

         if not browser.is_running():
@@ -295,7 +295,7 @@ class BrozzlerWorker:
                     proxy=self._proxy_for(site),
                     cookie_db=site.get('cookie_db'))
             final_page_url, outlinks = browser.browse_page(
-                    page.url, extra_headers=site.extra_headers(),
+                    page.url, extra_headers=site.extra_headers(page),
                     behavior_parameters=site.get('behavior_parameters'),
                     username=site.get('username'), password=site.get('password'),
                     user_agent=site.get('user_agent'),
@@ -316,8 +316,10 @@ class BrozzlerWorker:
             page.note_redirect(final_page_url)
         return outlinks

-    def _fetch_url(self, site, url):
+    def _fetch_url(self, site, url=None, page=None):
         proxies = None
+        if page:
+            url = page.url
         if self._proxy_for(site):
             proxies = {
                 'http': 'http://%s' % self._proxy_for(site),
@@ -328,7 +330,7 @@ class BrozzlerWorker:
         try:
             # response is ignored
             requests.get(
-                    url, proxies=proxies, headers=site.extra_headers(),
+                    url, proxies=proxies, headers=site.extra_headers(page),
                     verify=False)
         except requests.exceptions.ProxyError as e:
             raise brozzler.ProxyError(
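_fetch_url() now takes keyword arguments url= or page= instead of a positional URL, and every call site above was updated to match. When a page is passed, its URL wins and site.extra_headers(page) stamps the hop metadata on the request; a bare url= (as for service worker scripts) gets no per-page metadata. A simplified sketch of the argument resolution, with assumed names (the real method also resolves proxies and wraps proxy failures in brozzler.ProxyError):

def fetch_url(site, url=None, page=None):
    if page:
        url = page.url                   # page= wins: brozzle the page itself
    headers = site.extra_headers(page)   # hop metadata only when page given
    return url, headers

# fetch_url(site, page=page) -> fetches page.url with hop_path et al.
#                               in Warcprox-Meta
# fetch_url(site, url=url)   -> plain fetch (e.g. a service worker
#                               script) with no per-page metadata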
@@ -101,7 +101,7 @@ def final_bounces(fetches, url):

     return final_bounces

-def _build_youtube_dl(worker, destdir, site):
+def _build_youtube_dl(worker, destdir, site, page):
     '''
     Builds a yt-dlp `youtube_dl.YoutubeDL` for brozzling `site` with `worker`.

@@ -262,7 +262,7 @@ def _build_youtube_dl(worker, destdir, site):
         ydl_opts["proxy"] = "http://{}".format(worker._proxy_for(site))
     ydl = _YoutubeDL(ydl_opts)
     if site.extra_headers():
-        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers()))
+        ydl._opener.add_handler(ExtraHeaderAdder(site.extra_headers(page)))
     ydl.fetch_spy = YoutubeDLSpy()
     ydl.stitch_ups = []
     ydl._opener.add_handler(ydl.fetch_spy)
@@ -330,7 +330,7 @@ def _try_youtube_dl(worker, ydl, site, page):
                 warc_type="metadata",
                 content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
                 payload=info_json.encode("utf-8"),
-                extra_headers=site.extra_headers())
+                extra_headers=site.extra_headers(page))
         return ie_result
     except brozzler.ShutdownRequested as e:
         raise
@@ -374,7 +374,7 @@ def do_youtube_dl(worker, site, page):
         `list` of `str`: outlink urls
     '''
     with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
-        ydl = _build_youtube_dl(worker, tempdir, site)
+        ydl = _build_youtube_dl(worker, tempdir, site, page)
         ie_result = _try_youtube_dl(worker, ydl, site, page)
         outlinks = set()
         if ie_result and ie_result.get('extractor') == 'youtube:playlist':
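The page now also rides along into _build_youtube_dl(), so the ExtraHeaderAdder installed on yt-dlp's opener carries the same page-specific Warcprox-Meta as the browser's own requests. ExtraHeaderAdder's body is not part of this diff; the following is only a guess at the general shape of such a urllib pre-processor, under an assumed class name:

import urllib.request

class HeaderAddingHandler(urllib.request.BaseHandler):
    # Stamps fixed headers on every outgoing request made through an
    # opener this handler is registered with.
    def __init__(self, extra_headers):
        self.extra_headers = extra_headers

    def http_request(self, req):
        for name, value in self.extra_headers.items():
            if not req.has_header(name):
                req.add_header(name, value)
        return req

    https_request = http_request

A handler like this would be registered with ydl._opener.add_handler(...), as in the hunk above, so every HTTP(S) fetch yt-dlp makes passes through it.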
@@ -234,7 +234,7 @@ def test_proxy_down():

     # raw fetch
     with pytest.raises(brozzler.ProxyError):
-        worker._fetch_url(site, page.url)
+        worker._fetch_url(site, page=page)

     # WARCPROX_WRITE_RECORD
     with pytest.raises(brozzler.ProxyError):
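Net effect, for a page two link hops from its seed: the Warcprox-Meta header that warcprox receives now includes the hop trail alongside whatever metadata the job configured. Illustrative values only; the exact contents depend on crawl configuration:

import json

# assumed example values for a page reached seed -> /a -> /a/b
meta = {"metadata": {
        "hop_path": "LL",
        "brozzled_url": "https://example.com/a/b",
        "hop_via_url": "https://example.com/a"}}
print("Warcprox-Meta: " + json.dumps(meta, separators=(',', ':')))
# Warcprox-Meta: {"metadata":{"hop_path":"LL","brozzled_url":"https://example.com/a/b","hop_via_url":"https://example.com/a"}}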