mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-12-14 16:19:00 -05:00
make it work again, and list discovered outlinks
This commit is contained in:
parent
8b45d7eb69
commit
3af1e10e13
1 changed file with 3 additions and 2 deletions
|
|
@@ -38,13 +38,14 @@ site = brozzler.Site(id=-1, seed=args.url, proxy=args.proxy,
|
||||||
enable_warcprox_features=args.enable_warcprox_features,
|
enable_warcprox_features=args.enable_warcprox_features,
|
||||||
extra_headers=extra_headers)
|
extra_headers=extra_headers)
|
||||||
page = brozzler.Page(url=args.url, site_id=site.id)
|
page = brozzler.Page(url=args.url, site_id=site.id)
|
||||||
worker = brozzler.BrozzlerWorker()
|
worker = brozzler.BrozzlerWorker(frontier=None)
|
||||||
ydl = worker._youtube_dl(site)
|
ydl = worker._youtube_dl(site)
|
||||||
|
|
||||||
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
|
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
|
||||||
browser.start(proxy=site.proxy)
|
browser.start(proxy=site.proxy)
|
||||||
try:
|
try:
|
||||||
worker.brozzle_page(browser, ydl, site, page)
|
outlinks = worker.brozzle_page(browser, ydl, site, page)
|
||||||
|
logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
logging.error("reached limit %s", e)
|
logging.error("reached limit %s", e)
|
||||||
finally:
|
finally:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue