mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
determine if youtube-dl can do something with a url
This commit is contained in:
parent
6470a8ef26
commit
3eff099b16
@ -14,6 +14,7 @@ import kombu
|
|||||||
from umbra import hq
|
from umbra import hq
|
||||||
import pprint
|
import pprint
|
||||||
import traceback
|
import traceback
|
||||||
|
import youtube_dl
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
|
||||||
description='crawl-url - browse urls, follow links',
|
description='crawl-url - browse urls, follow links',
|
||||||
@ -62,6 +63,13 @@ def disclaim_site(site):
|
|||||||
logging.info("putting {} on queue {}".format(site, q.queue.name))
|
logging.info("putting {} on queue {}".format(site, q.queue.name))
|
||||||
q.put(site.to_dict())
|
q.put(site.to_dict())
|
||||||
|
|
||||||
|
# Build youtube-dl's extractor list once at import time; ydl_suitable()
# iterates over it for every url it is asked about.
ydl_extractors = youtube_dl.extractor.gen_extractors()
|
||||||
|
def ydl_suitable(url):
    """Return True if a site-specific youtube-dl extractor accepts `url`.

    youtube-dl's catch-all "generic" extractor reports itself as suitable
    for every url, so counting it would make this check return True
    unconditionally. It is skipped here so that a True result means a
    dedicated extractor recognizes the url.

    Args:
        url: the url string to test against the module-level
            `ydl_extractors` list.

    Returns:
        True if at least one non-generic extractor's `suitable(url)` is
        truthy, False otherwise (including when the list is empty).
    """
    return any(
        ie.suitable(url)
        for ie in ydl_extractors
        if ie.IE_NAME != 'generic')
|
||||||
|
|
||||||
def brozzle_site(site, browser):
|
def brozzle_site(site, browser):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
try:
|
try:
|
||||||
@ -70,6 +78,8 @@ def brozzle_site(site, browser):
|
|||||||
try:
|
try:
|
||||||
crawl_url = next_url(site)
|
crawl_url = next_url(site)
|
||||||
logging.info("crawling {}".format(crawl_url))
|
logging.info("crawling {}".format(crawl_url))
|
||||||
|
if ydl_suitable(crawl_url.url):
|
||||||
|
logging.info("youtube-dl suitable for {}".format(crawl_url))
|
||||||
crawl_url.outlinks = browser.browse_page(crawl_url.url)
|
crawl_url.outlinks = browser.browse_page(crawl_url.url)
|
||||||
completed_url(site, crawl_url)
|
completed_url(site, crawl_url)
|
||||||
except kombu.simple.Empty:
|
except kombu.simple.Empty:
|
||||||
|
@ -4,3 +4,4 @@ argparse
|
|||||||
PyYAML
|
PyYAML
|
||||||
sortedcontainers
|
sortedcontainers
|
||||||
git+https://github.com/ikreymer/surt.git@py3
|
git+https://github.com/ikreymer/surt.git@py3
|
||||||
|
youtube_dl
|
||||||
|
Loading…
x
Reference in New Issue
Block a user