determine if youtube-dl can do something with a url

This commit is contained in:
Noah Levitt 2015-07-13 16:40:56 -07:00
parent 6470a8ef26
commit 3eff099b16
2 changed files with 11 additions and 0 deletions

View File

@ -14,6 +14,7 @@ import kombu
from umbra import hq from umbra import hq
import pprint import pprint
import traceback import traceback
import youtube_dl
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description='crawl-url - browse urls, follow links', description='crawl-url - browse urls, follow links',
@ -62,6 +63,13 @@ def disclaim_site(site):
logging.info("putting {} on queue {}".format(site, q.queue.name)) logging.info("putting {} on queue {}".format(site, q.queue.name))
q.put(site.to_dict()) q.put(site.to_dict())
# Build the youtube-dl extractor list once at import time so that
# ydl_suitable() can reuse it for every url instead of regenerating it.
ydl_extractors = youtube_dl.extractor.gen_extractors()
def ydl_suitable(url, extractors=None):
    """Return True if a site-specific youtube-dl extractor accepts *url*.

    youtube-dl's catch-all "generic" extractor reports every url as
    suitable, so it is skipped here; counting it would make this function
    return True for any input.

    Args:
        url: url string to test.
        extractors: optional iterable of extractor objects exposing
            ``suitable(url)`` and ``IE_NAME``. Defaults to the module-level
            ``ydl_extractors`` list.

    Returns:
        bool: True when at least one non-generic extractor reports the url
        as suitable, False otherwise (including when there are no
        extractors at all).
    """
    if extractors is None:
        extractors = ydl_extractors
    return any(
        ie.suitable(url)
        for ie in extractors
        # The generic fallback matches everything and carries no signal.
        if getattr(ie, "IE_NAME", None) != "generic"
    )
def brozzle_site(site, browser): def brozzle_site(site, browser):
start = time.time() start = time.time()
try: try:
@ -70,6 +78,8 @@ def brozzle_site(site, browser):
try: try:
crawl_url = next_url(site) crawl_url = next_url(site)
logging.info("crawling {}".format(crawl_url)) logging.info("crawling {}".format(crawl_url))
if ydl_suitable(crawl_url.url):
logging.info("youtube-dl suitable for {}".format(crawl_url))
crawl_url.outlinks = browser.browse_page(crawl_url.url) crawl_url.outlinks = browser.browse_page(crawl_url.url)
completed_url(site, crawl_url) completed_url(site, crawl_url)
except kombu.simple.Empty: except kombu.simple.Empty:

View File

@ -4,3 +4,4 @@ argparse
PyYAML PyYAML
sortedcontainers sortedcontainers
git+https://github.com/ikreymer/surt.git@py3 git+https://github.com/ikreymer/surt.git@py3
youtube_dl