Let youtube-dl write to a temporary directory instead of /dev/null, to fix errors like this: "youtube_dl.utils.DownloadError: ERROR: unable to open for writing: [Errno 13] Permission denied: '/dev/null-Frag0.part'"

This commit is contained in:
Noah Levitt 2016-06-28 13:56:30 -05:00
parent 772bcf0df6
commit e64a4d6985
3 changed files with 34 additions and 33 deletions

View File

@ -108,7 +108,6 @@ def brozzle_page():
enable_warcprox_features=args.enable_warcprox_features) enable_warcprox_features=args.enable_warcprox_features)
page = brozzler.Page(url=args.url, site_id=site.id) page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker(frontier=None) worker = brozzler.BrozzlerWorker(frontier=None)
ydl = worker._youtube_dl(site)
def on_screenshot(screenshot_png): def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits) OK_CHARS = (string.ascii_letters + string.digits)
@ -124,7 +123,7 @@ def brozzle_page():
browser.start(proxy=site.proxy) browser.start(proxy=site.proxy)
try: try:
outlinks = worker.brozzle_page( outlinks = worker.brozzle_page(
browser, ydl, site, page, on_screenshot=on_screenshot) browser, site, page, on_screenshot=on_screenshot)
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks))) logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
logging.error('reached limit %s', e) logging.error('reached limit %s', e)

View File

@ -1,22 +1,22 @@
# '''
# brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
# it runs youtube-dl on them, browses them and runs behaviors if appropriate, it runs youtube-dl on them, browses them and runs behaviors if appropriate,
# scopes and adds outlinks to the frontier scopes and adds outlinks to the frontier
#
# Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
# You may obtain a copy of the License at You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and See the License for the specific language governing permissions and
# limitations under the License. limitations under the License.
# '''
import os import os
import logging import logging
@ -35,6 +35,7 @@ import datetime
import collections import collections
import requests import requests
import rethinkstuff import rethinkstuff
import tempfile
class ExtraHeaderAdder(urllib.request.BaseHandler): class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers): def __init__(self, extra_headers):
@ -101,9 +102,9 @@ class BrozzlerWorker:
chrome_exe=chrome_exe, ignore_cert_errors=True) chrome_exe=chrome_exe, ignore_cert_errors=True)
self._shutdown_requested = threading.Event() self._shutdown_requested = threading.Event()
def _youtube_dl(self, site): def _youtube_dl(self, destdir, site):
ydl_opts = { ydl_opts = {
"outtmpl": "/dev/null", "outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
"verbose": False, "verbose": False,
"retries": 1, "retries": 1,
"logger": logging.getLogger("youtube_dl"), "logger": logging.getLogger("youtube_dl"),
@ -194,7 +195,7 @@ class BrozzlerWorker:
return full_jpeg, thumb_jpeg return full_jpeg, thumb_jpeg
def brozzle_page(self, browser, ydl, site, page, on_screenshot=None): def brozzle_page(self, browser, site, page, on_screenshot=None):
def _on_screenshot(screenshot_png): def _on_screenshot(screenshot_png):
if on_screenshot: if on_screenshot:
on_screenshot(screenshot_png) on_screenshot(screenshot_png)
@ -215,16 +216,18 @@ class BrozzlerWorker:
extra_headers=site.extra_headers()) extra_headers=site.extra_headers())
self.logger.info("brozzling {}".format(page)) self.logger.info("brozzling {}".format(page))
ydl.brozzler_spy.reset()
try: try:
self._try_youtube_dl(ydl, site, page) with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
ydl = self._youtube_dl(tempdir, site)
ydl_spy = ydl.brozzler_spy # remember for later
self._try_youtube_dl(ydl, site, page)
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
raise raise
except: except:
self.logger.error("youtube_dl raised exception on %s", self.logger.error("youtube_dl raised exception on %s",
page, exc_info=True) page, exc_info=True)
if self._needs_browsing(page, ydl.brozzler_spy): if self._needs_browsing(page, ydl_spy):
self.logger.info('needs browsing: %s', page) self.logger.info('needs browsing: %s', page)
if not browser.is_running(): if not browser.is_running():
browser.start(proxy=site.proxy) browser.start(proxy=site.proxy)
@ -234,7 +237,7 @@ class BrozzlerWorker:
on_url_change=page.note_redirect) on_url_change=page.note_redirect)
return outlinks return outlinks
else: else:
if not self._already_fetched(page, ydl.brozzler_spy): if not self._already_fetched(page, ydl_spy):
self.logger.info('needs fetch: %s', page) self.logger.info('needs fetch: %s', page)
self._fetch_url(site, page) self._fetch_url(site, page)
else: else:
@ -272,7 +275,7 @@ class BrozzlerWorker:
return True return True
return False return False
def _brozzle_site(self, browser, ydl, site): def _brozzle_site(self, browser, site):
start = time.time() start = time.time()
page = None page = None
try: try:
@ -282,7 +285,7 @@ class BrozzlerWorker:
page = self._frontier.claim_page(site, page = self._frontier.claim_page(site,
"{}:{}".format( "{}:{}".format(
socket.gethostname(), browser.chrome_port)) socket.gethostname(), browser.chrome_port))
outlinks = self.brozzle_page(browser, ydl, site, page) outlinks = self.brozzle_page(browser, site, page)
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks(site, page, outlinks) self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
page = None page = None
@ -337,10 +340,9 @@ class BrozzlerWorker:
site = self._frontier.claim_site("{}:{}".format( site = self._frontier.claim_site("{}:{}".format(
socket.gethostname(), browser.chrome_port)) socket.gethostname(), browser.chrome_port))
self.logger.info("brozzling site %s", site) self.logger.info("brozzling site %s", site)
ydl = self._youtube_dl(site)
th = threading.Thread( th = threading.Thread(
target=lambda: self._brozzle_site( target=lambda: self._brozzle_site(
browser, ydl, site), browser, site),
name="BrowsingThread:{}-{}".format( name="BrowsingThread:{}-{}".format(
browser.chrome_port, site.seed)) browser.chrome_port, site.seed))
th.start() th.start()

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1.dev26', version='1.1.dev27',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',