mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Let youtube-dl write to a temporary directory instead of /dev/null, to fix errors like this: "youtube_dl.utils.DownloadError: ERROR: unable to open for writing: [Errno 13] Permission denied: '/dev/null-Frag0.part'"
This commit is contained in:
parent
772bcf0df6
commit
e64a4d6985
@ -108,7 +108,6 @@ def brozzle_page():
|
|||||||
enable_warcprox_features=args.enable_warcprox_features)
|
enable_warcprox_features=args.enable_warcprox_features)
|
||||||
page = brozzler.Page(url=args.url, site_id=site.id)
|
page = brozzler.Page(url=args.url, site_id=site.id)
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None)
|
worker = brozzler.BrozzlerWorker(frontier=None)
|
||||||
ydl = worker._youtube_dl(site)
|
|
||||||
|
|
||||||
def on_screenshot(screenshot_png):
|
def on_screenshot(screenshot_png):
|
||||||
OK_CHARS = (string.ascii_letters + string.digits)
|
OK_CHARS = (string.ascii_letters + string.digits)
|
||||||
@ -124,7 +123,7 @@ def brozzle_page():
|
|||||||
browser.start(proxy=site.proxy)
|
browser.start(proxy=site.proxy)
|
||||||
try:
|
try:
|
||||||
outlinks = worker.brozzle_page(
|
outlinks = worker.brozzle_page(
|
||||||
browser, ydl, site, page, on_screenshot=on_screenshot)
|
browser, site, page, on_screenshot=on_screenshot)
|
||||||
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
|
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
logging.error('reached limit %s', e)
|
logging.error('reached limit %s', e)
|
||||||
|
@ -1,22 +1,22 @@
|
|||||||
#
|
'''
|
||||||
# brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
|
||||||
# it runs youtube-dl on them, browses them and runs behaviors if appropriate,
|
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
|
||||||
# scopes and adds outlinks to the frontier
|
scopes and adds outlinks to the frontier
|
||||||
#
|
|
||||||
# Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2016 Internet Archive
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
# You may obtain a copy of the License at
|
You may obtain a copy of the License at
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
Unless required by applicable law or agreed to in writing, software
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
limitations under the License.
|
||||||
#
|
'''
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
@ -35,6 +35,7 @@ import datetime
|
|||||||
import collections
|
import collections
|
||||||
import requests
|
import requests
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
|
import tempfile
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||||
def __init__(self, extra_headers):
|
def __init__(self, extra_headers):
|
||||||
@ -101,9 +102,9 @@ class BrozzlerWorker:
|
|||||||
chrome_exe=chrome_exe, ignore_cert_errors=True)
|
chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||||
self._shutdown_requested = threading.Event()
|
self._shutdown_requested = threading.Event()
|
||||||
|
|
||||||
def _youtube_dl(self, site):
|
def _youtube_dl(self, destdir, site):
|
||||||
ydl_opts = {
|
ydl_opts = {
|
||||||
"outtmpl": "/dev/null",
|
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
|
||||||
"verbose": False,
|
"verbose": False,
|
||||||
"retries": 1,
|
"retries": 1,
|
||||||
"logger": logging.getLogger("youtube_dl"),
|
"logger": logging.getLogger("youtube_dl"),
|
||||||
@ -194,7 +195,7 @@ class BrozzlerWorker:
|
|||||||
|
|
||||||
return full_jpeg, thumb_jpeg
|
return full_jpeg, thumb_jpeg
|
||||||
|
|
||||||
def brozzle_page(self, browser, ydl, site, page, on_screenshot=None):
|
def brozzle_page(self, browser, site, page, on_screenshot=None):
|
||||||
def _on_screenshot(screenshot_png):
|
def _on_screenshot(screenshot_png):
|
||||||
if on_screenshot:
|
if on_screenshot:
|
||||||
on_screenshot(screenshot_png)
|
on_screenshot(screenshot_png)
|
||||||
@ -215,8 +216,10 @@ class BrozzlerWorker:
|
|||||||
extra_headers=site.extra_headers())
|
extra_headers=site.extra_headers())
|
||||||
|
|
||||||
self.logger.info("brozzling {}".format(page))
|
self.logger.info("brozzling {}".format(page))
|
||||||
ydl.brozzler_spy.reset()
|
|
||||||
try:
|
try:
|
||||||
|
with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
|
||||||
|
ydl = self._youtube_dl(tempdir, site)
|
||||||
|
ydl_spy = ydl.brozzler_spy # remember for later
|
||||||
self._try_youtube_dl(ydl, site, page)
|
self._try_youtube_dl(ydl, site, page)
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
raise
|
raise
|
||||||
@ -224,7 +227,7 @@ class BrozzlerWorker:
|
|||||||
self.logger.error("youtube_dl raised exception on %s",
|
self.logger.error("youtube_dl raised exception on %s",
|
||||||
page, exc_info=True)
|
page, exc_info=True)
|
||||||
|
|
||||||
if self._needs_browsing(page, ydl.brozzler_spy):
|
if self._needs_browsing(page, ydl_spy):
|
||||||
self.logger.info('needs browsing: %s', page)
|
self.logger.info('needs browsing: %s', page)
|
||||||
if not browser.is_running():
|
if not browser.is_running():
|
||||||
browser.start(proxy=site.proxy)
|
browser.start(proxy=site.proxy)
|
||||||
@ -234,7 +237,7 @@ class BrozzlerWorker:
|
|||||||
on_url_change=page.note_redirect)
|
on_url_change=page.note_redirect)
|
||||||
return outlinks
|
return outlinks
|
||||||
else:
|
else:
|
||||||
if not self._already_fetched(page, ydl.brozzler_spy):
|
if not self._already_fetched(page, ydl_spy):
|
||||||
self.logger.info('needs fetch: %s', page)
|
self.logger.info('needs fetch: %s', page)
|
||||||
self._fetch_url(site, page)
|
self._fetch_url(site, page)
|
||||||
else:
|
else:
|
||||||
@ -272,7 +275,7 @@ class BrozzlerWorker:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _brozzle_site(self, browser, ydl, site):
|
def _brozzle_site(self, browser, site):
|
||||||
start = time.time()
|
start = time.time()
|
||||||
page = None
|
page = None
|
||||||
try:
|
try:
|
||||||
@ -282,7 +285,7 @@ class BrozzlerWorker:
|
|||||||
page = self._frontier.claim_page(site,
|
page = self._frontier.claim_page(site,
|
||||||
"{}:{}".format(
|
"{}:{}".format(
|
||||||
socket.gethostname(), browser.chrome_port))
|
socket.gethostname(), browser.chrome_port))
|
||||||
outlinks = self.brozzle_page(browser, ydl, site, page)
|
outlinks = self.brozzle_page(browser, site, page)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
self._frontier.scope_and_schedule_outlinks(site, page, outlinks)
|
||||||
page = None
|
page = None
|
||||||
@ -337,10 +340,9 @@ class BrozzlerWorker:
|
|||||||
site = self._frontier.claim_site("{}:{}".format(
|
site = self._frontier.claim_site("{}:{}".format(
|
||||||
socket.gethostname(), browser.chrome_port))
|
socket.gethostname(), browser.chrome_port))
|
||||||
self.logger.info("brozzling site %s", site)
|
self.logger.info("brozzling site %s", site)
|
||||||
ydl = self._youtube_dl(site)
|
|
||||||
th = threading.Thread(
|
th = threading.Thread(
|
||||||
target=lambda: self._brozzle_site(
|
target=lambda: self._brozzle_site(
|
||||||
browser, ydl, site),
|
browser, site),
|
||||||
name="BrowsingThread:{}-{}".format(
|
name="BrowsingThread:{}-{}".format(
|
||||||
browser.chrome_port, site.seed))
|
browser.chrome_port, site.seed))
|
||||||
th.start()
|
th.start()
|
||||||
|
2
setup.py
2
setup.py
@ -21,7 +21,7 @@ import setuptools
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev26',
|
version='1.1.dev27',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user