diff --git a/.travis.yml b/.travis.yml
index c20872e..b8a50cb 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,7 +1,6 @@
 dist: xenial
 language: python
 python:
-- 3.4
 - 3.5
 - 3.6
 - 3.7
@@ -24,6 +23,8 @@ script:
 - DISPLAY=:1 py.test --tb=native -v tests
 after_failure:
 - chromium-browser --version
+- sudo kill -QUIT $(sudo svstat /etc/service/warcprox | egrep -o 'pid [0-9]+' | awk '{print $2}')
+- sudo kill -QUIT $(sudo svstat /etc/service/brozzler-worker | egrep -o 'pid [0-9]+' | awk '{print $2}')
 - sudo cat /var/log/warcprox.log
 - sudo cat /var/log/brozzler-worker.log
 - sudo cat /var/log/pywb.log
diff --git a/README.rst b/README.rst
index 85fe1f3..9f9c28a 100644
--- a/README.rst
+++ b/README.rst
@@ -19,7 +19,7 @@ Brozzler is designed to work in conjunction with warcprox for web archiving.
 Requirements
 ------------
 
-- Python 3.4 or later
+- Python 3.5 or later
 - RethinkDB deployment
 - Chromium or Google Chrome >= version 64
diff --git a/brozzler/browser.py b/brozzler/browser.py
index b59ce85..33c1ef1 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -361,8 +361,15 @@ class Browser:
         # disable google analytics and amp analytics
         self.send_to_chrome(
                 method='Network.setBlockedURLs',
-                params={'urls': ['*google-analytics.com/analytics.js',
-                                 '*google-analytics.com/ga.js',
+                params={'urls': ['*google-analytics.com/analytics.js*',
+                                 '*google-analytics.com/ga.js*',
+                                 '*google-analytics.com/ga_exp.js*',
+                                 '*google-analytics.com/urchin.js*',
+                                 '*google-analytics.com/collect*',
+                                 '*google-analytics.com/r/collect*',
+                                 '*google-analytics.com/__utm.gif*',
+                                 '*google-analytics.com/gtm/js?*',
+                                 '*google-analytics.com/cx/api.js*',
                                  '*cdn.ampproject.org/*/amp-analytics*.js']})
 
     def stop(self):
@@ -410,8 +417,9 @@ class Browser:
             on_request=None, on_response=None,
             on_service_worker_version_updated=None, on_screenshot=None,
             username=None, password=None, hashtags=None,
-            skip_extract_outlinks=False, skip_visit_hashtags=False,
-            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
+            screenshot_full_page=False, skip_extract_outlinks=False,
+            skip_visit_hashtags=False, skip_youtube_dl=False, page_timeout=300,
+            behavior_timeout=900):
         '''
         Browses page in browser.
@@ -486,12 +494,12 @@ class Browser:
                             'login navigated away from %s; returning!',
                             page_url)
                     self.navigate_to_page(page_url, timeout=page_timeout)
-            if on_screenshot:
-                self._try_screenshot(on_screenshot)
             behavior_script = brozzler.behavior_script(
                     page_url, behavior_parameters, behaviors_dir=behaviors_dir)
             self.run_behavior(behavior_script, timeout=behavior_timeout)
+            if on_screenshot:
+                self._try_screenshot(on_screenshot, screenshot_full_page)
             if skip_extract_outlinks:
                 outlinks = []
             else:
@@ -512,10 +520,15 @@ class Browser:
             self.websock_thread.on_request = None
             self.websock_thread.on_response = None
 
-    def _try_screenshot(self, on_screenshot):
+    def _try_screenshot(self, on_screenshot, full_page=False):
+        """The browser instance must be scrolled to the top of the page before
+        trying to get a screenshot.
+        """
+        self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True,
+                            params={'expression': 'window.scroll(0,0)'})
         for i in range(3):
             try:
-                jpeg_bytes = self.screenshot()
+                jpeg_bytes = self.screenshot(full_page)
                 on_screenshot(jpeg_bytes)
                 return
             except BrowsingTimeout as e:
@@ -581,8 +594,8 @@ class Browser:
         if ('result' in message and 'result' in message['result']
                 and 'value' in message['result']['result']):
             if message['result']['result']['value']:
-                return frozenset(
-                        message['result']['result']['value'].split('\n'))
+                return frozenset([str(urlcanon.whatwg(link)) for link in
+                        message['result']['result']['value'].split('\n')])
             else:
                 # no links found
                 return frozenset()
@@ -591,10 +604,36 @@ class Browser:
                     'problem extracting outlinks, result message: %s',
                     message)
             return frozenset()
 
-    def screenshot(self, timeout=45):
+    def screenshot(self, full_page=False, timeout=45):
+        """Optionally capture full page screenshot using puppeteer as an
+        inspiration:
+        https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898
+        """
         self.logger.info('taking screenshot')
+        if full_page:
+            self.websock_thread.expect_result(self._command_id.peek())
+            msg_id = self.send_to_chrome(method='Page.getLayoutMetrics')
+            self._wait_for(
+                    lambda: self.websock_thread.received_result(msg_id),
+                    timeout=timeout)
+            message = self.websock_thread.pop_result(msg_id)
+            width = message['result']['contentSize']['width']
+            height = message['result']['contentSize']['height']
+            clip = dict(x=0, y=0, width=width, height=height, scale=1)
+            deviceScaleFactor = 1
+            screenOrientation = {'angle': 0, 'type': 'portraitPrimary'}
+            self.send_to_chrome(
+                method='Emulation.setDeviceMetricsOverride',
+                params=dict(mobile=False, width=width, height=height,
+                            deviceScaleFactor=deviceScaleFactor,
+                            screenOrientation=screenOrientation)
+            )
+            capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip}
+        else:
+            capture_params = {'format': 'jpeg', 'quality': 95}
         self.websock_thread.expect_result(self._command_id.peek())
-        msg_id = self.send_to_chrome(method='Page.captureScreenshot')
+        msg_id = self.send_to_chrome(method='Page.captureScreenshot',
+                                     params=capture_params)
         self._wait_for(
                 lambda: self.websock_thread.received_result(msg_id),
                 timeout=timeout)
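For readers unfamiliar with the DevTools calls above: the full-page path measures the rendered document with Page.getLayoutMetrics, grows the emulated screen to that size with Emulation.setDeviceMetricsOverride, and passes a matching clip to Page.captureScreenshot, the same sequence puppeteer uses. Below is a minimal standalone sketch of that sequence, assuming Chrome is already listening on localhost:9222 and that the requests and websocket-client packages are installed; the target selection, helper name, and output path are illustrative only, not part of this patch.

import base64
import json

import requests
import websocket  # websocket-client

# attach to the first available debugging target (assumes it is a page)
target = requests.get('http://localhost:9222/json').json()[0]
ws = websocket.create_connection(target['webSocketDebuggerUrl'])

def cmd(msg_id, method, params=None):
    # send one DevTools command and block until its result arrives,
    # skipping any unrelated event notifications
    ws.send(json.dumps({'id': msg_id, 'method': method, 'params': params or {}}))
    while True:
        message = json.loads(ws.recv())
        if message.get('id') == msg_id:
            return message

# 1. measure the whole rendered document, not just the viewport
size = cmd(1, 'Page.getLayoutMetrics')['result']['contentSize']
width, height = int(size['width']), int(size['height'])

# 2. grow the emulated screen to cover the full document
cmd(2, 'Emulation.setDeviceMetricsOverride', {
    'mobile': False, 'width': width, 'height': height,
    'deviceScaleFactor': 1,
    'screenOrientation': {'angle': 0, 'type': 'portraitPrimary'}})

# 3. capture with a clip that spans the document
result = cmd(3, 'Page.captureScreenshot', {
    'format': 'jpeg', 'quality': 95,
    'clip': {'x': 0, 'y': 0, 'width': width, 'height': height, 'scale': 1}})
with open('/tmp/full_page.jpg', 'wb') as f:
    f.write(base64.b64decode(result['result']['data']))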
+ """ + self.send_to_chrome(method='Runtime.evaluate', suppress_logging=True, + params={'expression': 'window.scroll(0,0)'}) for i in range(3): try: - jpeg_bytes = self.screenshot() + jpeg_bytes = self.screenshot(full_page) on_screenshot(jpeg_bytes) return except BrowsingTimeout as e: @@ -581,8 +594,8 @@ class Browser: if ('result' in message and 'result' in message['result'] and 'value' in message['result']['result']): if message['result']['result']['value']: - return frozenset( - message['result']['result']['value'].split('\n')) + return frozenset([str(urlcanon.whatwg(link)) for link in + message['result']['result']['value'].split('\n')]) else: # no links found return frozenset() @@ -591,10 +604,36 @@ class Browser: 'problem extracting outlinks, result message: %s', message) return frozenset() - def screenshot(self, timeout=45): + def screenshot(self, full_page=False, timeout=45): + """Optionally capture full page screenshot using puppeteer as an + inspiration: + https://github.com/GoogleChrome/puppeteer/blob/master/lib/Page.js#L898 + """ self.logger.info('taking screenshot') + if full_page: + self.websock_thread.expect_result(self._command_id.peek()) + msg_id = self.send_to_chrome(method='Page.getLayoutMetrics') + self._wait_for( + lambda: self.websock_thread.received_result(msg_id), + timeout=timeout) + message = self.websock_thread.pop_result(msg_id) + width = message['result']['contentSize']['width'] + height = message['result']['contentSize']['height'] + clip = dict(x=0, y=0, width=width, height=height, scale=1) + deviceScaleFactor = 1 + screenOrientation = {'angle': 0, 'type': 'portraitPrimary'} + self.send_to_chrome( + method='Emulation.setDeviceMetricsOverride', + params=dict(mobile=False, width=width, height=height, + deviceScaleFactor=deviceScaleFactor, + screenOrientation=screenOrientation) + ) + capture_params = {'format': 'jpeg', 'quality': 95, 'clip': clip} + else: + capture_params = {'format': 'jpeg', 'quality': 95} self.websock_thread.expect_result(self._command_id.peek()) - msg_id = self.send_to_chrome(method='Page.captureScreenshot') + msg_id = self.send_to_chrome(method='Page.captureScreenshot', + params=capture_params) self._wait_for( lambda: self.websock_thread.received_result(msg_id), timeout=timeout) diff --git a/brozzler/cli.py b/brozzler/cli.py index 2db020c..1cb5912 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -153,6 +153,9 @@ def brozzle_page(argv=None): help='use this password to try to log in if a login form is found') arg_parser.add_argument( '--proxy', dest='proxy', default=None, help='http proxy') + arg_parser.add_argument( + '--screenshot-full-page', dest='screenshot_full_page', + action='store_true') arg_parser.add_argument( '--skip-extract-outlinks', dest='skip_extract_outlinks', action='store_true') @@ -174,19 +177,20 @@ def brozzle_page(argv=None): 'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters, 'username': args.username, 'password': args.password}) page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) - worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy, - skip_extract_outlinks=args.skip_extract_outlinks, - skip_visit_hashtags=args.skip_visit_hashtags, - skip_youtube_dl=args.skip_youtube_dl) + worker = brozzler.BrozzlerWorker( + frontier=None, proxy=args.proxy, + skip_extract_outlinks=args.skip_extract_outlinks, + skip_visit_hashtags=args.skip_visit_hashtags, + skip_youtube_dl=args.skip_youtube_dl, + screenshot_full_page=args.screenshot_full_page) - def on_screenshot(screenshot_png): - 
diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 0e3b777..6715eb3 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -314,7 +314,7 @@ class RethinkDbFrontier:
         '''
         existing_page.priority += fresh_page.priority
         existing_page.hashtags = list(set(
-            existing_page.hashtags + fresh_page.hashtags))
+            (existing_page.hashtags or []) + (fresh_page.hashtags or [])))
         existing_page.hops_off = min(
                 existing_page.hops_off, fresh_page.hops_off)
@@ -375,14 +375,18 @@ class RethinkDbFrontier:
                 decisions['accepted'].add(fresh_page.url)
                 if fresh_page.id in pages:
                     page = pages[fresh_page.id]
-                    page.hashtags = list(set((page.hashtags or [])
-                        + fresh_page.hashtags))
-                    page.priority += fresh_page.priority
+                    self._merge_page(page, fresh_page)
                     counts['updated'] += 1
                 else:
                     pages[fresh_page.id] = fresh_page
                     counts['added'] += 1
 
+        # make sure we're not stepping on our own toes in case we have a link
+        # back to parent_page, which I think happens because of hashtags
+        if parent_page.id in pages:
+            self._merge_page(parent_page, pages[parent_page.id])
+            del pages[parent_page.id]
+
         # insert/replace in batches of 50 to try to avoid this error:
         # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
         # there can be many pages and each one can be very large (many videos,
@@ -392,8 +396,11 @@ class RethinkDbFrontier:
             try:
                 self.logger.debug(
                         'inserting/replacing batch of %s pages', len(batch))
-                result = self.rr.table('pages').insert(
-                        batch, conflict='replace').run()
+                reql = self.rr.table('pages').insert(batch, conflict='replace')
+                self.logger.trace(
+                        'running query self.rr.table("pages").insert(%r, '
+                        'conflict="replace")', batch)
+                result = reql.run()
             except Exception as e:
                 self.logger.error(
                         'problem inserting/replacing batch of %s pages',
@@ -450,12 +457,15 @@ class RethinkDbFrontier:
         Returns:
             iterator of brozzler.Page
         '''
-        results = self.rr.table("pages").between(
+        query = self.rr.table("pages").between(
                 [site_id, 1 if brozzled is True else 0,
                     r.minval, r.minval],
                 [site_id, 0 if brozzled is False else r.maxval,
                     r.maxval, r.maxval],
-                index="priority_by_site").run()
+                index="priority_by_site")
+        self.logger.trace("running query: %r", query)
+        results = query.run()
         for result in results:
+            self.logger.trace("yielding result: %r", result)
             yield brozzler.Page(self.rr, result)
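The new `(... or [])` guards in _merge_page() matter because Page.hashtags can be None for a page discovered without any hashtag links, and `None + list` raises TypeError. A toy illustration of the merge semantics, using simple stand-in objects rather than real brozzler.Page instances:

class FakePage:
    # stand-in carrying just the fields _merge_page() touches
    def __init__(self, priority, hashtags, hops_off):
        self.priority = priority
        self.hashtags = hashtags
        self.hops_off = hops_off

def merge_page(existing_page, fresh_page):
    # same logic as RethinkDbFrontier._merge_page() in the diff above
    existing_page.priority += fresh_page.priority
    existing_page.hashtags = list(set(
        (existing_page.hashtags or []) + (fresh_page.hashtags or [])))
    existing_page.hops_off = min(
        existing_page.hops_off, fresh_page.hops_off)

existing = FakePage(priority=1, hashtags=None, hops_off=2)
fresh = FakePage(priority=2, hashtags=['#a', '#b'], hops_off=1)
merge_page(existing, fresh)  # would raise TypeError without the `or []`
assert existing.priority == 3
assert sorted(existing.hashtags) == ['#a', '#b']
assert existing.hops_off == 1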
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 5ce5499..4ef3121 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -50,7 +50,8 @@ class BrozzlerWorker:
             self, frontier, service_registry=None, max_browsers=1,
             chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
             skip_extract_outlinks=False, skip_visit_hashtags=False,
-            skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
+            skip_youtube_dl=False, screenshot_full_page=False,
+            page_timeout=300, behavior_timeout=900):
         self._frontier = frontier
         self._service_registry = service_registry
         self._max_browsers = max_browsers
@@ -62,6 +63,7 @@ class BrozzlerWorker:
         self._skip_extract_outlinks = skip_extract_outlinks
         self._skip_visit_hashtags = skip_visit_hashtags
         self._skip_youtube_dl = skip_youtube_dl
+        self._screenshot_full_page = screenshot_full_page
         self._page_timeout = page_timeout
         self._behavior_timeout = behavior_timeout
@@ -165,22 +167,16 @@ class BrozzlerWorker:
             raise brozzler.ProxyError(
                     'proxy error on WARCPROX_WRITE_RECORD %s' % url) from e
 
-    def full_and_thumb_jpegs(self, large_png):
-        # these screenshots never have any alpha (right?)
-        img = PIL.Image.open(io.BytesIO(large_png)).convert('RGB')
-
-        out = io.BytesIO()
-        img.save(out, "jpeg", quality=95)
-        full_jpeg = out.getbuffer()
-
+    def thumb_jpeg(self, full_jpeg):
+        """Create JPEG thumbnail.
+        """
+        img = PIL.Image.open(io.BytesIO(full_jpeg))
         thumb_width = 300
         thumb_height = (thumb_width / img.size[0]) * img.size[1]
         img.thumbnail((thumb_width, thumb_height))
         out = io.BytesIO()
         img.save(out, "jpeg", quality=95)
-        thumb_jpeg = out.getbuffer()
-
-        return full_jpeg, thumb_jpeg
+        return out.getbuffer()
 
     def brozzle_page(self, browser, site, page, on_screenshot=None,
             on_request=None, enable_youtube_dl=True):
@@ -226,15 +222,14 @@ class BrozzlerWorker:
         return outlinks
 
     def _browse_page(self, browser, site, page, on_screenshot=None,
             on_request=None):
-        def _on_screenshot(screenshot_png):
+        def _on_screenshot(screenshot_jpeg):
             if on_screenshot:
-                on_screenshot(screenshot_png)
+                on_screenshot(screenshot_jpeg)
             if self._using_warcprox(site):
                 self.logger.info(
                         "sending WARCPROX_WRITE_RECORD request to %s with "
                         "screenshot for %s", self._proxy_for(site), page)
-                screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
-                        screenshot_png)
+                thumbnail_jpeg = self.thumb_jpeg(screenshot_jpeg)
                 self._warcprox_write_record(
                         warcprox_address=self._proxy_for(site),
                         url="screenshot:%s" % str(urlcanon.semantic(page.url)),
@@ -302,6 +297,7 @@ class BrozzlerWorker:
                     skip_extract_outlinks=self._skip_extract_outlinks,
                     skip_visit_hashtags=self._skip_visit_hashtags,
                     skip_youtube_dl=self._skip_youtube_dl,
+                    screenshot_full_page=self._screenshot_full_page,
                     page_timeout=self._page_timeout,
                     behavior_timeout=self._behavior_timeout)
             if final_page_url != page.url:
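Because Chrome now returns JPEG bytes directly, the old PNG-to-JPEG conversion in full_and_thumb_jpegs() is unnecessary and only the thumbnail step survives as thumb_jpeg(). A self-contained sketch of the same Pillow calls; the input path here is a placeholder:

import io

import PIL.Image

with open('/tmp/screenshot.jpg', 'rb') as f:  # placeholder input file
    full_jpeg = f.read()

img = PIL.Image.open(io.BytesIO(full_jpeg))
thumb_width = 300
# scale the height to preserve the aspect ratio, as thumb_jpeg() does above
thumb_height = (thumb_width / img.size[0]) * img.size[1]
img.thumbnail((thumb_width, thumb_height))
out = io.BytesIO()
img.save(out, 'jpeg', quality=95)
thumb_jpeg = out.getbuffer()  # ready to be written as a WARC record payload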
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..18b7f86
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,6 @@
+# https://docs.pytest.org/en/latest/logging.html
+# https://github.com/pytest-dev/pytest/issues/5296
+[pytest]
+log_format = %(asctime)s.%(msecs)03d %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s
+log_date_format = %Y-%m-%d %H:%M:%S
+
diff --git a/setup.py b/setup.py
index ef5d945..d486e54 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.5.8',
+        version='1.5.11',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -95,7 +95,6 @@ setuptools.setup(
         'Development Status :: 5 - Production/Stable',
         'Environment :: Console',
         'License :: OSI Approved :: Apache Software License',
-        'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
diff --git a/tests/test_cluster.py b/tests/test_cluster.py
index e04624b..fcff145 100644
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@@ -32,6 +32,7 @@ import requests
 import subprocess
 import http.server
 import logging
+import sys
 import warcprox
 
 # https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index 800da1e..64f7ab5 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -733,7 +733,7 @@ def test_hashtag_seed():
     assert pages[0].hashtags == ['#hash',]
 
 def test_hashtag_links():
-    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    rr = doublethink.Rethinker('localhost', db='test_hashtag_links')
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(rr, {'seed': 'http://example.org/'})