mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #88 from nlevitt/block-urls
Block google analytics URLs using new Network.setBlockedURLs API
This commit is contained in:
commit
057284c2a7
@ -22,7 +22,7 @@ Requirements
|
||||
|
||||
- Python 3.4 or later
|
||||
- RethinkDB deployment
|
||||
- Chromium or Google Chrome browser
|
||||
- Chromium or Google Chrome >= version 64
|
||||
|
||||
Worth noting is that the browser requires a graphical environment to run. You
|
||||
already have this on your laptop, but on a server it will probably require
|
||||
|
@ -200,21 +200,6 @@ class WebsockReceiverThread(threading.Thread):
|
||||
'uncaught exception in _handle_message message=%s',
|
||||
message, exc_info=True)
|
||||
|
||||
def _debugger_paused(self, message):
    '''
    Handles a `Debugger.paused` message by neutering the paused script.

    Replaces the source of the script at the top call frame with a
    harmless `console.log()` statement, then resumes execution.

    Args:
        message: decoded `Debugger.paused` message (dict); the script id
            is read from
            `message['params']['callFrames'][0]['location']['scriptId']`
    '''
    # we hit the breakpoint set in start(), get rid of google analytics
    self.logger.debug('debugger paused! message=%s', message)
    script_id = message['params']['callFrames'][0]['location']['scriptId']

    # replace script
    replace_script = dict(
            id=0, method='Debugger.setScriptSource',
            params={
                'scriptId': script_id,
                'scriptSource': 'console.log("google analytics is no more!");'})
    self.websock.send(json.dumps(replace_script))

    # resume execution (note: both messages are sent with id 0)
    self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
|
||||
|
||||
def _network_response_received(self, message):
|
||||
if (message['params']['response']['status'] == 420
|
||||
and 'Warcprox-Meta' in CaseInsensitiveDict(
|
||||
@ -255,8 +240,6 @@ class WebsockReceiverThread(threading.Thread):
|
||||
elif message['method'] == 'Network.requestWillBeSent':
|
||||
if self.on_request:
|
||||
self.on_request(message)
|
||||
elif message['method'] == 'Debugger.paused':
|
||||
self._debugger_paused(message)
|
||||
elif message['method'] == 'Page.interstitialShown':
|
||||
# for AITFIVE-1529: handle http auth
|
||||
# for now, we should consider killing the browser when we receive Page.interstitialShown and
|
||||
@ -358,16 +341,14 @@ class Browser:
|
||||
self.send_to_chrome(method='Network.enable')
|
||||
self.send_to_chrome(method='Page.enable')
|
||||
self.send_to_chrome(method='Console.enable')
|
||||
self.send_to_chrome(method='Debugger.enable')
|
||||
self.send_to_chrome(method='Runtime.enable')
|
||||
|
||||
# disable google analytics, see _handle_message() where breakpoint
|
||||
# is caught Debugger.paused
|
||||
# disable google analytics
|
||||
self.send_to_chrome(
|
||||
method='Debugger.setBreakpointByUrl',
|
||||
params={
|
||||
'lineNumber': 1,
|
||||
'urlRegex': 'https?://www.google-analytics.com/analytics.js'})
|
||||
method='Network.setBlockedURLs',
|
||||
params={'urls': ['*google-analytics.com/analytics.js',
|
||||
'*google-analytics.com/ga.js']}
|
||||
)
|
||||
|
||||
def stop(self):
|
||||
'''
|
||||
|
@ -29,6 +29,35 @@ import signal
|
||||
import sqlite3
|
||||
import json
|
||||
import tempfile
|
||||
import sys
|
||||
|
||||
def check_version(chrome_exe):
    '''
    Raises SystemExit if `chrome_exe` is not a supported browser version.

    Runs `chrome_exe --version` (with a 60 second timeout), extracts the
    version number from the output, and exits unless the major version is
    64 or later.

    Must run in the main thread to have the desired effect.

    Exceptions raised by `subprocess.check_output()` itself are not caught
    here and propagate to the caller.

    Args:
        chrome_exe: path or command name of the chrome/chromium executable

    Raises:
        SystemExit: if no version can be parsed from the output, or if the
            major version is less than 64
    '''
    # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
    # Google Chrome 64.0.3282.140
    # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
    # Google Chrome 66.0.3341.0 canary
    # linux$ chromium-browser --version
    # Using PPAPI flash.
    # --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
    # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
    cmd = [chrome_exe, '--version']
    out = subprocess.check_output(cmd, timeout=60)
    # the version must start with a digit, so that a dots-only token can
    # never reach int() below and blow up with ValueError
    m = re.search(br'(Chromium|Google Chrome) (\d+(?:\.\d+)*)', out)
    if not m:
        sys.exit(
                'unable to parse browser version from output of '
                '%r: %r' % (subprocess.list2cmdline(cmd), out))
    version_str = m.group(2).decode()
    major_version = int(version_str.split('.')[0])
    if major_version < 64:
        sys.exit('brozzler requires chrome/chromium version 64 or '
                 'later but %s reports version %s' % (
                     chrome_exe, version_str))
|
||||
|
||||
class Chrome:
|
||||
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
||||
|
@ -167,6 +167,7 @@ def brozzle_page(argv=None):
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
configure_logging(args)
|
||||
brozzler.chrome.check_version(args.chrome_exe)
|
||||
|
||||
behavior_parameters = {}
|
||||
if args.behavior_parameters:
|
||||
@ -325,6 +326,7 @@ def brozzler_worker(argv=None):
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
configure_logging(args)
|
||||
brozzler.chrome.check_version(args.chrome_exe)
|
||||
|
||||
def dump_state(signum, frame):
|
||||
signal.signal(signal.SIGQUIT, signal.SIG_IGN)
|
||||
|
@ -268,6 +268,7 @@ def main(argv=None):
|
||||
arg_parser = _build_arg_parser(argv)
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
brozzler.cli.configure_logging(args)
|
||||
brozzler.chrome.check_version(args.chrome_exe)
|
||||
|
||||
controller = BrozzlerEasyController(args)
|
||||
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
||||
|
Loading…
x
Reference in New Issue
Block a user