mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #88 from nlevitt/block-urls
Block google analytics URLs using new Network.setBlockedURLs API
This commit is contained in:
commit
057284c2a7
@ -22,7 +22,7 @@ Requirements
|
|||||||
|
|
||||||
- Python 3.4 or later
|
- Python 3.4 or later
|
||||||
- RethinkDB deployment
|
- RethinkDB deployment
|
||||||
- Chromium or Google Chrome browser
|
- Chromium or Google Chrome >= version 64
|
||||||
|
|
||||||
Worth noting is that the browser requires a graphical environment to run. You
|
Worth noting is that the browser requires a graphical environment to run. You
|
||||||
already have this on your laptop, but on a server it will probably require
|
already have this on your laptop, but on a server it will probably require
|
||||||
|
@ -200,21 +200,6 @@ class WebsockReceiverThread(threading.Thread):
|
|||||||
'uncaught exception in _handle_message message=%s',
|
'uncaught exception in _handle_message message=%s',
|
||||||
message, exc_info=True)
|
message, exc_info=True)
|
||||||
|
|
||||||
def _debugger_paused(self, message):
    '''
    Handles a `Debugger.paused` event: replaces the paused script with a
    harmless stub and resumes execution.

    `message` is the decoded `Debugger.paused` event; the script to replace
    is taken from `message['params']['callFrames'][0]['location']`.

    `Debugger.resume` is always sent, even if the script id cannot be
    extracted or the replacement request fails, so that the page is never
    left stuck at the breakpoint. Any exception raised along the way still
    propagates to the caller.
    '''
    # we hit the breakpoint set in start(), get rid of google analytics
    self.logger.debug('debugger paused! message=%s', message)
    try:
        scriptId = message['params']['callFrames'][0]['location']['scriptId']

        # replace script
        self.websock.send(
                json.dumps(dict(
                    id=0, method='Debugger.setScriptSource',
                    params={'scriptId': scriptId,
                        'scriptSource': 'console.log("google analytics is no more!");'})))
    finally:
        # resume execution unconditionally; without this a malformed event
        # would leave the debugger paused
        self.websock.send(json.dumps(dict(id=0, method='Debugger.resume')))
|
|
||||||
|
|
||||||
def _network_response_received(self, message):
|
def _network_response_received(self, message):
|
||||||
if (message['params']['response']['status'] == 420
|
if (message['params']['response']['status'] == 420
|
||||||
and 'Warcprox-Meta' in CaseInsensitiveDict(
|
and 'Warcprox-Meta' in CaseInsensitiveDict(
|
||||||
@ -255,8 +240,6 @@ class WebsockReceiverThread(threading.Thread):
|
|||||||
elif message['method'] == 'Network.requestWillBeSent':
|
elif message['method'] == 'Network.requestWillBeSent':
|
||||||
if self.on_request:
|
if self.on_request:
|
||||||
self.on_request(message)
|
self.on_request(message)
|
||||||
elif message['method'] == 'Debugger.paused':
|
|
||||||
self._debugger_paused(message)
|
|
||||||
elif message['method'] == 'Page.interstitialShown':
|
elif message['method'] == 'Page.interstitialShown':
|
||||||
# for AITFIVE-1529: handle http auth
|
# for AITFIVE-1529: handle http auth
|
||||||
# for now, we should consider killing the browser when we receive Page.interstitialShown and
|
# for now, we should consider killing the browser when we receive Page.interstitialShown and
|
||||||
@ -358,16 +341,14 @@ class Browser:
|
|||||||
self.send_to_chrome(method='Network.enable')
|
self.send_to_chrome(method='Network.enable')
|
||||||
self.send_to_chrome(method='Page.enable')
|
self.send_to_chrome(method='Page.enable')
|
||||||
self.send_to_chrome(method='Console.enable')
|
self.send_to_chrome(method='Console.enable')
|
||||||
self.send_to_chrome(method='Debugger.enable')
|
|
||||||
self.send_to_chrome(method='Runtime.enable')
|
self.send_to_chrome(method='Runtime.enable')
|
||||||
|
|
||||||
# disable google analytics, see _handle_message() where breakpoint
|
# disable google analytics
|
||||||
# is caught Debugger.paused
|
|
||||||
self.send_to_chrome(
|
self.send_to_chrome(
|
||||||
method='Debugger.setBreakpointByUrl',
|
method='Network.setBlockedURLs',
|
||||||
params={
|
params={'urls': ['*google-analytics.com/analytics.js',
|
||||||
'lineNumber': 1,
|
'*google-analytics.com/ga.js']}
|
||||||
'urlRegex': 'https?://www.google-analytics.com/analytics.js'})
|
)
|
||||||
|
|
||||||
def stop(self):
|
def stop(self):
|
||||||
'''
|
'''
|
||||||
|
@ -29,6 +29,35 @@ import signal
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import json
|
import json
|
||||||
import tempfile
|
import tempfile
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def check_version(chrome_exe):
    '''
    Raises SystemExit if `chrome_exe` is not a supported browser version.

    Runs `chrome_exe --version`, extracts the version number that follows
    "Chromium" or "Google Chrome" in the output, and exits with an
    explanatory message if the version cannot be parsed or the major
    version is lower than 64.

    Exceptions raised by `subprocess.check_output` (for example when the
    executable does not exist, exits non-zero, or takes longer than 60
    seconds) are not caught here and propagate to the caller.

    Must run in the main thread to have the desired effect.
    '''
    # mac$ /Applications/Google\ Chrome.app/Contents/MacOS/Google\ Chrome --version
    # Google Chrome 64.0.3282.140
    # mac$ /Applications/Google\ Chrome\ Canary.app/Contents/MacOS/Google\ Chrome\ Canary --version
    # Google Chrome 66.0.3341.0 canary
    # linux$ chromium-browser --version
    # Using PPAPI flash.
    #  --ppapi-flash-path=/usr/lib/adobe-flashplugin/libpepflashplayer.so --ppapi-flash-version=
    # Chromium 61.0.3163.100 Built on Ubuntu , running on Ubuntu 16.04
    cmd = [chrome_exe, '--version']
    out = subprocess.check_output(cmd, timeout=60)
    # the version must start with digits; a dot-only token such as "..."
    # is treated as unparseable instead of crashing in int('') below
    m = re.search(br'(Chromium|Google Chrome) (\d+(?:\.\d+)*)', out)
    if not m:
        sys.exit(
                'unable to parse browser version from output of '
                '%r: %r' % (subprocess.list2cmdline(cmd), out))
    version_str = m.group(2).decode()
    major_version = int(version_str.split('.')[0])
    if major_version < 64:
        sys.exit('brozzler requires chrome/chromium version 64 or '
                 'later but %s reports version %s' % (
                     chrome_exe, version_str))
|
||||||
|
|
||||||
class Chrome:
|
class Chrome:
|
||||||
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
logger = logging.getLogger(__module__ + '.' + __qualname__)
|
||||||
|
@ -167,6 +167,7 @@ def brozzle_page(argv=None):
|
|||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
configure_logging(args)
|
configure_logging(args)
|
||||||
|
brozzler.chrome.check_version(args.chrome_exe)
|
||||||
|
|
||||||
behavior_parameters = {}
|
behavior_parameters = {}
|
||||||
if args.behavior_parameters:
|
if args.behavior_parameters:
|
||||||
@ -325,6 +326,7 @@ def brozzler_worker(argv=None):
|
|||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
configure_logging(args)
|
configure_logging(args)
|
||||||
|
brozzler.chrome.check_version(args.chrome_exe)
|
||||||
|
|
||||||
def dump_state(signum, frame):
|
def dump_state(signum, frame):
|
||||||
signal.signal(signal.SIGQUIT, signal.SIG_IGN)
|
signal.signal(signal.SIGQUIT, signal.SIG_IGN)
|
||||||
|
@ -268,6 +268,7 @@ def main(argv=None):
|
|||||||
arg_parser = _build_arg_parser(argv)
|
arg_parser = _build_arg_parser(argv)
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
brozzler.cli.configure_logging(args)
|
brozzler.cli.configure_logging(args)
|
||||||
|
brozzler.chrome.check_version(args.chrome_exe)
|
||||||
|
|
||||||
controller = BrozzlerEasyController(args)
|
controller = BrozzlerEasyController(args)
|
||||||
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
||||||
|
Loading…
x
Reference in New Issue
Block a user