From 35c5fa482f69c188a53d39743b62f3c3c5ae01c6 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Fri, 15 Nov 2019 13:20:30 +0000 Subject: [PATCH 1/8] Enable running in docker / k8s When trying to run Brozzler in docker, we get the following error: ``` Failed to move to new namespace: PID namespaces supported, Network namespace supported, but failed: errno = Operation not permitted Trace/breakpoint trap ``` This happens because chromium uses sandboxing for increased security by default and it's not supported when running in a container. Adding chromium option `--no-sandbox` fixes the problem. This issue is common; I found various reports about it like this: https://github.com/Zenika/alpine-chrome/issues/33 --- brozzler/chrome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index cbca3e5..d049d3a 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -170,7 +170,7 @@ class Chrome: '--disable-background-networking', '--disable-renderer-backgrounding', '--disable-hang-monitor', '--disable-background-timer-throttling', '--mute-audio', - '--disable-web-sockets', + '--disable-web-sockets', '--no-sandbox', '--window-size=1100,900', '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', '--homepage=about:blank', '--disable-direct-npapi-requests', From ff523b3bba5b5d87927f4f6e256e3a13d43d44a4 Mon Sep 17 00:00:00 2001 From: Corentin Barreau Date: Mon, 25 Nov 2019 17:48:33 +0100 Subject: [PATCH 2/8] Fix: facebook ads variant selector --- brozzler/behaviors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 7109107..85b4111 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -25,7 +25,7 @@ interval: 1000 actions: - selector: a[data-testid="snapshot_footer_link"] - childSelector: i[class="_271o img sp_-vbjDsgypf1 sx_1de63f"] + childSelector: i[class="_271o img sp_KBE8sh--02o sx_5d0205"] closeSelector: 
'div._7lq1 > button' - url_regex: '^https?://(?:www\.)?facebook\.com/.*$' From 62cb051f93cac66712082073cb940f915888e412 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 25 Nov 2019 20:44:25 +0000 Subject: [PATCH 3/8] Pass extra CLI params to chrome using ENV variable If ENV var `BROZZLER_EXTRA_CHROME_ARGS` is set, pass its contents as extra chromium cli options. Remove `--no-sandbox` option. It's not good from a security point of view. --- brozzler/chrome.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index d049d3a..4eb8c57 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -170,13 +170,16 @@ class Chrome: '--disable-background-networking', '--disable-renderer-backgrounding', '--disable-hang-monitor', '--disable-background-timer-throttling', '--mute-audio', - '--disable-web-sockets', '--no-sandbox', + '--disable-web-sockets', '--window-size=1100,900', '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', '--homepage=about:blank', '--disable-direct-npapi-requests', '--disable-web-security', '--disable-notifications', '--disable-extensions', '--disable-save-password-bubble'] + extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS') + if extra_chrome_args: + chrome_args.append(extra_chrome_args) if disk_cache_dir: chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir) if disk_cache_size: From 64da843dc8753b5344e74cb90eaaea140de38976 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 25 Nov 2019 16:04:13 -0800 Subject: [PATCH 4/8] fix travis badge --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 9f9c28a..c471ae1 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -.. image:: https://travis-ci.org/internetarchive/brozzler.svg +.. image:: https://api.travis-ci.org/internetarchive/brozzler.svg?branch=master :target: https://travis-ci.org/internetarchive/brozzler .. 
|logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg From 3bc2f434ef677056795db373753c3090eb0892f4 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Wed, 27 Nov 2019 20:18:41 +0000 Subject: [PATCH 5/8] Split extra chrome args on whitespace This is in case multiple args are used. --- brozzler/chrome.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 4eb8c57..671000b 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -179,7 +179,7 @@ class Chrome: extra_chrome_args = os.environ.get('BROZZLER_EXTRA_CHROME_ARGS') if extra_chrome_args: - chrome_args.append(extra_chrome_args) + chrome_args.extend(extra_chrome_args.split()) if disk_cache_dir: chrome_args.append('--disk-cache-dir=%s' % disk_cache_dir) if disk_cache_size: From 5aeaf47b6b8fc1241706fc41c2933cd572486dbc Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 27 Nov 2019 12:41:16 -0800 Subject: [PATCH 6/8] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d5442a6..f38425b 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.15', + version='1.5.16', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 597f2b5b3373839217efc06615c2f59c4de4f884 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 4 Dec 2019 15:11:53 -0800 Subject: [PATCH 7/8] reveal bad value when job conf validation fails --- brozzler/model.py | 13 ++++++++++--- setup.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/brozzler/model.py b/brozzler/model.py index 77dae70..f65fe50 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -43,13 +43,20 @@ class JobValidator(cerberus.Validator): return url.scheme in ('http', 'https', 'ftp') class 
InvalidJobConf(Exception): - def __init__(self, errors): - self.errors = errors + def __init__(self, validator): + self.errors = validator.errors + try: + # Cerberus does a nice job hiding the bad value. In the case I + # debugged, I found it here. Maybe there's a better way to see it. + value = validator._errors[0].info[0][0].info[0][0].value + self.errors['bad value'] = value + except: + value = None def validate_conf(job_conf, schema=load_schema()): v = JobValidator(schema) if not v.validate(job_conf, normalize=False): - raise InvalidJobConf(v.errors) + raise InvalidJobConf(v) def merge(a, b): if isinstance(a, dict) and isinstance(b, dict): diff --git a/setup.py b/setup.py index f38425b..ef6391d 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.16', + version='1.5.17', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From ca3550af13c781997269d76e7082d392d918bafb Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Mon, 9 Dec 2019 17:50:07 -0800 Subject: [PATCH 8/8] instagram interval 2000ms --- brozzler/behaviors.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 85b4111..867109e 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -39,7 +39,7 @@ url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js_template: umbraBehavior.js.j2 default_parameters: - interval: 500 + interval: 2000 actions: - selector: .glyphsSpriteGrey_Close rmSelector: '.RnEpo'