diff --git a/README.rst b/README.rst index 5835ff5..2329f97 100644 --- a/README.rst +++ b/README.rst @@ -1,9 +1,11 @@ +.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master + :target: https://travis-ci.org/internetarchive/brozzler + .. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg :width: 7% -brozzler |logo| +|logo| brozzler =============== - "browser" \| "crawler" = "brozzler" Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome diff --git a/brozzler/behaviors.d/flickr.js b/brozzler/behaviors.d/flickr.js deleted file mode 100644 index 6e5077a..0000000 --- a/brozzler/behaviors.d/flickr.js +++ /dev/null @@ -1,33 +0,0 @@ -/* - * brozzler/behaviors.d/flickr.js - behavior for flickr.com - * - * Copyright (C) 2014-2016 Internet Archive - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -setInterval(function() { window.scrollBy(0,50); }, 100); - -setTimeout(function() { - a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); - f = a.iterateNext(); - f.click(); -}, 5000); - -setTimeout(function() { - a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); - setInterval(function() { - f = a.iterateNext(); - f.click(); - }, 5000); -}, 5000); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 73566cc..0da72cc 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -25,10 +25,6 @@ behaviors: # parameter_username: jdoe@example.com # parameter_password: abcd1234 request_idle_timeout_sec: 30 - - - url_regex: '^https?://(?:www\.)?flickr\.com/.*$' - behavior_js: flickr.js - request_idle_timeout_sec: 10 - url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' behavior_js: marquette_edu.js diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 8fe8503..9b0a0f7 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -65,7 +65,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): 'url': record['url'], 'status': str(record['response_code']), 'digest': record['sha1base32'], - 'length': str(record['length']), # XXX is this the right length? + 'length': str(record['record_length']), 'offset': str(record['offset']), 'filename': record['filename'], } diff --git a/setup.py b/setup.py index dc70a9c..a613be4 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b8.dev127', + version='1.1b8.dev129', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -71,7 +71,7 @@ setuptools.setup( ], extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], - 'easy': ['warcprox>=2.0b1', 'pywb', 'flask>=0.11', 'gunicorn'], + 'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'], }, zip_safe=False, classifiers=[