Merge branch 'master' into qa

* master:
  in warcprox 2.0b2, captures table field has been renamed to "record_length"
  remove flickr behavior, flickr is better off with the default behavior for now
  Update README.rst
  add travis-ci badge
This commit is contained in:
Noah Levitt 2016-11-21 16:21:30 -08:00
commit 2eea50dcfb
5 changed files with 7 additions and 42 deletions

View File

@ -1,9 +1,11 @@
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
:target: https://travis-ci.org/internetarchive/brozzler
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
:width: 7%
brozzler |logo|
|logo| brozzler
===============
"browser" \| "crawler" = "brozzler"
Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome

View File

@ -1,33 +0,0 @@
/*
* brozzler/behaviors.d/flickr.js - behavior for flickr.com
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Scroll down a little every 100ms so that content further down the page
// gets requested.
setInterval(function() { window.scrollBy(0,50); }, 100);

// After 5 seconds, click the slideshow link if the page has one.
setTimeout(function() {
    // A snapshot result stays usable even if the document has been mutated
    // since the query ran (an iterator result would throw in that case).
    var slideshowLinks = document.evaluate(
            "//a[contains(@class, 'sn-ico-slideshow')]", document, null,
            XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    if (slideshowLinks.snapshotLength > 0) {
        slideshowLinks.snapshotItem(0).click();
    }
}, 5000);

// After 5 seconds, collect the photo links present on the page, then click
// one of them every 5 seconds until all of them have been clicked.
setTimeout(function() {
    var photoLinks = document.evaluate(
            "//a[contains(@data-track, 'photo-click')]", document, null,
            XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    var nextIndex = 0;
    var clickTimer = setInterval(function() {
        if (nextIndex >= photoLinks.snapshotLength) {
            // Nothing left to click; stop instead of failing on every tick.
            clearInterval(clickTimer);
            return;
        }
        photoLinks.snapshotItem(nextIndex).click();
        nextIndex += 1;
    }, 5000);
}, 5000);

View File

@ -25,10 +25,6 @@ behaviors:
# parameter_username: jdoe@example.com
# parameter_password: abcd1234
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
behavior_js: flickr.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
behavior_js: marquette_edu.js

View File

@ -65,7 +65,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
'url': record['url'],
'status': str(record['response_code']),
'digest': record['sha1base32'],
'length': str(record['length']), # XXX is this the right length?
'length': str(record['record_length']),
'offset': str(record['offset']),
'filename': record['filename'],
}

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b8.dev127',
version='1.1b8.dev129',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -71,7 +71,7 @@ setuptools.setup(
],
extras_require={
'dashboard': ['flask>=0.11', 'gunicorn'],
'easy': ['warcprox>=2.0b1', 'pywb', 'flask>=0.11', 'gunicorn'],
'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'],
},
zip_safe=False,
classifiers=[