mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
* master:
  - in warcprox 2.0b2, captures table field has been renamed to "record_length"
  - remove flickr behavior, flickr is better off with the default behavior for now
  - Update README.rst
  - add travis-ci badge
This commit is contained in:
commit
2eea50dcfb
@ -1,9 +1,11 @@
|
||||
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
|
||||
:target: https://travis-ci.org/internetarchive/brozzler
|
||||
|
||||
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
|
||||
:width: 7%
|
||||
|
||||
brozzler |logo|
|
||||
|logo| brozzler
|
||||
===============
|
||||
|
||||
"browser" \| "crawler" = "brozzler"
|
||||
|
||||
Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
|
||||
|
@ -1,33 +0,0 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/flickr.js - behavior for flickr.com
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
setInterval(function() { window.scrollBy(0,50); }, 100);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
f = a.iterateNext();
|
||||
f.click();
|
||||
}, 5000);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
setInterval(function() {
|
||||
f = a.iterateNext();
|
||||
f.click();
|
||||
}, 5000);
|
||||
}, 5000);
|
@ -25,10 +25,6 @@ behaviors:
|
||||
# parameter_username: jdoe@example.com
|
||||
# parameter_password: abcd1234
|
||||
request_idle_timeout_sec: 30
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
|
||||
behavior_js: flickr.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
|
||||
behavior_js: marquette_edu.js
|
||||
|
@ -65,7 +65,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
'url': record['url'],
|
||||
'status': str(record['response_code']),
|
||||
'digest': record['sha1base32'],
|
||||
'length': str(record['length']), # XXX is this the right length?
|
||||
'length': str(record['record_length']),
|
||||
'offset': str(record['offset']),
|
||||
'filename': record['filename'],
|
||||
}
|
||||
|
4
setup.py
4
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b8.dev127',
|
||||
version='1.1b8.dev129',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -71,7 +71,7 @@ setuptools.setup(
|
||||
],
|
||||
extras_require={
|
||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||
'easy': ['warcprox>=2.0b1', 'pywb', 'flask>=0.11', 'gunicorn'],
|
||||
'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'],
|
||||
},
|
||||
zip_safe=False,
|
||||
classifiers=[
|
||||
|
Loading…
x
Reference in New Issue
Block a user