mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-09 06:52:46 -04:00
Merge branch 'master' into qa
* master: in warcprox 2.0b2, captures table field has been renamed to "record_length"; remove flickr behavior, flickr is better off with the default behavior for now; Update README.rst; add travis-ci badge
This commit is contained in:
commit
2eea50dcfb
5 changed files with 7 additions and 42 deletions
|
@ -1,9 +1,11 @@
|
||||||
|
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
|
||||||
|
:target: https://travis-ci.org/internetarchive/brozzler
|
||||||
|
|
||||||
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
|
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b5/brozzler/webconsole/static/brozzler.svg
|
||||||
:width: 7%
|
:width: 7%
|
||||||
|
|
||||||
brozzler |logo|
|
|logo| brozzler
|
||||||
===============
|
===============
|
||||||
|
|
||||||
"browser" \| "crawler" = "brozzler"
|
"browser" \| "crawler" = "brozzler"
|
||||||
|
|
||||||
Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
|
Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
|
||||||
|
|
|
@ -1,33 +0,0 @@
|
||||||
/*
|
|
||||||
* brozzler/behaviors.d/flickr.js - behavior for flickr.com
|
|
||||||
*
|
|
||||||
* Copyright (C) 2014-2016 Internet Archive
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
setInterval(function() { window.scrollBy(0,50); }, 100);
|
|
||||||
|
|
||||||
setTimeout(function() {
|
|
||||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
|
||||||
f = a.iterateNext();
|
|
||||||
f.click();
|
|
||||||
}, 5000);
|
|
||||||
|
|
||||||
setTimeout(function() {
|
|
||||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
|
||||||
setInterval(function() {
|
|
||||||
f = a.iterateNext();
|
|
||||||
f.click();
|
|
||||||
}, 5000);
|
|
||||||
}, 5000);
|
|
|
@ -25,10 +25,6 @@ behaviors:
|
||||||
# parameter_username: jdoe@example.com
|
# parameter_username: jdoe@example.com
|
||||||
# parameter_password: abcd1234
|
# parameter_password: abcd1234
|
||||||
request_idle_timeout_sec: 30
|
request_idle_timeout_sec: 30
|
||||||
-
|
|
||||||
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
|
|
||||||
behavior_js: flickr.js
|
|
||||||
request_idle_timeout_sec: 10
|
|
||||||
-
|
-
|
||||||
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
|
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
|
||||||
behavior_js: marquette_edu.js
|
behavior_js: marquette_edu.js
|
||||||
|
|
|
@ -65,7 +65,7 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||||
'url': record['url'],
|
'url': record['url'],
|
||||||
'status': str(record['response_code']),
|
'status': str(record['response_code']),
|
||||||
'digest': record['sha1base32'],
|
'digest': record['sha1base32'],
|
||||||
'length': str(record['length']), # XXX is this the right length?
|
'length': str(record['record_length']),
|
||||||
'offset': str(record['offset']),
|
'offset': str(record['offset']),
|
||||||
'filename': record['filename'],
|
'filename': record['filename'],
|
||||||
}
|
}
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b8.dev127',
|
version='1.1b8.dev129',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -71,7 +71,7 @@ setuptools.setup(
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||||
'easy': ['warcprox>=2.0b1', 'pywb', 'flask>=0.11', 'gunicorn'],
|
'easy': ['warcprox>=2.0b2', 'pywb', 'flask>=0.11', 'gunicorn'],
|
||||||
},
|
},
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue