mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-05-02 22:55:04 -04:00
remove "dev" from version number and switch README to rst
This commit is contained in:
parent
245078284d
commit
9699a40645
3 changed files with 65 additions and 54 deletions
49
README.md
49
README.md
|
@ -1,49 +0,0 @@
|
||||||
brozzler
|
|
||||||
========
|
|
||||||
"browser" | "crawler" = "brozzler"
|
|
||||||
|
|
||||||
Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome
|
|
||||||
or chromium) to fetch pages and embedded urls and to extract links. It also
|
|
||||||
uses [youtube-dl](https://github.com/rg3/youtube-dl) to enhance media capture
|
|
||||||
capabilities.
|
|
||||||
|
|
||||||
It is forked from https://github.com/internetarchive/umbra.
|
|
||||||
|
|
||||||
Brozzler is designed to work in conjunction with
|
|
||||||
[warcprox](https://github.com/internetarchive/warcprox) for web archiving.
|
|
||||||
|
|
||||||
Installation
|
|
||||||
------------
|
|
||||||
```
|
|
||||||
git clone https://github.com/nlevitt/brozzler.git
|
|
||||||
cd brozzler
|
|
||||||
# set up virtualenv if desired
|
|
||||||
pip install -r requirements.txt .
|
|
||||||
```
|
|
||||||
Brozzler also requires a rethinkdb deployment.
|
|
||||||
|
|
||||||
Fonts for good screenshots
|
|
||||||
--------------------------
|
|
||||||
On ubuntu 14.04 trusty I installed these packages:
|
|
||||||
|
|
||||||
xfonts-base ttf-mscorefonts-installer fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
|
|
||||||
|
|
||||||
Haven't looked much at the resulting screenshots yet though.
|
|
||||||
|
|
||||||
License
|
|
||||||
-------
|
|
||||||
|
|
||||||
Copyright 2015 Internet Archive
|
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
you may not use this software except in compliance with the License.
|
|
||||||
You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
|
|
62
README.rst
Normal file
62
README.rst
Normal file
|
@ -0,0 +1,62 @@
|
||||||
|
brozzler
|
||||||
|
========
|
||||||
|
|
||||||
|
"browser" \| "crawler" = "brozzler"
|
||||||
|
|
||||||
|
Brozzler is a distributed web crawler (爬虫) that uses a real browser
|
||||||
|
(Chrome or Chromium) to fetch pages and embedded URLs and to extract
|
||||||
|
links. It also uses `youtube-dl <https://github.com/rg3/youtube-dl>`__
|
||||||
|
to enhance media capture capabilities.
|
||||||
|
|
||||||
|
It is forked from https://github.com/internetarchive/umbra.
|
||||||
|
|
||||||
|
Brozzler is designed to work in conjunction with
|
||||||
|
`warcprox <https://github.com/internetarchive/warcprox>`__ for web
|
||||||
|
archiving.
|
||||||
|
|
||||||
|
Installation
|
||||||
|
------------
|
||||||
|
|
||||||
|
XXX These instructions don't work at the moment. Brozzler requires some
|
||||||
|
customized packages not easily installable in the outside world. I intend to
|
||||||
|
remedy the situation soon.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
# set up virtualenv if desired
|
||||||
|
pip install git+https://github.com/nlevitt/brozzler.git
|
||||||
|
|
||||||
|
Brozzler also requires a rethinkdb deployment.
|
||||||
|
|
||||||
|
Fonts for good screenshots
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
On Ubuntu 14.04 (Trusty), I installed these packages:
|
||||||
|
|
||||||
|
xfonts-base ttf-mscorefonts-installer fonts-arphic-bkai00mp
|
||||||
|
fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp
|
||||||
|
fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica
|
||||||
|
fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core
|
||||||
|
ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
|
||||||
|
|
||||||
|
I haven't looked closely at the resulting screenshots yet, though.
|
||||||
|
|
||||||
|
License
|
||||||
|
-------
|
||||||
|
|
||||||
|
Copyright 2015 Internet Archive
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||||
|
not use this software except in compliance with the License. You may
|
||||||
|
obtain a copy of the License at
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
|
|
8
setup.py
8
setup.py
|
@ -1,5 +1,3 @@
|
||||||
# vim: set sw=4 et:
|
|
||||||
|
|
||||||
import setuptools
|
import setuptools
|
||||||
import glob
|
import glob
|
||||||
|
|
||||||
|
@ -9,7 +7,7 @@ def full_version_bytes():
|
||||||
import subprocess, time
|
import subprocess, time
|
||||||
try:
|
try:
|
||||||
commit_num_bytes = subprocess.check_output(['git', 'rev-list', '--count', 'HEAD'])
|
commit_num_bytes = subprocess.check_output(['git', 'rev-list', '--count', 'HEAD'])
|
||||||
return VERSION_BYTES + b'.dev' + commit_num_bytes.strip()
|
return VERSION_BYTES + b'.' + commit_num_bytes.strip()
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
return VERSION_BYTES
|
return VERSION_BYTES
|
||||||
|
|
||||||
|
@ -24,10 +22,10 @@ setuptools.setup(name='brozzler',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
author_email='nlevitt@archive.org',
|
author_email='nlevitt@archive.org',
|
||||||
long_description=open('README.md').read(),
|
long_description=open('README.rst').read(),
|
||||||
license='Apache License 2.0',
|
license='Apache License 2.0',
|
||||||
packages=['brozzler'],
|
packages=['brozzler'],
|
||||||
package_data={'brozzler':['behaviors.d/*.js*', 'behaviors.yaml', 'version.txt']},
|
package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'version.txt']},
|
||||||
scripts=glob.glob('bin/*'),
|
scripts=glob.glob('bin/*'),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'PyYAML',
|
'PyYAML',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue