diff --git a/README.md b/README.md deleted file mode 100644 index abf8415..0000000 --- a/README.md +++ /dev/null @@ -1,49 +0,0 @@ -brozzler -======== -"browser" | "crawler" = "brozzler" - -Brozzler is a distributed web crawler (爬虫) that uses a real browser (chrome -or chromium) to fetch pages and embedded urls and to extract links. It also -uses [youtube-dl](https://github.com/rg3/youtube-dl) to enhance media capture -capabilities. - -It is forked from https://github.com/internetarchive/umbra. - -Brozzler is designed to work in conjunction with -[warcprox](https://github.com/internetarchive/warcprox) for web archiving. - -Installation ------------- -``` -git clone https://github.com/nlevitt/brozzler.git -cd brozzler -# set up virtualenv if desired -pip install -r requirements.txt . -``` -Brozzler also requires a rethinkdb deployment. - -Fonts for good screenshots --------------------------- -On ubuntu 14.04 trusty I installed these packages: - -xfonts-base ttf-mscorefonts-installer fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala - -Haven't looked much at the resulting screenshots yet though. - -License -------- - -Copyright 2015 Internet Archive - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this software except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
- diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..ac58b56 --- /dev/null +++ b/README.rst @@ -0,0 +1,62 @@ +brozzler +======== + +"browser" \| "crawler" = "brozzler" + +Brozzler is a distributed web crawler (爬虫) that uses a real browser +(chrome or chromium) to fetch pages and embedded urls and to extract +links. It also uses `youtube-dl <https://github.com/rg3/youtube-dl>`__ +to enhance media capture capabilities. + +It is forked from https://github.com/internetarchive/umbra. + +Brozzler is designed to work in conjunction with +`warcprox <https://github.com/internetarchive/warcprox>`__ for web +archiving. + +Installation +------------ + +XXX These instructions don't work at the moment. Brozzler requires some +customized packages not easily installable in the outside world. I intend to +remedy the situation soon. + +:: + + # set up virtualenv if desired + pip install git+https://github.com/nlevitt/brozzler.git + +Brozzler also requires a rethinkdb deployment. + +Fonts for good screenshots +-------------------------- + +On ubuntu 14.04 trusty I installed these packages: + +xfonts-base ttf-mscorefonts-installer fonts-arphic-bkai00mp +fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp +fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica +fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core +ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala + +Haven't looked much at the resulting screenshots yet though. + +License +------- + +Copyright 2015 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); you may +not use this software except in compliance with the License. You may +obtain a copy of the License at + +:: + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + diff --git a/setup.py b/setup.py index 65ff08a..03737f6 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,3 @@ -# vim: set sw=4 et: - import setuptools import glob @@ -9,7 +7,7 @@ def full_version_bytes(): import subprocess, time try: commit_num_bytes = subprocess.check_output(['git', 'rev-list', '--count', 'HEAD']) - return VERSION_BYTES + b'.dev' + commit_num_bytes.strip() + return VERSION_BYTES + b'.' + commit_num_bytes.strip() except subprocess.CalledProcessError: return VERSION_BYTES @@ -24,10 +22,10 @@ setuptools.setup(name='brozzler', url='https://github.com/nlevitt/brozzler', author='Noah Levitt', author_email='nlevitt@archive.org', - long_description=open('README.md').read(), + long_description=open('README.rst').read(), license='Apache License 2.0', packages=['brozzler'], - package_data={'brozzler':['behaviors.d/*.js*', 'behaviors.yaml', 'version.txt']}, + package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'version.txt']}, scripts=glob.glob('bin/*'), install_requires=[ 'PyYAML',