mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 08:09:48 -05:00
fix bug clobbering own changes to parent_page
and some other tweaks (python 3.5+, pytest logging config, ...)
This commit is contained in:
parent
ba85917f70
commit
e23fa68d65
@ -1,7 +1,6 @@
|
|||||||
dist: xenial
|
dist: xenial
|
||||||
language: python
|
language: python
|
||||||
python:
|
python:
|
||||||
- 3.4
|
|
||||||
- 3.5
|
- 3.5
|
||||||
- 3.6
|
- 3.6
|
||||||
- 3.7
|
- 3.7
|
||||||
@ -24,6 +23,8 @@ script:
|
|||||||
- DISPLAY=:1 py.test --tb=native -v tests
|
- DISPLAY=:1 py.test --tb=native -v tests
|
||||||
after_failure:
|
after_failure:
|
||||||
- chromium-browser --version
|
- chromium-browser --version
|
||||||
|
- sudo kill -QUIT $(sudo svstat /etc/service/warcprox | egrep -o 'pid [0-9]+' | awk '{print $2}')
|
||||||
|
- sudo kill -QUIT $(sudo svstat /etc/service/brozzler-worker | egrep -o 'pid [0-9]+' | awk '{print $2}')
|
||||||
- sudo cat /var/log/warcprox.log
|
- sudo cat /var/log/warcprox.log
|
||||||
- sudo cat /var/log/brozzler-worker.log
|
- sudo cat /var/log/brozzler-worker.log
|
||||||
- sudo cat /var/log/pywb.log
|
- sudo cat /var/log/pywb.log
|
||||||
|
@ -19,7 +19,7 @@ Brozzler is designed to work in conjunction with warcprox for web archiving.
|
|||||||
Requirements
|
Requirements
|
||||||
------------
|
------------
|
||||||
|
|
||||||
- Python 3.4 or later
|
- Python 3.5 or later
|
||||||
- RethinkDB deployment
|
- RethinkDB deployment
|
||||||
- Chromium or Google Chrome >= version 64
|
- Chromium or Google Chrome >= version 64
|
||||||
|
|
||||||
|
@ -314,7 +314,7 @@ class RethinkDbFrontier:
|
|||||||
'''
|
'''
|
||||||
existing_page.priority += fresh_page.priority
|
existing_page.priority += fresh_page.priority
|
||||||
existing_page.hashtags = list(set(
|
existing_page.hashtags = list(set(
|
||||||
existing_page.hashtags + fresh_page.hashtags))
|
(existing_page.hashtags or []) + (fresh_page.hashtags or [])))
|
||||||
existing_page.hops_off = min(
|
existing_page.hops_off = min(
|
||||||
existing_page.hops_off, fresh_page.hops_off)
|
existing_page.hops_off, fresh_page.hops_off)
|
||||||
|
|
||||||
@ -375,14 +375,18 @@ class RethinkDbFrontier:
|
|||||||
decisions['accepted'].add(fresh_page.url)
|
decisions['accepted'].add(fresh_page.url)
|
||||||
if fresh_page.id in pages:
|
if fresh_page.id in pages:
|
||||||
page = pages[fresh_page.id]
|
page = pages[fresh_page.id]
|
||||||
page.hashtags = list(set((page.hashtags or [])
|
self._merge_page(page, fresh_page)
|
||||||
+ fresh_page.hashtags))
|
|
||||||
page.priority += fresh_page.priority
|
|
||||||
counts['updated'] += 1
|
counts['updated'] += 1
|
||||||
else:
|
else:
|
||||||
pages[fresh_page.id] = fresh_page
|
pages[fresh_page.id] = fresh_page
|
||||||
counts['added'] += 1
|
counts['added'] += 1
|
||||||
|
|
||||||
|
# make sure we're not stepping on our own toes in case we have a link
|
||||||
|
# back to parent_page, which I think happens because of hashtags
|
||||||
|
if parent_page.id in pages:
|
||||||
|
self._merge_page(parent_page, pages[parent_page.id])
|
||||||
|
del pages[parent_page.id]
|
||||||
|
|
||||||
# insert/replace in batches of 50 to try to avoid this error:
|
# insert/replace in batches of 50 to try to avoid this error:
|
||||||
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
|
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
|
||||||
# there can be many pages and each one can be very large (many videos,
|
# there can be many pages and each one can be very large (many videos,
|
||||||
@ -392,8 +396,11 @@ class RethinkDbFrontier:
|
|||||||
try:
|
try:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
'inserting/replacing batch of %s pages', len(batch))
|
'inserting/replacing batch of %s pages', len(batch))
|
||||||
result = self.rr.table('pages').insert(
|
reql = self.rr.table('pages').insert(batch, conflict='replace')
|
||||||
batch, conflict='replace').run()
|
self.logger.trace(
|
||||||
|
'running query self.rr.table("pages").insert(%r, '
|
||||||
|
'conflict="replace")', batch)
|
||||||
|
result = reql.run()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
'problem inserting/replacing batch of %s pages',
|
'problem inserting/replacing batch of %s pages',
|
||||||
@ -450,12 +457,15 @@ class RethinkDbFrontier:
|
|||||||
Returns:
|
Returns:
|
||||||
iterator of brozzler.Page
|
iterator of brozzler.Page
|
||||||
'''
|
'''
|
||||||
results = self.rr.table("pages").between(
|
query = self.rr.table("pages").between(
|
||||||
[site_id, 1 if brozzled is True else 0,
|
[site_id, 1 if brozzled is True else 0,
|
||||||
r.minval, r.minval],
|
r.minval, r.minval],
|
||||||
[site_id, 0 if brozzled is False else r.maxval,
|
[site_id, 0 if brozzled is False else r.maxval,
|
||||||
r.maxval, r.maxval],
|
r.maxval, r.maxval],
|
||||||
index="priority_by_site").run()
|
index="priority_by_site")
|
||||||
|
self.logger.trace("running query: %r", query)
|
||||||
|
results = query.run()
|
||||||
for result in results:
|
for result in results:
|
||||||
|
self.logger.trace("yielding result: %r", result)
|
||||||
yield brozzler.Page(self.rr, result)
|
yield brozzler.Page(self.rr, result)
|
||||||
|
|
||||||
|
6
pytest.ini
Normal file
6
pytest.ini
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
# https://docs.pytest.org/en/latest/logging.html
|
||||||
|
# https://github.com/pytest-dev/pytest/issues/5296
|
||||||
|
[pytest]
|
||||||
|
log_format = %(asctime)s.%(msecs)03d %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s
|
||||||
|
log_date_format = %Y-%m-%d %H:%M:%S
|
||||||
|
|
1
setup.py
1
setup.py
@ -95,7 +95,6 @@ setuptools.setup(
|
|||||||
'Development Status :: 5 - Production/Stable',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
'Programming Language :: Python :: 3.4',
|
|
||||||
'Programming Language :: Python :: 3.5',
|
'Programming Language :: Python :: 3.5',
|
||||||
'Programming Language :: Python :: 3.6',
|
'Programming Language :: Python :: 3.6',
|
||||||
'Programming Language :: Python :: 3.7',
|
'Programming Language :: Python :: 3.7',
|
||||||
|
@ -32,6 +32,7 @@ import requests
|
|||||||
import subprocess
|
import subprocess
|
||||||
import http.server
|
import http.server
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
import warcprox
|
import warcprox
|
||||||
|
|
||||||
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
|
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
|
||||||
|
@ -733,7 +733,7 @@ def test_hashtag_seed():
|
|||||||
assert pages[0].hashtags == ['#hash',]
|
assert pages[0].hashtags == ['#hash',]
|
||||||
|
|
||||||
def test_hashtag_links():
|
def test_hashtag_links():
|
||||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
rr = doublethink.Rethinker('localhost', db='test_hashtag_links')
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
|
|
||||||
site = brozzler.Site(rr, {'seed': 'http://example.org/'})
|
site = brozzler.Site(rr, {'seed': 'http://example.org/'})
|
||||||
|
Loading…
x
Reference in New Issue
Block a user