Merge branch 'master' into qa

* master:
  avoid rethinkdb.errors.ReqlDriverError: Query size
This commit is contained in:
Noah Levitt 2017-12-15 15:55:54 -08:00
commit 07715d6f49
2 changed files with 16 additions and 3 deletions

View File

@@ -360,8 +360,21 @@ class RethinkDbFrontier:
pages[fresh_page.id] = fresh_page
counts['added'] += 1
result = self.rr.table('pages').insert(
pages.values(), conflict='replace').run()
# insert/replace in batches of 50 to try to avoid this error:
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
# there can be many pages and each one can be very large (many videos,
# in and out of scope links, etc)
l = list(pages.values())
for batch in (l[i:i+50] for i in range(0, len(l), 50)):
try:
self.logger.info(
'inserting/replacing batch of %s pages', len(batch))
result = self.rr.table('pages').insert(
batch, conflict='replace').run()
except Exception as e:
self.logger.error(
'problem inserting/replacing batch of %s pages',
len(batch), exc_info=True)
parent_page.outlinks = {}
for k in decisions:

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b12.dev270',
version='1.1b12.dev271',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',