mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
avoid rethinkdb.errors.ReqlDriverError: Query size greater than maximum, by inserting/replacing pages in batches of 50
This commit is contained in:
parent
a6e5700c18
commit
df6615cc2c
2 changed files with 16 additions and 3 deletions
|
@@ -360,8 +360,21 @@ class RethinkDbFrontier:
|
|||
pages[fresh_page.id] = fresh_page
|
||||
counts['added'] += 1
|
||||
|
||||
# insert/replace in batches of 50 to try to avoid this error:
|
||||
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
|
||||
# there can be many pages and each one can be very large (many videos,
|
||||
# in and out of scope links, etc)
|
||||
l = list(pages.values())
|
||||
for batch in (l[i:i+50] for i in range(0, len(l), 50)):
|
||||
try:
|
||||
self.logger.info(
|
||||
'inserting/replacing batch of %s pages', len(batch))
|
||||
result = self.rr.table('pages').insert(
|
||||
- pages.values(), conflict='replace').run()
|
||||
+ batch, conflict='replace').run()
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
'problem inserting/replacing batch of %s pages',
|
||||
len(batch), exc_info=True)
|
||||
|
||||
parent_page.outlinks = {}
|
||||
for k in decisions:
|
||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
|||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
- version='1.1b12.dev270',
|
||||
+ version='1.1b12.dev271',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue