mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
avoid rethinkdb.errors.ReqlDriverError: Query size greater than maximum
This commit is contained in:
parent
a6e5700c18
commit
df6615cc2c
2 changed files with 16 additions and 3 deletions
|
@@ -360,8 +360,21 @@ class RethinkDbFrontier:
|
||||||
pages[fresh_page.id] = fresh_page
|
pages[fresh_page.id] = fresh_page
|
||||||
counts['added'] += 1
|
counts['added'] += 1
|
||||||
|
|
||||||
result = self.rr.table('pages').insert(
|
# insert/replace in batches of 50 to try to avoid this error:
|
||||||
pages.values(), conflict='replace').run()
|
# "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:"
|
||||||
|
# there can be many pages and each one can be very large (many videos,
|
||||||
|
# in and out of scope links, etc)
|
||||||
|
l = list(pages.values())
|
||||||
|
for batch in (l[i:i+50] for i in range(0, len(l), 50)):
|
||||||
|
try:
|
||||||
|
self.logger.info(
|
||||||
|
'inserting/replacing batch of %s pages', len(batch))
|
||||||
|
result = self.rr.table('pages').insert(
|
||||||
|
batch, conflict='replace').run()
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(
|
||||||
|
'problem inserting/replacing batch of %s pages',
|
||||||
|
len(batch), exc_info=True)
|
||||||
|
|
||||||
parent_page.outlinks = {}
|
parent_page.outlinks = {}
|
||||||
for k in decisions:
|
for k in decisions:
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b12.dev270',
|
version='1.1b12.dev271',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue