From df6615cc2cdc2be8042512407c54210ec12e73b8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 15 Dec 2017 15:55:10 -0800 Subject: [PATCH] avoid rethinkdb.errors.ReqlDriverError: Query size --- brozzler/frontier.py | 17 +++++++++++++++-- setup.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 4e42bd7..b3a2e6a 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -360,8 +360,21 @@ class RethinkDbFrontier: pages[fresh_page.id] = fresh_page counts['added'] += 1 - result = self.rr.table('pages').insert( - pages.values(), conflict='replace').run() + # insert/replace in batches of 50 to try to avoid this error: + # "rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:" + # there can be many pages and each one can be very large (many videos, + # in and out of scope links, etc) + l = list(pages.values()) + for batch in (l[i:i+50] for i in range(0, len(l), 50)): + try: + self.logger.info( + 'inserting/replacing batch of %s pages', len(batch)) + result = self.rr.table('pages').insert( + batch, conflict='replace').run() + except Exception as e: + self.logger.error( + 'problem inserting/replacing batch of %s pages', + len(batch), exc_info=True) parent_page.outlinks = {} for k in decisions: diff --git a/setup.py b/setup.py index 889a2d5..cd96cd1 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev270', + version='1.1b12.dev271', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',