Factor out common code for search insert

We can reuse the same code that is used for event insert to do the
background index population.
This commit is contained in:
Richard van der Hoff 2018-02-03 23:07:13 +00:00
parent c46e75d3d8
commit 3c7b480ba3

View File

@ -13,19 +13,25 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from collections import namedtuple
import logging
import re
import ujson as json
from twisted.internet import defer from twisted.internet import defer
from .background_updates import BackgroundUpdateStore from .background_updates import BackgroundUpdateStore
from synapse.api.errors import SynapseError from synapse.api.errors import SynapseError
from synapse.storage.engines import PostgresEngine, Sqlite3Engine from synapse.storage.engines import PostgresEngine, Sqlite3Engine
import logging
import re
import ujson as json
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# One entry to be added to the event_search table by
# store_search_entries_txn. `key` and `value` are the indexed field name
# (e.g. "content.name") and its text; `event_id` and `room_id` identify the
# event. `stream_ordering` and `origin_server_ts` are only written by the
# Postgres insert; the Sqlite insert stores the first four columns only.
SearchEntry = namedtuple('SearchEntry', [
'key', 'value', 'event_id', 'room_id', 'stream_ordering',
'origin_server_ts',
])
class SearchStore(BackgroundUpdateStore): class SearchStore(BackgroundUpdateStore):
@ -49,16 +55,17 @@ class SearchStore(BackgroundUpdateStore):
@defer.inlineCallbacks @defer.inlineCallbacks
def _background_reindex_search(self, progress, batch_size): def _background_reindex_search(self, progress, batch_size):
# we work through the events table from highest stream id to lowest
target_min_stream_id = progress["target_min_stream_id_inclusive"] target_min_stream_id = progress["target_min_stream_id_inclusive"]
max_stream_id = progress["max_stream_id_exclusive"] max_stream_id = progress["max_stream_id_exclusive"]
rows_inserted = progress.get("rows_inserted", 0) rows_inserted = progress.get("rows_inserted", 0)
INSERT_CLUMP_SIZE = 1000
TYPES = ["m.room.name", "m.room.message", "m.room.topic"] TYPES = ["m.room.name", "m.room.message", "m.room.topic"]
def reindex_search_txn(txn): def reindex_search_txn(txn):
sql = ( sql = (
"SELECT stream_ordering, event_id, room_id, type, content FROM events" "SELECT stream_ordering, event_id, room_id, type, content, "
" origin_server_ts FROM events"
" WHERE ? <= stream_ordering AND stream_ordering < ?" " WHERE ? <= stream_ordering AND stream_ordering < ?"
" AND (%s)" " AND (%s)"
" ORDER BY stream_ordering DESC" " ORDER BY stream_ordering DESC"
@ -67,6 +74,10 @@ class SearchStore(BackgroundUpdateStore):
txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size)) txn.execute(sql, (target_min_stream_id, max_stream_id, batch_size))
# we could stream straight from the results into
# store_search_entries_txn with a generator function, but that
# would mean having two cursors open on the database at once.
# Instead we just build a list of results.
rows = self.cursor_to_dict(txn) rows = self.cursor_to_dict(txn)
if not rows: if not rows:
return 0 return 0
@ -79,6 +90,8 @@ class SearchStore(BackgroundUpdateStore):
event_id = row["event_id"] event_id = row["event_id"]
room_id = row["room_id"] room_id = row["room_id"]
etype = row["type"] etype = row["type"]
stream_ordering = row["stream_ordering"]
origin_server_ts = row["origin_server_ts"]
try: try:
content = json.loads(row["content"]) content = json.loads(row["content"])
except Exception: except Exception:
@ -93,6 +106,8 @@ class SearchStore(BackgroundUpdateStore):
elif etype == "m.room.name": elif etype == "m.room.name":
key = "content.name" key = "content.name"
value = content["name"] value = content["name"]
else:
raise Exception("unexpected event type %s" % etype)
except (KeyError, AttributeError): except (KeyError, AttributeError):
# If the event is missing a necessary field then # If the event is missing a necessary field then
# skip over it. # skip over it.
@ -103,25 +118,16 @@ class SearchStore(BackgroundUpdateStore):
# then skip over it # then skip over it
continue continue
event_search_rows.append((event_id, room_id, key, value)) event_search_rows.append(SearchEntry(
key=key,
value=value,
event_id=event_id,
room_id=room_id,
stream_ordering=stream_ordering,
origin_server_ts=origin_server_ts,
))
if isinstance(self.database_engine, PostgresEngine): self.store_search_entries_txn(txn, event_search_rows)
sql = (
"INSERT INTO event_search (event_id, room_id, key, vector)"
" VALUES (?,?,?,to_tsvector('english', ?))"
)
elif isinstance(self.database_engine, Sqlite3Engine):
sql = (
"INSERT INTO event_search (event_id, room_id, key, value)"
" VALUES (?,?,?,?)"
)
else:
# This should be unreachable.
raise Exception("Unrecognized database engine")
for index in range(0, len(event_search_rows), INSERT_CLUMP_SIZE):
clump = event_search_rows[index:index + INSERT_CLUMP_SIZE]
txn.executemany(sql, clump)
progress = { progress = {
"target_min_stream_id_inclusive": target_min_stream_id, "target_min_stream_id_inclusive": target_min_stream_id,
@ -251,26 +257,49 @@ class SearchStore(BackgroundUpdateStore):
key (str): key (str):
value (str): value (str):
""" """
self.store_search_entries_txn(
txn,
(SearchEntry(
key=key,
value=value,
event_id=event.event_id,
room_id=event.room_id,
stream_ordering=event.internal_metadata.stream_ordering,
origin_server_ts=event.origin_server_ts,
),),
)
def store_search_entries_txn(self, txn, entries):
"""Add entries to the search table
Args:
txn (cursor):
entries (iterable[SearchEntry]):
entries to be added to the table
"""
if isinstance(self.database_engine, PostgresEngine): if isinstance(self.database_engine, PostgresEngine):
sql = ( sql = (
"INSERT INTO event_search" "INSERT INTO event_search"
" (event_id, room_id, key, vector, stream_ordering, origin_server_ts)" " (event_id, room_id, key, vector, stream_ordering, origin_server_ts)"
" VALUES (?,?,?,to_tsvector('english', ?),?,?)" " VALUES (?,?,?,to_tsvector('english', ?),?,?)"
) )
txn.execute(
sql, args = ((
( entry.event_id, entry.room_id, entry.key, entry.value,
event.event_id, event.room_id, key, value, entry.stream_ordering, entry.origin_server_ts,
event.internal_metadata.stream_ordering, ) for entry in entries)
event.origin_server_ts,
) txn.executemany(sql, args)
)
elif isinstance(self.database_engine, Sqlite3Engine): elif isinstance(self.database_engine, Sqlite3Engine):
sql = ( sql = (
"INSERT INTO event_search (event_id, room_id, key, value)" "INSERT INTO event_search (event_id, room_id, key, value)"
" VALUES (?,?,?,?)" " VALUES (?,?,?,?)"
) )
txn.execute(sql, (event.event_id, event.room_id, key, value,)) args = ((
entry.event_id, entry.room_id, entry.key, entry.value,
) for entry in entries)
txn.executemany(sql, args)
else: else:
# This should be unreachable. # This should be unreachable.
raise Exception("Unrecognized database engine") raise Exception("Unrecognized database engine")