text-generation-webui/extensions/superbooga/chromadb.py

import chromadb
import posthog
import torch
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer

from modules.logging_colors import logger

logger.info('Intercepting all calls to posthog :)')
posthog.capture = lambda *args, **kwargs: None


class Collecter():
    def __init__(self):
        pass

    def add(self, texts: list[str]):
        pass

    def get(self, search_strings: list[str], n_results: int) -> list[str]:
        pass

    def clear(self):
        pass


class Embedder():
    def __init__(self):
        pass

    def embed(self, text: str) -> list[torch.Tensor]:
        pass


class ChromaCollector(Collecter):
    def __init__(self, embedder: Embedder):
        super().__init__()
        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
        self.embedder = embedder
        self.collection = self.chroma_client.create_collection(name="context", embedding_function=embedder.embed)
        self.ids = []

    def add(self, texts: list[str]):
        if len(texts) == 0:
            return

        self.ids = [f"id{i}" for i in range(len(texts))]
        self.collection.add(documents=texts, ids=self.ids)

    def get_documents_ids_distances(self, search_strings: list[str], n_results: int):
        n_results = min(len(self.ids), n_results)
        if n_results == 0:
            return [], []

        result = self.collection.query(query_texts=search_strings, n_results=n_results, include=['documents', 'distances'])
        documents = result['documents'][0]
        ids = list(map(lambda x: int(x[2:]), result['ids'][0]))
        distances = result['distances'][0]
        return documents, ids, distances

    # Get chunks by similarity
    def get(self, search_strings: list[str], n_results: int) -> list[str]:
        documents, _, _ = self.get_documents_ids_distances(search_strings, n_results)
        return documents

    # Get ids by similarity
    def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:
        _, ids, _ = self.get_documents_ids_distances(search_strings, n_results)
        return ids

    # Get chunks by similarity and then sort by insertion order
    def get_sorted(self, search_strings: list[str], n_results: int) -> list[str]:
        documents, ids, _ = self.get_documents_ids_distances(search_strings, n_results)
        return [x for _, x in sorted(zip(ids, documents))]

    # Multiply distance by factor within [0, time_weight] where more recent is lower
    def apply_time_weight_to_distances(self, ids: list[int], distances: list[float], time_weight: float = 1.0) -> list[float]:
        if len(self.ids) <= 1:
            return distances.copy()

        return [distance * (1 - _id / (len(self.ids) - 1) * time_weight) for _id, distance in zip(ids, distances)]

    # Get ids by similarity and then sort by insertion order
    def get_ids_sorted(self, search_strings: list[str], n_results: int, n_initial: int = None, time_weight: float = 1.0) -> list[str]:
        do_time_weight = time_weight > 0
        if not (do_time_weight and n_initial is not None):
            n_initial = n_results
        elif n_initial == -1:
            n_initial = len(self.ids)

        if n_initial < n_results:
            raise ValueError(f"n_initial {n_initial} should be >= n_results {n_results}")

        _, ids, distances = self.get_documents_ids_distances(search_strings, n_initial)
        if do_time_weight:
            distances_w = self.apply_time_weight_to_distances(ids, distances, time_weight=time_weight)
            results = zip(ids, distances, distances_w)
            results = sorted(results, key=lambda x: x[2])[:n_results]
            results = sorted(results, key=lambda x: x[0])
            ids = [x[0] for x in results]

        return sorted(ids)

    def clear(self):
        self.collection.delete(ids=self.ids)
        self.ids = []


class SentenceTransformerEmbedder(Embedder):
    def __init__(self) -> None:
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        self.embed = self.model.encode


def make_collector():
    global embedder
    return ChromaCollector(embedder)


def add_chunks_to_collector(chunks, collector):
    collector.clear()
    collector.add(chunks)


embedder = SentenceTransformerEmbedder()
Prevent unwanted log messages from modules 2023-05-21 21:42:34 -04:00			`import chromadb`
Refactor superbooga 2023-05-13 13:14:59 -04:00			`import posthog`
			`import torch`
Prevent unwanted log messages from modules 2023-05-21 21:42:34 -04:00			`from chromadb.config import Settings`
Refactor superbooga 2023-05-13 13:14:59 -04:00			`from sentence_transformers import SentenceTransformer`

Prevent unwanted log messages from modules 2023-05-21 21:42:34 -04:00			`from modules.logging_colors import logger`
Refactor superbooga 2023-05-13 13:14:59 -04:00
Prevent unwanted log messages from modules 2023-05-21 21:42:34 -04:00			`logger.info('Intercepting all calls to posthog :)')`
Refactor superbooga 2023-05-13 13:14:59 -04:00			`posthog.capture = lambda args, *kwargs: None`


			`class Collecter():`
			`def __init__(self):`
			`pass`

			`def add(self, texts: list[str]):`
			`pass`

			`def get(self, search_strings: list[str], n_results: int) -> list[str]:`
			`pass`

			`def clear(self):`
			`pass`


			`class Embedder():`
			`def __init__(self):`
			`pass`

			`def embed(self, text: str) -> list[torch.Tensor]:`
			`pass`


			`class ChromaCollector(Collecter):`
			`def __init__(self, embedder: Embedder):`
			`super().__init__()`
			`self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))`
			`self.embedder = embedder`
			`self.collection = self.chroma_client.create_collection(name="context", embedding_function=embedder.embed)`
			`self.ids = []`

			`def add(self, texts: list[str]):`
Fix superbooga error when querying empty DB (Issue #2160) (#2212) 2023-05-20 21:27:22 -04:00			`if len(texts) == 0:`
			`return`

Refactor superbooga 2023-05-13 13:14:59 -04:00			`self.ids = [f"id{i}" for i in range(len(texts))]`
			`self.collection.add(documents=texts, ids=self.ids)`

Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`def get_documents_ids_distances(self, search_strings: list[str], n_results: int):`
Refactor superbooga 2023-05-13 13:14:59 -04:00			`n_results = min(len(self.ids), n_results)`
Fix superbooga error when querying empty DB (Issue #2160) (#2212) 2023-05-20 21:27:22 -04:00			`if n_results == 0:`
			`return [], []`

Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`result = self.collection.query(query_texts=search_strings, n_results=n_results, include=['documents', 'distances'])`
Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`documents = result['documents'][0]`
			`ids = list(map(lambda x: int(x[2:]), result['ids'][0]))`
Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`distances = result['distances'][0]`
			`return documents, ids, distances`
Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00
			`# Get chunks by similarity`
			`def get(self, search_strings: list[str], n_results: int) -> list[str]:`
Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`documents, _, _ = self.get_documents_ids_distances(search_strings, n_results)`
Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`return documents`
Refactor superbooga 2023-05-13 13:14:59 -04:00
Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`# Get ids by similarity`
Refactor superbooga 2023-05-13 13:14:59 -04:00			`def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:`
Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`_, ids, _ = self.get_documents_ids_distances(search_strings, n_results)`
Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`return ids`

			`# Get chunks by similarity and then sort by insertion order`
			`def get_sorted(self, search_strings: list[str], n_results: int) -> list[str]:`
Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`documents, ids, _ = self.get_documents_ids_distances(search_strings, n_results)`
Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`return [x for _, x in sorted(zip(ids, documents))]`

Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`# Multiply distance by factor within [0, time_weight] where more recent is lower`
			`def apply_time_weight_to_distances(self, ids: list[int], distances: list[float], time_weight: float = 1.0) -> list[float]:`
			`if len(self.ids) <= 1:`
			`return distances.copy()`

			`return [distance * (1 - _id / (len(self.ids) - 1) * time_weight) for _id, distance in zip(ids, distances)]`

Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`# Get ids by similarity and then sort by insertion order`
Add superbooga time weighted history retrieval (#2080) 2023-05-25 09:22:45 -04:00			`def get_ids_sorted(self, search_strings: list[str], n_results: int, n_initial: int = None, time_weight: float = 1.0) -> list[str]:`
			`do_time_weight = time_weight > 0`
			`if not (do_time_weight and n_initial is not None):`
			`n_initial = n_results`
			`elif n_initial == -1:`
			`n_initial = len(self.ids)`

			`if n_initial < n_results:`
			`raise ValueError(f"n_initial {n_initial} should be >= n_results {n_results}")`

			`_, ids, distances = self.get_documents_ids_distances(search_strings, n_initial)`
			`if do_time_weight:`
			`distances_w = self.apply_time_weight_to_distances(ids, distances, time_weight=time_weight)`
			`results = zip(ids, distances, distances_w)`
			`results = sorted(results, key=lambda x: x[2])[:n_results]`
			`results = sorted(results, key=lambda x: x[0])`
			`ids = [x[0] for x in results]`

Sort selected superbooga chunks by insertion order For better coherence 2023-05-14 21:19:29 -04:00			`return sorted(ids)`
Refactor superbooga 2023-05-13 13:14:59 -04:00
			`def clear(self):`
			`self.collection.delete(ids=self.ids)`
Fix superbooga error when querying empty DB (Issue #2160) (#2212) 2023-05-20 21:27:22 -04:00			`self.ids = []`
Refactor superbooga 2023-05-13 13:14:59 -04:00

			`class SentenceTransformerEmbedder(Embedder):`
			`def __init__(self) -> None:`
			`self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")`
			`self.embed = self.model.encode`


			`def make_collector():`
			`global embedder`
			`return ChromaCollector(embedder)`


			`def add_chunks_to_collector(chunks, collector):`
			`collector.clear()`
			`collector.add(chunks)`


			`embedder = SentenceTransformerEmbedder()`