text-generation-webui/extensions/superboogav2/chromadb.py

import math
import random
import threading

import chromadb
import numpy as np
import posthog
from chromadb.config import Settings
from chromadb.utils import embedding_functions

import extensions.superboogav2.parameters as parameters
from modules.logging_colors import logger
from modules.text_generation import decode, encode

# Intercept calls to posthog: replace its capture function with a no-op that
# accepts any positional/keyword arguments and returns None.
posthog.capture = lambda *args, **kwargs: None


# Embedding function created once at import time. ChromaCollector passes it to
# every collection it creates and also calls it directly to embed new texts.
embedder = embedding_functions.SentenceTransformerEmbeddingFunction("sentence-transformers/all-mpnet-base-v2")


class Info:
    """A retrieved chunk together with the data needed to merge it with its neighbours.

    Attributes are plain values set by the constructor:
    ``start_index`` (offset of ``text_with_context`` in the source text),
    ``text_with_context`` (the chunk text including its surrounding context),
    ``distance`` (distance to the query; lower means closer) and ``id``
    (the id of the chunk in the collection).
    """

    def __init__(self, start_index, text_with_context, distance, id):
        self.text_with_context = text_with_context
        self.start_index = start_index
        self.distance = distance
        self.id = id

    def calculate_distance(self, other_info):
        """Combine this chunk's distance with ``other_info``'s.

        The combination rule is chosen by ``parameters.get_new_dist_strategy()``:
        minimum, harmonic mean, geometric mean or arithmetic mean. Any other
        value falls back to the minimum.
        """
        # Read the strategy once; the branches below are checked in the same
        # order as before (min, harmonic, geometric, arithmetic).
        strategy = parameters.get_new_dist_strategy()
        d1, d2 = self.distance, other_info.distance

        if strategy == parameters.DIST_MIN_STRATEGY:
            return min(d1, d2)
        if strategy == parameters.DIST_HARMONIC_STRATEGY:
            total = d1 + d2
            if total == 0:
                # The harmonic mean is undefined here (e.g. two exact matches
                # at distance 0); fall back to the minimum instead of raising
                # ZeroDivisionError.
                return min(d1, d2)
            return 2 * (d1 * d2) / total
        if strategy == parameters.DIST_GEOMETRIC_STRATEGY:
            return (d1 * d2) ** 0.5
        if strategy == parameters.DIST_ARITHMETIC_STRATEGY:
            return (d1 + d2) / 2

        # Min is default
        return min(d1, d2)

    def merge_with(self, other_info):
        """Merge this chunk with ``other_info`` if their spans touch or overlap.

        Returns a new ``Info`` that starts at the smaller start index, carries
        the id of the chunk that starts first and the combined distance, or
        ``None`` when the two spans are disjoint.
        """
        if not self.should_merge(self.text_with_context, other_info.text_with_context,
                                 self.start_index, other_info.start_index):
            return None

        # Only compute the combined distance once we know a merge will happen.
        new_dist = self.calculate_distance(other_info)

        # "first" is the chunk that starts earlier (self wins ties).
        if self.start_index <= other_info.start_index:
            first, second = self, other_info
        else:
            first, second = other_info, self

        first_end = first.start_index + len(first.text_with_context)
        second_end = second.start_index + len(second.text_with_context)

        if first_end >= second_end:
            # The first chunk completely covers the second one.
            merged_text = first.text_with_context
        else:
            # Append only the part of the second chunk that is not already
            # present at the end of the first one.
            overlap = max(0, first_end - second.start_index)
            merged_text = first.text_with_context + second.text_with_context[overlap:]

        return Info(first.start_index, merged_text, new_dist, first.id)

    @staticmethod
    def should_merge(s1, s2, s1_start, s2_start):
        """Return True when the spans of ``s1`` and ``s2`` are adjacent or overlapping."""
        s1_end = s1_start + len(s1)
        s2_end = s2_start + len(s2)

        # Disjoint only if one span ends strictly before the other begins.
        return s1_end >= s2_start and s2_end >= s1_start


class ChromaCollector():
    """Wraps a ChromaDB collection and the bookkeeping needed to retrieve chunks with context.

    ``ids`` lists the ids currently stored, ``id_to_info`` maps each id to its
    ``text_with_context`` and ``start_index``, and ``embeddings_cache`` maps a
    text to the embedding computed for it. Public methods take ``self.lock``.
    """

    def __init__(self):
        name = ''.join(random.choice('ab') for _ in range(10))

        self.name = name
        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
        self.collection = self.chroma_client.create_collection(name=name, embedding_function=embedder)

        self.ids = []
        self.id_to_info = {}
        self.embeddings_cache = {}
        self.lock = threading.Lock()  # Locking so the server doesn't break.

    def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int], metadatas: list[dict] = None):
        """Add ``texts`` to the collection and remember their context and start index.

        Texts whose embedding is already cached are added with the cached
        embedding; the rest are embedded in a single call and cached.
        ``metadatas`` must be ``None`` or have one entry per text.
        """
        with self.lock:
            assert metadatas is None or len(metadatas) == len(texts), "metadatas must be None or have the same length as texts"

            if len(texts) == 0:
                return

            new_ids = self._get_new_ids(len(texts))

            (existing_texts, existing_embeddings, existing_ids, existing_metas), \
                (non_existing_texts, non_existing_ids, non_existing_metas) = self._split_texts_by_cache_hit(texts, new_ids, metadatas)

            # If there are any already existing texts, add them all at once.
            if existing_texts:
                logger.info(f'Adding {len(existing_embeddings)} cached embeddings.')
                self._insert(existing_embeddings, existing_texts, existing_ids,
                             existing_metas if metadatas is not None else None)

            # If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead.
            if non_existing_texts:
                non_existing_embeddings = embedder(non_existing_texts)
                for text, embedding in zip(non_existing_texts, non_existing_embeddings):
                    self.embeddings_cache[text] = embedding

                logger.info(f'Adding {len(non_existing_embeddings)} new embeddings.')
                self._insert(non_existing_embeddings, non_existing_texts, non_existing_ids,
                             non_existing_metas if metadatas is not None else None)

            # Map each new ID to its context and starting index
            for id_, context, start_index in zip(new_ids, texts_with_context, starting_indices):
                self.id_to_info[id_] = {'text_with_context': context, 'start_index': start_index}

            self.ids.extend(new_ids)

    def _insert(self, embeddings, documents, ids, metadatas):
        """Add one batch to the collection; ``metadatas`` is only passed along when not None."""
        args = {'embeddings': embeddings, 'documents': documents, 'ids': ids}
        if metadatas is not None:
            args['metadatas'] = metadatas
        self.collection.add(**args)

    def _split_texts_by_cache_hit(self, texts: list[str], new_ids: list[str], metadatas: list[dict]):
        """Partition ``texts`` by whether an embedding is cached for them.

        Returns ``(texts, embeddings, ids, metas)`` for the cached texts and
        ``(texts, ids, metas)`` for the others. Metadata entries are ``None``
        when ``metadatas`` is ``None``.
        """
        existing_texts, non_existing_texts = [], []
        existing_embeddings = []
        existing_ids, non_existing_ids = [], []
        existing_metas, non_existing_metas = [], []

        for i, (text, id_) in enumerate(zip(texts, new_ids)):
            metadata = metadatas[i] if metadatas is not None else None
            embedding = self.embeddings_cache.get(text)
            # Compare against None explicitly: the truth value of a NumPy
            # array embedding is ambiguous and would raise ValueError.
            if embedding is not None:
                existing_texts.append(text)
                existing_embeddings.append(embedding)
                existing_ids.append(id_)
                existing_metas.append(metadata)
            else:
                non_existing_texts.append(text)
                non_existing_ids.append(id_)
                non_existing_metas.append(metadata)

        return (existing_texts, existing_embeddings, existing_ids, existing_metas), \
               (non_existing_texts, non_existing_ids, non_existing_metas)

    def _get_new_ids(self, num_new_ids: int):
        """Return ``num_new_ids`` consecutive numeric string ids following the largest stored id."""
        next_id = max((int(id_) for id_ in self.ids), default=-1) + 1
        return [str(next_id + i) for i in range(num_new_ids)]

    def _find_min_max_start_index(self):
        """Return ``(min, max)`` of the stored start indices, or ``(inf, 0)`` when nothing is stored."""
        starts = [info['start_index'] for info in self.id_to_info.values()]
        if not starts:
            return float('inf'), 0
        return min(starts), max(starts)

    # NB: Does not make sense to weigh excerpts from different documents.
    # But let's say that's the user's problem. Perfect world scenario:
    # Apply time weighing to different documents. For each document, then, add
    # separate time weighing.

    def _apply_sigmoid_time_weighing(self, infos: "list[Info]", document_len: int, time_steepness: float, time_power: float, index_offset: int = 0):
        """Scale each info's distance in place by a position-dependent weight.

        Weights follow a sigmoid over ``document_len`` positions, rescaled to
        ``[1 - time_power, 1]`` and reversed, so the first position gets
        weight 1 and the last gets ``1 - time_power``. ``index_offset`` is
        subtracted from each ``start_index`` before the lookup; pass the
        smallest start index when ``document_len`` was computed as
        ``max - min + 1``.
        """
        # With fewer than two positions there is no range to weigh over; the
        # rescaling below would divide by zero and turn distances into NaN.
        if not infos or document_len <= 1:
            return

        weights = 1 / (1 + np.exp(-(time_steepness * np.linspace(-10, 10, document_len))))

        # Scale to [0,time_power] and shift it up to [1-time_power, 1]
        weights = weights - weights.min()
        spread = weights.max()
        if spread == 0:
            # Flat curve (e.g. zero steepness): nothing to weigh.
            return
        weights = weights * (time_power / spread)
        weights = weights + (1 - time_power)

        # Reverse the weights
        weights = weights[::-1]

        for info in infos:
            info.distance *= weights[info.start_index - index_offset]

    def _filter_outliers_by_median_distance(self, infos: "list[Info]", significant_level: float):
        """Keep infos whose distance is at most ``significant_level`` times the median distance.

        The info with the smallest distance is always kept. Returns a new list.
        """
        # Ensure there are infos to filter
        if not infos:
            return []

        closest = min(infos, key=lambda x: x.distance)
        threshold = significant_level * np.median([inf.distance for inf in infos])

        # Filter out infos that have a distance significantly greater than the median
        kept = [inf for inf in infos if inf.distance <= threshold]

        # Always include the info with minimum distance
        if closest not in kept:
            kept.append(closest)

        return kept

    def _merge_infos(self, infos: "list[Info]"):
        """Merge neighbouring infos that touch or overlap; ``infos`` must be sorted by start index."""
        if not infos:
            return []

        merged_infos = []
        current_info = infos[0]

        for next_info in infos[1:]:
            merged = current_info.merge_with(next_info)
            if merged is None:
                merged_infos.append(current_info)
                current_info = next_info
            else:
                current_info = merged

        merged_infos.append(current_info)
        return merged_infos

    def _get_documents_ids_distances(self, search_strings: list[str], n_results: int):
        """Main function for retrieving chunks by distance.

        Queries the collection once per search string, applies time weighing
        and median filtering to each result set, then sorts everything by
        start index and merges touching chunks. Returns three parallel lists:
        texts with context, ids and distances.
        """
        n_results = min(len(self.ids), n_results)
        if n_results == 0:
            return [], [], []

        if isinstance(search_strings, str):
            search_strings = [search_strings]
        if not search_strings:
            return [], [], []

        min_start_index, max_start_index = self._find_min_max_start_index()
        document_len = max_start_index - min_start_index + 1
        results_per_query = math.ceil(n_results / len(search_strings))

        infos = []
        for search_string in search_strings:
            result = self.collection.query(query_texts=search_string, n_results=results_per_query, include=['distances'])
            curr_infos = [Info(start_index=self.id_to_info[id_]['start_index'],
                               text_with_context=self.id_to_info[id_]['text_with_context'],
                               distance=distance, id=id_)
                          for id_, distance in zip(result['ids'][0], result['distances'][0])]

            # The weight table covers positions min..max, so look weights up
            # relative to the smallest start index.
            self._apply_sigmoid_time_weighing(infos=curr_infos, document_len=document_len, time_steepness=parameters.get_time_steepness(), time_power=parameters.get_time_power(), index_offset=min_start_index)
            curr_infos = self._filter_outliers_by_median_distance(curr_infos, parameters.get_significant_level())
            infos.extend(curr_infos)

        infos.sort(key=lambda x: x.start_index)
        infos = self._merge_infos(infos)

        texts_with_context = [inf.text_with_context for inf in infos]
        ids = [inf.id for inf in infos]
        distances = [inf.distance for inf in infos]

        return texts_with_context, ids, distances

    def get(self, search_strings: list[str], n_results: int) -> list[str]:
        """Get chunks by similarity."""
        with self.lock:
            documents, _, _ = self._get_documents_ids_distances(search_strings, n_results)
            return documents

    def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:
        """Get ids by similarity."""
        with self.lock:
            _, ids, _ = self._get_documents_ids_distances(search_strings, n_results)
            return ids

    def _get_documents_up_to_token_count(self, documents: list[str], max_token_count: int):
        """Return the leading documents that fit in ``max_token_count`` tokens.

        The first document that does not fit is truncated to the remaining
        budget (or dropped when no budget is left) and everything after it is
        discarded.
        """
        # TODO: Move to caller; We add delimiters there which might go over the limit.
        current_token_count = 0
        return_documents = []

        for doc in documents:
            doc_tokens = encode(doc)[0]
            doc_token_count = len(doc_tokens)
            remaining_tokens = max_token_count - current_token_count

            if doc_token_count > remaining_tokens:
                # Adding this document would exceed the max token count:
                # truncate it to fit. With no budget left, skip it rather than
                # appending an empty document (a negative slice bound would
                # also cut from the wrong end).
                if remaining_tokens > 0:
                    return_documents.append(decode(doc_tokens[:remaining_tokens], skip_special_tokens=True))
                break

            return_documents.append(doc)
            current_token_count += doc_token_count

        return return_documents

    def get_sorted_by_ids(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:
        """Get chunks by similarity and then sort by ids (numerically ascending)."""
        with self.lock:
            documents, ids, _ = self._get_documents_ids_distances(search_strings, n_results)
            # Ids are numeric strings; compare them as integers so that "10"
            # sorts after "2".
            sorted_docs = [doc for _, doc in sorted(zip(ids, documents), key=lambda pair: int(pair[0]))]

            return self._get_documents_up_to_token_count(sorted_docs, max_token_count)

    def get_sorted_by_dist(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:
        """Get chunks by similarity and then sort by distance (lowest distance is last)."""
        with self.lock:
            documents, _, distances = self._get_documents_ids_distances(search_strings, n_results)
            sorted_docs = [doc for doc, _ in sorted(zip(documents, distances), key=lambda x: x[1])]  # sorted lowest -> highest

            # If a document is truncated or completely skipped, it would be with high distance.
            return_documents = self._get_documents_up_to_token_count(sorted_docs, max_token_count)
            return_documents.reverse()  # highest -> lowest

            return return_documents

    def delete(self, ids_to_delete: list[str], where: dict):
        """Delete the records matching ``ids_to_delete`` and ``where`` and forget their bookkeeping."""
        with self.lock:
            # Resolve the ids that actually match before deleting, so the
            # local bookkeeping drops exactly those.
            ids_to_delete = self.collection.get(ids=ids_to_delete, where=where)['ids']
            self.collection.delete(ids=ids_to_delete, where=where)

            # Remove the deleted ids from self.ids and self.id_to_info
            ids_set = set(ids_to_delete)
            self.ids = [id_ for id_ in self.ids if id_ not in ids_set]
            for id_ in ids_to_delete:
                self.id_to_info.pop(id_, None)

            logger.info(f'Successfully deleted {len(ids_to_delete)} records from chromaDB.')

    def clear(self):
        """Reset the client, recreate the collection and drop all id bookkeeping.

        ``embeddings_cache`` is not touched here.
        """
        with self.lock:
            self.chroma_client.reset()

            self.ids = []
            # Ids restart from 0 after a clear; stale entries would otherwise
            # skew the start-index range used for time weighing.
            self.id_to_info = {}
            self.chroma_client.delete_collection(name=self.name)
            self.collection = self.chroma_client.create_collection(name=self.name, embedding_function=embedder)

            logger.info('Successfully cleared all records and reset chromaDB.')


def make_collector():
    """Build and return a new ChromaCollector instance."""
    collector = ChromaCollector()
    return collector
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`import math`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`import random`
			`import threading`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`import chromadb`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`import numpy as np`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`import posthog`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`from chromadb.config import Settings`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`from chromadb.utils import embedding_functions`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`import extensions.superboogav2.parameters as parameters`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`from modules.logging_colors import logger`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`from modules.text_generation import decode, encode`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`# Intercept calls to posthog`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`posthog.capture = lambda *args, **kwargs: None`


Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`embedder = embedding_functions.SentenceTransformerEmbeddingFunction("sentence-transformers/all-mpnet-base-v2")`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00

			`class Info:`
			`def __init__(self, start_index, text_with_context, distance, id):`
			`self.text_with_context = text_with_context`
			`self.start_index = start_index`
			`self.distance = distance`
			`self.id = id`

			`def calculate_distance(self, other_info):`
			`if parameters.get_new_dist_strategy() == parameters.DIST_MIN_STRATEGY:`
			`# Min`
			`return min(self.distance, other_info.distance)`
			`elif parameters.get_new_dist_strategy() == parameters.DIST_HARMONIC_STRATEGY:`
			`# Harmonic mean`
			`return 2 * (self.distance * other_info.distance) / (self.distance + other_info.distance)`
			`elif parameters.get_new_dist_strategy() == parameters.DIST_GEOMETRIC_STRATEGY:`
			`# Geometric mean`
			`return (self.distance * other_info.distance) ** 0.5`
			`elif parameters.get_new_dist_strategy() == parameters.DIST_ARITHMETIC_STRATEGY:`
			`# Arithmetic mean`
			`return (self.distance + other_info.distance) / 2`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`else: # Min is default`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`return min(self.distance, other_info.distance)`

			`def merge_with(self, other_info):`
			`s1 = self.text_with_context`
			`s2 = other_info.text_with_context`
			`s1_start = self.start_index`
			`s2_start = other_info.start_index`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`new_dist = self.calculate_distance(other_info)`

			`if self.should_merge(s1, s2, s1_start, s2_start):`
			`if s1_start <= s2_start:`
			`if s1_start + len(s1) >= s2_start + len(s2): # if s1 completely covers s2`
			`return Info(s1_start, s1, new_dist, self.id)`
			`else:`
			`overlap = max(0, s1_start + len(s1) - s2_start)`
			`return Info(s1_start, s1 + s2[overlap:], new_dist, self.id)`
			`else:`
			`if s2_start + len(s2) >= s1_start + len(s1): # if s2 completely covers s1`
			`return Info(s2_start, s2, new_dist, other_info.id)`
			`else:`
			`overlap = max(0, s2_start + len(s2) - s1_start)`
			`return Info(s2_start, s2 + s1[overlap:], new_dist, other_info.id)`

			`return None`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`@staticmethod`
			`def should_merge(s1, s2, s1_start, s2_start):`
			`# Check if s1 and s2 are adjacent or overlapping`
			`s1_end = s1_start + len(s1)`
			`s2_end = s2_start + len(s2)`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`return not (s1_end < s2_start or s2_end < s1_start)`

Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
			`class ChromaCollector():`
			`def __init__(self):`
			`name = ''.join(random.choice('ab') for _ in range(10))`

			`self.name = name`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`self.collection = self.chroma_client.create_collection(name=name, embedding_function=embedder)`

Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`self.ids = []`
			`self.id_to_info = {}`
			`self.embeddings_cache = {}`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`self.lock = threading.Lock() # Locking so the server doesn't break.`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
			`def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int], metadatas: list[dict] = None):`
			`with self.lock:`
			`assert metadatas is None or len(metadatas) == len(texts), "metadatas must be None or have the same length as texts"`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
			`if len(texts) == 0:`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`return`

			`new_ids = self._get_new_ids(len(texts))`

			`(existing_texts, existing_embeddings, existing_ids, existing_metas), \`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`(non_existing_texts, non_existing_ids, non_existing_metas) = self._split_texts_by_cache_hit(texts, new_ids, metadatas)`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
			`# If there are any already existing texts, add them all at once.`
			`if existing_texts:`
			`logger.info(f'Adding {len(existing_embeddings)} cached embeddings.')`
			`args = {'embeddings': existing_embeddings, 'documents': existing_texts, 'ids': existing_ids}`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`if metadatas is not None:`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`args['metadatas'] = existing_metas`
			`self.collection.add(**args)`

			`# If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead.`
			`if non_existing_texts:`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`non_existing_embeddings = embedder(non_existing_texts)`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`for text, embedding in zip(non_existing_texts, non_existing_embeddings):`
			`self.embeddings_cache[text] = embedding`

			`logger.info(f'Adding {len(non_existing_embeddings)} new embeddings.')`
			`args = {'embeddings': non_existing_embeddings, 'documents': non_existing_texts, 'ids': non_existing_ids}`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`if metadatas is not None:`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`args['metadatas'] = non_existing_metas`
			`self.collection.add(**args)`

			`# Create a dictionary that maps each ID to its context and starting index`
			`new_info = {`
			`id_: {'text_with_context': context, 'start_index': start_index}`
			`for id_, context, start_index in zip(new_ids, texts_with_context, starting_indices)`
			`}`

			`self.id_to_info.update(new_info)`
			`self.ids.extend(new_ids)`

			`def _split_texts_by_cache_hit(self, texts: list[str], new_ids: list[str], metadatas: list[dict]):`
			`existing_texts, non_existing_texts = [], []`
			`existing_embeddings = []`
			`existing_ids, non_existing_ids = [], []`
			`existing_metas, non_existing_metas = [], []`

			`for i, text in enumerate(texts):`
			`id_ = new_ids[i]`
			`metadata = metadatas[i] if metadatas is not None else None`
			`embedding = self.embeddings_cache.get(text)`
			`if embedding:`
			`existing_texts.append(text)`
			`existing_embeddings.append(embedding)`
			`existing_ids.append(id_)`
			`existing_metas.append(metadata)`
			`else:`
			`non_existing_texts.append(text)`
			`non_existing_ids.append(id_)`
			`non_existing_metas.append(metadata)`

			`return (existing_texts, existing_embeddings, existing_ids, existing_metas), \`
			`(non_existing_texts, non_existing_ids, non_existing_metas)`

			`def _get_new_ids(self, num_new_ids: int):`
			`if self.ids:`
			`max_existing_id = max(int(id_) for id_ in self.ids)`
			`else:`
			`max_existing_id = -1`

			`return [str(i + max_existing_id + 1) for i in range(num_new_ids)]`

			`def _find_min_max_start_index(self):`
			`max_index, min_index = 0, float('inf')`
			`for _, val in self.id_to_info.items():`
			`if val['start_index'] > max_index:`
			`max_index = val['start_index']`
			`if val['start_index'] < min_index:`
			`min_index = val['start_index']`
			`return min_index, max_index`

Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`# NB: Does not make sense to weigh excerpts from different documents.`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`# But let's say that's the user's problem. Perfect world scenario:`
			`# Apply time weighing to different documents. For each document, then, add`
			`# separate time weighing.`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def _apply_sigmoid_time_weighing(self, infos: list[Info], document_len: int, time_steepness: float, time_power: float):`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`def sigmoid(x):`
			`return 1 / (1 + np.exp(-x))`

Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`weights = sigmoid(time_steepness * np.linspace(-10, 10, document_len))`

			`# Scale to [0,time_power] and shift it up to [1-time_power, 1]`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`weights = weights - min(weights)`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`weights = weights * (time_power / max(weights))`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`weights = weights + (1 - time_power)`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
			`# Reverse the weights`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`weights = weights[::-1]`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
			`for info in infos:`
			`index = info.start_index`
			`info.distance *= weights[index]`

			`def _filter_outliers_by_median_distance(self, infos: list[Info], significant_level: float):`
			`# Ensure there are infos to filter`
			`if not infos:`
			`return []`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`# Find info with minimum distance`
			`min_info = min(infos, key=lambda x: x.distance)`

			`# Calculate median distance among infos`
			`median_distance = np.median([inf.distance for inf in infos])`

			`# Filter out infos that have a distance significantly greater than the median`
			`filtered_infos = [inf for inf in infos if inf.distance <= significant_level * median_distance]`

			`# Always include the info with minimum distance`
			`if min_info not in filtered_infos:`
			`filtered_infos.append(min_info)`

			`return filtered_infos`

			`def _merge_infos(self, infos: list[Info]):`
			`merged_infos = []`
			`current_info = infos[0]`

			`for next_info in infos[1:]:`
			`merged = current_info.merge_with(next_info)`
			`if merged is not None:`
			`current_info = merged`
			`else:`
			`merged_infos.append(current_info)`
			`current_info = next_info`

			`merged_infos.append(current_info)`
			`return merged_infos`

			`# Main function for retrieving chunks by distance. It performs merging, time weighing, and mean filtering.`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def _get_documents_ids_distances(self, search_strings: list[str], n_results: int):`
			`n_results = min(len(self.ids), n_results)`
			`if n_results == 0:`
			`return [], [], []`

			`if isinstance(search_strings, str):`
			`search_strings = [search_strings]`

			`infos = []`
			`min_start_index, max_start_index = self._find_min_max_start_index()`

			`for search_string in search_strings:`
			`result = self.collection.query(query_texts=search_string, n_results=math.ceil(n_results / len(search_strings)), include=['distances'])`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`curr_infos = [Info(start_index=self.id_to_info[id]['start_index'],`
			`text_with_context=self.id_to_info[id]['text_with_context'],`
			`distance=distance, id=id)`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`for id, distance in zip(result['ids'][0], result['distances'][0])]`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`self._apply_sigmoid_time_weighing(infos=curr_infos, document_len=max_start_index - min_start_index + 1, time_steepness=parameters.get_time_steepness(), time_power=parameters.get_time_power())`
			`curr_infos = self._filter_outliers_by_median_distance(curr_infos, parameters.get_significant_level())`
			`infos.extend(curr_infos)`

			`infos.sort(key=lambda x: x.start_index)`
			`infos = self._merge_infos(infos)`

			`texts_with_context = [inf.text_with_context for inf in infos]`
			`ids = [inf.id for inf in infos]`
			`distances = [inf.distance for inf in infos]`

			`return texts_with_context, ids, distances`

			`# Get chunks by similarity`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def get(self, search_strings: list[str], n_results: int) -> list[str]:`
			`with self.lock:`
			`documents, _, _ = self._get_documents_ids_distances(search_strings, n_results)`
			`return documents`

			`# Get ids by similarity`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:`
			`with self.lock:`
			`_, ids, _ = self._get_documents_ids_distances(search_strings, n_results)`
			`return ids`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`# Cutoff token count`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def _get_documents_up_to_token_count(self, documents: list[str], max_token_count: int):`
			`# TODO: Move to caller; We add delimiters there which might go over the limit.`
			`current_token_count = 0`
			`return_documents = []`

			`for doc in documents:`
			`doc_tokens = encode(doc)[0]`
			`doc_token_count = len(doc_tokens)`
			`if current_token_count + doc_token_count > max_token_count:`
			`# If adding this document would exceed the max token count,`
			`# truncate the document to fit within the limit.`
			`remaining_tokens = max_token_count - current_token_count`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`truncated_doc = decode(doc_tokens[:remaining_tokens], skip_special_tokens=True)`
			`return_documents.append(truncated_doc)`
			`break`
			`else:`
			`return_documents.append(doc)`
			`current_token_count += doc_token_count`

			`return return_documents`

			`# Get chunks by similarity and then sort by ids`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def get_sorted_by_ids(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:`
			`with self.lock:`
			`documents, ids, _ = self._get_documents_ids_distances(search_strings, n_results)`
			`sorted_docs = [x for _, x in sorted(zip(ids, documents))]`

			`return self._get_documents_up_to_token_count(sorted_docs, max_token_count)`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`# Get chunks by similarity and then sort by distance (lowest distance is last).`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`def get_sorted_by_dist(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:`
			`with self.lock:`
			`documents, _, distances = self._get_documents_ids_distances(search_strings, n_results)`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`sorted_docs = [doc for doc, _ in sorted(zip(documents, distances), key=lambda x: x[1])] # sorted lowest -> highest`

Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`# If a document is truncated or completely skipped, it would be with high distance.`
			`return_documents = self._get_documents_up_to_token_count(sorted_docs, max_token_count)`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`return_documents.reverse() # highest -> lowest`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
			`return return_documents`

			`def delete(self, ids_to_delete: list[str], where: dict):`
			`with self.lock:`
			`ids_to_delete = self.collection.get(ids=ids_to_delete, where=where)['ids']`
			`self.collection.delete(ids=ids_to_delete, where=where)`

			`# Remove the deleted ids from self.ids and self.id_to_info`
			`ids_set = set(ids_to_delete)`
			`self.ids = [id_ for id_ in self.ids if id_ not in ids_set]`
			`for id_ in ids_to_delete:`
			`self.id_to_info.pop(id_, None)`

			`logger.info(f'Successfully deleted {len(ids_to_delete)} records from chromaDB.')`

			`def clear(self):`
			`with self.lock:`
			`self.chroma_client.reset()`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00			`self.ids = []`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`self.chroma_client.delete_collection(name=self.name)`
			`self.collection = self.chroma_client.create_collection(name=self.name, embedding_function=embedder)`
Supercharging superbooga (#3272) 2023-09-26 20:30:19 -04:00
			`logger.info('Successfully cleared all records and reset chromaDB.')`


			`def make_collector():`
Make superbooga & superboogav2 functional again (#5656) 2024-03-07 13:03:18 -05:00			`return ChromaCollector()`