Supercharging superbooga (#3272)

This commit is contained in:
HideLord 2023-09-27 00:30:19 +00:00 committed by GitHub
parent ad00b8eb26
commit 0845724a89
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
21 changed files with 12294 additions and 2 deletions

3
.gitignore vendored
View File

@ -33,4 +33,5 @@ models/config-user.yaml
.DS_Store
Thumbs.db
installer_files/
.chroma
installer_files

View File

@ -0,0 +1,207 @@
"""
This module is responsible for the VectorDB API. It currently supports:
* DELETE api/v1/clear
- Clears the whole DB.
* POST api/v1/add
- Add some corpus to the DB. You can also specify metadata to be added alongside it.
* POST api/v1/delete
- Delete specific records with given metadata.
* POST api/v1/get
- Get results from chromaDB.
"""
import json
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import urlparse, parse_qs
from threading import Thread
from modules import shared
from modules.logging_colors import logger
from .chromadb import ChromaCollector
from .data_processor import process_and_add_to_collector
import extensions.superboogav2.parameters as parameters
class CustomThreadingHTTPServer(ThreadingHTTPServer):
    """A ThreadingHTTPServer that carries a ChromaCollector alongside it.

    The stock server instantiates its handler class with only
    (request, client_address, server); this subclass stores the collector and
    overrides finish_request() so every Handler also receives it.
    """

    def __init__(self, server_address, RequestHandlerClass, collector: ChromaCollector, bind_and_activate=True):
        # Keep a reference to the shared collector before the base class
        # binds/activates the listening socket.
        self.collector = collector
        super().__init__(server_address, RequestHandlerClass, bind_and_activate)

    def finish_request(self, request, client_address):
        """Construct the handler with the collector as an extra argument."""
        handler_cls = self.RequestHandlerClass
        handler_cls(request, client_address, self, self.collector)
class Handler(BaseHTTPRequestHandler):
    """Request handler serving the VectorDB REST API backed by a ChromaCollector.

    Supported endpoints (see the module docstring):
      * POST   /api/v1/add     - add some corpus (optionally with metadata) to the DB
      * POST   /api/v1/delete  - delete records matching the given metadata
      * POST   /api/v1/get     - query the DB for the most relevant chunks
      * DELETE /api/v1/clear   - clear the whole DB
    The unversioned '/api/...' aliases are accepted as well.
    """

    def __init__(self, request, client_address, server, collector: ChromaCollector):
        # Assign before super().__init__(): BaseHTTPRequestHandler dispatches
        # the request from its constructor, so do_* methods need the collector
        # to already be set.
        self.collector = collector
        super().__init__(request, client_address, server)

    def _send_json(self, status: int, payload):
        # Shared response path: status line, JSON content type, the CORS/cache
        # headers added by end_headers(), and a UTF-8 encoded JSON body.
        self.send_response(status)
        self.send_header("Content-type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(payload).encode('utf-8'))

    def _send_412_error(self, message):
        # 412 Precondition Failed: a required body parameter is missing.
        self._send_json(412, {"error": message})

    def _send_404_error(self):
        self._send_json(404, {"error": "Resource not found"})

    def _send_400_error(self, error_message: str):
        self._send_json(400, {"error": error_message})

    def _send_200_response(self, message):
        # Accepts either a plain string (wrapped in a {"message": ...} object)
        # or an already JSON-serializable payload, e.g. the /get results dict.
        if isinstance(message, str):
            self._send_json(200, {"message": message})
        else:
            self._send_json(200, message)

    def _handle_get(self, search_strings: list[str], n_results: int, max_token_count: int, sort_param: str):
        """Query the collector, sorted by id or (the default) by distance."""
        if sort_param == parameters.SORT_ID:
            results = self.collector.get_sorted_by_id(search_strings, n_results, max_token_count)
        else:  # parameters.SORT_DISTANCE or anything unrecognized: default is dist
            results = self.collector.get_sorted_by_dist(search_strings, n_results, max_token_count)

        return {
            "results": results
        }

    def do_GET(self):
        # The API is POST/DELETE only; nothing is served via GET.
        self._send_404_error()

    def do_POST(self):
        try:
            content_length = int(self.headers['Content-Length'])
            body = json.loads(self.rfile.read(content_length).decode('utf-8'))

            parsed_path = urlparse(self.path)
            path = parsed_path.path
            query_params = parse_qs(parsed_path.query)

            if path in ['/api/v1/add', '/api/add']:
                corpus = body.get('corpus')
                if corpus is None:
                    self._send_412_error("Missing parameter 'corpus'")
                    return

                clear_before_adding = body.get('clear_before_adding', False)
                metadata = body.get('metadata')
                process_and_add_to_collector(corpus, self.collector, clear_before_adding, metadata)
                self._send_200_response("Data successfully added")

            elif path in ['/api/v1/delete', '/api/delete']:
                metadata = body.get('metadata')
                # Fixed: the original tested the undefined name `corpus` here,
                # which raised NameError (reported as a 400) instead of the
                # intended 412 when 'metadata' was missing.
                if metadata is None:
                    self._send_412_error("Missing parameter 'metadata'")
                    return

                self.collector.delete(ids_to_delete=None, where=metadata)
                self._send_200_response("Data successfully deleted")

            elif path in ['/api/v1/get', '/api/get']:
                search_strings = body.get('search_strings')
                if search_strings is None:
                    self._send_412_error("Missing parameter 'search_strings'")
                    return

                # Optional parameters fall back to the globally configured values.
                n_results = body.get('n_results')
                if n_results is None:
                    n_results = parameters.get_chunk_count()

                max_token_count = body.get('max_token_count')
                if max_token_count is None:
                    max_token_count = parameters.get_max_token_count()

                sort_param = query_params.get('sort', ['distance'])[0]

                results = self._handle_get(search_strings, n_results, max_token_count, sort_param)
                self._send_200_response(results)

            else:
                self._send_404_error()
        except Exception as e:
            # Any parse/processing failure is reported to the client as a 400.
            self._send_400_error(str(e))

    def do_DELETE(self):
        try:
            path = urlparse(self.path).path

            if path in ['/api/v1/clear', '/api/clear']:
                self.collector.clear()
                self._send_200_response("Data successfully cleared")
            else:
                self._send_404_error()
        except Exception as e:
            self._send_400_error(str(e))

    def do_OPTIONS(self):
        # CORS preflight: the permissive headers are appended by end_headers().
        self.send_response(200)
        self.end_headers()

    def end_headers(self):
        # Allow cross-origin access from any page and disable response caching.
        self.send_header('Access-Control-Allow-Origin', '*')
        self.send_header('Access-Control-Allow-Methods', '*')
        self.send_header('Access-Control-Allow-Headers', '*')
        self.send_header('Cache-Control', 'no-store, no-cache, must-revalidate')
        super().end_headers()
class APIManager:
    """Owns the lifecycle of the chromaDB HTTP API server.

    Starts a CustomThreadingHTTPServer on a background daemon thread and
    allows it to be stopped (and later restarted on another port).
    """

    def __init__(self, collector: ChromaCollector):
        self.server = None          # CustomThreadingHTTPServer while running, else None
        self.collector = collector  # shared collector handed to every request handler
        self.is_running = False

    def start_server(self, port: int):
        """Bind the API server on `port` and serve it from a daemon thread."""
        if self.server is not None:
            # Fixed: was a bare print(); use the project logger for consistency
            # with the rest of this module.
            logger.warning("Server already running.")
            return

        # Expose on all interfaces only when --listen was requested.
        address = '0.0.0.0' if shared.args.listen else '127.0.0.1'
        self.server = CustomThreadingHTTPServer((address, port), Handler, self.collector)

        logger.info(f'Starting chromaDB API at http://{address}:{port}/api')

        # Daemon thread so a running API never blocks interpreter shutdown.
        Thread(target=self.server.serve_forever, daemon=True).start()

        self.is_running = True

    def stop_server(self):
        """Shut the server down and release its socket; no-op when not running."""
        if self.server is not None:
            # Fixed: plain string instead of an f-string with no placeholders.
            logger.info('Stopping chromaDB API.')
            self.server.shutdown()
            self.server.server_close()
            self.server = None
            self.is_running = False

    def is_server_running(self):
        """Return True while the server thread is serving."""
        return self.is_running

View File

@ -0,0 +1,72 @@
"""
This module implements a benchmark function to evaluate the performance of the embedding pipeline. It expects a configuration JSON file. It must have questions and expected retrieved text.
For each question, it's essential to have variants of that question. Language is fluid and each person might have their own spin on how they may ask it.
At the end, it will save the results inside a benchmark_{sysdate}.txt file in the main directory.
The benchmark function will return the score as an integer.
"""
import datetime
import json
import os
from pathlib import Path
from .data_processor import process_and_add_to_collector, preprocess_text
from .parameters import get_chunk_count, get_max_token_count
from .utils import create_metadata_source
def benchmark(config_path, collector):
    """Run the embedding-pipeline benchmark described by the JSON config file.

    Each config entry names a text file to ingest plus question groups; every
    question variant earns one point per criterion string found among the
    retrieved chunks. Per-question results are appended to a
    benchmark_{sysdate}.txt log in the current directory.

    Args:
        config_path: path to the benchmark configuration JSON.
        collector: the Chroma collector used for ingestion and retrieval.

    Returns:
        (total_points, max_points) as ints.

    Raises:
        FileNotFoundError: if a corpus file named in the config does not exist.
    """
    # Timestamp used to name this run's log file.
    sysdate = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"benchmark_{sysdate}.txt"

    # Open the log file in append mode
    with open(filename, 'a') as log:
        with open(config_path, 'r') as f:
            data = json.load(f)

        total_points = 0
        max_points = 0

        for item in data:
            filepath = item["text"]

            # Guard clause: fail loudly on a missing corpus file. Fixed: the
            # original raised a plain f-string, which is itself a TypeError in
            # Python 3 ("exceptions must derive from BaseException").
            if not os.path.isfile(Path(filepath)):
                raise FileNotFoundError(f'Cannot find specified file {filepath}.')

            # Ingest the corpus, clearing whatever the collector held before.
            with open(Path(filepath), 'r') as file:
                corpus = file.read()
            process_and_add_to_collector(corpus, collector, True, create_metadata_source('benchmark'))

            for question_group in item["questions"]:
                question_variants = question_group["question_variants"]
                criteria = question_group["criteria"]

                for q in question_variants:
                    max_points += len(criteria)

                    processed_text = preprocess_text(q)

                    # Get the most similar chunks
                    results = collector.get_sorted_by_dist(processed_text, n_results=get_chunk_count(), max_token_count=get_max_token_count())

                    points = 0

                    # A criterion scores if any retrieved chunk contains it.
                    for c in criteria:
                        if any(c in p for p in results):
                            points += 1
                            total_points += 1

                    info = f"The question '{q}' scored {points}/{len(criteria)} points."
                    print(info, file=log)

                print('\n---\n', file=log)

        print(f'##Total points:\n\n{total_points}/{max_points}', file=log)

    return total_points, max_points

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,291 @@
[
{
"text": "extensions/superboogav2/benchmark_texts/aircraft_lease.txt",
"questions": [
{
"question_variants": [
"What is a wet lease?",
"Agh, I'm really wracking my brain here, but can't figure it out. What is a wet lease?",
"I've been trying to wrap my head around this concept and it's just not clicking. Could you elucidate the concept of a wet lease?",
"I'm finding it so hard to understand this whole wet lease thing! Would you be so kind as to explicate on the matter of what is known as a wet lease in the domain of aviation?",
"I've spent hours trying to grasp this and I'm still lost. Could you engage in a detailed exploration of the intricate and nuanced topic that is the wet lease, as it is commonly understood and applied within the broad and complex ecosystem of aviation?"
],
"criteria": [
"WET LEASE shall mean any arrangement whereby Lessee agrees to\n operate the Aircraft under a contractual arrangement with a third party\n pursuant to which no rights to any aircraft specifically identified by serial\n number or registration number are granted to such third party and pursuant to\n which the Aircraft (i) remains at all times under the sole and complete\n operational control of Lessee",
"(ii) shall be operated solely by cockpit crew\n employed by Lessee possessing all current certificates and licenses required by\n applicable Laws and (iii) shall be maintained by Lessee in accordance with all\n the provisions of the Lease including, but not limited to, Sections 6(d) and 12\n (it being understood and agreed by Lessor and Lessee that any Wet Lease shall,\n by its terms, be in all cases subject and subordinate to this Lease)."
]
},
{
"question_variants": [
"What is PBGC?",
"I'm stumped! Can you tell me what PBGC is?",
"I've been racking my brain trying to understand PBGC. Would you mind explaining the concept to me?",
"I've been trying to grasp what PBGC represents in the context of pension benefits, but I'm not getting it. Would it be possible for you to expound upon the matter?",
"I'm having trouble understanding the multi-dimensional entity known as 'PBGC'. Could you embark on an exploration of it? How is it traditionally comprehended, interpreted, and implemented within the sphere of pension benefits?"
],
"criteria": [
"PBGC shall mean the Pension Benefit Guaranty Corporation\n established pursuant to Subtitle A of Part IV of ERISA, and any successor\n thereof."
]
},
{
"question_variants": [
"What is LIEN?",
"I can't seem to find information on what LIEN is. Can you help me?",
"I'm feeling stuck. I can't seem to find any information on what LIEN is. Could you provide some insights?",
"It's like I'm chasing my tail here. I've been searching high and low and I just can't seem to find anything that clearly explains what LIEN is. Could you lend me a hand in understanding it?",
"I'm in a bit of a pickle. I've scoured the internet, flipped through countless books, and I still can't seem to find any definitive information on what LIEN is. Could you possibly help me get my head around it?"
],
"criteria": [
"LIEN shall mean any mortgage, pledge, lien, charge,\n encumbrance, lease, exercise of rights, security interest or claim."
]
},
{
"question_variants": [
"What happens if the Lease is terminated by operation of law?",
"I'm a bit lost here. What happens if the Lease is terminated by operation of law? Can you help me understand?",
"I've been trying to figure this out but I'm stuck. What exactly happens if the Lease is terminated by operation of law? Could you explain?",
"I've been poring over this and I'm still not clear. What exactly transpires if the Lease is terminated by operation of law? I'd appreciate your help in understanding this.",
"I'm really hitting a wall here. I've been trying to understand this, but it feels like I'm reading a foreign language. What's the end result if the Lease is terminated by operation of law? Any help in understanding this, particularly a detailed breakdown, would be greatly appreciated."
],
"criteria": [
"If for any reason whatsoever this Lease shall be terminated\n in whole or in part by operation of law (other than termination under any\n bankruptcy laws as now or hereafter in effect), Lessee nonetheless agrees to\n pay to Lessor amounts equal to the Rent payments hereunder at the time such\n payments would have become due and payable in accordance with the terms hereof",
"had this Lease not been terminated so long as Lessee is able to use, possess\n and quietly enjoy the Aircraft, and so long as such payments are made and all\n other terms and conditions hereof are complied\n\n -16-\n\n\n<PAGE>\n\n\nwith by Lessor and Lessee, Lessor and Lessee will deem this Lease to remain in\nfull force and effect."
]
},
{
"question_variants": [
"What happens if a discrepancy or malfunction is detected during the Acceptance Flight?",
"I'm having difficulty understanding this part. What exactly happens if a discrepancy or malfunction is detected during the Acceptance Flight? Can you provide a thorough explanation?",
"I'm stuck on this one. I'm struggling to comprehend what steps are taken if a discrepancy or malfunction is detected during the Acceptance Flight. Could you possibly explain this in detail?",
"I've been poring over this issue for a while, and it's not clicking. What steps are taken or what are the implications if a discrepancy or malfunction is detected during the Acceptance Flight? I'd appreciate a comprehensive explanation.",
"I'm really hitting a wall here. I've been trying to understand, but it's like I'm reading a foreign language. What's the end result or the next step if a discrepancy or malfunction is detected during the Acceptance Flight? Any help in understanding this, particularly a detailed breakdown, would be greatly appreciated."
],
"criteria": [
"If, during the Acceptance Flight, no discrepancy or malfunction is\n detected with respect to the airworthiness or operational nature of\n the Aircraft by normal airline standards, then (i) the delivery of\n the Aircraft from Lessor to Lessee hereunder shall occur, the\n Aircraft shall be accepted by Lessee hereunder whilst the Aircraft\n is located at the Delivery Location, and Lessee shall execute and\n deliver the Lease Supplement, (ii) th",
"e Aircraft shall continue en\n route to a location selected by Lessee (the \"Ferry Location\"), under\n the operational control of Lessee from the time of such delivery and\n acceptance (the Acceptance Flight shall terminate at the time of\n such delivery and acceptance, and that portion of the flight from\n the Delivery Location to the Ferry Location is herein called the\n \"Ferry Flight\"), and (iii) Lessee shall bear the costs of the flight\n ",
" crew, fuel and other costs\n\n\n -12-\n\n\n<PAGE>\n\n\n relating to the Ferry Flight and shall reimburse the Beneficiaries\n therefor promptly following receipt of the Beneficiaries, invoice\n therefor, PROVIDED, HOWEVER, that, if any discrepancy or malfunction\n is detected with respect to the airworthiness during the Acceptance\n Flight, then, at Lessee's option after consultation with Lessor,\n either ",
"(A) the Aircraft shall be delivered to and accepted by Lessee\n at the Delivery Location and shall be ferried to the Ferry Location,\n as provided in clauses (i), (ii) and (iii) above, where Lessee shall\n remedy such discrepancy or malfunction at the cost (without mark up)\n of the Beneficiaries (provided that such subsequent repair or\n maintenance work shall not affect Lessee's acceptance of the\n Aircraft hereunder), or (B) the Aircraft s",
"hall return to the\n Beneficiaries' storage and maintenance facility where such\n discrepancy or malfunction shall be corrected at the Beneficiaries'\n expense, in which case the Delivery Date shall be postponed to such\n date as the Beneficiaries shall advise, subject to the second\n paragraph of Section 3(c) below. Any discrepancy or malfunction\n detected of an airworthiness nature shall be corrected by Lessee or\n the Beneficiari",
"es in accordance with clause (A) or (B) above, as\n applicable, at Beneficiaries, expense. If during the Acceptance\n Flight a discrepancy or malfunction is detected with respect to the\n operational nature of the Aircraft by normal airline standards but\n no discrepancy or malfunction is detected with respect to the\n airworthiness of the Aircraft, then the Aircraft shall be delivered\n to and accepted by Lessee at the Delivery Location as p",
"rovided in\n clause (A) above, and Lessee shall remedy such discrepancy or\n malfunction at the cost (without mark up) of the Beneficiaries.\n\n In anticipation of the occurrence of the Delivery Date, the\n Beneficiaries retained a flight crew to conduct the Acceptance\n Flight and will incur costs relating to such retention. In\n connection therewith, Lessee agrees to reimburse the Beneficiaries,\n promptly following Lessee's receipt",
" of an invoice therefor, for\n one-half of the costs incurred by the Beneficiaries in connection\n with retaining such flight crew for the Acceptance Flight, which\n costs relate to the period commencing on and including November 29,\n 1995 and ending on the day immediately preceding the Delivery Date."
]
},
{
"question_variants": [
"What condition must the Aircraft meet before being delivered to the Lessee?",
"I'm having some trouble understanding this part. Could you please clarify what condition the Aircraft must meet before being delivered to the Lessee? I would appreciate a detailed explanation.",
"I'm stuck on this point. I'm finding it difficult to understand the specific condition the Aircraft must be in before being handed over to the Lessee. Could you possibly provide a comprehensive explanation?",
"I'm feeling a bit lost here. I'm having trouble understanding the exact condition or standard that the Aircraft must meet before being delivered to the Lessee. Could you provide a detailed walkthrough of the requirements?",
"I've hit a bit of a wall with this one. I've been trying my best to understand this, but it's proving to be quite complex. What is the precise condition that the Aircraft must meet before it can be delivered to the Lessee? Any help in understanding this, particularly a detailed explanation, would be of great help."
],
"criteria": [
"(d) Lessee's obligation to lease the Aircraft hereunder from\nLessor shall also be conditioned upon the Aircraft being delivered to Lessee in\nthe following condition:\n\n (1) The Aircraft shall be airworthy and in good\n operating condition\n\n\n -11-\n\n<PAGE>\n\n\n with all of the Aircraft equipment, components and systems;\n\n (2) The Aircraft shall be clean;\n\n (3) The Airc",
"raft shall meet the requirements for\n airworthiness certification by the FAA;\n\n (4) A borescope of the Engines and the inspection of\n the APU in accordance with the Manufacturer's or APU manufacturer's\n recommendation shall have been performed at the direction of Lessee\n but under the control and at the cost of the Beneficiaries, and any\n discrepancies discovered in connection therewith shall have been\n corrected;"
]
},
{
"question_variants": [
"What rights does the Lessee waive under section 4(c)?",
"Can you tell me about the rights that the Lessee gives up under section 4(c)?",
"I'm having some difficulty here, could you please explain to me what rights the Lessee is forfeiting under the terms of section 4(c)?",
"I'm really struggling to understand this part, it's quite complex. Could you clarify what rights the Lessee is explicitly waiving as per section 4(c) in this agreement?",
"I'm pulling my hair out! What does this even mean? Can you assist me in deciphering what rights the Lessee is giving up or putting aside according to section 4(c)? I'm finding this part particularly challenging to grasp."
],
"criteria": [
"(c) PROHIBITION AGAINST SETOFF, COUNTERCLAIM, ETC. This Lease\n is a net lease. Subject to Section 20(f), Lessee's obligation to pay all Rent\n hereunder shall be absolute and unconditional and shall not be affected or\n reduced by any circumstance, including, without limitation, (i) any setoff,\n counterclaim, recoupment, defense or other right which Lessee may have against\n Lessor, any Beneficiary, the Manufacturer, the Engine Manufacturer, any seller\n of or person providing services with respect ",
"to the Aircraft or any other\n Person, for any reason whatsoever; (ii) any defect in the title, airworthiness\n or eligibility for registration under applicable Law, or any condition, design,\n operation or fitness for use of, or any damage to or loss or destruction of,\n the Aircraft, or any interruption or cessation in the use or possession thereof\n by Lessee for any reason whatsoever, whether arising out of or related to an\n act or omission of Lessee, or any other Person; (iii) any Liens with res",
"pect to\n the Aircraft; (iv) the invalidity or unenforceability or lack of due\n authorization or other infirmity of this Lease or any absence of right, power\n or authority of Lessor or Lessee to enter into this Lease; (v) any insolvency,\n bankruptcy, reorganization or similar proceedings by or against Lessor or\n Lessee; (vi) any other circumstance or happening of any nature whatsoever,\n similar to any of the foregoing; or (vii) any Taxes (other Taxes to which\n Lessee's indemnity does not extend p",
"ursuant to the provisions of Section 10);\n it being the express intention of Lessor and Lessee that all Rent payable\n hereunder shall be payable in all events, unless the obligation to pay the same\n shall be terminated pursuant to the express provisions of this Lease. Nothing\n in this paragraph (c) shall constitute a waiver by Lessee of any right or claim\n that Lessee may separately assert against Lessor or any Beneficiary.\n\n Lessee hereby waives, to the extent permitted by app",
"licable\n Law, any and all rights which it may now have or which at any time hereafter\n may be conferred upon it, by Law or otherwise, to terminate this Lease or any\n obligation imposed upon Lessee hereunder or in relation hereto.\n\n If for any reason whatsoever this Lease shall be terminated\n in whole or in part by operation of law (other than termination under any\n bankruptcy laws as now or hereafter in effect), Lessee nonetheless agrees to\n pay to Lessor amounts equal to the R",
"ent payments hereunder at the time such\n payments would have become due and payable in accordance with the terms hereof\n had this Lease not been terminated so long as Lessee is able to use, possess\n and quietly enjoy the Aircraft, and so long as such payments are made and all\n other terms and conditions hereof are complied\n\n -16-\n\n\n<PAGE>\n\n\nwith by Lessor and Lessee, Lessor and Lessee will deem this Lease to remain in\nfull force and effect."
]
},
{
"question_variants": [
"Can the Lessor and Beneficiaries conduct inspections without notice under certain conditions? What are those conditions?",
"Is it possible for the Lessor and Beneficiaries to carry out inspections without prior notice, given specific circumstances? If so, could you explain what these circumstances might be?",
"I'm finding myself a bit confused here. Can the Lessor and Beneficiaries, under any special conditions, perform inspections without providing any advance notice? If yes, what exactly are these special conditions?",
"I'm at my wit's end! Can the Lessor and Beneficiaries actually go ahead and conduct inspections without giving a heads up, but only when certain conditions are met? What exactly are these conditions that would allow for such actions?",
"I'm really trying to get my head around this, but I could use some assistance. Is it within the Lessor and Beneficiaries' rights to initiate inspections without any forewarning, but only under certain predefined circumstances? What are these circumstances exactly?"
],
"criteria": [
"Lessee shall permit Lessor, each Beneficiary and their\n respective designees on at least seven (7) days' prior written notice to visit\n and inspect the Aircraft, its condition, use and operation and the records\n maintained in connection therewith during normal business hours; PROVIDED,\n HOWEVER, that this shall not unreasonably interfere with Lessee's quiet use and\n enjoyment of the Aircraft PROVIDED FURTHER, HOWEVER, that Lessor or the\n Beneficiaries may conduct such visit and inspection at any",
" time and with or\n without notice if an Event of Default has occurred and is continuing."
]
},
{
"question_variants": [
"What aircraft-related information will the Lessee provide on a monthly and annual basis?",
"Could you let me know what type of aircraft-related details the Lessee is obligated to provide on a monthly and annual basis?",
"I'm finding it a bit tricky to understand this part - could you help me clarify what specific aircraft-related data or information is the Lessee expected to report on both a monthly and an annual basis?",
"I'm really trying to grapple with this agreement. Could you assist me in figuring out the exact nature of the aircraft-related information that the Lessee is required to furnish on a consistent monthly and annual basis?",
"I'm genuinely struggling here! What does it mean exactly? What is the exact nature and extent of the aircraft-related data or information that the Lessee has to provide routinely, both on a monthly and an annual basis? I'm having a hard time understanding the specificities of this provision."
],
"criteria": [
"(v) Lessee will use its reasonable efforts to provide the\n Beneficiaries on or before the fifth day of each calendar month\n commencing with the next calendar month of the Delivery Date, and shall\n in any event provide to the Beneficiaries upon request of a Beneficiary,\n with a properly completed Monthly Aircraft Utilization and Status Report\n in the Form of Exhibit J hereto for the preceding calendar month\n operation of the aircraft;\n\n (vi) Lessee ",
"will use its reasonable efforts to provide the\n Beneficiaries, on or before the 15th day of January of each year\n (commencing with January 1996), and shall in any event provide Lessor and\n the Beneficiaries upon request of a Beneficiary in English, the\n information and documentation for the preceding calendar year as listed\n in Exhibit K hereto; PROVIDED, HOWEVER, that if (i) a Default or an Event\n of Default shall have occurred and be continuing or (ii) Lessee's\n ",
" financial condition changes adversely from its financial condition at the\n time of the Delivery Date, then, upon notice and a request from Lessor or\n a Beneficiary, Lessee shall provide such information on a quarterly basis\n on the 15th day of each January, April, July and October, commencing with\n the first of such dates to follow the date of such notice."
]
},
{
"question_variants": [
"Under what conditions can Lessee consolidate, merge, or transfer assets without Lessor's prior written consent according to the text?",
"Could you explain under which specific circumstances the Lessee is allowed to consolidate, merge, or transfer assets without needing the Lessor's prior written approval, as stated in the text?",
"I'm having a bit of trouble with this section, could you clarify the exact conditions under which the Lessee is permitted to consolidate, merge, or transfer assets without first obtaining the Lessor's written consent, as outlined in the text?",
"I'm really wracking my brain here trying to understand the terms. Can you help me decipher under which exact circumstances or conditions the Lessee can execute consolidation, merging, or asset transfer without needing prior written consent from the Lessor, as the text suggests?",
"I'm pulling my hair out here! What on earth does it mean? What are the specific conditions or circumstances under which the Lessee can consolidate, merge, or transfer assets without having to acquire the Lessor's prior written consent, as it's described in the text? This is really a tough one to crack!"
],
"criteria": [
"(iv) CONSOLIDATION, MERGER, ETC. Without the prior written\n consent of Lessor and each Beneficiary, Lessee shall not consolidate with,\n merge with or merge into any other Person or convey, transfer or lease\n substantially all of its assets as an entirety to any other Person unless, upon\n and after giving effect to such transaction, (A) the surviving entity has at\n least the same net worth and gross assets as the Lessee immediately prior to\n such transaction, such surviving entity is Certified Ai",
"r Carrier and a \"citizen\n of the United States\" as defined in Section 101(16) of the Federal Aviation\n Act, (C) Lessor shall continue to be entitled to the benefits of Section 1110\n of the United States Bankruptcy Code, as in effect from time to time, and (D)\n each of the Operative Documents shall continue in full force and effect and\n shall constitute the legally binding and enforceable obligation of such\n surviving entity."
]
},
{
"question_variants": [
"Who is responsible for replacing any parts on the Aircraft that become worn out, damaged, etc?",
"Could you please specify who holds the responsibility for replacing any parts of the Aircraft that may become worn out, damaged, or similarly affected?",
"I'm having a little trouble understanding this part. Who exactly is tasked with the responsibility of replacing any components of the Aircraft that may get worn out, damaged, or otherwise impaired?",
"I'm really scratching my head trying to figure out who precisely is designated to handle the replacement of any Aircraft parts that become worn out, damaged, or in similar conditions? This aspect seems a bit complicated.",
"I'm on the verge of losing it! Who in the world is charged with the duty of replacing any parts of the Aircraft that get worn out, damaged, or anything like that? I'm really finding it tough to get my head around this point."
],
"criteria": [
"(a) REPLACEMENT OF PARTS. Lessee, at its own cost and\n expense, will promptly replace all Parts which may from time to time become\n worn out, lost, stolen, destroyed, seized, confiscated, damaged beyond repair\n or permanently rendered unfit for use for any reason whatsoever. In addition,\n in the ordinary course of maintenance, service, repair, overhaul or testing,\n Lessee may at its own cost and expense cause to be removed any Parts, whether\n or not worn out, destroyed, damaged beyond repair or ",
"permanently rendered unfit\n for use, provided that Lessee shall replace at its own cost and expense such\n Parts as promptly as practicable. All replacement Parts shall be free and clear\n of all Liens, other than Liens permitted by Section 14 hereof, shall be in at\n least the same modification status and service bulletin accomplishment status,\n shall be fully interchangeable as to form, fit and function, shall have been\n overhauled or repaired and inspected by an agency acceptable to the FAA and\n",
" shall be in as good an operating condition as, and have a utility at least\n equal to and a value and remaining warranty reasonably approximating, the Parts\n replaced (assuming such replaced Parts were in the condition and repair in\n which they were required to be maintained by the terms hereof) and all\n historical records since new or last overhaul relating to such Parts (and all\n historical records since manufacture with respect to Engines, Landing Gears,\n the APU and all life limited parts in",
"stalled on any Engine, Landing Gear or\n APU) shall be maintained by Lessee."
]
},
{
"question_variants": [
"Who bears responsibility if alterations, modifications or additions to the Aircraft result in any loss of revenue or grounding?",
"Can you clarify who would take responsibility if any alterations, modifications, or additions made to the Aircraft cause any loss of revenue or result in grounding?",
"I'm having some difficulty here. Could you please specify who should shoulder the responsibility if any changes, modifications or additions to the Aircraft lead to any form of revenue loss or cause the aircraft to be grounded?",
"I'm really trying to understand this, but it's complex. Could you elucidate who is to bear the brunt if alterations, modifications, or additions to the Aircraft culminate in a loss of revenue or result in the grounding of the aircraft?",
"I'm pulling my hair out over this! Who on earth would bear the responsibility if any alterations, modifications, or additions that are made to the Aircraft end up causing some form of revenue loss or force the aircraft to be grounded? I'm finding this part particularly challenging to comprehend."
],
"criteria": [
"In no event shall Lessor bear any liability or cost for any\n alteration, modification or addition to, or for any grounding or suspension of\n certification of, the Aircraft, or for any loss of revenue arising therefrom.\n Lessee shall make no material alterations, modifications or additions to the\n Aircraft (such as removal of seats, galleys, lavatories, major avionics\n equipment or the like) that would affect the marketability of the Aircraft\n without Lessor's and each Beneficiary's prior written",
" consent. if Lessor and\n each Beneficiary grant such consent, title to such removed Parts shall remain\n with Lessor and Lessor and the Beneficiaries may request Lessee to reinstall\n such Parts prior to termination of this Lease. If Lessor or Beneficiaries\n request Lessee to reinstall such Parts, title to the Parts removed shall vest\n in Lessee. All costs associated with such removal and reinstallation shall be\n borne by Lessee."
]
},
{
"question_variants": [
"Who is the assignor and who is the assignee?",
"Can you help me identify who the assignor is and who takes the role of the assignee?",
"I'm having some trouble figuring this out. Could you clarify for me who exactly is the assignor and who is designated as the assignee in this context?",
"I'm really wrestling with this, it seems a bit tricky. Could you help me to understand who exactly is acting as the assignor and who is being recognized as the assignee in this particular scenario?",
"I'm at my wits' end here! What does it mean? Who exactly is playing the role of the assignor and who is being referred to as the assignee in this situation? This is proving to be quite a tough nut to crack!"
],
"criteria": [
"ASSIGNOR: ALOHA AIRLINES, INC.,\n A HAWAII CORPORATION",
"ASSIGNEE: ALOHA AIRLINES, INC., A\n DELAWARE CORPORATION"
]
},
{
"question_variants": [
"What does it mean when the Assignee is referred to as a 'Certified Air Carrier'?",
"Could you clarify what is implied when the Assignee is labeled as a 'Certified Air Carrier'?",
"I'm having a hard time understanding this. Can you explain what the term 'Certified Air Carrier' means when it is applied to the Assignee in this context?",
"I'm really struggling here to understand this terminology. Could you assist in explaining what it means when the Assignee is characterized as a 'Certified Air Carrier' in this particular situation?",
"I'm almost at the end of my tether! What does this even mean? Can you help me grasp the meaning when the Assignee is designated as a 'Certified Air Carrier'? This particular terminology is really throwing me for a loop!"
],
"criteria": [
"(e) Assignee is a Certified Air Carrier and holds all\nlicenses, certificates, permits and franchises from the appropriate agencies of\nthe United States of America and/or all other governmental authorities having\njurisdiction which are necessary to authorize the Assignee to engage in air\ntransport and to carry on its business as presently conducted and to be\nconducted with the Aircraft."
]
},
{
"question_variants": [
"Why is it important for the Assignee to be a 'citizen of the United States' as defined in 40102(a)(15) of Title 49 of the United States Code?",
"Could you help me understand why it's significant for the Assignee to be defined as a 'citizen of the United States' as per 40102(a)(15) of Title 49 of the United States Code?",
"I'm finding it a bit challenging to comprehend this part. Why is it crucial for the Assignee to be designated as a 'citizen of the United States', as defined under 40102(a)(15) of Title 49 of the United States Code?",
"I'm really trying to unravel this, but it seems quite complex. Could you elucidate why it's so imperative for the Assignee to be identified as a 'citizen of the United States', as per the definition provided in 40102(a)(15) of Title 49 of the United States Code?",
"I'm pulling my hair out over this! What does it even mean? Can you help me decipher why it's so essential for the Assignee to be considered a 'citizen of the United States', as stipulated in 40102(a)(15) of Title 49 of the United States Code? I'm finding this legal terminology particularly difficult to grasp."
],
"criteria": [
"(f) Assignee is a \"citizen of the United States\" as defined\nin 40102(a)(15) of Title 49 of the United States Code."
]
},
{
"question_variants": [
"How many days do I have to pay?",
"Could you specify the number of days I'm given to complete the payment?",
"I'm a bit unsure about the payment deadline. Could you clarify how many days exactly I have to make the payment?",
"I'm really trying to understand the payment terms. Could you help me ascertain the exact number of days that I am allotted to finalize the payment?",
"I'm so confused! What does this mean exactly? Can you help me comprehend the specific amount of time, in days, that I have been provided with to conclude the payment? I'm finding this financial term quite challenging to understand."
],
"criteria": [
"(e) TIMING OF PAYMENT. Any amount due and payable to the\n relevant Indemnitee pursuant to this Section 10 will be paid within 10 days\n after receipt of a written demand therefor from such Indemnitee accompanied by\n a written statement describing in reasonable detail the basis for such\n indemnity and the computation of the amount so payable; PROVIDED, HOWEVER, that\n such amount need not be paid by Lessee prior to the later of (i) five days\n prior to the date the applicable Tax is payable to the a",
"ppropriate Governmental\n Entity or taxing authority or (ii) in the case of amounts which are being\n contested by Lessee in good faith or by Lessor pursuant to Section 10(f), the\n date such contest is finally resolved. If requested in writing by Lessee, and\n at Lessee's sole cost and expense, any calculations by an Indemnitee of any\n amount due and payable\n\n -44-\n\n\n<PAGE>\n\n\n hereunder shall be subject to review and verification by a firm of independent\n certif",
"ied public accounts of internationally recognized stature selected by\n such Indemnitee and reasonably acceptable to Lessee (such approval not to be\n unreasonably withheld or delayed). Such Indemnitee shall make available to such\n accounting firm such information as shall be necessary for purposes of such\n review and verification (but such information shall be held by such accounting\n firm in strictest confidence and shall not in any event be disclosed or made\n available to Lessee). If the result",
" of such review is that Lessee was liable\n for a smaller amount, the excess payment shall be returned by such Indemnitee\n forthwith."
]
},
{
"question_variants": [
"What currency should I pay in?",
"Could you please clarify in which currency I am expected to make the payment?",
"I'm a bit puzzled here, could you specify the exact currency I should use for the payment?",
"I'm really scratching my head trying to figure this out. Could you help me understand in which specific currency I am supposed to settle the payment?",
"I'm quite frustrated at this point! What exactly does it mean? Can you elucidate in which particular currency I'm required to execute the payment? I'm finding this point a bit difficult to decipher."
],
"criteria": [
"(i) PAYMENTS IN U.S. DOLLARS. All amounts to be paid hereunder to\nLessor or Lessee shall be paid in Dollars, in immediately available funds.\nLessee acknowledges that the specification of Dollars in this transaction is\nof the essence and that Dollars shall be the currency of account in any and\nall events. The obligations of Lessee or Lessor hereunder, to Lessor or\nLessee, respectively, shall not be discharged by an amount paid in another\ncurrency, whether pursuant to a judgment or otherwise, to t",
"he extent that the\n amount so paid on prompt conversion to Dollars under normal banking\nprocedures does not yield the amount of Dollars owing to Lessor."
]
},
{
"question_variants": [
"What is the US registration number of the aircraft?",
"Could you please tell me the US registration number assigned to the aircraft?",
"I'm having some difficulty here. Could you specify the exact US registration number of the aircraft?",
"I'm really struggling to get this part. Could you assist me in figuring out what the specific US registration number for the aircraft is?",
"I'm pulling my hair out over this! What does it mean exactly? Can you help me decipher the precise US registration number that's associated with the aircraft? I'm finding it a bit challenging to understand."
],
"criteria": [
"U.S.\n MODEL AND REGISTRATION MANUFACTURER'S\nITEM MANUFACTURER CONFIGURATION NUMBER SERIAL NUMBER\n-------------------------------------------------------------------------------------------------------------------\n<S> <C> <C> <C> <C>\n\nAircraft The Boeing Compa",
"ny 737-25A N685MA*"
]
},
{
"question_variants": [
"What is the maximum duration that a safety or maintenance requirement can remain unaddressed on the aircraft, particularly in terms of airworthiness directives and mandatory orders?",
"How long can a safety or maintenance requirement, especially airworthiness directives and mandatory orders, be left unresolved?",
"How long can an airworthiness directive or mandatory order remain outstanding on the aircraft according to standard lease agreements?",
"What's the longest period that a safety or maintenance requirement, such as airworthiness directives and mandatory orders, can remain unmet on a leased aircraft?",
"What is the maximum allowable timeframe for a safety or maintenance requirement to be left unattended to on an aircraft, specifically referring to airworthiness directives and mandatory orders?"
],
"criteria": [
"(i) have had all repetitive airworthiness directives and mandatory\n orders and regulations in at least half-life or better condition;"
]
},
{
"question_variants": [
"What are the payment locations?",
"Could you specify where exactly I should be making the payments? Are there particular bank accounts or locations?",
"I'm a bit puzzled here. Could you clarify the exact payment locations or bank accounts where I'm supposed to deposit the payments?",
"I'm really struggling to grasp this. Could you assist me in understanding the specific payment locations or bank accounts where I'm expected to send the payments?",
"I'm at my wit's end here! What does this mean? Can you help me figure out the precise locations or bank accounts where I'm supposed to carry out the payments? I'm finding this financial aspect particularly hard to comprehend."
],
"criteria": [
"Payment Locations: For ITOCHU AirLease (Europe) Limited:\n\n Account Name: Citibank New York (ABA No.\n 021000089) for the account of Citibank\n Dublin (account no. 10994598) in favor of\n ITOCHU AirLease (Europe) Limited (account\n no. 1-00-6793-017)\n\n For Marubeni Airleasing (",
"U.K.) Limited:\n\n Harris Bank International Corporation\n ABA #026-007-760\n for the credit of\n The Mitsubishi Trust & Banking Corporation\n London Branch A/C#16011100\n UID No.107280\n for further credit to\n Marubeni Airleasi",
"ng (UK) Ltd.\n Account #020-404391\n\n With respect to payments by\n Lessee of Basic Rent hereunder, 62.682% of\n such amounts shall be paid to ITOCHU\n AirLease (Europe) Limited (as above\n provided) and 37.318% of such amounts\n shall be paid to Marubeni Airlea",
"sing (U.K.)\n Limited (as above provided)."
]
},
{
"question_variants": [
"What is the revision number of the aircraft?",
"Could you please clarify what the revision number of the aircraft is?",
"I'm finding this a bit hard to grasp. Could you specify the exact revision number associated with the aircraft?",
"I'm really trying to understand this, but it's proving difficult. Could you assist me in determining the specific revision number that is attributed to the aircraft?",
"Agh! What does it even mean? Can you help me decipher the exact revision number that is tied to the aircraft? I'm finding this technical detail quite challenging to comprehend."
],
"criteria": [
"Detail Specification (737-25A-Rev. B)"
]
}
]
}
]

View File

@ -0,0 +1,138 @@
"""
This module is responsible for modifying the chat prompt and history.
"""
import json
import re
import extensions.superboogav2.parameters as parameters
from modules import chat
from modules.text_generation import get_encoded_length
from modules.logging_colors import logger
from extensions.superboogav2.utils import create_context_text, create_metadata_source
from .data_processor import process_and_add_to_collector
from .chromadb import ChromaCollector
# Metadata tag attached to every record auto-inserted from the chat history,
# so those records can be located and deleted on the next refresh.
CHAT_METADATA = create_metadata_source('automatic-chat-insert')

# UI chat modes that use the *_instruct name fields.
INSTRUCT_MODE = 'instruct'
CHAT_INSTRUCT_MODE = 'chat-instruct'
def _is_instruct_mode(state: dict):
    """Return True when the UI is in an instruct-style chat mode."""
    return state.get('mode') in (INSTRUCT_MODE, CHAT_INSTRUCT_MODE)
def _remove_tag_if_necessary(user_input: str):
    """Strip a leading or trailing `!c` tag from the input when manual mode is on.

    In automatic mode the input is returned untouched.
    """
    if parameters.get_is_manual():
        return re.sub(r'^\s*!c\s*|\s*!c\s*$', '', user_input)
    return user_input
def _should_query(input: str):
    """Decide whether the collector should be queried for this input.

    In automatic mode every input triggers a query; in manual mode only
    inputs carrying a `!c` tag (at the start or end of a line) do.

    NOTE(review): the parameter name `input` shadows the builtin; kept
    as-is for interface compatibility with existing callers.
    """
    if not parameters.get_is_manual():
        return True

    # bool(...) keeps the original True/False return instead of a Match object.
    return bool(re.search(r'^\s*!c|!c\s*$', input, re.MULTILINE))
def _format_single_exchange(name, text):
if re.search(r':\s*$', name):
return '{} {}\n'.format(name, text)
else:
return '{}: {}\n'.format(name, text)
def _get_names(state: dict):
if _is_instruct_mode(state):
user_name = state['name1_instruct']
bot_name = state['name2_instruct']
else:
user_name = state['name1']
bot_name = state['name2']
if not user_name:
user_name = 'User'
if not bot_name:
bot_name = 'Assistant'
return user_name, bot_name
def _concatinate_history(history: dict, state: dict):
    """Flatten the internal chat history into one newline-separated string.

    Each exchange contributes a 'user: msg' line and, when a reply exists,
    a 'bot: msg' line. The trailing newline is removed.
    """
    user_name, bot_name = _get_names(state)

    internal_history = history['internal']
    assert isinstance(internal_history, list)

    lines = []
    for exchange in internal_history:
        assert isinstance(exchange, list)

        if len(exchange) >= 1:
            lines.append(_format_single_exchange(user_name, exchange[0]))
        if len(exchange) >= 2:
            lines.append(_format_single_exchange(bot_name, exchange[1]))

    return ''.join(lines)[:-1]  # Drop the trailing newline.
def _hijack_last(context_text: str, history: dict, max_len: int, state: dict):
    """Overwrite one past message in `history` with `context_text`.

    Walks the history from newest to oldest, accumulating token counts, and
    picks the oldest message such that the context plus all messages newer
    than it still fit within `max_len` tokens. If no position qualifies, the
    history is left untouched and a warning is logged.
    """
    num_context_tokens = get_encoded_length(context_text)
    # _get_names returns (user, bot); reversed so names[j] matches reversed(messages).
    names = _get_names(state)[::-1]

    history_tokens = 0
    replace_position = None
    for i, messages in enumerate(reversed(history['internal'])):
        for j, message in enumerate(reversed(messages)):
            num_message_tokens = get_encoded_length(_format_single_exchange(names[j], message))

            # TODO: This is an extremely naive solution. A more robust implementation must be made.
            if history_tokens + num_context_tokens <= max_len:
                # This message can be replaced
                replace_position = (i, j)

            history_tokens += num_message_tokens

    if replace_position is None:
        # logger.warn() is deprecated in the logging module; use warning().
        logger.warning("The provided context_text is too long to replace any message in the history.")
    else:
        # Replace the message at replace_position with context_text.
        i, j = replace_position
        history['internal'][-i-1][-j-1] = context_text
def custom_generate_chat_prompt_internal(user_input: str, state: dict, collector: ChromaCollector, **kwargs):
    """Build the chat prompt, optionally syncing the history into the
    collector and injecting retrieved context per the configured strategy.
    """
    if parameters.get_add_chat_to_data():
        # Serialize the entire history into one string.
        history_text = _concatinate_history(kwargs['history'], state)

        if history_text:
            # Drop every previously auto-inserted record before re-adding.
            collector.delete(ids_to_delete=None, where=CHAT_METADATA)

            # Insert the freshly processed history.
            process_and_add_to_collector(history_text, collector, False, CHAT_METADATA)

    if _should_query(user_input):
        user_input = _remove_tag_if_necessary(user_input)
        search_results = collector.get_sorted_by_dist(user_input, n_results=parameters.get_chunk_count(), max_token_count=int(parameters.get_max_token_count()))

        # Either modify the last message directly, or hijack an older one in-context.
        strategy = parameters.get_injection_strategy()
        if strategy == parameters.APPEND_TO_LAST:
            user_input = user_input + create_context_text(search_results)
        elif strategy == parameters.PREPEND_TO_LAST:
            user_input = create_context_text(search_results) + user_input
        elif strategy == parameters.HIJACK_LAST_IN_CONTEXT:
            _hijack_last(create_context_text(search_results), kwargs['history'], state['truncation_length'], state)

    return chat.generate_chat_prompt(user_input, state, **kwargs)

View File

@ -0,0 +1,376 @@
import threading
import chromadb
import posthog
import torch
import math
import numpy as np
import extensions.superboogav2.parameters as parameters
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from modules.logging_colors import logger
from modules.text_generation import encode, decode
# chromadb pulls in posthog telemetry; no-op the capture call so nothing is sent.
logger.debug('Intercepting all calls to posthog.')
posthog.capture = lambda *args, **kwargs: None
class Collecter():
    """Abstract interface for a searchable store of text chunks."""

    def __init__(self):
        pass

    def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int]):
        """Store `texts` with their context-expanded versions and start offsets."""
        pass

    def get(self, search_strings: list[str], n_results: int) -> list[str]:
        """Return up to `n_results` chunks relevant to `search_strings`."""
        pass

    def clear(self):
        """Remove every stored chunk."""
        pass
class Embedder():
    """Abstract interface for turning text into embedding vectors."""

    def __init__(self):
        pass

    def embed(self, text: str) -> list[torch.Tensor]:
        """Return the embedding(s) for `text`."""
        pass
class Info:
    """A retrieved chunk: its start offset in the source document, its text
    with surrounding context, its embedding distance to the query, and its
    chromadb id. Lower distance means more relevant.
    """

    def __init__(self, start_index, text_with_context, distance, id):
        self.text_with_context = text_with_context
        self.start_index = start_index
        self.distance = distance
        self.id = id

    def calculate_distance(self, other_info):
        """Combine this chunk's distance with another's using the configured strategy."""
        if parameters.get_new_dist_strategy() == parameters.DIST_MIN_STRATEGY:
            # Min
            return min(self.distance, other_info.distance)
        elif parameters.get_new_dist_strategy() == parameters.DIST_HARMONIC_STRATEGY:
            # Harmonic mean. Guard the denominator: two exact matches
            # (distance 0) would otherwise raise ZeroDivisionError.
            denominator = self.distance + other_info.distance
            if denominator == 0:
                return 0.0
            return 2 * (self.distance * other_info.distance) / denominator
        elif parameters.get_new_dist_strategy() == parameters.DIST_GEOMETRIC_STRATEGY:
            # Geometric mean
            return (self.distance * other_info.distance) ** 0.5
        elif parameters.get_new_dist_strategy() == parameters.DIST_ARITHMETIC_STRATEGY:
            # Arithmetic mean
            return (self.distance + other_info.distance) / 2
        else:  # Min is default
            return min(self.distance, other_info.distance)

    def merge_with(self, other_info):
        """Merge two overlapping or adjacent chunks into a single Info.

        Returns the merged Info, or None when the chunks are disjoint.
        The merged text stitches both contexts together without
        duplicating the overlapping region.
        """
        s1 = self.text_with_context
        s2 = other_info.text_with_context
        s1_start = self.start_index
        s2_start = other_info.start_index

        new_dist = self.calculate_distance(other_info)

        if self.should_merge(s1, s2, s1_start, s2_start):
            if s1_start <= s2_start:
                if s1_start + len(s1) >= s2_start + len(s2):  # if s1 completely covers s2
                    return Info(s1_start, s1, new_dist, self.id)
                else:
                    overlap = max(0, s1_start + len(s1) - s2_start)
                    return Info(s1_start, s1 + s2[overlap:], new_dist, self.id)
            else:
                if s2_start + len(s2) >= s1_start + len(s1):  # if s2 completely covers s1
                    return Info(s2_start, s2, new_dist, other_info.id)
                else:
                    overlap = max(0, s2_start + len(s2) - s1_start)
                    return Info(s2_start, s2 + s1[overlap:], new_dist, other_info.id)

        return None

    @staticmethod
    def should_merge(s1, s2, s1_start, s2_start):
        # Check if s1 and s2 are adjacent or overlapping
        s1_end = s1_start + len(s1)
        s2_end = s2_start + len(s2)

        return not (s1_end < s2_start or s2_end < s1_start)
class ChromaCollector(Collecter):
    """Collecter backed by an in-memory chromadb collection.

    Keeps three pieces of bookkeeping alongside the DB itself:
      - self.ids: every id currently stored (increasing integer strings),
      - self.id_to_info: id -> {'text_with_context', 'start_index'},
      - self.embeddings_cache: text -> embedding, so previously seen chunks
        are not re-embedded.
    All public methods take self.lock, so one instance can be shared between
    the UI and API server threads.
    """

    def __init__(self, embedder: Embedder):
        super().__init__()
        self.chroma_client = chromadb.Client(Settings(anonymized_telemetry=False))
        self.embedder = embedder
        self.collection = self.chroma_client.create_collection(name="context", embedding_function=self.embedder.embed)
        self.ids = []  # All ids in the collection, in insertion order.
        self.id_to_info = {}  # id -> {'text_with_context': str, 'start_index': int}
        self.embeddings_cache = {}  # text -> embedding; avoids recomputing embeddings.
        self.lock = threading.Lock() # Locking so the server doesn't break.

    def add(self, texts: list[str], texts_with_context: list[str], starting_indices: list[int], metadatas: list[dict] = None):
        """Add chunks to the DB.

        `texts` are the strings actually embedded/searched, `texts_with_context`
        are what gets returned to callers, and `starting_indices` are each
        chunk's offset in the source document. Embeddings are pulled from the
        cache when available; only uncached texts are sent to the embedder.
        """
        with self.lock:
            assert metadatas is None or len(metadatas) == len(texts), "metadatas must be None or have the same length as texts"
            if len(texts) == 0:
                return

            new_ids = self._get_new_ids(len(texts))

            (existing_texts, existing_embeddings, existing_ids, existing_metas), \
                (non_existing_texts, non_existing_ids, non_existing_metas) = self._split_texts_by_cache_hit(texts, new_ids, metadatas)

            # If there are any already existing texts, add them all at once.
            if existing_texts:
                logger.info(f'Adding {len(existing_embeddings)} cached embeddings.')
                args = {'embeddings': existing_embeddings, 'documents': existing_texts, 'ids': existing_ids}
                if metadatas is not None:
                    args['metadatas'] = existing_metas
                self.collection.add(**args)

            # If there are any non-existing texts, compute their embeddings all at once. Each call to embed has significant overhead.
            if non_existing_texts:
                non_existing_embeddings = self.embedder.embed(non_existing_texts).tolist()
                for text, embedding in zip(non_existing_texts, non_existing_embeddings):
                    self.embeddings_cache[text] = embedding

                logger.info(f'Adding {len(non_existing_embeddings)} new embeddings.')
                args = {'embeddings': non_existing_embeddings, 'documents': non_existing_texts, 'ids': non_existing_ids}
                if metadatas is not None:
                    args['metadatas'] = non_existing_metas
                self.collection.add(**args)

            # Create a dictionary that maps each ID to its context and starting index
            new_info = {
                id_: {'text_with_context': context, 'start_index': start_index}
                for id_, context, start_index in zip(new_ids, texts_with_context, starting_indices)
            }

            self.id_to_info.update(new_info)
            self.ids.extend(new_ids)

    def _split_texts_by_cache_hit(self, texts: list[str], new_ids: list[str], metadatas: list[dict]):
        """Partition texts (with their ids/metadatas) into those whose
        embedding is already cached and those that still need embedding."""
        existing_texts, non_existing_texts = [], []
        existing_embeddings = []
        existing_ids, non_existing_ids = [], []
        existing_metas, non_existing_metas = [], []

        for i, text in enumerate(texts):
            id_ = new_ids[i]
            metadata = metadatas[i] if metadatas is not None else None
            embedding = self.embeddings_cache.get(text)
            if embedding:
                existing_texts.append(text)
                existing_embeddings.append(embedding)
                existing_ids.append(id_)
                existing_metas.append(metadata)
            else:
                non_existing_texts.append(text)
                non_existing_ids.append(id_)
                non_existing_metas.append(metadata)

        return (existing_texts, existing_embeddings, existing_ids, existing_metas), \
            (non_existing_texts, non_existing_ids, non_existing_metas)

    def _get_new_ids(self, num_new_ids: int):
        """Return `num_new_ids` fresh ids, continuing after the highest existing id."""
        if self.ids:
            max_existing_id = max(int(id_) for id_ in self.ids)
        else:
            max_existing_id = -1

        return [str(i + max_existing_id + 1) for i in range(num_new_ids)]

    def _find_min_max_start_index(self):
        """Return (min, max) start_index over all stored chunks.

        NOTE(review): with an empty store this returns (inf, 0) — callers
        appear to only use it when chunks exist; confirm.
        """
        max_index, min_index = 0, float('inf')
        for _, val in self.id_to_info.items():
            if val['start_index'] > max_index:
                max_index = val['start_index']
            if val['start_index'] < min_index:
                min_index = val['start_index']
        return min_index, max_index

    # NB: Does not make sense to weigh excerpts from different documents.
    # But let's say that's the user's problem. Perfect world scenario:
    # Apply time weighing to different documents. For each document, then, add
    # separate time weighing.
    def _apply_sigmoid_time_weighing(self, infos: list[Info], document_len: int, time_steepness: float, time_power: float):
        """Scale each info's distance by a position-dependent weight.

        The reversed sigmoid gives chunks later in the document a smaller
        multiplier, i.e. a smaller distance, so later (more recent) text
        ranks as more relevant.

        NOTE(review): `weights` is indexed by the absolute start_index, which
        looks like it assumes the minimum start_index is 0; otherwise an
        IndexError is possible — confirm with callers.
        """
        sigmoid = lambda x: 1 / (1 + np.exp(-x))

        weights = sigmoid(time_steepness * np.linspace(-10, 10, document_len))

        # Scale to [0,time_power] and shift it up to [1-time_power, 1]
        weights = weights - min(weights)
        weights = weights * (time_power / max(weights))
        weights = weights + (1 - time_power)

        # Reverse the weights
        weights = weights[::-1]

        for info in infos:
            index = info.start_index
            info.distance *= weights[index]

    def _filter_outliers_by_median_distance(self, infos: list[Info], significant_level: float):
        """Drop infos whose distance is much larger than the median distance.

        The single closest info is always kept, even if it would otherwise
        be filtered out.
        """
        # Ensure there are infos to filter
        if not infos:
            return []

        # Find info with minimum distance
        min_info = min(infos, key=lambda x: x.distance)

        # Calculate median distance among infos
        median_distance = np.median([inf.distance for inf in infos])

        # Filter out infos that have a distance significantly greater than the median
        filtered_infos = [inf for inf in infos if inf.distance <= significant_level * median_distance]

        # Always include the info with minimum distance
        if min_info not in filtered_infos:
            filtered_infos.append(min_info)

        return filtered_infos

    def _merge_infos(self, infos: list[Info]):
        """Collapse runs of overlapping/adjacent infos into single merged infos.

        NOTE(review): assumes `infos` is non-empty and sorted by start_index
        (the caller sorts before calling) — confirm.
        """
        merged_infos = []
        current_info = infos[0]

        for next_info in infos[1:]:
            merged = current_info.merge_with(next_info)
            if merged is not None:
                current_info = merged
            else:
                merged_infos.append(current_info)
                current_info = next_info

        merged_infos.append(current_info)
        return merged_infos

    # Main function for retrieving chunks by distance. It performs merging, time weighing, and mean filtering.
    def _get_documents_ids_distances(self, search_strings: list[str], n_results: int):
        """Query chromadb for each search string and post-process the hits.

        Returns (texts_with_context, ids, distances), merged and sorted by
        position in the source document.
        """
        n_results = min(len(self.ids), n_results)

        if n_results == 0:
            return [], [], []

        if isinstance(search_strings, str):
            search_strings = [search_strings]

        infos = []
        min_start_index, max_start_index = self._find_min_max_start_index()

        for search_string in search_strings:
            # Split the result budget evenly across the search strings.
            result = self.collection.query(query_texts=search_string, n_results=math.ceil(n_results / len(search_strings)), include=['distances'])
            curr_infos = [Info(start_index=self.id_to_info[id]['start_index'],
                               text_with_context=self.id_to_info[id]['text_with_context'],
                               distance=distance, id=id)
                          for id, distance in zip(result['ids'][0], result['distances'][0])]

            self._apply_sigmoid_time_weighing(infos=curr_infos, document_len=max_start_index - min_start_index + 1, time_steepness=parameters.get_time_steepness(), time_power=parameters.get_time_power())
            curr_infos = self._filter_outliers_by_median_distance(curr_infos, parameters.get_significant_level())
            infos.extend(curr_infos)

        infos.sort(key=lambda x: x.start_index)
        infos = self._merge_infos(infos)

        texts_with_context = [inf.text_with_context for inf in infos]
        ids = [inf.id for inf in infos]
        distances = [inf.distance for inf in infos]

        return texts_with_context, ids, distances

    # Get chunks by similarity
    def get(self, search_strings: list[str], n_results: int) -> list[str]:
        """Return up to `n_results` chunk texts relevant to `search_strings`."""
        with self.lock:
            documents, _, _ = self._get_documents_ids_distances(search_strings, n_results)
            return documents

    # Get ids by similarity
    def get_ids(self, search_strings: list[str], n_results: int) -> list[str]:
        """Return the ids of up to `n_results` relevant chunks."""
        with self.lock:
            _, ids, _ = self._get_documents_ids_distances(search_strings, n_results)
            return ids

    # Cutoff token count
    def _get_documents_up_to_token_count(self, documents: list[str], max_token_count: int):
        """Keep documents (in order) until `max_token_count` tokens are used.

        The first document that would overflow the budget is truncated to
        the remaining tokens; everything after it is dropped.
        """
        # TODO: Move to caller; We add delimiters there which might go over the limit.
        current_token_count = 0
        return_documents = []

        for doc in documents:
            doc_tokens = encode(doc)[0]
            doc_token_count = len(doc_tokens)
            if current_token_count + doc_token_count > max_token_count:
                # If adding this document would exceed the max token count,
                # truncate the document to fit within the limit.
                remaining_tokens = max_token_count - current_token_count

                truncated_doc = decode(doc_tokens[:remaining_tokens], skip_special_tokens=True)
                return_documents.append(truncated_doc)
                break
            else:
                return_documents.append(doc)
                current_token_count += doc_token_count

        return return_documents

    # Get chunks by similarity and then sort by ids
    def get_sorted_by_ids(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:
        """Return relevant chunks sorted by id, trimmed to `max_token_count` tokens.

        NOTE(review): ids are strings, so this sorts lexicographically
        ('10' < '2'); looks like numeric/document order was intended — confirm.
        """
        with self.lock:
            documents, ids, _ = self._get_documents_ids_distances(search_strings, n_results)
            sorted_docs = [x for _, x in sorted(zip(ids, documents))]

            return self._get_documents_up_to_token_count(sorted_docs, max_token_count)

    # Get chunks by similarity and then sort by distance (lowest distance is last).
    def get_sorted_by_dist(self, search_strings: list[str], n_results: int, max_token_count: int) -> list[str]:
        """Return relevant chunks trimmed to `max_token_count` tokens,
        ordered so the most relevant (lowest distance) chunk comes last."""
        with self.lock:
            documents, _, distances = self._get_documents_ids_distances(search_strings, n_results)
            sorted_docs = [doc for doc, _ in sorted(zip(documents, distances), key=lambda x: x[1])]  # sorted lowest -> highest

            # If a document is truncated or completely skipped, it would be with high distance.
            return_documents = self._get_documents_up_to_token_count(sorted_docs, max_token_count)
            return_documents.reverse()  # highest -> lowest

            return return_documents

    def delete(self, ids_to_delete: list[str], where: dict):
        """Delete the records matching `ids_to_delete` (None = any id) and
        the `where` metadata filter, and drop them from local bookkeeping."""
        with self.lock:
            # Resolve the filter to the concrete ids actually stored.
            ids_to_delete = self.collection.get(ids=ids_to_delete, where=where)['ids']
            self.collection.delete(ids=ids_to_delete, where=where)

            # Remove the deleted ids from self.ids and self.id_to_info
            ids_set = set(ids_to_delete)
            self.ids = [id_ for id_ in self.ids if id_ not in ids_set]
            for id_ in ids_to_delete:
                self.id_to_info.pop(id_, None)

            logger.info(f'Successfully deleted {len(ids_to_delete)} records from chromaDB.')

    def clear(self):
        """Drop every record: reset the chroma client, recreate the
        collection, and wipe local bookkeeping (the embedding cache is kept)."""
        with self.lock:
            self.chroma_client.reset()
            self.collection = self.chroma_client.create_collection("context", embedding_function=self.embedder.embed)
            self.ids = []
            self.id_to_info = {}

            logger.info('Successfully cleared all records and reset chromaDB.')
class SentenceTransformerEmbedder(Embedder):
    """Embedder backed by the all-mpnet-base-v2 SentenceTransformer model."""

    def __init__(self) -> None:
        logger.debug('Creating Sentence Embedder...')
        self.model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
        # Expose the model's encode() directly as this embedder's embed().
        self.embed = self.model.encode
def make_collector():
    """Build a ChromaCollector wired to a fresh sentence-transformer embedder."""
    embedder = SentenceTransformerEmbedder()
    return ChromaCollector(embedder)

View File

@ -0,0 +1,161 @@
{
"to_lower": {
"default": false,
"categories": [true, false],
"should_optimize": true
},
"num_conversion": {
"default": null,
"categories": ["NUM_TO_WORD_METHOD", "NUM_TO_CHAR_METHOD", "NUM_TO_CHAR_LONG_METHOD", null],
"should_optimize": true
},
"merge_spaces": {
"default": false,
"categories": [true, false],
"should_optimize": true
},
"strip": {
"default": true,
"categories": [true, false],
"should_optimize": false
},
"remove_punctuation": {
"default": true,
"categories": [true, false],
"should_optimize": true
},
"remove_stopwords": {
"default": false,
"categories": [true, false],
"should_optimize": true
},
"remove_specific_pos": {
"default": false,
"categories": [true, false],
"should_optimize": true
},
"lemmatize": {
"default": true,
"categories": [true, false],
"should_optimize": true
},
"min_num_sent": {
"default": 1,
"categories": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 999999],
"should_optimize": true
},
"delta_start": {
"default": 0,
"categories": [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
"should_optimize": true
},
"chunk_len1": {
"default": 500,
"categories": [50, 200, 250, 500, 600, 900, 1000],
"should_optimize": true
},
"chunk_len2": {
"default": 500,
"categories": [0, 50, 200, 250, 500, 600, 900],
"should_optimize": true
},
"chunk_len3": {
"default": 1000,
"categories": [0, 100, 150, 300, 400, 700, 800, 1000],
"should_optimize": true
},
"chunk_len4": {
"default": 700,
"categories": [0, 100, 150, 300, 400, 700, 800],
"should_optimize": true
},
"chunk_len_mask": {
"default": 15,
"categories": [3, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15],
"should_optimize": false
},
"context_len_left": {
"default": 250,
"categories": [50, 100, 150, 200, 250, 300, 350, 400, 500, 600, 700, 800, 900, 1000],
"should_optimize": true
},
"context_len_right": {
"default": 800,
"categories": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1500, 1600],
"should_optimize": true
},
"new_dist_strategy": {
"default": "DIST_MIN_STRATEGY",
"categories": ["DIST_MIN_STRATEGY", "DIST_HARMONIC_STRATEGY", "DIST_GEOMETRIC_STRATEGY", "DIST_ARITHMETIC_STRATEGY"],
"should_optimize": false
},
"chunk_count": {
"default": 250,
"categories": [30, 50, 75, 100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400],
"should_optimize": true
},
"min_num_length": {
"default": 9,
"categories": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
"should_optimize": true
},
"significant_level": {
"default": 1.0,
"categories": [0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 999999],
"should_optimize": true
},
"time_steepness": {
"default": 0.01,
"categories": [0.01, 0.2, 0.4, 0.6, 0.8, 1.0],
"should_optimize": false
},
"time_power": {
"default": 0,
"categories": [0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
"should_optimize": false
},
"chunk_separator": {
"default": ""
},
"prefix": {
"default": "<<document chunk>>\n\n"
},
"data_separator": {
"default": "\n\n<<document chunk>>\n\n"
},
"postfix": {
"default": "\n\n<<document end>>\n\n"
},
"manual": {
"default": true
},
"add_chat_to_data": {
"default": true
},
"injection_strategy": {
"default": "PREPEND_TO_LAST",
"categories": ["PREPEND_TO_LAST", "APPEND_TO_LAST", "HIJACK_LAST_IN_CONTEXT"]
},
"chunk_regex": {
"default": "(?<==== ).*?(?= ===)|User story: \\d+"
},
"strong_cleanup": {
"default": false
},
"max_token_count": {
"default": 3072
},
"threads": {
"default": 4
},
"optimization_steps": {
"default": 100
},
"api_port": {
"default": 5002
},
"api_on": {
"default": false
}
}

View File

@ -0,0 +1,199 @@
"""
This module contains utils for preprocessing the text before converting it to embeddings.
- TextPreprocessorBuilder preprocesses individual strings.
* lowering cases
* converting numbers to words or characters
* merging and stripping spaces
* removing punctuation
* removing stop words
* lemmatizing
* removing specific parts of speech (adverbs and interjections)
- TextSummarizer extracts the most important sentences from a long string using text-ranking.
"""
import pytextrank
import string
import spacy
import math
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from num2words import num2words
class TextPreprocessorBuilder:
    """
    Fluent builder that applies a chain of normalization steps to one string.

    Each step mutates `self.text` and returns `self` so calls can be chained;
    `build()` returns the final text.
    """
    # Shared, expensive-to-construct resources, built once at class-definition time.
    _stop_words = set(stopwords.words('english'))
    _lemmatizer = WordNetLemmatizer()

    # Some of the functions are expensive. We cache the results.
    # NOTE(review): both caches grow without bound for the process lifetime.
    _lemmatizer_cache = {}
    _pos_remove_cache = {}

    def __init__(self, text: str):
        self.text = text

    def to_lower(self):
        """Lowercase words, but keep ALL-CAPS tokens (abbreviations/constants) intact."""
        # Match both words and non-word characters
        tokens = re.findall(r'\b\w+\b|\W+', self.text)
        for i, token in enumerate(tokens):
            # Check if token is a word
            if re.match(r'^\w+$', token):
                # Check if token is not an abbreviation or constant.
                # NOTE(review): the first pattern is subsumed by the second
                # ([A-Z_]+ matches everything [A-Z]+ does).
                if not re.match(r'^[A-Z]+$', token) and not re.match(r'^[A-Z_]+$', token):
                    tokens[i] = token.lower()
        self.text = "".join(tokens)
        return self

    def num_to_word(self, min_len: int = 1):
        """Spell out numbers of at least `min_len` digits as English words."""
        # Match both words and non-word characters
        tokens = re.findall(r'\b\w+\b|\W+', self.text)
        for i, token in enumerate(tokens):
            # Check if token is a number of length `min_len` or more
            if token.isdigit() and len(token) >= min_len:
                # This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers)
                # 740700 will become "seven hundred and forty thousand seven hundred".
                tokens[i] = num2words(int(token)).replace(",","") # Remove commas from num2words.
        self.text = "".join(tokens)
        return self

    def num_to_char_long(self, min_len: int = 1):
        """Encode each digit as a letter repeated by its place value (740700 -> HHHHHHEEEEEAAAAHHHAAA)."""
        # Match both words and non-word characters
        tokens = re.findall(r'\b\w+\b|\W+', self.text)
        for i, token in enumerate(tokens):
            # Check if token is a number of length `min_len` or more
            if token.isdigit() and len(token) >= min_len:
                # This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers)
                # 740700 will become HHHHHHEEEEEAAAAHHHAAA
                # Digit d maps to chr(d + 65) (0->A ... 9->J); repetition count grows with place value.
                convert_token = lambda token: ''.join((chr(int(digit) + 65) * (i + 1)) for i, digit in enumerate(token[::-1]))[::-1]
                tokens[i] = convert_token(tokens[i])
        self.text = "".join(tokens)
        return self

    def num_to_char(self, min_len: int = 1):
        """Encode each digit as a single letter A-J (740700 -> HEAHAA)."""
        # Match both words and non-word characters
        tokens = re.findall(r'\b\w+\b|\W+', self.text)
        for i, token in enumerate(tokens):
            # Check if token is a number of length `min_len` or more
            if token.isdigit() and len(token) >= min_len:
                # This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers)
                # 740700 will become HEAHAA
                tokens[i] = ''.join(chr(int(digit) + 65) for digit in token)
        self.text = "".join(tokens)
        return self

    def merge_spaces(self):
        """Collapse runs of spaces into a single space."""
        self.text = re.sub(' +', ' ', self.text)
        return self

    def strip(self):
        """Trim leading/trailing whitespace."""
        self.text = self.text.strip()
        return self

    def remove_punctuation(self):
        """Delete all ASCII punctuation characters."""
        self.text = self.text.translate(str.maketrans('', '', string.punctuation))
        return self

    def remove_stopwords(self):
        """Drop English stop words; non-word separators are kept as-is."""
        self.text = "".join([word for word in re.findall(r'\b\w+\b|\W+', self.text) if word not in TextPreprocessorBuilder._stop_words])
        return self

    def remove_specific_pos(self):
        """
        In the English language, adverbs and interjections rarely provide meaningul information.
        Removing them improves the embedding precision. Don't tell JK Rowling, though.
        """
        # Whole-input results are cached because per-token POS tagging is slow.
        processed_text = TextPreprocessorBuilder._pos_remove_cache.get(self.text)
        if processed_text:
            self.text = processed_text
            return self

        # Match both words and non-word characters
        tokens = re.findall(r'\b\w+\b|\W+', self.text)

        # Exclude adverbs and interjections
        excluded_tags = ['RB', 'RBR', 'RBS', 'UH']

        for i, token in enumerate(tokens):
            # Check if token is a word
            if re.match(r'^\w+$', token):
                # Part-of-speech tag the word (NOTE: tagged in isolation, without sentence context)
                pos = nltk.pos_tag([token])[0][1]
                # If the word's POS tag is in the excluded list, remove the word
                if pos in excluded_tags:
                    tokens[i] = ''

        new_text = "".join(tokens)
        TextPreprocessorBuilder._pos_remove_cache[self.text] = new_text
        self.text = new_text
        return self

    def lemmatize(self):
        """Reduce each word to its lemma (cached per input string)."""
        processed_text = TextPreprocessorBuilder._lemmatizer_cache.get(self.text)
        if processed_text:
            self.text = processed_text
            return self

        new_text = "".join([TextPreprocessorBuilder._lemmatizer.lemmatize(word) for word in re.findall(r'\b\w+\b|\W+', self.text)])
        TextPreprocessorBuilder._lemmatizer_cache[self.text] = new_text
        self.text = new_text
        return self

    def build(self):
        """Return the processed text."""
        return self.text
class TextSummarizer:
    """
    Extracts the most important sentences from a long string using
    text-ranking (spaCy pipeline with the pytextrank component).
    """
    # Lazily-initialized spaCy pipeline, shared by all calls.
    _nlp_pipeline = None
    # Cache of (text, min_num_sent) -> summary; grows unbounded for the process lifetime.
    _cache = {}

    @staticmethod
    def _load_nlp_pipeline():
        # Lazy-load it.
        if TextSummarizer._nlp_pipeline is None:
            TextSummarizer._nlp_pipeline = spacy.load('en_core_web_sm')
            TextSummarizer._nlp_pipeline.add_pipe("textrank", last=True)
        return TextSummarizer._nlp_pipeline

    @staticmethod
    def process_long_text(text: str, min_num_sent: int) -> list[str]:
        """
        This function applies a text summarization process on a given text string, extracting
        the most important sentences based on the principle that 20% of the content is responsible
        for 80% of the meaning (the Pareto Principle).

        Texts with fewer than `min_num_sent` sentences are returned whole (as a
        single-element list).

        Returns:
            list: A list of the most important sentences
        """
        # Attempt to get the result from cache
        cache_key = (text, min_num_sent)
        cached_result = TextSummarizer._cache.get(cache_key, None)
        if cached_result is not None:
            return cached_result

        nlp_pipeline = TextSummarizer._load_nlp_pipeline()
        doc = nlp_pipeline(text)

        num_sent = len(list(doc.sents))
        result = []

        if num_sent >= min_num_sent:
            limit_phrases = math.ceil(len(doc._.phrases) * 0.20)  # 20% of the phrases, rounded up
            limit_sentences = math.ceil(num_sent * 0.20)  # 20% of the sentences, rounded up
            result = [str(sent) for sent in doc._.textrank.summary(limit_phrases=limit_phrases, limit_sentences=limit_sentences)]
        else:
            result = [text]

        # Store the result in cache before returning it
        TextSummarizer._cache[cache_key] = result
        return result

View File

@ -0,0 +1,209 @@
"""
This module is responsible for processing the corpus and feeding it into chromaDB. It will receive a corpus of text.
It will then split it into chunks of specified length. For each of those chunks, it will append surrounding context.
It will only include full words.
"""
import re
import bisect
import extensions.superboogav2.parameters as parameters
from .data_preprocessor import TextPreprocessorBuilder, TextSummarizer
from .chromadb import ChromaCollector
def preprocess_text_no_summary(text) -> str:
    """
    Apply every enabled (non-summarizing) preprocessing step to `text` and
    return the processed string. Which steps run is controlled entirely by the
    current hyperparameter defaults in the `parameters` module.
    """
    builder = TextPreprocessorBuilder(text)
    if parameters.should_to_lower():
        builder.to_lower()

    if parameters.should_remove_punctuation():
        builder.remove_punctuation()

    if parameters.should_remove_specific_pos():
        builder.remove_specific_pos()

    if parameters.should_remove_stopwords():
        # BUG FIX: was `builder.remove_stopwords` (missing parentheses), so the
        # step was silently never applied.
        builder.remove_stopwords()

    if parameters.should_lemmatize():
        builder.lemmatize()

    if parameters.should_merge_spaces():
        # BUG FIX: was `builder.merge_spaces` (missing parentheses), so the
        # step was silently never applied.
        builder.merge_spaces()

    if parameters.should_strip():
        builder.strip()

    # Optional digit conversion; the strategy name selects one of the three encoders.
    strategy = parameters.get_num_conversion_strategy()
    if strategy:
        if strategy == parameters.NUM_TO_WORD_METHOD:
            builder.num_to_word(parameters.get_min_num_length())
        elif strategy == parameters.NUM_TO_CHAR_METHOD:
            builder.num_to_char(parameters.get_min_num_length())
        elif strategy == parameters.NUM_TO_CHAR_LONG_METHOD:
            builder.num_to_char_long(parameters.get_min_num_length())

    return builder.build()
def preprocess_text(text) -> list[str]:
    """Summarize `text` down to its most important sentences, then preprocess each one."""
    sentences = TextSummarizer.process_long_text(text, parameters.get_min_num_sentences())
    return [preprocess_text_no_summary(sentence) for sentence in sentences]
def _create_chunks_with_context(corpus, chunk_len, context_left, context_right):
"""
This function takes a corpus of text and splits it into chunks of a specified length,
then adds a specified amount of context to each chunk. The context is added by first
going backwards from the start of the chunk and then going forwards from the end of the
chunk, ensuring that the context includes only whole words and that the total context length
does not exceed the specified limit. This function uses binary search for efficiency.
Returns:
chunks (list of str): The chunks of text.
chunks_with_context (list of str): The chunks of text with added context.
chunk_with_context_start_indices (list of int): The starting indices of each chunk with context in the corpus.
"""
words = re.split('(\\s+)', corpus)
word_start_indices = [0]
current_index = 0
for word in words:
current_index += len(word)
word_start_indices.append(current_index)
chunks, chunk_lengths, chunk_start_indices, chunk_with_context_start_indices = [], [], [], []
current_length = 0
current_index = 0
chunk = []
for word in words:
if current_length + len(word) > chunk_len:
chunks.append(''.join(chunk))
chunk_lengths.append(current_length)
chunk_start_indices.append(current_index - current_length)
chunk = [word]
current_length = len(word)
else:
chunk.append(word)
current_length += len(word)
current_index += len(word)
if chunk:
chunks.append(''.join(chunk))
chunk_lengths.append(current_length)
chunk_start_indices.append(current_index - current_length)
chunks_with_context = []
for start_index, chunk_length in zip(chunk_start_indices, chunk_lengths):
context_start_index = bisect.bisect_right(word_start_indices, start_index - context_left)
context_end_index = bisect.bisect_left(word_start_indices, start_index + chunk_length + context_right)
# Combine all the words in the context range (before, chunk, and after)
chunk_with_context = ''.join(words[context_start_index:context_end_index])
chunks_with_context.append(chunk_with_context)
# Determine the start index of the chunk with context
chunk_with_context_start_index = word_start_indices[context_start_index]
chunk_with_context_start_indices.append(chunk_with_context_start_index)
return chunks, chunks_with_context, chunk_with_context_start_indices
def _clear_chunks(data_chunks, data_chunks_with_context, data_chunk_starting_indices):
    """
    Filter out useless and near-duplicate chunks.

    A chunk is dropped if it contains no alphanumeric characters, or if an
    identical chunk was already kept whose starting index is within
    `delta_start` characters of this one (same text found at nearly the same
    place in the corpus).

    Returns the filtered (chunks, chunks_with_context, starting_indices) triple.
    """
    distinct_data_chunks = []
    distinct_data_chunks_with_context = []
    distinct_data_chunk_starting_indices = []

    seen_chunks = dict()

    for chunk, context, index in zip(data_chunks, data_chunks_with_context, data_chunk_starting_indices):
        # Skip the chunk if it does not contain any alphanumeric characters
        if not any(char.isalnum() for char in chunk):
            continue

        seen_chunk_start = seen_chunks.get(chunk)
        # BUG FIX: compare to None explicitly — a previous start index of 0 is
        # falsy, so duplicates of a chunk first seen at index 0 were never skipped.
        if seen_chunk_start is not None:
            # If we've already seen this exact chunk, and the context around it is very close to the seen chunk, then skip it.
            if abs(seen_chunk_start - index) < parameters.get_delta_start():
                continue

        distinct_data_chunks.append(chunk)
        distinct_data_chunks_with_context.append(context)
        distinct_data_chunk_starting_indices.append(index)
        seen_chunks[chunk] = index

    return distinct_data_chunks, distinct_data_chunks_with_context, distinct_data_chunk_starting_indices
def process_and_add_to_collector(corpus: str, collector: ChromaCollector, clear_collector_before_adding: bool, metadata: dict):
    """
    Chunk `corpus` according to the current hyperparameters, preprocess the
    chunks, and add them (with their surrounding context) to `collector`.

    Args:
        corpus: The raw text to ingest.
        collector: Destination vector store.
        clear_collector_before_adding: If True, wipe the collector first.
        metadata: Optional metadata dict attached to every stored chunk.

    Raises:
        ValueError: if the configured context length has more than two values.
    """
    # Defining variables (avoid shadowing the builtin `len` as the original did).
    chunk_lens = [int(piece.strip()) for piece in parameters.get_chunk_len().split(',')]
    context_len = [int(piece.strip()) for piece in parameters.get_context_len().split(',')]
    if len(context_len) >= 3:
        # BUG FIX: this used to `raise` a bare f-string, which itself raises
        # TypeError in Python 3 (exceptions must derive from BaseException).
        raise ValueError(f"Context len has too many values: {len(context_len)}")
    if len(context_len) == 2:
        context_left = context_len[0]
        context_right = context_len[1]
    else:
        context_left = context_right = context_len[0]

    data_chunks = []
    data_chunks_with_context = []
    data_chunk_starting_indices = []

    # Handling chunk_regex: pull out regex-matched "special" chunks first.
    if parameters.get_chunk_regex():
        if parameters.get_chunk_separator():
            cumulative_length = 0  # This variable will store the length of the processed corpus
            sections = corpus.split(parameters.get_chunk_separator())
            for section in sections:
                special_chunks = list(re.finditer(parameters.get_chunk_regex(), section))
                for match in special_chunks:
                    chunk = match.group(0)
                    start_index = match.start()
                    end_index = start_index + len(chunk)
                    context = section[max(0, start_index - context_left):min(len(section), end_index + context_right)]
                    data_chunks.append(chunk)
                    data_chunks_with_context.append(context)
                    data_chunk_starting_indices.append(cumulative_length + max(0, start_index - context_left))
                cumulative_length += len(section) + len(parameters.get_chunk_separator())  # Update the length of the processed corpus
        else:
            special_chunks = list(re.finditer(parameters.get_chunk_regex(), corpus))
            for match in special_chunks:
                chunk = match.group(0)
                start_index = match.start()
                end_index = start_index + len(chunk)
                context = corpus[max(0, start_index - context_left):min(len(corpus), end_index + context_right)]
                data_chunks.append(chunk)
                data_chunks_with_context.append(context)
                data_chunk_starting_indices.append(max(0, start_index - context_left))

    for chunk_len in chunk_lens:
        # Breaking the data into chunks and adding those to the db
        if parameters.get_chunk_separator():
            cumulative_length = 0  # This variable will store the length of the processed corpus
            sections = corpus.split(parameters.get_chunk_separator())
            for section in sections:
                chunks, chunks_with_context, context_start_indices = _create_chunks_with_context(section, chunk_len, context_left, context_right)
                context_start_indices = [cumulative_length + i for i in context_start_indices]  # Add the length of the processed corpus to each start index
                data_chunks.extend(chunks)
                data_chunks_with_context.extend(chunks_with_context)
                data_chunk_starting_indices.extend(context_start_indices)
                cumulative_length += len(section) + len(parameters.get_chunk_separator())  # Update the length of the processed corpus
        else:
            chunks, chunks_with_context, context_start_indices = _create_chunks_with_context(corpus, chunk_len, context_left, context_right)
            data_chunks.extend(chunks)
            data_chunks_with_context.extend(chunks_with_context)
            data_chunk_starting_indices.extend(context_start_indices)

    data_chunks = [preprocess_text_no_summary(chunk) for chunk in data_chunks]

    data_chunks, data_chunks_with_context, data_chunk_starting_indices = _clear_chunks(
        data_chunks, data_chunks_with_context, data_chunk_starting_indices
    )

    if clear_collector_before_adding:
        collector.clear()
    collector.add(data_chunks, data_chunks_with_context, data_chunk_starting_indices, [metadata]*len(data_chunks) if metadata is not None else None)

View File

@ -0,0 +1,65 @@
import concurrent.futures
import requests
import re
from bs4 import BeautifulSoup
import extensions.superboogav2.parameters as parameters
from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
    """Fetch `url` with a 5s timeout and return the raw body; raise on non-200."""
    response = requests.get(url, timeout=5)
    if response.status_code != 200:
        raise Exception("Failed to download URL")
    return response.content
def _download_urls(urls, threads=1):
    """
    Download `urls` concurrently with a thread pool.

    Generator: yields ("completed/total", results_so_far) after each successful
    download, and a final ("Done", results). Failed downloads are silently
    skipped and do not count toward the completed tally.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        futures = [executor.submit(_download_single, url) for url in urls]

        results = []
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                results.append(future.result())
                completed += 1
                yield f"{completed}/{len(urls)}", results
            except Exception:
                pass

        yield "Done", results
def feed_url_into_collector(urls, collector):
    # UI generator: downloads a newline-separated list of URLs, strips the HTML
    # down to visible text, and adds the combined text to `collector`.
    # Yields human-readable progress strings for the frontend along the way.
    all_text = ''
    cumulative = ''

    urls = urls.strip().split('\n')
    cumulative += f'Loading {len(urls)} URLs with {parameters.get_num_threads()} threads...\n\n'
    yield cumulative
    for update, contents in _download_urls(urls, threads=parameters.get_num_threads()):
        yield cumulative + update

    # `contents` is bound to the results list from the final ("Done", results)
    # yield of _download_urls above.
    cumulative += 'Processing the HTML sources...'
    yield cumulative
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")
        # Drop script/style elements so only human-visible text remains.
        for script in soup(["script", "style"]):
            script.extract()

        strings = soup.stripped_strings
        if parameters.get_is_strong_cleanup():
            # Keep only strings that contain a letter followed by a space,
            # i.e. text that looks like natural language.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        text = '\n'.join([s.strip() for s in strings])
        all_text += text

    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))

View File

@ -0,0 +1,179 @@
i
me
my
myself
we
our
ours
ourselves
you
you're
you've
you'll
you'd
your
yours
yourself
yourselves
he
him
his
himself
she
she's
her
hers
herself
it
it's
its
itself
they
them
their
theirs
themselves
what
which
who
whom
this
that
that'll
these
those
am
is
are
was
were
be
been
being
have
has
had
having
do
does
did
doing
a
an
the
and
but
if
or
because
as
until
while
of
at
by
for
with
about
against
between
into
through
during
before
after
above
below
to
from
up
down
in
out
on
off
over
under
again
further
then
once
here
there
when
where
why
how
all
any
both
each
few
more
most
other
some
such
no
nor
not
only
own
same
so
than
too
very
s
t
can
will
just
don
don't
should
should've
now
d
ll
m
o
re
ve
y
ain
aren
aren't
couldn
couldn't
didn
didn't
doesn
doesn't
hadn
hadn't
hasn
hasn't
haven
haven't
isn
isn't
ma
mightn
mightn't
mustn
mustn't
needn
needn't
shan
shan't
shouldn
shouldn't
wasn
wasn't
weren
weren't
won
won't
wouldn
wouldn't

Binary file not shown.

View File

@ -0,0 +1,40 @@
"""
This module is responsible for handling and modifying the notebook text.
"""
import re
import extensions.superboogav2.parameters as parameters
from modules import shared
from modules.logging_colors import logger
from extensions.superboogav2.utils import create_context_text
from .data_processor import preprocess_text
def _remove_special_tokens(string):
pattern = r'(<\|begin-user-input\|>|<\|end-user-input\|>|<\|injection-point\|>)'
return re.sub(pattern, '', string)
def input_modifier_internal(string, collector):
    # Notebook-mode input modifier: extracts the user's query from between the
    # begin/end user-input markers, retrieves the most relevant chunks from
    # `collector`, and substitutes them at the <|injection-point|> marker.
    # All control markers are stripped from the returned string.

    # Sanity check.
    if shared.is_chat():
        return string

    # Find the user input
    pattern = re.compile(r"<\|begin-user-input\|>(.*?)<\|end-user-input\|>", re.DOTALL)
    match = re.search(pattern, string)
    if match:
        # Preprocess the user prompt.
        user_input = match.group(1).strip()
        # NOTE(review): preprocess_text returns a *list* of processed sentences;
        # presumably get_sorted_by_dist accepts a list query — confirm.
        user_input = preprocess_text(user_input)

        logger.debug(f"Preprocessed User Input: {user_input}")

        # Get the most similar chunks
        results = collector.get_sorted_by_dist(user_input, n_results=parameters.get_chunk_count(), max_token_count=int(parameters.get_max_token_count()))

        # Make the injection
        string = string.replace('<|injection-point|>', create_context_text(results))

    return _remove_special_tokens(string)

View File

@ -0,0 +1,135 @@
"""
This module implements a hyperparameter optimization routine for the embedding application. It utilizes TPE optimization from Optuna.
Each run, the optimizer will set the default values inside the hyperparameters. At the end, it will output the best ones it has found.
"""
import re
import json
import optuna
import gradio as gr
import numpy as np
import logging
import hashlib
logging.getLogger('optuna').setLevel(logging.WARNING)
import extensions.superboogav2.parameters as parameters
from pathlib import Path
from .benchmark import benchmark
from .parameters import Parameters
from modules.logging_colors import logger
# Format the parameters into markdown format.
def _markdown_hyperparams():
    """Render the current hyperparameter defaults as a markdown bullet list."""
    res = []
    for param_name, param_value in Parameters.getInstance().hyperparameters.items():
        # Escape any markdown syntax.
        # BUG FIX: the hyphen is now escaped (`+\-.`); previously `+-.` formed an
        # accidental character range from '+' to '.' that also matched ','.
        escaped_name = re.sub(r"([_*\[\]()~`>#+\-.!])", r"\\\1", param_name)
        # NOTE(review): falsy defaults (0, False, '') render as ' ' — confirm
        # that blanking them out (rather than printing 0/False) is intended.
        escaped_default = re.sub(r"([_*\[\]()~`>#+\-.!])", r"\\\1", str(param_value['default'])) if param_value['default'] else ' '
        res.append('* {}: **{}**'.format(escaped_name, escaped_default))

    return '\n'.join(res)
# Convert numpy types to python types.
def _convert_np_types(params):
for key in params:
if type(params[key]) == np.bool_:
params[key] = bool(params[key])
elif type(params[key]) == np.int64:
params[key] = int(params[key])
elif type(params[key]) == np.float64:
params[key] = float(params[key])
return params
# Set the default values for the hyperparameters.
def _set_hyperparameters(params):
    """Write each value in `params` back as its hyperparameter's new default."""
    hyperparams = Parameters.getInstance().hyperparameters
    for name, value in params.items():
        if name in hyperparams:
            hyperparams[name]['default'] = value
# Check if the parameter is for optimization.
def _is_optimization_param(val):
is_opt = val.get('should_optimize', False) # Either does not exist or is false
return is_opt
# Create a hashable representation of the parameters
def _get_params_hash(params):
params_str = json.dumps(params, sort_keys=True)
return hashlib.sha256(params_str.encode()).hexdigest()
def optimize(collector, progress=gr.Progress()):
    """
    Run TPE hyperparameter optimization (Optuna) over all parameters flagged
    `should_optimize`, benchmarking each candidate set against `collector`.

    Applies the best parameters as the new defaults, saves them to
    `best_params.json`, and returns a markdown report.

    Note: `progress=gr.Progress()` as a default is gradio's documented idiom
    for progress tracking, not an accidental mutable default.
    """
    # Inform the user that something is happening.
    progress(0, desc='Setting Up...')

    current_step = 0   # trials completed so far (drives the progress bar)
    best_score = 0     # highest benchmark score seen across trials
    scores_cache = {}  # params-hash -> score, so repeated candidates skip the benchmark

    def objective_function(trial):
        nonlocal current_step
        nonlocal best_score

        # Sample one categorical value for every optimizable hyperparameter.
        params = {}
        for key, val in Parameters.getInstance().hyperparameters.items():
            if _is_optimization_param(val):
                params[key] = trial.suggest_categorical(key, val['categories'])

        _set_hyperparameters(params)

        params_hash = _get_params_hash(params)
        # If the score for these parameters is cached, reuse it.
        # BUG FIX: a cache hit used to return the raw (positive) score while
        # fresh evaluations returned -score; the study minimizes the objective,
        # so the sign must be consistent.
        if params_hash in scores_cache:
            return -scores_cache[params_hash]

        # Benchmark the current set of parameters.
        score, max_score = benchmark(Path("extensions/superboogav2/benchmark_texts/questions.json"), collector)

        # Cache the score
        scores_cache[params_hash] = score

        result = json.dumps(_convert_np_types(params), indent=4)
        result += f'\nScore: {score}/{max_score}'
        logger.debug(result)

        # Increment the current step
        current_step += 1

        # Update the best score
        best_score = max(best_score, score)

        # Update the progress
        progress(current_step / parameters.get_optimization_steps(), desc=f'Optimizing... {current_step}/{parameters.get_optimization_steps()}')

        # Optuna minimizes, so negate: higher benchmark scores are better.
        return -score

    # Run the optimization.
    study = optuna.create_study()
    study.optimize(objective_function, n_trials=int(parameters.get_optimization_steps()))

    best_params = study.best_params
    _set_hyperparameters(best_params)

    # Convert results to a markdown string.
    str_result = f"## Best parameters:\n\n{_markdown_hyperparams()}\n\n## Score:\n\n{best_score}"

    # Save to JSON file
    with open('best_params.json', 'w') as fp:
        json.dump(_convert_np_types(best_params), fp, indent=4)

    return str_result

View File

@ -0,0 +1,369 @@
"""
This module provides a singleton class `Parameters` that is used to manage all hyperparameters for the embedding application.
It expects a JSON file in `extensions/superboogav2/config.json`.
Each element in the JSON must have a `default` value which will be used for the current run. Elements can have `categories`.
These categories define the range in which the optimizer will search. If the element is tagged with `"should_optimize": false`,
then the optimizer will only ever use the default value.
"""
from pathlib import Path
import json
from modules.logging_colors import logger
# Digit-conversion strategies (see TextPreprocessorBuilder.num_to_*).
NUM_TO_WORD_METHOD = 'Number to Word'
NUM_TO_CHAR_METHOD = 'Number to Char'
NUM_TO_CHAR_LONG_METHOD = 'Number to Multi-Char'

# Distance-combination strategies (selected by the `new_dist_strategy` parameter).
DIST_MIN_STRATEGY = 'Min of Two'
DIST_HARMONIC_STRATEGY = 'Harmonic Mean'
DIST_GEOMETRIC_STRATEGY = 'Geometric Mean'
DIST_ARITHMETIC_STRATEGY = 'Arithmetic Mean'

# Strategies for injecting retrieved context into the prompt.
PREPEND_TO_LAST = 'Prepend to Last Message'
APPEND_TO_LAST = 'Append to Last Message'
HIJACK_LAST_IN_CONTEXT = 'Hijack Last Message in Context ⚠️ WIP ⚠️ (Works Partially)'

# Sort orders for retrieved chunks.
SORT_DISTANCE = 'distance'
SORT_ID = 'id'
class Parameters:
    """
    Singleton that loads and holds all hyperparameters from
    `extensions/superboogav2/config.json`. Access it via
    `Parameters.getInstance()`; the module-level getter/setter functions
    below wrap this class.
    """
    _instance = None

    # Maps symbolic names that may appear in the JSON config to the
    # human-readable constant strings defined above.
    variable_mapping = {
        'NUM_TO_WORD_METHOD': NUM_TO_WORD_METHOD,
        'NUM_TO_CHAR_METHOD': NUM_TO_CHAR_METHOD,
        'NUM_TO_CHAR_LONG_METHOD': NUM_TO_CHAR_LONG_METHOD,
        'DIST_MIN_STRATEGY': DIST_MIN_STRATEGY,
        'DIST_HARMONIC_STRATEGY': DIST_HARMONIC_STRATEGY,
        'DIST_GEOMETRIC_STRATEGY': DIST_GEOMETRIC_STRATEGY,
        'DIST_ARITHMETIC_STRATEGY': DIST_ARITHMETIC_STRATEGY,
        'PREPEND_TO_LAST': PREPEND_TO_LAST,
        'APPEND_TO_LAST': APPEND_TO_LAST,
        'HIJACK_LAST_IN_CONTEXT': HIJACK_LAST_IN_CONTEXT,
    }

    @staticmethod
    def getInstance():
        # Lazily construct the singleton on first access.
        if Parameters._instance is None:
            Parameters()
        return Parameters._instance

    def __init__(self):
        if Parameters._instance is not None:
            raise Exception("This class is a singleton!")
        else:
            Parameters._instance = self
            # Loading reads the config file from the working directory at
            # construction time.
            self.hyperparameters = self._load_from_json(Path("extensions/superboogav2/config.json"))

    def _load_from_json(self, file_path):
        # Load the raw JSON config and substitute symbolic variable names in
        # both the `default` and `categories` fields with their string values.
        logger.debug('Loading hyperparameters...')

        with open(file_path, 'r') as file:
            data = json.load(file)

        # Replace variable names in the dict and create Categorical objects
        for key in data:
            if "default" in data[key] and data[key]["default"] in self.variable_mapping:
                data[key]["default"] = self.variable_mapping[data[key]["default"]]
            if "categories" in data[key]:
                data[key]["categories"] = [self.variable_mapping.get(cat, cat) for cat in data[key]["categories"]]

        return data
# --- Preprocessing getters -------------------------------------------------
# Each returns the current default of the corresponding preprocessing
# hyperparameter from the Parameters singleton.

def should_to_lower() -> bool:
    return bool(Parameters.getInstance().hyperparameters['to_lower']['default'])

def get_num_conversion_strategy() -> str:
    return Parameters.getInstance().hyperparameters['num_conversion']['default']

def should_merge_spaces() -> bool:
    return bool(Parameters.getInstance().hyperparameters['merge_spaces']['default'])

def should_strip() -> bool:
    return bool(Parameters.getInstance().hyperparameters['strip']['default'])

def should_remove_punctuation() -> bool:
    return bool(Parameters.getInstance().hyperparameters['remove_punctuation']['default'])

def should_remove_stopwords() -> bool:
    return bool(Parameters.getInstance().hyperparameters['remove_stopwords']['default'])

def should_remove_specific_pos() -> bool:
    return bool(Parameters.getInstance().hyperparameters['remove_specific_pos']['default'])

def should_lemmatize() -> bool:
    return bool(Parameters.getInstance().hyperparameters['lemmatize']['default'])

def get_min_num_sentences() -> int:
    return int(Parameters.getInstance().hyperparameters['min_num_sent']['default'])

def get_delta_start() -> int:
    # Minimum distance between duplicate chunks before one is dropped (see _clear_chunks).
    return int(Parameters.getInstance().hyperparameters['delta_start']['default'])
# --- Preprocessing setters -------------------------------------------------
# Each overwrites the corresponding hyperparameter's default in place.

def set_to_lower(value: bool):
    Parameters.getInstance().hyperparameters['to_lower']['default'] = value

def set_num_conversion_strategy(value: str):
    Parameters.getInstance().hyperparameters['num_conversion']['default'] = value

def set_merge_spaces(value: bool):
    Parameters.getInstance().hyperparameters['merge_spaces']['default'] = value

def set_strip(value: bool):
    Parameters.getInstance().hyperparameters['strip']['default'] = value

def set_remove_punctuation(value: bool):
    Parameters.getInstance().hyperparameters['remove_punctuation']['default'] = value

def set_remove_stopwords(value: bool):
    Parameters.getInstance().hyperparameters['remove_stopwords']['default'] = value

def set_remove_specific_pos(value: bool):
    Parameters.getInstance().hyperparameters['remove_specific_pos']['default'] = value

def set_lemmatize(value: bool):
    Parameters.getInstance().hyperparameters['lemmatize']['default'] = value

def set_min_num_sentences(value: int):
    Parameters.getInstance().hyperparameters['min_num_sent']['default'] = value

def set_delta_start(value: int):
    Parameters.getInstance().hyperparameters['delta_start']['default'] = value
def get_chunk_len() -> str:
    """
    Return the enabled chunk lengths as a comma-joined string (e.g. "300,700").

    Bit i of `chunk_len_mask` gates whether `chunk_len{i+1}` is included.
    (Rewritten to avoid shadowing the builtin `len` and to collapse the four
    near-identical mask checks into a loop.)
    """
    hyperparams = Parameters.getInstance().hyperparameters
    mask = hyperparams['chunk_len_mask']['default']
    lens = [
        hyperparams[f'chunk_len{i + 1}']['default'] if mask & (1 << i) else None
        for i in range(4)
    ]
    return ','.join(str(length) for length in lens if length)
def set_chunk_len(val: str):
    """
    Parse a comma-separated list of chunk lengths, store up to four of them
    (sorted ascending) in `chunk_len1..4`, and rebuild `chunk_len_mask` so only
    the stored slots are enabled. Extra values beyond four are skipped with a
    warning. (Rewritten to avoid shadowing the builtin `len` and to collapse
    the four identical branches into a loop.)
    """
    chunk_lens = sorted(int(piece.strip()) for piece in val.split(','))
    hyperparams = Parameters.getInstance().hyperparameters

    # Reset the mask to zero, then set one bit per stored length.
    hyperparams['chunk_len_mask']['default'] = 0
    for i, chunk_len in enumerate(chunk_lens[:4]):
        hyperparams[f'chunk_len{i + 1}']['default'] = chunk_len
        hyperparams['chunk_len_mask']['default'] |= (1 << i)

    if len(chunk_lens) > 4:
        logger.warning(f'Only up to four chunk lengths are supported. Skipping {chunk_lens[4:]}')
def get_context_len() -> str:
    """Return the context lengths as "left,right"."""
    hyperparams = Parameters.getInstance().hyperparameters
    left = hyperparams['context_len_left']['default']
    right = hyperparams['context_len_right']['default']
    return str(left) + ',' + str(right)
def set_context_len(val: str):
    """
    Parse "left" or "left,right" and update the context-length defaults.
    A single value sets both sides; anything else logs a warning and is skipped.
    """
    # BUG FIX: strip each piece *before* the isdigit() check — previously input
    # with whitespace around the comma (e.g. "250, 800") failed isdigit() on
    # the raw piece and was silently rejected.
    pieces = [piece.strip() for piece in val.split(',')]
    context_lens = [int(piece) for piece in pieces if piece.isdigit()]
    if len(context_lens) == 1:
        Parameters.getInstance().hyperparameters['context_len_left']['default'] = Parameters.getInstance().hyperparameters['context_len_right']['default'] = context_lens[0]
    elif len(context_lens) == 2:
        Parameters.getInstance().hyperparameters['context_len_left']['default'] = context_lens[0]
        Parameters.getInstance().hyperparameters['context_len_right']['default'] = context_lens[1]
    else:
        logger.warning(f'Incorrect context length received {val}. Skipping.')
# --- Retrieval / injection / infrastructure getters ------------------------
# Each returns the current default of the corresponding hyperparameter.

def get_new_dist_strategy() -> str:
    return Parameters.getInstance().hyperparameters['new_dist_strategy']['default']

def get_chunk_count() -> int:
    return int(Parameters.getInstance().hyperparameters['chunk_count']['default'])

def get_min_num_length() -> int:
    return int(Parameters.getInstance().hyperparameters['min_num_length']['default'])

def get_significant_level() -> float:
    return float(Parameters.getInstance().hyperparameters['significant_level']['default'])

def get_time_steepness() -> float:
    return float(Parameters.getInstance().hyperparameters['time_steepness']['default'])

def get_time_power() -> float:
    return float(Parameters.getInstance().hyperparameters['time_power']['default'])

def get_chunk_separator() -> str:
    return Parameters.getInstance().hyperparameters['chunk_separator']['default']

def get_prefix() -> str:
    return Parameters.getInstance().hyperparameters['prefix']['default']

def get_data_separator() -> str:
    return Parameters.getInstance().hyperparameters['data_separator']['default']

def get_postfix() -> str:
    return Parameters.getInstance().hyperparameters['postfix']['default']

def get_is_manual() -> bool:
    return bool(Parameters.getInstance().hyperparameters['manual']['default'])

def get_add_chat_to_data() -> bool:
    return bool(Parameters.getInstance().hyperparameters['add_chat_to_data']['default'])

def get_injection_strategy() -> str:
    return Parameters.getInstance().hyperparameters['injection_strategy']['default']

def get_chunk_regex() -> str:
    return Parameters.getInstance().hyperparameters['chunk_regex']['default']

def get_is_strong_cleanup() -> bool:
    return bool(Parameters.getInstance().hyperparameters['strong_cleanup']['default'])

def get_max_token_count() -> int:
    return int(Parameters.getInstance().hyperparameters['max_token_count']['default'])

def get_num_threads() -> int:
    return int(Parameters.getInstance().hyperparameters['threads']['default'])

def get_optimization_steps() -> int:
    return int(Parameters.getInstance().hyperparameters['optimization_steps']['default'])

def get_api_port() -> int:
    return int(Parameters.getInstance().hyperparameters['api_port']['default'])

def get_api_on() -> bool:
    return bool(Parameters.getInstance().hyperparameters['api_on']['default'])
# --- Retrieval / injection / infrastructure setters ------------------------
# Each overwrites the corresponding hyperparameter's default in place.

def set_new_dist_strategy(value: str):
    Parameters.getInstance().hyperparameters['new_dist_strategy']['default'] = value

def set_chunk_count(value: int):
    Parameters.getInstance().hyperparameters['chunk_count']['default'] = value

def set_min_num_length(value: int):
    Parameters.getInstance().hyperparameters['min_num_length']['default'] = value

def set_significant_level(value: float):
    Parameters.getInstance().hyperparameters['significant_level']['default'] = value

def set_time_steepness(value: float):
    Parameters.getInstance().hyperparameters['time_steepness']['default'] = value

def set_time_power(value: float):
    Parameters.getInstance().hyperparameters['time_power']['default'] = value

def set_chunk_separator(value: str):
    Parameters.getInstance().hyperparameters['chunk_separator']['default'] = value

def set_prefix(value: str):
    Parameters.getInstance().hyperparameters['prefix']['default'] = value

def set_data_separator(value: str):
    Parameters.getInstance().hyperparameters['data_separator']['default'] = value

def set_postfix(value: str):
    Parameters.getInstance().hyperparameters['postfix']['default'] = value

def set_manual(value: bool):
    Parameters.getInstance().hyperparameters['manual']['default'] = value

def set_add_chat_to_data(value: bool):
    Parameters.getInstance().hyperparameters['add_chat_to_data']['default'] = value

def set_injection_strategy(value: str):
    Parameters.getInstance().hyperparameters['injection_strategy']['default'] = value

def set_chunk_regex(value: str):
    Parameters.getInstance().hyperparameters['chunk_regex']['default'] = value

def set_strong_cleanup(value: bool):
    Parameters.getInstance().hyperparameters['strong_cleanup']['default'] = value

def set_max_token_count(value: int):
    Parameters.getInstance().hyperparameters['max_token_count']['default'] = value

def set_num_threads(value: int):
    Parameters.getInstance().hyperparameters['threads']['default'] = value

def set_optimization_steps(value: int):
    Parameters.getInstance().hyperparameters['optimization_steps']['default'] = value

def set_api_port(value: int):
    Parameters.getInstance().hyperparameters['api_port']['default'] = value

def set_api_on(value: bool):
    Parameters.getInstance().hyperparameters['api_on']['default'] = value

View File

@ -0,0 +1,8 @@
beautifulsoup4==4.12.2
chromadb==0.3.18
lxml
optuna
pandas==2.0.3
posthog==2.4.2
sentence_transformers==2.2.2
spacy

View File

@ -0,0 +1,355 @@
"""
This file is responsible for the UI and how the application interacts with the rest of the system.
"""
import os
from pathlib import Path
# Point to where nltk will find the required data.
# NOTE(review): presumably this must run before any downstream module imports nltk — confirm the import order.
os.environ['NLTK_DATA'] = str(Path("extensions/superboogav2/nltk_data").resolve())
import textwrap
import codecs
import gradio as gr
import extensions.superboogav2.parameters as parameters
from modules.logging_colors import logger
from modules import shared
from .utils import create_metadata_source
from .chromadb import make_collector
from .download_urls import feed_url_into_collector
from .data_processor import process_and_add_to_collector
from .benchmark import benchmark
from .optimize import optimize
from .notebook_handler import input_modifier_internal
from .chat_handler import custom_generate_chat_prompt_internal
from .api import APIManager
# Module-level state shared by the hooks below; populated once in setup().
collector = None
api_manager = None
def setup():
    """Extension entry point: build the Chroma collector and optionally launch the API server."""
    global collector, api_manager

    collector = make_collector()
    api_manager = APIManager(collector)

    if parameters.get_api_on():
        api_manager.start_server(parameters.get_api_port())
def _feed_data_into_collector(corpus):
    """Chunk raw text from the UI and add it to the collector, yielding status messages."""
    yield '### Processing data...'
    metadata = create_metadata_source('direct-text')
    process_and_add_to_collector(corpus, collector, False, metadata)
    yield '### Done.'
def _feed_file_into_collector(file):
    """Decode an uploaded file as UTF-8 and add its text to the collector, yielding status messages."""
    yield '### Reading and processing the input dataset...'
    decoded = file.decode('utf-8')
    metadata = create_metadata_source('file')
    process_and_add_to_collector(decoded, collector, False, metadata)
    yield '### Done.'
def _feed_url_into_collector(urls):
    """Download the given URLs into the collector, forwarding its progress messages.

    `yield from` delegates every progress message from the downloader instead of the
    manual re-yield loop.
    """
    yield from feed_url_into_collector(urls, collector)
    yield '### Done.'
def _begin_benchmark():
    """Run the bundled Q&A benchmark against the current collector and report the score."""
    questions_path = Path("extensions/superboogav2/benchmark_texts/questions.json")
    score, max_score = benchmark(questions_path, collector)
    return f'**Score**: {score}/{max_score}'
def _begin_optimization(progress=gr.Progress()):
    """Run hyperparameter optimization and return its report plus the refreshed settings values."""
    report = optimize(collector, progress)
    return (report, *_get_optimizable_settings())
def _clear_data():
    """Drop every record from the collector and report completion."""
    collector.clear()
    return "### Data Cleared!"
def _get_optimizable_settings() -> list:
    """Return the current parameter values in the order the optimizer's output widgets expect."""
    # (predicate, label) pairs mirror the choices of the preprocessing CheckboxGroup.
    pipeline_flags = [
        (parameters.should_to_lower, 'Lower Cases'),
        (parameters.should_remove_punctuation, 'Remove Punctuation'),
        (parameters.should_remove_specific_pos, 'Remove Adverbs'),
        (parameters.should_remove_stopwords, 'Remove Stop Words'),
        (parameters.should_lemmatize, 'Lemmatize'),
        (parameters.should_merge_spaces, 'Merge Spaces'),
        (parameters.should_strip, 'Strip Edges'),
    ]
    preprocess_pipeline = [label for enabled, label in pipeline_flags if enabled()]

    return [
        parameters.get_time_power(),
        parameters.get_time_steepness(),
        parameters.get_significant_level(),
        parameters.get_min_num_sentences(),
        parameters.get_new_dist_strategy(),
        parameters.get_delta_start(),
        parameters.get_min_num_length(),
        parameters.get_num_conversion_strategy(),
        preprocess_pipeline,
        parameters.get_chunk_count(),
        parameters.get_context_len(),
        parameters.get_chunk_len()
    ]
def _apply_settings(optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                    preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
                    chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup):
    """Push every UI control value into the Parameters singleton and toggle the API server to match.

    Best-effort: any exception is logged as a warning so a bad value never crashes the UI.
    """
    logger.debug('Applying settings.')

    try:
        parameters.set_optimization_steps(optimization_steps)
        parameters.set_significant_level(significant_level)
        parameters.set_min_num_sentences(min_sentences)
        parameters.set_new_dist_strategy(new_dist_strat)
        parameters.set_delta_start(delta_start)
        parameters.set_min_num_length(min_number_length)
        parameters.set_num_conversion_strategy(num_conversion)
        parameters.set_api_port(api_port)
        parameters.set_api_on(api_on)
        parameters.set_injection_strategy(injection_strategy)
        parameters.set_add_chat_to_data(add_chat_to_data)
        parameters.set_manual(manual)
        # The textboxes hold escaped text (e.g. "\\n"); decode back to real characters.
        parameters.set_postfix(codecs.decode(postfix, 'unicode_escape'))
        parameters.set_data_separator(codecs.decode(data_separator, 'unicode_escape'))
        parameters.set_prefix(codecs.decode(prefix, 'unicode_escape'))
        parameters.set_max_token_count(max_token_count)
        parameters.set_time_power(time_power)
        parameters.set_time_steepness(time_steepness)
        parameters.set_chunk_count(chunk_count)
        parameters.set_chunk_separator(codecs.decode(chunk_sep, 'unicode_escape'))
        parameters.set_context_len(context_len)
        parameters.set_chunk_regex(chunk_regex)
        parameters.set_chunk_len(chunk_len)
        parameters.set_num_threads(threads)
        parameters.set_strong_cleanup(strong_cleanup)

        # Each preprocessing flag simply mirrors membership in the CheckboxGroup value;
        # direct membership tests replace the original for/elif dispatch.
        parameters.set_to_lower('Lower Cases' in preprocess_pipeline)
        parameters.set_remove_punctuation('Remove Punctuation' in preprocess_pipeline)
        parameters.set_remove_specific_pos('Remove Adverbs' in preprocess_pipeline)
        parameters.set_remove_stopwords('Remove Stop Words' in preprocess_pipeline)
        parameters.set_lemmatize('Lemmatize' in preprocess_pipeline)
        parameters.set_merge_spaces('Merge Spaces' in preprocess_pipeline)
        parameters.set_strip('Strip Edges' in preprocess_pipeline)

        # Based on API on/off, start or stop the server.
        if api_manager is not None:
            if parameters.get_api_on() and (not api_manager.is_server_running()):
                api_manager.start_server(parameters.get_api_port())
            elif (not parameters.get_api_on()) and api_manager.is_server_running():
                api_manager.stop_server()
    except Exception as e:
        # logger.warn is deprecated in the stdlib; warning is the supported spelling.
        logger.warning(f'Could not properly apply settings: {str(e)}')
def custom_generate_chat_prompt(user_input, state, **kwargs):
    """Extension hook: build the chat prompt with collector-backed context injection."""
    prompt = custom_generate_chat_prompt_internal(user_input, state, collector, **kwargs)
    return prompt
def input_modifier(string):
    """Extension hook: rewrite the notebook/default-mode input using the collector."""
    modified = input_modifier_internal(string, collector)
    return modified
def ui():
    """Build the superbooga Gradio interface: data inputs, settings, benchmarking, and event wiring.

    Fixes: "minumum" -> "minimum" and "influencial" -> "influential" in user-facing info
    strings; the 26 duplicated `.input(...)` wiring lines are replaced by one loop over
    `all_params` (same components, same arguments).
    """
    with gr.Accordion("Click for more information...", open=False):
        gr.Markdown(textwrap.dedent("""

        ## About

        This extension takes a dataset as input, breaks it into chunks, and adds the result to a local/offline Chroma database.

        The database is then queried during inference time to get the excerpts that are closest to your input. The idea is to create an arbitrarily large pseudo context.

        The core methodology was developed and contributed by kaiokendev, who is working on improvements to the method in this repository: https://github.com/kaiokendev/superbig

        ## Data input

        Start by entering some data in the interface below and then clicking on "Load data".

        Each time you load some new data, the old chunks are discarded.

        ## Chat mode

        #### Instruct

        On each turn, the chunks will be compared to your current input and the most relevant matches will be appended to the input in the following format:

        ```
        Consider the excerpts below as additional context:
        ...
        ```

        The injection doesn't make it into the chat history. It is only used in the current generation.

        #### Regular chat

        The chunks from the external data sources are ignored, and the chroma database is built based on the chat history instead. The most relevant past exchanges relative to the present input are added to the context string. This way, the extension acts as a long term memory.

        ## Notebook/default modes

        Your question must be manually specified between `<|begin-user-input|>` and `<|end-user-input|>` tags, and the injection point must be specified with `<|injection-point|>`.

        The special tokens mentioned above (`<|begin-user-input|>`, `<|end-user-input|>`, and `<|injection-point|>`) are removed in the background before the text generation begins.

        Here is an example in Vicuna 1.1 format:

        ```
        A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.

        USER:

        <|injection-point|>

        <|begin-user-input|>What datasets are mentioned in the text above?<|end-user-input|>

        ASSISTANT:
        ```
        """))

    with gr.Row():
        with gr.Column(min_width=600):
            with gr.Tab("Text input"):
                data_input = gr.Textbox(lines=20, label='Input data')
                update_data = gr.Button('Load data')

            with gr.Tab("URL input"):
                url_input = gr.Textbox(lines=10, label='Input URLs', info='Enter one or more URLs separated by newline characters.')
                strong_cleanup = gr.Checkbox(value=parameters.get_is_strong_cleanup(), label='Strong cleanup', info='Only keeps html elements that look like long-form text.')
                threads = gr.Number(value=parameters.get_num_threads(), label='Threads', info='The number of threads to use while downloading the URLs.', precision=0)
                update_url = gr.Button('Load data')

            with gr.Tab("File input"):
                file_input = gr.File(label='Input file', type='binary')
                update_file = gr.Button('Load data')

            with gr.Tab("Settings"):
                with gr.Accordion("Processing settings", open=True):
                    chunk_len = gr.Textbox(value=parameters.get_chunk_len(), label='Chunk length', info='In characters, not tokens. This value is used when you click on "Load data".')
                    chunk_regex = gr.Textbox(value=parameters.get_chunk_regex(), label='Chunk regex', info='Will specifically add the captured text to the embeddings.')
                    context_len = gr.Textbox(value=parameters.get_context_len(), label='Context length', info='In characters, not tokens. How much context to load around each chunk.')
                    chunk_sep = gr.Textbox(value=codecs.encode(parameters.get_chunk_separator(), 'unicode_escape').decode(), label='Chunk separator', info='Used to manually split chunks. Manually split chunks longer than chunk length are split again. This value is used when you click on "Load data".')

                with gr.Accordion("Generation settings", open=False):
                    chunk_count = gr.Number(value=parameters.get_chunk_count(), label='Chunk count', info='The number of closest-matching chunks to include in the prompt.')
                    max_token_count = gr.Number(value=parameters.get_max_token_count(), label='Max Context Tokens', info='The context length in tokens will not exceed this value.')
                    prefix = gr.Textbox(value=codecs.encode(parameters.get_prefix(), 'unicode_escape').decode(), label='Prefix', info='What to put before the injection point.')
                    data_separator = gr.Textbox(value=codecs.encode(parameters.get_data_separator(), 'unicode_escape').decode(), label='Data separator', info='When multiple pieces of distant data are added, they might be unrelated. It\'s important to separate them.')
                    postfix = gr.Textbox(value=codecs.encode(parameters.get_postfix(), 'unicode_escape').decode(), label='Postfix', info='What to put after the injection point.')
                    with gr.Row():
                        manual = gr.Checkbox(value=parameters.get_is_manual(), label="Is Manual", info="Manually specify when to use ChromaDB. Insert `!c` at the start or end of the message to trigger a query.", visible=shared.is_chat())
                        add_chat_to_data = gr.Checkbox(value=parameters.get_add_chat_to_data(), label="Add Chat to Data", info="Automatically feed the chat history as you chat.", visible=shared.is_chat())
                    injection_strategy = gr.Radio(choices=[parameters.PREPEND_TO_LAST, parameters.APPEND_TO_LAST, parameters.HIJACK_LAST_IN_CONTEXT], value=parameters.get_injection_strategy(), label='Injection Strategy', info='Where to inject the messages in chat or instruct mode.', visible=shared.is_chat())
                    with gr.Row():
                        api_on = gr.Checkbox(value=parameters.get_api_on(), label="Turn on API", info="Check this to turn on the API service.")
                        api_port = gr.Number(value=parameters.get_api_port(), label="API Port", info="The port on which the API service will run.")

                with gr.Accordion("Advanced settings", open=False):
                    # Pre-check the pipeline boxes that are currently enabled in the parameters.
                    preprocess_set_choices = []
                    if parameters.should_to_lower():
                        preprocess_set_choices.append('Lower Cases')
                    if parameters.should_remove_punctuation():
                        preprocess_set_choices.append('Remove Punctuation')
                    if parameters.should_remove_specific_pos():
                        preprocess_set_choices.append('Remove Adverbs')
                    if parameters.should_remove_stopwords():
                        preprocess_set_choices.append('Remove Stop Words')
                    if parameters.should_lemmatize():
                        preprocess_set_choices.append('Lemmatize')
                    if parameters.should_merge_spaces():
                        preprocess_set_choices.append('Merge Spaces')
                    if parameters.should_strip():
                        preprocess_set_choices.append('Strip Edges')

                    preprocess_pipeline = gr.CheckboxGroup(label='Preprocessing pipeline', choices=[
                        'Lower Cases',
                        'Remove Punctuation',
                        'Remove Adverbs',
                        'Remove Stop Words',
                        'Lemmatize',
                        'Merge Spaces',
                        'Strip Edges',
                    ], value=preprocess_set_choices, interactive=True, info='How to preprocess the text before it is turned into an embedding.')

                    with gr.Row():
                        num_conversion = gr.Dropdown(choices=[parameters.NUM_TO_WORD_METHOD, parameters.NUM_TO_CHAR_METHOD, parameters.NUM_TO_CHAR_LONG_METHOD, 'None'], value=parameters.get_num_conversion_strategy(), label="Number Conversion Method", info='How to preprocess numbers before creating the embeddings.', interactive=True)
                        min_number_length = gr.Number(value=parameters.get_min_num_length(), label='Number Length Threshold', info='In digits. Only numbers that have at least that many digits will be converted.', interactive=True)

                    delta_start = gr.Number(value=parameters.get_delta_start(), label='Delta Start Index', info='If the system encounters two identical embeddings, and they both start within the same delta, then only the first will be considered.', interactive=True)
                    new_dist_strat = gr.Dropdown(choices=[parameters.DIST_MIN_STRATEGY, parameters.DIST_HARMONIC_STRATEGY, parameters.DIST_GEOMETRIC_STRATEGY, parameters.DIST_ARITHMETIC_STRATEGY], value=parameters.get_new_dist_strategy(), label="Distance Strategy", info='When two embedding texts are merged, the distance of the new piece will be decided using one of these strategies.', interactive=True)
                    min_sentences = gr.Number(value=parameters.get_min_num_sentences(), label='Summary Threshold', info='In sentences. The minimum number of sentences to trigger text-rank summarization.', interactive=True)
                    significant_level = gr.Slider(0.8, 2, value=parameters.get_significant_level(), label='Significant Level', info='Defines the cut-off for what is considered a "significant" distance relative to the median distance among the returned samples.', interactive=True)
                    time_steepness = gr.Slider(0.01, 1.0, value=parameters.get_time_steepness(), label='Time Weighing Steepness', info='How differently two close excerpts are going to be weighed.')
                    time_power = gr.Slider(0.0, 1.0, value=parameters.get_time_power(), label='Time Weighing Power', info='How influential is the weighing. At 1.0, old entries won\'t be considered')

            with gr.Tab("Benchmark"):
                benchmark_button = gr.Button('Benchmark')
                optimize_button = gr.Button('Optimize')
                optimization_steps = gr.Number(value=parameters.get_optimization_steps(), label='Optimization Steps', info='For how many steps to optimize.', interactive=True)

            clear_button = gr.Button('❌ Clear Data')

        with gr.Column():
            last_updated = gr.Markdown()

    all_params = [optimization_steps, time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                  preprocess_pipeline, api_port, api_on, injection_strategy, add_chat_to_data, manual, postfix, data_separator, prefix, max_token_count,
                  chunk_count, chunk_sep, context_len, chunk_regex, chunk_len, threads, strong_cleanup]
    optimizable_params = [time_power, time_steepness, significant_level, min_sentences, new_dist_strat, delta_start, min_number_length, num_conversion,
                          preprocess_pipeline, chunk_count, context_len, chunk_len]

    update_data.click(_feed_data_into_collector, [data_input], last_updated, show_progress=False)
    update_url.click(_feed_url_into_collector, [url_input], last_updated, show_progress=False)
    update_file.click(_feed_file_into_collector, [file_input], last_updated, show_progress=False)
    benchmark_button.click(_begin_benchmark, [], last_updated, show_progress=True)
    optimize_button.click(_begin_optimization, [], [last_updated] + optimizable_params, show_progress=True)
    clear_button.click(_clear_data, [], last_updated, show_progress=False)

    # Re-apply all settings whenever any single control changes. The wired components
    # are exactly the members of all_params, so one loop replaces the per-widget lines.
    for component in all_params:
        component.input(fn=_apply_settings, inputs=all_params, show_progress=False)

View File

@ -0,0 +1,16 @@
"""
This module contains common functions across multiple other modules.
"""
import extensions.superboogav2.parameters as parameters
def create_context_text(results):
    """Join retrieved *results* with the configured data separator and wrap them in prefix/postfix."""
    separator = parameters.get_data_separator()
    body = separator.join(results)
    return parameters.get_prefix() + body + parameters.get_postfix()
def create_metadata_source(source: str):
    """Return a metadata dict recording where a piece of data came from."""
    return {'source': source}

View File

@ -195,7 +195,7 @@ def update_requirements(initial_installation=False):
print("Installing extensions requirements.")
extensions = next(os.walk("extensions"))[1]
for extension in extensions:
if extension in ['superbooga']: # No wheels available for requirements
if extension in ['superbooga', 'superboogav2']: # No wheels available for requirements
continue
extension_req_path = os.path.join("extensions", extension, "requirements.txt")