deploying new version with streaming

2024-10-01 01:06:10 -04:00 · 2023-05-18 11:21:32 -04:00 · 2023-05-18 11:21:32 -04:00 · 057b9f51bc
commit 057b9f51bc
parent bce2b3025b
4 changed files with 53 additions and 56 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -170,25 +170,25 @@ workflows:
            branches:
              only:
                - main
-  # build-py-deploy:
-  #   jobs:
-  #     - build-py-linux:
-  #         filters:
-  #           branches:
-  #             only:
-  #     - build-py-macos:
-  #         filters:
-  #           branches:
-  #             only:
-  #     - build-py-windows:
-  #         filters:
-  #           branches:
-  #             only:
-  #     - store-and-upload-wheels:
-  #         filters:
-  #           branches:
-  #             only:
-  #         requires:
-  #           - build-py-windows
-  #           - build-py-linux
-  #           - build-py-macos
+  build-py-deploy:
+    jobs:
+      - build-py-linux:
+          filters:
+            branches:
+              only:
+      - build-py-macos:
+          filters:
+            branches:
+              only:
+      - build-py-windows:
+          filters:
+            branches:
+              only:
+      - store-and-upload-wheels:
+          filters:
+            branches:
+              only:
+          requires:
+            - build-py-windows
+            - build-py-linux
+            - build-py-macos
--- a/gpt4all-bindings/python/gpt4all/gpt4all.py
+++ b/gpt4all-bindings/python/gpt4all/gpt4all.py
@@ -155,24 +155,26 @@ class GPT4All():
        print("Model downloaded at: " + download_path)
        return download_path

-    def generate(self, prompt: str, **generate_kwargs) -> str:
+    def generate(self, prompt: str, streaming: bool = False, **generate_kwargs) -> str:
        """
        Surfaced method of running generate without accessing model object.

        Args:
            prompt: Raw string to be passed to model.
+            streaming: True if want output streamed to stdout.
            **generate_kwargs: Optional kwargs to pass to prompt context.
        
        Returns:
            Raw string of generated model response.
        """
-        return self.model.generate(prompt, **generate_kwargs)
+        return self.model.generate(prompt, streaming=streaming, **generate_kwargs)
    
    def chat_completion(self, 
                        messages: List[Dict], 
                        default_prompt_header: bool = True, 
                        default_prompt_footer: bool = True, 
                        verbose: bool = True,
+                        streaming: bool = True,
                        **generate_kwargs) -> str:
        """
        Format list of message dictionaries into a prompt and call model
@@ -189,6 +191,7 @@ class GPT4All():
                before user/assistant role messages.
            default_prompt_footer: If True (default), add default footer at end of prompt.
            verbose: If True (default), print full prompt and generated response.
+            streaming: True if want output streamed to stdout.
            **generate_kwargs: Optional kwargs to pass to prompt context.

        Returns:
@@ -206,7 +209,7 @@ class GPT4All():
        if verbose:
            print(full_prompt)

-        response = self.model.generate(full_prompt, **generate_kwargs)
+        response = self.model.generate(full_prompt, streaming=streaming, **generate_kwargs)

        if verbose:
            print(response)
--- a/gpt4all-bindings/python/gpt4all/pyllmodel.py
+++ b/gpt4all-bindings/python/gpt4all/pyllmodel.py
@@ -1,25 +1,23 @@
-from io import StringIO
 import pkg_resources
 import ctypes
 import os
 import platform
 import re
+import subprocess
 import sys

-class DualOutput:
-    def __init__(self, stdout, string_io):
-        self.stdout = stdout
-        self.string_io = string_io
+class DualStreamProcessor:
+    def __init__(self, stream=None):
+        self.stream = stream
+        self.output = ""

    def write(self, text):
-        self.stdout.write(text)
-        self.string_io.write(text)
+        cleaned_text = re.sub(r"\n(?!\n)", "", text)
+        if self.stream is not None:
+            self.stream.write(cleaned_text)
+            self.stream.flush()
+        self.output += cleaned_text

-    def flush(self):
-        # It's a good idea to also define a flush method that flushes both
-        # outputs, as sys.stdout is expected to have this method.
-        self.stdout.flush()
-        self.string_io.flush()

 # TODO: provide a config file to make this more robust
 LLMODEL_PATH = os.path.join("llmodel_DO_NOT_MODIFY", "build").replace("\\", "\\\\")
@@ -175,7 +173,7 @@ class LLModel:
                 repeat_penalty: float = 1.2, 
                 repeat_last_n: int = 10, 
                 context_erase: float = .5,
-                 std_passthrough: bool = False) -> str:
+                 streaming: bool = False) -> str:
        """
        Generate response from model from a prompt.

@@ -183,12 +181,8 @@ class LLModel:
        ----------
        prompt: str
            Question, task, or conversation for model to respond to
-        add_default_header: bool, optional
-            Whether to add a prompt header (default is True)
-        add_default_footer: bool, optional
-            Whether to add a prompt footer (default is True)
-        verbose: bool, optional
-            Whether to print prompt and response
+        streaming: bool
+            Stream response to stdout

        Returns
        -------
@@ -198,13 +192,14 @@ class LLModel:
        prompt = prompt.encode('utf-8')
        prompt = ctypes.c_char_p(prompt)

-        # Change stdout to StringIO so we can collect response
        old_stdout = sys.stdout 
-        collect_response = StringIO()
-        if std_passthrough:
-            sys.stdout = DualOutput(old_stdout, collect_response)
-        else:
-            sys.stdout = collect_response
+
+        stream_processor = DualStreamProcessor()
+    
+        if streaming:
+            stream_processor.stream = sys.stdout
+        
+        sys.stdout = stream_processor

        context = LLModelPromptContext(
            logits_size=logits_size, 
@@ -228,13 +223,10 @@ class LLModel:
                               RecalculateCallback(self._recalculate_callback), 
                               context)

-        response = collect_response.getvalue()
+        # Revert to old stdout
        sys.stdout = old_stdout

-        # Remove the unnecessary new lines from response
-        response = re.sub(r"\n(?!\n)", "", response).strip()
-        
-        return response
+        return stream_processor.output

    # Empty prompt callback
    @staticmethod
--- a/gpt4all-bindings/python/setup.py
+++ b/gpt4all-bindings/python/setup.py
@@ -78,6 +78,8 @@ setup(
        'dev': [
            'pytest',
            'twine',
+            'wheel',
+            'setuptools',
            'mkdocs-material',
            'mkautodoc',
            'mkdocstrings[python]',