diff --git a/Dockerfile.cuda b/Dockerfile.cuda
deleted file mode 100644
index 957cf07..0000000
--- a/Dockerfile.cuda
+++ /dev/null
@@ -1,31 +0,0 @@
-FROM nvidia/cuda:11.7.1-devel-ubuntu22.04 AS build
-
-RUN apt-get update && apt-get install -y build-essential cmake libboost-dev libasio-dev
-
-ADD ./ /turbopilot
-
-RUN mkdir /turbopilot/build
-
-WORKDIR /turbopilot/build
-
-RUN cmake -DGGML_CUBLAS=ON -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc ..
-RUN make turbopilot
-
-FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu22.04 AS runtime
-
-
-WORKDIR /app
-
-COPY --from=build /turbopilot/build/bin/turbopilot /app/turbopilot
-
-ENV THREADS=4
-
-ENV MODEL="/models/codegen-2B-multi-ggml-4bit-quant.bin"
-
-ENV BATCHSIZE=64
-
-COPY ./run.sh /app/
-
-EXPOSE 18080
-
-CMD /app/run.sh
\ No newline at end of file
diff --git a/README.md b/README.md
index 263d58d..4373413 100644
--- a/README.md
+++ b/README.md
@@ -10,7 +10,9 @@ TurboPilot is a self-hosted [copilot](https://github.com/features/copilot) clone
 
 ![a screen recording of turbopilot running through fauxpilot plugin](assets/vscode-status.gif)
 
-**NEW:** As of v0.0.5 turbopilot supports cuda inference which greatly accelerates suggestions when working with longer prompts (i.e. longer existing code files).
+**New: Refactored + Simplified**: The source code has been improved to make it easier to extend and add new models to Turbopilot. The system now supports multiple flavours of model.
+
+**New: Wizardcoder, Starcoder, Santacoder support** - Turbopilot now supports state-of-the-art local code completion models which provide more programming languages and "fill in the middle" support.
 
 ## 🤝 Contributing
 
@@ -23,6 +25,7 @@ Make a fork, make your changes and then open a [PR](https://github.com/ravenscro
 
 The easiest way to try the project out is to grab the pre-processed models and then run the server in docker.
 
+
 ### Getting The Models
 
 You have 2 options for getting the model
@@ -31,21 +34,6 @@ You have 2 options for getting the model
 
 You can download the pre-converted, pre-quantized models from Huggingface.
 
-The `multi` flavour models can provide auto-complete suggestions for `C`, `C++`, `Go`, `Java`, `JavaScript`, and `Python`.
-
-The `mono` flavour models can provide auto-complete suggestions for `Python` only (but the quality of Python-specific suggestions may be higher).
-
-Pre-converted and pre-quantized models are available for download from here:
-
-| Model Name         | RAM Requirement | Supported Languages                              | Direct Download | HF Project Link |
-|--------------------|-----------------|--------------------------------------------------|-----------------|-----------------|
-| CodeGen 350M multi | ~800MiB         | `C`, `C++`, `Go`, `Java`, `JavaScript`, `Python` | [:arrow_down:](https://huggingface.co/ravenscroftj/CodeGen-350M-multi-ggml-quant/resolve/main/codegen-350M-multi-ggml-4bit-quant.bin) | [:hugs:](https://huggingface.co/ravenscroftj/CodeGen-350M-multi-ggml-quant) |
-| CodeGen 350M mono  | ~800MiB         | `Python`                                         | [:arrow_down:](https://huggingface.co/Guglielmo/CodeGen-350M-mono-ggml-quant/resolve/main/ggml-model-quant.bin) | [:hugs:](https://huggingface.co/Guglielmo/CodeGen-350M-mono-ggml-quant) |
-| CodeGen 2B multi   | ~4GiB           | `C`, `C++`, `Go`, `Java`, `JavaScript`, `Python` | [:arrow_down:](https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant/resolve/main/codegen-2B-multi-ggml-4bit-quant.bin) | [:hugs:](https://huggingface.co/ravenscroftj/CodeGen-2B-multi-ggml-quant) |
-| CodeGen 2B mono    | ~4GiB           | `Python`                                         | [:arrow_down:](https://huggingface.co/Guglielmo/CodeGen-2B-mono-ggml-quant/resolve/main/ggml-model-quant.bin) | [:hugs:](https://huggingface.co/Guglielmo/CodeGen-2B-mono-ggml-quant/) |
-| CodeGen 6B multi   | ~8GiB           | `C`, `C++`, `Go`, `Java`, `JavaScript`, `Python` | [:arrow_down:](https://huggingface.co/ravenscroftj/CodeGen-6B-multi-ggml-quant/resolve/main/codegen-6B-multi-ggml-4bit-quant.bin) | [:hugs:](https://huggingface.co/ravenscroftj/CodeGen-6B-multi-ggml-quant) |
-| CodeGen 6B mono    | ~8GiB           | `Python`                                         | [:arrow_down:](https://huggingface.co/Guglielmo/CodeGen-6B-mono-ggml-quant/resolve/main/ggml-model-quant.bin) | [:hugs:](https://huggingface.co/Guglielmo/CodeGen-6B-mono-ggml-quant/) |
-
 #### Option B: Convert The Models Yourself - Hard, More Flexible
 
@@ -58,17 +46,21 @@ Download the [latest binary](https://github.com/ravenscroftj/turbopilot/releases
 
 Run:
 
 ```bash
-./codegen-serve -m ./models/codegen-6B-multi-ggml-4bit-quant.bin
+./turbopilot -m starcoder -f ./models/santacoder-q4_0.bin
 ```
 
-The application should start a server on port `18080`
+The application should start a server on port `18080`. You can change this with the `-p` option, but `18080` is the default port that vscode-fauxpilot tries to connect to, so you probably want to leave it alone unless you are sure you know what you're doing.
 
 If you have a multi-core system you can control how many CPUs are used with the `-t` option - for example, on my AMD Ryzen 5000 which has 6 cores/12 threads I use:
 
 ```bash
-./codegen-serve -t 6 -m ./models/codegen-6B-multi-ggml-4bit-quant.bin
+./turbopilot -t 6 -m starcoder -f ./models/santacoder-q4_0.bin
 ```
 
+Turbopilot also supports the legacy codegen models. Just change the model type flag `-m` to `codegen` instead.
+
+**NOTE: the latest version of GGML requires that you re-quantize your codegen models. Old models downloaded from here will no longer work. I am working on providing updated quantized codegen models.**
+
 ### 📦 Running From Docker
 
 You can also run Turbopilot from the pre-built docker image supplied [here](https://github.com/users/ravenscroftj/packages/container/package/turbopilot)
 
@@ -79,7 +71,8 @@ You will still need to download the models separately, then you can run:
 docker run --rm -it \
   -v ./models:/models \
   -e THREADS=6 \
-  -e MODEL="/models/codegen-2B-multi-ggml-4bit-quant.bin" \
+  -e MODEL_TYPE=starcoder \
+  -e MODEL="/models/santacoder-q4_0.bin" \
   -p 18080:18080 \
   ghcr.io/ravenscroftj/turbopilot:latest
 ```
diff --git a/run.sh b/run.sh
index ef1339f..e508ee8 100755
--- a/run.sh
+++ b/run.sh
@@ -1,3 +1,3 @@
 #!/bin/sh
 
-/app/codegen-serve -t $THREADS -m $MODEL -b $BATCHSIZE
\ No newline at end of file
+/app/turbopilot -t $THREADS -m $MODEL_TYPE -f $MODEL
\ No newline at end of file
diff --git a/src/main.cpp b/src/main.cpp
index d8435b5..939d820 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,10 +22,16 @@ int main(int argc, char **argv)
         .help("Path to the model that turbopilot should serve")
         .required();
 
-    program.add_argument("-t", "--model-type")
-        .help("The type of model to load. Can be codegen/gpt-j or starcoder architectures.")
+    program.add_argument("-m", "--model-type")
+        .help("The type of model to load. Can be one of: codegen, starcoder, wizardcoder")
         .default_value("codegen");
 
+    program.add_argument("-t", "--threads")
+        .help("The number of CPU threads turbopilot is allowed to use. Defaults to 4")
+        .default_value(4)
+        .scan<'i', int>();
+
+
     program.add_argument("-p", "--port")
         .help("The tcp port that turbopilot should listen on")
         .default_value(18080)
@@ -62,6 +68,8 @@ int main(int argc, char **argv)
     ModelConfig config{};
     std::mt19937 rng(program.get("--random-seed"));
 
+    config.n_threads = program.get<int>("--threads");
+
     if(model_type.compare("codegen") == 0) {
         spdlog::info("Initializing GPT-J type model for '{}' model", model_type);
         model = new GPTJModel(config, rng);
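Once the server is running (natively or via Docker), you can sanity-check it from the command line before pointing an editor at it. Below is a minimal smoke-test sketch, assuming the server exposes the fauxpilot-style, OpenAI-compatible completions endpoint that vscode-fauxpilot connects to; the route and payload shape are assumptions based on that compatibility, not taken from this diff:

```bash
# Ask the local turbopilot server for a completion.
# 18080 is the default port; match whatever you passed with -p.
curl --request POST \
  --url http://localhost:18080/v1/engines/codegen/completions \
  --header 'Content-Type: application/json' \
  --data '{"prompt": "def fibonacci(n):", "max_tokens": 64}'
```

If everything is wired up, the response should be an OpenAI-style JSON body whose `choices` array contains the generated completion text.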