VERY BIG performance improvement and beautiful features #521

Closed · wants to merge 16 commits
2 changes: 2 additions & 0 deletions .gitignore
@@ -168,3 +168,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
source_documents/
.vscode/
22 changes: 13 additions & 9 deletions README.md
@@ -7,10 +7,14 @@ Built with [LangChain](https://github.com/hwchase17/langchain), [GPT4All](https:

# Environment Setup
In order to set your environment up to run the code here, first install all requirements:

```shell
pip3 install -r requirements.txt
```
- For NVIDIA GPUs (on Linux):
```shell
sh ./install_cuda.sh
```
- For CPU-only setup:
```shell
pip3 install -r requirements.txt
```

Then, download the LLM model and place it in a directory of your choice:
- LLM: default to [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin). If you prefer a different GPT4All-J compatible model, just download it and reference it in your `.env` file.
@@ -23,16 +27,16 @@ MODEL_PATH: Path to your GPT4All or LlamaCpp supported LLM
MODEL_N_CTX: Maximum token limit for the LLM model
EMBEDDINGS_MODEL_NAME: SentenceTransformers embeddings model name (see https://www.sbert.net/docs/pretrained_models.html)
TARGET_SOURCE_CHUNKS: The amount of chunks (sources) that will be used to answer a question
N_BATCH: The number of tokens processed per batch. A higher value increases resource utilisation; a lower value slows down inference.
USE_MLOCK: When set to 1, the model is locked entirely in RAM, which reduces disk reads but uses more memory.
N_GPU_LAYERS: If an NVIDIA GPU is detected, this setting offloads that many model layers to the GPU to speed up inference. If the value is set too high for the card's memory, an `out of memory` error will be thrown.
```

Note: because of the way `langchain` loads the `SentenceTransformers` embeddings, the first time you run the script it will require internet connection to download the embeddings model itself.

## Test dataset
This repo uses a [state of the union transcript](https://github.com/imartinez/privateGPT/blob/main/source_documents/state_of_the_union.txt) as an example.

## Instructions for ingesting your own dataset

Put any and all your files into the `source_documents` directory
Put all the files you want to analyse in the `source_documents` folder

The supported extensions are:

@@ -70,7 +74,7 @@ Using embedded DuckDB with persistence: data will be stored in: db
Ingestion complete! You can now run privateGPT.py to query your documents
```

It will create a `db` folder containing the local vectorstore. Will take 20-30 seconds per document, depending on the size of the document.
It will create a `db` folder containing the local vectorstore. Will take 20-30 seconds per document (much less if you use an NVIDIA GPU), depending on the size of the document.
You can ingest as many documents as you want, and all will be accumulated in the local embeddings database.
If you want to start from an empty database, delete the `db` folder.

20 changes: 20 additions & 0 deletions check_lang.py
@@ -0,0 +1,20 @@
from dotenv import load_dotenv
import os
from deep_translator import GoogleTranslator
import langdetect
load_dotenv()

auto_translate = os.environ.get("AUTO_TRANSLATE")

def translate(text):

Reviewer: Personally I'd remove the translation feature from this PR so that the great core improvements can be reviewed separately, since this feature might be a bit controversial (it requires internet access and uses Google Translate). If you decide to leave it in, though, it would be awesome if you could mention it in the README and add AUTO_TRANSLATE to example.env.

Author: To be honest, all the changes I have made in this PR are changes I had to make to get privateGPT working for me, and I figured that, just as they are useful to me, they might be useful to someone else. In any case, the translation feature, useful as it is, goes completely against the purpose of this project, so yes, I will remove it. I was thinking, however, of having the model translate the prompt locally: for example, asking Vicuna "If this text is not English, translate it into English." and then using Vicuna's response to execute the prompt. That would ensure privacy. (A rough sketch of this idea follows this file's diff.)

    if auto_translate == None or auto_translate == "false" or auto_translate == "False" or auto_translate == "0":
        return text
    else:
        if langdetect.detect(text) == "en":
            return text
        new_text = GoogleTranslator(source="auto", target="en").translate(text)
        print(f"Translated '{text}' to '{new_text}'")
        return new_text

if __name__ == "__main__":
    print(translate("Qual è la massa di un elettrone?"))
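An editorial aside, not part of this PR: the local-translation idea described in the author's reply above could look roughly like the sketch below. It assumes the `llm` object already created in privateGPT.py and the `langdetect` dependency this PR adds; the `translate_locally` helper and the prompt wording are hypothetical.

```python
# Hypothetical sketch: let the local model translate non-English queries itself,
# so nothing is sent to Google Translate and no internet access is needed.
import langdetect

def translate_locally(llm, text):
    """Return `text` in English, using the local LLM to translate if needed."""
    if langdetect.detect(text) == "en":
        return text
    prompt = (
        "Translate the following text into English and reply with the "
        "translation only:\n\n" + text
    )
    # LangChain LLM objects are callable and return the completion as a string.
    return llm(prompt).strip()

# In privateGPT.py the query could then be rewritten before retrieval:
#   res = qa(translate_locally(llm, query))
```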
15 changes: 10 additions & 5 deletions example.env
@@ -1,6 +1,11 @@
PERSIST_DIRECTORY=db
MODEL_TYPE=GPT4All
MODEL_PATH=models/ggml-gpt4all-j-v1.3-groovy.bin
EMBEDDINGS_MODEL_NAME=all-MiniLM-L6-v2
MODEL_N_CTX=1000
TARGET_SOURCE_CHUNKS=4
MODEL_TYPE=LlamaCpp
MODEL_PATH=/path/for/model
#best english embeddings model
#best italian efederici/sentence-it5-base
EMBEDDINGS_MODEL_NAME=all-mpnet-base-v2

Reviewer: I think this one uses 768 dimensions and might not work with most models, like ggml-vic13b-q5_1.bin or koala-7B.ggmlv3.q8_0.bin.

Author: I was also concerned about this, but it seems to work well in my tests. I am using it to pull formulas from my physics book while studying, hahaha. Sometimes it gets things wrong, but I'm not sure this is the cause.

MODEL_N_CTX=4096
N_GPU_LAYERS=12
USE_MLOCK=1
TARGET_SOURCE_CHUNKS=8
N_BATCH=1024
7 changes: 7 additions & 0 deletions install_cuda.sh
@@ -0,0 +1,7 @@
export LLAMA_CUBLAS=1

Reviewer: So in your case:

set CMAKE_ARGS=-DLLAMA_CUBLAS=on
set FORCE_CMAKE=1

are not needed?

Author: I think those flags are for Windows; this script works on Linux, I have already tested it :) Later I will check for Windows.

#check if venv exists
if [ ! -d "venv" ]; then
    python3 -m venv venv
fi
source venv/bin/activate
pip install -r requirements.txt
21 changes: 13 additions & 8 deletions privateGPT.py
@@ -5,9 +5,10 @@
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import Chroma
from langchain.llms import GPT4All, LlamaCpp

import os
import argparse

from check_lang import translate
load_dotenv()

embeddings_model_name = os.environ.get("EMBEDDINGS_MODEL_NAME")
@@ -17,6 +18,9 @@
model_path = os.environ.get('MODEL_PATH')
model_n_ctx = os.environ.get('MODEL_N_CTX')
target_source_chunks = int(os.environ.get('TARGET_SOURCE_CHUNKS',4))
n_gpu_layers = os.environ.get('N_GPU_LAYERS')
use_mlock = os.environ.get('USE_MLOCK')
n_batch = os.environ.get('N_BATCH') if os.environ.get('N_BATCH') != None else 512

from constants import CHROMA_SETTINGS

@@ -31,12 +35,13 @@ def main():
    # Prepare the LLM
    match model_type:
        case "LlamaCpp":
            llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, callbacks=callbacks, verbose=False)
            llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, callbacks=callbacks, verbose=False, n_gpu_layers=n_gpu_layers, use_mlock=use_mlock, top_p=0.9, n_batch=n_batch)
        case "GPT4All":
            llm = GPT4All(model=model_path, n_ctx=model_n_ctx, backend='gptj', callbacks=callbacks, verbose=False)
        case _default:
            print(f"Model {model_type} not supported!")
            exit;

    qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, return_source_documents= not args.hide_source)
    # Interactive questions and answers
    while True:
@@ -45,20 +50,20 @@ def main():
            break

        # Get the answer from the chain
        res = qa(query)
        res = qa(translate(query))
        answer, docs = res['result'], [] if args.hide_source else res['source_documents']

        # Print the relevant sources used for the answer
        for document in docs:
            print("\n> " + document.metadata["source"] + ":")
            print(document.page_content)
sime2408 (May 29, 2023): Maybe adding some nice colors? Also using deep-translator==1.11.1 with an env flag if someone wants to translate the answer?

translate_src = os.environ.get('TRANSLATE_SRC_LANG', "en")
translate_dst = os.environ.get('TRANSLATE_DST_LANG', "fr")
for document in docs:
    print(f"\n\033[31m Source: {document.metadata['source']} \033[0m")
    if translate_ans:
        document.page_content = GoogleTranslator(source=translate_src, target=translate_dst).translate(document.page_content)
        print(f"\033[32m\033[2m : {document.page_content} \033[0m")

Author: Yes, I was already thinking about it. I was going to implement it as soon as I had some free time :)


        # Print the result
        print("\n\n> Question:")
        print(query)
        print("\n> Answer:")
        print(answer)

        # Print the relevant sources used for the answer
        for document in docs:
            print("\n> " + document.metadata["source"] + ":")
            print(document.page_content)

def parse_arguments():
    parser = argparse.ArgumentParser(description='privateGPT: Ask questions to your documents without an internet connection, '
                                                 'using the power of LLMs.')
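As an editorial aside, a self-contained version of sime2408's suggestion in the thread above might look like the sketch below. The `TRANSLATE_ANSWER` flag and the `print_sources` helper are assumptions introduced here for illustration; they are not part of this PR.

```python
# Sketch of the suggestion above: colored source headers, plus optional
# translation of the printed chunks via deep-translator.
# TRANSLATE_ANSWER, TRANSLATE_SRC_LANG and TRANSLATE_DST_LANG are assumed
# environment variables, not ones defined by this PR.
import os
from deep_translator import GoogleTranslator

translate_ans = os.environ.get("TRANSLATE_ANSWER", "false").lower() in ("1", "true")
translate_src = os.environ.get("TRANSLATE_SRC_LANG", "en")
translate_dst = os.environ.get("TRANSLATE_DST_LANG", "fr")

def print_sources(docs):
    for document in docs:
        # Red header for the source path
        print(f"\n\033[31m Source: {document.metadata['source']} \033[0m")
        content = document.page_content
        if translate_ans:
            content = GoogleTranslator(source=translate_src, target=translate_dst).translate(content)
        # Dim green for the chunk text
        print(f"\033[32m\033[2m {content} \033[0m")
```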
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,7 +1,7 @@
langchain==0.0.177
langchain==0.0.183
gpt4all==0.2.3
chromadb==0.3.23
llama-cpp-python==0.1.50
llama-cpp-python==0.1.55
urllib3==2.0.2
pdfminer.six==20221105
python-dotenv==1.0.0
@@ -11,3 +11,4 @@ tabulate==0.9.0
pandoc==2.3
pypandoc==1.11
tqdm==4.65.0
langdetect==1.0.9