diff --git a/README.md b/README.md
index b07bf7c..c8b4730 100644
--- a/README.md
+++ b/README.md
@@ -158,6 +158,19 @@ python -m llama2_wrapper.server --backend_type gptq
 
 Navigate to http://localhost:8000/docs to see the OpenAPI documentation.
 
+#### Basic settings
+
+| Flag             | Description                                                                             |
+| ---------------- | --------------------------------------------------------------------------------------- |
+| `-h`, `--help`   | Show this help message.                                                                 |
+| `--model_path`   | The path to the model to use for generating completions.                                |
+| `--backend_type` | Backend for llama2, options: llama.cpp, gptq, transformers                              |
+| `--max_tokens`   | Maximum context size.                                                                   |
+| `--load_in_8bit` | Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).  |
+| `--verbose`      | Whether to print verbose output to stderr.                                              |
+| `--host`         | API address                                                                             |
+| `--port`         | API port                                                                                |
+
 ## Benchmark
 
 Run the benchmark script to compute performance on your device; `benchmark.py` will load the same `.env` as `app.py`:
diff --git a/llama2_wrapper/download/__init__.py b/llama2_wrapper/download/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/llama2_wrapper/download/__main__.py b/llama2_wrapper/download/__main__.py
new file mode 100644
index 0000000..01ea1d5
--- /dev/null
+++ b/llama2_wrapper/download/__main__.py
@@ -0,0 +1,59 @@
+import os
+import argparse
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--repo_id",
+        type=str,
+        default="",
+        required=True,
+        help="Repo ID like 'TheBloke/Llama-2-7B-Chat-GGML'",
+    )
+    parser.add_argument(
+        "--filename",
+        type=str,
+        default=None,
+        help="Filename like llama-2-7b-chat.ggmlv3.q4_0.bin",
+    )
+    parser.add_argument(
+        "--save_dir", type=str, default="./models", help="Directory to save models"
+    )
+
+    args = parser.parse_args()
+
+    repo_id = args.repo_id
+    save_dir = args.save_dir
+
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
+
+    if args.filename:
+        filename = args.filename
+        from huggingface_hub import hf_hub_download
+
+        print(f"Start downloading model {repo_id} {filename} to: {save_dir}")
+
+        hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir=save_dir,
+        )
+    else:
+        repo_name = repo_id.split("/")[1]
+        save_path = os.path.join(save_dir, repo_name)
+        if not os.path.exists(save_path):
+            os.makedirs(save_path)
+        print(f"Start downloading model {repo_id} to: {save_path}")
+
+        from huggingface_hub import snapshot_download
+
+        snapshot_download(
+            repo_id=repo_id,
+            local_dir=save_path,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/llama2_wrapper/server/app.py b/llama2_wrapper/server/app.py
index 29f76d6..fb2e5a5 100644
--- a/llama2_wrapper/server/app.py
+++ b/llama2_wrapper/server/app.py
@@ -35,17 +35,17 @@ class Settings(BaseSettings):
         default="llama.cpp",
         description="Backend for llama2, options: llama.cpp, gptq, transformers",
     )
-    max_tokens: int = Field(default=2048, ge=1, description="Maximum context size.")
+    max_tokens: int = Field(default=4000, ge=1, description="Maximum context size.")
     load_in_8bit: bool = Field(
         default=False,
-        description="Use bitsandbytes to run model in 8 bit mode (only for transformers models).",
+        description="Whether to use bitsandbytes to run model in 8 bit mode (only for transformers models).",
     )
     verbose: bool = Field(
         default=False,
-        description="Print verbose output to stderr.",
+        description="Whether to print verbose output to stderr.",
     )
-    host: str = Field(default="localhost", description="Listen address")
-    port: int = Field(default=8000, description="Listen port")
+    host: str = Field(default="localhost", description="API address")
+    port: int = Field(default=8000, description="API port")
     interrupt_requests: bool = Field(
         default=True,
         description="Whether to interrupt requests when a new request is received.",
diff --git a/pyproject.toml b/pyproject.toml
index 011e996..424565e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama2-wrapper"
-version = "0.1.11"
+version = "0.1.12"
 description = "Use llama2-wrapper as your local llama2 backend for Generative Agents / Apps"
 authors = ["liltom-eth "]
 license = "MIT"
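Usage sketch for the new `llama2_wrapper.download` entry point added above, using the example values from its own argparse help strings (`--save_dir` defaults to `./models`):

    python -m llama2_wrapper.download --repo_id TheBloke/Llama-2-7B-Chat-GGML --filename llama-2-7b-chat.ggmlv3.q4_0.bin

Omitting `--filename` switches to `snapshot_download`, which mirrors the whole repo into `./models/Llama-2-7B-Chat-GGML` (the repo name after the `/` in `--repo_id`).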
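And a sketch of launching the server with the flags documented in the new README table; the model path here is hypothetical, and `--host`/`--port` default to `localhost`/`8000` per the `Settings` class:

    python -m llama2_wrapper.server --model_path ./models/llama-2-7b-chat.ggmlv3.q4_0.bin --backend_type llama.cpp --port 8000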