From 3bd365f86a7d22ff472130b54e5b59655ec88683 Mon Sep 17 00:00:00 2001
From: Jack Cloudman
Date: Sun, 24 Nov 2024 06:10:31 -0600
Subject: [PATCH 1/2] Added exclude-pattern param to download-model.py script

---
 download-model.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/download-model.py b/download-model.py
index 306784a355..ba963f103c 100644
--- a/download-model.py
+++ b/download-model.py
@@ -2,7 +2,7 @@
 Downloads models from Hugging Face to models/username_modelname.
 
 Example:
-python download-model.py facebook/opt-1.3b
+python download-model.py facebook/opt-1.3b --exclude-pattern 'consolidated-.*\.safetensors'
 
 '''
 
@@ -72,7 +72,7 @@ def sanitize_model_and_branch_names(self, model, branch):
 
         return model, branch
 
-    def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None):
+    def get_download_links_from_huggingface(self, model, branch, text_only=False, specific_file=None, exclude_pattern=None):
         session = self.session
         page = f"/api/models/{model}/tree/{branch}"
         cursor = b""
@@ -100,6 +100,10 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
                 if specific_file not in [None, ''] and fname != specific_file:
                     continue
 
+                # Excluir archivos que coinciden con el patrón de exclusión
+                if exclude_pattern is not None and re.match(exclude_pattern, fname):
+                    continue
+
                 if not is_lora and fname.endswith(('adapter_config.json', 'adapter_model.bin')):
                     is_lora = True
 
@@ -138,18 +142,14 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
             cursor = base64.b64encode(cursor)
             cursor = cursor.replace(b'=', b'%3D')
 
-        # If both pytorch and safetensors are available, download safetensors only
-        # Also if GGUF and safetensors are available, download only safetensors
-        # (why do people do this?)
+        # Si hay archivos PyTorch o GGUF y también safetensors, solo descargamos safetensors
         if (has_pytorch or has_pt or has_gguf) and has_safetensors:
             has_gguf = False
             for i in range(len(classifications) - 1, -1, -1):
                 if classifications[i] in ['pytorch', 'pt', 'gguf']:
                     links.pop(i)
 
-        # For GGUF, try to download only the Q4_K_M if no specific file is specified.
-        # If not present, exclude all GGUFs, as that's likely a repository with both
-        # GGUF and fp16 files.
+        # Lógica para GGUF
         if has_gguf and specific_file is None:
             has_q4km = False
             for i in range(len(classifications) - 1, -1, -1):
@@ -174,7 +174,7 @@ def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, model_dir
         else:
             base_folder = 'models' if not is_lora else 'loras'
 
-        # If the model is of type GGUF, save directly in the base_folder
+        # Si el modelo es de tipo GGUF, guardamos directamente en la carpeta base
         if is_llamacpp:
             return Path(base_folder)
 
@@ -312,6 +312,7 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
     parser.add_argument('--threads', type=int, default=4, help='Number of files to download simultaneously.')
     parser.add_argument('--text-only', action='store_true', help='Only download text files (txt/json).')
     parser.add_argument('--specific-file', type=str, default=None, help='Name of the specific file to download (if not provided, downloads all).')
+    parser.add_argument('--exclude-pattern', type=str, default=None, help='Regex pattern to exclude files from download.')
    parser.add_argument('--output', type=str, default=None, help='Save the model files to this folder.')
     parser.add_argument('--model-dir', type=str, default=None, help='Save the model files to a subfolder of this folder instead of the default one (text-generation-webui/models).')
     parser.add_argument('--clean', action='store_true', help='Does not resume the previous download.')
@@ -322,6 +323,7 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
     branch = args.branch
     model = args.MODEL
     specific_file = args.specific_file
+    exclude_pattern = args.exclude_pattern
 
     if model is None:
         print("Error: Please specify the model you'd like to download (e.g. 'python download-model.py facebook/opt-1.3b').")
@@ -336,7 +338,9 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
         sys.exit()
 
     # Get the download links from Hugging Face
-    links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(model, branch, text_only=args.text_only, specific_file=specific_file)
+    links, sha256, is_lora, is_llamacpp = downloader.get_download_links_from_huggingface(
+        model, branch, text_only=args.text_only, specific_file=specific_file, exclude_pattern=exclude_pattern
+    )
 
     # Get the output folder
     if args.output:
@@ -349,4 +353,8 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
         downloader.check_model_files(model, branch, links, sha256, output_folder)
     else:
         # Download files
-        downloader.download_model_files(model, branch, links, sha256, output_folder, specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp)
+        downloader.download_model_files(
+            model, branch, links, sha256, output_folder,
+            specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp
+        )
+

From aa02d6e0d3784abf1f775e985c71352779940465 Mon Sep 17 00:00:00 2001
From: Jack Cloudman
Date: Sun, 24 Nov 2024 06:20:16 -0600
Subject: [PATCH 2/2] Fixed comments

---
 download-model.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/download-model.py b/download-model.py
index ba963f103c..8fe94371f2 100644
--- a/download-model.py
+++ b/download-model.py
@@ -2,7 +2,7 @@
 Downloads models from Hugging Face to models/username_modelname.
 
 Example:
-python download-model.py facebook/opt-1.3b --exclude-pattern 'consolidated-.*\.safetensors'
+python download-model.py facebook/opt-1.3b
 
 '''
 
@@ -100,7 +100,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
                 if specific_file not in [None, ''] and fname != specific_file:
                     continue
 
-                # Excluir archivos que coinciden con el patrón de exclusión
+                # Exclude files matching the exclude pattern
                 if exclude_pattern is not None and re.match(exclude_pattern, fname):
                     continue
 
@@ -110,7 +110,7 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
                 is_pytorch = re.match(r"(pytorch|adapter|gptq)_model.*\.bin", fname)
                 is_safetensors = re.match(r".*\.safetensors", fname)
                 is_pt = re.match(r".*\.pt", fname)
-                is_gguf = re.match(r'.*\.gguf', fname)
+                is_gguf = re.match(r".*\.gguf", fname)
                 is_tiktoken = re.match(r".*\.tiktoken", fname)
                 is_tokenizer = re.match(r"(tokenizer|ice|spiece).*\.model", fname) or is_tiktoken
                 is_text = re.match(r".*\.(txt|json|py|md)", fname) or is_tokenizer
@@ -142,14 +142,15 @@ def get_download_links_from_huggingface(self, model, branch, text_only=False, sp
             cursor = base64.b64encode(cursor)
             cursor = cursor.replace(b'=', b'%3D')
 
-        # Si hay archivos PyTorch o GGUF y también safetensors, solo descargamos safetensors
+        # If both pytorch and safetensors are available, download safetensors only
+        # Also if GGUF and safetensors are available, download only safetensors
         if (has_pytorch or has_pt or has_gguf) and has_safetensors:
             has_gguf = False
             for i in range(len(classifications) - 1, -1, -1):
                 if classifications[i] in ['pytorch', 'pt', 'gguf']:
                     links.pop(i)
 
-        # Lógica para GGUF
+        # For GGUF, try to download only the Q4_K_M if no specific file is specified.
         if has_gguf and specific_file is None:
             has_q4km = False
             for i in range(len(classifications) - 1, -1, -1):
@@ -174,7 +175,7 @@ def get_output_folder(self, model, branch, is_lora, is_llamacpp=False, model_dir
         else:
             base_folder = 'models' if not is_lora else 'loras'
 
-        # Si el modelo es de tipo GGUF, guardamos directamente en la carpeta base
+        # If the model is of type GGUF, save directly in the base_folder
         if is_llamacpp:
             return Path(base_folder)
 
@@ -357,4 +358,3 @@ def check_model_files(self, model, branch, links, sha256, output_folder):
             model, branch, links, sha256, output_folder,
             specific_file=specific_file, threads=args.threads, is_llamacpp=is_llamacpp
         )
-
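
A note on matching behavior: the new filter uses re.match, which anchors at the
start of the string, so an --exclude-pattern regex must match from the first
character of each filename. A minimal sketch of that anchoring, using
hypothetical filenames chosen only for illustration:

    import re

    # Pattern from the docstring example above: skip "consolidated-*" shards.
    exclude_pattern = r'consolidated-.*\.safetensors'

    # Hypothetical filenames for illustration.
    for fname in ['consolidated-00001-of-00003.safetensors',
                  'model-00001-of-00002.safetensors']:
        # re.match succeeds only when the pattern matches at position 0, so the
        # second file is kept even though it also ends in .safetensors.
        if re.match(exclude_pattern, fname):
            print('excluded:', fname)
        else:
            print('kept:', fname)

When calling the script, quote the pattern so the shell does not glob-expand
the '*' or strip the backslash before Python sees it:

    python download-model.py facebook/opt-1.3b --exclude-pattern 'consolidated-.*\.safetensors'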