diff --git a/docs/features/quantization/auto_awq.md b/docs/features/quantization/auto_awq.md index fc998387d29a..02087d0b61e8 100644 --- a/docs/features/quantization/auto_awq.md +++ b/docs/features/quantization/auto_awq.md @@ -18,13 +18,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the from awq import AutoAWQForCausalLM from transformers import AutoTokenizer - model_path = 'mistralai/Mistral-7B-Instruct-v0.2' - quant_path = 'mistral-instruct-v0.2-awq' - quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } + model_path = "mistralai/Mistral-7B-Instruct-v0.2" + quant_path = "mistral-instruct-v0.2-awq" + quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"} # Load model model = AutoAWQForCausalLM.from_pretrained( - model_path, **{"low_cpu_mem_usage": True, "use_cache": False} + model_path, + low_cpu_mem_usage=True, + use_cache=False, ) tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) diff --git a/docs/features/quantization/bitblas.md b/docs/features/quantization/bitblas.md index 53b689ad53ff..c3a127657622 100644 --- a/docs/features/quantization/bitblas.md +++ b/docs/features/quantization/bitblas.md @@ -34,7 +34,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitblas" + quantization="bitblas", ) ``` @@ -53,6 +53,6 @@ llm = LLM( dtype=torch.float16, trust_remote_code=True, quantization="bitblas", - max_model_len=1024 + max_model_len=1024, ) ``` diff --git a/docs/features/quantization/bnb.md b/docs/features/quantization/bnb.md index 3b15a6072d47..2348c7739c06 100644 --- a/docs/features/quantization/bnb.md +++ b/docs/features/quantization/bnb.md @@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit" llm = LLM( model=model_id, dtype=torch.bfloat16, - trust_remote_code=True + trust_remote_code=True, ) ``` @@ -43,7 +43,7 @@ llm = LLM( model=model_id, dtype=torch.bfloat16, trust_remote_code=True, - quantization="bitsandbytes" + quantization="bitsandbytes", ) ``` diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index 834c03cbe05b..a54acdbb9622 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio # Configure the simple PTQ quantization recipe = QuantizationModifier( - targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"]) + targets="Linear", + scheme="FP8_DYNAMIC", + ignore=["lm_head"], + ) # Apply the quantization algorithm. oneshot(model=model, recipe=recipe) diff --git a/docs/features/quantization/gguf.md b/docs/features/quantization/gguf.md index 2a1c3bdd775f..2a731e9b7e03 100644 --- a/docs/features/quantization/gguf.md +++ b/docs/features/quantization/gguf.md @@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint: conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" 
+ "content": "Hello! How can I assist you today?", }, { "role": "user", @@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint: sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", - tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0") + llm = LLM( + model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf", + tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.chat(conversation, sampling_params) diff --git a/docs/features/quantization/gptqmodel.md b/docs/features/quantization/gptqmodel.md index 47cb2d65bae4..f14a931725da 100644 --- a/docs/features/quantization/gptqmodel.md +++ b/docs/features/quantization/gptqmodel.md @@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`: calibration_dataset = load_dataset( "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", - split="train" + split="train", ).select(range(1024))["text"] quant_config = QuantizeConfig(bits=4, group_size=128) diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index d6fdac7b07f7..5d8e06ffb5d7 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` @@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y }, ignore=["lm_head"], update_size=NUM_CALIBRATION_SAMPLES, - dampening_frac=0.01 + dampening_frac=0.01, ) ``` diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index af3650e701ad..ee1de2146057 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct" model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, device_map="auto", torch_dtype="auto", + MODEL_ID, + device_map="auto", + torch_dtype="auto", ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) ``` diff --git a/docs/features/quantization/modelopt.md b/docs/features/quantization/modelopt.md index 39ae03b1bdac..c48ccb719a79 100644 --- a/docs/features/quantization/modelopt.md +++ b/docs/features/quantization/modelopt.md @@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. 
As an example, the foll
 from vllm import LLM, SamplingParams
 
 def main():
 
-    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+    model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
+    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
     llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
 
     sampling_params = SamplingParams(temperature=0.8, top_p=0.9)
diff --git a/docs/features/quantization/quantized_kvcache.md b/docs/features/quantization/quantized_kvcache.md
index b2b417309e92..e0585a88451d 100644
--- a/docs/features/quantization/quantized_kvcache.md
+++ b/docs/features/quantization/quantized_kvcache.md
@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
     from vllm import LLM, SamplingParams
 
     sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-    llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-              kv_cache_dtype="fp8",
-              calculate_kv_scales=True)
+    llm = LLM(
+        model="meta-llama/Llama-2-7b-chat-hf",
+        kv_cache_dtype="fp8",
+        calculate_kv_scales=True,
+    )
     prompt = "London is the capital of"
     out = llm.generate(prompt, sampling_params)[0].outputs[0].text
     print(out)
diff --git a/docs/features/quantization/quark.md b/docs/features/quantization/quark.md
index 85b7d8ec84ed..f0cd20b7335c 100644
--- a/docs/features/quantization/quark.md
+++ b/docs/features/quantization/quark.md
@@ -48,7 +48,9 @@ to fetch model and tokenizer.
     MAX_SEQ_LEN = 512
 
     model = AutoModelForCausalLM.from_pretrained(
-        MODEL_ID, device_map="auto", torch_dtype="auto",
+        MODEL_ID,
+        device_map="auto",
+        torch_dtype="auto",
     )
     model.eval()
 
@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
     dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
     text_data = dataset["text"][:NUM_CALIBRATION_DATA]
 
-    tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-        padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-    calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-        batch_size=BATCH_SIZE, drop_last=True)
+    tokenized_outputs = tokenizer(
+        text_data,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=MAX_SEQ_LEN,
+    )
+    calib_dataloader = DataLoader(
+        tokenized_outputs["input_ids"],
+        batch_size=BATCH_SIZE,
+        drop_last=True,
+    )
     ```
 
 ### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
         load_quant_algo_config_from_file)
 
     # Define fp8/per-tensor/static spec.
-    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-        is_dynamic=False).to_quantization_spec()
+    FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
+        observer_method="min_max",
+        is_dynamic=False,
+    ).to_quantization_spec()
 
     # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-    global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-        weight=FP8_PER_TENSOR_SPEC)
+    global_quant_config = QuantizationConfig(
+        input_tensors=FP8_PER_TENSOR_SPEC,
+        weight=FP8_PER_TENSOR_SPEC,
+    )
 
     # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"] - kv_cache_quant_config = {name : - QuantizationConfig(input_tensors=global_quant_config.input_tensors, - weight=global_quant_config.weight, - output_tensors=KV_CACHE_SPEC) - for name in kv_cache_layer_names_for_llama} + kv_cache_quant_config = { + name: QuantizationConfig( + input_tensors=global_quant_config.input_tensors, + weight=global_quant_config.weight, + output_tensors=KV_CACHE_SPEC, + ) + for name in kv_cache_layer_names_for_llama + } layer_quant_config = kv_cache_quant_config.copy() # Define algorithm config by config file. - LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = - 'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json' + LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json" algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE) EXCLUDE_LAYERS = ["lm_head"] @@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant. layer_quant_config=layer_quant_config, kv_cache_quant_config=kv_cache_quant_config, exclude=EXCLUDE_LAYERS, - algo_config=algo_config) + algo_config=algo_config, + ) ``` ### 4. Quantize the Model and Export @@ -165,8 +182,11 @@ for more exporting format details. EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant" exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR) with torch.no_grad(): - exporter.export_safetensors_model(freezed_model, - quant_config=quant_config, tokenizer=tokenizer) + exporter.export_safetensors_model( + freezed_model, + quant_config=quant_config, + tokenizer=tokenizer, + ) ``` ### 5. Evaluation in vLLM @@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent sampling_params = SamplingParams(temperature=0.8, top_p=0.95) # Create an LLM. - llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", - kv_cache_dtype='fp8',quantization='quark') + llm = LLM( + model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant", + kv_cache_dtype="fp8", + quantization="quark", + ) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
outputs = llm.generate(prompts, sampling_params) diff --git a/docs/getting_started/quickstart.md b/docs/getting_started/quickstart.md index 49e1f6fac715..1cba21cf5f6d 100644 --- a/docs/getting_started/quickstart.md +++ b/docs/getting_started/quickstart.md @@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep api_key=openai_api_key, base_url=openai_api_base, ) - completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct", - prompt="San Francisco is a") + completion = client.completions.create( + model="Qwen/Qwen2.5-1.5B-Instruct", + prompt="San Francisco is a", + ) print("Completion result:", completion) ``` @@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package: messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Tell me a joke."}, - ] + ], ) print("Chat response:", chat_response) ``` diff --git a/docs/models/extensions/tensorizer.md b/docs/models/extensions/tensorizer.md index f70ab0c6f4e5..3df80d5af6c4 100644 --- a/docs/models/extensions/tensorizer.md +++ b/docs/models/extensions/tensorizer.md @@ -60,7 +60,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", - enable_lora=True + enable_lora=True, ) ``` @@ -97,6 +97,6 @@ llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", load_format="tensorizer", enable_lora=True, - model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}} + model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}, ) ``` diff --git a/docs/models/generative_models.md b/docs/models/generative_models.md index 05f8d16cc4ca..9ea32ed61645 100644 --- a/docs/models/generative_models.md +++ b/docs/models/generative_models.md @@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc conversation = [ { "role": "system", - "content": "You are a helpful assistant" + "content": "You are a helpful assistant", }, { "role": "user", - "content": "Hello" + "content": "Hello", }, { "role": "assistant", - "content": "Hello! How can I assist you today?" + "content": "Hello! How can I assist you today?", }, { "role": "user", diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index 50982d3d0d0f..45bfba2cbf59 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u from vllm import LLM llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling") -(output,) = llm.score("What is the capital of France?", - "The capital of Brazil is Brasilia.") +(output,) = llm.score( + "What is the capital of France?", + "The capital of Brazil is Brasilia.", +) score = output.outputs.score print(f"Score: {score}") @@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please Here is an example to serve a model with Matryoshka Embeddings enabled. 
-```text +```bash vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}' ``` @@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -llm = LLM(model="jinaai/jina-embeddings-v3", - runner="pooling", - trust_remote_code=True) -outputs = llm.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM( + model="jinaai/jina-embeddings-v3", + runner="pooling", + trust_remote_code=True, +) +outputs = llm.embed( + ["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32), +) print(outputs[0].outputs) ``` @@ -234,13 +240,13 @@ A code example can be found here: ```python import os -os.environ['http_proxy'] = 'http://your.proxy.server:port' -os.environ['https_proxy'] = 'http://your.proxy.server:port' +os.environ["http_proxy"] = "http://your.proxy.server:port" +os.environ["https_proxy"] = "http://your.proxy.server:port" ``` ### ModelScope diff --git a/docs/serving/expert_parallel_deployment.md b/docs/serving/expert_parallel_deployment.md index cd6515dde75e..f1dfb05ea5d4 100644 --- a/docs/serving/expert_parallel_deployment.md +++ b/docs/serving/expert_parallel_deployment.md @@ -243,10 +243,10 @@ try: "remote_engine_id": None, # Will be populated by vLLM "remote_block_ids": None, # Will be populated by vLLM "remote_host": None, # Will be populated by vLLM - "remote_port": None # Will be populated by vLLM + "remote_port": None, # Will be populated by vLLM } }, - extra_headers={"X-Request-Id": request_id} + extra_headers={"X-Request-Id": request_id}, ) print("-" * 50) @@ -262,7 +262,7 @@ try: extra_body={ "kv_transfer_params": prefill_response.kv_transfer_params # Pass KV cache info }, - extra_headers={"X-Request-Id": request_id} # Same request ID + extra_headers={"X-Request-Id": request_id}, # Same request ID ) print("-" * 50) diff --git a/docs/serving/integrations/langchain.md b/docs/serving/integrations/langchain.md index 47074f411ac9..192a61ea5b90 100644 --- a/docs/serving/integrations/langchain.md +++ b/docs/serving/integrations/langchain.md @@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain` ```python from langchain_community.llms import VLLM - llm = VLLM(model="mosaicml/mpt-7b", - trust_remote_code=True, # mandatory for hf models - max_new_tokens=128, - top_k=10, - top_p=0.95, - temperature=0.8, - # tensor_parallel_size=... # for distributed inference + llm = VLLM( + model="mosaicml/mpt-7b", + trust_remote_code=True, # mandatory for hf models + max_new_tokens=128, + top_k=10, + top_p=0.95, + temperature=0.8, + # for distributed inference + # tensor_parallel_size=..., ) print(llm("What is the capital of France ?")) diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index fe0e1e3df378..215c7bf0ced3 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Hello!"} - ] + {"role": "user", "content": "Hello!"}, + ], ) print(completion.choices[0].message) @@ -101,8 +101,13 @@ both a `type` and a `text` field. 
An example is provided below: completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]} - ] + { + "role": "user", + "content": [ + {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}, + ], + }, + ], ) ``` @@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_body={ - "structured_outputs": {"choice": ["positive", "negative"]} - } + "structured_outputs": {"choice": ["positive", "negative"]}, + }, ) ``` @@ -149,11 +154,11 @@ with `--enable-request-id-headers`. completion = client.chat.completions.create( model="NousResearch/Meta-Llama-3-8B-Instruct", messages=[ - {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} + {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}, ], extra_headers={ "x-request-id": "sentiment-classification-00001", - } + }, ) print(completion._request_id) @@ -162,7 +167,7 @@ with `--enable-request-id-headers`. prompt="A robot may not injure a human being", extra_headers={ "x-request-id": "completion-test", - } + }, ) print(completion._request_id) ``` @@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi model="openai/whisper-large-v3-turbo", file=audio_file, language="en", - response_format="verbose_json" + response_format="verbose_json", ) print(transcription.text) @@ -812,22 +817,22 @@ You can pass multi-modal inputs to scoring models by passing `content` including "model": "jinaai/jina-reranker-m0", "text_1": "slm markdown", "text_2": { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" - }, - }, - ] - } + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + }, + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + }, + ], }, + }, ) response.raise_for_status() response_json = response.json() diff --git a/examples/offline_inference/openai_batch/README.md b/examples/offline_inference/openai_batch/README.md index 3c6f6c7a6c58..7d5a1af8f5a4 100644 --- a/examples/offline_inference/openai_batch/README.md +++ b/examples/offline_inference/openai_batch/README.md @@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ """ try: url = s3_client.generate_presigned_url( - ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in + ClientMethod=client_method, + Params=method_parameters, + ExpiresIn=expires_in, ) except ClientError: raise @@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_ s3_client = boto3.client("s3") input_url = generate_presigned_url( - s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 
3600 + s3_client, + "get_object", + {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, + expires_in=3600, ) output_url = generate_presigned_url( - s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600 + s3_client, + "put_object", + {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, + expires_in=3600, ) print(f"{input_url=}") print(f"{output_url=}") diff --git a/examples/others/tensorize_vllm_model.py b/examples/others/tensorize_vllm_model.py index acbfd8cda489..2601c9eff971 100644 --- a/examples/others/tensorize_vllm_model.py +++ b/examples/others/tensorize_vllm_model.py @@ -84,7 +84,7 @@ from vllm import LLM llm = LLM( "s3://my-bucket/vllm/facebook/opt-125m/v1", - load_format="tensorizer" + load_format="tensorizer", ) ```