"""An example showing how to use vLLM to serve multimodal models
and run online inference with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
"""
import base64

import requests
from openai import OpenAI

from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    # defaults to os.environ.get("OPENAI_API_KEY")
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Use the first (and typically only) model served by this vLLM instance.
models = client.models.list()
model = models.data[0].id
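

# Hedged illustration (not in the original script): vLLM's server speaks the
# OpenAI-compatible REST API, so the chat requests below can also be issued as
# plain HTTP without the OpenAI SDK. A minimal sketch; it assumes the server
# from the docstring is running at `openai_api_base`, the function name is
# illustrative, and it is not registered in `example_function_map` below.
def run_raw_http_sketch() -> None:
    response = requests.post(
        f"{openai_api_base}/chat/completions",
        headers={"Authorization": f"Bearer {openai_api_key}"},
        json={
            "model": model,
            "messages": [{
                "role": "user",
                "content": "What's the capital of France?"
            }],
            "max_tokens": 64,
        },
    )
    response.raise_for_status()
    print("Raw HTTP output:",
          response.json()["choices"][0]["message"]["content"])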


def encode_base64_content_from_url(content_url: str) -> str:
    """Encode content retrieved from a remote URL to base64 format."""
    with requests.get(content_url) as response:
        response.raise_for_status()
        result = base64.b64encode(response.content).decode('utf-8')

    return result


# Text-only inference
def run_text_only() -> None:
    chat_completion = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": "What's the capital of France?"
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion.choices[0].message.content
    print("Chat completion output:", result)


# Single-image input inference
def run_single_image() -> None:

    ## Use image url in the payload
    image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from image url:", result)

    ## Use base64 encoded image in the payload
    image_base64 = encode_base64_content_from_url(image_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this image?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{image_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded image:", result)


# Multi-image input inference
def run_multi_image() -> None:
    image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
    image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What are the animals in these images?"
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_duck
                    },
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": image_url_lion
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output:", result)


# Audio input inference
def run_audio() -> None:
    # Any format supported by librosa is supported
    audio_url = AudioAsset("winning_call").url

    ## Use audio url in the payload
    chat_completion_from_url = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        "url": audio_url
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_url.choices[0].message.content
    print("Chat completion output from audio url:", result)

    ## Use base64 encoded audio in the payload
    audio_base64 = encode_base64_content_from_url(audio_url)
    chat_completion_from_base64 = client.chat.completions.create(
        messages=[{
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "What's in this audio?"
                },
                {
                    "type": "audio_url",
                    "audio_url": {
                        # Any format supported by librosa is supported
                        "url": f"data:audio/ogg;base64,{audio_base64}"
                    },
                },
            ],
        }],
        model=model,
        max_completion_tokens=64,
    )

    result = chat_completion_from_base64.choices[0].message.content
    print("Chat completion output from base64 encoded audio:", result)


example_function_map = {
    "text-only": run_text_only,
    "single-image": run_single_image,
    "multi-image": run_multi_image,
    "audio": run_audio,
}


def main(args) -> None:
    chat_type = args.chat_type
    example_function_map[chat_type]()


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description='Demo on using the OpenAI client for online inference '
        'with multimodal language models served by vLLM.')
    parser.add_argument(
        '--chat-type',
        '-c',
        type=str,
        default="single-image",
        choices=["text-only", "single-image", "multi-image", "audio"],
        help='Conversation type with multimodal data.')
    args = parser.parse_args()
    main(args)