diff --git a/evals/benchmark/stresscli/locust/aistress.py b/evals/benchmark/stresscli/locust/aistress.py
index 1264ac41..436713e0 100644
--- a/evals/benchmark/stresscli/locust/aistress.py
+++ b/evals/benchmark/stresscli/locust/aistress.py
@@ -120,12 +120,16 @@ def bench_main(self):
             "faqgenfixed",
             "faqgenbench",
         ]
+        if self.environment.parsed_options.bench_target in ["faqgenfixed", "faqgenbench"]:
+            req_params = {"data": reqData}
+        else:
+            req_params = {"json": reqData}
         test_start_time = time.time()
         try:
             start_ts = time.perf_counter()
             with self.client.post(
                 url,
-                json=reqData,
+                **req_params,
                 stream=True if self.environment.parsed_options.bench_target in streaming_bench_target else False,
                 catch_response=True,
                 timeout=self.environment.parsed_options.http_timeout,
@@ -169,6 +173,22 @@ def bench_main(self):
                                 complete_response += content
                         except json.JSONDecodeError:
                             continue
+                elif self.environment.parsed_options.bench_target in ["faqgenfixed", "faqgenbench"]:
+                    client = sseclient.SSEClient(resp)
+                    for event in client.events():
+                        if first_token_ts is None:
+                            first_token_ts = time.perf_counter()
+                        try:
+                            data = json.loads(event.data)
+                            for op in data["ops"]:
+                                if op["path"] == "/logs/HuggingFaceEndpoint/final_output":
+                                    generations = op["value"].get("generations", [])
+                                    for generation in generations:
+                                        for item in generation:
+                                            text = item.get("text", "")
+                                            complete_response += text
+                        except json.JSONDecodeError:
+                            continue
                 else:
                     client = sseclient.SSEClient(resp)
                     for event in client.events():
diff --git a/evals/benchmark/stresscli/locust/faqgenfixed.py b/evals/benchmark/stresscli/locust/faqgenfixed.py
index b648a567..75f3d29b 100644
--- a/evals/benchmark/stresscli/locust/faqgenfixed.py
+++ b/evals/benchmark/stresscli/locust/faqgenfixed.py
@@ -9,12 +9,11 @@ def getUrl():
 
 
 def getReqData():
-    # return {
-    #     "inputs": "What is the revenue of Nike in last 10 years before 2023? Give me detail",
-    #     "parameters": {"max_new_tokens": 128, "do_sample": True},
-    # }
-    # return {"query": "What is the revenue of Nike in last 10 years before 2023? Give me detail", "max_tokens": 128}
-    return {"messages": "What is the revenue of Nike in last 10 years before 2023? Give me detail", "max_tokens": 128}
+    return {
+        "messages": "Text Embeddings Inference (TEI) is a toolkit for deploying and serving open source text embeddings and sequence classification models. TEI enables high-performance extraction for the most popular models, including FlagEmbedding, Ember, GTE and E6.",
+        "max_tokens": 128,
+        "top_k": 1,
+    }
 
 
 def respStatics(environment, reqData, respData):
diff --git a/evals/benchmark/stresscli/locust/tokenresponse.py b/evals/benchmark/stresscli/locust/tokenresponse.py
index 4b6bfe75..afa487ef 100644
--- a/evals/benchmark/stresscli/locust/tokenresponse.py
+++ b/evals/benchmark/stresscli/locust/tokenresponse.py
@@ -15,7 +15,7 @@ def testFunc():
 
 def respStatics(environment, req, resp):
     tokenizer = transformers.AutoTokenizer.from_pretrained(environment.parsed_options.llm_model)
-    if environment.parsed_options.bench_target in ["chatqnafixed", "chatqnabench"]:
+    if environment.parsed_options.bench_target in ["chatqnafixed", "chatqnabench", "faqgenfixed", "faqgenbench"]:
        num_token_input_prompt = len(tokenizer.encode(req["messages"]))
     elif environment.parsed_options.bench_target in ["llmfixed"]:
         num_token_input_prompt = len(tokenizer.encode(req["query"]))
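
Note on the `req_params` indirection in aistress.py: the two faqgen targets switch the Locust client from the `json=` kwarg to `data=`, which sends the payload form-encoded rather than as a JSON body, presumably because the FaqGen endpoint expects form data. A minimal sketch of the difference using plain `requests`; the endpoint URLs and payload below are illustrative assumptions, not taken from this patch:

```python
import requests

req_data = {"messages": "What is TEI?", "max_tokens": 128}

# faqgenfixed / faqgenbench: payload sent form-encoded
# (Content-Type: application/x-www-form-urlencoded), matching `data=reqData`.
resp_form = requests.post("http://localhost:8888/v1/faqgen", data=req_data, stream=True)

# Every other target keeps the original JSON body
# (Content-Type: application/json), matching `json=reqData`.
resp_json = requests.post("http://localhost:8888/v1/chatqna", json=req_data, stream=True)
```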
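The new `elif` branch in aistress.py consumes what looks like a LangServe-style log stream: each SSE event carries an `ops` list of JSON-patch-like operations, and the generated text only surfaces once an op targets `/logs/HuggingFaceEndpoint/final_output`. A self-contained sketch of that extraction; the sample payload is inferred from the parsing code, not captured from a live service:

```python
import json

def extract_final_text(event_data: str) -> str:
    """Pull generated text out of one SSE event, mirroring the parsing
    logic added in aistress.py for the faqgen targets."""
    text = ""
    try:
        data = json.loads(event_data)
    except json.JSONDecodeError:
        return text  # non-JSON keep-alive events are skipped
    for op in data.get("ops", []):
        if op.get("path") == "/logs/HuggingFaceEndpoint/final_output":
            for generation in op.get("value", {}).get("generations", []):
                for item in generation:
                    text += item.get("text", "")
    return text

# Example event shaped like the stream the new branch consumes (this exact
# payload is a guess for illustration, not recorded from a real run):
sample = json.dumps(
    {
        "ops": [
            {
                "op": "add",
                "path": "/logs/HuggingFaceEndpoint/final_output",
                "value": {"generations": [[{"text": "TEI serves embedding models."}]]},
            }
        ]
    }
)
assert extract_final_text(sample) == "TEI serves embedding models."
```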
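With tokenresponse.py now treating the faqgen targets like chatqna, the input-token count for a faqgen request comes from tokenizing its `messages` field. A short sketch of that accounting; the model name stands in for whatever `environment.parsed_options.llm_model` is configured with and is only an assumption here:

```python
import transformers

# Assumed model name for illustration; the benchmark takes this from its
# llm_model option at runtime.
tokenizer = transformers.AutoTokenizer.from_pretrained("Intel/neural-chat-7b-v3-3")

req = {"messages": "What is TEI?", "max_tokens": 128}

# faqgen requests now follow the same path as chatqna: the prompt is the
# "messages" string, so its encoded length is the input-token count.
num_token_input_prompt = len(tokenizer.encode(req["messages"]))
print(num_token_input_prompt)
```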