
Commit

Support microservice level benchmark (#95)
* Support microservice level benchmark

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
lvliang-intel authored Sep 6, 2024
1 parent 77bb66c commit 626d269
Showing 9 changed files with 98 additions and 33 deletions.
32 changes: 16 additions & 16 deletions evals/benchmark/benchmark.py
@@ -11,25 +11,25 @@
 service_endpoints = {
     "chatqna": {
         "embedding": "/v1/embeddings",
-        "embedding_serving": "/v1/embeddings",
+        "embedserve": "/v1/embeddings",
         "retriever": "/v1/retrieval",
         "reranking": "/v1/reranking",
-        "reranking_serving": "/rerank",
+        "rerankserve": "/rerank",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "e2e": "/v1/chatqna",
     },
-    "codegen": {"llm": "/generate_stream", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codegen"},
-    "codetrans": {"llm": "/generate", "llm_serving": "/v1/chat/completions", "e2e": "/v1/codetrans"},
-    "faqgen": {"llm": "/v1/chat/completions", "llm_serving": "/v1/chat/completions", "e2e": "/v1/faqgen"},
+    "codegen": {"llm": "/generate_stream", "llmserve": "/v1/chat/completions", "e2e": "/v1/codegen"},
+    "codetrans": {"llm": "/generate", "llmserve": "/v1/chat/completions", "e2e": "/v1/codetrans"},
+    "faqgen": {"llm": "/v1/chat/completions", "llmserve": "/v1/chat/completions", "e2e": "/v1/faqgen"},
     "audioqna": {
         "asr": "/v1/audio/transcriptions",
         "llm": "/v1/chat/completions",
-        "llm_serving": "/v1/chat/completions",
+        "llmserve": "/v1/chat/completions",
         "tts": "/v1/audio/speech",
         "e2e": "/v1/audioqna",
     },
-    "visualqna": {"lvm": "/v1/chat/completions", "lvm_serving": "/v1/chat/completions", "e2e": "/v1/visualqna"},
+    "visualqna": {"lvm": "/v1/chat/completions", "lvmserve": "/v1/chat/completions", "e2e": "/v1/visualqna"},
 }


@@ -200,19 +200,19 @@ def process_service(example, service_type, case_data, test_suite_config):
 example_service_map = {
     "chatqna": [
         "embedding",
-        "embedding_serving",
+        "embedserve",
         "retriever",
         "reranking",
-        "reranking_serving",
+        "rerankserve",
         "llm",
-        "llm_serving",
+        "llmserve",
         "e2e",
     ],
-    "codegen": ["llm", "llm_serving", "e2e"],
-    "codetrans": ["llm", "llm_serving", "e2e"],
-    "faqgen": ["llm", "llm_serving", "e2e"],
-    "audioqna": ["asr", "llm", "llm_serving", "tts", "e2e"],
-    "visualqna": ["lvm", "lvm_serving", "e2e"],
+    "codegen": ["llm", "llmserve", "e2e"],
+    "codetrans": ["llm", "llmserve", "e2e"],
+    "faqgen": ["llm", "llmserve", "e2e"],
+    "audioqna": ["asr", "llm", "llmserve", "tts", "e2e"],
+    "visualqna": ["lvm", "lvmserve", "e2e"],
 }

 # Process each example's services
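
Editor's note: example_service_map drives which targets run for each example. A sketch of the dispatch loop implied by the process_service signature in the hunk header above; run_example and the case_data skip check are illustrative assumptions, not code from the commit:

    # Sketch only: iterate the services registered for one example and hand
    # each to process_service (signature from the hunk header above).
    def run_example(example, case_data, test_suite_config):
        for service_type in example_service_map.get(example, []):
            if service_type not in case_data:  # no test case configured for this service
                continue
            process_service(example, service_type, case_data, test_suite_config)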
2 changes: 1 addition & 1 deletion evals/benchmark/benchmark.yaml
@@ -111,7 +111,7 @@ test_cases:
         top_p: 0.95
         repetition_penalty: 1.03
         streaming: true
-    llm_serving:
+    llmserve:
       run_test: false
       service_name: "faq-micro-svc"  # Replace with your service name
     e2e:
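
Editor's note: configs written against the old llm_serving key must be renamed to match. A minimal sketch of how a runner might gate this case on run_test, assuming PyYAML and assuming the llmserve block sits under faqgen (as the faq-micro-svc service_name suggests):

    # Assumes PyYAML and the test_cases layout shown in the snippet above.
    import yaml

    with open("evals/benchmark/benchmark.yaml") as f:
        config = yaml.safe_load(f)

    llmserve_case = config["test_cases"]["faqgen"]["llmserve"]
    if llmserve_case["run_test"]:
        print("benchmarking", llmserve_case["service_name"])  # faq-micro-svc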
22 changes: 21 additions & 1 deletion evals/benchmark/stresscli/locust/aistress.py
@@ -73,19 +73,39 @@ def bench_main(self):
         self.environment.runner.send_message("worker_reqsent", 1)
         reqData = bench_package.getReqData()
         url = bench_package.getUrl()
+        streaming_bench_target = [
+            "llmfixed",
+            "llmbench",
+            "chatqnafixed",
+            "chatqnabench",
+            "codegenfixed",
+            "codegenbench",
+            "faqgenfixed",
+            "faqgenbench",
+        ]
         try:
             start_ts = time.perf_counter()
             with self.client.post(
                 url,
                 json=reqData,
-                stream=True,
+                stream=True if self.environment.parsed_options.bench_target in streaming_bench_target else False,
                 catch_response=True,
                 timeout=self.environment.parsed_options.http_timeout,
             ) as resp:
                 logging.debug("Got response...........................")

                 if resp.status_code >= 200 and resp.status_code < 400:
+                    if self.environment.parsed_options.bench_target in [
+                        "embedservefixed",
+                        "embeddingfixed",
+                        "retrieverfixed",
+                        "rerankservefixed",
+                        "rerankingfixed",
+                    ]:
+                        respData = {
+                            "total_latency": time.perf_counter() - start_ts,
+                        }
                     elif self.environment.parsed_options.bench_target in [
                         "audioqnafixed",
                         "audioqnabench",
                     ]:  # non-stream case
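
Editor's note: the net effect is that only the LLM-style targets keep streaming responses, while the new microservice targets post without streaming and record a single wall-clock latency in seconds. A standalone sketch of that decision; timed_post is an illustrative name, not from the commit:

    import time

    import requests

    # Mirrors the streaming_bench_target list in the diff above.
    STREAMING_TARGETS = {
        "llmfixed", "llmbench", "chatqnafixed", "chatqnabench",
        "codegenfixed", "codegenbench", "faqgenfixed", "faqgenbench",
    }

    def timed_post(url, payload, bench_target, timeout=120):
        start_ts = time.perf_counter()
        resp = requests.post(url, json=payload,
                             stream=bench_target in STREAMING_TARGETS,
                             timeout=timeout)
        return resp, time.perf_counter() - start_ts  # latency in seconds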
8 changes: 5 additions & 3 deletions evals/benchmark/stresscli/locust/embeddingfixed.py
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
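
Editor's note: the five microservice locustfiles (this one and the four below) all get the same change. respStatics now just converts the total_latency that aistress.py measured in seconds into milliseconds, instead of delegating to the token statistics path. A sketch of the round trip, with a made-up latency value:

    # Illustration only: the seconds-to-milliseconds handoff.
    resp = {"total_latency": 0.25}  # seconds, from time.perf_counter() in aistress.py
    stats = {"total_latency": resp["total_latency"] * 1000}
    print(stats)  # {'total_latency': 250.0}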
8 changes: 5 additions & 3 deletions evals/benchmark/stresscli/locust/embedservefixed.py
@@ -14,9 +14,11 @@ def getReqData():
     }


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
8 changes: 5 additions & 3 deletions evals/benchmark/stresscli/locust/rerankingfixed.py
@@ -17,9 +17,11 @@ def getReqData():
     return {"initial_query": my_query, "retrieved_docs": [{"text": query_rerank_1}, {"text": query_rerank_2}]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
8 changes: 5 additions & 3 deletions evals/benchmark/stresscli/locust/rerankservefixed.py
@@ -17,9 +17,11 @@ def getReqData():
     return {"query": my_query, "texts": [query_rerank_1, query_rerank_2]}


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
8 changes: 5 additions & 3 deletions evals/benchmark/stresscli/locust/retrieverfixed.py
@@ -786,9 +786,11 @@ def getReqData():
     return ({"text": my_query, "embedding": my_embedding},)


-def respStatics(environment, resp):
-    return token.respStatics(environment, resp)
+def respStatics(environment, reqData, resp):
+    return {
+        "total_latency": resp["total_latency"] * 1000,
+    }


 def staticsOutput(environment, reqlist):
-    token.staticsOutput(environment, reqlist)
+    token.staticsOutputForMicroservice(environment, reqlist)
35 changes: 35 additions & 0 deletions evals/benchmark/stresscli/locust/tokenresponse.py
@@ -119,3 +119,38 @@ def staticsOutput(environment, reqlist):
     console_logger.warning(average_msg.format(numpy.average(avg_token)))
     console_logger.warning("======================================================\n\n")
     logging.shutdown()
+
+
+def staticsOutputForMicroservice(environment, reqlist):
+    e2e_lat = []
+    duration = environment.runner.stats.last_request_timestamp - environment.runner.stats.start_time
+
+    if len(reqlist) == 0:
+        logging.debug(f"len(reqlist): {len(reqlist)}, skip printing")
+        return
+    for req in iter(reqlist):
+        e2e_lat.append(req["total_latency"])
+
+    # Statistics for success response data only
+    req_msg = "Succeed Response: {} (Total {}, {:.1%} Success), Duration: {:.2f}s, RPS: {:.2f}"
+    e2e_msg = "End to End latency(ms), P50: {:.2f}, P90: {:.2f}, P99: {:.2f}, Avg: {:.2f}"
+    console_logger.warning("\n=================Total statistics=====================")
+    console_logger.warning(
+        req_msg.format(
+            len(reqlist),
+            environment.runner.stats.num_requests,
+            len(reqlist) / environment.runner.stats.num_requests,
+            duration,
+            len(reqlist) / duration,
+        )
+    )
+    console_logger.warning(
+        e2e_msg.format(
+            numpy.percentile(e2e_lat, 50),
+            numpy.percentile(e2e_lat, 90),
+            numpy.percentile(e2e_lat, 99),
+            numpy.average(e2e_lat),
+        )
+    )
+    console_logger.warning("======================================================\n\n")
+    logging.shutdown()
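
Editor's note: to sanity-check the percentile math above, here is a synthetic reqlist run through the same numpy calls; the latency values are invented for illustration:

    # Synthetic data; values are invented for illustration.
    import numpy

    reqlist = [{"total_latency": ms} for ms in (12.0, 15.0, 18.0, 22.0, 40.0)]
    e2e_lat = [req["total_latency"] for req in reqlist]

    print(f"P50: {numpy.percentile(e2e_lat, 50):.2f} ms")  # P50: 18.00 ms
    print(f"P90: {numpy.percentile(e2e_lat, 90):.2f} ms")  # P90: 32.80 ms
    print(f"Avg: {numpy.average(e2e_lat):.2f} ms")         # Avg: 21.40 ms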
