diff --git a/src/README.md b/src/README.md
index 71d1a230c..9a96daa9d 100644
--- a/src/README.md
+++ b/src/README.md
@@ -304,6 +304,7 @@ mean_tpot 3.80
 
 >**Note**: If the input prompt is just a string, the generate function returns only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs.
 
+#### Accumulating metrics
 Several `perf_metrics` can be added to each other. In that case `raw_metrics` are concatenated and mean/std values are recalculated. This accumulates statistics from several `generate()` calls.
 ```cpp
@@ -338,6 +339,61 @@ print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
 print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
 ```
 
+#### Using raw performance metrics
+In addition to mean and standard deviation values, the `perf_metrics` object has a `raw_metrics` field. This field stores raw data, including:
+
+- Timestamps for each batch of generated tokens
+- Batch sizes for each timestamp
+- Tokenization durations
+- Detokenization durations
+- Other relevant metrics
+
+These metrics can be used for more fine-grained analysis, such as calculating exact median values, percentiles, etc. Below are a few examples of how to use raw metrics.
+
+Getting timestamps for each generated token:
+```python
+import openvino_genai as ov_genai
+pipe = ov_genai.LLMPipeline(models_path, "CPU")
+result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+perf_metrics = result.perf_metrics
+raw_metrics = perf_metrics.raw_metrics
+
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
+print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
+print(f'Timestamps: {" ms, ".join(f"{i:.2f}" for i in raw_metrics.m_new_token_times)}')
+```
+
+Getting pure inference time without tokenization and detokenization duration:
+```python
+import openvino_genai as ov_genai
+import numpy as np
+pipe = ov_genai.LLMPipeline(models_path, "CPU")
+result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+perf_metrics = result.perf_metrics
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
+
+raw_metrics = perf_metrics.raw_metrics
+generate_duration = np.array(raw_metrics.generate_durations)
+tok_detok_duration = np.array(raw_metrics.tokenization_durations) + np.array(raw_metrics.detokenization_durations)
+pure_inference_duration = np.sum(generate_duration - tok_detok_duration) / 1000  # convert microseconds to milliseconds
+print(f'Pure Inference duration: {pure_inference_duration:.2f} ms')
+```
+
+Example of using raw metrics to calculate the median of the token-to-token generation durations:
+```python
+import openvino_genai as ov_genai
+import numpy as np
+pipe = ov_genai.LLMPipeline(models_path, "CPU")
+result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+perf_metrics = result.perf_metrics
+raw_metrics = perf_metrics.raw_metrics
+
+print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
+print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
+durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m_new_token_times[:-1])
+print(f'Median from token to token duration: {np.median(durations):.2f} ms')
+```
+
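+Percentiles can be computed the same way. Below is a minimal sketch; it assumes, as the examples above suggest, that `m_durations` holds per-token durations in microseconds:
+```python
+import openvino_genai as ov_genai
+import numpy as np
+pipe = ov_genai.LLMPipeline(models_path, "CPU")
+result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
+raw_metrics = result.perf_metrics.raw_metrics
+
+# m_durations holds per-token durations in microseconds; convert to milliseconds.
+durations_ms = np.array(raw_metrics.m_durations) / 1000
+print(f'P50 token latency: {np.percentile(durations_ms, 50):.2f} ms')
+print(f'P90 token latency: {np.percentile(durations_ms, 90):.2f} ms')
+```
+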
 For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples.
 
 ## How It Works
diff --git a/src/cpp/src/perf_metrics.cpp b/src/cpp/src/perf_metrics.cpp
index 822246f01..6ec88fa31 100644
--- a/src/cpp/src/perf_metrics.cpp
+++ b/src/cpp/src/perf_metrics.cpp
@@ -29,7 +29,6 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSeconds>& durations) {
@@ ... @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
         auto start_time_val = *start_time;
         auto& tok_times = raw_metrics.m_new_token_times;
         auto& batch_sizes = raw_metrics.m_batch_sizes;
-        raw_metrics.m_durations = std::vector<MicroSeconds>(tok_times.size());
+        raw_metrics.m_durations.reserve(tok_times.size());
 
         auto ttft = tok_times[0] - start_time_val;
         raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
         raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
-        num_generated_tokens = 0;
-        for (size_t i = 0; i < tok_times.size(); ++i) {
-            raw_metrics.m_durations[i] = tok_times[i] - start_time_val;
-
+        num_generated_tokens = batch_sizes[0];
+
+        // The very first infer request (prefill stage) is slower than subsequent ones since we process a whole sequence of tokens.
+        // To get a clearer TPOT number, the time taken to generate the very first token at the prefill stage
+        // must not be included in the TPOT calculation. The first duration used for TPOT is from the first token
+        // to the second token, not from the start time to the first token.
+        for (size_t i = 1; i < tok_times.size(); ++i) {
             // If in 10 ms a batch of 5 new tokens is generated then TPOT is 10 / 5 = 2 ms per token.
-            raw_metrics.m_durations[i] /= batch_sizes[i];
+            raw_metrics.m_durations.emplace_back((tok_times[i] - tok_times[i - 1]) / batch_sizes[i]);
             num_generated_tokens += batch_sizes[i];
-            start_time_val = tok_times[i];
         }
     }
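The new loop computes each duration from consecutive token timestamps instead of from the generation start time, so the slow prefill step only affects TTFT. A minimal Python sketch of the same arithmetic, with made-up timestamps, illustrates the difference:

```python
# Made-up timestamps in ms: generation starts at 1000.0; the first token
# appears after a slow prefill step, subsequent tokens arrive every 12 ms.
start_time = 1000.0
tok_times = [1250.0, 1262.0, 1274.0, 1286.0]
batch_sizes = [1, 1, 1, 1]

ttft = (tok_times[0] - start_time) / batch_sizes[0]  # 250.0 ms, includes prefill

# Old behaviour: the first duration spans start_time -> first token, inflating TPOT.
old = [tok_times[0] - start_time] + [b - a for a, b in zip(tok_times, tok_times[1:])]
# New behaviour: durations start from the first-token -> second-token interval.
new = [(tok_times[i] - tok_times[i - 1]) / batch_sizes[i] for i in range(1, len(tok_times))]

print(sum(old) / len(old))  # 71.5 ms/token, skewed by prefill
print(sum(new) / len(new))  # 12.0 ms/token
```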
diff --git a/src/python/py_perf_metrics.cpp b/src/python/py_perf_metrics.cpp
index 500ce0abe..679acc2b9 100644
--- a/src/python/py_perf_metrics.cpp
+++ b/src/python/py_perf_metrics.cpp
@@ -32,8 +32,8 @@ auto raw_perf_metrics_docstring = R"(
     :param m_times_to_first_token: Times to the first token for each call in microseconds.
     :type m_times_to_first_token: List[MicroSeconds]
 
-    :param m_new_token_times: Time points for each new token generated.
-    :type m_new_token_times: List[TimePoint]
+    :param m_new_token_times: Timestamps of every generated token or batch of tokens, in milliseconds.
+    :type m_new_token_times: List[MilliSeconds]
 
     :param m_batch_sizes: Batch sizes for each generate call.
     :type m_batch_sizes: List[int]
@@ -109,6 +109,22 @@ std::vector<float> get_ms(const T& instance, U T::*member) {
     return res;
 }
 
+template <typename T, typename U>
+std::vector<double> timestamp_to_ms(const T& instance, U T::*member) {
+    // Converts C++ time points to double so that they can be used in Python.
+    // Use double instead of float because a timestamp in ms contains 14 digits,
+    // while float only allows storing ~7 significant digits,
+    // and the current timestamp (number of seconds since 1970) is already 11 digits.
+    std::vector<double> res;
+    const auto& timestamps = instance.*member;
+    res.reserve(timestamps.size());
+    std::transform(timestamps.begin(), timestamps.end(), std::back_inserter(res),
+                   [](const auto& timestamp) {
+                       return std::chrono::duration<double, std::milli>(timestamp.time_since_epoch()).count();
+                   });
+    return res;
+}
+
 } // namespace
 
 void init_perf_metrics(py::module_& m) {
@@ -126,6 +142,9 @@ void init_perf_metrics(py::module_& m) {
         .def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) {
             return get_ms(rw, &RawPerfMetrics::m_times_to_first_token);
         })
+        .def_property_readonly("m_new_token_times", [](const RawPerfMetrics &rw) {
+            return timestamp_to_ms(rw, &RawPerfMetrics::m_new_token_times);
+        })
         .def_property_readonly("m_durations", [](const RawPerfMetrics &rw) {
             return get_ms(rw, &RawPerfMetrics::m_durations);
         })
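The `timestamp_to_ms` comment about `double` vs `float` can be checked from Python with a quick sketch (the value below is arbitrary, chosen only to be a present-day epoch timestamp in milliseconds):

```python
import numpy as np

ts_ms = 1_730_000_000_000.123  # ~13 integer digits plus sub-millisecond fraction
print(abs(float(np.float32(ts_ms)) - ts_ms))  # float32: error on the order of tens of seconds
print(abs(float(np.float64(ts_ms)) - ts_ms))  # float64: 0.0, sub-millisecond precision kept
```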