
exclude prefill from calculating TPOT statistics, docs for RawPerfMetrics #1118

Merged
merged 14 commits into from Nov 5, 2024
57 changes: 57 additions & 0 deletions src/README.md
@@ -304,6 +304,7 @@ mean_tpot 3.80

>**Note**: If the input prompt is just a string, the generate function returns only a string without perf_metrics. To obtain perf_metrics, provide the prompt as a list with at least one element or call generate with encoded inputs.

#### Accumulating metrics
Several `perf_metrics` objects can be added to each other. In that case `raw_metrics` are concatenated and mean/std values are recalculated. This accumulates statistics from several `generate()` calls.

```cpp
@@ -338,6 +339,62 @@ print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
```
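The same accumulation can be done from Python. A minimal sketch, assuming the addition operator is exposed on `perf_metrics` in the Python bindings (the prompts and `models_path` are illustrative):

```python
import openvino_genai as ov_genai

pipe = ov_genai.LLMPipeline(models_path, "CPU")
res_1 = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
res_2 = pipe.generate(["The Moon is grey because"], max_new_tokens=20)

# Adding two perf_metrics objects concatenates their raw_metrics
# and recalculates the mean/std statistics over the combined data.
perf_metrics = res_1.perf_metrics + res_2.perf_metrics

print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
print(f'TPOT: {perf_metrics.get_tpot().mean:.2f} ms/token')
```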

#### Using raw performance metrics
In addition to mean and standard deviation values, the `perf_metrics` object has a `raw_metrics` field. This field stores raw data, including:

- Timestamps for each batch of generated tokens
- Batch sizes for each timestamp
- Tokenization durations
- Detokenization durations
- Other relevant metrics

These metrics can be used for finer-grained analysis, such as calculating exact median values, percentiles, etc. Below are a few examples of how to use raw metrics.

Getting timestamps for each generated token:
```python
import openvino_genai as ov_genai
pipe = ov_genai.LLMPipeline(models_path, "CPU")
result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
perf_metrics = result.perf_metrics
raw_metrics = perf_metrics.raw_metrics

print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
print(f'Timestamps: {", ".join(f"{i:.2f} ms" for i in raw_metrics.m_new_token_times)}')
```

Getting the pure inference time, excluding tokenization and detokenization durations:
```python
import openvino_genai as ov_genai
import numpy as np
pipe = ov_genai.LLMPipeline(models_path, "CPU")
result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
perf_metrics = result.perf_metrics
print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')

raw_metrics = perf_metrics.raw_metrics
generate_duration = np.array(raw_metrics.generate_durations)
tok_detok_duration = np.array(raw_metrics.tokenization_durations) + np.array(raw_metrics.detokenization_durations)
pure_inference_duration = np.mean(generate_duration - tok_detok_duration) / 1000  # in milliseconds
print(f'Pure inference duration: {pure_inference_duration:.2f} ms')
```

Example of using raw metrics to calculate the median token-to-token (inter-token) duration:
```python
import openvino_genai as ov_genai
import numpy as np
pipe = ov_genai.LLMPipeline(models_path, "CPU")
result = pipe.generate(["The Sun is yellow because"], max_new_tokens=20)
perf_metrics = result.perf_metrics
raw_metrics = perf_metrics.raw_metrics

print(f'Generate duration: {perf_metrics.get_generate_duration().mean:.2f} ms')
print(f'Throughput: {perf_metrics.get_throughput().mean:.2f} tokens/s')
durations = np.array(raw_metrics.m_new_token_times[1:]) - np.array(raw_metrics.m_new_token_times[:-1])
print(f'Median token-to-token duration: {np.median(durations):.2f} ms')
```
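The same raw data can also be used for percentiles. A short sketch that reuses the `durations` array (token-to-token latencies in milliseconds) computed in the example above:

```python
import numpy as np

# Tail latency is often more informative than the mean alone.
p50, p90, p99 = np.percentile(durations, [50, 90, 99])
print(f'Token-to-token latency p50: {p50:.2f} ms, p90: {p90:.2f} ms, p99: {p99:.2f} ms')
```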

For more examples of how metrics are used, please refer to the Python [benchmark_genai.py](../samples/python/benchmark_genai/README.md) and C++ [benchmark_genai](../samples/cpp/benchmark_genai/README.md) samples.

## How It Works
17 changes: 9 additions & 8 deletions src/cpp/src/perf_metrics.cpp
@@ -29,7 +29,6 @@ ov::genai::MeanStdPair calc_mean_and_std(const std::vector<ov::genai::MicroSecon
return {mean, std};
}


} // namespace

namespace ov {
@@ -102,19 +101,21 @@ void PerfMetrics::evaluate_statistics(std::optional<TimePoint> start_time) {
auto start_time_val = *start_time;
auto& tok_times = raw_metrics.m_new_token_times;
auto& batch_sizes = raw_metrics.m_batch_sizes;
raw_metrics.m_durations = std::vector<MicroSeconds>(tok_times.size());
raw_metrics.m_durations.reserve(tok_times.size());

auto ttft = tok_times[0] - start_time_val;
raw_metrics.m_times_to_first_token = std::vector<MicroSeconds>();
raw_metrics.m_times_to_first_token.emplace_back(ttft / batch_sizes[0]);
num_generated_tokens = 0;
for (size_t i = 0; i < tok_times.size(); ++i) {
raw_metrics.m_durations[i] = tok_times[i] - start_time_val;

num_generated_tokens = batch_sizes[0];

// The very first infer request (prefill stage) is slower than subsequent ones since we process a sequence of tokens.
// To have a clearer TPOT number, the time taken to generate the very first token at the prefill stage
// must not be included in the TPOT calculation. The first duration used for TPOT is from the first token
// to the second token, not from the start time to the first token.
for (size_t i = 1; i < tok_times.size(); ++i) {
// If a batch of 5 new tokens is generated in 10 ms, then TPOT is 10 / 5 = 2 ms/token.
raw_metrics.m_durations[i] /= batch_sizes[i];
raw_metrics.m_durations.emplace_back((tok_times[i] - tok_times[i - 1]) / batch_sizes[i]);
num_generated_tokens += batch_sizes[i];
start_time_val = tok_times[i];
}
}
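To illustrate why excluding the prefill token matters, here is a small numeric sketch (plain NumPy, made-up timestamps) comparing the old and new TPOT calculation:

```python
import numpy as np

# Hypothetical timestamps (ms) of each new token relative to the start of generate().
# The first token takes 100 ms (prefill); subsequent tokens take ~10 ms each.
start = 0.0
token_times = np.array([100.0, 110.0, 121.0, 130.0, 141.0])

# Old behaviour: durations measured from the start / previous token, prefill included.
with_prefill = np.diff(np.concatenate(([start], token_times)))
# New behaviour: only token-to-token durations, prefill excluded.
without_prefill = np.diff(token_times)

print(f'TPOT with prefill:    {with_prefill.mean():.2f} ms/token')     # 28.20
print(f'TPOT without prefill: {without_prefill.mean():.2f} ms/token')  # 10.25
```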

22 changes: 20 additions & 2 deletions src/python/py_perf_metrics.cpp
@@ -32,8 +32,8 @@ auto raw_perf_metrics_docstring = R"(
:param m_times_to_first_token: Times to the first token for each call in microseconds.
:type m_times_to_first_token: List[MicroSeconds]

:param m_new_token_times: Time points for each new token generated.
:type m_new_token_times: List[TimePoint]
:param m_new_token_times: Timestamps for each generated token or batch of tokens, in milliseconds.
:type m_new_token_times: List[MilliSeconds]

:param m_batch_sizes: Batch sizes for each generate call.
:type m_batch_sizes: List[int]
@@ -109,6 +109,21 @@ std::vector<float> get_ms(const T& instance, U T::*member) {
return res;
}

template <typename T, typename U>
std::vector<double> timestamp_to_ms(const T& instance, U T::*member) {
// Converts C++ time points to double so that they can be used in Python.
// Use double instead of float because a timestamp in milliseconds has ~14 significant digits,
// while float can only store ~7 significant digits.
std::vector<double> res;
const auto& timestamps = instance.*member;
res.reserve(timestamps.size());
std::transform(timestamps.begin(), timestamps.end(), std::back_inserter(res),
[](const auto& timestamp) {
return std::chrono::duration<double, std::milli>(timestamp.time_since_epoch()).count();
});
return res;
}
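As a quick illustrative check of the precision argument above (the timestamp value is made up), a millisecond timestamp since the epoch has roughly 13-14 significant digits, which does not fit into a 32-bit float:

```python
import numpy as np

ts_ms = 1_700_000_000_123.456   # hypothetical timestamp in ms since the epoch
print(np.float32(ts_ms))        # ~1.7e+12, sub-second precision is lost
print(np.float64(ts_ms))        # 1700000000123.456, full value preserved
```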

} // namespace

void init_perf_metrics(py::module_& m) {
@@ -126,6 +141,9 @@ void init_perf_metrics(py::module_& m) {
.def_property_readonly("m_times_to_first_token", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_times_to_first_token);
})
.def_property_readonly("m_new_token_times", [](const RawPerfMetrics &rw) {
return timestamp_to_ms(rw, &RawPerfMetrics::m_new_token_times);
})
.def_property_readonly("m_durations", [](const RawPerfMetrics &rw) {
return get_ms(rw, &RawPerfMetrics::m_durations);
})