| 
4 | 4 | 
 
  | 
5 | 5 | import numpy as np  | 
6 | 6 | 
 
  | 
7 |  | -from vllm.config import SpeculativeConfig  | 
8 | 7 | from vllm.logger import init_logger  | 
9 | 8 | 
 
  | 
10 | 9 | logger = init_logger(__name__)  | 
 | 
14 | 13 | class SpecDecodingStats:  | 
15 | 14 |     num_draft_tokens: int = 0  | 
16 | 15 |     num_accepted_tokens: int = 0  | 
17 |  | -    num_emitted_tokens: int = 0  | 
18 | 16 | 
 
  | 
19 | 17 |     def take(self):  | 
20 | 18 |         copied = SpecDecodingStats(self.num_draft_tokens,  | 
21 |  | -                                   self.num_accepted_tokens,  | 
22 |  | -                                   self.num_emitted_tokens)  | 
 | 19 | +                                   self.num_accepted_tokens)  | 
23 | 20 |         self.reset()  | 
24 | 21 |         return copied  | 
25 | 22 | 
 
  | 
26 | 23 |     def reset(self):  | 
27 | 24 |         self.num_draft_tokens = 0  | 
28 | 25 |         self.num_accepted_tokens = 0  | 
29 |  | -        self.num_emitted_tokens = 0  | 
30 | 26 | 
 
  | 
31 |  | -    def observe(self, num_draft_tokens: int, num_accepted_tokens: int,  | 
32 |  | -                num_emitted_tokens: int):  | 
 | 27 | +    def observe(self, num_draft_tokens: int, num_accepted_tokens: int):  | 
33 | 28 |         self.num_draft_tokens += num_draft_tokens  | 
34 | 29 |         self.num_accepted_tokens += num_accepted_tokens  | 
35 |  | -        self.num_emitted_tokens += num_emitted_tokens  | 
36 | 30 | 
 
  | 
37 | 31 | 
 
  | 
38 | 32 | class SpecDecodingMetrics:  | 
39 | 33 | 
 
  | 
40 |  | -    def __init__(self, speculative_config: SpeculativeConfig):  | 
41 |  | -        self.num_spec_tokens = (speculative_config.num_speculative_tokens  | 
42 |  | -                                if speculative_config is not None else 0)  | 
 | 34 | +    def __init__(self):  | 
43 | 35 |         self.reset()  | 
44 | 36 | 
 
  | 
45 | 37 |     def reset(self):  | 
46 | 38 |         self.num_draft_tokens: list[int] = []  | 
47 | 39 |         self.num_accepted_tokens: list[int] = []  | 
48 |  | -        self.num_emitted_tokens: list[int] = []  | 
49 | 40 | 
 
  | 
50 | 41 |     def observe(self, spec_decoding_stats: SpecDecodingStats):  | 
51 | 42 |         self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)  | 
52 | 43 |         self.num_accepted_tokens.append(  | 
53 | 44 |             spec_decoding_stats.num_accepted_tokens)  | 
54 |  | -        self.num_emitted_tokens.append(spec_decoding_stats.num_emitted_tokens)  | 
55 | 45 | 
 
  | 
56 | 46 |     def log(self):  | 
57 | 47 |         num_draft_tokens = np.sum(self.num_draft_tokens)  | 
58 | 48 |         num_accepted_tokens = np.sum(self.num_accepted_tokens)  | 
59 |  | -        num_emitted_tokens = np.sum(self.num_emitted_tokens)  | 
60 | 49 | 
 
  | 
61 | 50 |         draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens  | 
62 | 51 |                                  if num_draft_tokens > 0 else float("nan"))  | 
63 | 52 | 
 
  | 
64 | 53 |         logger.info(  | 
65 | 54 |             "Speculative metrics: "  | 
66 | 55 |             "Draft acceptance rate: %.3f, "  | 
67 |  | -            "Number of speculative tokens: %d, "  | 
68 | 56 |             "Number of accepted tokens: %d, "  | 
69 |  | -            "Number of draft tokens: %d, "  | 
70 |  | -            "Number of emitted tokens: %d.", draft_acceptance_rate,  | 
71 |  | -            num_accepted_tokens, num_draft_tokens, num_emitted_tokens)  | 
 | 57 | +            "Number of draft tokens: %d, ", draft_acceptance_rate,  | 
 | 58 | +            num_accepted_tokens, num_draft_tokens)  | 
72 | 59 |         self.reset()  | 
0 commit comments