-
Notifications
You must be signed in to change notification settings - Fork 2
/
token_stats.py
74 lines (61 loc) · 2.27 KB
/
token_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import json
import os
from typing import Dict, List
import pandas as pd
from loguru import logger
from tqdm import tqdm
# Make sure the log directory exists first: opening a file in "w" mode does
# not create missing parent directories and would raise FileNotFoundError.
os.makedirs("logs", exist_ok=True)

# clear contents of log file so every run starts from an empty log
with open("logs/token_stats.log", "w"):
    pass

# set log file; the rotation setting ("500 MB") is handed to loguru as-is
logger.add("logs/token_stats.log", rotation="500 MB")
def extract_token_counts(strategy: str, split: str, examplars: bool = False) -> Dict[str, List[int]]:
    """
    Extracts token counts from data files.

    Loads the JSON data file selected by ``strategy``, ``split`` and
    ``examplars`` and collects the token counts stored on each record.

    Args:
        strategy: One of "self_ask", "direct", "baseline" or "self-ask".
        split: One of "train", "dev" or "test".
        examplars: Only consulted for the "test" split, where it selects the
            "with-examplars" file instead of the "without-examplars" one.

    Returns:
        A dict with the keys "prompt_token_counts", "target_token_counts"
        and "total_token_counts", each holding one entry per record in the
        order the records appear in the file.

    Raises:
        AssertionError: If ``strategy`` or ``split`` is not an accepted value.
        FileNotFoundError: If the selected data file does not exist.
        KeyError: If a record (assumed to be a dict) lacks one of the
            "num_prompt_tokens", "num_target_tokens" or "num_tokens" fields.
    """
    assert strategy in ["self_ask", "direct", "baseline", "self-ask"], \
        f"unsupported strategy: {strategy!r}"
    assert split in ["train", "dev", "test"], f"unsupported split: {split!r}"

    # read data file
    if split == "test":
        # evaluation files are named after the strategy and examplar status
        examplar_status = "with-examplars" if examplars else "without-examplars"
        data_file = os.path.join("data/MultihopEvaluation/", f'{strategy}-{examplar_status}.json')
    else:
        # fine-tuning files are named after the strategy and split
        data_file = os.path.join("data/FinetuningData/", f'{strategy}_{split}.json')

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale default encoding.
    with open(data_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # count tokens
    prompt_token_counts = []
    target_token_counts = []
    total_token_counts = []
    for record in tqdm(data, desc=f"Extracting token counts for {strategy}-{split} split"):
        prompt_token_counts.append(record['num_prompt_tokens'])
        target_token_counts.append(record['num_target_tokens'])
        total_token_counts.append(record['num_tokens'])

    return {
        "prompt_token_counts": prompt_token_counts,
        "target_token_counts": target_token_counts,
        "total_token_counts": total_token_counts,
    }
def summarize_token_counts(token_counts: Dict[str, List[int]]) -> pd.DataFrame:
    """
    Builds a table of descriptive statistics for token counts.

    Each key of ``token_counts`` becomes a column. The rows are those
    produced by ``DataFrame.describe()`` plus an extra "sum" row with the
    column totals; all values are rounded to two decimal places.
    """
    # one column per kind of token count
    counts_frame = pd.DataFrame(token_counts)

    # descriptive statistics, extended with a row of column totals
    overview = counts_frame.describe()
    overview.loc['sum'] = counts_frame.sum()

    return overview.round(decimals=2)
def main():
    """
    Logs token-count summary statistics for the "self_ask" and "direct"
    strategies on the "train" and "dev" splits, one log entry per pair.
    """
    # every (strategy, split) pair, with strategy as the outer dimension
    combinations = [
        (strategy, split)
        for strategy in ("self_ask", "direct")
        for split in ("train", "dev")
    ]
    for strategy, split in combinations:
        # extract the raw counts, then summarize them
        stats = summarize_token_counts(extract_token_counts(strategy, split))
        logger.info(f"Token stats for {strategy}-{split} split:\n{stats.to_string()}")


# run only when executed as a script, not on import
if __name__ == "__main__":
    main()