Commit

Update make_hf_dataset.py
alilevy committed Feb 14, 2024
1 parent ee1696c commit b900cb9
Showing 1 changed file with 23 additions and 18 deletions.
41 changes: 23 additions & 18 deletions examples/script_data_processing/make_hf_dataset.py
@@ -14,35 +14,40 @@ def make_json_serializable(input_dict):

    return input_dict


def make_hf_dataset(source_dir, target_dir, split='test'):
    data_pkl = load_pickle(source_dir)

    dim_process = int(data_pkl['dim_process'])

    data_json = []
    for idx, seq in enumerate(data_pkl[split]):
        seq_len = len(seq)
        time_since_start, time_since_last_event, type_event = [], [], []
        for idx_event, event in enumerate(data_pkl[split][idx]):
            # If the sequence does not start at time zero, re-base the first
            # event's timestamps to zero.
            if idx_event == 0 and event['time_since_start'] > 0:
                start_timestamp = event['time_since_start']
                event['time_since_last_event'] -= start_timestamp if event[
                    'time_since_last_event'] == start_timestamp else \
                    event['time_since_last_event']
                event['time_since_start'] -= start_timestamp
            # Make the event dict JSON-serializable before collecting its fields.
            event = make_json_serializable(event)
            time_since_start.append(event['time_since_start'])
            time_since_last_event.append(event['time_since_last_event'])
            type_event.append(event['type_event'])

        # One JSON record per sequence, with parallel per-event lists.
        temp_dict = {'dim_process': dim_process,
                     'seq_idx': idx,
                     'seq_len': seq_len,
                     'time_since_start': time_since_start,
                     'time_since_last_event': time_since_last_event,
                     'type_event': type_event}
        data_json.append(temp_dict)

    with open(target_dir, "w") as outfile:
        json.dump(data_json, outfile)

    return


if __name__ == '__main__':
    make_hf_dataset('../data/taxi/test.pkl', 'test.json', split='test')

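As a usage sketch (not part of this commit): the JSON written above is a top-level list with one record per sequence, so it should be readable with the Hugging Face datasets JSON loader, assuming the datasets package is installed and test.json is the file produced by the call above. Field names follow temp_dict in this commit.

from datasets import load_dataset

# Hypothetical check of the generated file; 'test.json' is the output path used above.
ds = load_dataset('json', data_files={'test': 'test.json'}, split='test')
print(ds.column_names)
print(ds[0]['seq_len'], len(ds[0]['type_event']))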