diff --git a/examples/script_data_processing/make_hf_dataset.py b/examples/script_data_processing/make_hf_dataset.py
index 7c5c059..db2e968 100644
--- a/examples/script_data_processing/make_hf_dataset.py
+++ b/examples/script_data_processing/make_hf_dataset.py
@@ -14,35 +14,40 @@ def make_json_serializable(input_dict):
     return input_dict
 
 
-
 def make_hf_dataset(source_dir, target_dir, split='test'):
     data_pkl = load_pickle(source_dir)
 
-    data_pkl['dim_process'] = int(data_pkl['dim_process'])
-
-    data_json = dict({'dim_process': int(data_pkl['dim_process'])})
-
-    data_json['event_seqs'] = dict()
+    dim_process = int(data_pkl['dim_process'])
 
-    seq_len = []
+    data_json = []
     for idx, seq in enumerate(data_pkl[split]):
-        data_json['event_seqs'][f'seq_{idx}'] = dict()
-        seq_len.append(len(seq))
+        seq_len = len(seq)
+        time_since_start, time_since_last_event, type_event = [], [], []
         for idx_event, event in enumerate(data_pkl[split][idx]):
-            if idx_event == 0:
+            if idx_event == 0 and event['time_since_start'] > 0:
                 start_timestamp = event['time_since_start']
             event['time_since_last_event'] -= start_timestamp if event[
                 'time_since_last_event'] == start_timestamp else \
-                event['time_since_last_event']
-            event['time_since_start'] -= start_timestamp
-            data_json['event_seqs'][f'seq_{idx}'][f'event_{idx_event}'] = make_json_serializable(event)
-
-    data_json['num_seqs'] = len(data_pkl[split])
-    data_json['avg_seq_len'] = np.mean(seq_len)
-    data_json['min_seq_len'] = min(seq_len)
-    data_json['max_seq_len'] = max(seq_len)
+                event['time_since_last_event']
+            event['time_since_start'] -= start_timestamp
+            event = make_json_serializable(event)
+            time_since_start.append(event['time_since_start'])
+            time_since_last_event.append(event['time_since_last_event'])
+            type_event.append(event['type_event'])
+
+        temp_dict = {'dim_process': dim_process,
+                     'seq_idx': idx,
+                     'seq_len': seq_len,
+                     'time_since_start': time_since_start,
+                     'time_since_last_event': time_since_last_event,
+                     'type_event': type_event}
+        data_json.append(temp_dict)
 
     with open(target_dir, "w") as outfile:
         json.dump(data_json, outfile)
 
     return
+
+
+if __name__ == '__main__':
+    make_hf_dataset('../data/taxi/test.pkl', 'test.json', split='test')
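
After this change, each element of the written JSON file is one flat record per sequence (dim_process, seq_idx, seq_len, plus the three parallel event lists), rather than the old nested seq_*/event_* dictionaries. A minimal sketch of loading the converted file follows; the use of the Hugging Face datasets package and the "test" split name are assumptions for illustration, not part of this change, and any JSON reader works since the output is a plain list of dicts:

    # Sketch only: load the file written by make_hf_dataset above.
    # Assumes `pip install datasets`; split name "test" is arbitrary.
    from datasets import load_dataset

    ds = load_dataset('json', data_files={'test': 'test.json'})['test']

    # Expected columns: dim_process, seq_idx, seq_len,
    # time_since_start, time_since_last_event, type_event
    print(ds.column_names)
    print(ds[0]['type_event'][:5])  # event-type ids of the first sequence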