-
Notifications
You must be signed in to change notification settings - Fork 94
/
Copy pathingestor.py
61 lines (45 loc) · 1.49 KB
/
ingestor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from abc import ABC, abstractmethod
from functools import partial
import ijson
import csv
from datasets import Dataset, load_dataset, concatenate_datasets
def get_ingestor(data_type: str):
    """Look up the ingestor class that handles a given kind of data source.

    Args:
        data_type: Source kind; one of "json", "csv", or "huggingface".

    Returns:
        The matching ingestor class itself (not an instance).

    Raises:
        ValueError: If ``data_type`` is not one of the supported kinds.
    """
    # Reject unsupported kinds up front so the table lookup below cannot miss.
    if data_type not in ("json", "csv", "huggingface"):
        raise ValueError(
            f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}"
        )

    ingestors = {
        "json": JsonIngestor,
        "csv": CsvIngestor,
        "huggingface": HuggingfaceIngestor,
    }
    return ingestors[data_type]
class Ingestor(ABC):
    """Abstract interface for a data source that can produce a ``Dataset``."""

    @abstractmethod
    def to_dataset(self) -> Dataset:
        """Build and return the ``Dataset`` for this source.

        Declared abstract: concrete subclasses must override this method
        before they can be instantiated.
        """
class JsonIngestor(Ingestor):
    """Ingestor that streams records out of a JSON file into a ``Dataset``."""

    def __init__(self, path: str):
        """Store the location of the JSON file.

        Args:
            path: Path handed to ``open`` when the file is read.
        """
        self.path = path

    def _json_generator(self):
        """Yield every value ijson produces for the ``"item"`` prefix.

        The file is opened in binary mode and parsed incrementally by
        ``ijson.items`` rather than being read in one call. In ijson the
        ``"item"`` prefix addresses the elements of a top-level JSON array.
        """
        with open(self.path, "rb") as stream:
            yield from ijson.items(stream, "item")

    def to_dataset(self) -> Dataset:
        """Create a ``Dataset`` from the records yielded by ``_json_generator``."""
        record_source = self._json_generator
        return Dataset.from_generator(record_source)
class CsvIngestor(Ingestor):
    """Ingestor that reads the rows of a CSV file into a ``Dataset``."""

    def __init__(self, path: str):
        """Store the location of the CSV file.

        Args:
            path: Path handed to ``open`` when the file is read.
        """
        self.path = path

    def _csv_generator(self):
        """Yield one dict per data row, as produced by ``csv.DictReader``.

        No explicit field names are given, so the reader takes the column
        names from the first row of the file.
        """
        # newline="" is what the csv module requires of file objects: it
        # disables universal-newline translation so that line breaks embedded
        # inside quoted fields reach the parser unchanged.
        with open(self.path, newline="") as csvfile:
            yield from csv.DictReader(csvfile)

    def to_dataset(self) -> Dataset:
        """Create a ``Dataset`` from the rows yielded by ``_csv_generator``."""
        return Dataset.from_generator(self._csv_generator)
class HuggingfaceIngestor(Ingestor):
    """Ingestor that loads a dataset via ``load_dataset`` and merges its splits."""

    def __init__(self, path: str):
        """Store the dataset identifier.

        Args:
            path: Value passed as the first argument to ``load_dataset``.
        """
        self.path = path

    def to_dataset(self) -> Dataset:
        """Load the dataset and concatenate all of its splits into one ``Dataset``.

        ``load_dataset`` is called without a ``split`` argument, so the result
        is expected to be a mapping of split name to dataset
        (NOTE(review): confirm for the ``datasets`` version in use).
        """
        splits = load_dataset(self.path)
        # concatenate_datasets is documented to take a list of datasets; a
        # dict view is not subscriptable, so materialize it before the call.
        return concatenate_datasets(list(splits.values()))