Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

updated the util code to buffer mode write #261

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 15 additions & 3 deletions data/preprocess/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

### DEFAULT PREPROCESS UTILS ##########################################################

BUFFER_SIZE = 1000000

@dataclass
class RawSplit:
Expand Down Expand Up @@ -52,6 +53,7 @@ def write_splits(self, entities: dict, relations: dict, folder):
for split in self.splits:
split.process_triple(t, entities, relations, n=n)
for split in self.splits:
dump_buffer_to_file(split.file_buffer, split.file)
split.file.close()

def update_config(self, config: Dict) -> Dict:
Expand Down Expand Up @@ -80,11 +82,13 @@ class Split:

def prepare(self, folder: str):
self.file = open(path.join(folder, self.options["filename"]), "w")
self.file_buffer = ""
self.options["size"] = 0

def process_triple(self, triple: List, entities: Dict, relations: Dict, **kwargs):
write_triple(
self.file_buffer = write_triple(
self.file,
self.file_buffer,
entities,
relations,
triple,
Expand Down Expand Up @@ -252,10 +256,18 @@ def store_map(symbol_map: Dict, filename: str):
for symbol, index in symbol_map.items():
f.write(f"{index}\t{symbol}\n")

def dump_buffer_to_file(buffer, fptr):
    """Write the accumulated buffer to an open file object.

    Args:
        buffer: Text to write. It is rendered through an f-string first, so a
            non-string value is written as its string form.
        fptr: Object exposing a ``write`` method that accepts text, e.g. a
            file opened in text mode.

    Returns:
        None. The function neither clears ``buffer`` nor flushes or closes
        ``fptr``; both are left to the caller.
    """
    # The f-string keeps the original coercion of non-string buffers to text.
    fptr.write(f"{buffer}")

def write_triple(f, ent, rel, t, S, P, O):
def write_triple(f, buffer, ent, rel, t, S, P, O):
"""Write a triple to a file. """
f.write(str(ent[t[S]]) + "\t" + str(rel[t[P]]) + "\t" + str(ent[t[O]]) + "\n")
buffer += f"{str(ent[t[S]])}\t{str(rel[t[P]])}\t{str(ent[t[O]])}\n"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is faster to make the buffer a list of strings and then join that list into a single string at the end.

buffer_list = []
buffer_list.append(new_triple_string)

buffer = "".join(buffer_list)

if len(buffer) > BUFFER_SIZE:
dump_buffer_to_file(buffer, f)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you make sure that the rest of the buffer is written to the file everywhere it is needed? You did so in RawSplit, but process_triple is also called in other splits.

buffer = ""
return buffer
# f.write(str(ent[t[S]]) + "\t" + str(rel[t[P]]) + "\t" + str(ent[t[O]]) + "\n")


def write_dataset_yaml(config: Dict, folder: str):
Expand Down