Skip to content

Commit

Permalink
Use a context-manager when opening files (NVIDIA#10895)
Browse files (browse the repository at this point in the history)
* Use a context-manager when opening files

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>

* Apply isort and black reformatting

Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>

* Apply isort and black reformatting

Signed-off-by: artbataev <artbataev@users.noreply.github.com>

---------

Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
Signed-off-by: akoumpa <akoumpa@users.noreply.github.com>
Signed-off-by: artbataev <artbataev@users.noreply.github.com>
Co-authored-by: akoumpa <akoumpa@users.noreply.github.com>
Co-authored-by: artbataev <artbataev@users.noreply.github.com>
  • Loading branch information
3 people authored and HuiyingLi committed Nov 15, 2024
1 parent 2260f7c commit 5e109cc
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions nemo/collections/nlp/data/language_modeling/text_memmap_dataset.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -260,7 +260,8 @@ def load_file(self, fn, index_mapping_dir: Optional[str] = None):
raise RuntimeError(f"Missing header, expected {self._header_lines} header lines")

# load meta info
idx_info_dict = pickle.load(open(idx_fn + ".info", "rb"))
with open(idx_fn + ".info", "rb") as fp:
idx_info_dict = pickle.load(fp)
# test for mismatch in expected newline_int
if "newline_int" in idx_info_dict:
newline_int = idx_info_dict["newline_int"]
Expand Down Expand Up @@ -378,9 +379,7 @@ def __init__(
self._data_sep = data_sep

def _build_data_from_text(self, text: str):
"""
"""
""" """
_build_data_from_text = super()._build_data_from_text
data = {}
text_fields = text.split(self._data_sep)
Expand Down Expand Up @@ -513,7 +512,11 @@ def _build_memmap_index_files(newline_int, build_index_fn, fn, index_mapping_dir


def build_index_files(
dataset_paths, newline_int, workers=None, build_index_fn=_build_index_from_memdata, index_mapping_dir: str = None,
dataset_paths,
newline_int,
workers=None,
build_index_fn=_build_index_from_memdata,
index_mapping_dir: str = None,
):
"""Auxiliary method to build multiple index files"""
if len(dataset_paths) < 1:
Expand All @@ -528,7 +531,12 @@ def build_index_files(
ctx = mp.get_context("fork")
with ctx.Pool(workers) as p:
build_status = p.map(
partial(_build_memmap_index_files, newline_int, build_index_fn, index_mapping_dir=index_mapping_dir,),
partial(
_build_memmap_index_files,
newline_int,
build_index_fn,
index_mapping_dir=index_mapping_dir,
),
dataset_paths,
)

Expand Down

0 comments on commit 5e109cc

Please sign in to comment.