Skip to content

Commit

Permalink
memmap worker arg (#7062)
Browse files Browse the repository at this point in the history
* memmap worker arg

Signed-off-by: arendu <adithya.r@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update

Signed-off-by: arendu <adithya.r@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update

Signed-off-by: arendu <adithya.r@gmail.com>

* update

Signed-off-by: arendu <adithya.r@gmail.com>

---------

Signed-off-by: arendu <adithya.r@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Signed-off-by: jubick1337 <mattyson.so@gmail.com>
  • Loading branch information
2 people authored and jubick1337 committed Aug 8, 2023
1 parent ba16234 commit 4957058
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 2 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 0
memmap_workers: null
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -143,6 +144,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: False
num_workers: 0
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -170,6 +172,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: False
num_workers: 4
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 4
memmap_workers: null
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -109,6 +110,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 4
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down Expand Up @@ -137,6 +139,7 @@ model:
micro_batch_size: ${model.micro_batch_size}
shuffle: True
num_workers: 4
memmap_workers: ${model.data.train_ds.memmap_workers}
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import numpy as np
import torch

Expand Down Expand Up @@ -40,12 +42,13 @@ def __init__(
label_key: str = "answer",
separate_prompt_and_response_with_newline: bool = False,
answer_only_loss: bool = True,
truncation_field: str = "answer",
truncation_field: str = "context",
pad_to_max_length: bool = False, # (@adithyare) allows for much faster training especially in PEFT settings.
index_mapping_dir: str = None,
prompt_template: str = None,
virtual_tokens: int = 0,
tokens_to_generate: int = 0,
memmap_workers: Optional[int] = None,
):
"""
file_path: Path to a JSONL GPT supervised fine-tuning dataset. Data is formatted as multiple JSON lines with each line formatted as follows. {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
Expand Down Expand Up @@ -94,7 +97,11 @@ def __init__(
assert self.truncation_field in ["answer", "context"]

self.indexed_dataset = JSONLMemMapDataset(
dataset_paths=[file_path], tokenizer=None, header_lines=0, index_mapping_dir=index_mapping_dir
dataset_paths=[file_path],
tokenizer=None,
header_lines=0,
index_mapping_dir=index_mapping_dir,
workers=memmap_workers,
)

# Will be None after this call if `max_num_samples` is None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,9 @@ def _build_dataset(self, data_cfg, is_train=True):
tokens_to_generate=data_cfg.get(
'tokens_to_generate', 0
), # used at inference time to allocate tensor positions for the tokens that will be generated by the inference procedure.
memmap_workers=data_cfg.get(
'memmap_workers', None
), # number of workers used to create the memmap index files
)
datasets.append(dataset)

Expand Down

0 comments on commit 4957058

Please sign in to comment.