Add tensor parallel support to T5 via NxD #697

Merged on Oct 24, 2024 (29 commits).
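This PR threads a new `tensor_parallel_size` option through the Neuron exporter so that T5-style encoder-decoder models can be compiled with their weights sharded across several Neuron cores using NeuronX Distributed (NxD). A minimal usage sketch — the checkpoint name and shapes are illustrative, and it is assumed that `NeuronModelForSeq2SeqLM.from_pretrained` (documented below) accepts the same export options as the CLI flag added here:

from optimum.neuron import NeuronModelForSeq2SeqLM
from transformers import AutoTokenizer

# Compile t5-small with its weights sharded across 2 Neuron cores.
# Input shapes and the beam count are fixed at compile time.
model = NeuronModelForSeq2SeqLM.from_pretrained(
    "google-t5/t5-small",
    export=True,
    tensor_parallel_size=2,
    batch_size=1,
    sequence_length=64,
    num_beams=4,
)
model.save_pretrained("t5_neuron_tp2")

tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
inputs = tokenizer("translate English to German: Hello!", return_tensors="pt")
outputs = model.generate(**inputs, num_beams=4, max_length=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))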
25 changes: 10 additions & 15 deletions benchmark/text-generation-inference/performance/generate_csv.py
@@ -3,7 +3,6 @@
 import os
 
 import pandas as pd
-
 from guidellm.core import GuidanceReport, TextGenerationBenchmark

@@ -16,11 +15,7 @@ def _benchmark_rate_id(benchmark: TextGenerationBenchmark) -> str:
     :return: A string representing the benchmark rate ID.
     :rtype: str
     """
-    rate_id = (
-        f"{benchmark.mode}@{benchmark.rate:.2f} req/sec"
-        if benchmark.rate
-        else f"{benchmark.mode}"
-    )
+    rate_id = f"{benchmark.mode}@{benchmark.rate:.2f} req/sec" if benchmark.rate else f"{benchmark.mode}"
     return rate_id


@@ -38,20 +33,20 @@ def main():
     for path in paths:
         filename = os.path.basename(path)
         # Extract model_id
-        model_id, date = filename.replace(suffix, '').split('#')
+        model_id, date = filename.replace(suffix, "").split("#")
         with open(path) as f:
             report = GuidanceReport.from_json(f.read())
             for benchmark in report.benchmarks:
                 for b in benchmark.benchmarks_sorted:
                     d = {
                         "model_id": model_id,
                         "Date": date,
                         "Input type": _benchmark_rate_id(b),
                         "Requests per Second": b.completed_request_rate,
                         "Request Latency (s)": b.request_latency,
                         "Time-to-first-token (ms)": b.time_to_first_token,
                         "Inter Token Latency (ms)": b.inter_token_latency,
                         "Output Token Throughput (t/s)": b.output_token_throughput,
                     }
                     results.append(pd.DataFrame.from_dict(d, orient="index").transpose())

5 changes: 5 additions & 0 deletions docs/source/package_reference/modeling.mdx
@@ -68,6 +68,11 @@ The following Neuron model classes are available for natural language processing
 [[autodoc]] modeling.NeuronModelForCausalLM
     - forward
 
+### NeuronModelForSeq2SeqLM
+
+[[autodoc]] modeling_seq2seq.NeuronModelForSeq2SeqLM
+    - forward
+
 ## Computer Vision
 
 The following Neuron model classes are available for computer vision tasks.
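As a usage note for the new doc entry: reloading the compiled artifacts later should not trigger recompilation. A short sketch — the directory name is illustrative, and it is assumed that at least `tensor_parallel_size` Neuron cores are available at load time:

from optimum.neuron import NeuronModelForSeq2SeqLM

# Load previously exported encoder/decoder graphs from disk.
model = NeuronModelForSeq2SeqLM.from_pretrained("t5_neuron_tp2")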
6 changes: 6 additions & 0 deletions optimum/commands/export/neuronx.py
@@ -112,6 +112,12 @@ def parse_args_neuronx(parser: "ArgumentParser"):
         choices=["bf16", "fp16", "tf32"],
         help='The data type to cast FP32 operations to when auto-cast mode is enabled. Can be `"bf16"`, `"fp16"` or `"tf32"`.',
     )
+    optional_group.add_argument(
+        "--tensor_parallel_size",
+        type=int,
+        default=1,
+        help="Tensor parallelism degree: the number of Neuron cores on which to shard the model.",
+    )
     optional_group.add_argument(
         "--dynamic-batch-size",
         action="store_true",
26 changes: 25 additions & 1 deletion optimum/exporters/neuron/__main__.py
@@ -264,6 +264,7 @@ def get_submodels_and_neuron_configs(
     task: str,
     output: Path,
     library_name: str,
+    tensor_parallel_size: int = 1,
     subfolder: str = "",
     dynamic_batch_size: bool = False,
     model_name_or_path: Optional[Union[str, Path]] = None,
@@ -300,7 +301,14 @@
     elif is_encoder_decoder:
         optional_outputs = {"output_attentions": output_attentions, "output_hidden_states": output_hidden_states}
         models_and_neuron_configs, output_model_names = _get_submodels_and_neuron_configs_for_encoder_decoder(
-            model, input_shapes, task, output, dynamic_batch_size, model_name_or_path, **optional_outputs
+            model=model,
+            input_shapes=input_shapes,
+            tensor_parallel_size=tensor_parallel_size,
+            task=task,
+            output=output,
+            dynamic_batch_size=dynamic_batch_size,
+            model_name_or_path=model_name_or_path,
+            **optional_outputs,
         )
     else:
         # TODO: Enable optional outputs for encoders
@@ -427,6 +435,7 @@ def _get_submodels_and_neuron_configs_for_stable_diffusion(
 def _get_submodels_and_neuron_configs_for_encoder_decoder(
     model: "PreTrainedModel",
     input_shapes: Dict[str, int],
+    tensor_parallel_size: int,
     task: str,
     output: Path,
     dynamic_batch_size: bool = False,
@@ -442,15 +451,19 @@ def _get_submodels_and_neuron_configs_for_encoder_decoder(
     models_and_neuron_configs = get_encoder_decoder_models_for_export(
         model=model,
         task=task,
+        tensor_parallel_size=tensor_parallel_size,
         dynamic_batch_size=dynamic_batch_size,
         input_shapes=input_shapes,
         output_attentions=output_attentions,
         output_hidden_states=output_hidden_states,
+        model_name_or_path=model_name_or_path,
     )
     output_model_names = {
         ENCODER_NAME: os.path.join(ENCODER_NAME, NEURON_FILE_NAME),
         DECODER_NAME: os.path.join(DECODER_NAME, NEURON_FILE_NAME),
     }
+    model.config.save_pretrained(output)
+    model.generation_config.save_pretrained(output)
     maybe_save_preprocessors(model_name_or_path, output)
 
     return models_and_neuron_configs, output_model_names
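With the two `save_pretrained` calls added above, the export directory becomes self-contained. A sketch of checking the expected layout — the `model.neuron` filename is an assumption about the value of `NEURON_FILE_NAME`, which is not shown in this diff:

from pathlib import Path

output = Path("t5_neuron_tp2")
expected = [
    output / "encoder" / "model.neuron",  # ENCODER_NAME / NEURON_FILE_NAME
    output / "decoder" / "model.neuron",  # DECODER_NAME / NEURON_FILE_NAME
    output / "config.json",               # from model.config.save_pretrained
    output / "generation_config.json",    # from model.generation_config.save_pretrained
]
for path in expected:
    print(path, "ok" if path.exists() else "missing")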
@@ -475,6 +488,7 @@ def load_models_and_neuron_configs(
     lora_weight_names: Optional[Union[str, List[str]]],
     lora_adapter_names: Optional[Union[str, List[str]]],
     lora_scales: Optional[Union[float, List[float]]],
+    tensor_parallel_size: int = 1,
     controlnet_ids: Optional[Union[str, List[str]]] = None,
     output_attentions: bool = False,
     output_hidden_states: bool = False,
@@ -499,6 +513,7 @@
     models_and_neuron_configs, output_model_names = get_submodels_and_neuron_configs(
         model=model,
         input_shapes=input_shapes,
+        tensor_parallel_size=tensor_parallel_size,
         task=task,
         library_name=library_name,
         output=output,
@@ -522,6 +537,7 @@ def main_export(
     model_name_or_path: str,
     output: Union[str, Path],
     compiler_kwargs: Dict[str, Any],
+    tensor_parallel_size: int = 1,
     model: Optional[Union["PreTrainedModel", "ModelMixin"]] = None,
     task: str = "auto",
     dynamic_batch_size: bool = False,
@@ -563,6 +579,7 @@
         model_name_or_path=model_name_or_path,
         output=output,
         model=model,
+        tensor_parallel_size=tensor_parallel_size,
         task=task,
         dynamic_batch_size=dynamic_batch_size,
         cache_dir=cache_dir,
@@ -597,6 +614,12 @@
     )
 
     # Validate compiled model
+    if do_validation and tensor_parallel_size > 1:
+        # TODO: support the validation of tp models.
+        logger.warning(
+            "Validation is not yet supported for tensor parallel models; it will be turned off."
+        )
+        do_validation = False
     if do_validation is True:
         try:
             validate_models_outputs(
@@ -698,6 +721,7 @@ def main():
         model_name_or_path=args.model,
         output=args.output,
         compiler_kwargs=compiler_kwargs,
+        tensor_parallel_size=args.tensor_parallel_size,
         task=task,
         dynamic_batch_size=args.dynamic_batch_size,
         atol=args.atol,
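For completeness, the same path can be driven programmatically. A hedged sketch of calling `main_export` with the new argument — the `compiler_kwargs` keys and the forwarding of static input shapes as extra keyword arguments are assumptions, not confirmed by this diff:

from optimum.exporters.neuron import main_export

# Validation is skipped automatically when tensor_parallel_size > 1
# (see the warning added above).
main_export(
    model_name_or_path="google-t5/t5-small",
    output="t5_neuron_tp2",
    compiler_kwargs={"auto_cast": "matmul", "auto_cast_type": "bf16"},  # assumed keys
    tensor_parallel_size=2,
    task="text2text-generation",
    batch_size=1,        # assumed to be forwarded as static input shapes
    sequence_length=64,
    num_beams=4,
)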
10 changes: 10 additions & 0 deletions optimum/exporters/neuron/base.py
@@ -146,6 +146,7 @@ def __init__(
         task: str,
         compiler_type: Optional[str] = None,
         compiler_version: Optional[str] = None,
+        tensor_parallel_size: int = 1,
         batch_size: Optional[int] = None,
         text_batch_size: Optional[int] = None,
         image_batch_size: Optional[int] = None,
@@ -174,6 +175,7 @@
         self._config = config
         self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config)
         self.mandatory_axes = ()
+        self.tp_degree = tensor_parallel_size
         self.task = task
         self._axes: Dict[str, int] = {}
         self.dynamic_batch_size = dynamic_batch_size
@@ -227,6 +229,14 @@ def task(self, value: str):
         self._task = value
         self.mandatory_axes = self.get_mandatory_axes_for_task(self.task)
 
+    @property
+    def tp_degree(self) -> int:
+        return self._tp_degree
+
+    @tp_degree.setter
+    def tp_degree(self, value: int):
+        self._tp_degree = value
+
     def __getattr__(self, attr_name) -> Any:
         if attr_name != "_axes" and attr_name in self._axes:
             return self._axes[attr_name]
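Exposing `tp_degree` through a property/setter pair, rather than a plain attribute, leaves room for subclasses to validate the requested degree. A hypothetical sketch — the `NeuronDefaultConfig` base-class name, its import path, and the divisibility check are assumptions, not part of this diff:

from optimum.exporters.neuron.base import NeuronDefaultConfig  # assumed location

class MyT5NeuronConfig(NeuronDefaultConfig):
    @NeuronDefaultConfig.tp_degree.setter
    def tp_degree(self, value: int):
        # Example rule: the tensor parallel degree should divide the head count.
        heads = self._normalized_config.num_attention_heads
        if value > 1 and heads % value != 0:
            raise ValueError(f"tensor_parallel_size={value} must divide {heads} attention heads.")
        self._tp_degree = value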