
Commit 81b6abe

comments
1 parent d25f657 commit 81b6abe


3 files changed: +23, -14 lines changed


docs/source/basics/packing.rst

Lines changed: 6 additions & 4 deletions
@@ -4,10 +4,12 @@
 Sample packing
 ==============
 
-You can use sample packing with any of the single dataset builders by passing in
-:code:`packed=True`. This requires some pre-processing of the dataset which may
+Sample packing involves concatenating multiple samples from your dataset into a single sequence, up to a maximum
+sequence length. This requires some pre-processing of the dataset which may
 slow down time-to-first-batch, but can introduce significant training speedups
-depending on the dataset.
+depending on the dataset. In torchtune, sample packing is done by iterating through your dataset and performing
+greedy packing upon dataset initialization. You can use sample packing with any of the single dataset builders by passing in
+:code:`packed=True`.
 
 To set the max sequence length to pack to, make sure to define ``max_seq_len`` on your tokenizer.
 
@@ -48,5 +50,5 @@ To set the max sequence length to pack to, make sure to define ``max_seq_len`` o
 torchtune will automatically handle document masking and relative position IDs when sample packing is enabled
 to prevent different irrelevant samples from cross-attending. This is done via PyTorch's `Flex Attention <https://pytorch.org/blog/flexattention/#document-maskingjagged-sequences>`_,
 which enables the use of flash attention with non-causal masks. If your hardware does not support Flex Attention
-(for CUDA devices, it must be Turing or above), standard SDPA with ememory-efficient attention will be used as a fallback,
+(for CUDA devices, it must be Turing or above), standard SDPA with memory-efficient attention will be used as a fallback,
 while retaining the document masking and relative position IDs.
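For reference, a minimal sketch (not part of this commit) of enabling the packing described above from Python, assuming the Alpaca dataset builder and a Llama3 tokenizer; the tokenizer path is a placeholder:

# Sketch only: greedy packing runs once, at dataset initialization.
from torchtune.datasets import alpaca_dataset
from torchtune.models.llama3 import llama3_tokenizer

# max_seq_len on the tokenizer sets the length each pack is filled up to.
tokenizer = llama3_tokenizer(
    path="/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model",
    max_seq_len=8192,
)

# packed=True iterates the dataset and packs samples greedily up front.
dataset = alpaca_dataset(tokenizer=tokenizer, packed=True)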

recipes/configs/generation.yaml

Lines changed: 3 additions & 1 deletion
@@ -30,7 +30,9 @@ tokenizer:
   prompt_template: null
 
 # Generation arguments; defaults taken from gpt-fast
-prompt: "Tell me a joke?"
+prompt:
+  system: null
+  user: "Tell me a joke."
 max_new_tokens: 300
 temperature: 0.6 # 0.8 and 0.6 are popular values to try
 top_k: 300
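A hypothetical illustration (not part of the commit) of what the restructured prompt field parses to; torchtune configs are OmegaConf-based, so the system key can be left as null and read back as None:

from omegaconf import OmegaConf

# Same shape as the updated generation.yaml snippet above.
cfg = OmegaConf.create(
    """
    prompt:
      system: null
      user: "Tell me a joke."
    """
)
assert cfg.prompt.system is None
assert cfg.prompt.user == "Tell me a joke."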

recipes/generate.py

Lines changed: 14 additions & 9 deletions
@@ -13,7 +13,7 @@
 from torch import nn
 
 from torchtune import config, generation, training, utils
-from torchtune.data import Message
+from torchtune.data import Message, Role
 from torchtune.training import FullModelTorchTuneCheckpointer
 
 logger = utils.get_logger("DEBUG")
@@ -99,17 +99,22 @@ def _setup_model(
 
     def convert_prompt_to_tokens(
         self,
-        prompt: str,
+        prompt: Dict[Role, str],
     ) -> List[int]:
         """
-        Convert the prompt string to a user message and tokenize using the prompt template
-        defined on the tokenizer.
+        Convert the prompt string to a user message with optional system messages
+        and tokenize using the prompt template defined on the tokenizer.
         """
-        messages = [
-            Message(role="user", content=prompt),
-            # Empty assistant message to kick-start generation
-            Message(role="assistant", content=""),
-        ]
+        messages = []
+        if "system" in prompt and prompt["system"] is not None:
+            messages.append(Message(role="system", content=prompt["system"]))
+        messages.extend(
+            [
+                Message(role="user", content=prompt["user"]),
+                # Empty assistant message to kick-start generation
+                Message(role="assistant", content=""),
+            ]
+        )
         return self._tokenizer({"messages": messages}, inference=True)["tokens"]
 
     @torch.inference_mode()
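A standalone sketch (my illustration, not part of the recipe) of how the updated convert_prompt_to_tokens builds the message list from the new prompt mapping before handing it to the tokenizer:

from torchtune.data import Message

# Shape matches the updated generation.yaml: optional system, required user.
prompt = {"system": "You are a comedian.", "user": "Tell me a joke."}

messages = []
if prompt.get("system") is not None:
    messages.append(Message(role="system", content=prompt["system"]))
messages.extend(
    [
        Message(role="user", content=prompt["user"]),
        # Empty assistant message to kick-start generation
        Message(role="assistant", content=""),
    ]
)
assert [m.role for m in messages] == ["system", "user", "assistant"]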
