Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support conversation_style of openai format (OpenAI API style) #890

Merged
merged 13 commits into from
Apr 28, 2024
6 changes: 5 additions & 1 deletion torchtune/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
MistralChatFormat,
)
from torchtune.data._common import CROSS_ENTROPY_IGNORE_IDX
from torchtune.data._converters import sharegpt_to_llama2_messages
from torchtune.data._converters import (
sharegpt_to_llama2_messages,
standard_chat_to_llama2_messages
)
from torchtune.data._instruct_templates import (
AlpacaInstructTemplate,
GrammarErrorCorrectionTemplate,
Expand All @@ -33,6 +36,7 @@
"MistralChatFormat",
"ChatMLFormat",
"sharegpt_to_llama2_messages",
"standard_chat_to_llama2_messages",
"truncate",
"Message",
"validate_messages",
Expand Down
56 changes: 56 additions & 0 deletions torchtune/data/_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,3 +55,59 @@ def sharegpt_to_llama2_messages(
masked = (role != "assistant") and (not train_on_input)
messages.append(Message(role=role, content=content, masked=masked))
return messages


def standard_chat_to_llama2_messages(
    sample: Mapping[str, Any],
    train_on_input: bool = False,
) -> List[Message]:
    """
    Convert a chat sample adhering to the OpenAI API standard chat format to the Llama2 chat format.

    OpenAI API standard chat format follows::

        {
            # key could be "messages" OR "conversations"
            "messages": [
                {
                    "role": <system|user|assistant>,
                    "content": <message>,
                },
                ...
            ]
        }

    Llama2 follows::

        [
            {
                "role": <system|user|assistant>,
                "content": <message>,
            },
            ...
        ]

    Args:
        sample (Mapping[str, Any]): a single data sample with a "messages" OR
            "conversations" field pointing to a list of dict messages.
        train_on_input (bool): whether the prompt should remain unmasked. Default: False

    Returns:
        List[Message]: A list of messages with "role" and "content" fields.

    Raises:
        ValueError: if ``sample`` contains neither a "messages" nor a
            "conversations" key.
    """
    # Accept either key name; prefer "messages" (the OpenAI API convention)
    # when both happen to be present.
    if "messages" in sample:
        messages_key = "messages"
    elif "conversations" in sample:
        messages_key = "conversations"
    else:
        raise ValueError(f"Sample does not contain 'messages' or 'conversations' key. Existing keys: {sample.keys()}")
    conversations = sample[messages_key]

    messages = []
    for message in conversations:
        role = message["role"]
        content = message["content"]
        # Mask (exclude from the loss) every non-assistant turn unless the
        # caller explicitly asked to train on the input prompt as well.
        masked = (role != "assistant") and (not train_on_input)
        messages.append(Message(role=role, content=content, masked=masked))
    return messages
3 changes: 3 additions & 0 deletions torchtune/datasets/_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
CROSS_ENTROPY_IGNORE_IDX,
Message,
sharegpt_to_llama2_messages,
standard_chat_to_llama2_messages,
validate_messages,
)
from torchtune.modules.tokenizers import Tokenizer
Expand Down Expand Up @@ -159,6 +160,8 @@ def chat_dataset(
"""
if conversation_style == "sharegpt":
convert_to_messages = sharegpt_to_llama2_messages
elif conversation_style == "standard_chat":
convert_to_messages = standard_chat_to_llama2_messages
else:
raise ValueError(f"Unsupported conversation style: {conversation_style}")

Expand Down