From a97b53d8a4cb3355253bd3837ace86764b14f46f Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Thu, 12 Sep 2024 17:56:17 -0700 Subject: [PATCH 1/4] first --- docs/source/basics/message_transforms.rst | 5 + docs/source/basics/messages.rst | 143 ++++++++++++++++++++++ docs/source/index.rst | 9 ++ 3 files changed, 157 insertions(+) create mode 100644 docs/source/basics/message_transforms.rst create mode 100644 docs/source/basics/messages.rst diff --git a/docs/source/basics/message_transforms.rst b/docs/source/basics/message_transforms.rst new file mode 100644 index 0000000000..8953d43442 --- /dev/null +++ b/docs/source/basics/message_transforms.rst @@ -0,0 +1,5 @@ +.. _message_transform_usage_label: + +================== +Message Transforms +================== diff --git a/docs/source/basics/messages.rst b/docs/source/basics/messages.rst new file mode 100644 index 0000000000..4860dbf7e8 --- /dev/null +++ b/docs/source/basics/messages.rst @@ -0,0 +1,143 @@ +.. _messages_usage_label: + +======== +Messages +======== + +Messages are a core component in torchtune that govern how text and multimodal content is tokenized. It serves as the common interface +for all tokenizer and datasets APIs to operate on. Messages contain information about the text content, which role is sending the text +content, and other information relevant for special tokens in model tokenizers. For more information about the individual parameters +for Messages, see the API ref for :class:`~torchtune.data.Message`. Here, we briefly discuss how to create messages, format messages, and +tokenize messages. + + +Creating Messages +----------------- + +Messages can be created via the standard class constructor or directly from a dictionary. + +.. 
code-block:: python + + from torchtune.data import Message + + msg = Message( + role="user", + content="Hello world!", + masked=True, + eot=True, + ipython=False, + ) + # This is identical + msg = Message.from_dict( + { + "role": "user", + "content": "Hello world!", + "masked": True, + "eot": True, + "ipython": False, + }, + ) + print(msg.content) + # [{'type': 'text', 'content': 'Hello world!'}] + +Content is formatted as a list of dictionaries. This is because Messages can also contain multimodal content, such as images. + +Images in Messages +^^^^^^^^^^^^^^^^^^ +For multimodal datasets, you need to add the image as a :class:`~PIL.Image.Image` to the corresponding :class:`~torchtune.data.Message`. +To add it to the beginning of the message, simply prepend it to the content list. + +.. code-block:: python + + import PIL + from torchtune.data import Message + + img_msg = Message( + role="user", + content=[ + { + "type": "image", + # Place your image here + "content": PIL.Image.new(mode="RGB", size=(4, 4)), + }, + {"type": "text", "content": "What's in this image?"}, + ], + ) + +This will indicate to the model tokenizers where to add the image special token and will be processed by the model transform +appropriately. + +In many cases, you will have an image path instead of a raw :class:`~PIL.Image.Image`. You can use the :func:`~torchtune.data.load_image` +utility for both local paths and remote paths. + +.. 
code-block:: python + + import PIL + from torchtune.data import Message, load_image + + image_path = "path/to/image.jpg" + img_msg = Message( + role="user", + content=[ + { + "type": "image", + # Place your image here + "content": load_image(image_path), + }, + {"type": "text", "content": "What's in this image?"}, + ], + ) + +If your dataset contains image tags, or placeholder text to indicate where in the text the image should be inserted, +you can use the :func:`~torchtune.data.format_content_with_images` to split the text into the correct content list +that you can pass into the content field of Message. + +.. code-block:: python + + import PIL + from torchtune.data import format_content_with_images + + content = format_content_with_images( + "<|image|>hello <|image|>world", + image_tag="<|image|>", + images=[PIL.Image.new(mode="RGB", size=(4, 4)), PIL.Image.new(mode="RGB", size=(4, 4))] + ) + print(content) + # [ + # {"type": "image", "content": }, + # {"type": "text", "content": "hello "}, + # {"type": "image", "content": }, + # {"type": "text", "content": "world"} + # ] + +Message transforms +^^^^^^^^^^^^^^^^^^ +Message transforms are convenient utilities to format raw data into a list of torchtune :class:`~torchtune.data.Message` +objects. See :ref:`message_transform_usage_label` for more discussion. + + +Formatting messages with prompt templates +----------------------------------------- + +Prompt templates provide a way to format messages into a structured text template. You can simply call any class that inherits +from :class:`~torchtune.data.PromptTemplateInterface` on a list of Messages and it will add the appropriate text to the content +list. + +.. 
code-block:: python + + from torchtune.models.mistral import MistralChatTemplate + from torchtune.data import Message + + msg = Message( + role="user", + content="Hello world!", + masked=True, + eot=True, + ipython=False, + ) + template = MistralChatTemplate() + templated_msg = template([msg]) + print(templated_msg[0].content) + # [{'type': 'text', 'content': '[INST] '}, + # {'type': 'text', 'content': 'Hello world!'}, + # {'type': 'text', 'content': ' [/INST] '}] diff --git a/docs/source/index.rst b/docs/source/index.rst index 248fa8541d..ee2a1d0b15 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -107,6 +107,15 @@ torchtune tutorials. recipes/lora_finetune_single_device recipes/qat_distributed +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Basics + :hidden: + + basics/messages + basics/message_transforms + .. toctree:: :glob: :maxdepth: 1 From a0a490fef7fdf5513c91562c9a9dc2101b5ce70d Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Fri, 13 Sep 2024 13:39:22 -0700 Subject: [PATCH 2/4] finish message transforms --- docs/source/basics/message_transforms.rst | 130 ++++++++++++++++++++++ docs/source/basics/messages.rst | 1 + 2 files changed, 131 insertions(+) diff --git a/docs/source/basics/message_transforms.rst b/docs/source/basics/message_transforms.rst index 8953d43442..49e2300e5c 100644 --- a/docs/source/basics/message_transforms.rst +++ b/docs/source/basics/message_transforms.rst @@ -3,3 +3,133 @@ ================== Message Transforms ================== + +Message transforms perform the conversion of raw sample dictionaries from your dataset into torchtune's +:class:`~torchtune.data.Message` structure. This is where you, as the architect of your data and training pipeline, +get to place all your data preprocessing and formatting logic without having to worry about the rest of +the pipeline to tokenization and model inputs. 
We provide utilities to make this as easy as possible, +and if your dataset folows a standard format, you can use our built-in utilities to quickly get +fine-tuning without having to write any data transforms. + + +Configuring message transforms +------------------------------ +Most of our built-in message transforms contain parameters for controlling input masking (``train_on_input``), +adding a system prompt (``new_system_prompt``), and changing the expected column names (``column_map``). +These are exposed in our dataset builders :func:`~torchtune.datasets.instruct_dataset` and :func:`~torchtune.datasets.chat_dataset` +so you don't have to worry about the message transform itself and can configure this directly from the config. +You can view the API ref for these builders for more details. + +For example, :func:`~torchtune.datasets.instruct_dataset` uses :class:`~torchtune.data.InputOutputToMessages` as the message transform. +The following code will configure :class:`~torchtune.data.InputOutputToMessages` to train on the user prompt, prepend a custom system +prompt, and use the column names "prompt" and "response" instead. + +.. code-block:: python + + # In code + from torchtune.datasets import instruct_dataset + + ds = instruct_dataset( + source="json", + data_files="data/my_data.json", + split="train", + train_on_input=True, + new_system_prompt="You are a friendly AI assistant.", + column_map={"input": "prompt", "output": "response"}, + ) + +.. code-block:: yaml + + # In config + dataset: + _component_: torchtune.datasets.instruct_dataset + source: json + data_files: data/my_data.json + split: train + train_on_input: True + new_system_prompt: You are a friendly AI assistant. + column_map: + input: prompt + output: response + +Custom message transforms +------------------------- +If our built-in message transforms do not configure for your particular dataset well, +you can create your own class with full flexibility. 
Simply inherit from the :class:`~torchtune.modules.transforms.Transform` +class and add your code in the ``__call__`` method. + +A simple contrived example would be to take one column from the dataset as the user message and another +column as the model response. Indeed, this is quite similar to :class:`~torchtune.data.InputOutputToMessages`. + +.. code-block:: python + + from torchtune.modules.transforms import Transform + from torchtune.data import Message + from typing import Any, Mapping + + class MessageTransform(Transform): + def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]: + return [ + Message( + role="user", + content=sample["input"], + masked=True, + eot=True, + ), + Message( + role="assistant", + content=sample["output"], + masked=False, + eot=True, + ), + ] + + sample = {"input": "hello world", "output": "bye world"} + transform = MessageTransform() + messages = transform(sample) + print(messages) + # [, + # ] + for msg in messages: + print(msg.role, msg.text_content) + # user hello world + # assistant bye world + +See :ref:`creating_messages` for more details on how to manipulate :class:`~torchtune.data.Message` objects. + +To use this for your dataset, you must create a custom dataset builder that uses the underlying +dataset class, :class:`~torchtune.datasets.SFTDataset`. + +.. code-block:: python + + # In data/dataset.py + from torchtune.datasets import SFTDataset + + def custom_dataset(tokenizer, **load_dataset_kwargs) -> SFTDataset: + message_transform = MyMessageTransform() + return SFTDataset( + source="json", + data_files="data/my_data.json", + split="train", + message_transform=message_transform, + model_transform=tokenizer, + **load_dataset_kwargs, + ) + +This can be used directly from the config. + +.. 
code-block:: yaml + + dataset: + _component_: data.dataset.custom_dataset + + +Example message transforms +-------------------------- +- Instruct + - :class:`~torchtune.data.InputOutputToMessages` +- Chat + - :class:`~torchtune.data.ShareGPTToMessages` + - :class:`~torchtune.data.JSONToMessages` +- Preference + - :class:`~torchtune.data.ChosenRejectedToMessages` diff --git a/docs/source/basics/messages.rst b/docs/source/basics/messages.rst index 4860dbf7e8..e1d25f2039 100644 --- a/docs/source/basics/messages.rst +++ b/docs/source/basics/messages.rst @@ -10,6 +10,7 @@ content, and other information relevant for special tokens in model tokenizers. for Messages, see the API ref for :class:`~torchtune.data.Message`. Here, we briefly discuss how to create messages, format messages, and tokenize messages. +.. _creating_messages: Creating Messages ----------------- From d8004eaae931503b84549be6ad1b1013a6bb219c Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Fri, 20 Sep 2024 15:09:08 -0700 Subject: [PATCH 3/4] some comments --- docs/source/basics/message_transforms.rst | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/docs/source/basics/message_transforms.rst b/docs/source/basics/message_transforms.rst index 49e2300e5c..6285bb34cf 100644 --- a/docs/source/basics/message_transforms.rst +++ b/docs/source/basics/message_transforms.rst @@ -5,11 +5,10 @@ Message Transforms ================== Message transforms perform the conversion of raw sample dictionaries from your dataset into torchtune's -:class:`~torchtune.data.Message` structure. This is where you, as the architect of your data and training pipeline, -get to place all your data preprocessing and formatting logic without having to worry about the rest of -the pipeline to tokenization and model inputs. 
We provide utilities to make this as easy as possible, -and if your dataset folows a standard format, you can use our built-in utilities to quickly get -fine-tuning without having to write any data transforms. +:class:`~torchtune.data.Message` structure. Once your data is represented as Messages, torchtune will handle +tokenization and preparing it for the model. + +.. TODO (rafiayub): place an image here to depict overall pipeline Configuring message transforms From 5be46fcd26b5e1b03ca492a549deab88fec677bc Mon Sep 17 00:00:00 2001 From: RdoubleA Date: Sat, 21 Sep 2024 11:55:44 -0700 Subject: [PATCH 4/4] comments --- docs/source/basics/chat_datasets.rst | 2 + docs/source/basics/instruct_datasets.rst | 2 + docs/source/basics/message_transforms.rst | 33 +------ docs/source/basics/messages.rst | 102 +++++++++++++++++++++ 4 files changed, 104 insertions(+), 35 deletions(-) diff --git a/docs/source/basics/chat_datasets.rst b/docs/source/basics/chat_datasets.rst index 2722305b09..c3fa6323fb 100644 --- a/docs/source/basics/chat_datasets.rst +++ b/docs/source/basics/chat_datasets.rst @@ -22,6 +22,8 @@ The primary entry point for fine-tuning with chat datasets in torchtune is the : builder. This lets you specify a local or Hugging Face dataset that follows the chat data format directly from the config and train your LLM on it. +.. _example_chat: + Example chat dataset -------------------- diff --git a/docs/source/basics/instruct_datasets.rst b/docs/source/basics/instruct_datasets.rst index a5f6371549..628f22a716 100644 --- a/docs/source/basics/instruct_datasets.rst +++ b/docs/source/basics/instruct_datasets.rst @@ -14,6 +14,8 @@ The primary entry point for fine-tuning with instruct datasets in torchtune is t builder. This lets you specify a local or Hugging Face dataset that follows the instruct data format directly from the config and train your LLM on it. +.. 
_example_instruct: + Example instruct dataset ------------------------ diff --git a/docs/source/basics/message_transforms.rst b/docs/source/basics/message_transforms.rst index 6285bb34cf..53375703d2 100644 --- a/docs/source/basics/message_transforms.rst +++ b/docs/source/basics/message_transforms.rst @@ -17,39 +17,8 @@ Most of our built-in message transforms contain parameters for controlling input adding a system prompt (``new_system_prompt``), and changing the expected column names (``column_map``). These are exposed in our dataset builders :func:`~torchtune.datasets.instruct_dataset` and :func:`~torchtune.datasets.chat_dataset` so you don't have to worry about the message transform itself and can configure this directly from the config. -You can view the API ref for these builders for more details. +You can see :ref:`example_instruct` or :ref:`example_chat` for more details. -For example, :func:`~torchtune.datasets.instruct_dataset` uses :class:`~torchtune.data.InputOutputToMessages` as the message transform. -The following code will configure :class:`~torchtune.data.InputOutputToMessages` to train on the user prompt, prepend a custom system -prompt, and use the column names "prompt" and "response" instead. - -.. code-block:: python - - # In code - from torchtune.datasets import instruct_dataset - - ds = instruct_dataset( - source="json", - data_files="data/my_data.json", - split="train", - train_on_input=True, - new_system_prompt="You are a friendly AI assistant.", - column_map={"input": "prompt", "output": "response"}, - ) - -.. code-block:: yaml - - # In config - dataset: - _component_: torchtune.datasets.instruct_dataset - source: json - data_files: data/my_data.json - split: train - train_on_input: True - new_system_prompt: You are a friendly AI assistant. 
- column_map: - input: prompt - output: response Custom message transforms ------------------------- diff --git a/docs/source/basics/messages.rst b/docs/source/basics/messages.rst index e1d25f2039..bade08732d 100644 --- a/docs/source/basics/messages.rst +++ b/docs/source/basics/messages.rst @@ -7,8 +7,7 @@ Messages Messages are a core component in torchtune that govern how text and multimodal content is tokenized. It serves as the common interface for all tokenizer and datasets APIs to operate on. Messages contain information about the text content, which role is sending the text content, and other information relevant for special tokens in model tokenizers. For more information about the individual parameters -for Messages, see the API ref for :class:`~torchtune.data.Message`. Here, we briefly discuss how to create messages, format messages, and -tokenize messages. +for Messages, see the API ref for :class:`~torchtune.data.Message`. .. _creating_messages: @@ -114,7 +113,24 @@ that you can pass into the content field of Message. Message transforms ^^^^^^^^^^^^^^^^^^ Message transforms are convenient utilities to format raw data into a list of torchtune :class:`~torchtune.data.Message` -objects. See :ref:`message_transform_usage_label` for more discussion. +objects. + +.. code-block:: python + + from torchtune.data import InputOutputToMessages + + sample = { + "input": "What is your name?", + "output": "I am an AI assistant, I don't have a name." + } + transform = InputOutputToMessages() + output = transform(sample) + for message in output["messages"]: + print(message.role, message.text_content) + # user What is your name? + # assistant I am an AI assistant, I don't have a name. + +See :ref:`message_transform_usage_label` for more discussion. Formatting messages with prompt templates @@ -142,3 +158,83 @@ list. 
# [{'type': 'text', 'content': '[INST] '}, # {'type': 'text', 'content': 'Hello world!'}, # {'type': 'text', 'content': ' [/INST] '}] + +Accessing text content in messages +---------------------------------- +.. code-block:: python + + from torchtune.models.mistral import MistralChatTemplate + from torchtune.data import Message + + msg = Message( + role="user", + content="Hello world!", + masked=True, + eot=True, + ipython=False, + ) + template = MistralChatTemplate() + templated_msg = template([msg]) + print(templated_msg[0].text_content) + # [INST] Hello world! [/INST] + +Accessing images in messages +---------------------------- +.. code-block:: python + + from torchtune.data import Message + import PIL + + msg = Message( + role="user", + content=[ + { + "type": "image", + # Place your image here + "content": PIL.Image.new(mode="RGB", size=(4, 4)), + }, + {"type": "text", "content": "What's in this image?"}, + ], + ) + if msg.contains_media: + print(msg.get_media()) + # [] + +Tokenizing messages +------------------- +All model tokenizers have a ``tokenize_messages`` method that converts a list of +:class:`~torchtune.data.Message` objects into token IDs and a loss mask. + +.. 
code-block:: python + + from torchtune.models.mistral import mistral_tokenizer + from torchtune.data import Message + + m_tokenizer = mistral_tokenizer( + path="/tmp/Mistral-7B-v0.1/tokenizer.model", + prompt_template="torchtune.models.mistral.MistralChatTemplate", + max_seq_len=8192, + ) + msgs = [ + Message( + role="user", + content="Hello world!", + masked=True, + eot=True, + ipython=False, + ), + Message( + role="assistant", + content="Hi, I am an AI assistant.", + masked=False, + eot=True, + ipython=False, + ) + ] + tokens, mask = m_tokenizer.tokenize_messages(msgs) + print(tokens) + # [1, 733, 16289, 28793, 22557, 1526, 28808, 28705, 733, 28748, 16289, 28793, 15359, 28725, 315, 837, 396, 16107, 13892, 28723, 2] + print(mask) # User message is masked from the loss + # [True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False, False] + print(m_tokenizer.decode(tokens)) + # [INST] Hello world! [/INST] Hi, I am an AI assistant.