From 0ce021af2bd0708df5f89d128c5944b146fccb0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 11 Jun 2024 12:31:59 +0800 Subject: [PATCH 01/14] Add Qwen2 support. --- recipes/configs/qwen2/7B_full.yaml | 77 + recipes/configs/qwen2/7B_full_low_memory.yaml | 79 + recipes/configs/qwen2/7B_lora.yaml | 110 + .../configs/qwen2/7B_lora_single_device.yaml | 108 + tests/assets/tiny_bpe_tokenizer.json | 3986 +++++++++++++++++ tests/torchtune/models/qwen2/__init__.py | 5 + .../torchtune/models/qwen2/test_lora_qwen2.py | 187 + tests/torchtune/models/qwen2/test_qwen2.py | 48 + .../models/qwen2/test_qwen2_tokenizer.py | 282 ++ torchtune/models/qwen2/__init__.py | 31 + torchtune/models/qwen2/_component_builders.py | 426 ++ torchtune/models/qwen2/_convert_weights.py | 117 + torchtune/models/qwen2/_model_builders.py | 113 + .../models/qwen2/_positional_embeddings.py | 117 + torchtune/models/qwen2/_tokenizer.py | 186 + torchtune/models/qwen2/transformer.py | 172 + .../utils/_checkpointing/_checkpointer.py | 20 + .../_checkpointing/_checkpointer_utils.py | 2 + 18 files changed, 6066 insertions(+) create mode 100644 recipes/configs/qwen2/7B_full.yaml create mode 100644 recipes/configs/qwen2/7B_full_low_memory.yaml create mode 100644 recipes/configs/qwen2/7B_lora.yaml create mode 100644 recipes/configs/qwen2/7B_lora_single_device.yaml create mode 100644 tests/assets/tiny_bpe_tokenizer.json create mode 100644 tests/torchtune/models/qwen2/__init__.py create mode 100644 tests/torchtune/models/qwen2/test_lora_qwen2.py create mode 100644 tests/torchtune/models/qwen2/test_qwen2.py create mode 100644 tests/torchtune/models/qwen2/test_qwen2_tokenizer.py create mode 100644 torchtune/models/qwen2/__init__.py create mode 100644 torchtune/models/qwen2/_component_builders.py create mode 100644 torchtune/models/qwen2/_convert_weights.py create mode 100644 torchtune/models/qwen2/_model_builders.py create mode 100644 torchtune/models/qwen2/_positional_embeddings.py create mode 100644 torchtune/models/qwen2/_tokenizer.py create mode 100644 torchtune/models/qwen2/transformer.py diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml new file mode 100644 index 0000000000..45296d59df --- /dev/null +++ b/recipes/configs/qwen2/7B_full.yaml @@ -0,0 +1,77 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/7B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/7B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. 
It's +# best to use 7B_full_low_memory.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + path: /tmp/Qwen2-7B-Instruct/tokenizer.json + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2.qwen2_7b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-7B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 +optimizer: + _component_: torch.optim.AdamW + lr: 2e-5 +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True +memory_efficient_fsdp_wrap: False + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2-7B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_low_memory.yaml new file mode 100644 index 0000000000..3580d9ee3f --- /dev/null +++ b/recipes/configs/qwen2/7B_full_low_memory.yaml @@ -0,0 +1,79 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2/7B_full_low_memory +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2/7B_full_low_memory checkpointer.checkpoint_dir= +# +# This config works only for training on single device.
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + path: /tmp/Qwen2-7B-Instruct/tokenizer.json + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2.qwen2_7b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-7B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 3 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 2e-5 +optimizer_in_bwd: True +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 1 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2-7B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml new file mode 100644 index 0000000000..5263cacf32 --- /dev/null +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -0,0 +1,110 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/7B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/7B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 7B_lora_single_device.yaml +# or 7B_qlora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2.lora_qwen2_7b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +tokenizer: + _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + path: /tmp/Qwen2-7B-Instruct/tokenizer.json + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.utils.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml new file mode 100644 index 0000000000..1dd37725d7 --- /dev/null +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -0,0 +1,108 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2 7B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2/7B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2/7B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2.lora_qwen2_7b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +tokenizer: + _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + path: /tmp/Qwen2-7B-Instruct/tokenizer.json + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-7B-Instruct + checkpoint_files: [ + model-00001-of-00004.safetensors, + model-00002-of-00004.safetensors, + model-00003-of-00004.safetensors, + model-00004-of-00004.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2-7B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.utils.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/tests/assets/tiny_bpe_tokenizer.json b/tests/assets/tiny_bpe_tokenizer.json new file mode 100644 index 0000000000..470508b681 --- /dev/null +++ b/tests/assets/tiny_bpe_tokenizer.json @@ -0,0 +1,3986 @@ +{ + "version": "1.0", + "truncation": null, + "padding": null, + "added_tokens": [ + { + "id": 2000, + "content": "<|endoftext|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2001, + "content": "<|im_start|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + }, + { + "id": 2002, + "content": "<|im_end|>", + "single_word": false, + "lstrip": false, + "rstrip": false, + "normalized": false, + "special": true + } + ], + "normalizer": { + "type": "NFC" + }, + "pre_tokenizer": { + "type": "Sequence", + "pretokenizers": [ + { + "type": "Split", + "pattern": { + "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + }, + "behavior": "Isolated", + "invert": false + }, + { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + } + ] + }, + "post_processor": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": 
false, + "use_regex": false + }, + "decoder": { + "type": "ByteLevel", + "add_prefix_space": false, + "trim_offsets": false, + "use_regex": false + }, + "model": { + "type": "BPE", + "dropout": null, + "unk_token": null, + "continuing_subword_prefix": null, + "end_of_word_suffix": null, + "fuse_unk": false, + "byte_fallback": false, + "ignore_merges": false, + "vocab": { + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + "R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "Ċ": 94, + "Ġ": 95, + "ĠĠ": 96, + "Ġt": 97, + "Ġa": 98, + "in": 99, + "he": 100, + "re": 101, + "on": 102, + "Ġthe": 103, + "Ġs": 104, + "er": 105, + "at": 106, + "Ġc": 107, + "ĠĠĠĠ": 108, + "en": 109, + "Ġo": 110, + "Ġ\"": 111, + "nd": 112, + "es": 113, + "ing": 114, + "ĠĠĠ": 115, + "it": 116, + "Ġp": 117, + "or": 118, + "ou": 119, + "Ġand": 120, + "Ġw": 121, + "is": 122, + "Ġf": 123, + "an": 124, + "ion": 125, + "al": 126, + "Ġb": 127, + "Ġto": 128, + "Ġm": 129, + "Ġin": 130, + "Ġof": 131, + "le": 132, + "ct": 133, + "ar": 134, + "ut": 135, + "Ġd": 136, + "st": 137, + "ed": 138, + "ĠĠĠĠĠĠĠ": 139, + "ic": 140, + "\":": 141, + ",Ċ": 142, + "ro": 143, + "ent": 144, + "\\n": 145, + "Ġe": 146, + "put": 147, + "om": 148, + "Ġre": 149, + "as": 150, + "ve": 151, + "Ġh": 152, + "Ġth": 153, + "\",Ċ": 154, + "Ġl": 155, + "Ġis": 156, + "et": 157, + "ce": 158, + "Ġn": 159, + ".\\": 160, + "im": 161, + "il": 162, + "Ġg": 163, + "Ġu": 164, + "ction": 165, + "ru": 166, + "ation": 167, + "ol": 168, + "ch": 169, + "ĠT": 170, + "Ġfor": 171, + "out": 172, + "ra": 173, + "ow": 174, + "id": 175, + "ly": 176, + "Ġst": 177, + "Ġbe": 178, + "Ġy": 179, + "Ġpro": 180, + "ig": 181, + "se": 182, + "ate": 183, + "Ġthat": 184, + "ith": 185, + "ir": 186, + "ur": 187, + "ot": 188, + "Ġor": 189, + "Ġon": 190, + "Ġyou": 191, + "ers": 192, + "stru": 193, + "Ġan": 194, + "if": 195, + "ul": 196, + "struction": 197, + "Ġ{": 198, + "Ġ}": 199, + "Ġcan": 200, + "input": 201, + "output": 202, + "instruction": 203, + "Ġ{Ċ": 204, + "Ġ},Ċ": 205, + "\"Ċ": 206, + "Ġhe": 207, + "Ġcon": 208, + "Ġit": 209, + "ay": 210, + "ess": 211, + "Ġwith": 212, + "ver": 213, + "el": 214, + "Ġas": 215, + "am": 216, + "ĠA": 217, + "ge": 218, + "Ġsu": 219, + "iv": 220, + ".\",Ċ": 221, + "Ġcom": 222, + "ĠI": 223, + "ment": 224, + "ak": 225, + "Ġal": 226, + "\\\"": 227, + ".\"Ċ": 228, + "ive": 229, + "Ġare": 230, + "ab": 231, + "ad": 232, + "Ġmo": 233, + "Ġex": 234, + "Ġv": 235, + "ĠS": 236, + "res": 237, + "pp": 238, + "qu": 239, + "Ġde": 240, + "Ġwh": 241, + "ity": 242, + "Ġen": 243, + "ĠThe": 244, + "her": 245, + "ld": 246, + "ri": 247, + "ter": 248, + "ant": 249, + "ĠC": 250, 
+ "ist": 251, + "Ġ\"\",Ċ": 252, + "um": 253, + "Ġus": 254, + "Ġne": 255, + "ain": 256, + "th": 257, + "ect": 258, + "Ġle": 259, + "op": 260, + "em": 261, + "ies": 262, + "Ġch": 263, + "Ġim": 264, + "du": 265, + "od": 266, + "ort": 267, + "nt": 268, + "est": 269, + "igh": 270, + "ere": 271, + "Ġha": 272, + "us": 273, + "ure": 274, + "ial": 275, + "oc": 276, + "Ġwor": 277, + "Ġtheir": 278, + "ac": 279, + "ence": 280, + "iz": 281, + "Ġyour": 282, + "os": 283, + "Ġimp": 284, + "ud": 285, + "Ġby": 286, + "Ġse": 287, + "ine": 288, + "ould": 289, + "low": 290, + "ill": 291, + "age": 292, + "rom": 293, + "Ġsp": 294, + "ĠP": 295, + "Ġsh": 296, + "ust": 297, + "The": 298, + "un": 299, + "'s": 300, + "Ġinc": 301, + "ide": 302, + "pl": 303, + "ight": 304, + "og": 305, + "Ġpl": 306, + "pt": 307, + "are": 308, + "Ġte": 309, + "Ġint": 310, + "Ġ\\": 311, + "his": 312, + "Ġr": 313, + "ake": 314, + "per": 315, + "orm": 316, + "ag": 317, + "ff": 318, + "ĠE": 319, + "art": 320, + "Ġk": 321, + "end": 322, + "ĠM": 323, + "Ġwe": 324, + "ĠB": 325, + "Ġad": 326, + "cess": 327, + "rou": 328, + "ical": 329, + "all": 330, + "able": 331, + "Ġfrom": 332, + "and": 333, + "ĠH": 334, + "Ġab": 335, + "act": 336, + "Ġcomp": 337, + "ome": 338, + "ach": 339, + "ĠThis": 340, + "Ġhave": 341, + "form": 342, + "Ġ\\\"": 343, + "ast": 344, + "Ġat": 345, + "ĠW": 346, + "Ġres": 347, + "Ġdat": 348, + ":\\": 349, + "ther": 350, + "ions": 351, + "ore": 352, + "Ġ(": 353, + "Ġcont": 354, + "our": 355, + "ep": 356, + "ĠF": 357, + "Ġac": 358, + "ance": 359, + "ĠR": 360, + "gh": 361, + "Ġme": 362, + "ces": 363, + "Ġwas": 364, + "ind": 365, + "vel": 366, + "ations": 367, + "Ġhel": 368, + "Ġmore": 369, + "ult": 370, + "ĠD": 371, + "reat": 372, + "ign": 373, + "Ġhelp": 374, + "ime": 375, + "ard": 376, + "Ġcl": 377, + "Ġapp": 378, + "ans": 379, + "ie": 380, + "Ġdata": 381, + "ich": 382, + "ang": 383, + "ous": 384, + "ell": 385, + "ks": 386, + "ase": 387, + "ice": 388, + "ip": 389, + "ite": 390, + "Ġsuch": 391, + "Ġfe": 392, + "Ġwhe": 393, + "ib": 394, + "Ġother": 395, + "Ġthis": 396, + "ass": 397, + "ual": 398, + "ile": 399, + "ne": 400, + "red": 401, + "Ġhas": 402, + "oo": 403, + "ress": 404, + "ific": 405, + "ning": 406, + "Ġ=": 407, + "Ġup": 408, + "Ġman": 409, + "Ġar": 410, + "ong": 411, + "ec": 412, + "Ġtra": 413, + "av": 414, + "Ġwhich": 415, + "Ġgo": 416, + "Ġprov": 417, + "Ġdis": 418, + "**": 419, + "so": 420, + "ĠG": 421, + "one": 422, + "Ġem": 423, + "Ġnot": 424, + "ue": 425, + "ĠO": 426, + "Ġj": 427, + "ace": 428, + "Ġthey": 429, + "ame": 430, + "Ġqu": 431, + "ĠL": 432, + "iff": 433, + "Ġfol": 434, + "ary": 435, + "ated": 436, + "ustom": 437, + "ition": 438, + "Ġits": 439, + "Ġsy": 440, + "ke": 441, + "ack": 442, + "ry": 443, + "--": 444, + "Ġtime": 445, + "Ġdes": 446, + "Ġnew": 447, + "ents": 448, + "ount": 449, + "Ġfollow": 450, + "Ġalso": 451, + "Ġcomm": 452, + "Ġout": 453, + "Ġeff": 454, + "Ġdiff": 455, + "iven": 456, + "ap": 457, + "Ġsent": 458, + "\\u": 459, + "Ġso": 460, + "Ġprodu": 461, + "Ġuse": 462, + "Ġsc": 463, + "Ġ-": 464, + "Ġun": 465, + "lud": 466, + "ĠIt": 467, + "ener": 468, + "king": 469, + "Ġev": 470, + "Ġabout": 471, + "Ġthem": 472, + "ĠU": 473, + "Ġcustom": 474, + "Ġro": 475, + "Ġinclud": 476, + "les": 477, + "etw": 478, + "stem": 479, + "xt": 480, + "Ġinto": 481, + "Ġper": 482, + "ĠIn": 483, + "ĠN": 484, + "Ġwill": 485, + "Ġlear": 486, + "ber": 487, + "Ġall": 488, + "Ġpe": 489, + "ds": 490, + "Ġtw": 491, + "aking": 492, + "ark": 493, + "ful": 494, + "Ġmake": 495, + "chn": 496, + "erv": 497, + "ost": 498, + 
"rough": 499, + "Ġone": 500, + "Ġinter": 501, + "ities": 502, + "ail": 503, + "ike": 504, + "ree": 505, + "ple": 506, + "alth": 507, + "Ġused": 508, + "ors": 509, + "Ġover": 510, + "ility": 511, + "ments": 512, + "ange": 513, + "Ġway": 514, + "ory": 515, + "Ġcol": 516, + "Ġpr": 517, + "Ġcould": 518, + "Ġnum": 519, + "reate": 520, + "int": 521, + "Ġredu": 522, + "erson": 523, + "Ġrec": 524, + "Ġher": 525, + "Ġneed": 526, + "ms": 527, + "ater": 528, + "oy": 529, + "Ġsystem": 530, + "Ġinform": 531, + "Ġtwo": 532, + "Ġtechn": 533, + "Ġsentence": 534, + "ience": 535, + "ize": 536, + "get": 537, + "Ġdiffere": 538, + "ood": 539, + "rib": 540, + "Ġbut": 541, + "Ġfollowing": 542, + "ased": 543, + "olog": 544, + "erg": 545, + "led": 546, + "ures": 547, + "In": 548, + "ear": 549, + "Ġph": 550, + "own": 551, + "Ġpre": 552, + "Ġwould": 553, + "Ġusing": 554, + "Ġcons": 555, + "Ġwork": 556, + "Ġmod": 557, + "ating": 558, + "ia": 559, + "ire": 560, + "Ġpos": 561, + "ient": 562, + "ob": 563, + "ject": 564, + "Ġinv": 565, + "ons": 566, + "Ġdo": 567, + "ular": 568, + "Ġdec": 569, + "Ġhealth": 570, + "Ġimpro": 571, + "Ġany": 572, + "Ġthrough": 573, + "yp": 574, + "row": 575, + "velop": 576, + "Ġprocess": 577, + "Ġtr": 578, + "lic": 579, + "very": 580, + "als": 581, + "ify": 582, + "``": 583, + "ari": 584, + "Ġstr": 585, + "Ġimport": 586, + "Ġlike": 587, + "Ġproduct": 588, + "Ġsome": 589, + "ph": 590, + "ential": 591, + "Ġam": 592, + "ates": 593, + "Ġacc": 594, + "ens": 595, + "ns": 596, + "Ġsm": 597, + "Ġind": 598, + "een": 599, + "Ġexper": 600, + "lect": 601, + "Ġval": 602, + "Ġrel": 603, + "its": 604, + "Ġinformation": 605, + "ings": 606, + "ĠJ": 607, + "ople": 608, + "iness": 609, + "Ġgiven": 610, + "mm": 611, + "ices": 612, + "Ġpart": 613, + "ild": 614, + "ys": 615, + "Ġour": 616, + "nder": 617, + "Ġperson": 618, + "ally": 619, + "Ġke": 620, + "etween": 621, + "ft": 622, + "oth": 623, + "Ġspec": 624, + "Ġbetween": 625, + "ergy": 626, + "ĠAI": 627, + "Ġwho": 628, + "Ġmay": 629, + "ef": 630, + "ative": 631, + "ise": 632, + "Ġlist": 633, + "Ġkn": 634, + "Ġadd": 635, + ",\\": 636, + "ord": 637, + "ics": 638, + "Ġpeople": 639, + "ĠSt": 640, + "Ġhis": 641, + "Ġexp": 642, + "ible": 643, + "Ġthere": 644, + "Ġserv": 645, + "Ġincre": 646, + "Ġdevelop": 647, + "ound": 648, + "ower": 649, + "Ġtrans": 650, + "bs": 651, + "Ġenergy": 652, + "Ġoff": 653, + "Ġbus": 654, + "Ġwhile": 655, + "ose": 656, + "Ġact": 657, + "Ġexam": 658, + "Ġlearning": 659, + "ctions": 660, + "con": 661, + "gor": 662, + "gan": 663, + "ution": 664, + "round": 665, + "pport": 666, + "Ġhow": 667, + "Ġbl": 668, + "Ġmed": 669, + "anc": 670, + "Ġtyp": 671, + "Ġra": 672, + "Ġcar": 673, + "ife": 674, + "Ġworld": 675, + "Ġvari": 676, + "Ġrep": 677, + "au": 678, + "Ġsoc": 679, + "Ġprovid": 680, + "Ġset": 681, + "ten": 682, + "Ġsol": 683, + "Ġeach": 684, + "Ġwhen": 685, + "Ġeffect": 686, + "Ġpo": 687, + "Ġshe": 688, + "ick": 689, + "Ġwhere": 690, + "Ġmodel": 691, + "Ġimportant": 692, + "Ġunder": 693, + "Ġprog": 694, + "enerate": 695, + "ural": 696, + "tain": 697, + "Ġass": 698, + "ology": 699, + "Ġhad": 700, + "ook": 701, + "gg": 702, + "Ġcustomer": 703, + "ting": 704, + "ving": 705, + "Ġresp": 706, + "line": 707, + "Ġcreat": 708, + "ll": 709, + "ily": 710, + "Ġreg": 711, + "Ġdet": 712, + "Ġif": 713, + "Ġ+": 714, + "Ġbusiness": 715, + "\\nIn": 716, + "ish": 717, + "Ġmost": 718, + "ĠĠĠĠĠĠĠĠ": 719, + "hes": 720, + "angu": 721, + "Ġprovide": 722, + "Ġadv": 723, + "erm": 724, + "ub": 725, + "Ġsk": 726, + "irst": 727, + "any": 728, + "Ġday": 729, + "ivid": 730, 
+ "arm": 731, + "ract": 732, + "nce": 733, + "Ġ|": 734, + "Ġimprove": 735, + ")\\": 736, + "Ġco": 737, + "Ġcommun": 738, + "arket": 739, + "Ġmet": 740, + "cy": 741, + "Ġdifferent": 742, + "ized": 743, + "Ġart": 744, + "\\nThe": 745, + "rit": 746, + "Ġcomput": 747, + "Ġform": 748, + "ck": 749, + "Ġhum": 750, + "Ġchar": 751, + "ble": 752, + "Ġlead": 753, + "iron": 754, + "Ġrem": 755, + "Ġshould": 756, + "te": 757, + "Ġallow": 758, + "ness": 759, + "hat": 760, + "Ġfun": 761, + "Ġcomple": 762, + "Ġlangu": 763, + "ages": 764, + "Ġbec": 765, + "Ġsign": 766, + "ues": 767, + "ature": 768, + "Ġfind": 769, + "riend": 770, + "Ġstud": 771, + "Ġmain": 772, + "imate": 773, + "ove": 774, + "Ġresult": 775, + "Ġplay": 776, + "Ġreduce": 777, + "Ġeng": 778, + "ware": 779, + "redi": 780, + "Ġnumber": 781, + "Ġlar": 782, + "Ġpol": 783, + "Ġpat": 784, + "Ġwell": 785, + "ident": 786, + "viron": 787, + "rite": 788, + "crib": 789, + "Ġbu": 790, + "Ġhigh": 791, + "Ġthese": 792, + "ives": 793, + "ves": 794, + "Ġdesign": 795, + "urn": 796, + "Ġthan": 797, + "der": 798, + "Ġanal": 799, + "Ġwater": 800, + "Ġmarket": 801, + "Ġexample": 802, + "way": 803, + "stand": 804, + "ng": 805, + "ax": 806, + "itive": 807, + "Ġ`": 808, + "iqu": 809, + "Ġsim": 810, + "Ġequ": 811, + "gorith": 812, + "Ġtext": 813, + "resent": 814, + "Ġmany": 815, + "uring": 816, + "----": 817, + "\\nA": 818, + "Ġdi": 819, + "Ġsa": 820, + "vironment": 821, + "arch": 822, + "Ġatt": 823, + "Ġpot": 824, + "Ġtas": 825, + "Ġcreate": 826, + "ough": 827, + "Ġfl": 828, + "Ġmaking": 829, + "ious": 830, + "Ġgra": 831, + "Ġlife": 832, + "\\nO": 833, + "Ġalgorith": 834, + "ality": 835, + "eng": 836, + "Ġfin": 837, + "uc": 838, + "?\",Ċ": 839, + "ĠY": 840, + "Ġret": 841, + "Ġbeen": 842, + "Ġtechnology": 843, + "Ġprogra": 844, + "Ġhand": 845, + "hip": 846, + "wn": 847, + "Ġcal": 848, + "Ġwhat": 849, + "ividual": 850, + "iss": 851, + "ety": 852, + "Ġlanguage": 853, + "ources": 854, + "Ġclass": 855, + "Ġtake": 856, + "Ġeas": 857, + "ric": 858, + "Ġvis": 859, + "bject": 860, + "Ġref": 861, + "Ġenvironment": 862, + "Ġfirst": 863, + "eg": 864, + "Ġindividual": 865, + "Ġplan": 866, + "Ġperform": 867, + "Ġru": 868, + "ien": 869, + "Ġimpact": 870, + "Ġag": 871, + "ade": 872, + "Ġcle": 873, + "Ġrequ": 874, + "dition": 875, + "__": 876, + "Ġche": 877, + "ption": 878, + "Ġappro": 879, + "Ġ**": 880, + "Ġgreat": 881, + "ved": 882, + "Ġexpl": 883, + "Ġgrow": 884, + "Generate": 885, + "Ġmy": 886, + "Ġincluding": 887, + "Ġaccess": 888, + "Ġpop": 889, + "Ġmin": 890, + "fore": 891, + "Ġsocial": 892, + "ines": 893, + "Ġcharact": 894, + "Ġbr": 895, + "Ġstep": 896, + "Ġunderstand": 897, + "Ġorgan": 898, + "ĠAd": 899, + "Ġdisc": 900, + "Ġpower": 901, + "Ġlong": 902, + "hed": 903, + "Ġconc": 904, + "ward": 905, + "ited": 906, + "Ġele": 907, + "cing": 908, + "Ġevery": 909, + "Ġca": 910, + "Ġoften": 911, + "Ġuser": 912, + "vie": 913, + "ĠV": 914, + "Ġfood": 915, + "Ġinclude": 916, + "Ġloc": 917, + "ases": 918, + "ically": 919, + "ode": 920, + "ants": 921, + "Ġinvol": 922, + "Ġsmall": 923, + "Ġsur": 924, + "achine": 925, + "Ġbeing": 926, + "Ġpotential": 927, + "Ġno": 928, + "ĠCh": 929, + "Ġdep": 930, + "ather": 931, + "Ġboth": 932, + "Ġens": 933, + "Ġposs": 934, + "Ġed": 935, + "cribe": 936, + "ts": 937, + "ork": 938, + "ĠThey": 939, + "Ġpur": 940, + "ivity": 941, + "Ġwords": 942, + "Ġsignific": 943, + "Ġwere": 944, + "ĠHow": 945, + "Ġprom": 946, + "Ġexperience": 947, + "ĠK": 948, + "up": 949, + "Ġcount": 950, + "ered": 951, + "Des": 952, + "Ġfam": 953, + "```": 954, + "akes": 955, + "Ġgl": 
956, + "ĠHe": 957, + "Ġfeel": 958, + "Ġback": 959, + "Ġfi": 960, + "Ġproble": 961, + "ization": 962, + "ling": 963, + "Ġcommunic": 964, + "ploy": 965, + "Ġaut": 966, + "Ġfriend": 967, + "Ġhuman": 968, + "Ġspe": 969, + "ew": 970, + "Ġpersonal": 971, + "Ġtop": 972, + "Ġent": 973, + "other": 974, + "Ġchang": 975, + "Ġcor": 976, + "Ġchange": 977, + "Ġdecis": 978, + "ability": 979, + "hing": 980, + "atural": 981, + "ever": 982, + "Ġcost": 983, + "Ġgood": 984, + "ause": 985, + "Ġident": 986, + "Ġsoft": 987, + "ined": 988, + "Ġpass": 989, + "'t": 990, + "atures": 991, + "Ġben": 992, + "Ġcompany": 993, + "Ġstart": 994, + "Ġsignificant": 995, + "Ġsumm": 996, + "ond": 997, + "old": 998, + "bers": 999, + "sel": 1000, + "?\\": 1001, + "Ġcur": 1002, + "Ġlight": 1003, + "Ġcommon": 1004, + ".\\\"": 1005, + "Ġcustomers": 1006, + "iving": 1007, + "conom": 1008, + "Ġfunction": 1009, + "Ġve": 1010, + "Ġthree": 1011, + "Ġeven": 1012, + "ining": 1013, + "Ġgener": 1014, + "ries": 1015, + "Ġlevel": 1016, + "Ġspecific": 1017, + "Ġwebs": 1018, + "Ġthen": 1019, + "Ġeffective": 1020, + "cur": 1021, + "ense": 1022, + "Ġlarge": 1023, + "Ġdist": 1024, + "Ġeffic": 1025, + "Ġsupport": 1026, + "Ġget": 1027, + "Create": 1028, + "read": 1029, + "port": 1030, + "Ġinf": 1031, + "Ġ'": 1032, + "Ġyear": 1033, + "Ġstate": 1034, + "Ġkey": 1035, + "ccess": 1036, + ":**": 1037, + "Ġav": 1038, + "Ġknow": 1039, + "Ġbenef": 1040, + "Ġess": 1041, + "ables": 1042, + "ren": 1043, + "Ġown": 1044, + "ĠThese": 1045, + "ock": 1046, + "-t": 1047, + "Ġide": 1048, + "omm": 1049, + "reen": 1050, + "ced": 1051, + "cture": 1052, + "Ġteam": 1053, + "Ġris": 1054, + "Ġtasks": 1055, + "Ġdown": 1056, + "Ġstru": 1057, + "Ġcomputer": 1058, + "-b": 1059, + "Ġfact": 1060, + "Ġmem": 1061, + "etter": 1062, + "\\nS": 1063, + "Ġaround": 1064, + "Ġword": 1065, + "Ġbased": 1066, + "Ġbeh": 1067, + "Ġright": 1068, + "Ġdel": 1069, + "Ġpoint": 1070, + "Ġnatural": 1071, + "ss": 1072, + "Ġeconom": 1073, + "Ġmade": 1074, + "Ġins": 1075, + "Ġinst": 1076, + "Ġmat": 1077, + "Ġvalue": 1078, + "Ġanim": 1079, + "Ġsever": 1080, + "\\nT": 1081, + "ational": 1082, + "ital": 1083, + "ze": 1084, + "ote": 1085, + "ills": 1086, + "tern": 1087, + "Ġread": 1088, + "Ġcontent": 1089, + "Ġonline": 1090, + "Ġend": 1091, + "ĠUn": 1092, + "vent": 1093, + "Ġsee": 1094, + "ending": 1095, + "Ġmon": 1096, + "Ġdr": 1097, + "Ġkeep": 1098, + "Ġsystems": 1099, + "cul": 1100, + "ven": 1101, + "Ġstory": 1102, + "Ġmedia": 1103, + "Ġseveral": 1104, + "hen": 1105, + "ateg": 1106, + "Ġcontin": 1107, + "Ġdev": 1108, + "Ġlearn": 1109, + "Ġla": 1110, + "Ġstre": 1111, + "Ġpartic": 1112, + "Ġair": 1113, + "ually": 1114, + "Ġsuccess": 1115, + "ouse": 1116, + "Ġiss": 1117, + "ied": 1118, + "Ġmachine": 1119, + "Ġopt": 1120, + "Ġx": 1121, + "Ġop": 1122, + "Ġprof": 1123, + "ocus": 1124, + "chie": 1125, + "Ġmeth": 1126, + "ner": 1127, + "omp": 1128, + "ron": 1129, + "Ġhome": 1130, + "Ġbetter": 1131, + "ĠPro": 1132, + "Ġmult": 1133, + "omet": 1134, + "Ġincrease": 1135, + "Ġanaly": 1136, + "vert": 1137, + "Ġrele": 1138, + "Ġbra": 1139, + "ink": 1140, + "Ġtem": 1141, + "Ġpredi": 1142, + "Ġtre": 1143, + "Ġservice": 1144, + "Ġwebsite": 1145, + "Ġmanage": 1146, + "Ġsoftware": 1147, + "here": 1148, + "Ġprot": 1149, + "-s": 1150, + "Ġquest": 1151, + "ier": 1152, + "Ġknown": 1153, + "Ġorder": 1154, + "Ġphys": 1155, + "cept": 1156, + "Ġachie": 1157, + "Ġinput": 1158, + "Ġpossible": 1159, + "ĠIf": 1160, + "Ġext": 1161, + "fter": 1162, + "Ġelect": 1163, + "Ġmethod": 1164, + "Ġbre": 1165, + "ĠAn": 1166, + "ways": 1167, + 
"ering": 1168, + "ets": 1169, + "Ġjust": 1170, + "Ġstore": 1171, + "Ġdevelopment": 1172, + "Ġcare": 1173, + "Ġobject": 1174, + "Ġtype": 1175, + "ĠFor": 1176, + "Ġfocus": 1177, + "ggest": 1178, + "Ġonly": 1179, + "Ġconsid": 1180, + "ars": 1181, + "Ġchall": 1182, + "Ġdeterm": 1183, + "Ġsal": 1184, + "ins": 1185, + "Ġfeatures": 1186, + "Ġtru": 1187, + "ody": 1188, + "Ġtool": 1189, + ">\\": 1190, + "Ġensure": 1191, + "oss": 1192, + "ublic": 1193, + "Ġitem": 1194, + "Here": 1195, + "ination": 1196, + "Ġdef": 1197, + "Describe": 1198, + "ional": 1199, + "roup": 1200, + "Ġconf": 1201, + "Ġneeds": 1202, + "Ġcharacter": 1203, + "Ġvarious": 1204, + "Ġlet": 1205, + "Ġapplic": 1206, + "aut": 1207, + "Ġjob": 1208, + "ellig": 1209, + "ĠCon": 1210, + "Ġbest": 1211, + "Ġfore": 1212, + "Ġamount": 1213, + "rop": 1214, + "Ġbuild": 1215, + "ique": 1216, + "aging": 1217, + "Ġemploy": 1218, + "Ġrest": 1219, + "air": 1220, + "What": 1221, + "Ġtoget": 1222, + "Ġways": 1223, + "Ġidentify": 1224, + "Ġtogether": 1225, + "Ġreal": 1226, + "Ġusers": 1227, + "Ġmean": 1228, + "asing": 1229, + "ĠAm": 1230, + "Ġeduc": 1231, + "Ġalgorithm": 1232, + "Ġnetw": 1233, + "Ġcode": 1234, + "Write": 1235, + "ov": 1236, + "-d": 1237, + "oura": 1238, + "ĠHowever": 1239, + "uture": 1240, + "view": 1241, + "Ġindu": 1242, + "Ġproducts": 1243, + "ected": 1244, + "ertain": 1245, + ";\\": 1246, + "ĠAs": 1247, + "pr": 1248, + "aste": 1249, + "Ġoper": 1250, + "Ġ$": 1251, + "avi": 1252, + "self": 1253, + "Ġ<": 1254, + "Ġindust": 1255, + "Ġgu": 1256, + "Ġothers": 1257, + "Ex": 1258, + "ian": 1259, + "Ġ\"\\\"": 1260, + "-f": 1261, + "nces": 1262, + "Ġfil": 1263, + "Ġrespons": 1264, + "rol": 1265, + "Ġcap": 1266, + "Ġbefore": 1267, + "vern": 1268, + "Ġcomplex": 1269, + "lus": 1270, + "ribut": 1271, + "ats": 1272, + "Ġpositive": 1273, + "oh": 1274, + "Ġlo": 1275, + "Ġgroup": 1276, + "Ġfound": 1277, + "ee": 1278, + "ogn": 1279, + "Ġsw": 1280, + "Ġindividuals": 1281, + "Ġpract": 1282, + "Ġenc": 1283, + "Ġshare": 1284, + "raph": 1285, + "Ġrange": 1286, + "Ġsun": 1287, + "\\t": 1288, + "Ġproviding": 1289, + "icle": 1290, + "Ġdem": 1291, + "Ġplace": 1292, + "Ġaud": 1293, + "joy": 1294, + "Ġmust": 1295, + "els": 1296, + "ery": 1297, + "One": 1298, + "Ġfamily": 1299, + "Ġfuture": 1300, + "less": 1301, + "rent": 1302, + "Ġproblem": 1303, + "Ġessential": 1304, + "rodu": 1305, + "ired": 1306, + "Ġreducing": 1307, + "ism": 1308, + "Ġwarm": 1309, + "ray": 1310, + "Ġability": 1311, + "Ġstrong": 1312, + "Ġalways": 1313, + "Ġresources": 1314, + "Ġbenefits": 1315, + "Ġstrateg": 1316, + "Ġinvolves": 1317, + "Ġassist": 1318, + "erest": 1319, + "nA": 1320, + "ression": 1321, + "Ġ[": 1322, + "ilities": 1323, + "Ġsteps": 1324, + "verall": 1325, + "Ġshow": 1326, + "obal": 1327, + "\\nF": 1328, + "Ġland": 1329, + "ĠHere": 1330, + "Ġbusinesses": 1331, + "ĠEn": 1332, + "pportun": 1333, + "Ġmeas": 1334, + "Ġreturn": 1335, + "Ġdig": 1336, + "Ġhist": 1337, + "yth": 1338, + "Ġcent": 1339, + "Ġable": 1340, + "Ġwithout": 1341, + "yc": 1342, + "plain": 1343, + "Ġrelations": 1344, + "Ġservices": 1345, + "-c": 1346, + "Ġtest": 1347, + "arth": 1348, + "Ġcommunication": 1349, + "Ġintern": 1350, + "new": 1351, + "Ġsit": 1352, + "Ġinvest": 1353, + "Ġcaus": 1354, + "Ġunt": 1355, + "Ġfriends": 1356, + "Ġchanges": 1357, + "cri": 1358, + "dit": 1359, + "ĠBy": 1360, + "ĠYou": 1361, + "Ġmeans": 1362, + "Ġrese": 1363, + "ool": 1364, + "ted": 1365, + "elligence": 1366, + "ains": 1367, + "pping": 1368, + "Ġbel": 1369, + "Ġrepresent": 1370, + "Ġhapp": 1371, + "Ġser": 1372, + "Ġperformance": 
1373, + "Ġopportun": 1374, + "Ġtemper": 1375, + "ĠShe": 1376, + "Ġfu": 1377, + "ix": 1378, + "bot": 1379, + "Ġwrit": 1380, + "Ġbehavi": 1381, + "Ġproject": 1382, + "ĠWith": 1383, + "ivers": 1384, + "day": 1385, + "Ġphysical": 1386, + "izing": 1387, + "Ġactiv": 1388, + "Ġwithin": 1389, + "Ġinterest": 1390, + "olution": 1391, + "wards": 1392, + "ffic": 1393, + "Ġquick": 1394, + "Ġpublic": 1395, + "Ġgrowth": 1396, + "Ġcho": 1397, + "Ġrelationship": 1398, + "Ġuntil": 1399, + "Ġhelps": 1400, + "Ġstudents": 1401, + "Ġfiel": 1402, + "imes": 1403, + "ulation": 1404, + "ibility": 1405, + "elf": 1406, + "Ġful": 1407, + "Ġsub": 1408, + "ank": 1409, + "ides": 1410, + "Ġskills": 1411, + "Ġclimate": 1412, + "Given": 1413, + "Ġpar": 1414, + "Ġclear": 1415, + "irt": 1416, + "Name": 1417, + "Ġpresent": 1418, + "Ġtri": 1419, + "Ġchalleng": 1420, + "ream": 1421, + "Ġlay": 1422, + "Ġmarketing": 1423, + "Ġsummary": 1424, + "Ġchild": 1425, + "Ġsaf": 1426, + "Ġsure": 1427, + "Ġsame": 1428, + "Ġmu": 1429, + "Ġemail": 1430, + "bon": 1431, + "Ġsomet": 1432, + "```\\": 1433, + "Ġcurrent": 1434, + "amp": 1435, + "ences": 1436, + "ĠRe": 1437, + "Ġtransport": 1438, + "me": 1439, + "-p": 1440, + "action": 1441, + "ĠEx": 1442, + "Ġyears": 1443, + "Ġcomb": 1444, + "hor": 1445, + "anced": 1446, + "ty": 1447, + "Ġlove": 1448, + "Ġgreen": 1449, + "Ġpopular": 1450, + "Ġless": 1451, + "Ġdra": 1452, + "Ġcontrol": 1453, + "Ġaff": 1454, + "Ġconsum": 1455, + "Ġgame": 1456, + "ental": 1457, + "ights": 1458, + "arget": 1459, + "omes": 1460, + "ox": 1461, + "icult": 1462, + "erc": 1463, + "Ġgoals": 1464, + "ancial": 1465, + "tle": 1466, + "Ġgovern": 1467, + "Ġnumbers": 1468, + "Ġfive": 1469, + "Ġstand": 1470, + "Ġsearch": 1471, + "Ġefficient": 1472, + "Ġwal": 1473, + "Ġname": 1474, + "ath": 1475, + "Ġheart": 1476, + "Ġduring": 1477, + "rect": 1478, + "Ġoverall": 1479, + "ython": 1480, + "Ġallows": 1481, + "Ġcity": 1482, + "ave": 1483, + "vant": 1484, + "aterial": 1485, + "Ġwide": 1486, + "Ġmus": 1487, + "ificial": 1488, + "Ġhard": 1489, + "ĠTh": 1490, + "oose": 1491, + "Ġglobal": 1492, + "aj": 1493, + "Ġter": 1494, + "Ġdifficult": 1495, + "Ġline": 1496, + "ĠAl": 1497, + "care": 1498, + "ived": 1499, + "Ġregular": 1500, + "Ġgr": 1501, + "),": 1502, + "lement": 1503, + "Ġhim": 1504, + "Ġunique": 1505, + "Ġenjoy": 1506, + "Ġmeaning": 1507, + "Ġopen": 1508, + "Ġi": 1509, + "abor": 1510, + "Ġarea": 1511, + "Ġitems": 1512, + "Ġclean": 1513, + "ditionally": 1514, + "oid": 1515, + "ĠWe": 1516, + "Ġbeaut": 1517, + "Ġmeet": 1518, + "iple": 1519, + "Ġstatement": 1520, + "Ġagain": 1521, + "ysis": 1522, + "Ġfac": 1523, + "Ġsources": 1524, + "Ġbody": 1525, + "Ġalgorithms": 1526, + "Ġaudience": 1527, + "Ġwant": 1528, + "Ġlog": 1529, + "Ġmaintain": 1530, + "Ġactivities": 1531, + "Ġmove": 1532, + "Ġcult": 1533, + "oney": 1534, + "Ġtarget": 1535, + "\\nB": 1536, + "Ġmaterial": 1537, + "Ġcreating": 1538, + "Ġstructure": 1539, + "atform": 1540, + "ext": 1541, + "Ġexperien": 1542, + "Ġvalues": 1543, + "ead": 1544, + "ohn": 1545, + "Ġhealthy": 1546, + "ross": 1547, + "Ġinteg": 1548, + "Ġresearch": 1549, + "atch": 1550, + "ooking": 1551, + "Ġrole": 1552, + "Ġprovides": 1553, + "iety": 1554, + "ists": 1555, + "Ġfinancial": 1556, + "ories": 1557, + "dent": 1558, + "Ġer": 1559, + "Ġarticle": 1560, + "Ġelements": 1561, + "Ġaddress": 1562, + "Ġconn": 1563, + "ĠUse": 1564, + "mp": 1565, + "Ġeasy": 1566, + "Ġneg": 1567, + "Ġcolor": 1568, + "Ġcalcul": 1569, + "Explain": 1570, + "ĠPl": 1571, + "pect": 1572, + "ince": 1573, + "ale": 1574, + "Ġrisk": 1575, + 
"curity": 1576, + "ert": 1577, + "Ġfeed": 1578, + "Ġevent": 1579, + "vers": 1580, + "ples": 1581, + "Ġlevels": 1582, + "Ġbi": 1583, + "Ġstay": 1584, + "Ġplatform": 1585, + "Ġbreak": 1586, + "back": 1587, + "Ġsat": 1588, + "\\nOverall": 1589, + "Ġeducation": 1590, + "\\nC": 1591, + "Ġcarbon": 1592, + "--------": 1593, + "ape": 1594, + "Ġprevent": 1595, + "Ġaddition": 1596, + "Ġstress": 1597, + "ral": 1598, + "ource": 1599, + "rus": 1600, + "Ġcome": 1601, + "Ġrecogn": 1602, + "ĠUnited": 1603, + "Ġproper": 1604, + "Ġpoll": 1605, + "dentify": 1606, + "Ġunderstanding": 1607, + "Ġdecisions": 1608, + "ict": 1609, + "Ġdire": 1610, + "Ġbehavior": 1611, + "Ġ*": 1612, + "\\nI": 1613, + "Ġmess": 1614, + "Ġanimals": 1615, + "Ġsl": 1616, + "Ġwind": 1617, + "Ġbas": 1618, + "Ġpain": 1619, + "Ġleading": 1620, + "ern": 1621, + "ger": 1622, + "Ġpres": 1623, + "Ġthough": 1624, + "Ġinteract": 1625, + "yle": 1626, + "Ġdoes": 1627, + "Ġhead": 1628, + "Ġintelligence": 1629, + "orts": 1630, + "Ġbecome": 1631, + "Ġrun": 1632, + "aring": 1633, + "Ġimplement": 1634, + "Ġaction": 1635, + "oot": 1636, + "terns": 1637, + "Ġprotect": 1638, + "eric": 1639, + "Ġflow": 1640, + "Ġemot": 1641, + "cessary": 1642, + "urate": 1643, + "Ġsuggest": 1644, + "Ġprogram": 1645, + "Ġphr": 1646, + "Ġhealthcare": 1647, + "ention": 1648, + "Ġsust": 1649, + "Ġwhy": 1650, + "Ġaccurate": 1651, + "lu": 1652, + "Ġhig": 1653, + "Ġreach": 1654, + "Ġallowing": 1655, + "Ġtravel": 1656, + "Ġrequire": 1657, + "Ġareas": 1658, + "Ġdeep": 1659, + "He": 1660, + "Ġfew": 1661, + "Ġself": 1662, + "oun": 1663, + "Ġ#": 1664, + "osp": 1665, + "str": 1666, + "Ġminut": 1667, + "Ġdecision": 1668, + "ĠThere": 1669, + "ances": 1670, + "Ġquality": 1671, + "Ġavail": 1672, + "Ġspace": 1673, + "Ġsomething": 1674, + "Ġweb": 1675, + "Ġpatterns": 1676, + "Ġmot": 1677, + "oring": 1678, + "isf": 1679, + "Ġanother": 1680, + "Ġaccount": 1681, + "\\nW": 1682, + "uss": 1683, + "Ġmaj": 1684, + "uation": 1685, + "Ġsustain": 1686, + "Ġautom": 1687, + "iques": 1688, + "issions": 1689, + "verse": 1690, + "Ġconcept": 1691, + "Ġsecurity": 1692, + "Ġthose": 1693, + "Ġprofess": 1694, + "Ġshort": 1695, + "Ġnight": 1696, + "ength": 1697, + "apt": 1698, + "ex": 1699, + "ĠAdditionally": 1700, + "Ġtaking": 1701, + "Ġtoo": 1702, + "agn": 1703, + "Ġsimple": 1704, + "lusion": 1705, + "iency": 1706, + "ash": 1707, + "ours": 1708, + "Ġpa": 1709, + "Ġlit": 1710, + "ĠSp": 1711, + "iting": 1712, + "Ġdon": 1713, + "Ġlim": 1714, + "lish": 1715, + "mat": 1716, + "aves": 1717, + "ledge": 1718, + "ditional": 1719, + "inc": 1720, + "Ġevents": 1721, + "Ġoffer": 1722, + "thing": 1723, + "Ġworking": 1724, + "Ġanalysis": 1725, + "Ġachieve": 1726, + "Ġpie": 1727, + "Ġbook": 1728, + "Ġfre": 1729, + "Ġmuch": 1730, + "oon": 1731, + "Ġtry": 1732, + "esp": 1733, + "Ġwaste": 1734, + "face": 1735, + "Ġear": 1736, + "Ġfru": 1737, + "Ġtransportation": 1738, + "chool": 1739, + "Ġtechniques": 1740, + "Ġprogramm": 1741, + "ĠEarth": 1742, + "Ġpredict": 1743, + "Ġnever": 1744, + "ws": 1745, + "ument": 1746, + "imately": 1747, + "ared": 1748, + "Ġparticular": 1749, + "Ġtowards": 1750, + "Ġeconomic": 1751, + "Ġincreasing": 1752, + "Ġfast": 1753, + "iment": 1754, + "Ġnetwork": 1755, + "Ġcorrect": 1756, + "Ġmight": 1757, + "Ġoc": 1758, + "Ġbecause": 1759, + "ĠWh": 1760, + "az": 1761, + "play": 1762, + "Ġresults": 1763, + "Ġmanagement": 1764, + "Ġpurch": 1765, + "Ġsound": 1766, + "Ġpast": 1767, + "Ġtraining": 1768, + "____": 1769, + "ope": 1770, + "Ġengage": 1771, + "ourage": 1772, + "Ġsense": 1773, + "Ġfree": 1774, + "Ġpref": 
1775, + "ees": 1776, + "Ġcountries": 1777, + "ney": 1778, + "anies": 1779, + "Ġafter": 1780, + "Ġmind": 1781, + "Ġexc": 1782, + "ĠOnce": 1783, + "ĠĠĠĠĠĠĠĠĠĠĠ": 1784, + "Ġcomplete": 1785, + "Ġimm": 1786, + "Ġest": 1787, + "Ġgenerate": 1788, + "verb": 1789, + "ĠDe": 1790, + "'m": 1791, + "Ġtools": 1792, + "redients": 1793, + "Ġmajor": 1794, + "ently": 1795, + "Ġcontribut": 1796, + "leep": 1797, + "Ġpoints": 1798, + "ditions": 1799, + "Ġfactors": 1800, + "Ġel": 1801, + "Ġnext": 1802, + "ium": 1803, + "oud": 1804, + "Ġcru": 1805, + "Ġreas": 1806, + "riate": 1807, + "ĠInd": 1808, + "Ġpromot": 1809, + "Ġhistory": 1810, + "Ġjour": 1811, + "Ġdue": 1812, + "Con": 1813, + "Ġveget": 1814, + "ency": 1815, + "ĠAmeric": 1816, + "Ġfra": 1817, + "Ġdifference": 1818, + "oard": 1819, + "lex": 1820, + "Ġequation": 1821, + "irtual": 1822, + "Ġcup": 1823, + "Ġforest": 1824, + "Ġnegative": 1825, + "Ġsecon": 1826, + "ones": 1827, + "Ġnature": 1828, + "Ġuses": 1829, + "ah": 1830, + "por": 1831, + "Ġsec": 1832, + "ording": 1833, + "Ġlast": 1834, + "ĠSome": 1835, + "Ġissues": 1836, + "Ġscient": 1837, + "Ġprint": 1838, + "ĠStates": 1839, + "over": 1840, + "Ġsatisf": 1841, + "Ġdevices": 1842, + "Ġdise": 1843, + "Ġtemperature": 1844, + "Ġfeedback": 1845, + "Ġnecessary": 1846, + "Ġemissions": 1847, + "mb": 1848, + "Ġlow": 1849, + "for": 1850, + "tal": 1851, + "Ġchallenges": 1852, + "Ġarray": 1853, + "Ġside": 1854, + "Ġengine": 1855, + "Ġboo": 1856, + "ata": 1857, + "Ġbelie": 1858, + "-m": 1859, + "Ġmultiple": 1860, + "Ġsing": 1861, + "Ġgovernment": 1862, + "ames": 1863, + "ified": 1864, + "Ġminutes": 1865, + "Ġsuccessful": 1866, + "Ġmoney": 1867, + "Ġquickly": 1868, + "Ġbir": 1869, + "Ġtypically": 1870, + "Ġpost": 1871, + "Ġprep": 1872, + "Ġknowledge": 1873, + "pped": 1874, + "actions": 1875, + "Ġmethods": 1876, + "Ġoptim": 1877, + "\\nP": 1878, + "Ġoutput": 1879, + "Ġfield": 1880, + "Ġtable": 1881, + "Ġbal": 1882, + "Ġcoll": 1883, + "Ġcharacters": 1884, + "volution": 1885, + "ords": 1886, + "ilar": 1887, + "ification": 1888, + "ane": 1889, + "Ġcell": 1890, + "Ġmil": 1891, + "ĠWhat": 1892, + "Ġsqu": 1893, + "Ġlives": 1894, + "ĠAr": 1895, + "Ġphrase": 1896, + "Ġnut": 1897, + "Ġdigital": 1898, + "Ġinternet": 1899, + "lass": 1900, + "ura": 1901, + "ommend": 1902, + "Ġtreat": 1903, + "Ġapprop": 1904, + "resh": 1905, + "urther": 1906, + "ĠOne": 1907, + "Ġvisual": 1908, + "ategor": 1909, + "Ġapproach": 1910, + "Ġcertain": 1911, + "Ġsho": 1912, + "val": 1913, + "Ġtask": 1914, + "ires": 1915, + "Ġappropriate": 1916, + "Ġvie": 1917, + "Ġdesigned": 1918, + "pose": 1919, + "**:": 1920, + "fort": 1921, + "Ġ|\\": 1922, + "Ġapplications": 1923, + "Ġpay": 1924, + "Ġnow": 1925, + "Ġheat": 1926, + "Ġindustry": 1927, + "pre": 1928, + "Ġeffectively": 1929, + "Ġpopulation": 1930, + "Ġopportunities": 1931, + " \\", + "Ġens ure", + "os s", + "ub lic", + "Ġit em", + "H ere", + "in ation", + "Ġde f", + "Des cribe", + "ion al", + "rou p", + "Ġcon f", + "Ġneed s", + "Ġcharact er", + "Ġvari ous", + "Ġle t", + "Ġapp lic", + "a ut", + "Ġj ob", + "ell ig", + "ĠC on", + "Ġb est", + "Ġf ore", + "Ġam ount", + "ro p", + "Ġbu ild", + "iqu e", + "ag ing", + "Ġem ploy", + "Ġre st", + "a ir", + "W hat", + "Ġto get", + "Ġway s", + "Ġident ify", + "Ġtoget her", + "Ġre al", + "Ġus ers", + "Ġme an", + "as ing", + "ĠA m", + "Ġed uc", + "Ġalgorith m", + "Ġn etw", + "Ġc ode", + "W rite", + "o v", + "- d", + "ou ra", + "ĠHow ever", + "ut ure", + "vie w", + "Ġin du", + "Ġproduct s", + "ect ed", + "er tain", + "; \\", + "ĠA s", + "p r", + "ast e", + "Ġo per", + "Ġ 
$", + "av i", + "sel f", + "Ġ <", + "Ġindu st", + "Ġg u", + "Ġother s", + "E x", + "i an", + "Ġ\" \\\"", + "- f", + "n ces", + "Ġf il", + "Ġresp ons", + "ro l", + "Ġc ap", + "Ġbe fore", + "ver n", + "Ġcomple x", + "l us", + "rib ut", + "at s", + "Ġpos itive", + "o h", + "Ġl o", + "Ġg roup", + "Ġf ound", + "e e", + "og n", + "Ġs w", + "Ġindividual s", + "Ġp ract", + "Ġen c", + "Ġsh are", + "ra ph", + "Ġr ange", + "Ġsu n", + "\\ t", + "Ġprovid ing", + "ic le", + "Ġde m", + "Ġpl ace", + "Ġa ud", + "j oy", + "Ġm ust", + "el s", + "er y", + "O ne", + "Ġfam ily", + "Ġf uture", + "l ess", + "re nt", + "Ġproble m", + "Ġess ential", + "ro du", + "i red", + "Ġredu cing", + "is m", + "Ġw arm", + "ra y", + "Ġab ility", + "Ġstr ong", + "Ġal ways", + "Ġres ources", + "Ġbenef its", + "Ġstr ateg", + "Ġinvol ves", + "Ġass ist", + "ere st", + "n A", + "ress ion", + "Ġ [", + "il ities", + "Ġstep s", + "ver all", + "Ġsh ow", + "ob al", + "\\n F", + "Ġl and", + "ĠH ere", + "Ġbusiness es", + "ĠE n", + "pport un", + "Ġme as", + "Ġret urn", + "Ġd ig", + "Ġh ist", + "y th", + "Ġc ent", + "Ġab le", + "Ġwith out", + "y c", + "pl ain", + "Ġrel ations", + "Ġserv ices", + "- c", + "Ġt est", + "ar th", + "Ġcommunic ation", + "Ġinter n", + "ne w", + "Ġs it", + "Ġinv est", + "Ġca us", + "Ġu nt", + "Ġfriend s", + "Ġchang es", + "c ri", + "d it", + "ĠB y", + "ĠY ou", + "Ġme ans", + "Ġre se", + "o ol", + "t ed", + "ellig ence", + "ain s", + "pp ing", + "Ġbe l", + "Ġrep resent", + "Ġha pp", + "Ġs er", + "Ġperform ance", + "Ġo pportun", + "Ġtem per", + "ĠS he", + "Ġf u", + "i x", + "b ot", + "Ġw rit", + "Ġbeh avi", + "Ġpro ject", + "ĠW ith", + "iv ers", + "d ay", + "Ġphys ical", + "iz ing", + "Ġact iv", + "Ġwith in", + "Ġint erest", + "ol ution", + "ward s", + "ff ic", + "Ġqu ick", + "Ġp ublic", + "Ġgrow th", + "Ġch o", + "Ġrelations hip", + "Ġunt il", + "Ġhelp s", + "Ġstud ents", + "Ġfi el", + "im es", + "ul ation", + "ib ility", + "el f", + "Ġf ul", + "Ġsu b", + "an k", + "id es", + "Ġsk ills", + "Ġcl imate", + "G iven", + "Ġp ar", + "Ġcle ar", + "ir t", + "N ame", + "Ġp resent", + "Ġt ri", + "Ġchall eng", + "re am", + "Ġl ay", + "Ġmarket ing", + "Ġsumm ary", + "Ġch ild", + "Ġsa f", + "Ġsu re", + "Ġs ame", + "Ġm u", + "Ġem ail", + "b on", + "Ġs omet", + "``` \\", + "Ġcur rent", + "am p", + "en ces", + "ĠR e", + "Ġtrans port", + "m e", + "- p", + "a ction", + "ĠE x", + "Ġyear s", + "Ġcom b", + "h or", + "anc ed", + "t y", + "Ġl ove", + "Ġg reen", + "Ġpop ular", + "Ġl ess", + "Ġd ra", + "Ġcont rol", + "Ġa ff", + "Ġcons um", + "Ġg ame", + "ent al", + "ight s", + "ar get", + "om es", + "o x", + "ic ult", + "er c", + "Ġgo als", + "anc ial", + "t le", + "Ġgo vern", + "Ġnum bers", + "Ġf ive", + "Ġst and", + "Ġse arch", + "Ġeffic ient", + "Ġw al", + "Ġn ame", + "at h", + "Ġhe art", + "Ġd uring", + "re ct", + "Ġover all", + "yth on", + "Ġallow s", + "Ġc ity", + "a ve", + "v ant", + "ater ial", + "Ġw ide", + "Ġm us", + "ific ial", + "Ġh ard", + "ĠT h", + "oo se", + "Ġgl obal", + "a j", + "Ġt er", + "Ġdiff icult", + "Ġl ine", + "ĠA l", + "c are", + "iv ed", + "Ġreg ular", + "Ġg r", + ") ,", + "le ment", + "Ġh im", + "Ġun ique", + "Ġen joy", + "Ġmean ing", + "Ġop en", + "Ġ i", + "ab or", + "Ġare a", + "Ġitem s", + "Ġcle an", + "dition ally", + "o id", + "ĠW e", + "Ġbe aut", + "Ġme et", + "ip le", + "Ġstate ment", + "Ġag ain", + "ys is", + "Ġf ac", + "Ġs ources", + "Ġb ody", + "Ġalgorith ms", + "Ġaud ience", + "Ġw ant", + "Ġl og", + "Ġmain tain", + "Ġactiv ities", + "Ġmo ve", + "Ġc ult", + "one y", + "Ġt arget", + "\\n B", + "Ġm 
aterial", + "Ġcreat ing", + "Ġstru cture", + "at form", + "e xt", + "Ġexper ien", + "Ġval ues", + "e ad", + "oh n", + "Ġhealth y", + "ro ss", + "Ġint eg", + "Ġrese arch", + "at ch", + "oo king", + "Ġro le", + "Ġprovid es", + "i ety", + "ist s", + "Ġfin ancial", + "or ies", + "d ent", + "Ġ er", + "Ġart icle", + "Ġele ments", + "Ġadd ress", + "Ġcon n", + "ĠU se", + "m p", + "Ġeas y", + "Ġne g", + "Ġcol or", + "Ġcal cul", + "Ex plain", + "ĠP l", + "p ect", + "in ce", + "al e", + "Ġris k", + "cur ity", + "er t", + "Ġfe ed", + "Ġev ent", + "v ers", + "pl es", + "Ġlevel s", + "Ġb i", + "Ġst ay", + "Ġpl atform", + "Ġbre ak", + "b ack", + "Ġs at", + "\\nO verall", + "Ġeduc ation", + "\\n C", + "Ġcar bon", + "---- ----", + "ap e", + "Ġpre vent", + "Ġadd ition", + "Ġst ress", + "r al", + "our ce", + "ru s", + "Ġcom e", + "Ġrec ogn", + "ĠUn ited", + "Ġpro per", + "Ġpol l", + "dent ify", + "Ġunderstand ing", + "Ġdecis ions", + "i ct", + "Ġd ire", + "Ġbehavi or", + "Ġ *", + "\\n I", + "Ġm ess", + "Ġanim als", + "Ġs l", + "Ġw ind", + "Ġb as", + "Ġp ain", + "Ġlead ing", + "er n", + "g er", + "Ġp res", + "Ġth ough", + "Ġinter act", + "y le", + "Ġdo es", + "Ġhe ad", + "Ġint elligence", + "ort s", + "Ġbec ome", + "Ġru n", + "ar ing", + "Ġimp lement", + "Ġa ction", + "o ot", + "ter ns", + "Ġprot ect", + "er ic", + "Ġf low", + "Ġem ot", + "cess ary", + "ur ate", + "Ġsu ggest", + "Ġprogra m", + "Ġph r", + "Ġhealth care", + "ent ion", + "Ġsu st", + "Ġwh y", + "Ġacc urate", + "l u", + "Ġh ig", + "Ġre ach", + "Ġallow ing", + "Ġtra vel", + "Ġrequ ire", + "Ġare as", + "Ġde ep", + "H e", + "Ġfe w", + "Ġs elf", + "ou n", + "Ġ #", + "os p", + "st r", + "Ġmin ut", + "Ġdecis ion", + "ĠThe re", + "an ces", + "Ġqu ality", + "Ġav ail", + "Ġsp ace", + "Ġsomet hing", + "Ġwe b", + "Ġpat terns", + "Ġm ot", + "or ing", + "is f", + "Ġan other", + "Ġacc ount", + "\\n W", + "us s", + "Ġm aj", + "u ation", + "Ġsust ain", + "Ġaut om", + "iqu es", + "iss ions", + "ver se", + "Ġcon cept", + "Ġse curity", + "Ġth ose", + "Ġprof ess", + "Ġsh ort", + "Ġn ight", + "eng th", + "a pt", + "e x", + "ĠAd ditionally", + "Ġt aking", + "Ġto o", + "ag n", + "Ġsim ple", + "lus ion", + "ien cy", + "as h", + "our s", + "Ġp a", + "Ġl it", + "ĠS p", + "it ing", + "Ġd on", + "Ġl im", + "l ish", + "m at", + "av es", + "led ge", + "dition al", + "in c", + "Ġev ents", + "Ġoff er", + "th ing", + "Ġwor king", + "Ġanal ysis", + "Ġachie ve", + "Ġp ie", + "Ġb ook", + "Ġf re", + "Ġmu ch", + "o on", + "Ġt ry", + "es p", + "Ġw aste", + "f ace", + "Ġe ar", + "Ġf ru", + "Ġtransport ation", + "ch ool", + "Ġtechn iques", + "Ġprogra mm", + "ĠE arth", + "Ġpredi ct", + "Ġne ver", + "w s", + "u ment", + "imate ly", + "are d", + "Ġpartic ular", + "Ġto wards", + "Ġeconom ic", + "Ġincre asing", + "Ġf ast", + "im ent", + "Ġnetw ork", + "Ġcor rect", + "Ġm ight", + "Ġo c", + "Ġbec ause", + "ĠW h", + "a z", + "pl ay", + "Ġresult s", + "Ġmanage ment", + "Ġpur ch", + "Ġs ound", + "Ġp ast", + "Ġtra ining", + "__ __", + "op e", + "Ġeng age", + "oura ge", + "Ġs ense", + "Ġf ree", + "Ġpre f", + "e es", + "Ġcount ries", + "ne y", + "an ies", + "Ġa fter", + "Ġm ind", + "Ġex c", + "ĠO nce", + "ĠĠĠĠ ĠĠĠĠĠĠĠ", + "Ġcomple te", + "Ġim m", + "Ġ est", + "Ġg enerate", + "ver b", + "ĠD e", + "' m", + "Ġtool s", + "redi ents", + "Ġmaj or", + "ent ly", + "Ġcont ribut", + "le ep", + "Ġpoint s", + "dit ions", + "Ġfact ors", + "Ġe l", + "Ġne xt", + "i um", + "ou d", + "Ġc ru", + "Ġre as", + "ri ate", + "ĠI nd", + "Ġprom ot", + "Ġhist ory", + "Ġj our", + "Ġd ue", + "C on", + "Ġve get", + "en cy", + 
"ĠAm eric", + "Ġf ra", + "Ġdiffere nce", + "o ard", + "le x", + "Ġequ ation", + "irt ual", + "Ġc up", + "Ġfore st", + "Ġneg ative", + "Ġse con", + "on es", + "Ġn ature", + "Ġus es", + "a h", + "p or", + "Ġse c", + "ord ing", + "Ġl ast", + "ĠS ome", + "Ġiss ues", + "Ġsc ient", + "Ġpr int", + "ĠSt ates", + "o ver", + "Ġsat isf", + "Ġdev ices", + "Ġdis e", + "Ġtemper ature", + "Ġfeed back", + "Ġne cessary", + "Ġem issions", + "m b", + "Ġl ow", + "f or", + "t al", + "Ġchalleng es", + "Ġar ray", + "Ġs ide", + "Ġeng ine", + "Ġb oo", + "at a", + "Ġbel ie", + "- m", + "Ġmult iple", + "Ġs ing", + "Ġgovern ment", + "am es", + "if ied", + "Ġminut es", + "Ġsuccess ful", + "Ġm oney", + "Ġquick ly", + "Ġb ir", + "Ġtyp ically", + "Ġp ost", + "Ġpre p", + "Ġknow ledge", + "pp ed", + "a ctions", + "Ġmethod s", + "Ġopt im", + "\\n P", + "Ġout put", + "Ġfiel d", + "Ġt able", + "Ġb al", + "Ġcol l", + "Ġcharact ers", + "v olution", + "or ds", + "il ar", + "ific ation", + "an e", + "Ġc ell", + "Ġm il", + "ĠW hat", + "Ġs qu", + "Ġl ives", + "ĠA r", + "Ġphr ase", + "Ġn ut", + "Ġdig ital", + "Ġintern et", + "l ass", + "u ra", + "omm end", + "Ġt reat", + "Ġappro p", + "res h", + "ur ther", + "ĠO ne", + "Ġvis ual", + "ate gor", + "Ġappro ach", + "Ġc ertain", + "Ġsh o", + "v al", + "Ġtas k", + "i res", + "Ġapprop riate", + "Ġv ie", + "Ġdesign ed", + "p ose", + "** :", + "f ort", + "Ġ| \\", + "Ġapplic ations", + "Ġp ay", + "Ġn ow", + "Ġhe at", + "Ġindust ry", + "p re", + "Ġeffective ly", + "Ġpop ulation", + "Ġopportun ities", + "< /", + "ĠT o", + "Ġup d", + "Ġinclud es", + "ĠE ng", + "Ġtyp es", + "Ġup on", + "Ġconsid er", + "le t", + "Ġg en", + "og raph", + "pl ace", + "Ġt imes", + "Ġar g", + "C omp", + "ĠG o", + "Ġre ce", + "Ġchild ren", + "Ġtra ck", + "Ġsome one", + "w ord", + "Ġyou ng", + "Ġcon ditions", + "Ġtra ditional", + "Ġmodel s", + "I dentify", + "Ġc amp", + "Ġm akes", + "ist ic", + "Ġar r", + "Ġc ard", + "ut ions", + "l t", + "Ġo ld", + "Ġide as", + "Ġe y", + "Ġt ree", + "Ġiss ue", + "Ġh arm", + "Ġavail able", + "Ġc r", + "Ġpower ful", + "n ov", + "Ġmo vie", + "Ġwe ather", + "Ġsk y", + "Ġquest ions", + "e et", + "Ġact ivity", + "Ġbra nd", + "is hed", + "Ġanaly ze", + "ĠS h", + "Ġen h", + "av or", + "Ġbe g", + "Ġs chool", + "i ate", + "Ġeas ier", + "Ġinf lu", + "Ġn on", + "Ġstud y", + "Ġl ook", + "Ġsol ution", + "Ġle g", + "Ġcon st", + "H ow", + "Ġcomp et" + ] + } +} \ No newline at end of file diff --git a/tests/torchtune/models/qwen2/__init__.py b/tests/torchtune/models/qwen2/__init__.py new file mode 100644 index 0000000000..2e41cd717f --- /dev/null +++ b/tests/torchtune/models/qwen2/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/tests/torchtune/models/qwen2/test_lora_qwen2.py b/tests/torchtune/models/qwen2/test_lora_qwen2.py new file mode 100644 index 0000000000..405397b814 --- /dev/null +++ b/tests/torchtune/models/qwen2/test_lora_qwen2.py @@ -0,0 +1,187 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest +import torch +from torch import nn + +from tests.test_utils import assert_expected, fixed_init_model +from torchtune.models.qwen2 import qwen2, lora_qwen2 +from torchtune.models.qwen2._component_builders import lora_qwen2_self_attention +from torchtune.modules.peft import LoRALinear +from torchtune.utils.seed import set_seed + +RANK = 4 +ALPHA = 1.0 +BSZ = 2 +SEQ_LEN = 32 +EMBED_DIM = 64 +INTERMEDIATE_DIM = 168 +NUM_HEADS = 4 +NUM_KV_HEADS = 2 +MAX_SEQ_LEN = 64 + + +@pytest.fixture(autouse=True) +def random(): + set_seed(16) + + +class TestLoRAQwen2SelfAttention: + @pytest.fixture + def inputs(self) -> torch.Tensor: + inputs = torch.randn(BSZ, SEQ_LEN, EMBED_DIM) + return inputs + + def get_lora_qwen2_self_attention(self, lora_modules): + lora_qwen2 = lora_qwen2_self_attention( + lora_modules=lora_modules, + embed_dim=EMBED_DIM, + num_heads=NUM_HEADS, + num_kv_heads=NUM_KV_HEADS, + max_seq_len=MAX_SEQ_LEN, + lora_rank=RANK, + lora_alpha=ALPHA, + ) + fixed_init_model(lora_qwen2) + return lora_qwen2 + + def test_empty_lora_modules(self): + with pytest.raises(ValueError, match="Must pass one or more of"): + _ = self.get_lora_qwen2_self_attention([]) + + @pytest.mark.parametrize( + "lora_modules, expected", + [ + (["q_proj", "v_proj"], torch.tensor(83.6596)), + (["q_proj", "k_proj", "v_proj", "output_proj"], torch.tensor(129.4454)), + (["k_proj"], torch.tensor(69.3473)), + ], + ) + def test_forward(self, inputs, lora_modules, expected): + lora_qwen2_sa = self.get_lora_qwen2_self_attention(lora_modules) + actual = lora_qwen2_sa(inputs) + assert_expected(actual.shape, (BSZ, SEQ_LEN, EMBED_DIM)) + assert_expected(actual.mean(), expected, atol=1e-4, rtol=1e-6) + + +class TestLoRAQwen2: + @pytest.fixture + def vocab_size(self): + return 50 + + @pytest.fixture + def inputs(self, vocab_size): + return torch.randint(low=0, high=vocab_size, size=(BSZ, SEQ_LEN)) + + def get_lora_qwen2( + self, + lora_modules, + apply_lora_to_mlp, + apply_lora_to_output, + vocab_size, + reset_norm=True, + quantize_base=False, + embed_dim=EMBED_DIM, + dtype=None, + ): + num_layers = 3 + model = lora_qwen2( + lora_attn_modules=lora_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=NUM_HEADS, + num_kv_heads=NUM_KV_HEADS, + embed_dim=embed_dim, + intermediate_dim=INTERMEDIATE_DIM, + max_seq_len=MAX_SEQ_LEN, + lora_rank=RANK, + lora_alpha=ALPHA, + quantize_base=quantize_base, + ) + # To make final outputs less trivial + if reset_norm: + model.norm = nn.Identity() + + # dtype=None means to just read dtype from parameters + # in the model. This dtype is set explicitly to bf16 currently + # when initializing QLoRA models, as ops such as `arange` aren't + # yet supported with the actual nf4 tensor dtype yet. 
+ fixed_init_model(model, dtype=dtype) + + return model + + def get_ref_qwen2(self, vocab_size, embed_dim=EMBED_DIM): + num_layers = 3 + model = qwen2( + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=NUM_HEADS, + num_kv_heads=NUM_KV_HEADS, + embed_dim=embed_dim, + intermediate_dim=INTERMEDIATE_DIM, + max_seq_len=MAX_SEQ_LEN, + ) + return model + + @pytest.mark.parametrize( + "lora_modules, apply_lora_to_mlp, apply_lora_to_output, expected", + [ + (["q_proj", "v_proj"], False, False, torch.tensor(3736558.0)), + ( + ["q_proj", "k_proj", "v_proj", "output_proj"], + True, + False, + torch.tensor(13962364.0), + ), + (["k_proj"], True, True, torch.tensor(21335964.0)), + ], + ) + def test_forward( + self, + vocab_size, + inputs, + lora_modules, + apply_lora_to_mlp, + apply_lora_to_output, + expected, + ): + model = self.get_lora_qwen2( + lora_modules, apply_lora_to_mlp, apply_lora_to_output, vocab_size + ) + actual = model(inputs) + assert_expected(actual.shape, (BSZ, SEQ_LEN, vocab_size)) + assert_expected(actual.mean(), expected, atol=1e-4, rtol=1e-6) + + @pytest.mark.parametrize( + "lora_modules, apply_lora_to_mlp, apply_lora_to_output", + [ + (["q_proj", "v_proj"], True, False), + (["q_proj", "k_proj", "v_proj", "output_proj"], False, False), + (["k_proj"], True, True), + ], + ) + def test_lora_qwen2_state_dict_parity( + self, lora_modules, apply_lora_to_mlp, apply_lora_to_output, vocab_size + ): + lora_qwen2 = self.get_lora_qwen2( + lora_modules, + apply_lora_to_mlp, + apply_lora_to_output, + vocab_size, + reset_norm=False, + ) + ref_qwen2 = self.get_ref_qwen2(vocab_size) + # Ensure ref_qwen2 state_dict can be loaded into lora_qwen2 with only "lora" + # keys missing. + ref_qwen2_state_dict = ref_qwen2.state_dict() + missing, unexpected = lora_qwen2.load_state_dict( + ref_qwen2_state_dict, strict=False + ) + assert not unexpected + assert all(["lora" in key for key in missing]) diff --git a/tests/torchtune/models/qwen2/test_qwen2.py b/tests/torchtune/models/qwen2/test_qwen2.py new file mode 100644 index 0000000000..e43e62aa4d --- /dev/null +++ b/tests/torchtune/models/qwen2/test_qwen2.py @@ -0,0 +1,48 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest +import torch +from tests.test_utils import fixed_init_model +from torchtune.models.qwen2 import qwen2 +from torchtune.utils.seed import set_seed + +EMBED_DIM = 128 +INTER_DIM = 256 +NUM_LAYERS = 4 +NUM_HEADS = 16 +NUM_KV_HEADS = 8 +VOCAB_SIZE = 32000 +MAX_SEQ_LEN = 2048 +BSZ = 2 +SEQ_LEN = 100 + + +@pytest.fixture(autouse=True) +def random(): + set_seed(16) + + +class TestQwen2: + @pytest.fixture + def inputs(self): + return torch.randint(0, VOCAB_SIZE, (BSZ, SEQ_LEN)) + + def test_forward(self, inputs): + model = qwen2( + vocab_size=VOCAB_SIZE, + num_layers=NUM_LAYERS, + num_heads=NUM_HEADS, + num_kv_heads=NUM_KV_HEADS, + embed_dim=EMBED_DIM, + intermediate_dim=INTER_DIM, + max_seq_len=MAX_SEQ_LEN, + ) + fixed_init_model(model, min_val=-0.25, max_val=0.5) + actual = model(inputs) + expected = torch.tensor(3.9763) + assert actual.shape == (BSZ, SEQ_LEN, VOCAB_SIZE) + torch.testing.assert_close(actual.mean(), expected, atol=1e-4, rtol=1e-4) diff --git a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py new file mode 100644 index 0000000000..118187f735 --- /dev/null +++ b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py @@ -0,0 +1,282 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from pathlib import Path + +import pytest +from torchtune.data import Message +from torchtune.models.qwen2 import qwen2_tokenizer + +ASSETS = Path(__file__).parent.parent.parent.parent / "assets" + + +class TestQwen2Tokenizer: + @pytest.fixture + def tokenizer(self): + # tiny_bpe_tokenizer.json is a pretrained tokenizers BPE tokenizer model. + return qwen2_tokenizer(str(ASSETS / "tiny_bpe_tokenizer.json")) + + def test_tokenize_messages(self, tokenizer): + messages = [ + Message( + role="user", + content="Below is an instruction that describes a task. Write a response " + "that appropriately completes the request.\n\n### Instruction:\nGenerate " + "a realistic dating profile bio.\n\n### Response:\n", + masked=True, + ), + Message( + role="assistant", + content="I'm an outgoing and friendly person who loves spending time with " + "friends and family. I'm also a big-time foodie and love trying out new " + "restaurants and different cuisines. I'm a big fan of the arts and enjoy " + "going to museums and galleries. 
I'm looking for someone who shares my " + "interest in exploring new places, as well as someone who appreciates a " + "good conversation over coffee.", + ), + ] + tokens, mask = tokenizer.tokenize_messages(messages) + expected_tokens = [ + 2001, + 273, + 105, + 94, + 58, + 90, + 6, + 83, + 574, + 68, + 6, + 25, + 1032, + 757, + 480, + 6, + 11, + 1032, + 661, + 83, + 144, + 6, + 25, + 1032, + 33, + 214, + 174, + 156, + 194, + 130, + 197, + 184, + 446, + 789, + 113, + 98, + 1914, + 13, + 346, + 788, + 98, + 706, + 102, + 182, + 184, + 1916, + 176, + 762, + 83, + 113, + 103, + 874, + 269, + 160, + 77, + 145, + 2, + 2, + 2, + 483, + 197, + 349, + 77, + 885, + 98, + 1226, + 1960, + 348, + 114, + 1123, + 399, + 1583, + 78, + 160, + 77, + 145, + 2, + 2, + 2, + 360, + 1733, + 102, + 182, + 349, + 77, + 6, + 92, + 60, + 2002, + 94, + 2001, + 397, + 251, + 249, + 94, + 58, + 90, + 6, + 83, + 574, + 68, + 6, + 25, + 1032, + 757, + 480, + 6, + 11, + 1032, + 661, + 83, + 144, + 6, + 25, + 111, + 40, + 1791, + 194, + 453, + 70, + 78, + 114, + 120, + 967, + 176, + 618, + 628, + 1275, + 794, + 294, + 1095, + 445, + 212, + 1356, + 120, + 1299, + 13, + 223, + 1791, + 451, + 98, + 127, + 181, + 1047, + 375, + 915, + 380, + 120, + 1448, + 1732, + 114, + 453, + 447, + 1219, + 64, + 187, + 921, + 120, + 742, + 107, + 84, + 122, + 893, + 13, + 223, + 1791, + 98, + 127, + 181, + 123, + 124, + 131, + 103, + 744, + 82, + 120, + 1506, + 416, + 114, + 128, + 1429, + 182, + 253, + 82, + 120, + 163, + 330, + 105, + 262, + 13, + 223, + 1791, + 155, + 1551, + 171, + 1951, + 628, + 296, + 64, + 237, + 886, + 1390, + 130, + 883, + 1678, + 447, + 306, + 279, + 113, + 11, + 215, + 785, + 215, + 1951, + 628, + 378, + 101, + 66, + 72, + 593, + 98, + 984, + 208, + 1580, + 167, + 510, + 737, + 318, + 1278, + 13, + 1, + 92, + 60, + 2002, + 94, + 2000 + ] + expected_mask = [True] * 90 + [False] * 146 + assert expected_tokens == tokens + assert expected_mask == mask diff --git a/torchtune/models/qwen2/__init__.py b/torchtune/models/qwen2/__init__.py new file mode 100644 index 0000000000..c01f873c25 --- /dev/null +++ b/torchtune/models/qwen2/__init__.py @@ -0,0 +1,31 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from ._component_builders import lora_qwen2, qwen2 # noqa +from ._convert_weights import qwen2_hf_to_tune, qwen2_tune_to_hf # noqa +from ._positional_embeddings import Qwen2RotaryPositionalEmbeddings +from ._model_builders import ( # noqa + qwen2_7b, + qwen2_tokenizer, + lora_qwen2_7b, + qlora_qwen2_7b, + # TODO +) + +__all__ = [ + "qwen2_7b", + "qwen2_tokenizer", + "lora_qwen2_7b", + "qlora_qwen2_7b", + + "qwen2", + "lora_qwen2", + + "qwen2_hf_to_tune", + "qwen2_tune_to_hf", + + "Qwen2RotaryPositionalEmbeddings", +] diff --git a/torchtune/models/qwen2/_component_builders.py b/torchtune/models/qwen2/_component_builders.py new file mode 100644 index 0000000000..e986c76502 --- /dev/null +++ b/torchtune/models/qwen2/_component_builders.py @@ -0,0 +1,426 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from functools import partial +from typing import List +from torchtune.modules.common_utils import reparametrize_as_dtype_state_dict_post_hook + +from torch import nn + +from torchtune.models.qwen2.transformer import Qwen2TransformerDecoder +from torchtune.models.qwen2._positional_embeddings import Qwen2RotaryPositionalEmbeddings + +from torchtune.modules import ( + CausalSelfAttention, + FeedForward, + RMSNorm, + TransformerDecoderLayer, +) + + +from torchtune.modules.peft import LORA_ATTN_MODULES, LoRALinear + +""" +Component builders for the Qwen2 model and popular variants such as LoRA. + +torchtune provides composable building blocks. Builder functions help +stitch these building blocks into higher-level components. This design has +two benefits: +- The building blocks themselves are very flexible. For example, ``CausalSelfAttention`` +can take either nn.Linear or nn.LoRALinear for ``q_proj``. +- Builder functions expose a set of configurable params which keep the constructors of +the building blocks simple. +""" + + +def qwen2( + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + intermediate_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + norm_eps: float = 1e-5, + rope_base: float = 1_000_000.0, + tie_word_embeddings: bool = False, +) -> Qwen2TransformerDecoder: + """ + Build the decoder associated with the Qwen2 model. This includes: + - Token embeddings + - num_layers number of TransformerDecoderLayer blocks + - RMS Norm layer applied to the output of the transformer + - Final projection into token space + + Args: + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + rope_base (float): the base period of the RoPE embeddings. + tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. 
+ + Returns: + Qwen2TransformerDecoder: Instantiation of Qwen2 model. + """ + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + + rope = Qwen2RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = CausalSelfAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=nn.Linear(embed_dim, num_heads * head_dim, bias=True), + k_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=True), + v_proj=nn.Linear(embed_dim, num_kv_heads * head_dim, bias=True), + output_proj=nn.Linear(embed_dim, embed_dim, bias=False), + pos_embeddings=rope, + kv_cache=None, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + mlp = qwen2_mlp(dim=embed_dim, hidden_dim=intermediate_dim) + layer = TransformerDecoderLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + output_proj = None if tie_word_embeddings else nn.Linear(embed_dim, vocab_size, bias=False) + return Qwen2TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + +def qwen2_mlp(dim: int, hidden_dim: int) -> FeedForward: + """ + Build the MLP layer associated with the Qwen2 model. + """ + gate_proj = nn.Linear(dim, hidden_dim, bias=False) + down_proj = nn.Linear(hidden_dim, dim, bias=False) + up_proj = nn.Linear(dim, hidden_dim, bias=False) + return FeedForward(gate_proj=gate_proj, down_proj=down_proj, up_proj=up_proj) + + +def lora_qwen2( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + *, + # qwen2 args + vocab_size: int, + num_layers: int, + num_heads: int, + num_kv_heads: int, + embed_dim: int, + intermediate_dim: int, + max_seq_len: int, + attn_dropout: float = 0.0, + norm_eps: float = 1e-5, + rope_base: float = 1_000_000.0, + tie_word_embeddings: bool = False, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + # Quantization args + quantize_base: bool = False, +) -> Qwen2TransformerDecoder: + """ + Return a version of Qwen2 (an instance of :func:`~torchtune.models.qwen2.transformer.Qwen2TransformerDecoder`) + with LoRA applied based on the passed in configuration. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + vocab_size (int): number of tokens in vocabulary. + num_layers (int): number of layers in the transformer decoder. + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. 
+ embed_dim (int): embedding dimension for self-attention + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + intermediate_dim (Optional[int]): intermediate dimension for MLP. If not specified, + this is computed using :func:`~torchtune.modules.scale_hidden_dim_for_mlp` + norm_eps (float): epsilon in RMS norms. + rope_base (float): the base period of the RoPE embeddings. + tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base: (bool): Whether to quantize base model weights or not. Only applied to base + weights within linear layers LoRA is applied to. The final output linear projection is not + supported for quantization currently. + + Returns: + Qwen2TransformerDecoder: Instantiation of Qwen2 model with LoRA applied to + a subset of the attention projections in each layer. + + """ + + self_attn = lora_qwen2_self_attention( + lora_modules=lora_attn_modules, + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + rope_base=rope_base, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + quantize_base=quantize_base, + ) + + if apply_lora_to_mlp: + mlp = lora_qwen2_mlp( + dim=embed_dim, + hidden_dim=intermediate_dim, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + quantize_base=quantize_base, + lora_dropout=lora_dropout, + ) + else: + mlp = qwen2_mlp(dim=embed_dim, hidden_dim=intermediate_dim) + + layer = TransformerDecoderLayer( + attn=self_attn, + mlp=mlp, + sa_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + mlp_norm=RMSNorm(dim=embed_dim, eps=norm_eps), + ) + + tok_embeddings = nn.Embedding(vocab_size, embed_dim) + + if tie_word_embeddings: + output_proj = None + else: + # TODO: quantize_base is not applied to final output_proj currently. 
+ output_proj = ( + LoRALinear(embed_dim, vocab_size, rank=lora_rank, alpha=lora_alpha, dropout=lora_dropout) + if apply_lora_to_output + else nn.Linear(embed_dim, vocab_size, bias=False) + ) + model = Qwen2TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) + + if quantize_base: + # For QLoRA, we reparametrize 4-bit tensors to higher precision, and offload to CPU on the fly + # so as to not increase peak memory + model._register_state_dict_hook( + partial( + reparametrize_as_dtype_state_dict_post_hook, + # TODO this is clowny, figure out a better way to get what precision the rest + # of the model is in + dtype=tok_embeddings.weight.dtype, + offload_to_cpu=True, + ) + ) + + return model + + +def lora_qwen2_self_attention( + lora_modules: List[LORA_ATTN_MODULES], + *, + # CausalSelfAttention args + embed_dim: int, + num_heads: int, + num_kv_heads: int, + max_seq_len: int, + attn_dropout: float = 0.0, + rope_base: float = 1_000_000.0, + # LoRA args + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + quantize_base: bool = False, +) -> CausalSelfAttention: + """ + Return an instance of :func:`~torchtune.modules.CausalSelfAttention` with LoRA + applied to a subset of its linear layers + + Args: + lora_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to. Options are ``{"q_proj", "k_proj", "v_proj", + "output_proj"}``. + embed_dim (int): embedding dimension for self-attention + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value + num_kv_heads (int): number of key and value heads. User should ensure + `num_heads` % `num_kv_heads` == 0. For standard MHA set `num_kv_heads` == `num_heads`, + for GQA `num_kv_heads` < `num_heads`, and for MQA set `num_kv_heads` == 1. + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + attn_dropout (float): dropout value passed onto scaled_dot_product_attention. + Default: 0.0 + rope_base (float): the base period of the RoPE embeddings. Default: 1_000_000.0 + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): LoRA dropout probability. Default: 0.0 + quantize_base (bool): Whether to quantize base model parameters for linear layers + LoRA is being applied to. Default is ``False``. + + Returns: + CausalSelfAttention: instantiation of self-attention module with LoRA + applied to a subset of Q, K, V, output projections. 
+ + Raises: + ValueError: If lora_modules arg is an empty list + """ + if not lora_modules: + raise ValueError( + f"Must pass one or more of {LORA_ATTN_MODULES} as lora_modules" + ) + + head_dim = embed_dim // num_heads + num_kv_heads = num_kv_heads if num_kv_heads else num_heads + q_proj = ( + LoRALinear( + embed_dim, + num_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=True, + quantize_base=quantize_base, + ) + if "q_proj" in lora_modules + else nn.Linear(embed_dim, num_heads * head_dim, bias=True) + ) + k_proj = ( + LoRALinear( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=True, + quantize_base=quantize_base, + ) + if "k_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=True) + ) + v_proj = ( + LoRALinear( + embed_dim, + num_kv_heads * head_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + use_bias=True, + quantize_base=quantize_base, + ) + if "v_proj" in lora_modules + else nn.Linear(embed_dim, num_kv_heads * head_dim, bias=True) + ) + output_proj = ( + LoRALinear( + embed_dim, + embed_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + if "output_proj" in lora_modules + else nn.Linear(embed_dim, embed_dim, bias=False) + ) + rope = Qwen2RotaryPositionalEmbeddings(dim=head_dim, max_seq_len=max_seq_len, base=rope_base) + self_attn = CausalSelfAttention( + embed_dim=embed_dim, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + head_dim=head_dim, + q_proj=q_proj, + k_proj=k_proj, + v_proj=v_proj, + output_proj=output_proj, + pos_embeddings=rope, + kv_cache=None, + max_seq_len=max_seq_len, + attn_dropout=attn_dropout, + ) + return self_attn + + +def lora_qwen2_mlp( + *, + dim: int, + hidden_dim: int, + lora_rank: int, + lora_alpha: float, + lora_dropout: float = 0.0, + quantize_base: bool = False, +) -> FeedForward: + # TODO(suyang.fy): check code. + gate_proj = LoRALinear( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + down_proj = LoRALinear( + in_dim=hidden_dim, + out_dim=dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + up_proj = LoRALinear( + in_dim=dim, + out_dim=hidden_dim, + rank=lora_rank, + alpha=lora_alpha, + dropout=lora_dropout, + quantize_base=quantize_base, + ) + return FeedForward( + gate_proj=gate_proj, + down_proj=down_proj, + up_proj=up_proj, + ) diff --git a/torchtune/models/qwen2/_convert_weights.py b/torchtune/models/qwen2/_convert_weights.py new file mode 100644 index 0000000000..c548f1aee1 --- /dev/null +++ b/torchtune/models/qwen2/_convert_weights.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from typing import Dict + +import torch + +from torchtune.models.convert_weights import get_mapped_key + +# state dict key mappings from HF's format to torchtune's format +_FROM_HF = { + "model.embed_tokens.weight": "tok_embeddings.weight", + "model.layers.{}.self_attn.q_proj.weight": "layers.{}.attn.q_proj.weight", + "model.layers.{}.self_attn.q_proj.bias": "layers.{}.attn.q_proj.bias", + "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attn.k_proj.weight", + "model.layers.{}.self_attn.k_proj.bias": "layers.{}.attn.k_proj.bias", + "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attn.v_proj.weight", + "model.layers.{}.self_attn.v_proj.bias": "layers.{}.attn.v_proj.bias", + "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attn.output_proj.weight", + "model.layers.{}.self_attn.rotary_emb.inv_freq": None, + "model.layers.{}.mlp.gate_proj.weight": "layers.{}.mlp.w1.weight", + "model.layers.{}.mlp.up_proj.weight": "layers.{}.mlp.w3.weight", + "model.layers.{}.mlp.down_proj.weight": "layers.{}.mlp.w2.weight", + "model.layers.{}.input_layernorm.weight": "layers.{}.sa_norm.scale", + "model.layers.{}.post_attention_layernorm.weight": "layers.{}.mlp_norm.scale", + "model.norm.weight": "norm.scale", + "lm_head.weight": "output.weight", +} + + +QWEN2_TIED_KEY = "lm_head.weight" + + +def qwen2_hf_to_tune( + state_dict: Dict[str, torch.Tensor], + num_heads: int = 32, + num_kv_heads: int = 32, + dim: int = 4096, + head_dim: int = None, + tie_word_embeddings: bool = False, +) -> Dict[str, torch.Tensor]: + """ + Convert a state dict from HF's format to TorchTune's format, which contains the weights + of a Qwen2 model. + State dicts from multiple checkpoint files should be consolidated into a single state dict + before calling this function. + The logic is identical to :func:`~torchtune.models.convert_weights.hf_to_tune`, but may not load + output projection weights. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in HF's format. + num_heads (int): Number of heads in the model. + num_kv_heads (int): Number of heads in the key/value projection layers. + dim (int): Dimension of the model. + head_dim (int): Dimension of the head. If not provided, it will be calculated + as dim // num_heads. + tie_word_embeddings (bool): Whether the model's input and output word embeddings should be tied. + + Returns: + Dict[str, torch.Tensor]: State dict in torchtune's format. + """ + converted_state_dict = {} + if head_dim is None: + head_dim = dim // num_heads + + for key, value in state_dict.items(): + if tie_word_embeddings and QWEN2_TIED_KEY not in key: # Skip loading the output projection weights + continue + if "rotary_emb.inv_freq" in key: # Skip loading the position embeddings + continue + + new_key = get_mapped_key(key, _FROM_HF) + converted_state_dict[new_key] = value + return converted_state_dict + + +def qwen2_tune_to_hf( + state_dict: Dict[str, torch.Tensor], + num_heads: int = 32, + num_kv_heads: int = 32, + dim: int = 4096, + head_dim: int = None, + tie_word_embeddings: bool = False, +): + """ + Convert a state dict from torchtune's format to HF's format. This function + doesn't handle any sharding or splitting of state dicts. It follows the + state_dict IN -> state_dict OUT pattern. + + Args: + state_dict (Dict[str, torch.Tensor]): State dict in torchtune's format. + num_heads (int): Number of heads in the model. + num_kv_heads (int): Number of heads in the key/value projection layers. + dim (int): Dimension of the model. + head_dim (int): Dimension of the head. 
If not provided, it will be calculated + as dim // num_heads. + tie_word_embeddings (bool): Whether the model's input and output word embeddings should be tied. + + Returns: + Dict[str, torch.Tensor]: State dict in HF's format. + """ + converted_state_dict = {} + inverted_mapping_dict = {v: k for k, v in _FROM_HF.items()} + + if head_dim is None: + head_dim = dim // num_heads + + for key, value in state_dict.items(): + new_key = get_mapped_key(key, inverted_mapping_dict) + if "tok_embeddings" in key and tie_word_embeddings: + converted_state_dict[QWEN2_TIED_KEY] = value + converted_state_dict[new_key] = value + + return converted_state_dict diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py new file mode 100644 index 0000000000..c83a281931 --- /dev/null +++ b/torchtune/models/qwen2/_model_builders.py @@ -0,0 +1,113 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +from typing import List +from functools import partial + +from torchtune.models.qwen2._component_builders import qwen2, lora_qwen2 +from torchtune.models.qwen2._tokenizer import Qwen2Tokenizer +from torchtune.models.qwen2.transformer import Qwen2TransformerDecoder + +from torchtune.modules import TransformerDecoder +from torchtune.modules.peft import LORA_ATTN_MODULES + +""" +Model builders build specific instantiations using component builders. For example +the qwen2_7b model builder uses the qwen2 component builder to create the +qwen2 7B model. +""" + + +def qwen2_7b() -> Qwen2TransformerDecoder: + """ + Builder for creating a Qwen2 model initialized w/ the default 7B parameter values + from https://huggingface.co/Qwen/Qwen2-7B-Instruct + + Returns: + TransformerDecoder: Instantiation of Qwen2 7B model + """ + return qwen2( + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1000000.0, + ) + + +def qwen2_tokenizer(path: str) -> Qwen2Tokenizer: + return Qwen2Tokenizer( + path, + unk_token="<|endoftext|>", + bos_token=None, + eos_token="<|endoftext|>", + pad_token="<|endoftext|>", + ) + + +def lora_qwen2_7b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.05, + quantize_base: bool = False, +) -> Qwen2TransformerDecoder: + """ + Builder for creating a Qwen2 7B model with LoRA enabled. + + The Qwen2 defaults are the same as in :func:`~torchtune.models.qwen2.qwen2_7b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. 
+ Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + quantize_base (bool): Whether to quantize base model weights + + Returns: + TransformerDecoder: Instantiation of Qwen2 7B model with LoRA applied + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=152064, + num_layers=28, + num_heads=28, + num_kv_heads=4, + embed_dim=3584, + intermediate_dim=18944, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + quantize_base=quantize_base, + ) + + +qlora_qwen2_7b = partial(lora_qwen2_7b, quantize_base=True) + +qlora_qwen2_7b.__doc__ = """ +Builder for creating a Qwen2 7B model with QLoRA enabled. Base model weights in linear layers +that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. +Please see `lora_qwen2_7b` for full API arguments. +""" diff --git a/torchtune/models/qwen2/_positional_embeddings.py b/torchtune/models/qwen2/_positional_embeddings.py new file mode 100644 index 0000000000..a402bc6c70 --- /dev/null +++ b/torchtune/models/qwen2/_positional_embeddings.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional + +import torch + +from torch import nn, Tensor + + +class Qwen2RotaryPositionalEmbeddings(nn.Module): + """ + RoPE Embeddings used in the Qwen2 model. + Ref: https://huggingface.co/Qwen/Qwen2-7B-Instruct + + This class is not numerically equivalent to the RoPE Embedding module + used by Llama2 and Llama3. + + Args: + dim (int): Embedding dimension. This is usually set to the dim of each + head in the attention module computed as ``embed_dim`` // ``num_heads`` + max_seq_len (int): Maximum expected sequence length for the + model, if exceeded the cached freqs will be recomputed + base (float): The base for the geometric progression used to compute + the rotation angles + """ + + def __init__( + self, + dim: int, + max_seq_len: int = 4096, + base: float = 1_000_000.0, + ) -> None: + super().__init__() + self.dim = dim + self.base = base + self.max_seq_len = max_seq_len + self._rope_init() + + def _rope_init(self): + theta = 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim) + ) + self.register_buffer("theta", theta, persistent=False) + self.build_rope_cache(self.max_seq_len) + + def build_rope_cache(self, max_seq_len: int = 4096) -> None: + # Create position indexes `[0, 1, ..., max_seq_len - 1]` + seq_idx = torch.arange( + max_seq_len, dtype=self.theta.dtype, device=self.theta.device + ) + + # Outer product of theta and position index; output tensor has + # a shape of [max_seq_len, dim // 2] + idx_theta = torch.einsum("i, j -> ij", seq_idx, self.theta).float() + + # We cache the cos and sin embeddings instead of the IDs. 
This helps + # ensure we have correct behavior when training with bf16 + # Size: [max_seq_len, (dim * 2)] + freqs = torch.cat([idx_theta, idx_theta], dim=-1) + cache = torch.cat([freqs.cos(), freqs.sin()], dim=-1) + self.register_buffer("cache", cache, persistent=False) + + def forward(self, x: Tensor, input_pos: Optional[Tensor] = None) -> Tensor: + """ + Args: + x (Tensor): input tensor with shape + [b, s, n_h, h_d] + input_pos (Optional[Tensor]): Optional tensor which contains the position ids + of each token. During training, this is used to indicate the positions + of each token relative to its sample when packed, shape [b, s]. + During inference, this indicates the position of the current token. + If none, assume the index of the token is its position id. Default is None. + + Returns: + Tensor: output tensor with RoPE applied + + Notation used for tensor shapes: + - b: batch size + - s: sequence length + - n_h: num heads + - h_d: head dim + + TODO: The implementation below can be made more efficient + for inference. + """ + # input tensor has shape [b, s, n_h, h_d] + seq_len = x.size(1) + head_dim = x.size(-1) + + # extract the values based on whether input_pos is set or not. When + # input_pos is provided, we're in inference mode + rope_cache = ( + self.cache[:seq_len] if input_pos is None else self.cache[input_pos] + ) + + # reshape the cache for broadcasting + # tensor has shape [b, s, 1, h_d * 2] if packed samples, + # otherwise has shape [1, s, 1, h_d * 2] + rope_cache = rope_cache.view(-1, seq_len, 1, head_dim * 2) + + # [b, s, 1, h_d] + cos = rope_cache[..., :head_dim].to(x.dtype) + sin = rope_cache[..., head_dim:].to(x.dtype) + + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + rotated = torch.cat((-x2, x1), dim=-1) + + # cos: [b, s, 1, h_d] + # x: [b, s, n_h, h_d] + x_out = (x * cos) + (rotated * sin) + return x_out.type_as(x) diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py new file mode 100644 index 0000000000..6bfdbff6e9 --- /dev/null +++ b/torchtune/models/qwen2/_tokenizer.py @@ -0,0 +1,186 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, List, Tuple + +from tokenizers import Tokenizer as TokenizerFast + +from torchtune.data import Message, truncate +from torchtune.modules.tokenizers import ModelTokenizer + + +ENDOFTEXT = "<|endoftext|>" +IM_START = "<|im_start|>" +IM_END = "<|im_end|>" + + +class Qwen2Tokenizer(ModelTokenizer): + """This class construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). + + See . + + Args: + path (str): Path to tokenizer.json file. + + Example: + >>> tokenizer = Qwen2Tokenizer("/path/to/tokenizer.json") + >>> tokenized_text = tokenizer.encode("Hello world!") + >>> print(tokenized_text) + [] + """ + + system = f"{IM_START}system\n{{content}}{IM_END}\n" + user = f"{IM_START}user\n{{content}}{IM_END}\n" + assistant = f"{IM_START}assistant\n{{content}}{IM_END}\n" + assistant_for_generation = f"{IM_START}assistant\n" + + def __init__( + self, + path: str, + *, + unk_token: Optional[str] = ENDOFTEXT, + bos_token: Optional[str] = None, + eos_token: str = ENDOFTEXT, + pad_token: Optional[str] = ENDOFTEXT, + ): + # Build backend tokenizer. 
+ self._tokenizer = TokenizerFast.from_file(path) + + _truncation = self._tokenizer.truncation + if _truncation is not None: + self._tokenizer.enable_truncation(**_truncation) + else: + self._tokenizer.no_truncation() + + _padding = self._tokenizer.padding + if _padding is not None: + self._tokenizer.enable_padding(**_padding) + + vocab = self._tokenizer.get_vocab() + self.unk_id = None if unk_token is None else vocab[unk_token] + self.bos_id = None if bos_token is None else vocab[bos_token] + self.eos_id = None if eos_token is None else vocab[eos_token] + self.pad_id = None if pad_token is None else vocab[pad_token] + self.im_start_id = vocab[IM_START] + self.im_end_id = vocab[IM_END] + self.stop_tokens = [self.eos_id, self.im_end_id] + + def encode( + self, text: str, add_bos: bool = True, add_eos: bool = True, **kwargs + ) -> List[int]: + """ + Encode a string into a list of token ids. + + Args: + text (str): The string to encode. + add_bos (bool): (Optional) Whether to add the beginning of sequence token. + add_eos (bool): (Optional) Whether to add the end of sequence token. + + Returns: + List[int]: The list of token ids. + """ + return self.encode_batch([text], add_bos=add_bos, add_eos=add_eos, **kwargs)[0] + + def encode_batch( + self, + batch_text: List[str], + add_bos: bool = True, + add_eos: bool = True, + **kwargs, + ) -> List[List[int]]: + """Encode a batch of strings into lists of token ids. + + Args: + batch_text (List[str]): The batch of strings to encode. + add_bos (bool): (Optional) Whether to add the beginning of sequence token. + add_eos (bool): (Optional) Whether to add the end of sequence token. + + Returns: + List[List[int]]: A batch of lists of token ids. + """ + encodings = self._tokenizer.encode_batch(batch_text) + encoded_token_ids = [] + for encoding in encodings: + encoding_ids = encoding.ids[:] + if add_bos and self.bos_id is not None: + encoding_ids.insert(0, self.bos_id) + if add_eos and self.eos_id is not None: + encoding_ids.append(self.eos_id) + encoded_token_ids.append(encoding_ids) + return encoded_token_ids + + def decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + **kwargs, + ) -> str: + """ + Decode a list of token ids into a string. + + Args: + token_ids (List[int]): The list of token ids. + skip_special_tokens (bool): Whether the special tokens should be removed from the decoded string. + + Returns: + str: The decoded string. + """ + text = self._tokenizer.decode( + token_ids, skip_special_tokens=skip_special_tokens + ) + return text + + def tokenize_messages( + self, + messages: List[Message], + max_seq_len: Optional[int] = None, + apply_chat_template: bool = True, + **kwargs, + ) -> Tuple[List[int], List[bool]]: + """ + Given a list of messages, return a list of tokens for the concatenated + and formatted messages. + + Args: + messages (List[Message]): The message list to tokenize. + max_seq_len (Optional[int]): The maximum sequence length. + apply_chat_template (bool): Whether to apply Qwen2 chat template. + + Returns: + Tuple[List[int], List[bool]]: The list of token ids and the list of masks. 
+ """ + tokens = [] + mask = [] + is_generation = False + for index, message in enumerate(messages): + content = "" + if message.role == "system": + content = self.system.format(content=message.content) + elif message.role == "user": + content = self.user.format(content=message.content) + elif message.role == "assistant": + if index == len(messages) - 1 and not message.content: + content = self.assistant_for_generation + is_generation = True + else: + content = self.assistant.format(content=message.content) + tokenized_message = self.encode(content, add_bos=False, add_eos=False) + tokens.extend(tokenized_message) + mask.extend([message.masked] * len(tokenized_message)) + + if max_seq_len and len(tokens) >= max_seq_len: + break + + if not is_generation: + tokens = tokens + [self.eos_id] + last_message_masked = False + if messages: + last_message_masked = messages[-1].masked + mask = mask + [last_message_masked] + if max_seq_len: + tokens = truncate(tokens, max_seq_len, self.eos_id) + mask = truncate(mask, max_seq_len, True) + return tokens, mask diff --git a/torchtune/models/qwen2/transformer.py b/torchtune/models/qwen2/transformer.py new file mode 100644 index 0000000000..32ea2f385c --- /dev/null +++ b/torchtune/models/qwen2/transformer.py @@ -0,0 +1,172 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torchtune.modules import KVCache + +from torchtune.modules.transformer import _get_clones, TransformerDecoderLayer + + +class Qwen2TransformerDecoder(nn.Module): + """ + Transformer Decoder derived from the Qwen2 architecture. A key difference between + the Qwen2 transformer decoder and :class:`~torchtune.modules.TransformerDecoder` + is that the output projection may be replaced with token embeddings weights + (see https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/qwen2/modeling_qwen2.py#L1092). + + Args: + tok_embeddings (nn.Embedding): PyTorch embedding layer, to be used to move + tokens to an embedding space. + layer (TransformerDecoderLayer): Transformer Decoder layer. + num_layers (int): Number of Transformer Decoder layers. + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value. This is used to setup the + :func:`~torchtune.modules.KVCache` + head_dim (int): embedding dimension for each head in self-attention. This is used + to setup the :func:`~torchtune.modules.KVCache` + norm (nn.Module): Callable that applies normalization to the output of the decoder, + before final MLP. + output (nn.Linear, **optional**): Callable that applies a linear transformation to the output of + the decoder. None means use token_embeddings. + + Note: + Arg values are checked for correctness (eg: ``attn_dropout`` belongs to [0,1]) + in the module where they are used. This helps reduces the number of raise + statements in code and improves readability. 
+ """ + + def __init__( + self, + tok_embeddings: nn.Embedding, + layer: TransformerDecoderLayer, + num_layers: int, + max_seq_len: int, + num_heads: int, + head_dim: int, + norm: nn.Module, + output: Optional[nn.Linear] = None, + ) -> None: + super().__init__() + + self.tok_embeddings = tok_embeddings + self.layers = _get_clones(layer, num_layers) + self.norm = norm + self.output = output + self.max_seq_len = max_seq_len + self.num_heads = num_heads + self.head_dim = head_dim + self.causal_mask = None + + def setup_caches(self, batch_size: int, dtype: torch.dtype) -> None: + """Setup key value caches for attention calculation. + + Args: + batch_size (int): batch size for the caches. + dtype (torch.dtype): dtype for the caches. + """ + for layer in self.layers: + layer.attn.kv_cache = KVCache( + batch_size=batch_size, + max_seq_len=self.max_seq_len, + num_heads=self.num_heads, + head_dim=self.head_dim, + dtype=dtype, + ) + + # causal_mask is used during inference to ensure we're attending + # to the right tokens + self.causal_mask = torch.tril( + torch.ones(self.max_seq_len, self.max_seq_len, dtype=torch.bool) + ) + + def reset_caches(self): + """Reset the key value caches.""" + if self.layers[0].attn.kv_cache is None: + raise RuntimeError( + "Key value caches are not setup. Call ``setup_caches()`` first." + ) + + for layer in self.layers: + layer.attn.kv_cache.reset() + + def forward( + self, + tokens: Tensor, + *, + mask: Optional[Tensor] = None, + input_pos: Optional[Tensor] = None, + ) -> Tensor: + """ + Args: + tokens (Tensor): input tensor with shape [b x s] + mask (Optional[Tensor]): Optional boolean tensor which contains the attention mask + with shape [b x s x s]. This is applied after the query-key multiplication and + before the softmax. A value of True in row i and column j means token i attends + to token j. A value of False means token i does not attend to token j. If no + mask is specified, a causal mask is used by default. Default is None. + input_pos (Optional[Tensor]): Optional tensor which contains the position ids + of each token. During training, this is used to indicate the positions + of each token relative to its sample when packed, shape [b x s]. + During inference, this indicates the position of the current token. + If none, assume the index of the token is its position id. Default is None. + + Note: At the very first step of inference, when the model is provided with a prompt, + ``input_pos`` would contain the positions of all of the tokens in the prompt + (eg: ``torch.arange(prompt_length)``). This is because we will need to compute the + KV values for each position. + + Returns: + Tensor: output tensor with shape [b x s x v] + + Raises: + ValueError: if causal_mask is set but input_pos is None + + Notation used for tensor shapes: + - b: batch size + - s: sequence length + - v: vocab size + - d: embed dim + - m_s: max seq len + """ + # input tensor of shape [b, s] + bsz, seq_len = tokens.shape + + # shape: [b, s, d] + h = self.tok_embeddings(tokens) + + if self.causal_mask is not None: + if input_pos is None: + raise ValueError( + "Caches are setup, but the position of input token is missing" + ) + if mask is not None: + raise ValueError( + "An attention mask was set. 
Cannot use a non-causal mask for inference" + ) + # shape: [1, input_pos_len, m_s] + # in most cases input_pos_len should be 1 + mask = self.causal_mask[None, input_pos] + + for layer in self.layers: + # shape: [b, s, d] + h = layer(h, mask=mask, input_pos=input_pos) + + # shape: [b, s, d] + h = self.norm(h) + + # shape: [b, s, out_dim] - out_dim is usually the vocab size + if self.output is None: + output = F.linear(h, self.tok_embeddings.weight).float() + else: + output = self.output(h).float() + return output diff --git a/torchtune/utils/_checkpointing/_checkpointer.py b/torchtune/utils/_checkpointing/_checkpointer.py index 107154230b..3252012535 100644 --- a/torchtune/utils/_checkpointing/_checkpointer.py +++ b/torchtune/utils/_checkpointing/_checkpointer.py @@ -21,6 +21,10 @@ mistral_reward_hf_to_tune, mistral_reward_tune_to_hf, ) +from torchtune.models.qwen2 import ( + qwen2_hf_to_tune, + qwen2_tune_to_hf, +) from torchtune.models.phi3 import phi3_hf_to_tune, phi3_tune_to_hf from torchtune.utils._checkpointing._checkpointer_utils import ( get_path, @@ -435,6 +439,14 @@ def load_checkpoint(self) -> Dict[str, Any]: dim=self._config["hidden_size"], head_dim=self._config["head_dim"], ) + elif self._model_type == ModelType.QWEN2: + converted_state_dict[utils.MODEL_KEY] = qwen2_hf_to_tune( + merged_state_dict, + num_heads=self._config["num_attention_heads"], + num_kv_heads=self._config["num_key_value_heads"], + dim=self._config["hidden_size"], + tie_word_embeddings=self._config["tie_word_embeddings"], + ) else: converted_state_dict[utils.MODEL_KEY] = convert_weights.hf_to_tune( merged_state_dict, @@ -493,6 +505,14 @@ def save_checkpoint( dim=self._config["hidden_size"], head_dim=self._config["head_dim"], ) + elif self._model_type == ModelType.QWEN2: + state_dict[utils.MODEL_KEY] = qwen2_tune_to_hf( + state_dict[utils.MODEL_KEY], + num_heads=self._config["num_attention_heads"], + num_kv_heads=self._config["num_key_value_heads"], + dim=self._config["hidden_size"], + tie_word_embeddings=self._config["tie_word_embeddings"], + ) else: state_dict[utils.MODEL_KEY] = convert_weights.tune_to_hf( state_dict[utils.MODEL_KEY], diff --git a/torchtune/utils/_checkpointing/_checkpointer_utils.py b/torchtune/utils/_checkpointing/_checkpointer_utils.py index 253e285144..2d16ac02a3 100644 --- a/torchtune/utils/_checkpointing/_checkpointer_utils.py +++ b/torchtune/utils/_checkpointing/_checkpointer_utils.py @@ -44,6 +44,8 @@ class ModelType(Enum): MISTRAL_REWARD = "mistral_reward" """Mistral model with a classification head. See :func:`~torchtune.models.mistral.mistral_classifier`""" + QWEN2 = "qwen2" + """Qwen2 family of models. See :func:`~torchtune.models.qwen2.qwen2`""" def get_path(input_dir: Path, filename: str, missing_ok: bool = False) -> Path: From 18a1035349e06238bebb37883938a02fdcfd8efe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Fri, 5 Jul 2024 20:04:40 +0800 Subject: [PATCH 02/14] Reformat code. 
--- tests/assets/tiny_bpe_tokenizer.json | 2 +- tests/torchtune/models/qwen2/test_lora_qwen2.py | 5 ++--- tests/torchtune/models/qwen2/test_qwen2_tokenizer.py | 2 +- torchtune/models/qwen2/__init__.py | 11 ++++------- torchtune/models/qwen2/_convert_weights.py | 4 +++- torchtune/models/qwen2/_positional_embeddings.py | 12 ++++++------ torchtune/models/qwen2/_tokenizer.py | 2 +- torchtune/utils/_checkpointing/_checkpointer.py | 5 +---- 8 files changed, 19 insertions(+), 24 deletions(-) diff --git a/tests/assets/tiny_bpe_tokenizer.json b/tests/assets/tiny_bpe_tokenizer.json index 470508b681..3e6be56a1a 100644 --- a/tests/assets/tiny_bpe_tokenizer.json +++ b/tests/assets/tiny_bpe_tokenizer.json @@ -3983,4 +3983,4 @@ "Ġcomp et" ] } -} \ No newline at end of file +} diff --git a/tests/torchtune/models/qwen2/test_lora_qwen2.py b/tests/torchtune/models/qwen2/test_lora_qwen2.py index 405397b814..b79b8e977e 100644 --- a/tests/torchtune/models/qwen2/test_lora_qwen2.py +++ b/tests/torchtune/models/qwen2/test_lora_qwen2.py @@ -6,12 +6,11 @@ import pytest import torch -from torch import nn from tests.test_utils import assert_expected, fixed_init_model -from torchtune.models.qwen2 import qwen2, lora_qwen2 +from torch import nn +from torchtune.models.qwen2 import lora_qwen2, qwen2 from torchtune.models.qwen2._component_builders import lora_qwen2_self_attention -from torchtune.modules.peft import LoRALinear from torchtune.utils.seed import set_seed RANK = 4 diff --git a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py index 118187f735..13e7c0ee33 100644 --- a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py +++ b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py @@ -275,7 +275,7 @@ def test_tokenize_messages(self, tokenizer): 60, 2002, 94, - 2000 + 2000, ] expected_mask = [True] * 90 + [False] * 146 assert expected_tokens == tokens diff --git a/torchtune/models/qwen2/__init__.py b/torchtune/models/qwen2/__init__.py index c01f873c25..a812bcdc78 100644 --- a/torchtune/models/qwen2/__init__.py +++ b/torchtune/models/qwen2/__init__.py @@ -5,27 +5,24 @@ # LICENSE file in the root directory of this source tree. 
from ._component_builders import lora_qwen2, qwen2 # noqa -from ._convert_weights import qwen2_hf_to_tune, qwen2_tune_to_hf # noqa -from ._positional_embeddings import Qwen2RotaryPositionalEmbeddings +from ._convert_weights import qwen2_hf_to_tune, qwen2_tune_to_hf # noqa from ._model_builders import ( # noqa - qwen2_7b, - qwen2_tokenizer, lora_qwen2_7b, qlora_qwen2_7b, + qwen2_7b, + qwen2_tokenizer, # TODO ) +from ._positional_embeddings import Qwen2RotaryPositionalEmbeddings __all__ = [ "qwen2_7b", "qwen2_tokenizer", "lora_qwen2_7b", "qlora_qwen2_7b", - "qwen2", "lora_qwen2", - "qwen2_hf_to_tune", "qwen2_tune_to_hf", - "Qwen2RotaryPositionalEmbeddings", ] diff --git a/torchtune/models/qwen2/_convert_weights.py b/torchtune/models/qwen2/_convert_weights.py index c548f1aee1..6ef0455ea1 100644 --- a/torchtune/models/qwen2/_convert_weights.py +++ b/torchtune/models/qwen2/_convert_weights.py @@ -67,7 +67,9 @@ def qwen2_hf_to_tune( head_dim = dim // num_heads for key, value in state_dict.items(): - if tie_word_embeddings and QWEN2_TIED_KEY not in key: # Skip loading the output projection weights + if ( + tie_word_embeddings and QWEN2_TIED_KEY not in key + ): # Skip loading the output projection weights continue if "rotary_emb.inv_freq" in key: # Skip loading the position embeddings continue diff --git a/torchtune/models/qwen2/_positional_embeddings.py b/torchtune/models/qwen2/_positional_embeddings.py index a402bc6c70..6ea5322d9f 100644 --- a/torchtune/models/qwen2/_positional_embeddings.py +++ b/torchtune/models/qwen2/_positional_embeddings.py @@ -29,10 +29,10 @@ class Qwen2RotaryPositionalEmbeddings(nn.Module): """ def __init__( - self, - dim: int, - max_seq_len: int = 4096, - base: float = 1_000_000.0, + self, + dim: int, + max_seq_len: int = 4096, + base: float = 1_000_000.0, ) -> None: super().__init__() self.dim = dim @@ -42,8 +42,8 @@ def __init__( def _rope_init(self): theta = 1.0 / ( - self.base - ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim) + self.base + ** (torch.arange(0, self.dim, 2)[: (self.dim // 2)].float() / self.dim) ) self.register_buffer("theta", theta, persistent=False) self.build_rope_cache(self.max_seq_len) diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 6bfdbff6e9..785b8de065 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -4,7 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, List, Tuple +from typing import List, Optional, Tuple from tokenizers import Tokenizer as TokenizerFast diff --git a/torchtune/utils/_checkpointing/_checkpointer.py b/torchtune/utils/_checkpointing/_checkpointer.py index 3252012535..363290a2c7 100644 --- a/torchtune/utils/_checkpointing/_checkpointer.py +++ b/torchtune/utils/_checkpointing/_checkpointer.py @@ -21,11 +21,8 @@ mistral_reward_hf_to_tune, mistral_reward_tune_to_hf, ) -from torchtune.models.qwen2 import ( - qwen2_hf_to_tune, - qwen2_tune_to_hf, -) from torchtune.models.phi3 import phi3_hf_to_tune, phi3_tune_to_hf +from torchtune.models.qwen2 import qwen2_hf_to_tune, qwen2_tune_to_hf from torchtune.utils._checkpointing._checkpointer_utils import ( get_path, ModelType, From f72e8acd8ba3a8b763c797bb4e2512e36a982cd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Fri, 5 Jul 2024 20:25:49 +0800 Subject: [PATCH 03/14] Update license line in _component_builders.py. 
--- torchtune/models/qwen2/_component_builders.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/torchtune/models/qwen2/_component_builders.py b/torchtune/models/qwen2/_component_builders.py index e986c76502..fb7bad3369 100644 --- a/torchtune/models/qwen2/_component_builders.py +++ b/torchtune/models/qwen2/_component_builders.py @@ -1,5 +1,8 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. from functools import partial from typing import List From 44faf3815b5838582babb94d5f5888e4971cedaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Mon, 15 Jul 2024 19:17:28 +0800 Subject: [PATCH 04/14] Rewrite Qwen2Tokenizer and Qwen2TransformerDecoder. --- .github/workflows/build_docs.yaml | 12 +- ...cipe_test_multi_gpu.yaml => gpu_test.yaml} | 14 +- .github/workflows/recipe_test.yaml | 2 +- .github/workflows/recipe_test_nightly.yaml | 8 +- .github/workflows/regression_test.yaml | 4 +- .github/workflows/unit_test.yaml | 2 +- CONTRIBUTING.md | 7 +- README.md | 12 +- docs/source/api_ref_models.rst | 74 +- docs/source/api_ref_modules.rst | 17 + docs/source/tune_cli.rst | 7 +- docs/source/tutorials/chat.rst | 42 +- docs/source/tutorials/datasets.rst | 7 +- pyproject.toml | 3 +- .../code_llama2/7B_full_low_memory.yaml | 3 +- .../code_llama2/7B_lora_single_device.yaml | 3 +- .../code_llama2/7B_qlora_single_device.yaml | 2 +- recipes/configs/qwen2/7B_full.yaml | 3 +- recipes/configs/qwen2/7B_full_low_memory.yaml | 3 +- recipes/configs/qwen2/7B_lora.yaml | 3 +- .../configs/qwen2/7B_lora_single_device.yaml | 3 +- recipes/dev/lora_finetune_fsdp2.py | 4 + tests/assets/tiny_bpe_merges.txt | 1904 ++++++++++++++++ tests/assets/tiny_bpe_vocab.json | 2002 +++++++++++++++++ .../datasets/test_text_completion_dataset.py | 22 + tests/torchtune/models/clip/__init__.py | 5 + .../models/clip/test_clip_image_transform.py | 75 + .../models/clip/test_positional_embeddings.py | 87 + .../models/qwen2/test_qwen2_tokenizer.py | 84 +- tests/torchtune/modules/test_layernorm.py | 57 + .../modules/test_transformer_decoder.py | 8 +- .../modules/test_vision_transformer.py | 207 ++ .../transforms/test_get_canvas_best_fit.py | 160 ++ .../transforms/test_resize_with_pad.py | 84 + .../modules/transforms/test_tile_crop.py | 81 + .../modules/transforms/test_transforms.py | 80 + tests/torchtune/utils/test_checkpointer.py | 4 +- tests/torchtune/utils/test_distributed.py | 4 + torchtune/datasets/_alpaca.py | 17 +- torchtune/datasets/_chat.py | 10 +- torchtune/datasets/_concat.py | 24 +- torchtune/datasets/_grammar.py | 12 +- torchtune/datasets/_instruct.py | 26 +- torchtune/datasets/_preference.py | 10 +- torchtune/datasets/_samsum.py | 10 +- torchtune/datasets/_slimorca.py | 2 +- torchtune/datasets/_text_completion.py | 13 +- torchtune/models/clip/__init__.py | 20 + torchtune/models/clip/_component_builders.py | 101 + torchtune/models/clip/_model_builders.py | 14 + torchtune/models/clip/_position_embeddings.py | 190 ++ torchtune/models/clip/_transforms.py | 179 ++ .../models/code_llama2/_model_builders.py | 3 + torchtune/models/convert_weights.py | 4 +- torchtune/models/gemma/__init__.py | 4 +- torchtune/models/gemma/_convert_weights.py | 108 - torchtune/models/gemma/_tokenizer.py | 5 +- torchtune/models/llama2/__init__.py | 2 - torchtune/models/llama2/_model_builders.py | 3 + 
torchtune/models/llama2/_tokenizer.py | 15 +- torchtune/models/llama3/__init__.py | 2 - .../models/mistral/_component_builders.py | 2 +- torchtune/models/mistral/_model_builders.py | 2 +- torchtune/models/mistral/_tokenizer.py | 14 +- torchtune/models/phi3/_tokenizer.py | 6 +- torchtune/models/qwen2/_component_builders.py | 73 +- torchtune/models/qwen2/_model_builders.py | 31 +- torchtune/models/qwen2/_tokenizer.py | 279 ++- torchtune/models/qwen2/_trie.py | 237 ++ torchtune/modules/__init__.py | 4 + torchtune/modules/attention.py | 2 +- torchtune/modules/layer_norm.py | 35 + .../_register_nf4_dispatch_ops.py | 24 +- torchtune/modules/low_precision/_utils.py | 66 + torchtune/modules/position_embeddings.py | 2 +- torchtune/modules/rms_norm.py | 2 +- torchtune/modules/transformer.py | 119 + torchtune/modules/transforms/__init__.py | 24 + torchtune/modules/transforms/_transforms.py | 165 ++ .../transforms/vision_utils/__init__.py | 5 + .../vision_utils/get_canvas_best_fit.py | 179 ++ .../vision_utils/resize_with_pad.py | 170 ++ .../transforms/vision_utils/tile_crop.py | 59 + torchtune/modules/vision_transformer.py | 462 ++++ .../utils/_checkpointing/_checkpointer.py | 22 +- torchtune/utils/_profiler.py | 95 +- torchtune/utils/collate.py | 10 +- torchtune/utils/quantization.py | 22 +- 88 files changed, 7502 insertions(+), 482 deletions(-) rename .github/workflows/{recipe_test_multi_gpu.yaml => gpu_test.yaml} (71%) create mode 100644 tests/assets/tiny_bpe_merges.txt create mode 100644 tests/assets/tiny_bpe_vocab.json create mode 100644 tests/torchtune/models/clip/__init__.py create mode 100644 tests/torchtune/models/clip/test_clip_image_transform.py create mode 100644 tests/torchtune/models/clip/test_positional_embeddings.py create mode 100644 tests/torchtune/modules/test_layernorm.py create mode 100644 tests/torchtune/modules/test_vision_transformer.py create mode 100644 tests/torchtune/modules/transforms/test_get_canvas_best_fit.py create mode 100644 tests/torchtune/modules/transforms/test_resize_with_pad.py create mode 100644 tests/torchtune/modules/transforms/test_tile_crop.py create mode 100644 tests/torchtune/modules/transforms/test_transforms.py create mode 100644 torchtune/models/clip/__init__.py create mode 100644 torchtune/models/clip/_component_builders.py create mode 100644 torchtune/models/clip/_model_builders.py create mode 100644 torchtune/models/clip/_position_embeddings.py create mode 100644 torchtune/models/clip/_transforms.py delete mode 100644 torchtune/models/gemma/_convert_weights.py create mode 100644 torchtune/models/qwen2/_trie.py create mode 100644 torchtune/modules/layer_norm.py create mode 100644 torchtune/modules/low_precision/_utils.py create mode 100644 torchtune/modules/transforms/__init__.py create mode 100644 torchtune/modules/transforms/_transforms.py create mode 100644 torchtune/modules/transforms/vision_utils/__init__.py create mode 100644 torchtune/modules/transforms/vision_utils/get_canvas_best_fit.py create mode 100644 torchtune/modules/transforms/vision_utils/resize_with_pad.py create mode 100644 torchtune/modules/transforms/vision_utils/tile_crop.py create mode 100644 torchtune/modules/vision_transformer.py diff --git a/.github/workflows/build_docs.yaml b/.github/workflows/build_docs.yaml index 1f690a7617..3eaed1147a 100644 --- a/.github/workflows/build_docs.yaml +++ b/.github/workflows/build_docs.yaml @@ -39,7 +39,7 @@ jobs: run: python -m pip install --upgrade pip - name: Install dependencies run: | - python -m pip install torch + python -m pip 
install torch torchvision python -m pip install -e . cd docs python -m pip install -r requirements.txt @@ -108,21 +108,21 @@ jobs: run: | git remote set-url origin https://pytorchbot:${GITHUB_PYTORCHBOT_TOKEN}@github.com/pytorch/torchtune.git set -euo pipefail - - # Convert refs/tags/v1.12.0rc3 into 1.12. + + # Convert refs/tags/v1.12.0rc3 into 1.12. # Adopted from https://github.com/pytorch/pytorch/blob/main/.github/workflows/_docs.yml#L150C11-L155C13 - GITHUB_REF=${{ github.ref }} + GITHUB_REF=${{ github.ref }} if [[ "${GITHUB_REF}" =~ ^refs/tags/v([0-9]+\.[0-9]+)\.* ]]; then TARGET_FOLDER="${BASH_REMATCH[1]}" else TARGET_FOLDER="main" fi - + echo "Target Folder: ${TARGET_FOLDER}" mkdir -p "${TARGET_FOLDER}" rm -rf "${TARGET_FOLDER}"/* mv docs/* "${TARGET_FOLDER}" - + git config user.name 'pytorchbot' git config user.email 'soumith+bot@pytorch.org' git add "${TARGET_FOLDER}" || true diff --git a/.github/workflows/recipe_test_multi_gpu.yaml b/.github/workflows/gpu_test.yaml similarity index 71% rename from .github/workflows/recipe_test_multi_gpu.yaml rename to .github/workflows/gpu_test.yaml index d62bcc81bc..c7f1840a5f 100644 --- a/.github/workflows/recipe_test_multi_gpu.yaml +++ b/.github/workflows/gpu_test.yaml @@ -1,4 +1,4 @@ -name: Multi-GPU Recipe Tests +name: GPU tests on: push: @@ -7,7 +7,7 @@ on: workflow_dispatch: concurrency: - group: recipe-test-multi-gpu-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + group: gpu-test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} cancel-in-progress: true permissions: @@ -19,7 +19,7 @@ defaults: shell: bash -l -eo pipefail {0} jobs: - recipe_test_multi_gpu: + gpu_test: runs-on: linux.8xlarge.nvidia.gpu strategy: matrix: @@ -39,15 +39,15 @@ jobs: run: python -m pip install --upgrade pip - name: Install torch nightly if: ${{ matrix.torch-version == 'nightly' }} - run: python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118 + run: python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 - name: Install torch stable if: ${{ matrix.torch-version == 'stable' }} - run: python -m pip install torch + run: python -m pip install torch torchvision - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" python -m pip install lm-eval==0.4.* - - name: Run recipe tests with coverage - run: pytest tests -m integration_test --cov=. --cov-report=xml --durations=20 -vv + - name: Run recipe and unit tests with coverage + run: pytest tests --with-integration --cov=. 
--cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov uses: codecov/codecov-action@v3 diff --git a/.github/workflows/recipe_test.yaml b/.github/workflows/recipe_test.yaml index 39157c321e..59b693a055 100644 --- a/.github/workflows/recipe_test.yaml +++ b/.github/workflows/recipe_test.yaml @@ -39,7 +39,7 @@ jobs: run: python -m pip install --upgrade pip - name: Install dependencies run: | - python -m pip install torch + python -m pip install torch torchvision python -m pip install -e ".[dev]" python -m pip install lm-eval==0.4.* - name: Run recipe tests with coverage diff --git a/.github/workflows/recipe_test_nightly.yaml b/.github/workflows/recipe_test_nightly.yaml index d0f3cd0941..cda5fef33f 100644 --- a/.github/workflows/recipe_test_nightly.yaml +++ b/.github/workflows/recipe_test_nightly.yaml @@ -4,6 +4,7 @@ on: schedule: # Runs at midnight every day - cron: '0 0 * * *' + workflow_dispatch: concurrency: group: recipe-test-nightly-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} @@ -38,14 +39,17 @@ jobs: run: python -m pip install --upgrade pip - name: Install torch nightly if: ${{ matrix.torch-version == 'nightly' }} - run: python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118 + run: python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 - name: Install torch stable if: ${{ matrix.torch-version == 'stable' }} - run: python -m pip install torch + run: python -m pip install torch torchvision - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" python -m pip install lm-eval==0.4.* + - name: Install torchao nightly + if: ${{ matrix.torch-version == 'nightly' }} + run: pip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 - name: Run recipe tests with coverage run: pytest tests -m integration_test --cov=. --cov-report=xml --durations=20 -vv - name: Upload Coverage to Codecov diff --git a/.github/workflows/regression_test.yaml b/.github/workflows/regression_test.yaml index 1a82add1af..7ea639aebd 100644 --- a/.github/workflows/regression_test.yaml +++ b/.github/workflows/regression_test.yaml @@ -47,10 +47,10 @@ jobs: python3 -m pip install awscli==1.32.6 - name: Install torch nightly if: ${{ matrix.torch-version == 'nightly' }} - run: python -m pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu118 + run: python -m pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu118 - name: Install torch stable if: ${{ matrix.torch-version == 'stable' }} - run: python -m pip install torch + run: python -m pip install torch torchvision - name: Install remaining dependencies run: | python -m pip install -e ".[dev]" diff --git a/.github/workflows/unit_test.yaml b/.github/workflows/unit_test.yaml index eb7dfe42e5..3fd3c5bfbc 100644 --- a/.github/workflows/unit_test.yaml +++ b/.github/workflows/unit_test.yaml @@ -33,7 +33,7 @@ jobs: run: python -m pip install --upgrade pip - name: Install dependencies run: | - python -m pip install torch + python -m pip install torch torchvision python -m pip install -e ".[dev]" - name: Run unit tests with coverage run: pytest tests --cov=. 
--cov-report=xml --durations=20 -vv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a301aab3f1..77663848ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -5,12 +5,17 @@ We want to make contributing to this project as easy and transparent as possible ## Dev install You should first [fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) the torchtune repository -and then clone your forked repository. +and then clone your forked repository. Make sure to keep your fork in sync with the torchtune repository over time. ```git clone https://github.com//torchtune.git``` Then navigate into the newly cloned repo and install dependencies needed for development. +**Step 1:** [Install PyTorch](https://pytorch.org/get-started/locally/). torchtune is tested with the latest stable PyTorch release as well as the preview nightly version. + + +**Step 2:** Install all the additional dependencies and dev dependencies in the local repo: + ``` cd torchtune pip install -e ".[dev]" diff --git a/README.md b/README.md index 4b05f775ef..b250e9a7ba 100644 --- a/README.md +++ b/README.md @@ -46,7 +46,7 @@ torchtune currently supports the following models. |-----------------------------------------------|-----------| | [Llama3](https://llama.meta.com/llama3) | 8B, 70B [[models](torchtune/models/llama3/_model_builders.py), [configs](recipes/configs/llama3/)] | | [Llama2](https://llama.meta.com/llama2/) | 7B, 13B, 70B [[models](torchtune/models/llama2/_model_builders.py), [configs](recipes/configs/llama2/)] | -| [Code-Llama2](https://huggingface.co/codellama) | 7B, 13B, 70B [[model](torchtune/models/code_llama2/_model_builders.py), [configs](recipes/configs/code_llama2/)] | +| [Code-Llama2](https://ai.meta.com/blog/code-llama-large-language-model-coding/) | 7B, 13B, 70B [[model](torchtune/models/code_llama2/_model_builders.py), [configs](recipes/configs/code_llama2/)] | | [Mistral](https://huggingface.co/mistralai) | 7B [[model](torchtune/models/mistral/_model_builders.py), [configs](recipes/configs/mistral/)] | | [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) | 2B, 7B [[model](torchtune/models/gemma/_model_builders.py), [configs](recipes/configs/gemma/)] | | [Microsoft Phi3](https://huggingface.co/collections/microsoft/phi-3-6626e15e9585a200d2d761e3) | Mini [[model](torchtune/models/phi3/), [configs](recipes/configs/phi3/)] @@ -156,7 +156,15 @@ You can find a full list of all our Llama3 configs [here.](recipes/configs/llama ## Installation -**Step 1:** [Install PyTorch](https://pytorch.org/get-started/locally/). torchtune is tested with the latest stable PyTorch release as well as the preview nightly version. +**Step 1:** [Install PyTorch](https://pytorch.org/get-started/locally/). torchtune is tested with the latest stable PyTorch release as well as the preview nightly version. For fine-tuning the multimodal LLMs available in the repo, you'll need to install torchvision as well. 
+ +``` +# Install stable version of PyTorch using pip +pip install torch torchvision + +# Nightly install for latest features +pip install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu121 +``` **Step 2:** The latest stable version of torchtune is hosted on PyPI and can be downloaded with the following command: diff --git a/docs/source/api_ref_models.rst b/docs/source/api_ref_models.rst index cb3dd7fb67..1ade3d6303 100644 --- a/docs/source/api_ref_models.rst +++ b/docs/source/api_ref_models.rst @@ -11,15 +11,26 @@ llama3 All models from the `Llama3 family `_. +To download the Llama3-8B-Instruct model: + .. code-block:: bash - tune download meta-llama/Meta-Llama-3-8B-Instruct --hf-token + tune download meta-llama/Meta-Llama-3-8B-Instruct --hf-token + +To download the Llama3-70B-Instruct model: + +.. code-block:: bash + + tune download meta-llama/Meta-Llama-3-70B-Instruct --hf-token + --ignore-patterns "original/consolidated*" .. autosummary:: :toctree: generated/ :nosignatures: + llama3.llama3 + llama3.lora_llama3 llama3.llama3_8b llama3.lora_llama3_8b llama3.qlora_llama3_8b @@ -35,16 +46,30 @@ llama2 All models from the `Llama2 family `_. -Pre-trained models can be downloaded from the Hugging Face Hub with the following command: +To download the Llama2-7B model: + +.. code-block:: bash + + tune download meta-llama/Llama-2-7b-hf --hf-token + +To download the Llama2-13B model: + +.. code-block:: bash + + tune download meta-llama/Llama-2-13b-hf --hf-token + +To download the Llama2-70B model: .. code-block:: bash - tune download meta-llama/Llama-2-7b-hf --hf-token + tune download meta-llama/Llama-2-70b-hf --hf-token .. autosummary:: :toctree: generated/ :nosignatures: + llama2.llama2 + llama2.lora_llama2 llama2.llama2_7b llama2.lora_llama2_7b llama2.qlora_llama2_7b @@ -63,11 +88,11 @@ code llama Models from the `Code Llama family `_. -Pre-trained models can be downloaded from the Hugging Face Hub with the following command: +To download the CodeLlama-7B model: .. code-block:: bash - tune download codellama/CodeLlama-7b-hf --hf-token + tune download codellama/CodeLlama-7b-hf --hf-token .. autosummary:: :toctree: generated/ @@ -89,7 +114,7 @@ phi-3 Models from the `Phi-3 mini family `_. -Pre-trained models can be download from the Hugging Face Hub with the following command: +To download the Phi-3 Mini 4k instruct model: .. code-block:: bash @@ -99,6 +124,8 @@ Pre-trained models can be download from the Hugging Face Hub with the following :toctree: generated/ :nosignatures: + phi3.phi3 + phi3.lora_phi3 phi3.phi3_mini phi3.lora_phi3_mini phi3.qlora_phi3_mini @@ -111,16 +138,20 @@ mistral All models from `Mistral AI family `_. -Pre-trained models can be downloaded from the Hugging Face Hub with the following command: +To download the Mistral 7B v0.1 model: .. code-block:: bash - tune download mistralai/Mistral-7B-v0.1 + tune download mistralai/Mistral-7B-v0.1 --hf-token .. autosummary:: :toctree: generated/ :nosignatures: + mistral.mistral + mistral.lora_mistral + mistral.mistral_classifier + mistral.lora_mistral_classifier mistral.mistral_7b mistral.lora_mistral_7b mistral.qlora_mistral_7b @@ -136,16 +167,24 @@ gemma Models of size 2B and 7B from the `Gemma family `_. -Pre-trained models can be downloaded from the Hugging Face Hub with the following command: +To download the Gemma 2B model: .. 
code-block:: bash - tune download google/gemma-2b --hf-token --ignore-patterns "" + tune download google/gemma-2b --hf-token --ignore-patterns "" + +To download the Gemma 7B model: + +.. code-block:: bash + + tune download google/gemma-7b --hf-token --ignore-patterns "gemma-7b.gguf" .. autosummary:: :toctree: generated/ :nosignatures: + gemma.gemma + gemma.lora_gemma gemma.gemma_2b gemma.lora_gemma_2b gemma.qlora_gemma_2b @@ -154,3 +193,18 @@ Pre-trained models can be downloaded from the Hugging Face Hub with the followin gemma.qlora_gemma_7b gemma.gemma_tokenizer gemma.GemmaTokenizer + + +clip +----- + +Vision components to support multimodality using `CLIP encoder `_. + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + clip.clip_vision_encoder + clip.TokenPositionalEmbedding + clip.TiledTokenPositionalEmbedding + clip.TilePositionalEmbedding diff --git a/docs/source/api_ref_modules.rst b/docs/source/api_ref_modules.rst index 3dada7c0fb..f6e8f93b38 100644 --- a/docs/source/api_ref_modules.rst +++ b/docs/source/api_ref_modules.rst @@ -17,8 +17,10 @@ Modeling Components and Building Blocks get_cosine_schedule_with_warmup RotaryPositionalEmbeddings RMSNorm + Fp32LayerNorm TransformerDecoderLayer TransformerDecoder + VisionTransformer Base Tokenizers --------------- @@ -79,3 +81,18 @@ Loss :nosignatures: loss.DPOLoss + + +Vision Transforms +------------------ +Functions used for preprocessing images. + +.. autosummary:: + :toctree: generated/ + :nosignatures: + + transforms.get_canvas_best_fit + transforms.resize_with_pad + transforms.tile_crop + transforms.find_supported_resolutions + transforms.VisionCrossAttentionMask diff --git a/docs/source/tune_cli.rst b/docs/source/tune_cli.rst index 40dee04aa5..f15837402e 100644 --- a/docs/source/tune_cli.rst +++ b/docs/source/tune_cli.rst @@ -126,12 +126,14 @@ The ``tune ls`` command lists out all the built-in recipes and configs within to Copy a built-in recipe or config -------------------------------- -The ``tune cp `` command copies built-in recipes and configs to a provided location. This allows you to make a local copy of a library +The ``tune cp `` command copies built-in recipes and configs to a provided location. This allows you to make a local copy of a library recipe or config to edit directly for yourself. .. list-table:: :widths: 30 60 + * - \-n, \--no-clobber + - Do not overwrite destination if it already exists * - \--make-parents - Create parent directories for destination if they do not exist. If not set to True, will error if parent directories do not exist @@ -207,5 +209,6 @@ The ``tune validate `` command will validate that your config is formatt .. code-block:: bash - $ tune validate recipes/configs/full_finetune_distributed.yaml + # If you've copied over a built-in config and want to validate custom changes + $ tune validate my_configs/llama3/8B_full.yaml Config is well-formed! diff --git a/docs/source/tutorials/chat.rst b/docs/source/tutorials/chat.rst index cfef43c6ba..3eb069a175 100644 --- a/docs/source/tutorials/chat.rst +++ b/docs/source/tutorials/chat.rst @@ -2,24 +2,25 @@ Fine-tuning Llama3 with Chat Data ================================= -Llama3 introduced a new prompt template for fine-tuning with chat data. In this tutorial, +Llama3 Instruct introduced a new prompt template for fine-tuning with chat data. In this tutorial, we'll cover what you need to know to get you quickly started on preparing your own -custom chat dataset for fine-tuning Llama3. +custom chat dataset for fine-tuning Llama3 Instruct. .. 
grid:: 2 .. grid-item-card:: :octicon:`mortar-board;1em;` You will learn: - * How the Llama3 format differs from Llama2 + * How the Llama3 Instruct format differs from Llama2 * All about prompt templates and special tokens - * How to use your own chat dataset to fine-tune Llama3 + * How to use your own chat dataset to fine-tune Llama3 Instruct .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites * Be familiar with :ref:`configuring datasets` - * Know how to :ref:`download Llama3 weights ` + * Know how to :ref:`download Llama3 Instruct weights ` -Note: this tutorial requires a version of torchtune > 0.1.1 +.. note:: + This tutorial requires a version of torchtune > 0.1.1 Template changes from Llama2 to Llama3 -------------------------------------- @@ -42,9 +43,9 @@ for the Llama2 chat model, we can see that special tags are added: Hi! I am a human. [/INST] Hello there! Nice to meet you! I'm Meta AI, your friendly AI assistant -Llama3 `overhauled `_ +Llama3 Instruct `overhauled `_ the template from Llama2 to better support multiturn conversations. The same text -in the Llama3 format would look like this: +in the Llama3 Instruct format would look like this: .. code-block:: text @@ -60,6 +61,15 @@ The tags are entirely different, and they are actually encoded differently than Llama2. Let's walk through tokenizing an example with the Llama2 template and the Llama3 template to understand how. +.. note:: + The Llama3 Base model uses a `different prompt template + `_ than Llama3 Instruct + because it has not yet been instruct tuned and the extra special tokens are untrained. If you + are running inference on the Llama3 Base model without fine-tuning we recommend the base + template for optimal performance. Generally, for instruct and chat data, we recommend using + Llama3 Instruct with its prompt template. The rest of this tutorial assumes you are using + Llama3 Instruct. + Tokenizing prompt templates & special tokens -------------------------------------------- @@ -122,7 +132,7 @@ why. from torchtune.models.llama2 import llama2_tokenizer tokenizer = llama2_tokenizer("/tmp/Llama-2-7b-hf/tokenizer.model") - user_message = formatted_messages[0].content + user_message = formatted_messages[0].text_content tokens = tokenizer.encode(user_message, add_bos=True, add_eos=True) print(tokens) # [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, ..., 2] @@ -132,9 +142,9 @@ as IDs 1 and 2. We can verify that these are our BOS and EOS tokens. .. code-block:: python - print(tokenizer.spm_model.piece_to_id("")) + print(tokenizer._spm_model.spm_model.piece_to_id("")) # 1 - print(tokenizer.spm_model.piece_to_id("")) + print(tokenizer._spm_model.spm_model.piece_to_id("")) # 2 The BOS and EOS tokens are what we call special tokens, because they have their own @@ -169,7 +179,7 @@ than Llama2. from torchtune.models.llama3 import llama3_tokenizer - tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B/original/tokenizer.model") + tokenizer = llama3_tokenizer("/tmp/Meta-Llama-3-8B-Instruct/original/tokenizer.model") messages = [Message.from_dict(msg) for msg in sample] tokens, mask = tokenizer.tokenize_messages(messages) print(tokenizer.decode(tokens)) @@ -191,9 +201,9 @@ as their own token IDs. .. 
code-block:: python - print(tokenizer._encode_special_token("<|begin_of_text|>")) + print(tokenizer.special_tokens["<|begin_of_text|>"]) # 128000 - print(tokenizer._encode_special_token("<|eot_id|>")) + print(tokenizer.special_tokens["<|eot_id|>"]) # 128009 The best part is - all these special tokens are handled purely by the tokenizer. @@ -314,7 +324,9 @@ object. Now we're ready to start fine-tuning! We'll use the built-in LoRA single device recipe. Use the :code:`tune cp` command to get a copy of the :code:`8B_lora_single_device.yaml` -config and update it to use your new dataset. +config and update it to use your new dataset. Create a new folder for your project +and make sure the dataset builder and message converter are saved in that directory, +then specify it in the config. .. code-block:: yaml diff --git a/docs/source/tutorials/datasets.rst b/docs/source/tutorials/datasets.rst index 67302e7c1b..7807c5bb21 100644 --- a/docs/source/tutorials/datasets.rst +++ b/docs/source/tutorials/datasets.rst @@ -171,7 +171,7 @@ Custom unstructured text corpus For continued pre-training, typically a similar data setup to pre-training is used for a simple text completion task. This means no instruct templates, chat formats, -and minimal special tokens (only BOS and EOS). To specify an unstructured text corpus, +and minimal special tokens (only BOS and, optionally, EOS). To specify an unstructured text corpus, you can use the :func:`~torchtune.datasets.text_completion_dataset` builder with a Hugging Face dataset or a custom local corpus. Here is how to specify it for local files: @@ -395,8 +395,9 @@ you can also add more advanced behavior. Multiple in-memory datasets --------------------------- -It is also possible to train on multiple datasets and configure them individually. -You can even mix instruct and chat datasets or other custom datasets. +It is also possible to train on multiple datasets and configure them individually using +our :class:`~torchtune.datasets.ConcatDataset` interface. You can even mix instruct and chat datasets +or other custom datasets. .. code-block:: yaml diff --git a/pyproject.toml b/pyproject.toml index 6c34a1307c..664c79c8cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ authors = [ ] keywords = ["pytorch", "finetuning", "llm"] dependencies = [ + # Hugging Face integrations "datasets", "huggingface_hub", @@ -65,7 +66,7 @@ where = [""] include = ["torchtune*", "recipes*"] [tool.setuptools.package-data] -recipes = ["configs/*.yaml", "configs/*/*.yaml"] +recipes = ["configs/*.yaml", "configs/*/*.yaml", "configs/*/*/*.yaml"] # ---- Tooling specifications ---- # diff --git a/recipes/configs/code_llama2/7B_full_low_memory.yaml b/recipes/configs/code_llama2/7B_full_low_memory.yaml index 6e32f83472..c78b635fb8 100644 --- a/recipes/configs/code_llama2/7B_full_low_memory.yaml +++ b/recipes/configs/code_llama2/7B_full_low_memory.yaml @@ -3,7 +3,8 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download codellama/CodeLlama-7b-hf --output-dir /tmp/CodeLlama-7b-hf +# tune download meta-llama/CodeLlama-7b-hf --output-dir /tmp/CodeLlama-7b-hf +# # The default config uses an optimizer from bitsandbytes. 
If you do not have it installed, # you can install it with # pip install bitsandbytes diff --git a/recipes/configs/code_llama2/7B_lora_single_device.yaml b/recipes/configs/code_llama2/7B_lora_single_device.yaml index 0b07271e36..b85117fc57 100644 --- a/recipes/configs/code_llama2/7B_lora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_lora_single_device.yaml @@ -3,7 +3,7 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download codellama/CodeLlama-7b-hf --output-dir /tmp/CodeLlama-7b-hf +# tune download meta-llama/CodeLlama-7b-hf --output-dir /tmp/CodeLlama-7b-hf # # To launch on a single device, run the following command from root: # tune run lora_finetune_single_device --config code_llama2/7B_lora_single_device @@ -48,6 +48,7 @@ checkpointer: recipe_checkpoint: null output_dir: /tmp/CodeLlama-7b-hf model_type: LLAMA2 +resume_from_checkpoint: False # Fine-tuning arguments batch_size: 2 diff --git a/recipes/configs/code_llama2/7B_qlora_single_device.yaml b/recipes/configs/code_llama2/7B_qlora_single_device.yaml index 768e501ddb..1887ebaf39 100644 --- a/recipes/configs/code_llama2/7B_qlora_single_device.yaml +++ b/recipes/configs/code_llama2/7B_qlora_single_device.yaml @@ -3,7 +3,7 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download codellama/CodeLlama-7b-hf --output-dir /tmp/CodeLlama-7b-hf +# tune download meta-llama/CodeLlama-7b-hf --output-dir /tmp/CodeLlama-7b-hf # # To launch on a single device, run the following command from root: # tune run lora_finetune_single_device --config code_llama2/7B_qlora_single_device diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml index 45296d59df..b92bff80d7 100644 --- a/recipes/configs/qwen2/7B_full.yaml +++ b/recipes/configs/qwen2/7B_full.yaml @@ -20,7 +20,8 @@ # Tokenizer tokenizer: _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer - path: /tmp/Qwen2-7B-Instruct/tokenizer.json + vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2-7B-Instruct/merges.txt # Dataset dataset: diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_low_memory.yaml index 3580d9ee3f..d248e7f2a2 100644 --- a/recipes/configs/qwen2/7B_full_low_memory.yaml +++ b/recipes/configs/qwen2/7B_full_low_memory.yaml @@ -22,7 +22,8 @@ # Tokenizer tokenizer: _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer - path: /tmp/Qwen2-7B-Instruct/tokenizer.json + vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2-7B-Instruct/merges.txt # Dataset dataset: diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index 5263cacf32..7e93c64855 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -29,7 +29,8 @@ model: tokenizer: _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer - path: /tmp/Qwen2-7B-Instruct/tokenizer.json + vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2-7B-Instruct/merges.txt checkpointer: _component_: torchtune.utils.FullModelHFCheckpointer diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 1dd37725d7..6cf8694809 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -27,7 +27,8 @@ model: tokenizer: _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer - path: 
/tmp/Qwen2-7B-Instruct/tokenizer.json + vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + merges_file: /tmp/Qwen2-7B-Instruct/merges.txt checkpointer: _component_: torchtune.utils.FullModelHFCheckpointer diff --git a/recipes/dev/lora_finetune_fsdp2.py b/recipes/dev/lora_finetune_fsdp2.py index 9812619e4b..3b4b1964ee 100644 --- a/recipes/dev/lora_finetune_fsdp2.py +++ b/recipes/dev/lora_finetune_fsdp2.py @@ -105,6 +105,10 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface): """ def __init__(self, cfg: DictConfig) -> None: + + if not utils.torch_version_ge("2.4.0"): + raise RuntimeError("FSDP2 recipe is only available on PyTorch nightlies") + self._device = utils.get_device(device=cfg.device) self._dtype = utils.get_dtype(cfg.dtype, device=self._device) diff --git a/tests/assets/tiny_bpe_merges.txt b/tests/assets/tiny_bpe_merges.txt new file mode 100644 index 0000000000..aff1dc83eb --- /dev/null +++ b/tests/assets/tiny_bpe_merges.txt @@ -0,0 +1,1904 @@ +Ġ Ġ +Ġ t +Ġ a +i n +h e +r e +o n +Ġt he +Ġ s +e r +a t +Ġ c +ĠĠ ĠĠ +e n +Ġ o +Ġ " +n d +e s +in g +ĠĠ Ġ +i t +Ġ p +o r +o u +Ġa nd +Ġ w +i s +Ġ f +a n +i on +a l +Ġ b +Ġt o +Ġ m +Ġ in +Ġo f +l e +c t +a r +u t +Ġ d +s t +e d +ĠĠĠĠ ĠĠĠ +i c +" : +, Ċ +r o +en t +\ n +Ġ e +p ut +o m +Ġ re +a s +v e +Ġ h +Ġt h +" ,Ċ +Ġ l +Ġ is +e t +c e +Ġ n +. \ +i m +i l +Ġ g +Ġ u +ct ion +r u +at ion +o l +c h +Ġ T +Ġf or +ou t +r a +o w +i d +l y +Ġs t +Ġb e +Ġ y +Ġp ro +i g +s e +at e +Ġth at +it h +i r +u r +o t +Ġo r +Ġ on +Ġy ou +er s +st ru +Ġa n +i f +u l +stru ction +Ġ { +Ġ } +Ġc an +in put +out put +in struction +Ġ{ Ċ +Ġ} ,Ċ +" Ċ +Ġ he +Ġc on +Ġ it +a y +es s +Ġw ith +v er +e l +Ġa s +a m +Ġ A +g e +Ġs u +i v +. ",Ċ +Ġc om +Ġ I +m ent +a k +Ġa l +\ " +. "Ċ +i ve +Ġa re +a b +a d +Ġm o +Ġe x +Ġ v +Ġ S +re s +p p +q u +Ġd e +Ġw h +it y +Ġ en +ĠT he +he r +l d +r i +t er +an t +Ġ C +is t +Ġ" ",Ċ +u m +Ġu s +Ġn e +a in +t h +e ct +Ġ le +o p +e m +i es +Ġc h +Ġ im +d u +o d +or t +n t +es t +ig h +e re +Ġh a +u s +u re +i al +o c +Ġw or +Ġthe ir +a c +en ce +i z +Ġyou r +o s +Ġim p +u d +Ġb y +Ġs e +in e +ou ld +l ow +il l +a ge +ro m +Ġs p +Ġ P +Ġs h +u st +T he +u n +' s +Ġin c +id e +p l +igh t +o g +Ġp l +p t +a re +Ġt e +Ġin t +Ġ \ +h is +Ġ r +ak e +p er +or m +a g +f f +Ġ E +ar t +Ġ k +en d +Ġ M +Ġw e +Ġ B +Ġa d +c ess +r ou +ic al +al l +ab le +Ġf rom +a nd +Ġ H +Ġa b +a ct +Ġcom p +om e +a ch +ĠT his +Ġha ve +f orm +Ġ \" +a st +Ġa t +Ġ W +Ġre s +Ġd at +: \ +t her +ion s +o re +Ġ ( +Ġcon t +ou r +e p +Ġ F +Ġa c +an ce +Ġ R +g h +Ġm e +c es +Ġw as +in d +ve l +ation s +Ġhe l +Ġmo re +ul t +Ġ D +re at +ig n +Ġhel p +im e +ar d +Ġc l +Ġa pp +an s +i e +Ġdat a +ic h +an g +ou s +el l +k s +as e +ic e +i p +it e +Ġsu ch +Ġf e +Ġw he +i b +Ġo ther +Ġth is +as s +u al +i le +n e +re d +Ġh as +o o +res s +if ic +n ing +Ġ = +Ġu p +Ġm an +Ġa r +on g +e c +Ġt ra +a v +Ġwh ich +Ġg o +Ġpro v +Ġd is +* * +s o +Ġ G +on e +Ġe m +Ġn ot +u e +Ġ O +Ġ j +a ce +Ġthe y +am e +Ġ qu +Ġ L +if f +Ġf ol +ar y +at ed +ust om +it ion +Ġit s +Ġs y +k e +ac k +r y +- - +Ġt ime +Ġd es +Ġne w +ent s +ou nt +Ġfol low +Ġal so +Ġcom m +Ġo ut +Ġe ff +Ġd iff +iv en +a p +Ġs ent +\ u +Ġs o +Ġpro du +Ġu se +Ġs c +Ġ - +Ġu n +l ud +ĠI t +en er +k ing +Ġe v +Ġab out +Ġthe m +Ġ U +Ġc ustom +Ġ ro +Ġinc lud +l es +et w +st em +x t +Ġint o +Ġp er +ĠI n +Ġ N +Ġw ill +Ġle ar +b er +Ġal l +Ġp e +d s +Ġt w +ak ing +ar k +f ul +Ġm ake +ch n +er v +o st +rou gh +Ġon e +Ġin ter +it ies +a il +i ke +re e +p le +al th +Ġus ed +or s +Ġo ver +il ity +ment s +an ge +Ġw ay +or y +Ġc ol +Ġp r +Ġc ould 
+Ġn um +re ate +in t +Ġre du +ers on +Ġre c +Ġhe r +Ġne ed +m s +at er +o y +Ġsy stem +Ġin form +Ġtw o +Ġte chn +Ġsent ence +i ence +iz e +g et +Ġdiff ere +o od +ri b +Ġb ut +Ġfollow ing +as ed +ol og +er g +le d +u res +I n +e ar +Ġp h +ow n +Ġp re +Ġw ould +Ġus ing +Ġcon s +Ġwor k +Ġmo d +at ing +i a +i re +Ġp os +i ent +o b +j ect +Ġin v +on s +Ġd o +ul ar +Ġde c +Ġhe alth +Ġimp ro +Ġan y +Ġth rough +y p +ro w +vel op +Ġpro cess +Ġt r +l ic +ver y +al s +if y +` ` +ar i +Ġst r +Ġimp ort +Ġl ike +Ġprodu ct +Ġs ome +p h +ent ial +Ġa m +at es +Ġac c +en s +n s +Ġs m +Ġin d +e en +Ġex per +le ct +Ġv al +Ġre l +it s +Ġinform ation +ing s +Ġ J +op le +in ess +Ġg iven +m m +ic es +Ġp art +il d +y s +Ġo ur +nd er +Ġp erson +al ly +Ġk e +etw een +f t +ot h +Ġsp ec +Ġb etween +erg y +ĠA I +Ġwh o +Ġm ay +e f +at ive +is e +Ġl ist +Ġk n +Ġad d +, \ +or d +ic s +Ġpe ople +ĠS t +Ġh is +Ġex p +ib le +Ġthe re +Ġs erv +Ġinc re +Ġde velop +ou nd +ow er +Ġtr ans +b s +Ġen ergy +Ġof f +Ġb us +Ġwh ile +o se +Ġa ct +Ġex am +Ġlear ning +ction s +c on +g or +g an +ut ion +rou nd +pp ort +Ġh ow +Ġb l +Ġm ed +an c +Ġt yp +Ġ ra +Ġc ar +if e +Ġwor ld +Ġv ari +Ġre p +a u +Ġs oc +Ġprov id +Ġs et +t en +Ġs ol +Ġe ach +Ġwhe n +Ġeff ect +Ġp o +Ġs he +ic k +Ġwhe re +Ġmod el +Ġimport ant +Ġu nder +Ġpro g +ener ate +ur al +t ain +Ġas s +olog y +Ġh ad +oo k +g g +Ġcustom er +t ing +v ing +Ġres p +l ine +Ġc reat +l l +il y +Ġre g +Ġd et +Ġ if +Ġ + +Ġbus iness +\n In +is h +Ġmo st +ĠĠĠĠ ĠĠĠĠ +he s +ang u +Ġprov ide +Ġad v +er m +u b +Ġs k +ir st +an y +Ġd ay +iv id +ar m +ra ct +n ce +Ġ | +Ġimpro ve +) \ +Ġc o +Ġcomm un +ark et +Ġm et +c y +Ġdiffere nt +iz ed +Ġar t +\n The +r it +Ġcom put +Ġfor m +c k +Ġh um +Ġch ar +b le +Ġle ad +ir on +Ġre m +Ġsh ould +t e +Ġal low +n ess +h at +Ġf un +Ġcomp le +Ġl angu +ag es +Ġbe c +Ġs ign +u es +at ure +Ġf ind +ri end +Ġst ud +Ġm ain +im ate +o ve +Ġres ult +Ġpl ay +Ġredu ce +Ġen g +w are +red i +Ġnum ber +Ġl ar +Ġp ol +Ġp at +Ġw ell +id ent +v iron +r ite +c rib +Ġb u +Ġh igh +Ġthe se +iv es +v es +Ġdes ign +ur n +Ġth an +d er +Ġan al +Ġw ater +Ġm arket +Ġexam ple +w ay +st and +n g +a x +it ive +Ġ ` +i qu +Ġs im +Ġe qu +gor ith +Ġte xt +res ent +Ġman y +ur ing +-- -- +\n A +Ġd i +Ġs a +viron ment +ar ch +Ġat t +Ġp ot +Ġt as +Ġc reate +ou gh +Ġf l +Ġm aking +i ous +Ġg ra +Ġl ife +\n O +Ġal gorith +al ity +en g +Ġf in +u c +? 
",Ċ +Ġ Y +Ġre t +Ġbe en +Ġtechn ology +Ġprog ra +Ġha nd +h ip +w n +Ġc al +Ġwh at +ivid ual +is s +et y +Ġlangu age +our ces +Ġcl ass +Ġt ake +Ġe as +r ic +Ġv is +b ject +Ġre f +Ġen vironment +Ġf irst +e g +Ġind ividual +Ġpl an +Ġper form +Ġ ru +i en +Ġimp act +Ġa g +ad e +Ġc le +Ġre qu +d ition +_ _ +Ġc he +pt ion +Ġapp ro +Ġ ** +Ġg reat +v ed +Ġex pl +Ġg row +G enerate +Ġm y +Ġinclud ing +Ġac cess +Ġp op +Ġm in +f ore +Ġsoc ial +in es +Ġchar act +Ġb r +Ġst ep +Ġunder stand +Ġor gan +ĠA d +Ġdis c +Ġp ower +Ġl ong +he d +Ġcon c +w ard +it ed +Ġe le +c ing +Ġe very +Ġc a +Ġof ten +Ġus er +v ie +Ġ V +Ġf ood +Ġinclud e +Ġl oc +as es +ical ly +od e +ant s +Ġinv ol +Ġsm all +Ġs ur +ach ine +Ġbe ing +Ġpot ential +Ġn o +ĠC h +Ġde p +at her +Ġb oth +Ġen s +Ġpos s +Ġ ed +crib e +t s +or k +ĠThe y +Ġp ur +iv ity +Ġwor ds +Ġsign ific +Ġw ere +ĠH ow +Ġpro m +Ġexper ience +Ġ K +u p +Ġc ount +ere d +D es +Ġf am +`` ` +ak es +Ġg l +ĠH e +Ġfe el +Ġb ack +Ġf i +Ġpro ble +iz ation +l ing +Ġcommun ic +pl oy +Ġa ut +Ġf riend +Ġhum an +Ġsp e +e w +Ġperson al +Ġto p +Ġ ent +ot her +Ġch ang +Ġc or +Ġch ange +Ġdec is +ab ility +h ing +at ural +e ver +Ġc ost +Ġgo od +au se +Ġ ident +Ġso ft +in ed +Ġp ass +' t +at ures +Ġb en +Ġcomp any +Ġst art +Ġsignific ant +Ġsu mm +on d +ol d +b ers +se l +? \ +Ġc ur +Ġl ight +Ġcomm on +.\ " +Ġcustom ers +iv ing +con om +Ġfun ction +Ġ ve +Ġth ree +Ġev en +in ing +Ġg ener +ri es +Ġle vel +Ġspec ific +Ġwe bs +Ġthe n +Ġeffect ive +c ur +en se +Ġlar ge +Ġd ist +Ġeff ic +Ġsu pport +Ġg et +C reate +re ad +p ort +Ġin f +Ġ ' +Ġy ear +Ġst ate +Ġke y +c cess +: ** +Ġa v +Ġkn ow +Ġben ef +Ġ ess +ab les +re n +Ġo wn +ĠThe se +oc k +- t +Ġ ide +om m +re en +c ed +ct ure +Ġte am +Ġr is +Ġtas ks +Ġd own +Ġst ru +Ġcomput er +- b +Ġf act +Ġm em +et ter +\n S +Ġa round +Ġwor d +Ġb ased +Ġbe h +Ġr ight +Ġd el +Ġpo int +Ġn atural +s s +Ġe conom +Ġm ade +Ġin s +Ġin st +Ġm at +Ġval ue +Ġan im +Ġse ver +\n T +ation al +it al +z e +ot e +ill s +ter n +Ġre ad +Ġcont ent +Ġon line +Ġen d +ĠU n +v ent +Ġse e +end ing +Ġm on +Ġd r +Ġke ep +Ġsystem s +c ul +v en +Ġst ory +Ġmed ia +Ġsever al +he n +ate g +Ġcont in +Ġde v +Ġlear n +Ġl a +Ġst re +Ġpart ic +Ġa ir +ual ly +Ġsu ccess +ou se +Ġis s +i ed +Ġm achine +Ġo pt +Ġ x +Ġo p +Ġpro f +oc us +ch ie +Ġmet h +n er +om p +r on +Ġh ome +Ġb etter +ĠP ro +Ġm ult +om et +Ġincre ase +Ġanal y +ver t +Ġre le +Ġb ra +in k +Ġt em +Ġp redi +Ġt re +Ġserv ice +Ġwebs ite +Ġman age +Ġsoft ware +he re +Ġpro t +- s +Ġqu est +i er +Ġkn own +Ġor der +Ġph ys +ce pt +Ġa chie +Ġin put +Ġposs ible +ĠI f +Ġex t +f ter +Ġe lect +Ġmeth od +Ġb re +ĠA n +way s +er ing +et s +Ġj ust +Ġst ore +Ġdevelop ment +Ġc are +Ġo bject +Ġtyp e +ĠF or +Ġf ocus +gg est +Ġon ly +Ġcons id +ar s +Ġch all +Ġdet erm +Ġs al +in s +Ġfe atures +Ġt ru +od y +Ġto ol +> \ +Ġens ure +os s +ub lic +Ġit em +H ere +in ation +Ġde f +Des cribe +ion al +rou p +Ġcon f +Ġneed s +Ġcharact er +Ġvari ous +Ġle t +Ġapp lic +a ut +Ġj ob +ell ig +ĠC on +Ġb est +Ġf ore +Ġam ount +ro p +Ġbu ild +iqu e +ag ing +Ġem ploy +Ġre st +a ir +W hat +Ġto get +Ġway s +Ġident ify +Ġtoget her +Ġre al +Ġus ers +Ġme an +as ing +ĠA m +Ġed uc +Ġalgorith m +Ġn etw +Ġc ode +W rite +o v +- d +ou ra +ĠHow ever +ut ure +vie w +Ġin du +Ġproduct s +ect ed +er tain +; \ +ĠA s +p r +ast e +Ġo per +Ġ $ +av i +sel f +Ġ < +Ġindu st +Ġg u +Ġother s +E x +i an +Ġ" \" +- f +n ces +Ġf il +Ġresp ons +ro l +Ġc ap +Ġbe fore +ver n +Ġcomple x +l us +rib ut +at s +Ġpos itive +o h +Ġl o +Ġg roup +Ġf ound +e e +og n +Ġs w +Ġindividual s +Ġp ract +Ġen c +Ġsh are +ra ph 
+Ġr ange +Ġsu n +\ t +Ġprovid ing +ic le +Ġde m +Ġpl ace +Ġa ud +j oy +Ġm ust +el s +er y +O ne +Ġfam ily +Ġf uture +l ess +re nt +Ġproble m +Ġess ential +ro du +i red +Ġredu cing +is m +Ġw arm +ra y +Ġab ility +Ġstr ong +Ġal ways +Ġres ources +Ġbenef its +Ġstr ateg +Ġinvol ves +Ġass ist +ere st +n A +ress ion +Ġ [ +il ities +Ġstep s +ver all +Ġsh ow +ob al +\n F +Ġl and +ĠH ere +Ġbusiness es +ĠE n +pport un +Ġme as +Ġret urn +Ġd ig +Ġh ist +y th +Ġc ent +Ġab le +Ġwith out +y c +pl ain +Ġrel ations +Ġserv ices +- c +Ġt est +ar th +Ġcommunic ation +Ġinter n +ne w +Ġs it +Ġinv est +Ġca us +Ġu nt +Ġfriend s +Ġchang es +c ri +d it +ĠB y +ĠY ou +Ġme ans +Ġre se +o ol +t ed +ellig ence +ain s +pp ing +Ġbe l +Ġrep resent +Ġha pp +Ġs er +Ġperform ance +Ġo pportun +Ġtem per +ĠS he +Ġf u +i x +b ot +Ġw rit +Ġbeh avi +Ġpro ject +ĠW ith +iv ers +d ay +Ġphys ical +iz ing +Ġact iv +Ġwith in +Ġint erest +ol ution +ward s +ff ic +Ġqu ick +Ġp ublic +Ġgrow th +Ġch o +Ġrelations hip +Ġunt il +Ġhelp s +Ġstud ents +Ġfi el +im es +ul ation +ib ility +el f +Ġf ul +Ġsu b +an k +id es +Ġsk ills +Ġcl imate +G iven +Ġp ar +Ġcle ar +ir t +N ame +Ġp resent +Ġt ri +Ġchall eng +re am +Ġl ay +Ġmarket ing +Ġsumm ary +Ġch ild +Ġsa f +Ġsu re +Ġs ame +Ġm u +Ġem ail +b on +Ġs omet +``` \ +Ġcur rent +am p +en ces +ĠR e +Ġtrans port +m e +- p +a ction +ĠE x +Ġyear s +Ġcom b +h or +anc ed +t y +Ġl ove +Ġg reen +Ġpop ular +Ġl ess +Ġd ra +Ġcont rol +Ġa ff +Ġcons um +Ġg ame +ent al +ight s +ar get +om es +o x +ic ult +er c +Ġgo als +anc ial +t le +Ġgo vern +Ġnum bers +Ġf ive +Ġst and +Ġse arch +Ġeffic ient +Ġw al +Ġn ame +at h +Ġhe art +Ġd uring +re ct +Ġover all +yth on +Ġallow s +Ġc ity +a ve +v ant +ater ial +Ġw ide +Ġm us +ific ial +Ġh ard +ĠT h +oo se +Ġgl obal +a j +Ġt er +Ġdiff icult +Ġl ine +ĠA l +c are +iv ed +Ġreg ular +Ġg r +) , +le ment +Ġh im +Ġun ique +Ġen joy +Ġmean ing +Ġop en +Ġ i +ab or +Ġare a +Ġitem s +Ġcle an +dition ally +o id +ĠW e +Ġbe aut +Ġme et +ip le +Ġstate ment +Ġag ain +ys is +Ġf ac +Ġs ources +Ġb ody +Ġalgorith ms +Ġaud ience +Ġw ant +Ġl og +Ġmain tain +Ġactiv ities +Ġmo ve +Ġc ult +one y +Ġt arget +\n B +Ġm aterial +Ġcreat ing +Ġstru cture +at form +e xt +Ġexper ien +Ġval ues +e ad +oh n +Ġhealth y +ro ss +Ġint eg +Ġrese arch +at ch +oo king +Ġro le +Ġprovid es +i ety +ist s +Ġfin ancial +or ies +d ent +Ġ er +Ġart icle +Ġele ments +Ġadd ress +Ġcon n +ĠU se +m p +Ġeas y +Ġne g +Ġcol or +Ġcal cul +Ex plain +ĠP l +p ect +in ce +al e +Ġris k +cur ity +er t +Ġfe ed +Ġev ent +v ers +pl es +Ġlevel s +Ġb i +Ġst ay +Ġpl atform +Ġbre ak +b ack +Ġs at +\nO verall +Ġeduc ation +\n C +Ġcar bon +---- ---- +ap e +Ġpre vent +Ġadd ition +Ġst ress +r al +our ce +ru s +Ġcom e +Ġrec ogn +ĠUn ited +Ġpro per +Ġpol l +dent ify +Ġunderstand ing +Ġdecis ions +i ct +Ġd ire +Ġbehavi or +Ġ * +\n I +Ġm ess +Ġanim als +Ġs l +Ġw ind +Ġb as +Ġp ain +Ġlead ing +er n +g er +Ġp res +Ġth ough +Ġinter act +y le +Ġdo es +Ġhe ad +Ġint elligence +ort s +Ġbec ome +Ġru n +ar ing +Ġimp lement +Ġa ction +o ot +ter ns +Ġprot ect +er ic +Ġf low +Ġem ot +cess ary +ur ate +Ġsu ggest +Ġprogra m +Ġph r +Ġhealth care +ent ion +Ġsu st +Ġwh y +Ġacc urate +l u +Ġh ig +Ġre ach +Ġallow ing +Ġtra vel +Ġrequ ire +Ġare as +Ġde ep +H e +Ġfe w +Ġs elf +ou n +Ġ # +os p +st r +Ġmin ut +Ġdecis ion +ĠThe re +an ces +Ġqu ality +Ġav ail +Ġsp ace +Ġsomet hing +Ġwe b +Ġpat terns +Ġm ot +or ing +is f +Ġan other +Ġacc ount +\n W +us s +Ġm aj +u ation +Ġsust ain +Ġaut om +iqu es +iss ions +ver se +Ġcon cept +Ġse curity +Ġth ose +Ġprof ess +Ġsh ort +Ġn ight +eng th +a pt 
+e x +ĠAd ditionally +Ġt aking +Ġto o +ag n +Ġsim ple +lus ion +ien cy +as h +our s +Ġp a +Ġl it +ĠS p +it ing +Ġd on +Ġl im +l ish +m at +av es +led ge +dition al +in c +Ġev ents +Ġoff er +th ing +Ġwor king +Ġanal ysis +Ġachie ve +Ġp ie +Ġb ook +Ġf re +Ġmu ch +o on +Ġt ry +es p +Ġw aste +f ace +Ġe ar +Ġf ru +Ġtransport ation +ch ool +Ġtechn iques +Ġprogra mm +ĠE arth +Ġpredi ct +Ġne ver +w s +u ment +imate ly +are d +Ġpartic ular +Ġto wards +Ġeconom ic +Ġincre asing +Ġf ast +im ent +Ġnetw ork +Ġcor rect +Ġm ight +Ġo c +Ġbec ause +ĠW h +a z +pl ay +Ġresult s +Ġmanage ment +Ġpur ch +Ġs ound +Ġp ast +Ġtra ining +__ __ +op e +Ġeng age +oura ge +Ġs ense +Ġf ree +Ġpre f +e es +Ġcount ries +ne y +an ies +Ġa fter +Ġm ind +Ġex c +ĠO nce +ĠĠĠĠ ĠĠĠĠĠĠĠ +Ġcomple te +Ġim m +Ġ est +Ġg enerate +ver b +ĠD e +' m +Ġtool s +redi ents +Ġmaj or +ent ly +Ġcont ribut +le ep +Ġpoint s +dit ions +Ġfact ors +Ġe l +Ġne xt +i um +ou d +Ġc ru +Ġre as +ri ate +ĠI nd +Ġprom ot +Ġhist ory +Ġj our +Ġd ue +C on +Ġve get +en cy +ĠAm eric +Ġf ra +Ġdiffere nce +o ard +le x +Ġequ ation +irt ual +Ġc up +Ġfore st +Ġneg ative +Ġse con +on es +Ġn ature +Ġus es +a h +p or +Ġse c +ord ing +Ġl ast +ĠS ome +Ġiss ues +Ġsc ient +Ġpr int +ĠSt ates +o ver +Ġsat isf +Ġdev ices +Ġdis e +Ġtemper ature +Ġfeed back +Ġne cessary +Ġem issions +m b +Ġl ow +f or +t al +Ġchalleng es +Ġar ray +Ġs ide +Ġeng ine +Ġb oo +at a +Ġbel ie +- m +Ġmult iple +Ġs ing +Ġgovern ment +am es +if ied +Ġminut es +Ġsuccess ful +Ġm oney +Ġquick ly +Ġb ir +Ġtyp ically +Ġp ost +Ġpre p +Ġknow ledge +pp ed +a ctions +Ġmethod s +Ġopt im +\n P +Ġout put +Ġfiel d +Ġt able +Ġb al +Ġcol l +Ġcharact ers +v olution +or ds +il ar +ific ation +an e +Ġc ell +Ġm il +ĠW hat +Ġs qu +Ġl ives +ĠA r +Ġphr ase +Ġn ut +Ġdig ital +Ġintern et +l ass +u ra +omm end +Ġt reat +Ġappro p +res h +ur ther +ĠO ne +Ġvis ual +ate gor +Ġappro ach +Ġc ertain +Ġsh o +v al +Ġtas k +i res +Ġapprop riate +Ġv ie +Ġdesign ed +p ose +** : +f ort +Ġ| \ +Ġapplic ations +Ġp ay +Ġn ow +Ġhe at +Ġindust ry +p re +Ġeffective ly +Ġpop ulation +Ġopportun ities +< / +ĠT o +Ġup d +Ġinclud es +ĠE ng +Ġtyp es +Ġup on +Ġconsid er +le t +Ġg en +og raph +pl ace +Ġt imes +Ġar g +C omp +ĠG o +Ġre ce +Ġchild ren +Ġtra ck +Ġsome one +w ord +Ġyou ng +Ġcon ditions +Ġtra ditional +Ġmodel s +I dentify +Ġc amp +Ġm akes +ist ic +Ġar r +Ġc ard +ut ions +l t +Ġo ld +Ġide as +Ġe y +Ġt ree +Ġiss ue +Ġh arm +Ġavail able +Ġc r +Ġpower ful +n ov +Ġmo vie +Ġwe ather +Ġsk y +Ġquest ions +e et +Ġact ivity +Ġbra nd +is hed +Ġanaly ze +ĠS h +Ġen h +av or +Ġbe g +Ġs chool +i ate +Ġeas ier +Ġinf lu +Ġn on +Ġstud y +Ġl ook +Ġsol ution +Ġle g +Ġcon st +H ow +Ġcomp et diff --git a/tests/assets/tiny_bpe_vocab.json b/tests/assets/tiny_bpe_vocab.json new file mode 100644 index 0000000000..4ddab5d667 --- /dev/null +++ b/tests/assets/tiny_bpe_vocab.json @@ -0,0 +1,2002 @@ +{ + "!": 0, + "\"": 1, + "#": 2, + "$": 3, + "%": 4, + "&": 5, + "'": 6, + "(": 7, + ")": 8, + "*": 9, + "+": 10, + ",": 11, + "-": 12, + ".": 13, + "/": 14, + "0": 15, + "1": 16, + "2": 17, + "3": 18, + "4": 19, + "5": 20, + "6": 21, + "7": 22, + "8": 23, + "9": 24, + ":": 25, + ";": 26, + "<": 27, + "=": 28, + ">": 29, + "?": 30, + "@": 31, + "A": 32, + "B": 33, + "C": 34, + "D": 35, + "E": 36, + "F": 37, + "G": 38, + "H": 39, + "I": 40, + "J": 41, + "K": 42, + "L": 43, + "M": 44, + "N": 45, + "O": 46, + "P": 47, + "Q": 48, + "R": 49, + "S": 50, + "T": 51, + "U": 52, + "V": 53, + "W": 54, + "X": 55, + "Y": 56, + "Z": 57, + "[": 58, + "\\": 59, + "]": 60, + "^": 61, + "_": 62, + "`": 
63, + "a": 64, + "b": 65, + "c": 66, + "d": 67, + "e": 68, + "f": 69, + "g": 70, + "h": 71, + "i": 72, + "j": 73, + "k": 74, + "l": 75, + "m": 76, + "n": 77, + "o": 78, + "p": 79, + "q": 80, + "r": 81, + "s": 82, + "t": 83, + "u": 84, + "v": 85, + "w": 86, + "x": 87, + "y": 88, + "z": 89, + "{": 90, + "|": 91, + "}": 92, + "~": 93, + "Ċ": 94, + "Ġ": 95, + "ĠĠ": 96, + "Ġt": 97, + "Ġa": 98, + "in": 99, + "he": 100, + "re": 101, + "on": 102, + "Ġthe": 103, + "Ġs": 104, + "er": 105, + "at": 106, + "Ġc": 107, + "ĠĠĠĠ": 108, + "en": 109, + "Ġo": 110, + "Ġ\"": 111, + "nd": 112, + "es": 113, + "ing": 114, + "ĠĠĠ": 115, + "it": 116, + "Ġp": 117, + "or": 118, + "ou": 119, + "Ġand": 120, + "Ġw": 121, + "is": 122, + "Ġf": 123, + "an": 124, + "ion": 125, + "al": 126, + "Ġb": 127, + "Ġto": 128, + "Ġm": 129, + "Ġin": 130, + "Ġof": 131, + "le": 132, + "ct": 133, + "ar": 134, + "ut": 135, + "Ġd": 136, + "st": 137, + "ed": 138, + "ĠĠĠĠĠĠĠ": 139, + "ic": 140, + "\":": 141, + ",Ċ": 142, + "ro": 143, + "ent": 144, + "\\n": 145, + "Ġe": 146, + "put": 147, + "om": 148, + "Ġre": 149, + "as": 150, + "ve": 151, + "Ġh": 152, + "Ġth": 153, + "\",Ċ": 154, + "Ġl": 155, + "Ġis": 156, + "et": 157, + "ce": 158, + "Ġn": 159, + ".\\": 160, + "im": 161, + "il": 162, + "Ġg": 163, + "Ġu": 164, + "ction": 165, + "ru": 166, + "ation": 167, + "ol": 168, + "ch": 169, + "ĠT": 170, + "Ġfor": 171, + "out": 172, + "ra": 173, + "ow": 174, + "id": 175, + "ly": 176, + "Ġst": 177, + "Ġbe": 178, + "Ġy": 179, + "Ġpro": 180, + "ig": 181, + "se": 182, + "ate": 183, + "Ġthat": 184, + "ith": 185, + "ir": 186, + "ur": 187, + "ot": 188, + "Ġor": 189, + "Ġon": 190, + "Ġyou": 191, + "ers": 192, + "stru": 193, + "Ġan": 194, + "if": 195, + "ul": 196, + "struction": 197, + "Ġ{": 198, + "Ġ}": 199, + "Ġcan": 200, + "input": 201, + "output": 202, + "instruction": 203, + "Ġ{Ċ": 204, + "Ġ},Ċ": 205, + "\"Ċ": 206, + "Ġhe": 207, + "Ġcon": 208, + "Ġit": 209, + "ay": 210, + "ess": 211, + "Ġwith": 212, + "ver": 213, + "el": 214, + "Ġas": 215, + "am": 216, + "ĠA": 217, + "ge": 218, + "Ġsu": 219, + "iv": 220, + ".\",Ċ": 221, + "Ġcom": 222, + "ĠI": 223, + "ment": 224, + "ak": 225, + "Ġal": 226, + "\\\"": 227, + ".\"Ċ": 228, + "ive": 229, + "Ġare": 230, + "ab": 231, + "ad": 232, + "Ġmo": 233, + "Ġex": 234, + "Ġv": 235, + "ĠS": 236, + "res": 237, + "pp": 238, + "qu": 239, + "Ġde": 240, + "Ġwh": 241, + "ity": 242, + "Ġen": 243, + "ĠThe": 244, + "her": 245, + "ld": 246, + "ri": 247, + "ter": 248, + "ant": 249, + "ĠC": 250, + "ist": 251, + "Ġ\"\",Ċ": 252, + "um": 253, + "Ġus": 254, + "Ġne": 255, + "ain": 256, + "th": 257, + "ect": 258, + "Ġle": 259, + "op": 260, + "em": 261, + "ies": 262, + "Ġch": 263, + "Ġim": 264, + "du": 265, + "od": 266, + "ort": 267, + "nt": 268, + "est": 269, + "igh": 270, + "ere": 271, + "Ġha": 272, + "us": 273, + "ure": 274, + "ial": 275, + "oc": 276, + "Ġwor": 277, + "Ġtheir": 278, + "ac": 279, + "ence": 280, + "iz": 281, + "Ġyour": 282, + "os": 283, + "Ġimp": 284, + "ud": 285, + "Ġby": 286, + "Ġse": 287, + "ine": 288, + "ould": 289, + "low": 290, + "ill": 291, + "age": 292, + "rom": 293, + "Ġsp": 294, + "ĠP": 295, + "Ġsh": 296, + "ust": 297, + "The": 298, + "un": 299, + "'s": 300, + "Ġinc": 301, + "ide": 302, + "pl": 303, + "ight": 304, + "og": 305, + "Ġpl": 306, + "pt": 307, + "are": 308, + "Ġte": 309, + "Ġint": 310, + "Ġ\\": 311, + "his": 312, + "Ġr": 313, + "ake": 314, + "per": 315, + "orm": 316, + "ag": 317, + "ff": 318, + "ĠE": 319, + "art": 320, + "Ġk": 321, + "end": 322, + "ĠM": 323, + "Ġwe": 324, + "ĠB": 325, + "Ġad": 326, + "cess": 
327, + "rou": 328, + "ical": 329, + "all": 330, + "able": 331, + "Ġfrom": 332, + "and": 333, + "ĠH": 334, + "Ġab": 335, + "act": 336, + "Ġcomp": 337, + "ome": 338, + "ach": 339, + "ĠThis": 340, + "Ġhave": 341, + "form": 342, + "Ġ\\\"": 343, + "ast": 344, + "Ġat": 345, + "ĠW": 346, + "Ġres": 347, + "Ġdat": 348, + ":\\": 349, + "ther": 350, + "ions": 351, + "ore": 352, + "Ġ(": 353, + "Ġcont": 354, + "our": 355, + "ep": 356, + "ĠF": 357, + "Ġac": 358, + "ance": 359, + "ĠR": 360, + "gh": 361, + "Ġme": 362, + "ces": 363, + "Ġwas": 364, + "ind": 365, + "vel": 366, + "ations": 367, + "Ġhel": 368, + "Ġmore": 369, + "ult": 370, + "ĠD": 371, + "reat": 372, + "ign": 373, + "Ġhelp": 374, + "ime": 375, + "ard": 376, + "Ġcl": 377, + "Ġapp": 378, + "ans": 379, + "ie": 380, + "Ġdata": 381, + "ich": 382, + "ang": 383, + "ous": 384, + "ell": 385, + "ks": 386, + "ase": 387, + "ice": 388, + "ip": 389, + "ite": 390, + "Ġsuch": 391, + "Ġfe": 392, + "Ġwhe": 393, + "ib": 394, + "Ġother": 395, + "Ġthis": 396, + "ass": 397, + "ual": 398, + "ile": 399, + "ne": 400, + "red": 401, + "Ġhas": 402, + "oo": 403, + "ress": 404, + "ific": 405, + "ning": 406, + "Ġ=": 407, + "Ġup": 408, + "Ġman": 409, + "Ġar": 410, + "ong": 411, + "ec": 412, + "Ġtra": 413, + "av": 414, + "Ġwhich": 415, + "Ġgo": 416, + "Ġprov": 417, + "Ġdis": 418, + "**": 419, + "so": 420, + "ĠG": 421, + "one": 422, + "Ġem": 423, + "Ġnot": 424, + "ue": 425, + "ĠO": 426, + "Ġj": 427, + "ace": 428, + "Ġthey": 429, + "ame": 430, + "Ġqu": 431, + "ĠL": 432, + "iff": 433, + "Ġfol": 434, + "ary": 435, + "ated": 436, + "ustom": 437, + "ition": 438, + "Ġits": 439, + "Ġsy": 440, + "ke": 441, + "ack": 442, + "ry": 443, + "--": 444, + "Ġtime": 445, + "Ġdes": 446, + "Ġnew": 447, + "ents": 448, + "ount": 449, + "Ġfollow": 450, + "Ġalso": 451, + "Ġcomm": 452, + "Ġout": 453, + "Ġeff": 454, + "Ġdiff": 455, + "iven": 456, + "ap": 457, + "Ġsent": 458, + "\\u": 459, + "Ġso": 460, + "Ġprodu": 461, + "Ġuse": 462, + "Ġsc": 463, + "Ġ-": 464, + "Ġun": 465, + "lud": 466, + "ĠIt": 467, + "ener": 468, + "king": 469, + "Ġev": 470, + "Ġabout": 471, + "Ġthem": 472, + "ĠU": 473, + "Ġcustom": 474, + "Ġro": 475, + "Ġinclud": 476, + "les": 477, + "etw": 478, + "stem": 479, + "xt": 480, + "Ġinto": 481, + "Ġper": 482, + "ĠIn": 483, + "ĠN": 484, + "Ġwill": 485, + "Ġlear": 486, + "ber": 487, + "Ġall": 488, + "Ġpe": 489, + "ds": 490, + "Ġtw": 491, + "aking": 492, + "ark": 493, + "ful": 494, + "Ġmake": 495, + "chn": 496, + "erv": 497, + "ost": 498, + "rough": 499, + "Ġone": 500, + "Ġinter": 501, + "ities": 502, + "ail": 503, + "ike": 504, + "ree": 505, + "ple": 506, + "alth": 507, + "Ġused": 508, + "ors": 509, + "Ġover": 510, + "ility": 511, + "ments": 512, + "ange": 513, + "Ġway": 514, + "ory": 515, + "Ġcol": 516, + "Ġpr": 517, + "Ġcould": 518, + "Ġnum": 519, + "reate": 520, + "int": 521, + "Ġredu": 522, + "erson": 523, + "Ġrec": 524, + "Ġher": 525, + "Ġneed": 526, + "ms": 527, + "ater": 528, + "oy": 529, + "Ġsystem": 530, + "Ġinform": 531, + "Ġtwo": 532, + "Ġtechn": 533, + "Ġsentence": 534, + "ience": 535, + "ize": 536, + "get": 537, + "Ġdiffere": 538, + "ood": 539, + "rib": 540, + "Ġbut": 541, + "Ġfollowing": 542, + "ased": 543, + "olog": 544, + "erg": 545, + "led": 546, + "ures": 547, + "In": 548, + "ear": 549, + "Ġph": 550, + "own": 551, + "Ġpre": 552, + "Ġwould": 553, + "Ġusing": 554, + "Ġcons": 555, + "Ġwork": 556, + "Ġmod": 557, + "ating": 558, + "ia": 559, + "ire": 560, + "Ġpos": 561, + "ient": 562, + "ob": 563, + "ject": 564, + "Ġinv": 565, + "ons": 566, + "Ġdo": 567, + "ular": 568, + 
"Ġdec": 569, + "Ġhealth": 570, + "Ġimpro": 571, + "Ġany": 572, + "Ġthrough": 573, + "yp": 574, + "row": 575, + "velop": 576, + "Ġprocess": 577, + "Ġtr": 578, + "lic": 579, + "very": 580, + "als": 581, + "ify": 582, + "``": 583, + "ari": 584, + "Ġstr": 585, + "Ġimport": 586, + "Ġlike": 587, + "Ġproduct": 588, + "Ġsome": 589, + "ph": 590, + "ential": 591, + "Ġam": 592, + "ates": 593, + "Ġacc": 594, + "ens": 595, + "ns": 596, + "Ġsm": 597, + "Ġind": 598, + "een": 599, + "Ġexper": 600, + "lect": 601, + "Ġval": 602, + "Ġrel": 603, + "its": 604, + "Ġinformation": 605, + "ings": 606, + "ĠJ": 607, + "ople": 608, + "iness": 609, + "Ġgiven": 610, + "mm": 611, + "ices": 612, + "Ġpart": 613, + "ild": 614, + "ys": 615, + "Ġour": 616, + "nder": 617, + "Ġperson": 618, + "ally": 619, + "Ġke": 620, + "etween": 621, + "ft": 622, + "oth": 623, + "Ġspec": 624, + "Ġbetween": 625, + "ergy": 626, + "ĠAI": 627, + "Ġwho": 628, + "Ġmay": 629, + "ef": 630, + "ative": 631, + "ise": 632, + "Ġlist": 633, + "Ġkn": 634, + "Ġadd": 635, + ",\\": 636, + "ord": 637, + "ics": 638, + "Ġpeople": 639, + "ĠSt": 640, + "Ġhis": 641, + "Ġexp": 642, + "ible": 643, + "Ġthere": 644, + "Ġserv": 645, + "Ġincre": 646, + "Ġdevelop": 647, + "ound": 648, + "ower": 649, + "Ġtrans": 650, + "bs": 651, + "Ġenergy": 652, + "Ġoff": 653, + "Ġbus": 654, + "Ġwhile": 655, + "ose": 656, + "Ġact": 657, + "Ġexam": 658, + "Ġlearning": 659, + "ctions": 660, + "con": 661, + "gor": 662, + "gan": 663, + "ution": 664, + "round": 665, + "pport": 666, + "Ġhow": 667, + "Ġbl": 668, + "Ġmed": 669, + "anc": 670, + "Ġtyp": 671, + "Ġra": 672, + "Ġcar": 673, + "ife": 674, + "Ġworld": 675, + "Ġvari": 676, + "Ġrep": 677, + "au": 678, + "Ġsoc": 679, + "Ġprovid": 680, + "Ġset": 681, + "ten": 682, + "Ġsol": 683, + "Ġeach": 684, + "Ġwhen": 685, + "Ġeffect": 686, + "Ġpo": 687, + "Ġshe": 688, + "ick": 689, + "Ġwhere": 690, + "Ġmodel": 691, + "Ġimportant": 692, + "Ġunder": 693, + "Ġprog": 694, + "enerate": 695, + "ural": 696, + "tain": 697, + "Ġass": 698, + "ology": 699, + "Ġhad": 700, + "ook": 701, + "gg": 702, + "Ġcustomer": 703, + "ting": 704, + "ving": 705, + "Ġresp": 706, + "line": 707, + "Ġcreat": 708, + "ll": 709, + "ily": 710, + "Ġreg": 711, + "Ġdet": 712, + "Ġif": 713, + "Ġ+": 714, + "Ġbusiness": 715, + "\\nIn": 716, + "ish": 717, + "Ġmost": 718, + "ĠĠĠĠĠĠĠĠ": 719, + "hes": 720, + "angu": 721, + "Ġprovide": 722, + "Ġadv": 723, + "erm": 724, + "ub": 725, + "Ġsk": 726, + "irst": 727, + "any": 728, + "Ġday": 729, + "ivid": 730, + "arm": 731, + "ract": 732, + "nce": 733, + "Ġ|": 734, + "Ġimprove": 735, + ")\\": 736, + "Ġco": 737, + "Ġcommun": 738, + "arket": 739, + "Ġmet": 740, + "cy": 741, + "Ġdifferent": 742, + "ized": 743, + "Ġart": 744, + "\\nThe": 745, + "rit": 746, + "Ġcomput": 747, + "Ġform": 748, + "ck": 749, + "Ġhum": 750, + "Ġchar": 751, + "ble": 752, + "Ġlead": 753, + "iron": 754, + "Ġrem": 755, + "Ġshould": 756, + "te": 757, + "Ġallow": 758, + "ness": 759, + "hat": 760, + "Ġfun": 761, + "Ġcomple": 762, + "Ġlangu": 763, + "ages": 764, + "Ġbec": 765, + "Ġsign": 766, + "ues": 767, + "ature": 768, + "Ġfind": 769, + "riend": 770, + "Ġstud": 771, + "Ġmain": 772, + "imate": 773, + "ove": 774, + "Ġresult": 775, + "Ġplay": 776, + "Ġreduce": 777, + "Ġeng": 778, + "ware": 779, + "redi": 780, + "Ġnumber": 781, + "Ġlar": 782, + "Ġpol": 783, + "Ġpat": 784, + "Ġwell": 785, + "ident": 786, + "viron": 787, + "rite": 788, + "crib": 789, + "Ġbu": 790, + "Ġhigh": 791, + "Ġthese": 792, + "ives": 793, + "ves": 794, + "Ġdesign": 795, + "urn": 796, + "Ġthan": 797, + "der": 798, + 
"Ġanal": 799, + "Ġwater": 800, + "Ġmarket": 801, + "Ġexample": 802, + "way": 803, + "stand": 804, + "ng": 805, + "ax": 806, + "itive": 807, + "Ġ`": 808, + "iqu": 809, + "Ġsim": 810, + "Ġequ": 811, + "gorith": 812, + "Ġtext": 813, + "resent": 814, + "Ġmany": 815, + "uring": 816, + "----": 817, + "\\nA": 818, + "Ġdi": 819, + "Ġsa": 820, + "vironment": 821, + "arch": 822, + "Ġatt": 823, + "Ġpot": 824, + "Ġtas": 825, + "Ġcreate": 826, + "ough": 827, + "Ġfl": 828, + "Ġmaking": 829, + "ious": 830, + "Ġgra": 831, + "Ġlife": 832, + "\\nO": 833, + "Ġalgorith": 834, + "ality": 835, + "eng": 836, + "Ġfin": 837, + "uc": 838, + "?\",Ċ": 839, + "ĠY": 840, + "Ġret": 841, + "Ġbeen": 842, + "Ġtechnology": 843, + "Ġprogra": 844, + "Ġhand": 845, + "hip": 846, + "wn": 847, + "Ġcal": 848, + "Ġwhat": 849, + "ividual": 850, + "iss": 851, + "ety": 852, + "Ġlanguage": 853, + "ources": 854, + "Ġclass": 855, + "Ġtake": 856, + "Ġeas": 857, + "ric": 858, + "Ġvis": 859, + "bject": 860, + "Ġref": 861, + "Ġenvironment": 862, + "Ġfirst": 863, + "eg": 864, + "Ġindividual": 865, + "Ġplan": 866, + "Ġperform": 867, + "Ġru": 868, + "ien": 869, + "Ġimpact": 870, + "Ġag": 871, + "ade": 872, + "Ġcle": 873, + "Ġrequ": 874, + "dition": 875, + "__": 876, + "Ġche": 877, + "ption": 878, + "Ġappro": 879, + "Ġ**": 880, + "Ġgreat": 881, + "ved": 882, + "Ġexpl": 883, + "Ġgrow": 884, + "Generate": 885, + "Ġmy": 886, + "Ġincluding": 887, + "Ġaccess": 888, + "Ġpop": 889, + "Ġmin": 890, + "fore": 891, + "Ġsocial": 892, + "ines": 893, + "Ġcharact": 894, + "Ġbr": 895, + "Ġstep": 896, + "Ġunderstand": 897, + "Ġorgan": 898, + "ĠAd": 899, + "Ġdisc": 900, + "Ġpower": 901, + "Ġlong": 902, + "hed": 903, + "Ġconc": 904, + "ward": 905, + "ited": 906, + "Ġele": 907, + "cing": 908, + "Ġevery": 909, + "Ġca": 910, + "Ġoften": 911, + "Ġuser": 912, + "vie": 913, + "ĠV": 914, + "Ġfood": 915, + "Ġinclude": 916, + "Ġloc": 917, + "ases": 918, + "ically": 919, + "ode": 920, + "ants": 921, + "Ġinvol": 922, + "Ġsmall": 923, + "Ġsur": 924, + "achine": 925, + "Ġbeing": 926, + "Ġpotential": 927, + "Ġno": 928, + "ĠCh": 929, + "Ġdep": 930, + "ather": 931, + "Ġboth": 932, + "Ġens": 933, + "Ġposs": 934, + "Ġed": 935, + "cribe": 936, + "ts": 937, + "ork": 938, + "ĠThey": 939, + "Ġpur": 940, + "ivity": 941, + "Ġwords": 942, + "Ġsignific": 943, + "Ġwere": 944, + "ĠHow": 945, + "Ġprom": 946, + "Ġexperience": 947, + "ĠK": 948, + "up": 949, + "Ġcount": 950, + "ered": 951, + "Des": 952, + "Ġfam": 953, + "```": 954, + "akes": 955, + "Ġgl": 956, + "ĠHe": 957, + "Ġfeel": 958, + "Ġback": 959, + "Ġfi": 960, + "Ġproble": 961, + "ization": 962, + "ling": 963, + "Ġcommunic": 964, + "ploy": 965, + "Ġaut": 966, + "Ġfriend": 967, + "Ġhuman": 968, + "Ġspe": 969, + "ew": 970, + "Ġpersonal": 971, + "Ġtop": 972, + "Ġent": 973, + "other": 974, + "Ġchang": 975, + "Ġcor": 976, + "Ġchange": 977, + "Ġdecis": 978, + "ability": 979, + "hing": 980, + "atural": 981, + "ever": 982, + "Ġcost": 983, + "Ġgood": 984, + "ause": 985, + "Ġident": 986, + "Ġsoft": 987, + "ined": 988, + "Ġpass": 989, + "'t": 990, + "atures": 991, + "Ġben": 992, + "Ġcompany": 993, + "Ġstart": 994, + "Ġsignificant": 995, + "Ġsumm": 996, + "ond": 997, + "old": 998, + "bers": 999, + "sel": 1000, + "?\\": 1001, + "Ġcur": 1002, + "Ġlight": 1003, + "Ġcommon": 1004, + ".\\\"": 1005, + "Ġcustomers": 1006, + "iving": 1007, + "conom": 1008, + "Ġfunction": 1009, + "Ġve": 1010, + "Ġthree": 1011, + "Ġeven": 1012, + "ining": 1013, + "Ġgener": 1014, + "ries": 1015, + "Ġlevel": 1016, + "Ġspecific": 1017, + "Ġwebs": 1018, + "Ġthen": 1019, + 
"Ġeffective": 1020, + "cur": 1021, + "ense": 1022, + "Ġlarge": 1023, + "Ġdist": 1024, + "Ġeffic": 1025, + "Ġsupport": 1026, + "Ġget": 1027, + "Create": 1028, + "read": 1029, + "port": 1030, + "Ġinf": 1031, + "Ġ'": 1032, + "Ġyear": 1033, + "Ġstate": 1034, + "Ġkey": 1035, + "ccess": 1036, + ":**": 1037, + "Ġav": 1038, + "Ġknow": 1039, + "Ġbenef": 1040, + "Ġess": 1041, + "ables": 1042, + "ren": 1043, + "Ġown": 1044, + "ĠThese": 1045, + "ock": 1046, + "-t": 1047, + "Ġide": 1048, + "omm": 1049, + "reen": 1050, + "ced": 1051, + "cture": 1052, + "Ġteam": 1053, + "Ġris": 1054, + "Ġtasks": 1055, + "Ġdown": 1056, + "Ġstru": 1057, + "Ġcomputer": 1058, + "-b": 1059, + "Ġfact": 1060, + "Ġmem": 1061, + "etter": 1062, + "\\nS": 1063, + "Ġaround": 1064, + "Ġword": 1065, + "Ġbased": 1066, + "Ġbeh": 1067, + "Ġright": 1068, + "Ġdel": 1069, + "Ġpoint": 1070, + "Ġnatural": 1071, + "ss": 1072, + "Ġeconom": 1073, + "Ġmade": 1074, + "Ġins": 1075, + "Ġinst": 1076, + "Ġmat": 1077, + "Ġvalue": 1078, + "Ġanim": 1079, + "Ġsever": 1080, + "\\nT": 1081, + "ational": 1082, + "ital": 1083, + "ze": 1084, + "ote": 1085, + "ills": 1086, + "tern": 1087, + "Ġread": 1088, + "Ġcontent": 1089, + "Ġonline": 1090, + "Ġend": 1091, + "ĠUn": 1092, + "vent": 1093, + "Ġsee": 1094, + "ending": 1095, + "Ġmon": 1096, + "Ġdr": 1097, + "Ġkeep": 1098, + "Ġsystems": 1099, + "cul": 1100, + "ven": 1101, + "Ġstory": 1102, + "Ġmedia": 1103, + "Ġseveral": 1104, + "hen": 1105, + "ateg": 1106, + "Ġcontin": 1107, + "Ġdev": 1108, + "Ġlearn": 1109, + "Ġla": 1110, + "Ġstre": 1111, + "Ġpartic": 1112, + "Ġair": 1113, + "ually": 1114, + "Ġsuccess": 1115, + "ouse": 1116, + "Ġiss": 1117, + "ied": 1118, + "Ġmachine": 1119, + "Ġopt": 1120, + "Ġx": 1121, + "Ġop": 1122, + "Ġprof": 1123, + "ocus": 1124, + "chie": 1125, + "Ġmeth": 1126, + "ner": 1127, + "omp": 1128, + "ron": 1129, + "Ġhome": 1130, + "Ġbetter": 1131, + "ĠPro": 1132, + "Ġmult": 1133, + "omet": 1134, + "Ġincrease": 1135, + "Ġanaly": 1136, + "vert": 1137, + "Ġrele": 1138, + "Ġbra": 1139, + "ink": 1140, + "Ġtem": 1141, + "Ġpredi": 1142, + "Ġtre": 1143, + "Ġservice": 1144, + "Ġwebsite": 1145, + "Ġmanage": 1146, + "Ġsoftware": 1147, + "here": 1148, + "Ġprot": 1149, + "-s": 1150, + "Ġquest": 1151, + "ier": 1152, + "Ġknown": 1153, + "Ġorder": 1154, + "Ġphys": 1155, + "cept": 1156, + "Ġachie": 1157, + "Ġinput": 1158, + "Ġpossible": 1159, + "ĠIf": 1160, + "Ġext": 1161, + "fter": 1162, + "Ġelect": 1163, + "Ġmethod": 1164, + "Ġbre": 1165, + "ĠAn": 1166, + "ways": 1167, + "ering": 1168, + "ets": 1169, + "Ġjust": 1170, + "Ġstore": 1171, + "Ġdevelopment": 1172, + "Ġcare": 1173, + "Ġobject": 1174, + "Ġtype": 1175, + "ĠFor": 1176, + "Ġfocus": 1177, + "ggest": 1178, + "Ġonly": 1179, + "Ġconsid": 1180, + "ars": 1181, + "Ġchall": 1182, + "Ġdeterm": 1183, + "Ġsal": 1184, + "ins": 1185, + "Ġfeatures": 1186, + "Ġtru": 1187, + "ody": 1188, + "Ġtool": 1189, + ">\\": 1190, + "Ġensure": 1191, + "oss": 1192, + "ublic": 1193, + "Ġitem": 1194, + "Here": 1195, + "ination": 1196, + "Ġdef": 1197, + "Describe": 1198, + "ional": 1199, + "roup": 1200, + "Ġconf": 1201, + "Ġneeds": 1202, + "Ġcharacter": 1203, + "Ġvarious": 1204, + "Ġlet": 1205, + "Ġapplic": 1206, + "aut": 1207, + "Ġjob": 1208, + "ellig": 1209, + "ĠCon": 1210, + "Ġbest": 1211, + "Ġfore": 1212, + "Ġamount": 1213, + "rop": 1214, + "Ġbuild": 1215, + "ique": 1216, + "aging": 1217, + "Ġemploy": 1218, + "Ġrest": 1219, + "air": 1220, + "What": 1221, + "Ġtoget": 1222, + "Ġways": 1223, + "Ġidentify": 1224, + "Ġtogether": 1225, + "Ġreal": 1226, + "Ġusers": 1227, + "Ġmean": 1228, + 
"asing": 1229, + "ĠAm": 1230, + "Ġeduc": 1231, + "Ġalgorithm": 1232, + "Ġnetw": 1233, + "Ġcode": 1234, + "Write": 1235, + "ov": 1236, + "-d": 1237, + "oura": 1238, + "ĠHowever": 1239, + "uture": 1240, + "view": 1241, + "Ġindu": 1242, + "Ġproducts": 1243, + "ected": 1244, + "ertain": 1245, + ";\\": 1246, + "ĠAs": 1247, + "pr": 1248, + "aste": 1249, + "Ġoper": 1250, + "Ġ$": 1251, + "avi": 1252, + "self": 1253, + "Ġ<": 1254, + "Ġindust": 1255, + "Ġgu": 1256, + "Ġothers": 1257, + "Ex": 1258, + "ian": 1259, + "Ġ\"\\\"": 1260, + "-f": 1261, + "nces": 1262, + "Ġfil": 1263, + "Ġrespons": 1264, + "rol": 1265, + "Ġcap": 1266, + "Ġbefore": 1267, + "vern": 1268, + "Ġcomplex": 1269, + "lus": 1270, + "ribut": 1271, + "ats": 1272, + "Ġpositive": 1273, + "oh": 1274, + "Ġlo": 1275, + "Ġgroup": 1276, + "Ġfound": 1277, + "ee": 1278, + "ogn": 1279, + "Ġsw": 1280, + "Ġindividuals": 1281, + "Ġpract": 1282, + "Ġenc": 1283, + "Ġshare": 1284, + "raph": 1285, + "Ġrange": 1286, + "Ġsun": 1287, + "\\t": 1288, + "Ġproviding": 1289, + "icle": 1290, + "Ġdem": 1291, + "Ġplace": 1292, + "Ġaud": 1293, + "joy": 1294, + "Ġmust": 1295, + "els": 1296, + "ery": 1297, + "One": 1298, + "Ġfamily": 1299, + "Ġfuture": 1300, + "less": 1301, + "rent": 1302, + "Ġproblem": 1303, + "Ġessential": 1304, + "rodu": 1305, + "ired": 1306, + "Ġreducing": 1307, + "ism": 1308, + "Ġwarm": 1309, + "ray": 1310, + "Ġability": 1311, + "Ġstrong": 1312, + "Ġalways": 1313, + "Ġresources": 1314, + "Ġbenefits": 1315, + "Ġstrateg": 1316, + "Ġinvolves": 1317, + "Ġassist": 1318, + "erest": 1319, + "nA": 1320, + "ression": 1321, + "Ġ[": 1322, + "ilities": 1323, + "Ġsteps": 1324, + "verall": 1325, + "Ġshow": 1326, + "obal": 1327, + "\\nF": 1328, + "Ġland": 1329, + "ĠHere": 1330, + "Ġbusinesses": 1331, + "ĠEn": 1332, + "pportun": 1333, + "Ġmeas": 1334, + "Ġreturn": 1335, + "Ġdig": 1336, + "Ġhist": 1337, + "yth": 1338, + "Ġcent": 1339, + "Ġable": 1340, + "Ġwithout": 1341, + "yc": 1342, + "plain": 1343, + "Ġrelations": 1344, + "Ġservices": 1345, + "-c": 1346, + "Ġtest": 1347, + "arth": 1348, + "Ġcommunication": 1349, + "Ġintern": 1350, + "new": 1351, + "Ġsit": 1352, + "Ġinvest": 1353, + "Ġcaus": 1354, + "Ġunt": 1355, + "Ġfriends": 1356, + "Ġchanges": 1357, + "cri": 1358, + "dit": 1359, + "ĠBy": 1360, + "ĠYou": 1361, + "Ġmeans": 1362, + "Ġrese": 1363, + "ool": 1364, + "ted": 1365, + "elligence": 1366, + "ains": 1367, + "pping": 1368, + "Ġbel": 1369, + "Ġrepresent": 1370, + "Ġhapp": 1371, + "Ġser": 1372, + "Ġperformance": 1373, + "Ġopportun": 1374, + "Ġtemper": 1375, + "ĠShe": 1376, + "Ġfu": 1377, + "ix": 1378, + "bot": 1379, + "Ġwrit": 1380, + "Ġbehavi": 1381, + "Ġproject": 1382, + "ĠWith": 1383, + "ivers": 1384, + "day": 1385, + "Ġphysical": 1386, + "izing": 1387, + "Ġactiv": 1388, + "Ġwithin": 1389, + "Ġinterest": 1390, + "olution": 1391, + "wards": 1392, + "ffic": 1393, + "Ġquick": 1394, + "Ġpublic": 1395, + "Ġgrowth": 1396, + "Ġcho": 1397, + "Ġrelationship": 1398, + "Ġuntil": 1399, + "Ġhelps": 1400, + "Ġstudents": 1401, + "Ġfiel": 1402, + "imes": 1403, + "ulation": 1404, + "ibility": 1405, + "elf": 1406, + "Ġful": 1407, + "Ġsub": 1408, + "ank": 1409, + "ides": 1410, + "Ġskills": 1411, + "Ġclimate": 1412, + "Given": 1413, + "Ġpar": 1414, + "Ġclear": 1415, + "irt": 1416, + "Name": 1417, + "Ġpresent": 1418, + "Ġtri": 1419, + "Ġchalleng": 1420, + "ream": 1421, + "Ġlay": 1422, + "Ġmarketing": 1423, + "Ġsummary": 1424, + "Ġchild": 1425, + "Ġsaf": 1426, + "Ġsure": 1427, + "Ġsame": 1428, + "Ġmu": 1429, + "Ġemail": 1430, + "bon": 1431, + "Ġsomet": 1432, + "```\\": 1433, 
+ "Ġcurrent": 1434, + "amp": 1435, + "ences": 1436, + "ĠRe": 1437, + "Ġtransport": 1438, + "me": 1439, + "-p": 1440, + "action": 1441, + "ĠEx": 1442, + "Ġyears": 1443, + "Ġcomb": 1444, + "hor": 1445, + "anced": 1446, + "ty": 1447, + "Ġlove": 1448, + "Ġgreen": 1449, + "Ġpopular": 1450, + "Ġless": 1451, + "Ġdra": 1452, + "Ġcontrol": 1453, + "Ġaff": 1454, + "Ġconsum": 1455, + "Ġgame": 1456, + "ental": 1457, + "ights": 1458, + "arget": 1459, + "omes": 1460, + "ox": 1461, + "icult": 1462, + "erc": 1463, + "Ġgoals": 1464, + "ancial": 1465, + "tle": 1466, + "Ġgovern": 1467, + "Ġnumbers": 1468, + "Ġfive": 1469, + "Ġstand": 1470, + "Ġsearch": 1471, + "Ġefficient": 1472, + "Ġwal": 1473, + "Ġname": 1474, + "ath": 1475, + "Ġheart": 1476, + "Ġduring": 1477, + "rect": 1478, + "Ġoverall": 1479, + "ython": 1480, + "Ġallows": 1481, + "Ġcity": 1482, + "ave": 1483, + "vant": 1484, + "aterial": 1485, + "Ġwide": 1486, + "Ġmus": 1487, + "ificial": 1488, + "Ġhard": 1489, + "ĠTh": 1490, + "oose": 1491, + "Ġglobal": 1492, + "aj": 1493, + "Ġter": 1494, + "Ġdifficult": 1495, + "Ġline": 1496, + "ĠAl": 1497, + "care": 1498, + "ived": 1499, + "Ġregular": 1500, + "Ġgr": 1501, + "),": 1502, + "lement": 1503, + "Ġhim": 1504, + "Ġunique": 1505, + "Ġenjoy": 1506, + "Ġmeaning": 1507, + "Ġopen": 1508, + "Ġi": 1509, + "abor": 1510, + "Ġarea": 1511, + "Ġitems": 1512, + "Ġclean": 1513, + "ditionally": 1514, + "oid": 1515, + "ĠWe": 1516, + "Ġbeaut": 1517, + "Ġmeet": 1518, + "iple": 1519, + "Ġstatement": 1520, + "Ġagain": 1521, + "ysis": 1522, + "Ġfac": 1523, + "Ġsources": 1524, + "Ġbody": 1525, + "Ġalgorithms": 1526, + "Ġaudience": 1527, + "Ġwant": 1528, + "Ġlog": 1529, + "Ġmaintain": 1530, + "Ġactivities": 1531, + "Ġmove": 1532, + "Ġcult": 1533, + "oney": 1534, + "Ġtarget": 1535, + "\\nB": 1536, + "Ġmaterial": 1537, + "Ġcreating": 1538, + "Ġstructure": 1539, + "atform": 1540, + "ext": 1541, + "Ġexperien": 1542, + "Ġvalues": 1543, + "ead": 1544, + "ohn": 1545, + "Ġhealthy": 1546, + "ross": 1547, + "Ġinteg": 1548, + "Ġresearch": 1549, + "atch": 1550, + "ooking": 1551, + "Ġrole": 1552, + "Ġprovides": 1553, + "iety": 1554, + "ists": 1555, + "Ġfinancial": 1556, + "ories": 1557, + "dent": 1558, + "Ġer": 1559, + "Ġarticle": 1560, + "Ġelements": 1561, + "Ġaddress": 1562, + "Ġconn": 1563, + "ĠUse": 1564, + "mp": 1565, + "Ġeasy": 1566, + "Ġneg": 1567, + "Ġcolor": 1568, + "Ġcalcul": 1569, + "Explain": 1570, + "ĠPl": 1571, + "pect": 1572, + "ince": 1573, + "ale": 1574, + "Ġrisk": 1575, + "curity": 1576, + "ert": 1577, + "Ġfeed": 1578, + "Ġevent": 1579, + "vers": 1580, + "ples": 1581, + "Ġlevels": 1582, + "Ġbi": 1583, + "Ġstay": 1584, + "Ġplatform": 1585, + "Ġbreak": 1586, + "back": 1587, + "Ġsat": 1588, + "\\nOverall": 1589, + "Ġeducation": 1590, + "\\nC": 1591, + "Ġcarbon": 1592, + "--------": 1593, + "ape": 1594, + "Ġprevent": 1595, + "Ġaddition": 1596, + "Ġstress": 1597, + "ral": 1598, + "ource": 1599, + "rus": 1600, + "Ġcome": 1601, + "Ġrecogn": 1602, + "ĠUnited": 1603, + "Ġproper": 1604, + "Ġpoll": 1605, + "dentify": 1606, + "Ġunderstanding": 1607, + "Ġdecisions": 1608, + "ict": 1609, + "Ġdire": 1610, + "Ġbehavior": 1611, + "Ġ*": 1612, + "\\nI": 1613, + "Ġmess": 1614, + "Ġanimals": 1615, + "Ġsl": 1616, + "Ġwind": 1617, + "Ġbas": 1618, + "Ġpain": 1619, + "Ġleading": 1620, + "ern": 1621, + "ger": 1622, + "Ġpres": 1623, + "Ġthough": 1624, + "Ġinteract": 1625, + "yle": 1626, + "Ġdoes": 1627, + "Ġhead": 1628, + "Ġintelligence": 1629, + "orts": 1630, + "Ġbecome": 1631, + "Ġrun": 1632, + "aring": 1633, + "Ġimplement": 1634, + "Ġaction": 1635, 
+ "oot": 1636, + "terns": 1637, + "Ġprotect": 1638, + "eric": 1639, + "Ġflow": 1640, + "Ġemot": 1641, + "cessary": 1642, + "urate": 1643, + "Ġsuggest": 1644, + "Ġprogram": 1645, + "Ġphr": 1646, + "Ġhealthcare": 1647, + "ention": 1648, + "Ġsust": 1649, + "Ġwhy": 1650, + "Ġaccurate": 1651, + "lu": 1652, + "Ġhig": 1653, + "Ġreach": 1654, + "Ġallowing": 1655, + "Ġtravel": 1656, + "Ġrequire": 1657, + "Ġareas": 1658, + "Ġdeep": 1659, + "He": 1660, + "Ġfew": 1661, + "Ġself": 1662, + "oun": 1663, + "Ġ#": 1664, + "osp": 1665, + "str": 1666, + "Ġminut": 1667, + "Ġdecision": 1668, + "ĠThere": 1669, + "ances": 1670, + "Ġquality": 1671, + "Ġavail": 1672, + "Ġspace": 1673, + "Ġsomething": 1674, + "Ġweb": 1675, + "Ġpatterns": 1676, + "Ġmot": 1677, + "oring": 1678, + "isf": 1679, + "Ġanother": 1680, + "Ġaccount": 1681, + "\\nW": 1682, + "uss": 1683, + "Ġmaj": 1684, + "uation": 1685, + "Ġsustain": 1686, + "Ġautom": 1687, + "iques": 1688, + "issions": 1689, + "verse": 1690, + "Ġconcept": 1691, + "Ġsecurity": 1692, + "Ġthose": 1693, + "Ġprofess": 1694, + "Ġshort": 1695, + "Ġnight": 1696, + "ength": 1697, + "apt": 1698, + "ex": 1699, + "ĠAdditionally": 1700, + "Ġtaking": 1701, + "Ġtoo": 1702, + "agn": 1703, + "Ġsimple": 1704, + "lusion": 1705, + "iency": 1706, + "ash": 1707, + "ours": 1708, + "Ġpa": 1709, + "Ġlit": 1710, + "ĠSp": 1711, + "iting": 1712, + "Ġdon": 1713, + "Ġlim": 1714, + "lish": 1715, + "mat": 1716, + "aves": 1717, + "ledge": 1718, + "ditional": 1719, + "inc": 1720, + "Ġevents": 1721, + "Ġoffer": 1722, + "thing": 1723, + "Ġworking": 1724, + "Ġanalysis": 1725, + "Ġachieve": 1726, + "Ġpie": 1727, + "Ġbook": 1728, + "Ġfre": 1729, + "Ġmuch": 1730, + "oon": 1731, + "Ġtry": 1732, + "esp": 1733, + "Ġwaste": 1734, + "face": 1735, + "Ġear": 1736, + "Ġfru": 1737, + "Ġtransportation": 1738, + "chool": 1739, + "Ġtechniques": 1740, + "Ġprogramm": 1741, + "ĠEarth": 1742, + "Ġpredict": 1743, + "Ġnever": 1744, + "ws": 1745, + "ument": 1746, + "imately": 1747, + "ared": 1748, + "Ġparticular": 1749, + "Ġtowards": 1750, + "Ġeconomic": 1751, + "Ġincreasing": 1752, + "Ġfast": 1753, + "iment": 1754, + "Ġnetwork": 1755, + "Ġcorrect": 1756, + "Ġmight": 1757, + "Ġoc": 1758, + "Ġbecause": 1759, + "ĠWh": 1760, + "az": 1761, + "play": 1762, + "Ġresults": 1763, + "Ġmanagement": 1764, + "Ġpurch": 1765, + "Ġsound": 1766, + "Ġpast": 1767, + "Ġtraining": 1768, + "____": 1769, + "ope": 1770, + "Ġengage": 1771, + "ourage": 1772, + "Ġsense": 1773, + "Ġfree": 1774, + "Ġpref": 1775, + "ees": 1776, + "Ġcountries": 1777, + "ney": 1778, + "anies": 1779, + "Ġafter": 1780, + "Ġmind": 1781, + "Ġexc": 1782, + "ĠOnce": 1783, + "ĠĠĠĠĠĠĠĠĠĠĠ": 1784, + "Ġcomplete": 1785, + "Ġimm": 1786, + "Ġest": 1787, + "Ġgenerate": 1788, + "verb": 1789, + "ĠDe": 1790, + "'m": 1791, + "Ġtools": 1792, + "redients": 1793, + "Ġmajor": 1794, + "ently": 1795, + "Ġcontribut": 1796, + "leep": 1797, + "Ġpoints": 1798, + "ditions": 1799, + "Ġfactors": 1800, + "Ġel": 1801, + "Ġnext": 1802, + "ium": 1803, + "oud": 1804, + "Ġcru": 1805, + "Ġreas": 1806, + "riate": 1807, + "ĠInd": 1808, + "Ġpromot": 1809, + "Ġhistory": 1810, + "Ġjour": 1811, + "Ġdue": 1812, + "Con": 1813, + "Ġveget": 1814, + "ency": 1815, + "ĠAmeric": 1816, + "Ġfra": 1817, + "Ġdifference": 1818, + "oard": 1819, + "lex": 1820, + "Ġequation": 1821, + "irtual": 1822, + "Ġcup": 1823, + "Ġforest": 1824, + "Ġnegative": 1825, + "Ġsecon": 1826, + "ones": 1827, + "Ġnature": 1828, + "Ġuses": 1829, + "ah": 1830, + "por": 1831, + "Ġsec": 1832, + "ording": 1833, + "Ġlast": 1834, + "ĠSome": 1835, + "Ġissues": 1836, + 
"Ġscient": 1837, + "Ġprint": 1838, + "ĠStates": 1839, + "over": 1840, + "Ġsatisf": 1841, + "Ġdevices": 1842, + "Ġdise": 1843, + "Ġtemperature": 1844, + "Ġfeedback": 1845, + "Ġnecessary": 1846, + "Ġemissions": 1847, + "mb": 1848, + "Ġlow": 1849, + "for": 1850, + "tal": 1851, + "Ġchallenges": 1852, + "Ġarray": 1853, + "Ġside": 1854, + "Ġengine": 1855, + "Ġboo": 1856, + "ata": 1857, + "Ġbelie": 1858, + "-m": 1859, + "Ġmultiple": 1860, + "Ġsing": 1861, + "Ġgovernment": 1862, + "ames": 1863, + "ified": 1864, + "Ġminutes": 1865, + "Ġsuccessful": 1866, + "Ġmoney": 1867, + "Ġquickly": 1868, + "Ġbir": 1869, + "Ġtypically": 1870, + "Ġpost": 1871, + "Ġprep": 1872, + "Ġknowledge": 1873, + "pped": 1874, + "actions": 1875, + "Ġmethods": 1876, + "Ġoptim": 1877, + "\\nP": 1878, + "Ġoutput": 1879, + "Ġfield": 1880, + "Ġtable": 1881, + "Ġbal": 1882, + "Ġcoll": 1883, + "Ġcharacters": 1884, + "volution": 1885, + "ords": 1886, + "ilar": 1887, + "ification": 1888, + "ane": 1889, + "Ġcell": 1890, + "Ġmil": 1891, + "ĠWhat": 1892, + "Ġsqu": 1893, + "Ġlives": 1894, + "ĠAr": 1895, + "Ġphrase": 1896, + "Ġnut": 1897, + "Ġdigital": 1898, + "Ġinternet": 1899, + "lass": 1900, + "ura": 1901, + "ommend": 1902, + "Ġtreat": 1903, + "Ġapprop": 1904, + "resh": 1905, + "urther": 1906, + "ĠOne": 1907, + "Ġvisual": 1908, + "ategor": 1909, + "Ġapproach": 1910, + "Ġcertain": 1911, + "Ġsho": 1912, + "val": 1913, + "Ġtask": 1914, + "ires": 1915, + "Ġappropriate": 1916, + "Ġvie": 1917, + "Ġdesigned": 1918, + "pose": 1919, + "**:": 1920, + "fort": 1921, + "Ġ|\\": 1922, + "Ġapplications": 1923, + "Ġpay": 1924, + "Ġnow": 1925, + "Ġheat": 1926, + "Ġindustry": 1927, + "pre": 1928, + "Ġeffectively": 1929, + "Ġpopulation": 1930, + "Ġopportunities": 1931, + " 0.5 + embedding.gate = torch.nn.Parameter(torch.full(embedding.gate.shape, 0.5)) + + inpt = self.input_tensor.clone() + output = embedding(inpt, self.aspect_ratio) + + # assertion + assert_expected(output.shape, self.input_tensor.shape) + assert_expected(output.mean(), torch.tensor(-0.17208), atol=1e-3, rtol=1e-3) + + def test_tile_positional_embedding(self): + # call model + embedding = TilePositionalEmbedding(self.max_num_tiles, self.embed_dim) + fixed_init_model(embedding, min_val=-1, max_val=1) + + inpt = self.input_tensor.clone() + output = embedding(inpt, self.aspect_ratio) + + # assertion + assert_expected(output.shape, self.input_tensor.shape) + assert_expected(output.mean(), torch.tensor(0.28627), atol=1e-3, rtol=1e-3) diff --git a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py index 13e7c0ee33..883eae5c30 100644 --- a/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py +++ b/tests/torchtune/models/qwen2/test_qwen2_tokenizer.py @@ -7,6 +7,7 @@ from pathlib import Path import pytest + from torchtune.data import Message from torchtune.models.qwen2 import qwen2_tokenizer @@ -16,8 +17,11 @@ class TestQwen2Tokenizer: @pytest.fixture def tokenizer(self): - # tiny_bpe_tokenizer.json is a pretrained tokenizers BPE tokenizer model. 
- return qwen2_tokenizer(str(ASSETS / "tiny_bpe_tokenizer.json")) + return qwen2_tokenizer( + vocab_file=str(ASSETS / "tiny_bpe_vocab.json"), + merges_file=str(ASSETS / "tiny_bpe_merges.txt"), + special_tokens_path=str(ASSETS / "tiny_bpe_tokenizer.json"), + ) def test_tokenize_messages(self, tokenizer): messages = [ @@ -44,26 +48,6 @@ def test_tokenize_messages(self, tokenizer): 273, 105, 94, - 58, - 90, - 6, - 83, - 574, - 68, - 6, - 25, - 1032, - 757, - 480, - 6, - 11, - 1032, - 661, - 83, - 144, - 6, - 25, - 1032, 33, 214, 174, @@ -93,16 +77,16 @@ def test_tokenize_messages(self, tokenizer): 103, 874, 269, - 160, - 77, - 145, + 13, + 94, + 94, 2, 2, 2, 483, 197, - 349, - 77, + 25, + 94, 885, 98, 1226, @@ -113,9 +97,9 @@ def test_tokenize_messages(self, tokenizer): 399, 1583, 78, - 160, - 77, - 145, + 13, + 94, + 94, 2, 2, 2, @@ -123,11 +107,8 @@ def test_tokenize_messages(self, tokenizer): 1733, 102, 182, - 349, - 77, - 6, - 92, - 60, + 25, + 94, 2002, 94, 2001, @@ -135,26 +116,6 @@ def test_tokenize_messages(self, tokenizer): 251, 249, 94, - 58, - 90, - 6, - 83, - 574, - 68, - 6, - 25, - 1032, - 757, - 480, - 6, - 11, - 1032, - 661, - 83, - 144, - 6, - 25, - 111, 40, 1791, 194, @@ -270,13 +231,18 @@ def test_tokenize_messages(self, tokenizer): 318, 1278, 13, - 1, - 92, - 60, 2002, 94, 2000, ] - expected_mask = [True] * 90 + [False] * 146 + expected_mask = [True] * 67 + [False] * 123 assert expected_tokens == tokens assert expected_mask == mask + + formatted_messages = tokenizer.decode(tokens) + expected_formatted_messages = ( + f"<|im_start|>user\n{messages[0].text_content}<|im_end|>\n" + f"<|im_start|>assistant\n{messages[1].text_content}<|im_end|>\n" + "<|endoftext|>" + ) + assert expected_formatted_messages == formatted_messages diff --git a/tests/torchtune/modules/test_layernorm.py b/tests/torchtune/modules/test_layernorm.py new file mode 100644 index 0000000000..c1530a198a --- /dev/null +++ b/tests/torchtune/modules/test_layernorm.py @@ -0,0 +1,57 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import pytest + +import torch + +from tests.test_utils import assert_expected + +from torchtune.modules.layer_norm import Fp32LayerNorm +from torchtune.utils.seed import set_seed + + +@pytest.fixture(autouse=True) +def random(): + set_seed(0) + + +class TestLayerNorm: + """ + Class for testing our LayerNorm, which is just a wrapper around torch.nn.LayerNorm + to support fp16 training. 
+ """ + + @pytest.fixture + def dim(self) -> int: + return 8 + + @pytest.fixture + def eps(self) -> float: + return 1e-6 + + @pytest.fixture + def input_random_fp16(self, dim) -> torch.Tensor: + return torch.randn(dim, dtype=torch.float16) + + @pytest.fixture + def layer_norm(self, dim, eps) -> Fp32LayerNorm: + return Fp32LayerNorm(dim, eps=eps) + + def test_forward_fp16(self, layer_norm, input_random_fp16, eps, dim) -> None: + output_fp16 = layer_norm(input_random_fp16) + + # assert dtype as fp16 + assert ( + output_fp16.dtype == torch.float16 + ), "Expected output to be fp16, but got {output_fp16.dtype=}" + + # assert value as fp32 + expected_output = torch.nn.LayerNorm(dim, eps=eps)(input_random_fp16.float()) + output_fp32 = layer_norm(input_random_fp16.float()) + assert_expected( + output_fp32.mean(), expected_output.mean(), atol=1e-8, rtol=1e-8 + ) diff --git a/tests/torchtune/modules/test_transformer_decoder.py b/tests/torchtune/modules/test_transformer_decoder.py index 3cf8fb9570..3f546bb4a0 100644 --- a/tests/torchtune/modules/test_transformer_decoder.py +++ b/tests/torchtune/modules/test_transformer_decoder.py @@ -116,14 +116,14 @@ class TestTransformerDecoder: """ @pytest.fixture - def input_params(self) -> Tuple[int, int]: + def input_params(self) -> Tuple[int, int, int]: batch_size = 4 seq_len = 512 vocab_size = 256 return batch_size, seq_len, vocab_size @pytest.fixture - def input(self, input_params: Tuple[int, int]) -> Tensor: + def input(self, input_params: Tuple[int, int, int]) -> Tensor: batch_size, seq_len, vocab_size = input_params return torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len)) @@ -140,7 +140,7 @@ def decoder_params(self) -> Tuple[int, int, int, int, int, int]: @pytest.fixture def input_max_len_exceeded( self, - input_params: Tuple[int, int], + input_params: Tuple[int, int, int], decoder_params: Tuple[int, int, int, int, int, int], ) -> Tensor: batch_size, seq_len, vocab_size = input_params @@ -151,7 +151,7 @@ def input_max_len_exceeded( @pytest.fixture def input_max_bs_exceeded( self, - input_params: Tuple[int, int], + input_params: Tuple[int, int, int], decoder_params: Tuple[int, int, int, int, int, int], ) -> Tensor: batch_size, seq_len, vocab_size = input_params diff --git a/tests/torchtune/modules/test_vision_transformer.py b/tests/torchtune/modules/test_vision_transformer.py new file mode 100644 index 0000000000..91260e518b --- /dev/null +++ b/tests/torchtune/modules/test_vision_transformer.py @@ -0,0 +1,207 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest + +import torch + +from tests.test_utils import assert_expected, fixed_init_model, fixed_init_tensor +from torchtune.models.clip._component_builders import clip_vision_encoder + + +@pytest.fixture +def transformer_config(): + return { + "embed_dim": 32, + "cls_output_dim": 64, + "num_layers": 2, + "num_heads": 4, + "tile_size": 49, + "patch_size": 9, + "max_num_tiles": 4, + "in_channels": 3, + "output_cls_projection": False, + "out_indices": None, + } + + +@pytest.fixture +def vision_transformer(transformer_config): + vision_transformer = clip_vision_encoder(**transformer_config).eval() + fixed_init_model(vision_transformer, min_val=-1, max_val=1) + return vision_transformer + + +class TestVisionTransformer: + @pytest.fixture(autouse=True) + def setup_class(self, transformer_config): + self.batch_size = 1 + self.n_imgs = 2 + num_channels = transformer_config["in_channels"] + + # generate aspect ratios up to max_num_tiles, shape (bsz, num_conccurent_media, 2) + self.aspect_ratio = torch.tensor([[1, 3], [2, 2]]).reshape( + self.batch_size, self.n_imgs, 2 + ) + + self.num_tiles = 4 + assert ( + self.num_tiles <= transformer_config["max_num_tiles"] + ), "For this test to be valid, num_tiles should be <= max_num_tiles" + assert ( + torch.prod(self.aspect_ratio, dim=-1).max() <= self.num_tiles + ), "For this test to be vlaid, prod(aspect_ratio).max() should match num_tiles" + + # generate image + image = torch.rand( + ( + self.batch_size, + self.n_imgs, + self.num_tiles, + num_channels, + transformer_config["tile_size"], + transformer_config["tile_size"], + ) + ) + self.image = fixed_init_tensor(image.shape, min_val=-1, max_val=1) + + def test_vision_transformer_without_hidden_layers( + self, vision_transformer, transformer_config + ): + # call model + output, _ = vision_transformer(self.image, self.aspect_ratio) + + # assertion + expected_shape = ( + self.batch_size, + self.n_imgs, + self.num_tiles, + vision_transformer.get_image_tokens_per_tile(), + transformer_config["embed_dim"], + ) + assert ( + output.shape == expected_shape + ), f"Expected shape {expected_shape}, but got {output.shape}" + + assert_expected(output.mean(), torch.tensor(1.0172), atol=1e-3, rtol=1e-3) + + def test_fails_if_ar_none_and_multiple_tiles(self, vision_transformer): + """ + If aspect_ratio is none, then num_tiles shouldnt be greater than 1. + Here the test passes if something actually fails under these conditions. 
+ """ + assert self.image.shape[2] > 1, "This test is not valid for num_tiles=1" + try: + vision_transformer(self.image, aspect_ratio=None) + pytest.fail( + "Expected ValueError: If num_tiles>1, aspect_ratio should not be None" + ) + except ValueError: + pass # If ValueError is raised, the test passes + + def test_vision_transformer_with_cls_projection(self, transformer_config): + transformer_config = transformer_config.copy() + transformer_config["output_cls_projection"] = True + + # call model + model_with_cls = clip_vision_encoder(**transformer_config).eval() + fixed_init_model(model_with_cls, min_val=-1, max_val=1) + output, _ = model_with_cls(self.image, self.aspect_ratio) + + # assertion + expected_shape = ( + self.batch_size, + self.n_imgs, + self.num_tiles, + 1, + transformer_config["cls_output_dim"], + ) + + assert ( + output.shape == expected_shape + ), f"Expected shape {expected_shape}, but got {output.shape}" + + assert_expected(output.mean(), torch.tensor(9.6240), atol=1e-3, rtol=1e-3) + + def test_vision_transformer_return_hidden_layers(self, transformer_config): + transformer_config = transformer_config.copy() + transformer_config["out_indices"] = [ + 0, + 1, + ] + + # call model + model_with_hidden = clip_vision_encoder(**transformer_config) + fixed_init_model(model_with_hidden, min_val=-1, max_val=1) + x, hidden_layers = model_with_hidden(self.image, self.aspect_ratio) + + # assertion x + expected_shape_x = ( + self.batch_size, + self.n_imgs, + self.num_tiles, + model_with_hidden.get_image_tokens_per_tile(), + transformer_config["embed_dim"], + ) + + assert ( + x.shape == expected_shape_x + ), f"Expected shape {expected_shape_x}, but got {x.shape=}" + + assert_expected(x.mean(), torch.tensor(1.0172), atol=1e-3, rtol=1e-3) + + # assertion hidden + num_hidden_layers_expected = len(transformer_config["out_indices"]) + + expected_shape_hidden_layers = ( + self.batch_size, + self.n_imgs, + self.num_tiles, + model_with_hidden.get_image_tokens_per_tile(), + transformer_config["embed_dim"], + ) + + assert ( + len(hidden_layers) == num_hidden_layers_expected + ), f"Expected {num_hidden_layers_expected} hidden layers, but got {len(hidden_layers)}" + + for hidden_layer in hidden_layers: + assert ( + hidden_layer.shape == expected_shape_hidden_layers + ), f"Expected shape {expected_shape_hidden_layers}, but got {hidden_layer.shape=}" + + assert_expected( + torch.stack(hidden_layers, dim=-1).mean(), + torch.tensor(6.6938), + atol=1e-3, + rtol=1e-3, + ) + + def test_vision_transformer_single_tile(self, transformer_config): + transformer_config = transformer_config.copy() + transformer_config["max_num_tiles"] = 1 + + # get single tile: (bsz, n_imgs, 1, num_channels, tile_size, tile_size) + images = self.image[:, :, [0], :, :, :] + + # call model + model_with_multiple_tiles = clip_vision_encoder(**transformer_config) + fixed_init_model(model_with_multiple_tiles, min_val=-1, max_val=1) + output, _ = model_with_multiple_tiles(images, aspect_ratio=None) + + # assertion + expected_shape = ( + self.batch_size, + self.n_imgs, + 1, + model_with_multiple_tiles.get_image_tokens_per_tile(), + transformer_config["embed_dim"], + ) + assert ( + output.shape == expected_shape + ), f"Expected shape {expected_shape}, but got {output.shape}" + + assert_expected(output.mean(), torch.tensor(0.5458), atol=1e-3, rtol=1e-3) diff --git a/tests/torchtune/modules/transforms/test_get_canvas_best_fit.py b/tests/torchtune/modules/transforms/test_get_canvas_best_fit.py new file mode 100644 index 
0000000000..16779ed565 --- /dev/null +++ b/tests/torchtune/modules/transforms/test_get_canvas_best_fit.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import pytest +import torch + +from torchtune.modules.transforms import find_supported_resolutions, get_canvas_best_fit + + +class TestUtils: + @pytest.mark.parametrize( + "params", + [ + { + "max_num_tiles": 1, + "tile_size": 224, + "expected_resolutions": [(224, 224)], + }, + { + "max_num_tiles": 2, + "tile_size": 100, + "expected_resolutions": [(100, 200), (200, 100), (100, 100)], + }, + { + "max_num_tiles": 3, + "tile_size": 50, + "expected_resolutions": [ + (50, 150), + (150, 50), + (50, 100), + (100, 50), + (50, 50), + ], + }, + { + "max_num_tiles": 4, + "tile_size": 300, + "expected_resolutions": [ + (300, 1200), + (600, 600), + (300, 300), + (1200, 300), + (300, 900), + (900, 300), + (300, 600), + (600, 300), + ], + }, + ], + ) + def test_find_supported_resolutions(self, params): + max_num_tiles = params["max_num_tiles"] + tile_size = params["tile_size"] + expected_resolutions = params["expected_resolutions"] + resolutions = find_supported_resolutions(max_num_tiles, tile_size) + + assert len(set(resolutions)) == len(resolutions), "Resolutions should be unique" + assert set(resolutions) == set( + expected_resolutions + ), f"Expected resolutions {expected_resolutions} but got {resolutions}" + + @pytest.mark.parametrize( + "params", + [ + { + "image_size": (800, 600), + "possible_resolutions": [ + (224, 896), + (448, 448), + (224, 224), + (896, 224), + (224, 672), + (672, 224), + (224, 448), + (448, 224), + ], + "resize_to_max_canvax": False, + "expected_best_resolution": (448, 448), + }, + { + "image_size": (200, 300), + "possible_resolutions": [ + (224, 896), + (448, 448), + (224, 224), + (896, 224), + (224, 672), + (672, 224), + (224, 448), + (448, 224), + ], + "resize_to_max_canvax": False, + "expected_best_resolution": (224, 448), + }, + { + "image_size": (200, 500), + "possible_resolutions": [ + (224, 896), + (448, 448), + (224, 224), + (896, 224), + (224, 672), + (672, 224), + (224, 448), + (448, 224), + ], + "resize_to_max_canvax": True, + "expected_best_resolution": (224, 672), + }, + { + "image_size": (200, 200), + "possible_resolutions": [ + (224, 896), + (448, 448), + (224, 224), + (896, 224), + (224, 672), + (672, 224), + (224, 448), + (448, 224), + ], + "resize_to_max_canvax": False, + "expected_best_resolution": (224, 224), + }, + { + "image_size": (200, 100), + "possible_resolutions": [ + (224, 896), + (448, 448), + (224, 224), + (896, 224), + (224, 672), + (672, 224), + (224, 448), + (448, 224), + ], + "resize_to_max_canvax": True, + "expected_best_resolution": (448, 224), + }, + ], + ) + def test_get_canvas_best_fit(self, params): + image_size = params["image_size"] + possible_resolutions = params["possible_resolutions"] + expected_best_resolution = params["expected_best_resolution"] + resize_to_max_canvax = params["resize_to_max_canvax"] + + possible_resolutions = torch.tensor(possible_resolutions) + + image = torch.rand(*image_size) + best_resolution = get_canvas_best_fit( + image, possible_resolutions, resize_to_max_canvax + ) + + assert ( + tuple(best_resolution) == expected_best_resolution + ), f"Expected best resolution {expected_best_resolution} but got {best_resolution}" diff --git 
a/tests/torchtune/modules/transforms/test_resize_with_pad.py b/tests/torchtune/modules/transforms/test_resize_with_pad.py new file mode 100644 index 0000000000..29b4fce223 --- /dev/null +++ b/tests/torchtune/modules/transforms/test_resize_with_pad.py @@ -0,0 +1,84 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import pytest + +import torch +import torchvision + +from torchtune.modules.transforms import resize_with_pad + + +class TestTransforms: + @pytest.mark.parametrize( + "params", + [ + { + "image_size": (200, 100), + "target_size": (1000, 1200), + "max_upscaling_size": 600, + "expected_resized_size": (600, 300), + }, + { + "image_size": (2000, 200), + "target_size": (1000, 1200), + "max_upscaling_size": 600, + "expected_resized_size": (1000, 100), + }, + { + "image_size": (400, 200), + "target_size": (1000, 1200), + "max_upscaling_size": 2000, + "expected_resized_size": (1000, 500), + }, + { + "image_size": (400, 200), + "target_size": (1000, 1200), + "max_upscaling_size": None, + "expected_resized_size": (1000, 500), + }, + { + "image_size": (1000, 500), + "target_size": (400, 300), + "max_upscaling_size": None, + "expected_resized_size": [400, 200], + }, + ], + ) + def test_resize_with_pad(self, params): + + image_size = params["image_size"] + target_size = params["target_size"] + max_upscaling_size = params["max_upscaling_size"] + expected_resized_size = params["expected_resized_size"] + + image = torch.rand(3, *image_size) # Create a random image tensor + + resized_image = resize_with_pad( + image=image, + target_size=target_size, + resample=torchvision.transforms.InterpolationMode["BILINEAR"], + max_upscaling_size=max_upscaling_size, + ) + + # assert everything beyond resize has value == 0 + assert torch.all( + resized_image[:, (expected_resized_size[0] + 1) :, :] == 0 + ), "Expected everything beyond resize to be pad with fill=0" + + assert torch.all( + resized_image[:, :, (expected_resized_size[1] + 1) :] == 0 + ), "Expected everything beyond resize to be pad with fill=0" + + assert torch.all( + resized_image[:, : expected_resized_size[0], : expected_resized_size[1]] + != 0 + ), "Expected no padding where the image is supposed to be" + + # output should have shape target_size + assert ( + resized_image.shape[-2:] == target_size + ), f"Expected output with shape {target_size} but got {resized_image.shape[-2:]}" diff --git a/tests/torchtune/modules/transforms/test_tile_crop.py b/tests/torchtune/modules/transforms/test_tile_crop.py new file mode 100644 index 0000000000..7afde495a3 --- /dev/null +++ b/tests/torchtune/modules/transforms/test_tile_crop.py @@ -0,0 +1,81 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
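# A usage sketch of resize_with_pad, mirroring the first parametrization of
# test_resize_with_pad above. The behavior noted in the comments is inferred from that
# test's assertions (aspect-ratio-preserving resize, optional cap on upscaling, zero
# padding out to target_size) rather than from a separate specification.
import torch
import torchvision

from torchtune.modules.transforms import resize_with_pad

image = torch.rand(3, 200, 100)  # (channels, height, width)
padded = resize_with_pad(
    image=image,
    target_size=(1000, 1200),
    resample=torchvision.transforms.InterpolationMode["BILINEAR"],
    max_upscaling_size=600,  # upscaling is capped, so the content region becomes 600x300
)
# The output always has the requested canvas size; the area beyond the resized
# content is filled with zeros.
assert padded.shape[-2:] == (1000, 1200)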
+ +import pytest + +import torch + +from torchtune.modules.transforms import tile_crop + + +class TestTransforms: + @pytest.mark.parametrize( + "params", + [ + { + "expected_output_shape": torch.Size([24, 3, 50, 50]), + "image_size": (3, 200, 300), + "status": "Passed", + "tile_size": 50, + }, + { + "expected_output_shape": torch.Size([6, 3, 200, 200]), + "image_size": (3, 400, 600), + "status": "Passed", + "tile_size": 200, + }, + { + "expected_output_shape": torch.Size([1, 3, 250, 250]), + "image_size": (3, 250, 250), + "status": "Passed", + "tile_size": 250, + }, + { + "error": "Image size 250x250 is not divisible by tile size 500", + "image_size": (3, 250, 250), + "status": "Failed", + "tile_size": 500, + }, + { + "error": "Image size 250x250 is not divisible by tile size 80", + "image_size": (3, 250, 250), + "status": "Failed", + "tile_size": 80, + }, + ], + ) + def test_tile_crop(self, params): + image_size = params["image_size"] + tile_size = params["tile_size"] + status = params["status"] + + image = torch.rand(*image_size) # Create a random image tensor + + if status == "Passed": + tiles = tile_crop(image, tile_size) + expected_output_shape = params["expected_output_shape"] + assert ( + tiles.shape == expected_output_shape + ), f"Expected shape {expected_output_shape} but got {tiles.shape}" + + # check if first and last tile matches the image + first_tile = image[..., :tile_size, :tile_size] + last_tile = image[..., -tile_size:, -tile_size:] + assert torch.equal( + tiles[0], first_tile + ), "Expected first tile to match the image" + assert torch.equal( + tiles[-1], last_tile + ), "Expected last tile to match the image" + + elif status == "Failed": + with pytest.raises(Exception) as exc_info: + tile_crop(image, tile_size) + expected_error = params["error"] + actual_error = str(exc_info.value) + assert ( + str(exc_info.value) == params["error"] + ), f"Expected error message '{expected_error}' but got '{actual_error}'" diff --git a/tests/torchtune/modules/transforms/test_transforms.py b/tests/torchtune/modules/transforms/test_transforms.py new file mode 100644 index 0000000000..0436a34ec1 --- /dev/null +++ b/tests/torchtune/modules/transforms/test_transforms.py @@ -0,0 +1,80 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import pytest +import torch +from torchtune.modules.transforms import VisionCrossAttentionMask + + +IMAGE_TOKEN_ID = 1 + + +class TestVisionCrossAttentionMask: + @pytest.fixture + def num_tiles(self): + return 2 + + @pytest.fixture + def tile_size(self): + return 4 + + @pytest.fixture + def patch_size(self): + return 2 + + @pytest.fixture + def image_num_tokens(self, num_tiles, tile_size, patch_size): + return ((tile_size // patch_size) ** 2 + 1) * num_tiles + + @pytest.fixture + def tokens(self): + # This tests image tokens not at start, consecutive images, and image + # with text until end. 
+ # text = 2, image = 1 + return [2, 2, IMAGE_TOKEN_ID, IMAGE_TOKEN_ID, 2, 2, IMAGE_TOKEN_ID, 2, 2] + + @pytest.fixture + def images(self, num_tiles, tokens): + n_img = len([token_id for token_id in tokens if token_id == IMAGE_TOKEN_ID]) + n_channels = 3 + tile_size = 2 + return [ + torch.ones(num_tiles, n_channels, tile_size, tile_size) + for _ in range(n_img) + ] + + @pytest.fixture + def cross_attn_mask_transform(self, tile_size, patch_size): + # patches per tile = 4 + return VisionCrossAttentionMask( + tile_size=tile_size, + patch_size=patch_size, + image_token_id=IMAGE_TOKEN_ID, + ) + + def test_get_image_attention_intervals(self, cross_attn_mask_transform, tokens): + actual = cross_attn_mask_transform._get_image_attention_intervals(tokens) + expected = [[2, 6], [3, 6], [6, 9]] + assert actual == expected + + def test_call(self, cross_attn_mask_transform, tokens, images, image_num_tokens): + sample = {"tokens": tokens, "images": images} + dummy_kwargs = {"hello": 8} + sample.update(dummy_kwargs) + actual = cross_attn_mask_transform(sample) + expected = [ + torch.zeros(len(tokens), image_num_tokens, dtype=torch.bool) + for _ in range(len(images)) + ] + expected[0][2:6, :] = True + expected[1][3:6, :] = True + expected[2][6:9, :] = True + for i in range(len(images)): + torch.testing.assert_close(actual["encoder_mask"][i], expected[i]) + torch.testing.assert_close(actual["images"][i], images[i]) + + assert actual["tokens"] == tokens + assert actual["hello"] == dummy_kwargs["hello"] diff --git a/tests/torchtune/utils/test_checkpointer.py b/tests/torchtune/utils/test_checkpointer.py index 71161a2460..14c6b090d5 100644 --- a/tests/torchtune/utils/test_checkpointer.py +++ b/tests/torchtune/utils/test_checkpointer.py @@ -628,7 +628,6 @@ def state_dict(self, weight_dtype): ), "model.norm.weight": randn(_DIM, dtype=weight_dtype), } - state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"] return state_dict @pytest.fixture @@ -704,8 +703,7 @@ def test_load_save_checkpoint_single_file( # Converted state dict from the checkpointer state_dict = single_file_checkpointer.load_checkpoint() - # Check that we've loaded all the keys - we're loading one less key in: lm_head.weight - assert len(state_dict["model"].keys()) == (len(orig_state_dict.keys()) - 1) + assert len(state_dict["model"].keys()) == len(orig_state_dict.keys()) # the keys in original state dict should match up with the keys in the weight_map for key in orig_state_dict.keys(): diff --git a/tests/torchtune/utils/test_distributed.py b/tests/torchtune/utils/test_distributed.py index 7b6754704d..38c82f8ed7 100644 --- a/tests/torchtune/utils/test_distributed.py +++ b/tests/torchtune/utils/test_distributed.py @@ -262,6 +262,10 @@ def world_size(self) -> int: return 2 @gpu_test(gpu_count=2) + @pytest.mark.skipif( + version.parse(torch.__version__).base_version < "2.4.0", + reason="torch >= 2.4 required", + ) def test_lora_state_dict(self): rank = self.rank is_rank_zero = rank == 0 diff --git a/torchtune/datasets/_alpaca.py b/torchtune/datasets/_alpaca.py index 2d1b33f99e..3a2c980017 100644 --- a/torchtune/datasets/_alpaca.py +++ b/torchtune/datasets/_alpaca.py @@ -23,18 +23,18 @@ def alpaca_dataset( Support for family of Alpaca-style datasets from Hugging Face Datasets using the `data input format `_ and `prompt template `_ - from the original alpaca codebase, where `instruction`, `input`, and `output` + from the original alpaca codebase, where ``instruction``, ``input``, and ``output`` are fields from the dataset. 
- Masking of the prompt during training is controlled by the `train_on_input` flag, which is - set to `True` by `default `_ - - If `train_on_input` is True, the prompt is used during training and + Masking of the prompt during training is controlled by the ``train_on_input`` flag, which is + set to ``True`` by `default `_ + - If ``train_on_input`` is True, the prompt is used during training and contributes to the loss. - - If `train_on_input` is False, the prompt is masked out (tokens replaced with -100) + - If ``train_on_input`` is False, the prompt is masked out (tokens replaced with -100) Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. - source (str): path string of dataset, anything supported by Hugging Face's `load_dataset`. + source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``. train_on_input (bool): Whether the model is trained on the prompt or not. Default is True. max_seq_len (int): Maximum number of tokens in the returned input and label token id lists. Default is 512, but we recommend setting this to the highest you can fit in memory and @@ -64,3 +64,8 @@ def alpaca_dataset( alpaca_cleaned_dataset = partial(alpaca_dataset, source="yahma/alpaca-cleaned") +alpaca_cleaned_dataset.__doc__ = """ +Builder for a variant of Alpaca-style datasets with the cleaned version of the +original Alpaca dataset, `yahma/alpaca-cleaned `_. +See the dataset page and :func:`~torchtune.datasets.alpaca_dataset` for more details. +""" diff --git a/torchtune/datasets/_chat.py b/torchtune/datasets/_chat.py index 943f1cde70..bb79c409ed 100644 --- a/torchtune/datasets/_chat.py +++ b/torchtune/datasets/_chat.py @@ -30,7 +30,7 @@ class ChatDataset(Dataset): The general flow from loading a sample to tokenized prompt is: load sample -> apply transform -> foreach turn{format into template -> tokenize} - If the column/key names differ from the expected names in the ``ChatFormat``, + If the column/key names differ from the expected names in the :class:`~torchtune.data.ChatFormat`, then the ``column_map`` argument can be used to provide this mapping. Use ``convert_to_messages`` to prepare your dataset into the Llama2 chat format @@ -131,13 +131,13 @@ def chat_dataset( (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path) conversation_style (str): string specifying expected style of conversations in the dataset for automatic conversion to the :class:`~torchtune.data.Message` structure. Supported styles are: "sharegpt", "openai" - chat_format (Optional[str]): full import path of ``ChatFormat`` class used to format the messages. See the description in - :class:`~torchtune.datasets.ChatDataset` for more details. For a list of all possible chat formats, - check out :ref:`chat_formats`. Default: None. + chat_format (Optional[str]): full import path of :class:`~torchtune.data.ChatFormat` class used to format the messages. + See the description in :class:`~torchtune.datasets.ChatDataset` for more details. For a list of all + possible chat formats, check out :ref:`chat_formats`. Default: None. max_seq_len (int): Maximum number of tokens in the returned input and label token id lists. train_on_input (bool): Whether the model is trained on the prompt or not. Default is False. packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False. - **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to `load_dataset`. 
+ **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. Examples: >>> from torchtune.datasets import chat_dataset diff --git a/torchtune/datasets/_concat.py b/torchtune/datasets/_concat.py index c650e54a9a..6c7522b884 100644 --- a/torchtune/datasets/_concat.py +++ b/torchtune/datasets/_concat.py @@ -24,8 +24,8 @@ class ConcatDataset(Dataset): very large datasets. Upon initialization, this class computes the cumulative length of all datasets and maintains an internal mapping - of indices to the respective datasets. This approach allows the `ConcatDataset` to delegate data retrieval to - the appropriate sub-dataset transparently when a particular index is accessed. + of indices to the respective datasets. This approach allows the :class:`~torchtune.datasets.ConcatDataset` + to delegate data retrieval to the appropriate sub-dataset transparently when a particular index is accessed. Note: Using this class with very large datasets can lead to high memory consumption, as it requires all datasets to @@ -33,7 +33,7 @@ class ConcatDataset(Dataset): Args: datasets (List[Dataset]): A list of datasets to concatenate. Each dataset must be an instance of a class - derived from `torch.utils.data.Dataset`. + derived from :class:`~torch.utils.data.Dataset`. Attributes: _datasets (List[Dataset]): Stores the list of datasets passed during initialization. @@ -41,13 +41,29 @@ class ConcatDataset(Dataset): _indexes (List[Tuple[int, int, int]]): A list of tuples where each tuple contains the starting index, the ending index, and the dataset index for quick lookup and access during indexing operations. - Example: + Examples: >>> dataset1 = MyCustomDataset(params1) >>> dataset2 = MyCustomDataset(params2) >>> concat_dataset = ConcatDataset([dataset1, dataset2]) >>> print(len(concat_dataset)) # Total length of both datasets >>> data_point = concat_dataset[1500] # Accesses an element from the appropriate dataset + This can also be accomplished by passing in a list of datasets to the YAML config:: + + dataset: + - _component_: torchtune.datasets.instruct_dataset + source: vicgalle/alpaca-gpt4 + template: torchtune.data.AlpacaInstructTemplate + split: train + train_on_input: True + - _component_: torchtune.datasets.instruct_dataset + source: samsum + template: torchtune.data.SummarizeTemplate + column_map: {"output": "summary"} + output: summary + split: train + train_on_input: False + This class primarily focuses on providing a unified interface to access elements from multiple datasets, enhancing the flexibility in handling diverse data sources for training machine learning models. """ diff --git a/torchtune/datasets/_grammar.py b/torchtune/datasets/_grammar.py index a734a265d7..453950fdf4 100644 --- a/torchtune/datasets/_grammar.py +++ b/torchtune/datasets/_grammar.py @@ -22,17 +22,17 @@ def grammar_dataset( The prompt template mirrors what is used in the `llama_recipes codebase `_ - where `input` and `output` are fields from the dataset. + where ``input`` and ``output`` are fields from the dataset. - Masking of the prompt during training is controlled by the `train_on_input` flag, which is - set to `False` by default - - If `train_on_input` is True, the prompt is used during training and + Masking of the prompt during training is controlled by the ``train_on_input`` flag, which is + set to ``False`` by default + - If ``train_on_input`` is True, the prompt is used during training and contributes to the loss. 
- - If `train_on_input` is False, the prompt is masked out (tokens replaced with -100) + - If ``train_on_input`` is False, the prompt is masked out (tokens replaced with -100) Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. - source (str): path string of dataset, anything supported by Hugging Face's `load_dataset`. + source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset``. train_on_input (bool): Whether the model is trained on the prompt or not. Default is False. packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False. diff --git a/torchtune/datasets/_instruct.py b/torchtune/datasets/_instruct.py index 9ab858613f..99ab8ccfe9 100644 --- a/torchtune/datasets/_instruct.py +++ b/torchtune/datasets/_instruct.py @@ -28,21 +28,21 @@ class InstructDataset(Dataset): The general flow from loading a sample to tokenized prompt is: load sample -> apply transform -> format into template -> tokenize - If the column/key names differ from the expected names in the `InstructTemplate`, - then the `column_map` argument can be used to provide this mapping. + If the column/key names differ from the expected names in the :class:`~torchtune.data.InstructTemplate`, + then the ``column_map`` argument can be used to provide this mapping. - Masking of the prompt during training is controlled by the `train_on_input` flag, which is - set to `False` by default. - - If `train_on_input` is True, the prompt is used during training and + Masking of the prompt during training is controlled by the ``train_on_input`` flag, which is + set to ``False`` by default. + - If ``train_on_input`` is True, the prompt is used during training and contributes to the loss. - - If `train_on_input` is False, the prompt is masked out (tokens replaced with -100) + - If ``train_on_input`` is False, the prompt is masked out (tokens replaced with -100) Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. - source (str): path string of dataset, anything supported by Hugging Face's `load_dataset` + source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset`` (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path) template (InstructTemplate): template used to format the prompt. If the placeholder variable - names in the template do not match the column/key names in the dataset, use `column_map` to map them. + names in the template do not match the column/key names in the dataset, use ``column_map`` to map them. transform (Optional[Callable]): transform to apply to the sample before formatting to the template. Default is None. column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template @@ -51,7 +51,7 @@ class InstructDataset(Dataset): max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists. Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length. - **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to `load_dataset`. + **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. 
""" def __init__( @@ -126,14 +126,14 @@ def instruct_dataset( """ Build a configurable dataset with instruction prompts. This method should be used to configure a custom instruct dataset from the yaml config instead of - using `InstructDataset` directly, as it is made to be config friendly. + using :class:`~torchtune.datasets.InstructDataset` directly, as it is made to be config friendly. Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. - source (str): path string of dataset, anything supported by Hugging Face's `load_dataset` + source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset`` (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path) template (str): full import path of class used to format the prompt. If the placeholder variable - names in the template do not match the column/key names in the dataset, use `column_map` to map them. + names in the template do not match the column/key names in the dataset, use ``column_map`` to map them. column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template to the column/key names in the sample. If None, assume these are identical. train_on_input (bool): Whether the model is trained on the prompt or not. Default is False. @@ -141,7 +141,7 @@ def instruct_dataset( Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length. packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False. - **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to `load_dataset`. + **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. Examples: >>> from torchtune.datasets import instruct_dataset diff --git a/torchtune/datasets/_preference.py b/torchtune/datasets/_preference.py index 446070b694..70a617f6d7 100644 --- a/torchtune/datasets/_preference.py +++ b/torchtune/datasets/_preference.py @@ -23,15 +23,15 @@ class PreferenceDataset(Dataset): The general flow from loading a sample to tokenized prompt is: load sample -> apply transform -> format into template -> tokenize - If the column/key names differ from the expected names in the `InstructTemplate`, - then the `column_map` argument can be used to provide this mapping. + If the column/key names differ from the expected names in the :class:`~torchtune.data.InstructTemplate`, + then the ``column_map`` argument can be used to provide this mapping. Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. - source (str): path string of dataset, anything supported by Hugging Face's `load_dataset` + source (str): path string of dataset, anything supported by Hugging Face's ``load_dataset`` (https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path) template (InstructTemplate): template used to format the prompt. If the placeholder variable - names in the template do not match the column/key names in the dataset, use `column_map` to map them. + names in the template do not match the column/key names in the dataset, use ``column_map`` to map them. transform (Optional[Callable]): transform to apply to the sample before formatting to the template. Default is None. 
column_map (Optional[Dict[str, str]]): a mapping from the expected placeholder names in the template @@ -39,7 +39,7 @@ class PreferenceDataset(Dataset): max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists. Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length. - **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to `load_dataset`. + **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. """ def __init__( diff --git a/torchtune/datasets/_samsum.py b/torchtune/datasets/_samsum.py index 59eba08b32..63b09eba9f 100644 --- a/torchtune/datasets/_samsum.py +++ b/torchtune/datasets/_samsum.py @@ -22,13 +22,13 @@ def samsum_dataset( The prompt template mirrors what is used in the llama_recipes `codebase `_ - where `dialogue` and `summary` are fields from the dataset. + where ``dialogue`` and ``summary`` are fields from the dataset. - Masking of the prompt during training is controlled by the `train_on_input` flag, which is - set to `False` by default - - If `train_on_input` is True, the prompt is used during training and + Masking of the prompt during training is controlled by the ``train_on_input`` flag, which is + set to ``False`` by default + - If ``train_on_input`` is True, the prompt is used during training and contributes to the loss. - - If `train_on_input` is False, the prompt is masked out (tokens replaced with -100) + - If ``train_on_input`` is False, the prompt is masked out (tokens replaced with -100) Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. diff --git a/torchtune/datasets/_slimorca.py b/torchtune/datasets/_slimorca.py index fcc1c0c078..5594881127 100644 --- a/torchtune/datasets/_slimorca.py +++ b/torchtune/datasets/_slimorca.py @@ -28,7 +28,7 @@ def slimorca_dataset( The Llama3 models do not prescribe a particular format. The returned data is a tuple of input token id list and label token id - list. If `max_seq_len` keyword argument is provided, the returned + list. If ``max_seq_len`` keyword argument is provided, the returned input token id list is ensured (by truncation if necessary) to be within that length. diff --git a/torchtune/datasets/_text_completion.py b/torchtune/datasets/_text_completion.py index 053b0b0606..bd815e9fdf 100644 --- a/torchtune/datasets/_text_completion.py +++ b/torchtune/datasets/_text_completion.py @@ -28,6 +28,7 @@ class TextCompletionDataset(Dataset): max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists. Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length. + add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True. **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. 
""" @@ -37,12 +38,14 @@ def __init__( source: str, column: str = "text", max_seq_len: Optional[int] = None, + add_eos: bool = True, **load_dataset_kwargs: Dict[str, Any], ) -> None: self._tokenizer = tokenizer self._data = load_dataset(source, **load_dataset_kwargs) self.max_seq_len = max_seq_len self._column = column + self.add_eos = add_eos def __len__(self): return len(self._data) @@ -53,7 +56,7 @@ def __getitem__(self, index: int) -> Dict[str, List[int]]: def _prepare_sample(self, sample: Mapping[str, Any]) -> Dict[str, List[int]]: prompt = sample[self._column] - tokens = self._tokenizer.encode(text=prompt, add_bos=True, add_eos=True) + tokens = self._tokenizer.encode(text=prompt, add_bos=True, add_eos=self.add_eos) # Truncate if needed, but don't coerce EOS id if self.max_seq_len is not None: @@ -70,13 +73,15 @@ def text_completion_dataset( source: str, column: Optional[str] = None, max_seq_len: Optional[int] = None, + add_eos: bool = True, packed: bool = False, **load_dataset_kwargs: Dict[str, Any], ) -> TextCompletionDataset: """ - Build a configurable freeform text dataset with instruction prompts. This method should be + Build a configurable dataset from a freeform, unstructured text corpus similar + to datasets used in pre-training. This method should be used to configure a custom text dataset from the yaml config instead of - using `TextDataset` directly, as it is made to be config friendly. + using :class:`~torchtune.datasets.TextCompletionDataset` directly, as it is made to be config friendly. Args: tokenizer (ModelTokenizer): Tokenizer used by the model that implements the ``tokenize_messages`` method. @@ -87,6 +92,7 @@ def text_completion_dataset( max_seq_len (Optional[int]): Maximum number of tokens in the returned input and label token id lists. Default is None, disabling truncation. We recommend setting this to the highest you can fit in memory and is supported by the model. For example, llama2-7B supports up to 4096 for sequence length. + add_eos (bool): Whether to add an EOS token to the end of the sequence. Default is True. packed (bool): Whether or not to pack the dataset to ``max_seq_len`` prior to training. Default is False. **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. @@ -120,6 +126,7 @@ def text_completion_dataset( source=source, column=column, max_seq_len=max_seq_len, + add_eos=add_eos, **load_dataset_kwargs, ) return ( diff --git a/torchtune/models/clip/__init__.py b/torchtune/models/clip/__init__.py new file mode 100644 index 0000000000..ca7ecf2a65 --- /dev/null +++ b/torchtune/models/clip/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from ._component_builders import clip_vision_encoder + +from ._position_embeddings import ( + TiledTokenPositionalEmbedding, + TilePositionalEmbedding, + TokenPositionalEmbedding, +) + +__all__ = [ + "clip_vision_encoder", + "TokenPositionalEmbedding", + "TiledTokenPositionalEmbedding", + "TilePositionalEmbedding", +] diff --git a/torchtune/models/clip/_component_builders.py b/torchtune/models/clip/_component_builders.py new file mode 100644 index 0000000000..3e61110c71 --- /dev/null +++ b/torchtune/models/clip/_component_builders.py @@ -0,0 +1,101 @@ +from typing import List, Optional + +import torch +from torchtune.modules.vision_transformer import VisionTransformer, CLSProjection +from torchtune.models.clip._position_embeddings import TokenPositionalEmbedding, TiledTokenPositionalEmbedding, TilePositionalEmbedding + +import logging + +logger = logging.getLogger(__name__) + +def clip_vision_encoder( + tile_size: int, + patch_size: int, + embed_dim: int, + num_layers: int, + num_heads: int, + cls_output_dim: int = 512, + out_indices: Optional[List[int]] = None, + output_cls_projection: bool = False, + max_num_tiles: int = 4, + in_channels: int = 3, +) -> VisionTransformer: + """ + Builds the vision encoder associated with the clip model. This includes: + + - num_layers TransformerEncoderLayers + - positional embeddings + - CLS projection (optional) + + For details, please check the documentation of + :class:`torchtune.modules.vision_transformer.VisionTransformer`. + + Args: + tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise, + the size of the input image. In this case, the function will consider your image as a single tile. + patch_size (int): The size of each patch. Used to divide the tiles into patches. + E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches + with shape (40, 40) each. + embed_dim (int): The dimensionality of each patch embedding (token). + num_layers (int): The number of transformer layers. + num_heads (int): The number of attention heads in each transformer layer. + cls_output_dim (int): The dimensionality of the output tensor from the CLS projection module. + out_indices (Optional[List[int]]): The indices of hidden layers to return. + If provided, it will return the intermediate results of the transformer layers + before they go through a next layer. For example, ``out_indices=[0,3]`` will + return the tokens before they go through the first and fourth layers. + output_cls_projection (bool): If True, only the CLS token projection will be outputted, + instead of all tokens. Defaults to False. + max_num_tiles (int): The maximum number of tiles that can be processed. This is used to + determine the size of the positional embeddings. + in_channels (int): The number of image input channels. + + Returns: + A `VisionTransformer` object. 
+ """ + + cls_projection = CLSProjection(embed_dim=embed_dim, cls_output_dim=cls_output_dim) if output_cls_projection else None + + # TODO (Felipe): Replace with torchtune native encoder module + mlp_ratio = 4.0 + transformer_layer = torch.nn.TransformerEncoderLayer( + d_model=embed_dim, + nhead=num_heads, + dim_feedforward=int(mlp_ratio * embed_dim), + dropout=0.0, + activation=torch.nn.SiLU(), + layer_norm_eps=1e-5, + batch_first=True, + norm_first=True, + bias=True) + + # position embeddings + if max_num_tiles == 1: + pre_tile_pos_embed = None + post_tile_pos_embed = None + token_pos_embedding = TokenPositionalEmbedding( + embed_dim=embed_dim, + patch_size=patch_size, + tile_size=tile_size) + else: + pre_tile_pos_embed = TilePositionalEmbedding(max_num_tiles=max_num_tiles, embed_dim=embed_dim) + post_tile_pos_embed = TilePositionalEmbedding(max_num_tiles=max_num_tiles, embed_dim=embed_dim) + token_pos_embedding = TiledTokenPositionalEmbedding( + max_num_tiles=max_num_tiles, + embed_dim=embed_dim, + patch_size=patch_size, + tile_size=tile_size) + + return VisionTransformer( + num_layers=num_layers, + layer=transformer_layer, + token_pos_embedding=token_pos_embedding, + pre_tile_pos_embed=pre_tile_pos_embed, + post_tile_pos_embed=post_tile_pos_embed, + cls_projection=cls_projection, + out_indices=out_indices, + tile_size=tile_size, + patch_size=patch_size, + embed_dim=embed_dim, + in_channels=in_channels, + ) diff --git a/torchtune/models/clip/_model_builders.py b/torchtune/models/clip/_model_builders.py new file mode 100644 index 0000000000..becef9e5cd --- /dev/null +++ b/torchtune/models/clip/_model_builders.py @@ -0,0 +1,14 @@ +from torchtune.models.clip._transforms import CLIPImageTransform + +def _clip_vit_224_transform(): + image_transform = CLIPImageTransform( + image_mean=[0.48145466, 0.4578275, 0.40821073], + image_std=[0.26862954, 0.26130258, 0.27577711], + tile_size=224, + possible_resolutions=None, + max_num_tiles=1, + resample="bilinear", + resize_to_max_canvas=True, + ) + + return image_transform diff --git a/torchtune/models/clip/_position_embeddings.py b/torchtune/models/clip/_position_embeddings.py new file mode 100644 index 0000000000..b53199c3e4 --- /dev/null +++ b/torchtune/models/clip/_position_embeddings.py @@ -0,0 +1,190 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from torch import nn + +# TODO (@Felipe): add load hooks + interpolation on positional encodings, +# so max_num_tiles can be variable and a trained model can be adapted to a +# new value. + + +class TokenPositionalEmbedding(nn.Module): + """ + Token positional embedding for images, different for every token in an image. + + Notice that tile is different from patch (token). For details, please check the documentation of + :class:`torchtune.modules.vision_transformer.VisionTransformer`. + + Args: + embed_dim (int): The dimensionality of each token embedding. + tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise, + the size of the input image. In this case, the function will consider your image as a single tile. + patch_size (int): The size of each patch. Used to divide the tiles into patches. + E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches + with shape (40, 40) each. 
+ """ + + def __init__(self, embed_dim: int, tile_size: int, patch_size: int) -> None: + super().__init__() + patch_grid_size = tile_size // patch_size + scale = embed_dim**-0.5 + self.positional_embedding = nn.Parameter( + scale + * torch.randn((patch_grid_size**2 + 1, embed_dim)) # +1 for CLS token + ) + + def forward(self, x: torch.Tensor, *args) -> torch.Tensor: + """ + Args: + x (torch.Tensor): Tensor with shape (..., n_tokens, embed_dim) + Returns: + torch.Tensor: The input tensor with added positional embeddings. + """ + return x + self.positional_embedding + + +class TiledTokenPositionalEmbedding(nn.Module): + """ + + Token positional embedding for tiled images. There are two positional embeddings in this module: + + * local_token_positional_embedding: same for every tile, different for every token. Equivalent \ + to :class:`torchtune.models.clip._position_embeddings.TokenPositionalEmbedding`, but gated. + * global_token_positional_embedding: different for every tile, different for every token. + + Notice that tile is different from patch (token). For details, please check the documentation of + :class:`torchtune.modules.vision_transformer.VisionTransformer`. + + Args: + max_num_tiles (int): The maximum number of tiles an image can be divided into. + embed_dim (int): The dimensionality of each token embedding. + tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise, + the size of the input image. In this case, the function will consider your image as a single tile. + patch_size (int): The size of each patch. Used to divide the tiles into patches. + E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches + with shape (40, 40) each. + """ + + def __init__( + self, max_num_tiles: int, embed_dim: int, tile_size: int, patch_size: int + ) -> None: + super().__init__() + patch_grid_size = tile_size // patch_size + self.n_tokens_per_tile = patch_grid_size**2 + 1 # +1 for cls token + scale = embed_dim**-0.5 + + # different for every token, same for every tile + self.local_token_positional_embedding = nn.Parameter( + scale + * torch.randn((patch_grid_size**2 + 1, embed_dim)) # +1 for CLS token + ) + + # different for every token, different for every tile + self.global_token_positional_embedding = nn.Parameter( + scale + * torch.randn( + max_num_tiles, + max_num_tiles, + self.n_tokens_per_tile, + embed_dim, + ) + ) + + self.gate = nn.Parameter(torch.zeros(1)) + + def forward(self, x: torch.Tensor, aspect_ratio: torch.Tensor) -> torch.Tensor: + """ + Args: + x (torch.Tensor): Tensor with shape (bsz * n_imgs, n_tiles, n_tokens, embed_dim). + aspect_ratio (torch.Tensor): Tensor with shape (bsz * n_imgs, 2), + where aspect_ratio[k] represents the aspect ratio of the k^th image + of the batch before tile-cropping, e.g. aspect_ratio[k] = (2,1). + Returns: + torch.Tensor: The input tensor with added positional embeddings. + """ + bsz_and_n_imgs, n_tiles, n_tokens, embed_dim = x.shape + + # apply local position embedding (same for every tile) + x = x + (self.local_token_positional_embedding * (1 - self.gate.tanh())) + + # apply global positional embedding (different for every tile) + x = x.view(bsz_and_n_imgs, n_tiles, n_tokens, embed_dim) + for batch_idx, (n_tiles_h, n_tiles_w) in enumerate(aspect_ratio): + # When we batch images, all are padded to the same amount of tiles. + # The aspect_ratio lets us know the non padded tiles for each image. + # We only add positional encoding to those. 
+ n_non_padded_tiles = int(n_tiles_h * n_tiles_w) + + # We get only the positional encoding for non padded tiles, + # i.e. n_tiles_h, n_tiles_w. + pos_embed = self.global_token_positional_embedding[ + :n_tiles_h, :n_tiles_w, :, : + ] + + # Add pos encoding to the non padded tiles. + pos_embed = pos_embed.reshape( + n_non_padded_tiles, self.n_tokens_per_tile, embed_dim + ) + pos_embed = pos_embed * self.gate.tanh() + x[batch_idx, :n_non_padded_tiles, :, :] += pos_embed + + return x + + +class TilePositionalEmbedding(nn.Module): + """ + Positional embedding for tiles, different for every tile, same for every token within a tile. + + Notice that tile is different from patch (token). For details, please check the documentation of + :class:`torchtune.modules.vision_transformer.VisionTransformer`. + + Args: + max_num_tiles (int): The maximum number of tiles an image can be divided into. + embed_dim (int): The dimensionality of each tile embedding. + """ + + def __init__( + self, + max_num_tiles: int, + embed_dim: int, + ): + super().__init__() + self.max_num_tiles = max_num_tiles + self.embed_dim = embed_dim + + scale = embed_dim**-0.5 + self.embedding = nn.Parameter( + scale * torch.randn(max_num_tiles, max_num_tiles, 1, embed_dim) + ) + self.gate = nn.Parameter(torch.zeros(1)) + + def forward(self, x: torch.Tensor, aspect_ratio: torch.Tensor) -> torch.Tensor: + """ + Args: + x (torch.Tensor): Tensor with shape (bsz * n_imgs, n_tiles, n_tokens, embed_dim). + aspect_ratio (torch.Tensor): Tensor with shape (bsz * n_imgs, 2), + representing the aspect ratio of the image before tile-cropping, e.g. (2,1). + Returns: + torch.Tensor: The input tensor with added positional embeddings. + """ + bsz_and_n_imgs, n_tiles, n_tokens, embed_dim = x.shape + + for batch_idx, (n_tiles_h, n_tiles_w) in enumerate(aspect_ratio): + # When we batch images, all are padded to the same amount of tiles. + # The aspect_ratio lets us know the non padded tiles for each image. + # We only add positional encoding to those. + n_non_padded_tiles = int(n_tiles_h * n_tiles_w) + + # We get only the positional encoding for non padded tiles, + # i.e. n_tiles_h, n_tiles_w. + pos_embed = self.embedding[:n_tiles_h, :n_tiles_w, :, :] + + # Add pos encoding to the non padded tiles. + pos_embed = pos_embed.reshape(n_non_padded_tiles, 1, self.embed_dim) + x[batch_idx, :n_non_padded_tiles, :, :] += pos_embed * self.gate.tanh() + + return x diff --git a/torchtune/models/clip/_transforms.py b/torchtune/models/clip/_transforms.py new file mode 100644 index 0000000000..64ab709884 --- /dev/null +++ b/torchtune/models/clip/_transforms.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +from typing import Any, List, Mapping, Optional, Tuple + +import torch +import torchvision +from PIL import Image + +from torchtune.modules.transforms import ( + find_supported_resolutions, + get_canvas_best_fit, + resize_with_pad, + tile_crop, +) + +from torchvision.transforms.v2 import functional as F + +logger = logging.getLogger(__name__) + + +class CLIPImageTransform: + """ + This class accepts images of any size and dynamically resizes, pads, normalizes and tiles them + based on the image aspect ratio and the number of image tiles we allow.
+ + The algorithm will NOT distort the image to fit a certain aspect ratio, because + that leads to a significant degradation in image quality. + + The user can choose if they want to allow upscaling by using the flag ``resize_to_max_canvas``. + + For example, if an input image is of size 300x800, and we want to allow + a maximum of 16 image tiles, with side 224px, then: + + If ``resize_to_max_canvas=False``, then: + best_resolution = (448, 896) -> smallest canvas, up to 16 tiles, that doesn't require downscaling + image is NOT resized + image is padded (300, 800) -> 448,896 + Image is tiled 2x4, for a final output shape of (8, 3, 224, 224) + + If ``resize_to_max_canvas=True``, then: + best_resolution = (448, 1344) # canvas that allows maximum upscaling, with minimum padding, up to 16 tiles + image is resized without distortion (300,800) -> (448, 1194) #448 is the limiting side for the resize + image is padded (448, 1194) -> (448, 1344) + Image is tiled 2x5, for a final output shape of (10, 3, 224, 224) + + Args: + image_mean (Optional[List[float]]): Mean values of each channel, used for normalization. + Should be the same used for the pre-trained model. If None, no normalization is performed. + image_std (Optional[List[float]]): Standard deviation values of each channel, used for normalization. + Should be the same used for the pre-trained model. If None, no normalization is performed. + possible_resolutions (Optional[List[Tuple[int, int]]]): List of possible resolutions as tuples (height, width), + where each tuple represents a possible canvas to fit the image into when calling ``get_canvas_best_fit``. + If None, this will be calculated using max_num_tiles and tile_size. + tile_size (int): Size of the tiles to divide the image into. + max_num_tiles (Optional[int]): Only used if possible_resolutions is NOT given. + Maximum number of tiles to break an image into. + This will be used to generate possible_resolutions, + e.g. [(224, 224), (224, 448), (448, 224)] if max_num_tiles = 2 and tile_size = 224. + resample (str): Resampling method used when resizing images. Supports any enum of + ``torchvision.transforms.InterpolationMode``, e.g. "nearest", "nearest_exact", "bilinear", "bicubic". + resize_to_max_canvas (bool): + If True, the image will be upscaled without distortion to fit the largest possible + resolution from possible_resolutions. + If False, it will pick the resolution that minimizes downscaling, including no downscaling at all. + In this case, the image will only be upscaled if its size < tile_size. + + Examples: + >>> image_transform = CLIPImageTransform( + ... image_mean=None, + ... image_std=None, + ... tile_size=224, + ... possible_resolutions=None, + ... max_num_tiles=4, + ... resample="bilinear", + ... resize_to_max_canvas=True, + ...)
+ >>> # create random image + >>> image = (np.random.rand(100,200,3) * 255).astype(np.uint8) + >>> image = PIL.Image.fromarray(image) + >>> output = image_transform(image) + >>> output['image'].shape # [num_tiles, num_channels, tile_size, tile_size] + torch.Size([2, 3, 224, 224]) + >>> output['ar'] # image best fits the canvas 224x448 + torch.tensor([1,2]) + """ + + def __init__( + self, + image_mean: Optional[List[float]] = None, + image_std: Optional[List[float]] = None, + possible_resolutions: Optional[List[Tuple[int, int]]] = None, + tile_size: int = 224, + max_num_tiles: Optional[int] = 4, + resample: str = "bilinear", + resize_to_max_canvas: bool = False, + ) -> None: + + # get_canvas_best_fit + assert ( + possible_resolutions is not None or max_num_tiles is not None + ), f"Either possible_resolutions or max_num_tiles must be given. Got {possible_resolutions=} and {max_num_tiles=}" + + # If possible_resolutions are not given, then calculate possible ones based on max_num_tiles + if not possible_resolutions and max_num_tiles: + possible_resolutions = find_supported_resolutions( + max_num_tiles=max_num_tiles, tile_size=tile_size + ) + else: + possible_resolutions = possible_resolutions + + self.possible_resolutions = torch.tensor(possible_resolutions).reshape(-1, 2) + logger.info( + f"Found possible_resolutions: {self.possible_resolutions}. Will fit the images into the canvas with best fit." + ) + + self.resize_to_max_canvas = resize_to_max_canvas + + # normalize + assert (image_mean is None) == ( + image_std is None + ), f"Need to provide both or none of image_mean and image_std. Got {image_mean=} and {image_std=}" + self.image_mean = image_mean + self.image_std = image_std + + # resize_with_pad + self.max_upscaling_size = None if resize_to_max_canvas else tile_size + self.resample = torchvision.transforms.InterpolationMode[resample.upper()] + + # tile_crop + self.tile_size = tile_size + + def __call__(self, *, image: Image.Image, **kwargs) -> Mapping[str, Any]: + + assert isinstance(image, Image.Image), "Input image must be a PIL image." 
+ + # Make image torch.tensor((3, H, W), dtype='float32'), 0<=values<=1 + image_tensor = F.to_dtype( + F.grayscale_to_rgb_image(F.to_image(image)), scale=True + ) + + # Find the best canvas to fit the image without distortion + best_resolution = get_canvas_best_fit( + image=image_tensor, + possible_resolutions=self.possible_resolutions, + resize_to_max_canvas=self.resize_to_max_canvas, + ) + + # resize without distortion + pad to fit best_resolution + image_tensor = resize_with_pad( + image=image_tensor, + target_size=best_resolution, + resample=self.resample, + max_upscaling_size=self.max_upscaling_size, + ) + + # Normalize + if self.image_mean and self.image_std: + image_tensor = F.normalize( + image_tensor, mean=self.image_mean, std=self.image_std + ) + + # Divide the image into equally sized tiles + image_tensor = tile_crop(image=image_tensor, tile_size=self.tile_size) + + aspect_ratio = torch.tensor(best_resolution).reshape(-1) // self.tile_size + + kwargs.update( + { + "image": image_tensor, + "aspect_ratio": aspect_ratio, + } + ) + + return kwargs diff --git a/torchtune/models/code_llama2/_model_builders.py b/torchtune/models/code_llama2/_model_builders.py index 47b17b724b..fca17367d8 100644 --- a/torchtune/models/code_llama2/_model_builders.py +++ b/torchtune/models/code_llama2/_model_builders.py @@ -59,6 +59,7 @@ def lora_code_llama2_7b( Default: False lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05 quantize_base (bool): Whether to quantize base model weights Returns: @@ -139,6 +140,7 @@ def lora_code_llama2_13b( Default: False lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05 quantize_base (bool): Whether to quantize base model weights Returns: @@ -220,6 +222,7 @@ def lora_code_llama2_70b( Default: False lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05 quantize_base (bool): Whether to quantize base model weights Returns: diff --git a/torchtune/models/convert_weights.py b/torchtune/models/convert_weights.py index 3652813c12..68277d6dfc 100644 --- a/torchtune/models/convert_weights.py +++ b/torchtune/models/convert_weights.py @@ -249,6 +249,7 @@ def tune_to_peft_adapter_weights( num_heads: int = 32, num_kv_heads: int = 32, dim: int = 4096, + head_dim: int = None, ): converted_state_dict = {} full_mapping = {} @@ -266,7 +267,8 @@ def tune_to_peft_adapter_weights( } ) - head_dim = dim // num_heads + if head_dim is None: + head_dim = dim // num_heads def _permute_lora_matrix(t, n_heads): rank = t.shape[-1] diff --git a/torchtune/models/gemma/__init__.py b/torchtune/models/gemma/__init__.py index cc73b85413..48e4e84b10 100644 --- a/torchtune/models/gemma/__init__.py +++ b/torchtune/models/gemma/__init__.py @@ -4,8 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from ._component_builders import gemma # noqa -from ._convert_weights import gemma_hf_to_tune, gemma_tune_to_hf # noqa +from ._component_builders import gemma, lora_gemma # noqa from ._model_builders import ( # noqa gemma_2b, gemma_7b, @@ -23,6 +22,7 @@ "gemma_2b", "gemma_7b", "gemma_tokenizer", + "lora_gemma", "lora_gemma_2b", "lora_gemma_7b", "qlora_gemma_2b", diff --git a/torchtune/models/gemma/_convert_weights.py b/torchtune/models/gemma/_convert_weights.py deleted file mode 100644 index 5633c8be36..0000000000 --- a/torchtune/models/gemma/_convert_weights.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Dict - -import torch - -from torchtune.models.convert_weights import _FROM_HF, get_mapped_key - - -def gemma_hf_to_tune( - state_dict: Dict[str, torch.Tensor], - num_heads: int = 8, - num_kv_heads: int = 1, - dim: int = 2048, - head_dim: int = 256, -) -> Dict[str, torch.Tensor]: - """ - Convert a state dict from HF's format to TorchTune's format, which contains the weights - of a Gemma model. - State dicts from multiple checkpoint files should be consolidated into a single state dict - before calling this function. - The logic is identical to :func:`~torchtune.models.convert_weights.hf_to_tune`, but doesn't load - output projection weights. - - Args: - state_dict (Dict[str, torch.Tensor]): State dict in HF's format. - num_heads (int): Number of heads in the model. Defaults to 8. - num_kv_heads (int): Number of heads in the key/value projection layers. Defaults to 1. - dim (int): Dimension of the model. Defaults to 2048. - head_dim (int): Dimension of the attention head. This value is explicit in Gemma confs. Defaults to 256. - - Returns: - Dict[str, torch.Tensor]: State dict in TorchTune's format. - """ - converted_state_dict = {} - - def _permute(t, n_heads): - return ( - t.view(n_heads, 2, head_dim // 2, dim) - .transpose(1, 2) - .reshape((head_dim * n_heads), dim) - ) - - for key, value in state_dict.items(): - if ( - "rotary_emb.inv_freq" not in key and "lm_head.weight" not in key - ): # Skip loading the position embeddings and output projection weights - new_key = get_mapped_key(key, _FROM_HF) - if "q_proj" in key: - value = _permute(value, num_heads) - elif "k_proj" in key: - value = _permute(value, num_kv_heads) - converted_state_dict[new_key] = value - return converted_state_dict - - -def gemma_tune_to_hf( - state_dict: Dict[str, torch.Tensor], - num_heads: int = 8, - num_kv_heads: int = 1, - dim: int = 2048, - head_dim: int = 256, -) -> Dict[str, torch.Tensor]: - """ - Convert a state dict from TorchTune's format to Hugging Face's format for Gemma. - - This function takes a state dictionary in TorchTune's format, which contains the weights of a Gemma model, - and converts it into a format that can be loaded into a Hugging Face model. - The logic is identical to :func:`~torchtune.models.convert_weights.tune_to_hf`, but saves the tied - output projection weights. - - Args: - state_dict (Dict[str, torch.Tensor]): State dict in TorchTune's format. - num_heads (int, optional): Number of heads in the model. Defaults to 8. - num_kv_heads (int, optional): Number of heads in the key/value projection layers. Defaults to 1. - dim (int, optional): Dimension of the model. Defaults to 2048. - head_dim (int): Dimension of the attention head. 
This value is explicit in Gemma confs. Defaults to 256. - - Returns: - Dict[str, torch.Tensor]: State dict in Hugging Face's format. - - """ - converted_state_dict = {} - inverted_mapping_dict = {v: k for k, v in _FROM_HF.items()} - - def _permute(t, n_heads): - return ( - t.view(n_heads, head_dim // 2, 2, dim) - .transpose(1, 2) - .reshape((head_dim * n_heads), dim) - ) - - for key, value in state_dict.items(): - new_key = get_mapped_key(key, inverted_mapping_dict) - if "q_proj" in key: - value = _permute(value, num_heads) - elif "k_proj" in key: - value = _permute(value, num_kv_heads) - elif "tok_embeddings" in key: - # HF also uses tied weights, see - # https://github.com/huggingface/transformers/blob/14ff5dd962c1bd0a4e3adaac347ba396d8df5add/src/transformers/models/gemma/convert_gemma_weights_to_hf.py#L104 - converted_state_dict["lm_head.weight"] = value - converted_state_dict[new_key] = value - return converted_state_dict diff --git a/torchtune/models/gemma/_tokenizer.py b/torchtune/models/gemma/_tokenizer.py index dae20351fa..bed4f8606c 100644 --- a/torchtune/models/gemma/_tokenizer.py +++ b/torchtune/models/gemma/_tokenizer.py @@ -92,12 +92,13 @@ def tokenize_messages( Message(role="user", content="user prompt\n", masked=True), Message(role="assistant", content="assistant response\n"), ] - # tokenize_messages encodes messages separately and concats + + >>> # tokenize_messages encodes messages separately and concats >>> tokenizer.tokenize_messages(messages, max_seq_len)[0] [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] - # Same result as encoding the full string in one go + >>> # Same result as encoding the full string in one go >>> tokenizer.encode(''.join([message.content for message in messages])) [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] diff --git a/torchtune/models/llama2/__init__.py b/torchtune/models/llama2/__init__.py index 83bebaf478..a08754906b 100644 --- a/torchtune/models/llama2/__init__.py +++ b/torchtune/models/llama2/__init__.py @@ -18,7 +18,6 @@ qlora_llama2_70b, qlora_llama2_7b, ) -from ._model_utils import scale_hidden_dim_for_mlp from ._tokenizer import Llama2Tokenizer __all__ = [ @@ -35,5 +34,4 @@ "qlora_llama2_13b", "qlora_llama2_70b", "qlora_llama2_7b", - "scale_hidden_dim_for_mlp", ] diff --git a/torchtune/models/llama2/_model_builders.py b/torchtune/models/llama2/_model_builders.py index 8e21ced41a..271896940f 100644 --- a/torchtune/models/llama2/_model_builders.py +++ b/torchtune/models/llama2/_model_builders.py @@ -79,6 +79,7 @@ def lora_llama2_7b( lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation quantize_base (bool): Whether to quantize base model weights + lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05 Returns: TransformerDecoder: Instantiation of Llama2 7B model with LoRA applied @@ -158,6 +159,7 @@ def lora_llama2_13b( Default: False lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for LoRA linear layers. Default: 0.05 quantize_base (bool): Whether to quantize base model weights Returns: @@ -239,6 +241,7 @@ def lora_llama2_70b( Default: False lora_rank (int): rank of each low-rank approximation lora_alpha (float): scaling factor for the low-rank approximation + lora_dropout (float): dropout probability for LoRA linear layers. 
Default: 0.05 quantize_base (bool): Whether to quantize base model weights Returns: diff --git a/torchtune/models/llama2/_tokenizer.py b/torchtune/models/llama2/_tokenizer.py index 96c0ad213f..4358a48566 100644 --- a/torchtune/models/llama2/_tokenizer.py +++ b/torchtune/models/llama2/_tokenizer.py @@ -92,10 +92,11 @@ def tokenize_messages( r"""Tokenize a list of messages one at a time then concatenate them, returning a list of tokens and a list of masks. - Note: llama2 sentencepiece has problems where in general - encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling. - We can get around this by prepending s2 with a known token and slicing the - beginning off the tokenized s2. + Note: + sentencepiece has problems where in general + encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling. + We can get around this by prepending s2 with a known token and slicing the + beginning off the tokenized s2. Example: >>> tokenizer = Llama2Tokenizer(tokenizer_path) @@ -104,12 +105,12 @@ def tokenize_messages( Message(role="user", content="user prompt\n", masked=True), Message(role="assistant", content="assistant response\n"), ] - # tokenize_messages encodes messages separately and concats + + >>> # tokenize_messages encodes messages separately and concats >>> tokenizer.tokenize_messages(messages, max_seq_len)[0] [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] - - # Same result as encoding the full string in one go + >>> # Same result as encoding the full string in one go >>> tokenizer.encode(''.join([message.content for message in messages])) [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] diff --git a/torchtune/models/llama3/__init__.py b/torchtune/models/llama3/__init__.py index b906c5b790..90de8c286f 100644 --- a/torchtune/models/llama3/__init__.py +++ b/torchtune/models/llama3/__init__.py @@ -15,7 +15,6 @@ qlora_llama3_70b, qlora_llama3_8b, ) -from ._model_utils import scale_hidden_dim_for_mlp from ._tokenizer import Llama3Tokenizer __all__ = [ @@ -29,5 +28,4 @@ "lora_llama3_70b", "qlora_llama3_8b", "qlora_llama3_70b", - "scale_hidden_dim_for_mlp", ] diff --git a/torchtune/models/mistral/_component_builders.py b/torchtune/models/mistral/_component_builders.py index bae85fe00a..7a908dc83a 100644 --- a/torchtune/models/mistral/_component_builders.py +++ b/torchtune/models/mistral/_component_builders.py @@ -429,7 +429,7 @@ def mistral_classifier( """ Build a base mistral model with an added classification layer. See :func:`~torchtune.models.mistral.mistral_classifier` - for details on the base mistral classifier model. + for details on the base mistral classifier model. Args: num_classes (int): number of classes for the classification layer. 
diff --git a/torchtune/models/mistral/_model_builders.py b/torchtune/models/mistral/_model_builders.py index 360d3112fc..66e98cc9f9 100644 --- a/torchtune/models/mistral/_model_builders.py +++ b/torchtune/models/mistral/_model_builders.py @@ -121,7 +121,7 @@ def mistral_classifier_7b() -> TransformerDecoder: Returns: - TransformerClassifier: Instantiation of Mistral 7B classifier model + TransformerDecoder: Instantiation of Mistral 7B classifier model """ return mistral_classifier( num_classes=1, diff --git a/torchtune/models/mistral/_tokenizer.py b/torchtune/models/mistral/_tokenizer.py index d5973d815e..f8a2f4b645 100644 --- a/torchtune/models/mistral/_tokenizer.py +++ b/torchtune/models/mistral/_tokenizer.py @@ -107,10 +107,11 @@ def tokenize_messages( r"""Tokenize a list of messages one at a time then concatenate them, returning a list of tokens and a list of masks. - Note: sentencepiece has problems where in general - encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling. - We can get around this by prepending s2 with a known token and slicing the - beginning off the tokenized s2. + Note: + sentencepiece has problems where in general + encode(s1 + s2) != encode(s1) + encode(s2) due to whitespace handling. + We can get around this by prepending s2 with a known token and slicing the + beginning off the tokenized s2. Example: >>> tokenizer = MistralTokenizer(tokenizer_path) @@ -119,12 +120,13 @@ def tokenize_messages( Message(role="user", content="user prompt\n", masked=True), Message(role="assistant", content="assistant response\n"), ] - # tokenize_messages encodes messages separately and concats + + >>> # tokenize_messages encodes messages separately and concats >>> tokenizer.tokenize_messages(messages, max_seq_len)[0] [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] - # Same result as encoding the full string in one go + >>> # Same result as encoding the full string in one go >>> tokenizer.encode(''.join([message.content for message in messages])) [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] diff --git a/torchtune/models/phi3/_tokenizer.py b/torchtune/models/phi3/_tokenizer.py index b0196e5cf7..888be82c1c 100644 --- a/torchtune/models/phi3/_tokenizer.py +++ b/torchtune/models/phi3/_tokenizer.py @@ -118,12 +118,12 @@ def tokenize_messages( Message(role="user", content="user prompt\n", masked=True), Message(role="assistant", content="assistant response\n"), ] - # tokenize_messages encodes messages separately and concats + + >>> # tokenize_messages encodes messages separately and concats >>> tokenizer.tokenize_messages(messages, max_seq_len)[0] [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] - - # Same result as encoding the full string in one go + >>> # Same result as encoding the full string in one go >>> tokenizer.encode(''.join([message.content for message in messages])) [1, 1788, 2643, 13, 1792, 9508, 13, 465, 22137, 2933, 2] diff --git a/torchtune/models/qwen2/_component_builders.py b/torchtune/models/qwen2/_component_builders.py index fb7bad3369..ee5cd261bd 100644 --- a/torchtune/models/qwen2/_component_builders.py +++ b/torchtune/models/qwen2/_component_builders.py @@ -10,7 +10,7 @@ from torch import nn -from torchtune.models.qwen2.transformer import Qwen2TransformerDecoder +from torchtune.modules.transformer import TransformerDecoder, TiedEmbeddingTransformerDecoder from torchtune.models.qwen2._positional_embeddings import Qwen2RotaryPositionalEmbeddings from torchtune.modules import ( @@ -48,7 +48,7 @@ def qwen2( norm_eps: float = 
1e-5, rope_base: float = 1_000_000.0, tie_word_embeddings: bool = False, -) -> Qwen2TransformerDecoder: +) -> TransformerDecoder: """ Build the decoder associated with the Qwen2 model. This includes: - Token embeddings @@ -76,7 +76,7 @@ def qwen2( tie_word_embeddings (bool): whether the model's input and output word embeddings should be tied. Returns: - Qwen2TransformerDecoder: Instantiation of Qwen2 model. + TransformerDecoder: Instantiation of Qwen2 model. """ head_dim = embed_dim // num_heads num_kv_heads = num_kv_heads if num_kv_heads else num_heads @@ -105,16 +105,27 @@ def qwen2( ) tok_embeddings = nn.Embedding(vocab_size, embed_dim) output_proj = None if tie_word_embeddings else nn.Linear(embed_dim, vocab_size, bias=False) - return Qwen2TransformerDecoder( - tok_embeddings=tok_embeddings, - layer=layer, - num_layers=num_layers, - max_seq_len=max_seq_len, - num_heads=num_heads, - head_dim=head_dim, - norm=RMSNorm(embed_dim, eps=norm_eps), - output=output_proj, - ) + if output_proj is None: + return TiedEmbeddingTransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + ) + else: + return TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) def qwen2_mlp(dim: int, hidden_dim: int) -> FeedForward: @@ -150,7 +161,7 @@ def lora_qwen2( lora_dropout: float = 0.0, # Quantization args quantize_base: bool = False, -) -> Qwen2TransformerDecoder: +) -> TransformerDecoder: """ Return a version of Qwen2 (an instance of :func:`~torchtune.models.qwen2.transformer.Qwen2TransformerDecoder`) with LoRA applied based on the passed in configuration. @@ -188,7 +199,7 @@ def lora_qwen2( supported for quantization currently. Returns: - Qwen2TransformerDecoder: Instantiation of Qwen2 model with LoRA applied to + TransformerDecoder: Instantiation of Qwen2 model with LoRA applied to a subset of the attention projections in each layer. """ @@ -237,16 +248,27 @@ def lora_qwen2( if apply_lora_to_output else nn.Linear(embed_dim, vocab_size, bias=False) ) - model = Qwen2TransformerDecoder( - tok_embeddings=tok_embeddings, - layer=layer, - num_layers=num_layers, - max_seq_len=max_seq_len, - num_heads=num_heads, - head_dim=(embed_dim // num_heads), - norm=RMSNorm(embed_dim, eps=norm_eps), - output=output_proj, - ) + if output_proj is None: + model = TiedEmbeddingTransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + ) + else: + model = TransformerDecoder( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=(embed_dim // num_heads), + norm=RMSNorm(embed_dim, eps=norm_eps), + output=output_proj, + ) if quantize_base: # For QLoRA, we reparametrize 4-bit tensors to higher precision, and offload to CPU on the fly @@ -397,7 +419,6 @@ def lora_qwen2_mlp( lora_dropout: float = 0.0, quantize_base: bool = False, ) -> FeedForward: - # TODO(suyang.fy): check code. 
gate_proj = LoRALinear( in_dim=dim, out_dim=hidden_dim, diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index c83a281931..962d7a76f5 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -3,7 +3,7 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -from typing import List +from typing import List, Optional from functools import partial from torchtune.models.qwen2._component_builders import qwen2, lora_qwen2 @@ -12,6 +12,7 @@ from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES +from torchtune.modules.tokenizers import parse_hf_tokenizer_json """ Model builders build specific instantiations using component builders. For example @@ -20,7 +21,7 @@ """ -def qwen2_7b() -> Qwen2TransformerDecoder: +def qwen2_7b() -> TransformerDecoder: """ Builder for creating a Qwen2 model initialized w/ the default 7B parameter values from https://huggingface.co/Qwen/Qwen2-7B-Instruct @@ -42,14 +43,22 @@ def qwen2_7b() -> Qwen2TransformerDecoder: ) -def qwen2_tokenizer(path: str) -> Qwen2Tokenizer: - return Qwen2Tokenizer( - path, - unk_token="<|endoftext|>", - bos_token=None, - eos_token="<|endoftext|>", - pad_token="<|endoftext|>", - ) +def qwen2_tokenizer(vocab_file: str, merges_file: str, special_tokens_path: Optional[str] = None) -> Qwen2Tokenizer: + """ + Tokenizer for Qwen2. + + Args: + vocab_file (str): path to the vocab file. + merges_file (str): path to the merges file. + special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face + model files that contains all registered special tokens, or a local json file + structured similarly. Default is None to use the canonical Qwen2 special tokens. + + Returns: + Qwen2Tokenizer: Instantiation of the Qwen2 tokenizer + """ + special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None + return Qwen2Tokenizer(vocab_file=vocab_file, merges_file=merges_file, special_tokens=special_tokens) def lora_qwen2_7b( @@ -60,7 +69,7 @@ def lora_qwen2_7b( lora_alpha: float = 16, lora_dropout: float = 0.05, quantize_base: bool = False, -) -> Qwen2TransformerDecoder: +) -> TransformerDecoder: """ Builder for creating a Qwen2 7B model with LoRA enabled. diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 785b8de065..d9460bbcc9 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -3,30 +3,86 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree.
+import json +import unicodedata +from functools import lru_cache +from typing import Dict, List, Optional, Tuple -from typing import List, Optional, Tuple - -from tokenizers import Tokenizer as TokenizerFast +import regex as re from torchtune.data import Message, truncate +from torchtune.models.qwen2._trie import Trie from torchtune.modules.tokenizers import ModelTokenizer +PRETOKENIZE_REGEX = ( + r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|" + r"[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" +) + +QWEN2_SPECIAL_TOKENS = { + "<|endoftext|>": 151643, + "<|im_start|>": 151644, + "<|im_end|>": 151645, +} + ENDOFTEXT = "<|endoftext|>" IM_START = "<|im_start|>" IM_END = "<|im_end|>" +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoid mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + class Qwen2Tokenizer(ModelTokenizer): """This class construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). See . Args: - path (str): Path to tokenizer.json file. + vocab_file (str): Path to vocab.json file. + merges_file (str): Path to merges.txt file. Example: - >>> tokenizer = Qwen2Tokenizer("/path/to/tokenizer.json") + >>> tokenizer = Qwen2Tokenizer(vocab_file="/path/to/vocab.json", merges_file="/path/to/merges.txt") >>> tokenized_text = tokenizer.encode("Hello world!") >>> print(tokenized_text) [] @@ -39,35 +95,113 @@ class Qwen2Tokenizer(ModelTokenizer): def __init__( self, - path: str, + vocab_file: str, + merges_file: str, *, + special_tokens: Optional[Dict[str, int]] = None, + errors: str = "replace", unk_token: Optional[str] = ENDOFTEXT, bos_token: Optional[str] = None, eos_token: str = ENDOFTEXT, pad_token: Optional[str] = ENDOFTEXT, ): - # Build backend tokenizer. 
- self._tokenizer = TokenizerFast.from_file(path) - - _truncation = self._tokenizer.truncation - if _truncation is not None: - self._tokenizer.enable_truncation(**_truncation) - else: - self._tokenizer.no_truncation() - - _padding = self._tokenizer.padding - if _padding is not None: - self._tokenizer.enable_padding(**_padding) - - vocab = self._tokenizer.get_vocab() - self.unk_id = None if unk_token is None else vocab[unk_token] - self.bos_id = None if bos_token is None else vocab[bos_token] - self.eos_id = None if eos_token is None else vocab[eos_token] - self.pad_id = None if pad_token is None else vocab[pad_token] - self.im_start_id = vocab[IM_START] - self.im_end_id = vocab[IM_END] + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + bpe_merges = [] + with open(merges_file, encoding="utf-8") as merges_handle: + for i, line in enumerate(merges_handle): + line = line.strip() + if (i == 0 and line.startswith("#version:")) or not line: + continue + bpe_merges.append(tuple(line.split())) + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + # NOTE: the cache can grow without bound and will get really large for long running processes + # (esp. for texts of language that do not use space between word, e.g. Chinese); technically + # not a memory leak but appears as one. + # GPT2Tokenizer has the same problem, so let's be consistent. + self.cache = {} + + self.pat = re.compile(PRETOKENIZE_REGEX) + + self.special_tokens = ( + special_tokens if special_tokens is not None else QWEN2_SPECIAL_TOKENS + ) + self._special_tokens_reversed = {v: k for k, v in self.special_tokens.items()} + + self.unk_id = None if unk_token is None else self.special_tokens[unk_token] + self.bos_id = None if bos_token is None else self.special_tokens[bos_token] + self.eos_id = None if eos_token is None else self.special_tokens[eos_token] + self.pad_id = None if pad_token is None else self.special_tokens[pad_token] + self.im_start_id = self.special_tokens[IM_START] + self.im_end_id = self.special_tokens[IM_END] self.stop_tokens = [self.eos_id, self.im_end_id] + # Tokens trie for special tokens. 
+ self.tokens_trie = Trie() + for special_token in self.special_tokens: + self.tokens_trie.add(special_token) + + def _bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self._bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.unk_id) + def encode( self, text: str, add_bos: bool = True, add_eos: bool = True, **kwargs ) -> List[int]: @@ -81,8 +215,40 @@ def encode( Returns: List[int]: The list of token ids. + + Notes: + This method follows + and + . """ - return self.encode_batch([text], add_bos=add_bos, add_eos=add_eos, **kwargs)[0] + + text = unicodedata.normalize("NFC", text) + + tokens = self.tokens_trie.split(text) + + tokenized_text = [] + for token in tokens: + if not token: + continue + if token in self.special_tokens: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + + # Convert tokenized text to token ids. + token_ids = [] + if add_bos and self.bos_id is not None: + token_ids.append(self.bos_id) + for token in tokenized_text: + if token in self.special_tokens: + token_id = self.special_tokens[token] + else: + token_id = self._convert_token_to_id(token) + token_ids.append(token_id) + if add_eos and self.eos_id is not None: + token_ids.append(self.eos_id) + + return token_ids def encode_batch( self, @@ -101,16 +267,26 @@ def encode_batch( Returns: List[List[int]]: A batch of lists of token ids. 
""" - encodings = self._tokenizer.encode_batch(batch_text) - encoded_token_ids = [] - for encoding in encodings: - encoding_ids = encoding.ids[:] - if add_bos and self.bos_id is not None: - encoding_ids.insert(0, self.bos_id) - if add_eos and self.eos_id is not None: - encoding_ids.append(self.eos_id) - encoded_token_ids.append(encoding_ids) - return encoded_token_ids + batch_token_ids = [] + for text in batch_text: + token_ids = self.encode(text, add_bos=add_bos, add_eos=add_eos, **kwargs) + batch_token_ids.append(token_ids) + return batch_token_ids + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + token = self._special_tokens_reversed.get(index, None) + if token is None: + return self.decoder.get(index) + return token + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + "utf-8", errors=self.errors + ) + return text def decode( self, @@ -128,9 +304,24 @@ def decode( Returns: str: The decoded string. """ - text = self._tokenizer.decode( - token_ids, skip_special_tokens=skip_special_tokens - ) + sub_texts = [] + current_sub_text = [] + for token_id in token_ids: + token = self._convert_id_to_token(token_id) + if token_id in self._special_tokens_reversed: + if current_sub_text: + string = self.convert_tokens_to_string(current_sub_text) + if string: + sub_texts.append(string) + current_sub_text = [] + if not skip_special_tokens: + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + text = "".join(sub_texts) return text def tokenize_messages( @@ -158,15 +349,15 @@ def tokenize_messages( for index, message in enumerate(messages): content = "" if message.role == "system": - content = self.system.format(content=message.content) + content = self.system.format(content=message.text_content) elif message.role == "user": - content = self.user.format(content=message.content) + content = self.user.format(content=message.text_content) elif message.role == "assistant": - if index == len(messages) - 1 and not message.content: + if index == len(messages) - 1 and not message.text_content: content = self.assistant_for_generation is_generation = True else: - content = self.assistant.format(content=message.content) + content = self.assistant.format(content=message.text_content) tokenized_message = self.encode(content, add_bos=False, add_eos=False) tokens.extend(tokenized_message) mask.extend([message.masked] * len(tokenized_message)) diff --git a/torchtune/models/qwen2/_trie.py b/torchtune/models/qwen2/_trie.py new file mode 100644 index 0000000000..7b5bec41e7 --- /dev/null +++ b/torchtune/models/qwen2/_trie.py @@ -0,0 +1,237 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from collections import OrderedDict +from typing import List + +from torchtune.utils.logging import get_logger + +logger = get_logger() + + +class Trie: + """ + Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass + Loose reference https://en.wikipedia.org/wiki/Trie + + This class is copied from . 
+ """ + + def __init__(self): + self.data = {} + self._tokens = set() + + def add(self, word: str): + """ + Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. + The special key `""` is used to represent termination. + + This function is idempotent, adding twice the same word will leave the trie unchanged + + Example: + + ```python + >>> trie = Trie() + >>> trie.add("Hello 友達") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} + + >>> trie.add("Hello") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} + ``` + """ + if not word: + # Prevent empty string + return + + self._tokens.add(word) + ref = self.data + for char in word: + ref[char] = char in ref and ref[char] or {} + ref = ref[char] + ref[""] = 1 + + def split(self, text: str) -> List[str]: + """ + Will look for the words added to the trie within `text`. Output is the original string splitted along the + boundaries of the words found. + + This trie will match the longest possible word first ! + + Example: + + ```python + >>> trie = Trie() + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS] This is a extra_id_100"] + + >>> trie.add("[CLS]") + >>> trie.add("extra_id_1") + >>> trie.add("extra_id_100") + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS]", " This is a ", "extra_id_100"] + ``` + """ + # indexes are counted left of the chars index. + # "hello", index 0, is left of h, index 1 is between h and e. + # index 5 is right of the "o". + + # States are going to capture every possible start (indexes as above) + # as keys, and have as values, a pointer to the position in the trie + # where we're at. This is a partial match for now. + # This enables to keep track of multiple matches while we're iterating + # the string + # If the trie contains, "blowing", and "lower" and we encounter the + # string "blower", we need to split into ["b", "lower"]. + # This is where we need to keep track of multiple possible starts. + states = OrderedDict() + + # This will contain every indices where we need + # to cut. + # We force to cut at offset 0 and len(text) (added later) + offsets = [0] + + # This is used by the lookahead which needs to skip over + # some text where the full match exceeded the place in the initial + # for loop + skip = 0 + # Main loop, Giving this algorithm O(n) complexity + for current, current_char in enumerate(text): + if skip and current < skip: + # Prevents the lookahead for matching twice + # like extra_id_100 and id_100 + continue + + # This will track every state + # that stop matching, we need to stop tracking them. + # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then + # fail on "b", we need to remove 0 from the valid states. + to_remove = set() + # Whenever we found a match, we need to drop everything + # this is a greedy algorithm, it will match on the first found token + reset = False + + # In this case, we already have partial matches (But unfinished) + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. 
+ + # Lookahead to match longest first + # Important in case of extra_id_1 vs extra_id_100 + # Here we are also actively looking for other earlier partial + # matches + # "[CLS]", "L", we need to match CLS even if L is special + for lookstart, looktrie_pointer in states.items(): + if lookstart > start: + # This partial match is later, we can stop looking + break + elif lookstart < start: + # This partial match is earlier, the trie pointer + # was already updated, so index is + 1 + lookahead_index = current + 1 + end = current + 1 + else: + # Here lookstart == start and + # looktrie_pointer == trie_pointer + # It wasn't updated yet so indices are current ones + lookahead_index = current + end = current + next_char = ( + text[lookahead_index] + if lookahead_index < len(text) + else None + ) + if "" in looktrie_pointer: + start = lookstart + end = lookahead_index + skip = lookahead_index + + while next_char in looktrie_pointer: + looktrie_pointer = looktrie_pointer[next_char] + lookahead_index += 1 + if "" in looktrie_pointer: + start = lookstart + end = lookahead_index + skip = lookahead_index + + if lookahead_index == len(text): + # End of string + break + next_char = text[lookahead_index] + # End lookahead + + # Storing and resetting + offsets.append(start) + offsets.append(end) + reset = True + break + elif current_char in trie_pointer: + # The current character being looked at has a match within the trie + # update the pointer (it will be stored back into states later). + trie_pointer = trie_pointer[current_char] + + # Storing back the new pointer into the states. + # Partial matches got longer by one. + states[start] = trie_pointer + else: + # The new character has not match in the trie, we need + # to stop keeping track of this partial match. + # We can't do it directly within the loop because of how + # python iteration works + to_remove.add(start) + + # Either clearing the full start (we found a real match) + # Or clearing only the partial matches that didn't work. + if reset: + states = {} + else: + for start in to_remove: + del states[start] + + # If this character is a starting character within the trie + # start keeping track of this partial match. + if current >= skip and current_char in self.data: + states[current] = self.data[current_char] + + # We have a cut at the end with states. + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + end = len(text) + offsets.append(start) + offsets.append(end) + # Longest cut is always the one with lower start so the first + # item so we need to break. + break + + return self.cut_text(text, offsets) + + def cut_text(self, text, offsets): + # We have all the offsets now, we just need to do the actual splitting. + # We need to eventually add the first part of the string and the eventual + # last part. + offsets.append(len(text)) + tokens = [] + start = 0 + for end in offsets: + if start > end: + logger.error( + "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" + " anyway." 
+ ) + continue + elif start == end: + # This might happen if there's a match at index 0 + # we're also preventing zero-width cuts in case of two + # consecutive matches + continue + tokens.append(text[start:end]) + start = end + + return tokens diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py index 46b8e93b0f..1798f58b4a 100644 --- a/torchtune/modules/__init__.py +++ b/torchtune/modules/__init__.py @@ -8,10 +8,12 @@ from .common_utils import reparametrize_as_dtype_state_dict_post_hook from .feed_forward import FeedForward # noqa from .kv_cache import KVCache # noqa +from .layer_norm import Fp32LayerNorm # noqa from .lr_schedulers import get_cosine_schedule_with_warmup # noqa from .position_embeddings import RotaryPositionalEmbeddings # noqa from .rms_norm import RMSNorm # noqa from .transformer import TransformerDecoder, TransformerDecoderLayer # noqa +from .vision_transformer import VisionTransformer __all__ = [ "CausalSelfAttention", @@ -20,6 +22,8 @@ "KVCache", "RotaryPositionalEmbeddings", "RMSNorm", + "Fp32LayerNorm", + "VisionTransformer", "TransformerDecoder", "TransformerDecoderLayer", "reparametrize_as_dtype_state_dict_post_hook", diff --git a/torchtune/modules/attention.py b/torchtune/modules/attention.py index 7155c21642..f3aead3f3c 100644 --- a/torchtune/modules/attention.py +++ b/torchtune/modules/attention.py @@ -12,7 +12,7 @@ class CausalSelfAttention(nn.Module): """Multi-headed grouped query self-attention (GQA) layer introduced - in https://arxiv.org/pdf/2305.13245v1.pdf. + in https://arxiv.org/abs/2305.13245v1. GQA is a version of multiheaded attention (MHA) which uses fewer key/value heads than query heads by grouping n query heads for each diff --git a/torchtune/modules/layer_norm.py b/torchtune/modules/layer_norm.py new file mode 100644 index 0000000000..12eeef2f94 --- /dev/null +++ b/torchtune/modules/layer_norm.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + + +from typing import Any + +from torch import nn, Tensor + + +class Fp32LayerNorm(nn.LayerNorm): + """ + Wrapper around nn.functional.layer_norm to support mixed-precision training. + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (torch.Tensor): Input tensor. + Returns: + torch.Tensor: The normalized output tensor. + """ + output = nn.functional.layer_norm( + x.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ) + return output.type_as(x) diff --git a/torchtune/modules/low_precision/_register_nf4_dispatch_ops.py b/torchtune/modules/low_precision/_register_nf4_dispatch_ops.py index 8c85d928f2..b057a1fbc3 100644 --- a/torchtune/modules/low_precision/_register_nf4_dispatch_ops.py +++ b/torchtune/modules/low_precision/_register_nf4_dispatch_ops.py @@ -4,14 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
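Stepping back briefly to the `Fp32LayerNorm` added above: it runs the normalization in fp32 and casts the result back to the input dtype, which keeps layer norm stable under bf16 training. A minimal sanity check of that behavior, assuming the module is imported via the `torchtune.modules` export added in this patch:

```python
import torch
from torchtune.modules import Fp32LayerNorm

ln = Fp32LayerNorm(64)
x = torch.randn(2, 16, 64, dtype=torch.bfloat16)
y = ln(x)                          # the normalization itself runs in float32
assert y.dtype == torch.bfloat16   # output is cast back to the input dtype
```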
-from importlib.metadata import PackageNotFoundError, version - import torch from torchao.dtypes.nf4tensor import implements as nf4_tensor_impl, to_nf4 - - -def is_fbcode(): - return not hasattr(torch.version, "git_version") +from torchtune.modules.low_precision._utils import _get_torchao_version @nf4_tensor_impl([torch.ops.aten.clone.default]) @@ -26,17 +21,12 @@ def clone(func, *args, **kwargs): should_define_inplace_copy = True -if not is_fbcode(): - try: - ao_version = version("torchao") - should_define_inplace_copy = ao_version < "0.2.0" - # For importlib metadata, need to check nightly separately - except PackageNotFoundError: - ao_version = version("torchao-nightly") - should_define_inplace_copy = ao_version < "2024.5.20" - except Exception as e: - raise PackageNotFoundError("Could not find torchao version") from e - +ao_version, is_nightly = _get_torchao_version() +if ao_version: + if (is_nightly and ao_version >= "2024.5.20") or ( + not is_nightly and ao_version >= "0.2.0" + ): + should_define_inplace_copy = False if should_define_inplace_copy: # TorchAO have `NF4.copy_` starting from `0.2.0` diff --git a/torchtune/modules/low_precision/_utils.py b/torchtune/modules/low_precision/_utils.py new file mode 100644 index 0000000000..60ddf0b7e7 --- /dev/null +++ b/torchtune/modules/low_precision/_utils.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from datetime import datetime +from importlib.metadata import PackageNotFoundError, version +from typing import Optional, Tuple + +import torch + +import torchao + + +def _is_fbcode(): + return not hasattr(torch.version, "git_version") + + +def _nightly_version_ge(ao_version_str: str, date: str) -> bool: + """ + Compare a torchao nightly version to a date of the form + %Y-%m-%d. + + Returns True if the nightly version is greater than or equal to + the date, False otherwise + """ + ao_datetime = datetime.strptime(ao_version_str.split("+")[0], "%Y.%m.%d") + return ao_datetime >= datetime.strptime(date, "%Y-%m-%d") + + +def _get_torchao_version() -> Tuple[Optional[str], Optional[bool]]: + """ + Get torchao version. Returns a tuple of two elements, the first element + is the version string, the second element is whether it's a nightly version. + For fbcode usage, return None, None. + + Checks: + 1) is_fbcode, then + 2) importlib's version(torchao-nightly) for nightlies, then + 3) torchao.__version__ (only defined for torchao >= 0.3.0), then + 4) importlib's version(torchao) for non-nightly + + + If none of these work, raise an error. 
+ + """ + if _is_fbcode(): + return None, None + # Check for nightly install first + try: + ao_version = version("torchao-nightly") + is_nightly = True + except PackageNotFoundError: + try: + ao_version = torchao.__version__ + is_nightly = False + except AttributeError: + ao_version = "unknown" + if ao_version == "unknown": + try: + ao_version = version("torchao") + is_nightly = False + except Exception as e: + raise PackageNotFoundError("Could not find torchao version") from e + return ao_version, is_nightly diff --git a/torchtune/modules/position_embeddings.py b/torchtune/modules/position_embeddings.py index 193a7c652b..bf94eb0c96 100644 --- a/torchtune/modules/position_embeddings.py +++ b/torchtune/modules/position_embeddings.py @@ -18,7 +18,7 @@ class RotaryPositionalEmbeddings(nn.Module): Reference implementation (used for correctness verfication) can be found here: - https://github.com/facebookresearch/llama/blob/main/llama/model.py#L450 + https://github.com/meta-llama/llama/blob/main/llama/model.py#L80 In this implementation we cache the embeddings for each position upto ``max_seq_len`` by computing this during init. diff --git a/torchtune/modules/rms_norm.py b/torchtune/modules/rms_norm.py index 9f23de846d..a2e4e2a7df 100644 --- a/torchtune/modules/rms_norm.py +++ b/torchtune/modules/rms_norm.py @@ -12,7 +12,7 @@ class RMSNorm(nn.Module): """ Implements Root Mean Square Normalization introduced in - https://arxiv.org/pdf/1910.07467.pdf. + https://arxiv.org/abs/1910.07467. Reference implementation (used for correctness verfication) can be found here: diff --git a/torchtune/modules/transformer.py b/torchtune/modules/transformer.py index dee98065e9..28fd6f9481 100644 --- a/torchtune/modules/transformer.py +++ b/torchtune/modules/transformer.py @@ -7,6 +7,7 @@ from typing import Optional import torch +import torch.nn.functional as F from torch import nn, Tensor from torchtune.modules import CausalSelfAttention, KVCache @@ -245,3 +246,121 @@ def forward( # shape: [b, s, out_dim] - out_dim is usually the vocab size output = self.output(h).float() return output + + +class TiedEmbeddingTransformerDecoder(TransformerDecoder): + """ + Transformer Decoder with tied embedding weight. A key difference between + this class and :class:`~torchtune.modules.TransformerDecoder` + is that the output projection is replaced with token embeddings weights. + + Args: + tok_embeddings (nn.Embedding): PyTorch embedding layer, to be used to move + tokens to an embedding space. + layer (TransformerDecoderLayer): Transformer Decoder layer. + num_layers (int): Number of Transformer Decoder layers. + max_seq_len (int): maximum sequence length the model will be run with, as used + by :func:`~torchtune.modules.KVCache` + num_heads (int): number of query heads. For MHA this is also the + number of heads for key and value. This is used to setup the + :func:`~torchtune.modules.KVCache` + head_dim (int): embedding dimension for each head in self-attention. This is used + to setup the :func:`~torchtune.modules.KVCache` + norm (nn.Module): Callable that applies normalization to the output of the decoder, + before final MLP. + + Note: + Arg values are checked for correctness (eg: ``attn_dropout`` belongs to [0,1]) + in the module where they are used. This helps reduces the number of raise + statements in code and improves readability. 
+ """ + + def __init__( + self, + tok_embeddings: nn.Embedding, + layer: TransformerDecoderLayer, + num_layers: int, + max_seq_len: int, + num_heads: int, + head_dim: int, + norm: nn.Module, + ) -> None: + super().__init__( + tok_embeddings=tok_embeddings, + layer=layer, + num_layers=num_layers, + max_seq_len=max_seq_len, + num_heads=num_heads, + head_dim=head_dim, + norm=norm, + output=None, + ) + + def forward( + self, + tokens: Tensor, + *, + mask: Optional[Tensor] = None, + input_pos: Optional[Tensor] = None, + ) -> Tensor: + """ + Args: + tokens (Tensor): input tensor with shape [b x s] + mask (Optional[Tensor]): Optional boolean tensor which contains the attention mask + with shape [b x s x s]. This is applied after the query-key multiplication and + before the softmax. A value of True in row i and column j means token i attends + to token j. A value of False means token i does not attend to token j. If no + mask is specified, a causal mask is used by default. Default is None. + input_pos (Optional[Tensor]): Optional tensor which contains the position ids + of each token. During training, this is used to indicate the positions + of each token relative to its sample when packed, shape [b x s]. + During inference, this indicates the position of the current token. + If none, assume the index of the token is its position id. Default is None. + + Note: At the very first step of inference, when the model is provided with a prompt, + ``input_pos`` would contain the positions of all of the tokens in the prompt + (eg: ``torch.arange(prompt_length)``). This is because we will need to compute the + KV values for each position. + + Returns: + Tensor: output tensor with shape [b x s x v] + + Raises: + ValueError: if causal_mask is set but input_pos is None + + Notation used for tensor shapes: + - b: batch size + - s: sequence length + - v: vocab size + - d: embed dim + - m_s: max seq len + """ + # input tensor of shape [b, s] + bsz, seq_len = tokens.shape + + # shape: [b, s, d] + h = self.tok_embeddings(tokens) + + if self.causal_mask is not None: + if input_pos is None: + raise ValueError( + "Caches are setup, but the position of input token is missing" + ) + if mask is not None: + raise ValueError( + "An attention mask was set. Cannot use a non-causal mask for inference" + ) + # shape: [1, input_pos_len, m_s] + # in most cases input_pos_len should be 1 + mask = self.causal_mask[None, input_pos] + + for layer in self.layers: + # shape: [b, s, d] + h = layer(h, mask=mask, input_pos=input_pos) + + # shape: [b, s, d] + h = self.norm(h) + + # shape: [b, s, out_dim] - out_dim is usually the vocab size + output = F.linear(h, self.tok_embeddings.weight).float() + return output diff --git a/torchtune/modules/transforms/__init__.py b/torchtune/modules/transforms/__init__.py new file mode 100644 index 0000000000..c317e7d7ce --- /dev/null +++ b/torchtune/modules/transforms/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from torchtune.modules.transforms._transforms import Transform, VisionCrossAttentionMask +from torchtune.modules.transforms.vision_utils.get_canvas_best_fit import ( # noqa + find_supported_resolutions, + get_canvas_best_fit, +) +from torchtune.modules.transforms.vision_utils.resize_with_pad import ( # noqa + resize_with_pad, +) +from torchtune.modules.transforms.vision_utils.tile_crop import tile_crop # noqa + +__all__ = [ + "Transform", + "get_canvas_best_fit", + "resize_with_pad", + "tile_crop", + "find_supported_resolutions", + "VisionCrossAttentionMask", +] diff --git a/torchtune/modules/transforms/_transforms.py b/torchtune/modules/transforms/_transforms.py new file mode 100644 index 0000000000..68142686fb --- /dev/null +++ b/torchtune/modules/transforms/_transforms.py @@ -0,0 +1,165 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Any, List, Mapping, Protocol + +import torch + + +class Transform(Protocol): + """ + Loose interface for all data and model transforms. Transforms operate at the + sample level and perform operations on a sample dict, returning the updated dict. + """ + + def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]: + pass + + +class VisionCrossAttentionMask(Transform): + """ + Computes the cross-attention mask for text + image inputs. Text tokens that + participate in cross-attention with an image token will show True in the mask + and follow the interleaved structure laid out in Fig. 7 of the Flamingo paper + (https://arxiv.org/pdf/2204.14198): + + (1) Text tokens immediately following the image token up until the next image token + (2) Consecutive image tokens attend to subsequent text tokens + + :: + + ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ + img1 │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ │ │ │ │ │ │ │ │ │ + └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ + ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ + img2 │ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ │ │ │ │ │ │ │ │ │ + └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ + ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ ┌───┐ + img3 │ │ │ │ │ │ │ │ │ │ │ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ │ ■ │ + └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ └───┘ + These are two dogs. This is a cat. + + + + Resultant mask is constructed per image and is of shape (text_seq_len, image_seq_len), + where True indicates that the token outputted from the image encoder attends + to the token in the text sequence in cross-attention. A list of these masks + are returned with length equal to number of images in the sample. + + Args: + tile_size (int): The size of the image tiles from the image transform + patch_size (int): The size of each patch. Used to divide the tiles into patches. + E.g. for patch_size = 40, a tile of shape (400, 400) will have 10x10 grid of patches + with shape (40, 40) each. + image_token_id (int): Token ID of the image special token. 
+ """ + + def __init__(self, tile_size: int, patch_size: int, image_token_id: int): + patch_grid_size = tile_size // patch_size + self.patches_per_tile = patch_grid_size**2 + self.image_token_id = image_token_id + + def _get_image_attention_intervals(self, tokens: List[int]) -> List[List[int]]: + """ + Returns a list of lists of the form [start, end) where start is the index + of the current image token and end is the index of the next image token, exclusive. + + Args: + tokens (List[int]): List of token IDs in the text sequence + + Returns: + List[List[int]]: List of lists of the form [start, end) indicating + range of positions in text sequence that should attend to the image + + Example: + >>> text = "These are two dogs. This is a cat." + >>> image_token_id = 1 + >>> tokens = [1, 1, 9673, 527, 1403, 12875, 13, 1, 1115, 374, 264, 8415] + >>> transform = VisionCrossAttentionMask(tile_size=400, patch_size=40, image_token_id=1) + >>> intervals = transform._get_image_attention_intervals(tokens) + >>> print(intervals) + [[0, 7], [1, 7], [7, 12]] + """ + end = len(tokens) + vision_token_locations = [ + i for i, token in enumerate(tokens) if token == self.image_token_id + ] + # Return empty list if there are no images + if len(vision_token_locations) == 0: + return [] + # If there is only one image, it will attend to subsequent text until end + if len(vision_token_locations) == 1: + return [[vision_token_locations[0], end]] + + # Construct intervals from previous image token to next image token + vision_masks = [ + [tok_idx_prev, tok_idx_next] + # Offset by one to get consecutive indices + for tok_idx_prev, tok_idx_next in zip( + vision_token_locations[:-1], vision_token_locations[1:] + ) + ] + # Last image will attend to subsequent text until end + vision_masks.append([vision_token_locations[-1], end]) + + # If there are consecutive vision tokens, they should all attend to the + # same subsequent text + last_mask_end = vision_masks[-1][1] + for vision_mask in vision_masks[::-1]: + if vision_mask[0] == vision_mask[1] - 1: + vision_mask[1] = last_mask_end + last_mask_end = vision_mask[1] + return vision_masks + + def __call__(self, sample: Mapping[str, Any]) -> Mapping[str, Any]: + """ + Generates the vision cross-attention mask for the given sample based on + the image token locations interleaved in the text sequence. + + Args: + sample (Mapping[str, Any]): Sample dict containing the following keys: + - tokens (List[int]): List of token IDs in the text sequence. Number of + image token IDs in the sequence must match the number of images. + - images (List[torch.Tensor]): List of image Tensors post-tiling of shape + (n_tiles, c, h, w) each. + + Returns: + Mapping[str, Any]: updated sample with the following keys: + - encoder_mask (List[torch.Tensor]): list of masks with shape (text_seq_len, image_seq_len), + where length of list == number of images in sample + - tokens (List[int]): original tokens + - images (List[torch.Tensor]): original images + """ + tokens, images = sample["tokens"], sample["images"] + # One sample can have multiple images - verify the number of image tokens + # is the same + n_img = len(images) + intervals = self._get_image_attention_intervals(tokens) + if len(intervals) != n_img: + raise RuntimeError( + f"The number of image tokens ({len(intervals)}) does not match the number of images ({n_img})." + ) + + # Create mask for each individual image based on its number of tokens, + # which can vary based on number of tiles since they are not yet tile padded. 
+ # The masks are padded and concatenated together in the batch collator + text_seq_len = len(tokens) + masks = [] + for image_num, interval in enumerate(intervals): + # Identify what part of text sequence should be attended + start, end = interval + # Compute this image's number of tokens based on num tiles, patches per tile + n_tiles = images[image_num].shape[0] + image_seq_len = n_tiles * (self.patches_per_tile + 1) # +1 for CLS token + # Mask will be block of 1s at the corresponding interval in the text. + # It is not a causal block because all the image tokens correspond + # to a single image, so text tokens attend to all the image's tokens + mask = torch.zeros(text_seq_len, image_seq_len, dtype=torch.bool) + mask[start:end, :] = True + masks.append(mask) + + sample.update({"encoder_mask": masks}) + return sample diff --git a/torchtune/modules/transforms/vision_utils/__init__.py b/torchtune/modules/transforms/vision_utils/__init__.py new file mode 100644 index 0000000000..2e41cd717f --- /dev/null +++ b/torchtune/modules/transforms/vision_utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. diff --git a/torchtune/modules/transforms/vision_utils/get_canvas_best_fit.py b/torchtune/modules/transforms/vision_utils/get_canvas_best_fit.py new file mode 100644 index 0000000000..77eabc50a9 --- /dev/null +++ b/torchtune/modules/transforms/vision_utils/get_canvas_best_fit.py @@ -0,0 +1,179 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from collections import defaultdict +from typing import List, Set, Tuple + +import torch + +logger = logging.getLogger(__name__) + + +def get_canvas_best_fit( + image: torch.Tensor, possible_resolutions: torch.Tensor, resize_to_max_canvas: bool +) -> Tuple[int, int]: + """ + Determines the best canvas possible from a list of possible resolutions to + resize an image to, without distortion. + + For each possible resolution, calculates the scaling factors for + width and height, and selects the smallest one, which is the limiting side. + E.g. if to match a canvas shape you have to upscale an image's height by 2x, and width by 1.5x, + then the maximum upscaling without distortion is min(2, 1.5) = 1.5. + + If there are multiple canvases that satisfy the conditions, + we pick the one with the lowest area to minimize padding. + + Args: + image (torch.Tensor): The image we want to fit into a canvas. + possible_resolutions (torch.Tensor): A tensor of shape (N, 2) where each + row represents a possible canvas. + resize_to_max_canvas (bool): If True, pick the canvas that allows maximum scaling. + If False, pick the canvas that minimizes downscaling, including no downscaling at all. + + Returns: + Tuple[int, int]: The best resolution to fit the image into. + + Examples: + >>> image = torch.rand(3, 200, 300) + >>> possible_resolutions = torch.tensor([ + ... [224, 672], + ... [672, 224], + ... [224, 448], + ... [448, 224], + ... [224, 224] + ... 
]) + >>> get_canvas_best_fit(image, possible_resolutions, resize_to_max_canvas=False) + (224, 448) + + In the example above, we calculate the scaling factors for each possible resolution + + >>> scale_height = torch.tensor([1.1200, 3.3600, 1.1200, 2.2400, 1.1200]) + >>> scale_width = torch.tensor([2.2400, 0.7467, 1.4933, 0.7467, 0.7467]) + >>> scales = torch.tensor([1.1200, 0.7467, 1.1200, 0.7467, 0.7467]) + + Two options have scaling_factor > 1, since resize_to_max_canvas is False, we pick the smallest + + >>> upscaling_options = torch.tensor([1.1200, 1.1200]) + >>> selected_scale = torch.tensor(1.1200) + + There are two possible options, so we pick the one with the smallest area + + >>> areas = torch.tensor([150528, 100352]) # for resolutions [672, 224] and [224, 448], respectively + >>> optimal_canvas = torch.tensor([224, 448]) # resolution with the smallest area + """ + + original_height, original_width = image.shape[-2:] + + # possible resolutions heights/widths + target_heights, target_widths = ( + possible_resolutions[:, 0], + possible_resolutions[:, 1], + ) + + # scaling factors to resize the image without distortion + scale_w = target_widths / original_width + scale_h = target_heights / original_height + + # get limiting side scaling -> no distortion + scales = torch.where(scale_w > scale_h, scale_h, scale_w) + + # filter only scales that allow upscaling + upscaling_options = scales[scales >= 1] + if len(upscaling_options) > 0: + if resize_to_max_canvas: + selected_scale = torch.max(upscaling_options) + else: + selected_scale = torch.min(upscaling_options) + else: + # no upscaling possible, + # get the minimum downscaling (max scale for scales<1) + downscaling_options = scales[scales < 1] + selected_scale = torch.max(downscaling_options) + + # get all resolutions that support this scaling factor, + # e.g. you can upscale to 224x224, 224x448, 224x672 without distortion + chosen_canvas = possible_resolutions[scales == selected_scale] + + # if there are multiple resolutions, + # get the one with minimum area to reduce padding + if len(chosen_canvas) > 1: + areas = chosen_canvas[:, 0] * chosen_canvas[:, 1] + optimal_idx = torch.argmin(areas) + optimal_canvas = chosen_canvas[optimal_idx] + else: + optimal_canvas = chosen_canvas[0] + + return tuple(optimal_canvas.tolist()) + + +def find_supported_resolutions( + max_num_tiles: int, tile_size: int +) -> List[Tuple[int, int]]: + """ + Computes all combinations of resolutions, multiple of tile_size, + that contain up to max_num_tiles. Useful for when dividing an image into tiles. + + For example, if we want at most 2 tiles per image, then we can support the + following resolutions: (1x1, 1x2, 2x1) * tile_size + + Args: + max_num_tiles (int): Maximum number of tiles. + tile_size (int): Size of the side of the tile. + + Returns: + List[Tuple[int, int]]: List of possible resolutions as tuples (height, width). 
+ + Examples: + + >>> max_num_tiles = 4 + >>> tile_size = 224 + >>> find_supported_resolutions(max_num_tiles, tile_size) + [(224, 896), (448, 448), (224, 224), (896, 224), (224, 672), (672, 224), (224, 448), (448, 224)] + """ + + # create dictionary {aspect_ratio: [resolution1, ..., resolution n]} + # example {0.25: [(1,4)], 1.0: [(2,2), (1,1)], 4.0: [(4,1)]} + asp_dict = defaultdict(list) + for _tile_size in range(max_num_tiles, 0, -1): + factors = sorted(_get_factors(_tile_size)) + asp_ratios = [(factor, _tile_size // factor) for factor in factors] + for height, width in asp_ratios: + ratio_float = height / width + asp_dict[ratio_float].append((height, width)) + + # get the resolutions multiplied by the tile_size + possible_resolutions = [] + for ar, resolution in asp_dict.items(): + for height, width in resolution: + possible_resolutions.append((height * tile_size, width * tile_size)) + + return possible_resolutions + + +def _get_factors(n: int) -> Set[int]: + """ + Calculate all factors of a given number, i.e. a divisor that leaves no remainder. + + Args: + n (int): The number to find factors for. + + Returns: + set: A set containing all factors of the number. + + Examples: + >>> _get_factors(n=12) + {1, 2, 3, 4, 6, 12} + """ + factors_set = set() + + for i in range(1, int(n**0.5) + 1): + if n % i == 0: + factors_set.add(i) + factors_set.add(n // i) + return factors_set diff --git a/torchtune/modules/transforms/vision_utils/resize_with_pad.py b/torchtune/modules/transforms/vision_utils/resize_with_pad.py new file mode 100644 index 0000000000..853402557c --- /dev/null +++ b/torchtune/modules/transforms/vision_utils/resize_with_pad.py @@ -0,0 +1,170 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import math +from typing import Optional, Tuple + +import torch + +import torchvision +from torchvision.transforms.v2 import functional as F + +logger = logging.getLogger(__name__) + + +def resize_with_pad( + image: torch.Tensor, + target_size: Tuple[int, int], + resample: torchvision.transforms.InterpolationMode, + max_upscaling_size: Optional[int] = None, +) -> torch.Tensor: + """ + Resizes and pads an image to target_size without causing distortion. + The user can set max_upscaling_size to limit upscaling when target_size exceeds image_size. + + Args: + image (torch.Tensor): The input image tensor in the format [..., H, W]. + target_size (Tuple[int, int]): The desired resolution to fit the image into in the format [height, width]. + resample (torchvision.transforms.InterpolationMode): Resampling method used when resizing images. + Supports torchvision.transforms.InterpolationMode.NEAREST, InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR and InterpolationMode.BICUBIC. + max_upscaling_size (int): The maximum size to upscale the image to. + If None, will upscale up to target_size. + + Returns: + torch.Tensor: The resized and padded image tensor in the format [..., H, W]. + + Examples: + + Example 1: The image will be upscaled from (300, 800) to (448, 1194), since 448 is the limiting side, + and then padded from (448, 1194) to (448, 1344). 
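Tying the two canvas helpers above together (a hedged sketch; the image size is arbitrary): enumerate the resolutions supported for a given tile budget, then pick the best-fitting canvas for a concrete image.

```python
import torch
from torchtune.modules.transforms import find_supported_resolutions, get_canvas_best_fit

possible = torch.tensor(find_supported_resolutions(max_num_tiles=4, tile_size=224))

image = torch.rand(3, 300, 800)  # (C, H, W)
get_canvas_best_fit(image, possible, resize_to_max_canvas=False)
# -> (224, 672): no canvas allows distortion-free upscaling here, so the
#    least-downscaled option with the smallest area is selected.
```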
+ + >>> max_upscaling_size = None + >>> image = torch.rand([3, 300, 800]) + >>> target_size = (448, 1344) + >>> resample = torchvision.transforms.InterpolationMode.BILINEAR + >>> output = resize_with_pad(image, target_size, resample, max_upscaling_size) + + Example 2: The image will stay as is, since 800 > 600, and then padded from (300, 800) to (448, 1344). + + >>> max_upscaling_size = 600 + >>> image = torch.rand([3, 300, 800]) + >>> target_size = (448, 1344) + >>> resample = torchvision.transforms.InterpolationMode.BILINEAR + >>> output = resize_with_pad(image, target_size, resample, max_upscaling_size) + + Example 3: The image will be downscaled from (500, 1000) to (224, 448), + and padded from (224, 448) to (448, 448). + + >>> max_upscaling_size = 600 + >>> image = torch.rand([3, 500, 1000]) + >>> target_size = (448, 488) + >>> resample = torchvision.transforms.InterpolationMode.BILINEAR + >>> output = resize_with_pad(image, target_size, resample, max_upscaling_size) + + """ + + image_height, image_width = image.shape[-2:] + image_size = (image_height, image_width) + + # If target_size requires upscaling, we might want to limit the upscaling to max_upscaling_size + if max_upscaling_size is not None: + new_target_height = min(max(image_height, max_upscaling_size), target_size[0]) + new_target_width = min(max(image_width, max_upscaling_size), target_size[1]) + target_size_resize = (new_target_height, new_target_width) + else: + target_size_resize = target_size + + # resize to target_size while preserving aspect ratio + new_size_preserving_aspect_ratio = _get_max_res_without_distortion( + image_size=image_size, + target_size=target_size_resize, + ) + + image = F.resize( + inpt=image, + size=list(new_size_preserving_aspect_ratio), + interpolation=resample, + antialias=True, + ) + + image = _pad_image_top_left(image=image, target_size=target_size) + + return image + + +def _pad_image_top_left( + image: torch.Tensor, + target_size: Tuple[int, int], +) -> torch.Tensor: + """ + Places the image at the top left of the canvas and pads with 0 the right and bottom + to fit to the target resolution. If target_size < image_size, it will crop the image. + + Args: + image (torch.Tensor): The input image tensor in the format [..., H, W]. + target_size (Tuple[int, int]): The desired resolution to fit the image into in the format [height, width]. + + Returns: + torch.Tensor: The padded image tensor in the format [..., H, W]. + """ + + image_size = image.shape[-2:] + + height, width = image_size + target_height, target_width = target_size + + pad_x = target_width - width + pad_y = target_height - height + + padding = [0, 0, pad_x, pad_y] + return F.pad(inpt=image, padding=padding) + + +def _get_max_res_without_distortion( + image_size: Tuple[int, int], + target_size: Tuple[int, int], +) -> Tuple[int, int]: + + """ + Determines the maximum resolution to which an image can be resized to without distorting its + aspect ratio, based on the target resolution. + + For example, if image_size = (200,400) and target_size = (600,800), + scale_h = 600/200 = 3 + scale_w = 800/400 = 2 + So the maximum that we can upscale without distortion is min(scale_h, scale_w) = 2 + + Since scale_w is the limiting side, then new_w = target_w, and new_h = old_h*scale_w + + Args: + image_size (Tuple[int, int]): The original resolution of the image. + target_size (Tuple[int, int]): The desired resolution to fit the image into. + Returns: + Tuple[int, int]: The optimal dimensions to which the image should be resized. 
+ Examples: + >>> _get_max_res_without_distortion([200, 300], target_size = (450, 200)) + (133, 200) + >>> _get_max_res_without_distortion([800, 600], target_size = (450, 1300)) + (450, 337) + """ + + original_height, original_width = image_size + target_height, target_width = target_size + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.floor(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.floor(original_width * scale_h), target_width) + + return new_height, new_width diff --git a/torchtune/modules/transforms/vision_utils/tile_crop.py b/torchtune/modules/transforms/vision_utils/tile_crop.py new file mode 100644 index 0000000000..17e173c3f7 --- /dev/null +++ b/torchtune/modules/transforms/vision_utils/tile_crop.py @@ -0,0 +1,59 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch + +logger = logging.getLogger(__name__) + + +def tile_crop(image: torch.Tensor, tile_size: int) -> torch.Tensor: + """ + Divides a tensor into equally sized tiles. The tensor should be divisible by tile_size. + + Args: + image (torch.Tensor): Input image to crop into tiles. + tile_size (int): Size of each tile. + + Returns: + torch.Tensor: Tensor of shape [num_tiles, channel_size, tile_size, tile_size] + + Examples: + >>> image = torch.rand(3, 200, 300) + >>> tiles = tile_crop(image, tile_size=50) + >>> tiles.shape # 4x6 = 24 tiles + torch.Size([24, 3, 50, 50]) + + >>> image = torch.rand(3, 400, 600) + >>> tiles = tile_crop(image, tile_size=200) + >>> tiles.shape # 2x3 = 6 tiles + torch.Size([6, 3, 200, 200]) + """ + + channel_size, height, width = image.shape + + # assert sizes are divisible + assert ( + height % tile_size == 0 and width % tile_size == 0 + ), f"Image size {height}x{width} is not divisible by tile size {tile_size}" + + # Reshape to split height and width into tile_size blocks + tiles_height = height // tile_size + tiles_width = width // tile_size + + reshaped = image.view(channel_size, tiles_height, tile_size, tiles_width, tile_size) + + # Transpose to bring tiles together + # We want [tiles_height, tiles_width, channel_size, tile_size, tile_size] + transposed = reshaped.permute(1, 3, 0, 2, 4) + + # Flatten the tiles + tiles = transposed.contiguous().view( + tiles_height * tiles_width, channel_size, tile_size, tile_size + ) + + return tiles diff --git a/torchtune/modules/vision_transformer.py b/torchtune/modules/vision_transformer.py new file mode 100644 index 0000000000..1228fe67d2 --- /dev/null +++ b/torchtune/modules/vision_transformer.py @@ -0,0 +1,462 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple + +import torch +from torch import nn + +from torchtune.modules import Fp32LayerNorm +from torchtune.modules.transformer import _get_clones + + +class VisionTransformer(nn.Module): + """ + Implementation of the ViT architecture (https://arxiv.org/abs/2010.11929), + with support for tile-cropped images, outputting of hidden layers and optional CLS projection. 
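Before the ViT internals, a hedged sketch of how the preprocessing utilities above compose: fit the image to a canvas, resize and pad it without distortion, then cut it into the tiles the transformer consumes. The canvas value continues the earlier best-fit example.

```python
import torch
import torchvision
from torchtune.modules.transforms import resize_with_pad, tile_crop

image = torch.rand(3, 300, 800)
canvas = (224, 672)  # best fit found in the previous sketch

resized = resize_with_pad(
    image,
    target_size=canvas,
    resample=torchvision.transforms.InterpolationMode.BILINEAR,
)
tiles = tile_crop(resized, tile_size=224)
tiles.shape  # torch.Size([3, 3, 224, 224]): a 1x3 grid of 224-px tiles
```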
+ + ViT is a transformer architecture that takes in images and outputs N embedded tokens that + represent this image. Each image is divided into **patches** by a convolution. + These patches are flattened and subsequently treated as **tokens** by the transformer. + + To further enhance the performance of ViT and avoid downscaling images, we support tile-cropped images, + which are images divided into **tiles** during the preprocessing stage. For example, instead of + downscaling an 800x400 image to fit 400x400, we may crop it into two 400x400 tiles, + if the ``tile_size=400``. For details on preprocessing, please refer to + :class:`torchtune.models.clip._transforms.CLIPImageTransform`. + + Each of these tiles is further broken down into patches by a convolution operation. For example, if + your ``patch_size=40``, then each (400, 400) tile will become a grid of 10x10 patches, and your whole image will have + num_tiles * n_tokens -> num_tiles * (10x10 patches + 1 CLS token) -> num_tiles * 101. + + Before the transformer layers, a CLS token is added to each tile as the first token. + In transformers, a token called CLS is a special token that is added to the beginning of each sequence. + This token can be used to represent the whole input, instead of using a pooling operation, for example. + + To help the model "see" the whole image, we use positional embeddings. If your image + was tile-cropped, then you need to use tile positional embeddings: + + - token_pos_embedding (tiled): :class:`torchtune.models.clip._position_embeddings.TiledTokenPositionalEmbedding` + - pre_tile_pos_embed: :class:`torchtune.models.clip._position_embeddings.TilePositionalEmbedding` + - post_tile_pos_embed: :class:`torchtune.models.clip._position_embeddings.TilePositionalEmbedding` + + Otherwise, pre and post tile_pos_embed should be None and all you need is a simple + token positional embedding: + + - token_pos_embedding (not tiled): :class:`torchtune.models.clip._position_embeddings.TokenPositionalEmbedding` + + All images will be considered as a stack of tiles, even if your image was not tile-cropped. In such cases, + your image would be composed of a single tile. + + In summary: + + 1) An image is broken down into tiles during preprocessing. + 2) In the ViT, the tiles will be broken down into patches. + 3) The patches will be flattened and transformed. We call them tokens, because that's how the transformer sees them. + + + Image: shape (8x8) + + .. code-block:: text + + | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | + | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | + | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | + | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | + | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | + | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | + | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | + | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | + + Tiles: shape (4,4,4) # (num_tiles, tile_size, tile_size) + + .. code-block:: text + + | 1 | 2 | 3 | 4 | | 5 | 6 | 7 | 8 | + | 9 | 10 | 11 | 12 | | 13 | 14 | 15 | 16 | + | 17 | 18 | 19 | 20 | | 21 | 22 | 23 | 24 | + | 25 | 26 | 27 | 28 | | 29 | 30 | 31 | 32 | + + | 33 | 34 | 35 | 36 | | 37 | 38 | 39 | 40 | + | 41 | 42 | 43 | 44 | | 45 | 46 | 47 | 48 | + | 49 | 50 | 51 | 52 | | 53 | 54 | 55 | 56 | + | 57 | 58 | 59 | 60 | | 61 | 62 | 63 | 64 | + + Patches: shape (4,4,2,2) # (num_tiles, num_patches_per_tile, patch_size, patch_size) + + .. 
code-block:: text + + | 1 | 2 | | 3 | 4 | | 5 | 6 | | 7 | 8 | + | 9 | 10 | | 11 | 12 | | 13 | 14 | | 15 | 16 | + + | 17 | 18 | | 19 | 20 | | 21 | 22 | | 23 | 24 | + | 25 | 26 | | 27 | 28 | | 29 | 30 | | 31 | 32 | + + | 33 | 34 | | 35 | 36 | | 37 | 38 | | 39 | 40 | + | 41 | 42 | | 43 | 44 | | 45 | 46 | | 47 | 48 | + + | 49 | 50 | | 51 | 52 | | 53 | 54 | | 55 | 56 | + | 57 | 58 | | 59 | 60 | | 61 | 62 | | 63 | 64 | + + token: shape (4, 4, 4) # (num_tiles, num_patches_per_tile, emb_dim) + + .. code-block:: text + + | 1 | 2 | 9 | 10 | | 3 | 4 | 11 | 12 | | 17 | 18 | 25 | 26 | | 19 | 20 | 27 | 28 | + | ... continuation of data ... + | ... continuation of data ... + | 37 | 38 | 45 | 46 | | 39 | 40 | 47 | 48 | | 53 | 54 | 61 | 62 | | 55 | 56 | 63 | 64 | + + For the positional embeddings: + + Same for every tile, different for every token. + + - :class:`torchtune.models.clip._position_embeddings.TokenPositionalEmbedding` + - :class:`torchtune.models.clip._position_embeddings.TiledTokenPositionalEmbedding` + + .. code-block:: text + + | 1 | 2 | 3 | 4 | | 1 | 2 | 3 | 4 | + | 9 | 10 | 11 | 12 | | 9 | 10 | 11 | 12 | + | 17 | 18 | 19 | 20 | | 17 | 18 | 19 | 20 | + | 25 | 26 | 27 | 28 | | 25 | 26 | 27 | 28 | + + | 1 | 2 | 3 | 4 | | 1 | 2 | 3 | 4 | + | 9 | 10 | 11 | 12 | | 9 | 10 | 11 | 12 | + | 17 | 18 | 19 | 20 | | 17 | 18 | 19 | 20 | + | 25 | 26 | 27 | 28 | | 25 | 26 | 27 | 28 | + + Different for every tile, different for every token. + + - :class:`torchtune.models.clip._position_embeddings.TiledTokenPositionalEmbedding` + + .. code-block:: text + + | 1 | 2 | | 3 | 4 | | 5 | 6 | | 7 | 8 | + | 9 | 10 | | 11 | 12 | | 13 | 14 | | 15 | 16 | + + | 17 | 18 | | 19 | 20 | | 21 | 22 | | 23 | 24 | + | 25 | 26 | | 27 | 28 | | 29 | 30 | | 31 | 32 | + + | 33 | 34 | | 35 | 36 | | 37 | 38 | | 39 | 40 | + | 41 | 42 | | 43 | 44 | | 45 | 46 | | 47 | 48 | + + | 49 | 50 | | 51 | 52 | | 53 | 54 | | 55 | 56 | + | 57 | 58 | | 59 | 60 | | 61 | 62 | | 63 | 64 | + + different for every tile, same for every token within a tile. + + - :class:`torchtune.models.clip._position_embeddings.TilePositionalEmbedding` + + .. code-block:: text + + | 1 | 1 | 1 | 1 | | 2 | 2 | 2 | 3 | + | 1 | 1 | 1 | 1 | | 2 | 2 | 2 | 3 | + | 1 | 1 | 1 | 1 | | 2 | 2 | 2 | 3 | + | 1 | 1 | 1 | 1 | | 2 | 2 | 2 | 3 | + + | 3 | 3 | 3 | 3 | | 4 | 4 | 4 | 4 | + | 3 | 3 | 3 | 3 | | 4 | 4 | 4 | 4 | + | 3 | 3 | 3 | 3 | | 4 | 4 | 4 | 4 | + | 3 | 3 | 3 | 3 | | 4 | 4 | 4 | 4 | + + Args: + num_layers (int): The number of transformer layers. + layer (nn.Module): The transformer layer module. + token_pos_embedding (nn.Module): The token positional embedding module. + pre_tile_pos_embed (Optional[nn.Module]): The pre-tile positional embedding module. It should be + None if your image was not tile-cropped in advance. + post_tile_pos_embed (Optional[nn.Module]): The post-tile positional embedding module. It should be + None if your image was not tile-cropped in advance. + cls_projection (Optional[nn.Module]): The CLS projection module. It should take an input tensor + of shape (bsz * n_tiles, n_tokens, embed_dim) and output a tensor of shape + (bsz * n_tiles, cls_output_dim). If provided, only the CLS token projection will be + outputted, instead of all tokens. + out_indices (Optional[List[int]]): The indices of hidden layers to return. + If provided, it will return the intermediate results of the transformer layers + before they go through a next layer. For example, ``out_indices=[0,3]`` will + return the tokens before they go through the first and fourth layers. 
+ tile_size (int): The size of your image tiles, if the image was tile-cropped in advance. Otherwise, + the size of the input image. In this case, the function will consider your image as a single tile. + patch_size (int): The size of each patch. Used to divide the tiles into patches. + E.g. for ``patch_size=40``, a tile of shape (400, 400) will have 10x10 grid of patches + with shape (40, 40) each. + embed_dim (int): The dimensionality of each patch embedding (token). + in_channels (int): The number of image input channels. + + Raises: + ValueError: If `tile_size` is not greater than 0. + ValueError: If `patch_size` is not greater than 0. + ValueError: If `len(out_indices)` is greater than `num_layers`. + """ + + def __init__( + self, + patch_size: int, + tile_size: int, + num_layers: int, + embed_dim: int, + layer: nn.Module, + token_pos_embedding: nn.Module, + pre_tile_pos_embed: Optional[nn.Module] = None, + post_tile_pos_embed: Optional[nn.Module] = None, + cls_projection: Optional[nn.Module] = None, + out_indices: Optional[List[int]] = None, + in_channels: int = 3, + ) -> None: + super().__init__() + + if tile_size <= 0: + raise ValueError("tile_size must be > 0") + if patch_size <= 0: + raise ValueError("patch_size must be > 0") + if out_indices and (len(out_indices) > num_layers): + raise ValueError( + f"len(out_indices) must be <= num_layers. Got {out_indices=} and {num_layers=}" + ) + + # constants + patch_grid_size = tile_size // patch_size + self.patches_per_tile = patch_grid_size**2 + self.out_indices = out_indices + if not out_indices: + self.out_indices = [] + + # input modules + self.pre_tile_pos_embed = pre_tile_pos_embed + self.post_tile_pos_embed = post_tile_pos_embed + self.token_pos_embedding = token_pos_embedding + + self.cls_projection = cls_projection + self.transformer_layers = _get_clones(layer, num_layers) + + # other modules + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=embed_dim, + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + bias=False, + ) + + self.ln_post = Fp32LayerNorm(embed_dim) + self.ln_pre = Fp32LayerNorm(embed_dim) + + self.cls_token_embedding = CLSEmbedding(embed_dim) + + def get_image_tokens_per_tile(self): + return self.patches_per_tile + 1 # +1 for CLS token + + def forward( + self, images: torch.Tensor, aspect_ratio: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """ + Processes images and returns the tokens and hidden states. + + Multiple images per sample: we add a dimension n_imgs to the input. This is useful when a single + sample constains multiple images, for example: + + - sample 1: " what animal is this?" + - sample 2: "I like more than " + + In this case, sample 1 has one image, and sample 2 has two images. max_n_imgs = max(2,1) = 2. + So your input should have shape (bsz=2, n_imgs=2, num_tiles, n_channels, tile_size, tile_size). + + Notice that to batch it, you will have to pad n_imgs to max_n_imgs and max_num_tiles. + + Args: + images (torch.Tensor): Tensor with shape (bsz, n_imgs, n_tiles, n_channels, tile_size, tile_size). + aspect_ratio (Optional[torch.Tensor]): Tensor with shape (bsz, n_imgs, 2). If all + images have a single tile, i.e. they were not tile-cropped, it should be None. + Used to calculate the positional embeddings for the tiles. 
+ + Returns: + Tuple[torch.Tensor, List[torch.Tensor]]: A tuple: (x, hidden_states), + where x is a torch.tensor of shape (bsz, n_imgs, n_tiles, n_tokens, embed_dim) and + hidden_states has shape is a list of len(out_indices) torch.tensor with shape + (bsz, n_imgs, n_tiles, n_tokens, embed_dim). + + Raises: + ValueError: If aspect_ratio is None, but n_tiles > 1 in the batch. + + Examples: + + >>> from torchtune.modules.transforms.vision_utils.tile_crop import tile_crop + >>> from torchtune.modules import VisionTransformer + >>> + >>> num_channels = 3 + >>> image_size = (800,400) + >>> tile_size = 400 + >>> patch_size=40 + >>> patch_grid_size = tile_size // patch_size + >>> + >>> # for details about preprocessing, please check + >>> # torchtune.models.clip._transforms.CLIPImageTransform + >>> + >>> # create a random image + >>> image = torch.rand(num_channels, image_size[0], image_size[1]) + >>> + >>> # (num_tiles, nch, h, w) -> (2, 3, 400, 400) + >>> tile_cropped_image = tile_crop(image, tile_size) + >>> aspect_ratio = torch.tensor([2,1]) + >>> + >>> # make it a batch of 1 image + >>> batch_image = tile_cropped_image.unsqueeze(0) + >>> batch_aspect_ratio = aspect_ratio.unsqueeze(0) + >>> + >>> # make it have only 1 image per sample + >>> batch_image = tile_cropped_image.unsqueeze(1) + >>> batch_aspect_ratio = aspect_ratio.unsqueeze(1) + >>> + >>> # For a detailed example, please check + >>> # torchtune.models.clip._position_embeddings.clip_vision_encoder + >>> # model = VisionTransformer( + ... # out_indices = [1,2,3,4,5], + ... # patch_size=40, + ... # patch_grid_size = patch_grid_size, + ... # embed_dim = 32, + ... # num_layers = 6, + ... # in_channels = num_channels, + ... # ...) + >>> + >>> x, hidden_states = model(images = batch_image, aspect_ratio = batch_aspect_ratio) + >>> + >>> # (bsz, n_imgs, num_tiles, num_patches_per_tile + CLS token, embed_dim) + >>> print(x.shape) + torch.Size([1, 1, 2, 101, 32]) + >>> + >>> # list with tensors of shape (bsz, n_imgs, num_tiles, num_patches_per_tile + CLS token, embed_dim) + >>> print(len(hidden_states)) + 5 + """ + hidden_states = [] + + # parse inputs + bsz, n_imgs, n_tiles, nch, w, h = images.shape + bsz_and_n_imgs = bsz * n_imgs + + # if aspect_ratio is not provided, it defaults to one tile [1,1] + if aspect_ratio is None: + aspect_ratio = torch.ones( + (bsz_and_n_imgs, 2), dtype=torch.int, device=images.device + ) + if n_tiles > 1: + raise ValueError( + f"aspect_ratio was not provided, but found n_tiles>1 for {images.shape=}. Please provide aspect_ratio." + ) + + images = images.reshape(bsz_and_n_imgs * n_tiles, nch, w, h) + aspect_ratio = aspect_ratio.reshape(bsz_and_n_imgs, 2) + + # patch embeddings (tokens) + # A tile becomes a grid of patch_grid_size X patch_grid_size patches + # these patches are flatenned, and called tokens from here on. 
+ + # out: (bsz * n_imgs * n_tiles, embed_dim, patch_grid_size, patch_grid_size) + x = self.conv(images) + + # out: (bsz * n_imgs, n_tiles, n_tokens, embed_dim) + x = x.reshape(bsz_and_n_imgs, n_tiles, -1, self.patches_per_tile).permute( + 0, 1, 3, 2 + ) + bsz_and_n_imgs, n_tiles, n_tokens, embed_dim = x.shape + + # pre_tile_pos_embed + if self.pre_tile_pos_embed: + x = self.pre_tile_pos_embed(x, aspect_ratio) + + # insert cls token + x = self.cls_token_embedding(x) + n_tokens += 1 + + # token_pos_embedding + x = self.token_pos_embedding(x, aspect_ratio) + + # norm + x = self.ln_pre(x) + + # transformer with optional hidden layer outputs + x = x.reshape(bsz_and_n_imgs, n_tiles * n_tokens, embed_dim) + for layer_idx, transformer_layer in enumerate(self.transformer_layers): + if layer_idx in self.out_indices: + hidden_states.append( + x.reshape(bsz, n_imgs, n_tiles, n_tokens, embed_dim) + ) + x = transformer_layer(x) + + # norm + x = self.ln_post(x) + + # post_tile_pos_embed + if self.post_tile_pos_embed: + x = x.reshape(bsz_and_n_imgs, n_tiles, n_tokens, embed_dim) + x = self.post_tile_pos_embed(x, aspect_ratio) + + # reshape output + x = x.reshape(bsz, n_imgs, n_tiles, n_tokens, embed_dim) + + # cls token projection. n_tokens becomes 1 + if self.cls_projection: + x = self.cls_projection(x) + + return x, hidden_states + + +class CLSEmbedding(nn.Module): + """ + Adds a CLS token to every tile in an image. + + Notice that tile is different from patch (token). An image is divided into tiles during pre-processing, + and patches are the outcome of the convolution in the ViT applied to each tile. + + Args: + embed_dim (int): The dimensionality of the input patch embedding. + Returns: + torch.Tensor: The input tensor with inserted CLS tokens at the beginning of the tensor. + """ + + def __init__(self, embed_dim: int) -> None: + super().__init__() + + scale = embed_dim**-0.5 + self.cls_embedding = nn.Parameter(scale * torch.randn(embed_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + + # add 1 CLS token to every tile + bsz_and_n_imgs, n_tiles, n_tokens, embed_dim = x.shape + cls_emb = self.cls_embedding.broadcast_to(bsz_and_n_imgs, n_tiles, 1, embed_dim) + return torch.cat([cls_emb, x], dim=2) + + +class CLSProjection(nn.Module): + """ + Linear projection of the CLS token. + + Args: + embed_dim (int): The dimensionality of the input patch embedding. + cls_output_dim (int): The dimensionality of the output projection. + Returns: + torch.Tensor: The projected CLS token embedding. 
+ """ + + def __init__(self, embed_dim: int, cls_output_dim: int) -> None: + super().__init__() + + scale = embed_dim**-0.5 + self.cls_output_dim = cls_output_dim + self.projection = nn.Parameter(scale * torch.randn(embed_dim, cls_output_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + bsz, n_imgs, n_tiles, n_tokens, embed_dim = x.shape + x = x.reshape(bsz * n_imgs * n_tiles, n_tokens, embed_dim) + + # out: (bsz * n_tiles, cls_output_dim) + x = x[:, 0, :] @ self.projection + + # num_tokens becomes 1 because we only return the CLS token projection + x = x.reshape(bsz, n_imgs, n_tiles, 1, self.cls_output_dim) + return x diff --git a/torchtune/utils/_checkpointing/_checkpointer.py b/torchtune/utils/_checkpointing/_checkpointer.py index 363290a2c7..689b118143 100644 --- a/torchtune/utils/_checkpointing/_checkpointer.py +++ b/torchtune/utils/_checkpointing/_checkpointer.py @@ -16,7 +16,6 @@ from torchtune import utils from torchtune.models import convert_weights -from torchtune.models.gemma import gemma_hf_to_tune, gemma_tune_to_hf from torchtune.models.mistral import ( mistral_reward_hf_to_tune, mistral_reward_tune_to_hf, @@ -288,8 +287,8 @@ class FullModelHFCheckpointer(_CheckpointerInterface): the Llama-2-7b-hf model from the meta-llama repo (https://huggingface.co/meta-llama/Llama-2-7b-hf). Note: - HF checkpoint names usually ordered by ID (eg: 0001_of_0003, 0002_of_0003, etc.) To ensure \ - we read the files in the right order, we sort the checkpoint file names before reading + HF checkpoint names are usually ordered by ID (eg: 0001_of_0003, 0002_of_0003, etc.) To ensure \ + we read the files in the right order, we sort the checkpoint file names before reading. Note: Checkpoint conversion to and from HF's format requires access to model params which are \ @@ -428,14 +427,6 @@ def load_checkpoint(self) -> Dict[str, Any]: num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], ) - elif self._model_type == ModelType.GEMMA: - converted_state_dict[utils.MODEL_KEY] = gemma_hf_to_tune( - merged_state_dict, - num_heads=self._config["num_attention_heads"], - num_kv_heads=self._config["num_key_value_heads"], - dim=self._config["hidden_size"], - head_dim=self._config["head_dim"], - ) elif self._model_type == ModelType.QWEN2: converted_state_dict[utils.MODEL_KEY] = qwen2_hf_to_tune( merged_state_dict, @@ -494,14 +485,6 @@ def save_checkpoint( num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], ) - elif self._model_type == ModelType.GEMMA: - state_dict[utils.MODEL_KEY] = gemma_tune_to_hf( - state_dict[utils.MODEL_KEY], - num_heads=self._config["num_attention_heads"], - num_kv_heads=self._config["num_key_value_heads"], - dim=self._config["hidden_size"], - head_dim=self._config["head_dim"], - ) elif self._model_type == ModelType.QWEN2: state_dict[utils.MODEL_KEY] = qwen2_tune_to_hf( state_dict[utils.MODEL_KEY], @@ -571,6 +554,7 @@ def save_checkpoint( num_heads=self._config["num_attention_heads"], num_kv_heads=self._config["num_key_value_heads"], dim=self._config["hidden_size"], + head_dim=self._config.get("head_dim", None), ) peft_output_path = Path.joinpath( self._output_dir, "adapter_model" diff --git a/torchtune/utils/_profiler.py b/torchtune/utils/_profiler.py index 6a14120780..f71c100c9f 100644 --- a/torchtune/utils/_profiler.py +++ b/torchtune/utils/_profiler.py @@ -190,33 +190,55 @@ def setup_torch_profiler( output_dir: Optional[str] = None, ) -> Tuple[torch.profiler.profile, DictConfig]: """ - Sets up torch.profiler.profile 
and returns the profiler config with post-setup updates. + Sets up :class:`~torch.profiler.profile` and returns the profiler config with post-setup updates. - The profiler config can be provided in configs under the `profiler` key with the following layout: + The profiler config can be provided in configs under the ``profiler`` key with the following layout: .. code-block:: yaml profiler: - enabled: bool - - #Output directory of trace artifacts - output_dir: str - - #`torch.profiler.ProfilerActivity` types to trace - cpu: bool - cuda: bool - - #Trace options - profile_memory: bool - with_stack: bool - record_shapes: bool - with_flops: bool - - #`torch.profiler.schedule` args - wait_steps: int - warmup_steps: int - active_steps: int - num_cycles: int + _component_: torchtune.utils.setup_torch_profiler + enabled: bool + # Output directory of trace artifacts + output_dir: str + + # torch.profiler.ProfilerActivity types to trace + cpu: bool + cuda: bool + + # Trace options + profile_memory: bool + with_stack: bool + record_shapes: bool + with_flops: bool + + # torch.profiler.schedule args + wait_steps: int + warmup_steps: int + active_steps: int + num_cycles: int + + The profiler schedule updates with respect to an optimizer step (e.g., if + ``gradient_accumulation = 2``, then the profiler will step every 2 batches). + + Sensible defaults will be chosen if the config is missing options: + + - If no activities are specified, profiler will default to CPU + CUDA + - If no schedule is specified, profiler will default to ``DEFAULT_SCHEDULE`` + - Certain options will be overridden (``with_stack`` and ``record_shapes``) \ + depending on requirements of other options (e.g., ``profile_memory`` requires \ + ``with_stack`` and ``record_shapes``). + + + Note: + - Enabling the profiler will result in training speed reduction. + - Setting ``profile_memory: True`` will generate large trace files. + - The profiler schedule is context dependent. Calling ``profiler.step()`` \ + at each batch iteration but **outside** the gradient accumulation scope will \ + ``step`` the profiler each forward / backward step. Calling ``profiler.step()`` \ + each batch iteration but **within** the gradient accumulation scope will ``step`` \ + the profiler each optimizer update step such that each ``step`` contains multiple \ + forward / backward passes. Args: enabled (bool): Enable pytorch profiler. Default is False. @@ -226,33 +248,14 @@ def setup_torch_profiler( with_stack (bool): Profile stack. Default is False. record_shapes (bool): Record shapes. Default is True. with_flops (bool): Profile flops. Default is False. - wait_steps (Optional[int]): Wait time in steps. Maps to `wait` kwarg of `torch.profiler.schedule`. - warmup_steps (Optional[int]): Warmup time in steps. Maps to `warmup` kwarg of `torch.profiler.schedule`. - active_steps (Optional[int]): Active time in steps. Maps to `active` kwarg of `torch.profiler.schedule`. - num_cycles (Optional[int]): Number of profiling cycles. Maps to `repeat` kwarg of `torch.profiler.schedule`. + wait_steps (Optional[int]): Wait time in steps. Maps to ``wait`` kwarg of ``torch.profiler.schedule``. + warmup_steps (Optional[int]): Warmup time in steps. Maps to ``warmup`` kwarg of ``torch.profiler.schedule``. + active_steps (Optional[int]): Active time in steps. Maps to ``active`` kwarg of ``torch.profiler.schedule``. + num_cycles (Optional[int]): Number of profiling cycles. Maps to ``repeat`` kwarg of ``torch.profiler.schedule``. output_dir (Optional[str]): Tracing file output path. 
Returns: - tuple: [torch.profiler.profile, DictConfig] - - NOTE: - - Enabling the profiler will result in training speed reduction. - - Setting ``profile_memory: true`` will generate large trace files. - - The profiler schedule is context dependent: - * Calling ``profiler.step()`` at each batch iteration but outside the gradient accumulation - scope will ``step`` the profiler each forward / backward step - * Calling ``profiler.step()`` each batch iteration but within the gradient accumulation scope - will ``step`` the profiler each optimizer update step such that each ``step`` contains multiple - forward / backward passes. - - Additional notes: - - the profiler schedule updates with respect to an optimizer step: - - e.g., if `gradient_accumulation = 2`, then the profiler will step every 2 batches. - - sensible defaults will be chosen if the config is missing options - - if no activities are specified, profiler will default to CPU + CUDA - - similarly, if no schedule is specified, profiler will default to DEFAULT_SCHEDULE - - certain options will be overridden (`with_stack` and `record_shapes`) depending on requirements of other options - - e.g., `profile_memory` requires `with_stack` and `record_shapes` + Tuple[torch.profiler.profile, DictConfig] """ if not enabled: diff --git a/torchtune/utils/collate.py b/torchtune/utils/collate.py index 2f0c6db4b5..b22795d4bd 100644 --- a/torchtune/utils/collate.py +++ b/torchtune/utils/collate.py @@ -26,22 +26,22 @@ def padded_collate( ignore_idx (int): Padding index for labels. Defaults to -100. Returns: - Collated input and label tensors. + Dict[str, torch.Tensor]: Collated input and label tensors. Example: >>> token_pairs = [ >>> {"tokens": [1, 2, 3], "labels": [4, 5, 6]}, >>> {"tokens": [7,], "labels": [10,]}, >>> ] - >>> inputs, labels = padded_collate( + >>> collated = padded_collate( >>> batch=token_pairs, >>> padding_idx=padding_idx, >>> ignore_idx=ignore_idx, >>> ) - >>> inputs + >>> collated["tokens"] >>> tensor([[1, 2, 3], [7, 0, 0]]) - >>> labels - >>> tensor([[4,5,6], [10,-100,-100]]) + >>> collated["labels"] + >>> tensor([[4, 5, 6], [10, -100, -100]]) """ input_ids = pad_sequence( [torch.tensor(x["tokens"]) for x in batch], diff --git a/torchtune/utils/quantization.py b/torchtune/utils/quantization.py index 619933ac6b..999d1561a0 100644 --- a/torchtune/utils/quantization.py +++ b/torchtune/utils/quantization.py @@ -10,10 +10,20 @@ from torchao.quantization.quant_api import ( Int4WeightOnlyGPTQQuantizer, Int4WeightOnlyQuantizer, - quantize, Quantizer, ) +from torchtune.modules.low_precision._utils import ( + _get_torchao_version, + _nightly_version_ge, +) + +ao_version, is_nightly = _get_torchao_version() +if is_nightly and _nightly_version_ge(ao_version, "2024-07-03"): + from torchao.quantization.quant_api import quantize_ as quantize +else: + from torchao.quantization.quant_api import quantize + # importing TORCH_VERSION_AFTER_2_3 because `Int8DynActInt4WeightQuantizer` # is only available after 2.3 so we have to guard the pytorch versions to decide # the list of supported quantizers @@ -67,7 +77,15 @@ def get_quantizer_mode(quantizer: Optional[Callable]) -> Optional[str]: """Given a quantizer object, returns a string that specifies the type of quantization. For example, in the case of int4 weight only quantization, we'll return "4w". - If the quantizer is not recognized as a known quantizer, we'll return None + If the quantizer is not recognized as a known quantizer, we'll return None. 
+ + Currently supported: + + - :class:`~torchao.quantization.quant_api.Int4WeightOnlyQuantizer`: "4w" + - :class:`~torchao.quantization.quant_api.Int8WeightOnlyQuantizer`: "8w" + - :class:`~torchao.quantization.quant_api.Int4WeightOnlyGPTQQuantizer`: "4w-gptq" + - :class:`~torchao.quantization.quant_api.Int8DynActInt4WeightQuantizer`: "8da4w" (requires ``torch>=2.3.0``) + - :class:`~torchao.quantization.prototype.qat.Int8DynActInt4WeightQATQuantizer`: "8da4w-qat" (requires ``torch>=2.4.0``) Args: quantizer (Optional[Callable]): A callable object that implements the `quantize` method. From 0f63b1a41331c9b807bb4255249d8bb5d6633fc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 16 Jul 2024 17:33:38 +0800 Subject: [PATCH 05/14] Remove Qwen2TransformerDecoder. --- torchtune/models/qwen2/__init__.py | 1 - torchtune/models/qwen2/_model_builders.py | 1 - torchtune/models/qwen2/transformer.py | 172 ---------------------- 3 files changed, 174 deletions(-) delete mode 100644 torchtune/models/qwen2/transformer.py diff --git a/torchtune/models/qwen2/__init__.py b/torchtune/models/qwen2/__init__.py index a812bcdc78..ed9e3b0b09 100644 --- a/torchtune/models/qwen2/__init__.py +++ b/torchtune/models/qwen2/__init__.py @@ -11,7 +11,6 @@ qlora_qwen2_7b, qwen2_7b, qwen2_tokenizer, - # TODO ) from ._positional_embeddings import Qwen2RotaryPositionalEmbeddings diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 962d7a76f5..21bc9764b2 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -8,7 +8,6 @@ from torchtune.models.qwen2._component_builders import qwen2, lora_qwen2 from torchtune.models.qwen2._tokenizer import Qwen2Tokenizer -from torchtune.models.qwen2.transformer import Qwen2TransformerDecoder from torchtune.modules import TransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES diff --git a/torchtune/models/qwen2/transformer.py b/torchtune/models/qwen2/transformer.py deleted file mode 100644 index 32ea2f385c..0000000000 --- a/torchtune/models/qwen2/transformer.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from typing import Optional - -import torch -import torch.nn as nn -import torch.nn.functional as F -from torch import Tensor -from torchtune.modules import KVCache - -from torchtune.modules.transformer import _get_clones, TransformerDecoderLayer - - -class Qwen2TransformerDecoder(nn.Module): - """ - Transformer Decoder derived from the Qwen2 architecture. A key difference between - the Qwen2 transformer decoder and :class:`~torchtune.modules.TransformerDecoder` - is that the output projection may be replaced with token embeddings weights - (see https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/qwen2/modeling_qwen2.py#L1092). - - Args: - tok_embeddings (nn.Embedding): PyTorch embedding layer, to be used to move - tokens to an embedding space. - layer (TransformerDecoderLayer): Transformer Decoder layer. - num_layers (int): Number of Transformer Decoder layers. - max_seq_len (int): maximum sequence length the model will be run with, as used - by :func:`~torchtune.modules.KVCache` - num_heads (int): number of query heads. For MHA this is also the - number of heads for key and value. 
This is used to setup the - :func:`~torchtune.modules.KVCache` - head_dim (int): embedding dimension for each head in self-attention. This is used - to setup the :func:`~torchtune.modules.KVCache` - norm (nn.Module): Callable that applies normalization to the output of the decoder, - before final MLP. - output (nn.Linear, **optional**): Callable that applies a linear transformation to the output of - the decoder. None means use token_embeddings. - - Note: - Arg values are checked for correctness (eg: ``attn_dropout`` belongs to [0,1]) - in the module where they are used. This helps reduces the number of raise - statements in code and improves readability. - """ - - def __init__( - self, - tok_embeddings: nn.Embedding, - layer: TransformerDecoderLayer, - num_layers: int, - max_seq_len: int, - num_heads: int, - head_dim: int, - norm: nn.Module, - output: Optional[nn.Linear] = None, - ) -> None: - super().__init__() - - self.tok_embeddings = tok_embeddings - self.layers = _get_clones(layer, num_layers) - self.norm = norm - self.output = output - self.max_seq_len = max_seq_len - self.num_heads = num_heads - self.head_dim = head_dim - self.causal_mask = None - - def setup_caches(self, batch_size: int, dtype: torch.dtype) -> None: - """Setup key value caches for attention calculation. - - Args: - batch_size (int): batch size for the caches. - dtype (torch.dtype): dtype for the caches. - """ - for layer in self.layers: - layer.attn.kv_cache = KVCache( - batch_size=batch_size, - max_seq_len=self.max_seq_len, - num_heads=self.num_heads, - head_dim=self.head_dim, - dtype=dtype, - ) - - # causal_mask is used during inference to ensure we're attending - # to the right tokens - self.causal_mask = torch.tril( - torch.ones(self.max_seq_len, self.max_seq_len, dtype=torch.bool) - ) - - def reset_caches(self): - """Reset the key value caches.""" - if self.layers[0].attn.kv_cache is None: - raise RuntimeError( - "Key value caches are not setup. Call ``setup_caches()`` first." - ) - - for layer in self.layers: - layer.attn.kv_cache.reset() - - def forward( - self, - tokens: Tensor, - *, - mask: Optional[Tensor] = None, - input_pos: Optional[Tensor] = None, - ) -> Tensor: - """ - Args: - tokens (Tensor): input tensor with shape [b x s] - mask (Optional[Tensor]): Optional boolean tensor which contains the attention mask - with shape [b x s x s]. This is applied after the query-key multiplication and - before the softmax. A value of True in row i and column j means token i attends - to token j. A value of False means token i does not attend to token j. If no - mask is specified, a causal mask is used by default. Default is None. - input_pos (Optional[Tensor]): Optional tensor which contains the position ids - of each token. During training, this is used to indicate the positions - of each token relative to its sample when packed, shape [b x s]. - During inference, this indicates the position of the current token. - If none, assume the index of the token is its position id. Default is None. - - Note: At the very first step of inference, when the model is provided with a prompt, - ``input_pos`` would contain the positions of all of the tokens in the prompt - (eg: ``torch.arange(prompt_length)``). This is because we will need to compute the - KV values for each position. 
- - Returns: - Tensor: output tensor with shape [b x s x v] - - Raises: - ValueError: if causal_mask is set but input_pos is None - - Notation used for tensor shapes: - - b: batch size - - s: sequence length - - v: vocab size - - d: embed dim - - m_s: max seq len - """ - # input tensor of shape [b, s] - bsz, seq_len = tokens.shape - - # shape: [b, s, d] - h = self.tok_embeddings(tokens) - - if self.causal_mask is not None: - if input_pos is None: - raise ValueError( - "Caches are setup, but the position of input token is missing" - ) - if mask is not None: - raise ValueError( - "An attention mask was set. Cannot use a non-causal mask for inference" - ) - # shape: [1, input_pos_len, m_s] - # in most cases input_pos_len should be 1 - mask = self.causal_mask[None, input_pos] - - for layer in self.layers: - # shape: [b, s, d] - h = layer(h, mask=mask, input_pos=input_pos) - - # shape: [b, s, d] - h = self.norm(h) - - # shape: [b, s, out_dim] - out_dim is usually the vocab size - if self.output is None: - output = F.linear(h, self.tok_embeddings.weight).float() - else: - output = self.output(h).float() - return output From 727cfc35dd9ca6744c4bbea87571ca8c636050d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 16 Jul 2024 18:18:27 +0800 Subject: [PATCH 06/14] Fix Qwen2 tokenizer. --- recipes/configs/qwen2/7B_full.yaml | 2 +- recipes/configs/qwen2/7B_full_low_memory.yaml | 2 +- recipes/configs/qwen2/7B_lora.yaml | 2 +- .../configs/qwen2/7B_lora_single_device.yaml | 2 +- torchtune/models/qwen2/_model_builders.py | 7 +- torchtune/models/qwen2/_tokenizer.py | 38 +-- torchtune/models/qwen2/_trie.py | 237 ------------------ 7 files changed, 16 insertions(+), 274 deletions(-) delete mode 100644 torchtune/models/qwen2/_trie.py diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml index b92bff80d7..49971607ad 100644 --- a/recipes/configs/qwen2/7B_full.yaml +++ b/recipes/configs/qwen2/7B_full.yaml @@ -19,7 +19,7 @@ # Tokenizer tokenizer: - _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + _component_: torchtune.models.qwen2.qwen2_tokenizer vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_low_memory.yaml index d248e7f2a2..1458f9e234 100644 --- a/recipes/configs/qwen2/7B_full_low_memory.yaml +++ b/recipes/configs/qwen2/7B_full_low_memory.yaml @@ -21,7 +21,7 @@ # Tokenizer tokenizer: - _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + _component_: torchtune.models.qwen2.qwen2_tokenizer vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index 7e93c64855..fbce498ef8 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -28,7 +28,7 @@ model: lora_alpha: 16 tokenizer: - _component_: torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + _component_: torchtune.models.qwen2.qwen2_tokenizer vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 6cf8694809..92ff4623e9 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -26,7 +26,7 @@ model: lora_alpha: 16 tokenizer: - _component_: 
torchtune.models.qwen2._tokenizer.Qwen2Tokenizer + _component_: torchtune.models.qwen2.qwen2_tokenizer vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 21bc9764b2..4d6045c90e 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -42,7 +42,10 @@ def qwen2_7b() -> TransformerDecoder: ) -def qwen2_tokenizer(vocab_file: str, merges_file: str, special_tokens_path: Optional[str] = None) -> Qwen2Tokenizer: +def qwen2_tokenizer( + vocab_file: str, merges_file: str, special_tokens_path: Optional[str] = None, + **kwargs, +) -> Qwen2Tokenizer: """ Tokenizer for Qwen2. @@ -57,7 +60,7 @@ def qwen2_tokenizer(vocab_file: str, merges_file: str, special_tokens_path: Opti Llama3Tokenizer: Instantiation of the Qwen2 tokenizer """ special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None - return Qwen2Tokenizer(vocab_file=vocab_file, merges_file=merges_file, special_tokens=special_tokens) + return Qwen2Tokenizer(vocab_file=vocab_file, merges_file=merges_file, special_tokens=special_tokens, **kwargs) def lora_qwen2_7b( diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index d9460bbcc9..35be6312b1 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -11,7 +11,6 @@ import regex as re from torchtune.data import Message, truncate -from torchtune.models.qwen2._trie import Trie from torchtune.modules.tokenizers import ModelTokenizer PRETOKENIZE_REGEX = ( @@ -142,9 +141,9 @@ def __init__( self.stop_tokens = [self.eos_id, self.im_end_id] # Tokens trie for special tokens. - self.tokens_trie = Trie() - for special_token in self.special_tokens: - self.tokens_trie.add(special_token) + self._pattern_split_special_tokens = re.compile( + r"(\L)", options=self.special_tokens.keys() + ) def _bpe(self, token): if token in self.cache: @@ -224,7 +223,7 @@ def encode( text = unicodedata.normalize("NFC", text) - tokens = self.tokens_trie.split(text) + tokens = self._pattern_split_special_tokens.split(text) tokenized_text = [] for token in tokens: @@ -250,29 +249,6 @@ def encode( return token_ids - def encode_batch( - self, - batch_text: List[str], - add_bos: bool = True, - add_eos: bool = True, - **kwargs, - ) -> List[List[int]]: - """Encode a batch of strings into lists of token ids. - - Args: - batch_text (List[str]): The batch of strings to encode. - add_bos (bool): (Optional) Whether to add the beginning of sequence token. - add_eos (bool): (Optional) Whether to add the end of sequence token. - - Returns: - List[List[int]]: A batch of lists of token ids. 
- """ - batch_token_ids = [] - for text in batch_text: - token_ids = self.encode(text, add_bos=add_bos, add_eos=add_eos, **kwargs) - batch_token_ids.append(token_ids) - return batch_token_ids - def _convert_id_to_token(self, index: int) -> str: """Converts an index (integer) in a token (str) using the vocab.""" token = self._special_tokens_reversed.get(index, None) @@ -280,7 +256,7 @@ def _convert_id_to_token(self, index: int) -> str: return self.decoder.get(index) return token - def convert_tokens_to_string(self, tokens: List[str]) -> str: + def _convert_tokens_to_string(self, tokens: List[str]) -> str: """Converts a sequence of tokens (string) in a single string.""" text = "".join(tokens) text = bytearray([self.byte_decoder[c] for c in text]).decode( @@ -310,7 +286,7 @@ def decode( token = self._convert_id_to_token(token_id) if token_id in self._special_tokens_reversed: if current_sub_text: - string = self.convert_tokens_to_string(current_sub_text) + string = self._convert_tokens_to_string(current_sub_text) if string: sub_texts.append(string) current_sub_text = [] @@ -319,7 +295,7 @@ def decode( else: current_sub_text.append(token) if current_sub_text: - sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + sub_texts.append(self._convert_tokens_to_string(current_sub_text)) text = "".join(sub_texts) return text diff --git a/torchtune/models/qwen2/_trie.py b/torchtune/models/qwen2/_trie.py deleted file mode 100644 index 7b5bec41e7..0000000000 --- a/torchtune/models/qwen2/_trie.py +++ /dev/null @@ -1,237 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from collections import OrderedDict -from typing import List - -from torchtune.utils.logging import get_logger - -logger = get_logger() - - -class Trie: - """ - Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass - Loose reference https://en.wikipedia.org/wiki/Trie - - This class is copied from . - """ - - def __init__(self): - self.data = {} - self._tokens = set() - - def add(self, word: str): - """ - Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. - The special key `""` is used to represent termination. - - This function is idempotent, adding twice the same word will leave the trie unchanged - - Example: - - ```python - >>> trie = Trie() - >>> trie.add("Hello 友達") - >>> trie.data - {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} - - >>> trie.add("Hello") - >>> trie.data - {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} - ``` - """ - if not word: - # Prevent empty string - return - - self._tokens.add(word) - ref = self.data - for char in word: - ref[char] = char in ref and ref[char] or {} - ref = ref[char] - ref[""] = 1 - - def split(self, text: str) -> List[str]: - """ - Will look for the words added to the trie within `text`. Output is the original string splitted along the - boundaries of the words found. - - This trie will match the longest possible word first ! 
- - Example: - - ```python - >>> trie = Trie() - >>> trie.split("[CLS] This is a extra_id_100") - ["[CLS] This is a extra_id_100"] - - >>> trie.add("[CLS]") - >>> trie.add("extra_id_1") - >>> trie.add("extra_id_100") - >>> trie.split("[CLS] This is a extra_id_100") - ["[CLS]", " This is a ", "extra_id_100"] - ``` - """ - # indexes are counted left of the chars index. - # "hello", index 0, is left of h, index 1 is between h and e. - # index 5 is right of the "o". - - # States are going to capture every possible start (indexes as above) - # as keys, and have as values, a pointer to the position in the trie - # where we're at. This is a partial match for now. - # This enables to keep track of multiple matches while we're iterating - # the string - # If the trie contains, "blowing", and "lower" and we encounter the - # string "blower", we need to split into ["b", "lower"]. - # This is where we need to keep track of multiple possible starts. - states = OrderedDict() - - # This will contain every indices where we need - # to cut. - # We force to cut at offset 0 and len(text) (added later) - offsets = [0] - - # This is used by the lookahead which needs to skip over - # some text where the full match exceeded the place in the initial - # for loop - skip = 0 - # Main loop, Giving this algorithm O(n) complexity - for current, current_char in enumerate(text): - if skip and current < skip: - # Prevents the lookahead for matching twice - # like extra_id_100 and id_100 - continue - - # This will track every state - # that stop matching, we need to stop tracking them. - # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then - # fail on "b", we need to remove 0 from the valid states. - to_remove = set() - # Whenever we found a match, we need to drop everything - # this is a greedy algorithm, it will match on the first found token - reset = False - - # In this case, we already have partial matches (But unfinished) - for start, trie_pointer in states.items(): - if "" in trie_pointer: - # This is a final match, we need to reset and - # store the results in `offsets`. 
- - # Lookahead to match longest first - # Important in case of extra_id_1 vs extra_id_100 - # Here we are also actively looking for other earlier partial - # matches - # "[CLS]", "L", we need to match CLS even if L is special - for lookstart, looktrie_pointer in states.items(): - if lookstart > start: - # This partial match is later, we can stop looking - break - elif lookstart < start: - # This partial match is earlier, the trie pointer - # was already updated, so index is + 1 - lookahead_index = current + 1 - end = current + 1 - else: - # Here lookstart == start and - # looktrie_pointer == trie_pointer - # It wasn't updated yet so indices are current ones - lookahead_index = current - end = current - next_char = ( - text[lookahead_index] - if lookahead_index < len(text) - else None - ) - if "" in looktrie_pointer: - start = lookstart - end = lookahead_index - skip = lookahead_index - - while next_char in looktrie_pointer: - looktrie_pointer = looktrie_pointer[next_char] - lookahead_index += 1 - if "" in looktrie_pointer: - start = lookstart - end = lookahead_index - skip = lookahead_index - - if lookahead_index == len(text): - # End of string - break - next_char = text[lookahead_index] - # End lookahead - - # Storing and resetting - offsets.append(start) - offsets.append(end) - reset = True - break - elif current_char in trie_pointer: - # The current character being looked at has a match within the trie - # update the pointer (it will be stored back into states later). - trie_pointer = trie_pointer[current_char] - - # Storing back the new pointer into the states. - # Partial matches got longer by one. - states[start] = trie_pointer - else: - # The new character has not match in the trie, we need - # to stop keeping track of this partial match. - # We can't do it directly within the loop because of how - # python iteration works - to_remove.add(start) - - # Either clearing the full start (we found a real match) - # Or clearing only the partial matches that didn't work. - if reset: - states = {} - else: - for start in to_remove: - del states[start] - - # If this character is a starting character within the trie - # start keeping track of this partial match. - if current >= skip and current_char in self.data: - states[current] = self.data[current_char] - - # We have a cut at the end with states. - for start, trie_pointer in states.items(): - if "" in trie_pointer: - # This is a final match, we need to reset and - # store the results in `offsets`. - end = len(text) - offsets.append(start) - offsets.append(end) - # Longest cut is always the one with lower start so the first - # item so we need to break. - break - - return self.cut_text(text, offsets) - - def cut_text(self, text, offsets): - # We have all the offsets now, we just need to do the actual splitting. - # We need to eventually add the first part of the string and the eventual - # last part. - offsets.append(len(text)) - tokens = [] - start = 0 - for end in offsets: - if start > end: - logger.error( - "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" - " anyway." 
- ) - continue - elif start == end: - # This might happen if there's a match at index 0 - # we're also preventing zero-width cuts in case of two - # consecutive matches - continue - tokens.append(text[start:end]) - start = end - - return tokens From 72a9e65ccddc7c1e5d961c1b557d777bfa2a71b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Wed, 17 Jul 2024 15:56:53 +0800 Subject: [PATCH 07/14] Fix the PR based on review comments. --- recipes/configs/qwen2/7B_full.yaml | 2 +- recipes/configs/qwen2/7B_full_low_memory.yaml | 2 +- recipes/configs/qwen2/7B_lora.yaml | 2 +- .../configs/qwen2/7B_lora_single_device.yaml | 2 +- tests/assets/tiny_bpe_tokenizer.json | 3987 +---------------- tests/assets/tiny_bpe_vocab.json | 2003 +-------- .../models/qwen2/test_qwen2_tokenizer.py | 2 +- torchtune/models/qwen2/__init__.py | 8 +- torchtune/models/qwen2/_component_builders.py | 6 +- torchtune/models/qwen2/_model_builders.py | 31 +- torchtune/models/qwen2/_tokenizer.py | 16 +- torchtune/modules/__init__.py | 7 +- torchtune/modules/transformer.py | 51 +- 13 files changed, 78 insertions(+), 6041 deletions(-) diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml index 49971607ad..b71e7ec321 100644 --- a/recipes/configs/qwen2/7B_full.yaml +++ b/recipes/configs/qwen2/7B_full.yaml @@ -20,7 +20,7 @@ # Tokenizer tokenizer: _component_: torchtune.models.qwen2.qwen2_tokenizer - vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + path: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt # Dataset diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_low_memory.yaml index 1458f9e234..a0ff849098 100644 --- a/recipes/configs/qwen2/7B_full_low_memory.yaml +++ b/recipes/configs/qwen2/7B_full_low_memory.yaml @@ -22,7 +22,7 @@ # Tokenizer tokenizer: _component_: torchtune.models.qwen2.qwen2_tokenizer - vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + path: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt # Dataset diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index fbce498ef8..0b529853a0 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -29,7 +29,7 @@ model: tokenizer: _component_: torchtune.models.qwen2.qwen2_tokenizer - vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + path: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt checkpointer: diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 92ff4623e9..5f34420dae 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -27,7 +27,7 @@ model: tokenizer: _component_: torchtune.models.qwen2.qwen2_tokenizer - vocab_file: /tmp/Qwen2-7B-Instruct/vocab.json + path: /tmp/Qwen2-7B-Instruct/vocab.json merges_file: /tmp/Qwen2-7B-Instruct/merges.txt checkpointer: diff --git a/tests/assets/tiny_bpe_tokenizer.json b/tests/assets/tiny_bpe_tokenizer.json index 3e6be56a1a..b8c525882a 100644 --- a/tests/assets/tiny_bpe_tokenizer.json +++ b/tests/assets/tiny_bpe_tokenizer.json @@ -1,3986 +1 @@ -{ - "version": "1.0", - "truncation": null, - "padding": null, - "added_tokens": [ - { - "id": 2000, - "content": "<|endoftext|>", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 2001, - "content": "<|im_start|>", - "single_word": false, - "lstrip": false, 
- "rstrip": false, - "normalized": false, - "special": true - }, - { - "id": 2002, - "content": "<|im_end|>", - "single_word": false, - "lstrip": false, - "rstrip": false, - "normalized": false, - "special": true - } - ], - "normalizer": { - "type": "NFC" - }, - "pre_tokenizer": { - "type": "Sequence", - "pretokenizers": [ - { - "type": "Split", - "pattern": { - "Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" - }, - "behavior": "Isolated", - "invert": false - }, - { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": false, - "use_regex": false - } - ] - }, - "post_processor": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": false, - "use_regex": false - }, - "decoder": { - "type": "ByteLevel", - "add_prefix_space": false, - "trim_offsets": false, - "use_regex": false - }, - "model": { - "type": "BPE", - "dropout": null, - "unk_token": null, - "continuing_subword_prefix": null, - "end_of_word_suffix": null, - "fuse_unk": false, - "byte_fallback": false, - "ignore_merges": false, - "vocab": { - "!": 0, - "\"": 1, - "#": 2, - "$": 3, - "%": 4, - "&": 5, - "'": 6, - "(": 7, - ")": 8, - "*": 9, - "+": 10, - ",": 11, - "-": 12, - ".": 13, - "/": 14, - "0": 15, - "1": 16, - "2": 17, - "3": 18, - "4": 19, - "5": 20, - "6": 21, - "7": 22, - "8": 23, - "9": 24, - ":": 25, - ";": 26, - "<": 27, - "=": 28, - ">": 29, - "?": 30, - "@": 31, - "A": 32, - "B": 33, - "C": 34, - "D": 35, - "E": 36, - "F": 37, - "G": 38, - "H": 39, - "I": 40, - "J": 41, - "K": 42, - "L": 43, - "M": 44, - "N": 45, - "O": 46, - "P": 47, - "Q": 48, - "R": 49, - "S": 50, - "T": 51, - "U": 52, - "V": 53, - "W": 54, - "X": 55, - "Y": 56, - "Z": 57, - "[": 58, - "\\": 59, - "]": 60, - "^": 61, - "_": 62, - "`": 63, - "a": 64, - "b": 65, - "c": 66, - "d": 67, - "e": 68, - "f": 69, - "g": 70, - "h": 71, - "i": 72, - "j": 73, - "k": 74, - "l": 75, - "m": 76, - "n": 77, - "o": 78, - "p": 79, - "q": 80, - "r": 81, - "s": 82, - "t": 83, - "u": 84, - "v": 85, - "w": 86, - "x": 87, - "y": 88, - "z": 89, - "{": 90, - "|": 91, - "}": 92, - "~": 93, - "Ċ": 94, - "Ġ": 95, - "ĠĠ": 96, - "Ġt": 97, - "Ġa": 98, - "in": 99, - "he": 100, - "re": 101, - "on": 102, - "Ġthe": 103, - "Ġs": 104, - "er": 105, - "at": 106, - "Ġc": 107, - "ĠĠĠĠ": 108, - "en": 109, - "Ġo": 110, - "Ġ\"": 111, - "nd": 112, - "es": 113, - "ing": 114, - "ĠĠĠ": 115, - "it": 116, - "Ġp": 117, - "or": 118, - "ou": 119, - "Ġand": 120, - "Ġw": 121, - "is": 122, - "Ġf": 123, - "an": 124, - "ion": 125, - "al": 126, - "Ġb": 127, - "Ġto": 128, - "Ġm": 129, - "Ġin": 130, - "Ġof": 131, - "le": 132, - "ct": 133, - "ar": 134, - "ut": 135, - "Ġd": 136, - "st": 137, - "ed": 138, - "ĠĠĠĠĠĠĠ": 139, - "ic": 140, - "\":": 141, - ",Ċ": 142, - "ro": 143, - "ent": 144, - "\\n": 145, - "Ġe": 146, - "put": 147, - "om": 148, - "Ġre": 149, - "as": 150, - "ve": 151, - "Ġh": 152, - "Ġth": 153, - "\",Ċ": 154, - "Ġl": 155, - "Ġis": 156, - "et": 157, - "ce": 158, - "Ġn": 159, - ".\\": 160, - "im": 161, - "il": 162, - "Ġg": 163, - "Ġu": 164, - "ction": 165, - "ru": 166, - "ation": 167, - "ol": 168, - "ch": 169, - "ĠT": 170, - "Ġfor": 171, - "out": 172, - "ra": 173, - "ow": 174, - "id": 175, - "ly": 176, - "Ġst": 177, - "Ġbe": 178, - "Ġy": 179, - "Ġpro": 180, - "ig": 181, - "se": 182, - "ate": 183, - "Ġthat": 184, - "ith": 185, - "ir": 186, - "ur": 187, - "ot": 188, - "Ġor": 189, - "Ġon": 190, - "Ġyou": 191, - "ers": 192, - "stru": 193, - "Ġan": 194, - "if": 195, - "ul": 196, - 
"struction": 197, - "Ġ{": 198, - "Ġ}": 199, - "Ġcan": 200, - "input": 201, - "output": 202, - "instruction": 203, - "Ġ{Ċ": 204, - "Ġ},Ċ": 205, - "\"Ċ": 206, - "Ġhe": 207, - "Ġcon": 208, - "Ġit": 209, - "ay": 210, - "ess": 211, - "Ġwith": 212, - "ver": 213, - "el": 214, - "Ġas": 215, - "am": 216, - "ĠA": 217, - "ge": 218, - "Ġsu": 219, - "iv": 220, - ".\",Ċ": 221, - "Ġcom": 222, - "ĠI": 223, - "ment": 224, - "ak": 225, - "Ġal": 226, - "\\\"": 227, - ".\"Ċ": 228, - "ive": 229, - "Ġare": 230, - "ab": 231, - "ad": 232, - "Ġmo": 233, - "Ġex": 234, - "Ġv": 235, - "ĠS": 236, - "res": 237, - "pp": 238, - "qu": 239, - "Ġde": 240, - "Ġwh": 241, - "ity": 242, - "Ġen": 243, - "ĠThe": 244, - "her": 245, - "ld": 246, - "ri": 247, - "ter": 248, - "ant": 249, - "ĠC": 250, - "ist": 251, - "Ġ\"\",Ċ": 252, - "um": 253, - "Ġus": 254, - "Ġne": 255, - "ain": 256, - "th": 257, - "ect": 258, - "Ġle": 259, - "op": 260, - "em": 261, - "ies": 262, - "Ġch": 263, - "Ġim": 264, - "du": 265, - "od": 266, - "ort": 267, - "nt": 268, - "est": 269, - "igh": 270, - "ere": 271, - "Ġha": 272, - "us": 273, - "ure": 274, - "ial": 275, - "oc": 276, - "Ġwor": 277, - "Ġtheir": 278, - "ac": 279, - "ence": 280, - "iz": 281, - "Ġyour": 282, - "os": 283, - "Ġimp": 284, - "ud": 285, - "Ġby": 286, - "Ġse": 287, - "ine": 288, - "ould": 289, - "low": 290, - "ill": 291, - "age": 292, - "rom": 293, - "Ġsp": 294, - "ĠP": 295, - "Ġsh": 296, - "ust": 297, - "The": 298, - "un": 299, - "'s": 300, - "Ġinc": 301, - "ide": 302, - "pl": 303, - "ight": 304, - "og": 305, - "Ġpl": 306, - "pt": 307, - "are": 308, - "Ġte": 309, - "Ġint": 310, - "Ġ\\": 311, - "his": 312, - "Ġr": 313, - "ake": 314, - "per": 315, - "orm": 316, - "ag": 317, - "ff": 318, - "ĠE": 319, - "art": 320, - "Ġk": 321, - "end": 322, - "ĠM": 323, - "Ġwe": 324, - "ĠB": 325, - "Ġad": 326, - "cess": 327, - "rou": 328, - "ical": 329, - "all": 330, - "able": 331, - "Ġfrom": 332, - "and": 333, - "ĠH": 334, - "Ġab": 335, - "act": 336, - "Ġcomp": 337, - "ome": 338, - "ach": 339, - "ĠThis": 340, - "Ġhave": 341, - "form": 342, - "Ġ\\\"": 343, - "ast": 344, - "Ġat": 345, - "ĠW": 346, - "Ġres": 347, - "Ġdat": 348, - ":\\": 349, - "ther": 350, - "ions": 351, - "ore": 352, - "Ġ(": 353, - "Ġcont": 354, - "our": 355, - "ep": 356, - "ĠF": 357, - "Ġac": 358, - "ance": 359, - "ĠR": 360, - "gh": 361, - "Ġme": 362, - "ces": 363, - "Ġwas": 364, - "ind": 365, - "vel": 366, - "ations": 367, - "Ġhel": 368, - "Ġmore": 369, - "ult": 370, - "ĠD": 371, - "reat": 372, - "ign": 373, - "Ġhelp": 374, - "ime": 375, - "ard": 376, - "Ġcl": 377, - "Ġapp": 378, - "ans": 379, - "ie": 380, - "Ġdata": 381, - "ich": 382, - "ang": 383, - "ous": 384, - "ell": 385, - "ks": 386, - "ase": 387, - "ice": 388, - "ip": 389, - "ite": 390, - "Ġsuch": 391, - "Ġfe": 392, - "Ġwhe": 393, - "ib": 394, - "Ġother": 395, - "Ġthis": 396, - "ass": 397, - "ual": 398, - "ile": 399, - "ne": 400, - "red": 401, - "Ġhas": 402, - "oo": 403, - "ress": 404, - "ific": 405, - "ning": 406, - "Ġ=": 407, - "Ġup": 408, - "Ġman": 409, - "Ġar": 410, - "ong": 411, - "ec": 412, - "Ġtra": 413, - "av": 414, - "Ġwhich": 415, - "Ġgo": 416, - "Ġprov": 417, - "Ġdis": 418, - "**": 419, - "so": 420, - "ĠG": 421, - "one": 422, - "Ġem": 423, - "Ġnot": 424, - "ue": 425, - "ĠO": 426, - "Ġj": 427, - "ace": 428, - "Ġthey": 429, - "ame": 430, - "Ġqu": 431, - "ĠL": 432, - "iff": 433, - "Ġfol": 434, - "ary": 435, - "ated": 436, - "ustom": 437, - "ition": 438, - "Ġits": 439, - "Ġsy": 440, - "ke": 441, - "ack": 442, - "ry": 443, - "--": 444, - "Ġtime": 445, - "Ġdes": 446, - "Ġnew": 
447, - "ents": 448, - "ount": 449, - "Ġfollow": 450, - "Ġalso": 451, - "Ġcomm": 452, - "Ġout": 453, - "Ġeff": 454, - "Ġdiff": 455, - "iven": 456, - "ap": 457, - "Ġsent": 458, - "\\u": 459, - "Ġso": 460, - "Ġprodu": 461, - "Ġuse": 462, - "Ġsc": 463, - "Ġ-": 464, - "Ġun": 465, - "lud": 466, - "ĠIt": 467, - "ener": 468, - "king": 469, - "Ġev": 470, - "Ġabout": 471, - "Ġthem": 472, - "ĠU": 473, - "Ġcustom": 474, - "Ġro": 475, - "Ġinclud": 476, - "les": 477, - "etw": 478, - "stem": 479, - "xt": 480, - "Ġinto": 481, - "Ġper": 482, - "ĠIn": 483, - "ĠN": 484, - "Ġwill": 485, - "Ġlear": 486, - "ber": 487, - "Ġall": 488, - "Ġpe": 489, - "ds": 490, - "Ġtw": 491, - "aking": 492, - "ark": 493, - "ful": 494, - "Ġmake": 495, - "chn": 496, - "erv": 497, - "ost": 498, - "rough": 499, - "Ġone": 500, - "Ġinter": 501, - "ities": 502, - "ail": 503, - "ike": 504, - "ree": 505, - "ple": 506, - "alth": 507, - "Ġused": 508, - "ors": 509, - "Ġover": 510, - "ility": 511, - "ments": 512, - "ange": 513, - "Ġway": 514, - "ory": 515, - "Ġcol": 516, - "Ġpr": 517, - "Ġcould": 518, - "Ġnum": 519, - "reate": 520, - "int": 521, - "Ġredu": 522, - "erson": 523, - "Ġrec": 524, - "Ġher": 525, - "Ġneed": 526, - "ms": 527, - "ater": 528, - "oy": 529, - "Ġsystem": 530, - "Ġinform": 531, - "Ġtwo": 532, - "Ġtechn": 533, - "Ġsentence": 534, - "ience": 535, - "ize": 536, - "get": 537, - "Ġdiffere": 538, - "ood": 539, - "rib": 540, - "Ġbut": 541, - "Ġfollowing": 542, - "ased": 543, - "olog": 544, - "erg": 545, - "led": 546, - "ures": 547, - "In": 548, - "ear": 549, - "Ġph": 550, - "own": 551, - "Ġpre": 552, - "Ġwould": 553, - "Ġusing": 554, - "Ġcons": 555, - "Ġwork": 556, - "Ġmod": 557, - "ating": 558, - "ia": 559, - "ire": 560, - "Ġpos": 561, - "ient": 562, - "ob": 563, - "ject": 564, - "Ġinv": 565, - "ons": 566, - "Ġdo": 567, - "ular": 568, - "Ġdec": 569, - "Ġhealth": 570, - "Ġimpro": 571, - "Ġany": 572, - "Ġthrough": 573, - "yp": 574, - "row": 575, - "velop": 576, - "Ġprocess": 577, - "Ġtr": 578, - "lic": 579, - "very": 580, - "als": 581, - "ify": 582, - "``": 583, - "ari": 584, - "Ġstr": 585, - "Ġimport": 586, - "Ġlike": 587, - "Ġproduct": 588, - "Ġsome": 589, - "ph": 590, - "ential": 591, - "Ġam": 592, - "ates": 593, - "Ġacc": 594, - "ens": 595, - "ns": 596, - "Ġsm": 597, - "Ġind": 598, - "een": 599, - "Ġexper": 600, - "lect": 601, - "Ġval": 602, - "Ġrel": 603, - "its": 604, - "Ġinformation": 605, - "ings": 606, - "ĠJ": 607, - "ople": 608, - "iness": 609, - "Ġgiven": 610, - "mm": 611, - "ices": 612, - "Ġpart": 613, - "ild": 614, - "ys": 615, - "Ġour": 616, - "nder": 617, - "Ġperson": 618, - "ally": 619, - "Ġke": 620, - "etween": 621, - "ft": 622, - "oth": 623, - "Ġspec": 624, - "Ġbetween": 625, - "ergy": 626, - "ĠAI": 627, - "Ġwho": 628, - "Ġmay": 629, - "ef": 630, - "ative": 631, - "ise": 632, - "Ġlist": 633, - "Ġkn": 634, - "Ġadd": 635, - ",\\": 636, - "ord": 637, - "ics": 638, - "Ġpeople": 639, - "ĠSt": 640, - "Ġhis": 641, - "Ġexp": 642, - "ible": 643, - "Ġthere": 644, - "Ġserv": 645, - "Ġincre": 646, - "Ġdevelop": 647, - "ound": 648, - "ower": 649, - "Ġtrans": 650, - "bs": 651, - "Ġenergy": 652, - "Ġoff": 653, - "Ġbus": 654, - "Ġwhile": 655, - "ose": 656, - "Ġact": 657, - "Ġexam": 658, - "Ġlearning": 659, - "ctions": 660, - "con": 661, - "gor": 662, - "gan": 663, - "ution": 664, - "round": 665, - "pport": 666, - "Ġhow": 667, - "Ġbl": 668, - "Ġmed": 669, - "anc": 670, - "Ġtyp": 671, - "Ġra": 672, - "Ġcar": 673, - "ife": 674, - "Ġworld": 675, - "Ġvari": 676, - "Ġrep": 677, - "au": 678, - "Ġsoc": 679, - "Ġprovid": 680, - "Ġset": 
681, - "ten": 682, - "Ġsol": 683, - "Ġeach": 684, - "Ġwhen": 685, - "Ġeffect": 686, - "Ġpo": 687, - "Ġshe": 688, - "ick": 689, - "Ġwhere": 690, - "Ġmodel": 691, - "Ġimportant": 692, - "Ġunder": 693, - "Ġprog": 694, - "enerate": 695, - "ural": 696, - "tain": 697, - "Ġass": 698, - "ology": 699, - "Ġhad": 700, - "ook": 701, - "gg": 702, - "Ġcustomer": 703, - "ting": 704, - "ving": 705, - "Ġresp": 706, - "line": 707, - "Ġcreat": 708, - "ll": 709, - "ily": 710, - "Ġreg": 711, - "Ġdet": 712, - "Ġif": 713, - "Ġ+": 714, - "Ġbusiness": 715, - "\\nIn": 716, - "ish": 717, - "Ġmost": 718, - "ĠĠĠĠĠĠĠĠ": 719, - "hes": 720, - "angu": 721, - "Ġprovide": 722, - "Ġadv": 723, - "erm": 724, - "ub": 725, - "Ġsk": 726, - "irst": 727, - "any": 728, - "Ġday": 729, - "ivid": 730, - "arm": 731, - "ract": 732, - "nce": 733, - "Ġ|": 734, - "Ġimprove": 735, - ")\\": 736, - "Ġco": 737, - "Ġcommun": 738, - "arket": 739, - "Ġmet": 740, - "cy": 741, - "Ġdifferent": 742, - "ized": 743, - "Ġart": 744, - "\\nThe": 745, - "rit": 746, - "Ġcomput": 747, - "Ġform": 748, - "ck": 749, - "Ġhum": 750, - "Ġchar": 751, - "ble": 752, - "Ġlead": 753, - "iron": 754, - "Ġrem": 755, - "Ġshould": 756, - "te": 757, - "Ġallow": 758, - "ness": 759, - "hat": 760, - "Ġfun": 761, - "Ġcomple": 762, - "Ġlangu": 763, - "ages": 764, - "Ġbec": 765, - "Ġsign": 766, - "ues": 767, - "ature": 768, - "Ġfind": 769, - "riend": 770, - "Ġstud": 771, - "Ġmain": 772, - "imate": 773, - "ove": 774, - "Ġresult": 775, - "Ġplay": 776, - "Ġreduce": 777, - "Ġeng": 778, - "ware": 779, - "redi": 780, - "Ġnumber": 781, - "Ġlar": 782, - "Ġpol": 783, - "Ġpat": 784, - "Ġwell": 785, - "ident": 786, - "viron": 787, - "rite": 788, - "crib": 789, - "Ġbu": 790, - "Ġhigh": 791, - "Ġthese": 792, - "ives": 793, - "ves": 794, - "Ġdesign": 795, - "urn": 796, - "Ġthan": 797, - "der": 798, - "Ġanal": 799, - "Ġwater": 800, - "Ġmarket": 801, - "Ġexample": 802, - "way": 803, - "stand": 804, - "ng": 805, - "ax": 806, - "itive": 807, - "Ġ`": 808, - "iqu": 809, - "Ġsim": 810, - "Ġequ": 811, - "gorith": 812, - "Ġtext": 813, - "resent": 814, - "Ġmany": 815, - "uring": 816, - "----": 817, - "\\nA": 818, - "Ġdi": 819, - "Ġsa": 820, - "vironment": 821, - "arch": 822, - "Ġatt": 823, - "Ġpot": 824, - "Ġtas": 825, - "Ġcreate": 826, - "ough": 827, - "Ġfl": 828, - "Ġmaking": 829, - "ious": 830, - "Ġgra": 831, - "Ġlife": 832, - "\\nO": 833, - "Ġalgorith": 834, - "ality": 835, - "eng": 836, - "Ġfin": 837, - "uc": 838, - "?\",Ċ": 839, - "ĠY": 840, - "Ġret": 841, - "Ġbeen": 842, - "Ġtechnology": 843, - "Ġprogra": 844, - "Ġhand": 845, - "hip": 846, - "wn": 847, - "Ġcal": 848, - "Ġwhat": 849, - "ividual": 850, - "iss": 851, - "ety": 852, - "Ġlanguage": 853, - "ources": 854, - "Ġclass": 855, - "Ġtake": 856, - "Ġeas": 857, - "ric": 858, - "Ġvis": 859, - "bject": 860, - "Ġref": 861, - "Ġenvironment": 862, - "Ġfirst": 863, - "eg": 864, - "Ġindividual": 865, - "Ġplan": 866, - "Ġperform": 867, - "Ġru": 868, - "ien": 869, - "Ġimpact": 870, - "Ġag": 871, - "ade": 872, - "Ġcle": 873, - "Ġrequ": 874, - "dition": 875, - "__": 876, - "Ġche": 877, - "ption": 878, - "Ġappro": 879, - "Ġ**": 880, - "Ġgreat": 881, - "ved": 882, - "Ġexpl": 883, - "Ġgrow": 884, - "Generate": 885, - "Ġmy": 886, - "Ġincluding": 887, - "Ġaccess": 888, - "Ġpop": 889, - "Ġmin": 890, - "fore": 891, - "Ġsocial": 892, - "ines": 893, - "Ġcharact": 894, - "Ġbr": 895, - "Ġstep": 896, - "Ġunderstand": 897, - "Ġorgan": 898, - "ĠAd": 899, - "Ġdisc": 900, - "Ġpower": 901, - "Ġlong": 902, - "hed": 903, - "Ġconc": 904, - "ward": 905, - "ited": 906, - "Ġele": 
907, - "cing": 908, - "Ġevery": 909, - "Ġca": 910, - "Ġoften": 911, - "Ġuser": 912, - "vie": 913, - "ĠV": 914, - "Ġfood": 915, - "Ġinclude": 916, - "Ġloc": 917, - "ases": 918, - "ically": 919, - "ode": 920, - "ants": 921, - "Ġinvol": 922, - "Ġsmall": 923, - "Ġsur": 924, - "achine": 925, - "Ġbeing": 926, - "Ġpotential": 927, - "Ġno": 928, - "ĠCh": 929, - "Ġdep": 930, - "ather": 931, - "Ġboth": 932, - "Ġens": 933, - "Ġposs": 934, - "Ġed": 935, - "cribe": 936, - "ts": 937, - "ork": 938, - "ĠThey": 939, - "Ġpur": 940, - "ivity": 941, - "Ġwords": 942, - "Ġsignific": 943, - "Ġwere": 944, - "ĠHow": 945, - "Ġprom": 946, - "Ġexperience": 947, - "ĠK": 948, - "up": 949, - "Ġcount": 950, - "ered": 951, - "Des": 952, - "Ġfam": 953, - "```": 954, - "akes": 955, - "Ġgl": 956, - "ĠHe": 957, - "Ġfeel": 958, - "Ġback": 959, - "Ġfi": 960, - "Ġproble": 961, - "ization": 962, - "ling": 963, - "Ġcommunic": 964, - "ploy": 965, - "Ġaut": 966, - "Ġfriend": 967, - "Ġhuman": 968, - "Ġspe": 969, - "ew": 970, - "Ġpersonal": 971, - "Ġtop": 972, - "Ġent": 973, - "other": 974, - "Ġchang": 975, - "Ġcor": 976, - "Ġchange": 977, - "Ġdecis": 978, - "ability": 979, - "hing": 980, - "atural": 981, - "ever": 982, - "Ġcost": 983, - "Ġgood": 984, - "ause": 985, - "Ġident": 986, - "Ġsoft": 987, - "ined": 988, - "Ġpass": 989, - "'t": 990, - "atures": 991, - "Ġben": 992, - "Ġcompany": 993, - "Ġstart": 994, - "Ġsignificant": 995, - "Ġsumm": 996, - "ond": 997, - "old": 998, - "bers": 999, - "sel": 1000, - "?\\": 1001, - "Ġcur": 1002, - "Ġlight": 1003, - "Ġcommon": 1004, - ".\\\"": 1005, - "Ġcustomers": 1006, - "iving": 1007, - "conom": 1008, - "Ġfunction": 1009, - "Ġve": 1010, - "Ġthree": 1011, - "Ġeven": 1012, - "ining": 1013, - "Ġgener": 1014, - "ries": 1015, - "Ġlevel": 1016, - "Ġspecific": 1017, - "Ġwebs": 1018, - "Ġthen": 1019, - "Ġeffective": 1020, - "cur": 1021, - "ense": 1022, - "Ġlarge": 1023, - "Ġdist": 1024, - "Ġeffic": 1025, - "Ġsupport": 1026, - "Ġget": 1027, - "Create": 1028, - "read": 1029, - "port": 1030, - "Ġinf": 1031, - "Ġ'": 1032, - "Ġyear": 1033, - "Ġstate": 1034, - "Ġkey": 1035, - "ccess": 1036, - ":**": 1037, - "Ġav": 1038, - "Ġknow": 1039, - "Ġbenef": 1040, - "Ġess": 1041, - "ables": 1042, - "ren": 1043, - "Ġown": 1044, - "ĠThese": 1045, - "ock": 1046, - "-t": 1047, - "Ġide": 1048, - "omm": 1049, - "reen": 1050, - "ced": 1051, - "cture": 1052, - "Ġteam": 1053, - "Ġris": 1054, - "Ġtasks": 1055, - "Ġdown": 1056, - "Ġstru": 1057, - "Ġcomputer": 1058, - "-b": 1059, - "Ġfact": 1060, - "Ġmem": 1061, - "etter": 1062, - "\\nS": 1063, - "Ġaround": 1064, - "Ġword": 1065, - "Ġbased": 1066, - "Ġbeh": 1067, - "Ġright": 1068, - "Ġdel": 1069, - "Ġpoint": 1070, - "Ġnatural": 1071, - "ss": 1072, - "Ġeconom": 1073, - "Ġmade": 1074, - "Ġins": 1075, - "Ġinst": 1076, - "Ġmat": 1077, - "Ġvalue": 1078, - "Ġanim": 1079, - "Ġsever": 1080, - "\\nT": 1081, - "ational": 1082, - "ital": 1083, - "ze": 1084, - "ote": 1085, - "ills": 1086, - "tern": 1087, - "Ġread": 1088, - "Ġcontent": 1089, - "Ġonline": 1090, - "Ġend": 1091, - "ĠUn": 1092, - "vent": 1093, - "Ġsee": 1094, - "ending": 1095, - "Ġmon": 1096, - "Ġdr": 1097, - "Ġkeep": 1098, - "Ġsystems": 1099, - "cul": 1100, - "ven": 1101, - "Ġstory": 1102, - "Ġmedia": 1103, - "Ġseveral": 1104, - "hen": 1105, - "ateg": 1106, - "Ġcontin": 1107, - "Ġdev": 1108, - "Ġlearn": 1109, - "Ġla": 1110, - "Ġstre": 1111, - "Ġpartic": 1112, - "Ġair": 1113, - "ually": 1114, - "Ġsuccess": 1115, - "ouse": 1116, - "Ġiss": 1117, - "ied": 1118, - "Ġmachine": 1119, - "Ġopt": 1120, - "Ġx": 1121, - "Ġop": 1122, - 
"Ġprof": 1123, - "ocus": 1124, - "chie": 1125, - "Ġmeth": 1126, - "ner": 1127, - "omp": 1128, - "ron": 1129, - "Ġhome": 1130, - "Ġbetter": 1131, - "ĠPro": 1132, - "Ġmult": 1133, - "omet": 1134, - "Ġincrease": 1135, - "Ġanaly": 1136, - "vert": 1137, - "Ġrele": 1138, - "Ġbra": 1139, - "ink": 1140, - "Ġtem": 1141, - "Ġpredi": 1142, - "Ġtre": 1143, - "Ġservice": 1144, - "Ġwebsite": 1145, - "Ġmanage": 1146, - "Ġsoftware": 1147, - "here": 1148, - "Ġprot": 1149, - "-s": 1150, - "Ġquest": 1151, - "ier": 1152, - "Ġknown": 1153, - "Ġorder": 1154, - "Ġphys": 1155, - "cept": 1156, - "Ġachie": 1157, - "Ġinput": 1158, - "Ġpossible": 1159, - "ĠIf": 1160, - "Ġext": 1161, - "fter": 1162, - "Ġelect": 1163, - "Ġmethod": 1164, - "Ġbre": 1165, - "ĠAn": 1166, - "ways": 1167, - "ering": 1168, - "ets": 1169, - "Ġjust": 1170, - "Ġstore": 1171, - "Ġdevelopment": 1172, - "Ġcare": 1173, - "Ġobject": 1174, - "Ġtype": 1175, - "ĠFor": 1176, - "Ġfocus": 1177, - "ggest": 1178, - "Ġonly": 1179, - "Ġconsid": 1180, - "ars": 1181, - "Ġchall": 1182, - "Ġdeterm": 1183, - "Ġsal": 1184, - "ins": 1185, - "Ġfeatures": 1186, - "Ġtru": 1187, - "ody": 1188, - "Ġtool": 1189, - ">\\": 1190, - "Ġensure": 1191, - "oss": 1192, - "ublic": 1193, - "Ġitem": 1194, - "Here": 1195, - "ination": 1196, - "Ġdef": 1197, - "Describe": 1198, - "ional": 1199, - "roup": 1200, - "Ġconf": 1201, - "Ġneeds": 1202, - "Ġcharacter": 1203, - "Ġvarious": 1204, - "Ġlet": 1205, - "Ġapplic": 1206, - "aut": 1207, - "Ġjob": 1208, - "ellig": 1209, - "ĠCon": 1210, - "Ġbest": 1211, - "Ġfore": 1212, - "Ġamount": 1213, - "rop": 1214, - "Ġbuild": 1215, - "ique": 1216, - "aging": 1217, - "Ġemploy": 1218, - "Ġrest": 1219, - "air": 1220, - "What": 1221, - "Ġtoget": 1222, - "Ġways": 1223, - "Ġidentify": 1224, - "Ġtogether": 1225, - "Ġreal": 1226, - "Ġusers": 1227, - "Ġmean": 1228, - "asing": 1229, - "ĠAm": 1230, - "Ġeduc": 1231, - "Ġalgorithm": 1232, - "Ġnetw": 1233, - "Ġcode": 1234, - "Write": 1235, - "ov": 1236, - "-d": 1237, - "oura": 1238, - "ĠHowever": 1239, - "uture": 1240, - "view": 1241, - "Ġindu": 1242, - "Ġproducts": 1243, - "ected": 1244, - "ertain": 1245, - ";\\": 1246, - "ĠAs": 1247, - "pr": 1248, - "aste": 1249, - "Ġoper": 1250, - "Ġ$": 1251, - "avi": 1252, - "self": 1253, - "Ġ<": 1254, - "Ġindust": 1255, - "Ġgu": 1256, - "Ġothers": 1257, - "Ex": 1258, - "ian": 1259, - "Ġ\"\\\"": 1260, - "-f": 1261, - "nces": 1262, - "Ġfil": 1263, - "Ġrespons": 1264, - "rol": 1265, - "Ġcap": 1266, - "Ġbefore": 1267, - "vern": 1268, - "Ġcomplex": 1269, - "lus": 1270, - "ribut": 1271, - "ats": 1272, - "Ġpositive": 1273, - "oh": 1274, - "Ġlo": 1275, - "Ġgroup": 1276, - "Ġfound": 1277, - "ee": 1278, - "ogn": 1279, - "Ġsw": 1280, - "Ġindividuals": 1281, - "Ġpract": 1282, - "Ġenc": 1283, - "Ġshare": 1284, - "raph": 1285, - "Ġrange": 1286, - "Ġsun": 1287, - "\\t": 1288, - "Ġproviding": 1289, - "icle": 1290, - "Ġdem": 1291, - "Ġplace": 1292, - "Ġaud": 1293, - "joy": 1294, - "Ġmust": 1295, - "els": 1296, - "ery": 1297, - "One": 1298, - "Ġfamily": 1299, - "Ġfuture": 1300, - "less": 1301, - "rent": 1302, - "Ġproblem": 1303, - "Ġessential": 1304, - "rodu": 1305, - "ired": 1306, - "Ġreducing": 1307, - "ism": 1308, - "Ġwarm": 1309, - "ray": 1310, - "Ġability": 1311, - "Ġstrong": 1312, - "Ġalways": 1313, - "Ġresources": 1314, - "Ġbenefits": 1315, - "Ġstrateg": 1316, - "Ġinvolves": 1317, - "Ġassist": 1318, - "erest": 1319, - "nA": 1320, - "ression": 1321, - "Ġ[": 1322, - "ilities": 1323, - "Ġsteps": 1324, - "verall": 1325, - "Ġshow": 1326, - "obal": 1327, - "\\nF": 1328, - "Ġland": 1329, - 
"ĠHere": 1330, - "Ġbusinesses": 1331, - "ĠEn": 1332, - "pportun": 1333, - "Ġmeas": 1334, - "Ġreturn": 1335, - "Ġdig": 1336, - "Ġhist": 1337, - "yth": 1338, - "Ġcent": 1339, - "Ġable": 1340, - "Ġwithout": 1341, - "yc": 1342, - "plain": 1343, - "Ġrelations": 1344, - "Ġservices": 1345, - "-c": 1346, - "Ġtest": 1347, - "arth": 1348, - "Ġcommunication": 1349, - "Ġintern": 1350, - "new": 1351, - "Ġsit": 1352, - "Ġinvest": 1353, - "Ġcaus": 1354, - "Ġunt": 1355, - "Ġfriends": 1356, - "Ġchanges": 1357, - "cri": 1358, - "dit": 1359, - "ĠBy": 1360, - "ĠYou": 1361, - "Ġmeans": 1362, - "Ġrese": 1363, - "ool": 1364, - "ted": 1365, - "elligence": 1366, - "ains": 1367, - "pping": 1368, - "Ġbel": 1369, - "Ġrepresent": 1370, - "Ġhapp": 1371, - "Ġser": 1372, - "Ġperformance": 1373, - "Ġopportun": 1374, - "Ġtemper": 1375, - "ĠShe": 1376, - "Ġfu": 1377, - "ix": 1378, - "bot": 1379, - "Ġwrit": 1380, - "Ġbehavi": 1381, - "Ġproject": 1382, - "ĠWith": 1383, - "ivers": 1384, - "day": 1385, - "Ġphysical": 1386, - "izing": 1387, - "Ġactiv": 1388, - "Ġwithin": 1389, - "Ġinterest": 1390, - "olution": 1391, - "wards": 1392, - "ffic": 1393, - "Ġquick": 1394, - "Ġpublic": 1395, - "Ġgrowth": 1396, - "Ġcho": 1397, - "Ġrelationship": 1398, - "Ġuntil": 1399, - "Ġhelps": 1400, - "Ġstudents": 1401, - "Ġfiel": 1402, - "imes": 1403, - "ulation": 1404, - "ibility": 1405, - "elf": 1406, - "Ġful": 1407, - "Ġsub": 1408, - "ank": 1409, - "ides": 1410, - "Ġskills": 1411, - "Ġclimate": 1412, - "Given": 1413, - "Ġpar": 1414, - "Ġclear": 1415, - "irt": 1416, - "Name": 1417, - "Ġpresent": 1418, - "Ġtri": 1419, - "Ġchalleng": 1420, - "ream": 1421, - "Ġlay": 1422, - "Ġmarketing": 1423, - "Ġsummary": 1424, - "Ġchild": 1425, - "Ġsaf": 1426, - "Ġsure": 1427, - "Ġsame": 1428, - "Ġmu": 1429, - "Ġemail": 1430, - "bon": 1431, - "Ġsomet": 1432, - "```\\": 1433, - "Ġcurrent": 1434, - "amp": 1435, - "ences": 1436, - "ĠRe": 1437, - "Ġtransport": 1438, - "me": 1439, - "-p": 1440, - "action": 1441, - "ĠEx": 1442, - "Ġyears": 1443, - "Ġcomb": 1444, - "hor": 1445, - "anced": 1446, - "ty": 1447, - "Ġlove": 1448, - "Ġgreen": 1449, - "Ġpopular": 1450, - "Ġless": 1451, - "Ġdra": 1452, - "Ġcontrol": 1453, - "Ġaff": 1454, - "Ġconsum": 1455, - "Ġgame": 1456, - "ental": 1457, - "ights": 1458, - "arget": 1459, - "omes": 1460, - "ox": 1461, - "icult": 1462, - "erc": 1463, - "Ġgoals": 1464, - "ancial": 1465, - "tle": 1466, - "Ġgovern": 1467, - "Ġnumbers": 1468, - "Ġfive": 1469, - "Ġstand": 1470, - "Ġsearch": 1471, - "Ġefficient": 1472, - "Ġwal": 1473, - "Ġname": 1474, - "ath": 1475, - "Ġheart": 1476, - "Ġduring": 1477, - "rect": 1478, - "Ġoverall": 1479, - "ython": 1480, - "Ġallows": 1481, - "Ġcity": 1482, - "ave": 1483, - "vant": 1484, - "aterial": 1485, - "Ġwide": 1486, - "Ġmus": 1487, - "ificial": 1488, - "Ġhard": 1489, - "ĠTh": 1490, - "oose": 1491, - "Ġglobal": 1492, - "aj": 1493, - "Ġter": 1494, - "Ġdifficult": 1495, - "Ġline": 1496, - "ĠAl": 1497, - "care": 1498, - "ived": 1499, - "Ġregular": 1500, - "Ġgr": 1501, - "),": 1502, - "lement": 1503, - "Ġhim": 1504, - "Ġunique": 1505, - "Ġenjoy": 1506, - "Ġmeaning": 1507, - "Ġopen": 1508, - "Ġi": 1509, - "abor": 1510, - "Ġarea": 1511, - "Ġitems": 1512, - "Ġclean": 1513, - "ditionally": 1514, - "oid": 1515, - "ĠWe": 1516, - "Ġbeaut": 1517, - "Ġmeet": 1518, - "iple": 1519, - "Ġstatement": 1520, - "Ġagain": 1521, - "ysis": 1522, - "Ġfac": 1523, - "Ġsources": 1524, - "Ġbody": 1525, - "Ġalgorithms": 1526, - "Ġaudience": 1527, - "Ġwant": 1528, - "Ġlog": 1529, - "Ġmaintain": 1530, - "Ġactivities": 1531, - "Ġmove": 1532, - 
"Ġcult": 1533, - "oney": 1534, - "Ġtarget": 1535, - "\\nB": 1536, - "Ġmaterial": 1537, - "Ġcreating": 1538, - "Ġstructure": 1539, - "atform": 1540, - "ext": 1541, - "Ġexperien": 1542, - "Ġvalues": 1543, - "ead": 1544, - "ohn": 1545, - "Ġhealthy": 1546, - "ross": 1547, - "Ġinteg": 1548, - "Ġresearch": 1549, - "atch": 1550, - "ooking": 1551, - "Ġrole": 1552, - "Ġprovides": 1553, - "iety": 1554, - "ists": 1555, - "Ġfinancial": 1556, - "ories": 1557, - "dent": 1558, - "Ġer": 1559, - "Ġarticle": 1560, - "Ġelements": 1561, - "Ġaddress": 1562, - "Ġconn": 1563, - "ĠUse": 1564, - "mp": 1565, - "Ġeasy": 1566, - "Ġneg": 1567, - "Ġcolor": 1568, - "Ġcalcul": 1569, - "Explain": 1570, - "ĠPl": 1571, - "pect": 1572, - "ince": 1573, - "ale": 1574, - "Ġrisk": 1575, - "curity": 1576, - "ert": 1577, - "Ġfeed": 1578, - "Ġevent": 1579, - "vers": 1580, - "ples": 1581, - "Ġlevels": 1582, - "Ġbi": 1583, - "Ġstay": 1584, - "Ġplatform": 1585, - "Ġbreak": 1586, - "back": 1587, - "Ġsat": 1588, - "\\nOverall": 1589, - "Ġeducation": 1590, - "\\nC": 1591, - "Ġcarbon": 1592, - "--------": 1593, - "ape": 1594, - "Ġprevent": 1595, - "Ġaddition": 1596, - "Ġstress": 1597, - "ral": 1598, - "ource": 1599, - "rus": 1600, - "Ġcome": 1601, - "Ġrecogn": 1602, - "ĠUnited": 1603, - "Ġproper": 1604, - "Ġpoll": 1605, - "dentify": 1606, - "Ġunderstanding": 1607, - "Ġdecisions": 1608, - "ict": 1609, - "Ġdire": 1610, - "Ġbehavior": 1611, - "Ġ*": 1612, - "\\nI": 1613, - "Ġmess": 1614, - "Ġanimals": 1615, - "Ġsl": 1616, - "Ġwind": 1617, - "Ġbas": 1618, - "Ġpain": 1619, - "Ġleading": 1620, - "ern": 1621, - "ger": 1622, - "Ġpres": 1623, - "Ġthough": 1624, - "Ġinteract": 1625, - "yle": 1626, - "Ġdoes": 1627, - "Ġhead": 1628, - "Ġintelligence": 1629, - "orts": 1630, - "Ġbecome": 1631, - "Ġrun": 1632, - "aring": 1633, - "Ġimplement": 1634, - "Ġaction": 1635, - "oot": 1636, - "terns": 1637, - "Ġprotect": 1638, - "eric": 1639, - "Ġflow": 1640, - "Ġemot": 1641, - "cessary": 1642, - "urate": 1643, - "Ġsuggest": 1644, - "Ġprogram": 1645, - "Ġphr": 1646, - "Ġhealthcare": 1647, - "ention": 1648, - "Ġsust": 1649, - "Ġwhy": 1650, - "Ġaccurate": 1651, - "lu": 1652, - "Ġhig": 1653, - "Ġreach": 1654, - "Ġallowing": 1655, - "Ġtravel": 1656, - "Ġrequire": 1657, - "Ġareas": 1658, - "Ġdeep": 1659, - "He": 1660, - "Ġfew": 1661, - "Ġself": 1662, - "oun": 1663, - "Ġ#": 1664, - "osp": 1665, - "str": 1666, - "Ġminut": 1667, - "Ġdecision": 1668, - "ĠThere": 1669, - "ances": 1670, - "Ġquality": 1671, - "Ġavail": 1672, - "Ġspace": 1673, - "Ġsomething": 1674, - "Ġweb": 1675, - "Ġpatterns": 1676, - "Ġmot": 1677, - "oring": 1678, - "isf": 1679, - "Ġanother": 1680, - "Ġaccount": 1681, - "\\nW": 1682, - "uss": 1683, - "Ġmaj": 1684, - "uation": 1685, - "Ġsustain": 1686, - "Ġautom": 1687, - "iques": 1688, - "issions": 1689, - "verse": 1690, - "Ġconcept": 1691, - "Ġsecurity": 1692, - "Ġthose": 1693, - "Ġprofess": 1694, - "Ġshort": 1695, - "Ġnight": 1696, - "ength": 1697, - "apt": 1698, - "ex": 1699, - "ĠAdditionally": 1700, - "Ġtaking": 1701, - "Ġtoo": 1702, - "agn": 1703, - "Ġsimple": 1704, - "lusion": 1705, - "iency": 1706, - "ash": 1707, - "ours": 1708, - "Ġpa": 1709, - "Ġlit": 1710, - "ĠSp": 1711, - "iting": 1712, - "Ġdon": 1713, - "Ġlim": 1714, - "lish": 1715, - "mat": 1716, - "aves": 1717, - "ledge": 1718, - "ditional": 1719, - "inc": 1720, - "Ġevents": 1721, - "Ġoffer": 1722, - "thing": 1723, - "Ġworking": 1724, - "Ġanalysis": 1725, - "Ġachieve": 1726, - "Ġpie": 1727, - "Ġbook": 1728, - "Ġfre": 1729, - "Ġmuch": 1730, - "oon": 1731, - "Ġtry": 1732, - "esp": 1733, - 
"Ġwaste": 1734, - "face": 1735, - "Ġear": 1736, - "Ġfru": 1737, - "Ġtransportation": 1738, - "chool": 1739, - "Ġtechniques": 1740, - "Ġprogramm": 1741, - "ĠEarth": 1742, - "Ġpredict": 1743, - "Ġnever": 1744, - "ws": 1745, - "ument": 1746, - "imately": 1747, - "ared": 1748, - "Ġparticular": 1749, - "Ġtowards": 1750, - "Ġeconomic": 1751, - "Ġincreasing": 1752, - "Ġfast": 1753, - "iment": 1754, - "Ġnetwork": 1755, - "Ġcorrect": 1756, - "Ġmight": 1757, - "Ġoc": 1758, - "Ġbecause": 1759, - "ĠWh": 1760, - "az": 1761, - "play": 1762, - "Ġresults": 1763, - "Ġmanagement": 1764, - "Ġpurch": 1765, - "Ġsound": 1766, - "Ġpast": 1767, - "Ġtraining": 1768, - "____": 1769, - "ope": 1770, - "Ġengage": 1771, - "ourage": 1772, - "Ġsense": 1773, - "Ġfree": 1774, - "Ġpref": 1775, - "ees": 1776, - "Ġcountries": 1777, - "ney": 1778, - "anies": 1779, - "Ġafter": 1780, - "Ġmind": 1781, - "Ġexc": 1782, - "ĠOnce": 1783, - "ĠĠĠĠĠĠĠĠĠĠĠ": 1784, - "Ġcomplete": 1785, - "Ġimm": 1786, - "Ġest": 1787, - "Ġgenerate": 1788, - "verb": 1789, - "ĠDe": 1790, - "'m": 1791, - "Ġtools": 1792, - "redients": 1793, - "Ġmajor": 1794, - "ently": 1795, - "Ġcontribut": 1796, - "leep": 1797, - "Ġpoints": 1798, - "ditions": 1799, - "Ġfactors": 1800, - "Ġel": 1801, - "Ġnext": 1802, - "ium": 1803, - "oud": 1804, - "Ġcru": 1805, - "Ġreas": 1806, - "riate": 1807, - "ĠInd": 1808, - "Ġpromot": 1809, - "Ġhistory": 1810, - "Ġjour": 1811, - "Ġdue": 1812, - "Con": 1813, - "Ġveget": 1814, - "ency": 1815, - "ĠAmeric": 1816, - "Ġfra": 1817, - "Ġdifference": 1818, - "oard": 1819, - "lex": 1820, - "Ġequation": 1821, - "irtual": 1822, - "Ġcup": 1823, - "Ġforest": 1824, - "Ġnegative": 1825, - "Ġsecon": 1826, - "ones": 1827, - "Ġnature": 1828, - "Ġuses": 1829, - "ah": 1830, - "por": 1831, - "Ġsec": 1832, - "ording": 1833, - "Ġlast": 1834, - "ĠSome": 1835, - "Ġissues": 1836, - "Ġscient": 1837, - "Ġprint": 1838, - "ĠStates": 1839, - "over": 1840, - "Ġsatisf": 1841, - "Ġdevices": 1842, - "Ġdise": 1843, - "Ġtemperature": 1844, - "Ġfeedback": 1845, - "Ġnecessary": 1846, - "Ġemissions": 1847, - "mb": 1848, - "Ġlow": 1849, - "for": 1850, - "tal": 1851, - "Ġchallenges": 1852, - "Ġarray": 1853, - "Ġside": 1854, - "Ġengine": 1855, - "Ġboo": 1856, - "ata": 1857, - "Ġbelie": 1858, - "-m": 1859, - "Ġmultiple": 1860, - "Ġsing": 1861, - "Ġgovernment": 1862, - "ames": 1863, - "ified": 1864, - "Ġminutes": 1865, - "Ġsuccessful": 1866, - "Ġmoney": 1867, - "Ġquickly": 1868, - "Ġbir": 1869, - "Ġtypically": 1870, - "Ġpost": 1871, - "Ġprep": 1872, - "Ġknowledge": 1873, - "pped": 1874, - "actions": 1875, - "Ġmethods": 1876, - "Ġoptim": 1877, - "\\nP": 1878, - "Ġoutput": 1879, - "Ġfield": 1880, - "Ġtable": 1881, - "Ġbal": 1882, - "Ġcoll": 1883, - "Ġcharacters": 1884, - "volution": 1885, - "ords": 1886, - "ilar": 1887, - "ification": 1888, - "ane": 1889, - "Ġcell": 1890, - "Ġmil": 1891, - "ĠWhat": 1892, - "Ġsqu": 1893, - "Ġlives": 1894, - "ĠAr": 1895, - "Ġphrase": 1896, - "Ġnut": 1897, - "Ġdigital": 1898, - "Ġinternet": 1899, - "lass": 1900, - "ura": 1901, - "ommend": 1902, - "Ġtreat": 1903, - "Ġapprop": 1904, - "resh": 1905, - "urther": 1906, - "ĠOne": 1907, - "Ġvisual": 1908, - "ategor": 1909, - "Ġapproach": 1910, - "Ġcertain": 1911, - "Ġsho": 1912, - "val": 1913, - "Ġtask": 1914, - "ires": 1915, - "Ġappropriate": 1916, - "Ġvie": 1917, - "Ġdesigned": 1918, - "pose": 1919, - "**:": 1920, - "fort": 1921, - "Ġ|\\": 1922, - "Ġapplications": 1923, - "Ġpay": 1924, - "Ġnow": 1925, - "Ġheat": 1926, - "Ġindustry": 1927, - "pre": 1928, - "Ġeffectively": 1929, - "Ġpopulation": 1930, - 
"Ġopportunities": 1931, - " \\", - "Ġens ure", - "os s", - "ub lic", - "Ġit em", - "H ere", - "in ation", - "Ġde f", - "Des cribe", - "ion al", - "rou p", - "Ġcon f", - "Ġneed s", - "Ġcharact er", - "Ġvari ous", - "Ġle t", - "Ġapp lic", - "a ut", - "Ġj ob", - "ell ig", - "ĠC on", - "Ġb est", - "Ġf ore", - "Ġam ount", - "ro p", - "Ġbu ild", - "iqu e", - "ag ing", - "Ġem ploy", - "Ġre st", - "a ir", - "W hat", - "Ġto get", - "Ġway s", - "Ġident ify", - "Ġtoget her", - "Ġre al", - "Ġus ers", - "Ġme an", - "as ing", - "ĠA m", - "Ġed uc", - "Ġalgorith m", - "Ġn etw", - "Ġc ode", - "W rite", - "o v", - "- d", - "ou ra", - "ĠHow ever", - "ut ure", - "vie w", - "Ġin du", - "Ġproduct s", - "ect ed", - "er tain", - "; \\", - "ĠA s", - "p r", - "ast e", - "Ġo per", - "Ġ $", - "av i", - "sel f", - "Ġ <", - "Ġindu st", - "Ġg u", - "Ġother s", - "E x", - "i an", - "Ġ\" \\\"", - "- f", - "n ces", - "Ġf il", - "Ġresp ons", - "ro l", - "Ġc ap", - "Ġbe fore", - "ver n", - "Ġcomple x", - "l us", - "rib ut", - "at s", - "Ġpos itive", - "o h", - "Ġl o", - "Ġg roup", - "Ġf ound", - "e e", - "og n", - "Ġs w", - "Ġindividual s", - "Ġp ract", - "Ġen c", - "Ġsh are", - "ra ph", - "Ġr ange", - "Ġsu n", - "\\ t", - "Ġprovid ing", - "ic le", - "Ġde m", - "Ġpl ace", - "Ġa ud", - "j oy", - "Ġm ust", - "el s", - "er y", - "O ne", - "Ġfam ily", - "Ġf uture", - "l ess", - "re nt", - "Ġproble m", - "Ġess ential", - "ro du", - "i red", - "Ġredu cing", - "is m", - "Ġw arm", - "ra y", - "Ġab ility", - "Ġstr ong", - "Ġal ways", - "Ġres ources", - "Ġbenef its", - "Ġstr ateg", - "Ġinvol ves", - "Ġass ist", - "ere st", - "n A", - "ress ion", - "Ġ [", - "il ities", - "Ġstep s", - "ver all", - "Ġsh ow", - "ob al", - "\\n F", - "Ġl and", - "ĠH ere", - "Ġbusiness es", - "ĠE n", - "pport un", - "Ġme as", - "Ġret urn", - "Ġd ig", - "Ġh ist", - "y th", - "Ġc ent", - "Ġab le", - "Ġwith out", - "y c", - "pl ain", - "Ġrel ations", - "Ġserv ices", - "- c", - "Ġt est", - "ar th", - "Ġcommunic ation", - "Ġinter n", - "ne w", - "Ġs it", - "Ġinv est", - "Ġca us", - "Ġu nt", - "Ġfriend s", - "Ġchang es", - "c ri", - "d it", - "ĠB y", - "ĠY ou", - "Ġme ans", - "Ġre se", - "o ol", - "t ed", - "ellig ence", - "ain s", - "pp ing", - "Ġbe l", - "Ġrep resent", - "Ġha pp", - "Ġs er", - "Ġperform ance", - "Ġo pportun", - "Ġtem per", - "ĠS he", - "Ġf u", - "i x", - "b ot", - "Ġw rit", - "Ġbeh avi", - "Ġpro ject", - "ĠW ith", - "iv ers", - "d ay", - "Ġphys ical", - "iz ing", - "Ġact iv", - "Ġwith in", - "Ġint erest", - "ol ution", - "ward s", - "ff ic", - "Ġqu ick", - "Ġp ublic", - "Ġgrow th", - "Ġch o", - "Ġrelations hip", - "Ġunt il", - "Ġhelp s", - "Ġstud ents", - "Ġfi el", - "im es", - "ul ation", - "ib ility", - "el f", - "Ġf ul", - "Ġsu b", - "an k", - "id es", - "Ġsk ills", - "Ġcl imate", - "G iven", - "Ġp ar", - "Ġcle ar", - "ir t", - "N ame", - "Ġp resent", - "Ġt ri", - "Ġchall eng", - "re am", - "Ġl ay", - "Ġmarket ing", - "Ġsumm ary", - "Ġch ild", - "Ġsa f", - "Ġsu re", - "Ġs ame", - "Ġm u", - "Ġem ail", - "b on", - "Ġs omet", - "``` \\", - "Ġcur rent", - "am p", - "en ces", - "ĠR e", - "Ġtrans port", - "m e", - "- p", - "a ction", - "ĠE x", - "Ġyear s", - "Ġcom b", - "h or", - "anc ed", - "t y", - "Ġl ove", - "Ġg reen", - "Ġpop ular", - "Ġl ess", - "Ġd ra", - "Ġcont rol", - "Ġa ff", - "Ġcons um", - "Ġg ame", - "ent al", - "ight s", - "ar get", - "om es", - "o x", - "ic ult", - "er c", - "Ġgo als", - "anc ial", - "t le", - "Ġgo vern", - "Ġnum bers", - "Ġf ive", - "Ġst and", - "Ġse arch", - "Ġeffic ient", - "Ġw al", - "Ġn ame", - "at h", - "Ġhe 
art", - "Ġd uring", - "re ct", - "Ġover all", - "yth on", - "Ġallow s", - "Ġc ity", - "a ve", - "v ant", - "ater ial", - "Ġw ide", - "Ġm us", - "ific ial", - "Ġh ard", - "ĠT h", - "oo se", - "Ġgl obal", - "a j", - "Ġt er", - "Ġdiff icult", - "Ġl ine", - "ĠA l", - "c are", - "iv ed", - "Ġreg ular", - "Ġg r", - ") ,", - "le ment", - "Ġh im", - "Ġun ique", - "Ġen joy", - "Ġmean ing", - "Ġop en", - "Ġ i", - "ab or", - "Ġare a", - "Ġitem s", - "Ġcle an", - "dition ally", - "o id", - "ĠW e", - "Ġbe aut", - "Ġme et", - "ip le", - "Ġstate ment", - "Ġag ain", - "ys is", - "Ġf ac", - "Ġs ources", - "Ġb ody", - "Ġalgorith ms", - "Ġaud ience", - "Ġw ant", - "Ġl og", - "Ġmain tain", - "Ġactiv ities", - "Ġmo ve", - "Ġc ult", - "one y", - "Ġt arget", - "\\n B", - "Ġm aterial", - "Ġcreat ing", - "Ġstru cture", - "at form", - "e xt", - "Ġexper ien", - "Ġval ues", - "e ad", - "oh n", - "Ġhealth y", - "ro ss", - "Ġint eg", - "Ġrese arch", - "at ch", - "oo king", - "Ġro le", - "Ġprovid es", - "i ety", - "ist s", - "Ġfin ancial", - "or ies", - "d ent", - "Ġ er", - "Ġart icle", - "Ġele ments", - "Ġadd ress", - "Ġcon n", - "ĠU se", - "m p", - "Ġeas y", - "Ġne g", - "Ġcol or", - "Ġcal cul", - "Ex plain", - "ĠP l", - "p ect", - "in ce", - "al e", - "Ġris k", - "cur ity", - "er t", - "Ġfe ed", - "Ġev ent", - "v ers", - "pl es", - "Ġlevel s", - "Ġb i", - "Ġst ay", - "Ġpl atform", - "Ġbre ak", - "b ack", - "Ġs at", - "\\nO verall", - "Ġeduc ation", - "\\n C", - "Ġcar bon", - "---- ----", - "ap e", - "Ġpre vent", - "Ġadd ition", - "Ġst ress", - "r al", - "our ce", - "ru s", - "Ġcom e", - "Ġrec ogn", - "ĠUn ited", - "Ġpro per", - "Ġpol l", - "dent ify", - "Ġunderstand ing", - "Ġdecis ions", - "i ct", - "Ġd ire", - "Ġbehavi or", - "Ġ *", - "\\n I", - "Ġm ess", - "Ġanim als", - "Ġs l", - "Ġw ind", - "Ġb as", - "Ġp ain", - "Ġlead ing", - "er n", - "g er", - "Ġp res", - "Ġth ough", - "Ġinter act", - "y le", - "Ġdo es", - "Ġhe ad", - "Ġint elligence", - "ort s", - "Ġbec ome", - "Ġru n", - "ar ing", - "Ġimp lement", - "Ġa ction", - "o ot", - "ter ns", - "Ġprot ect", - "er ic", - "Ġf low", - "Ġem ot", - "cess ary", - "ur ate", - "Ġsu ggest", - "Ġprogra m", - "Ġph r", - "Ġhealth care", - "ent ion", - "Ġsu st", - "Ġwh y", - "Ġacc urate", - "l u", - "Ġh ig", - "Ġre ach", - "Ġallow ing", - "Ġtra vel", - "Ġrequ ire", - "Ġare as", - "Ġde ep", - "H e", - "Ġfe w", - "Ġs elf", - "ou n", - "Ġ #", - "os p", - "st r", - "Ġmin ut", - "Ġdecis ion", - "ĠThe re", - "an ces", - "Ġqu ality", - "Ġav ail", - "Ġsp ace", - "Ġsomet hing", - "Ġwe b", - "Ġpat terns", - "Ġm ot", - "or ing", - "is f", - "Ġan other", - "Ġacc ount", - "\\n W", - "us s", - "Ġm aj", - "u ation", - "Ġsust ain", - "Ġaut om", - "iqu es", - "iss ions", - "ver se", - "Ġcon cept", - "Ġse curity", - "Ġth ose", - "Ġprof ess", - "Ġsh ort", - "Ġn ight", - "eng th", - "a pt", - "e x", - "ĠAd ditionally", - "Ġt aking", - "Ġto o", - "ag n", - "Ġsim ple", - "lus ion", - "ien cy", - "as h", - "our s", - "Ġp a", - "Ġl it", - "ĠS p", - "it ing", - "Ġd on", - "Ġl im", - "l ish", - "m at", - "av es", - "led ge", - "dition al", - "in c", - "Ġev ents", - "Ġoff er", - "th ing", - "Ġwor king", - "Ġanal ysis", - "Ġachie ve", - "Ġp ie", - "Ġb ook", - "Ġf re", - "Ġmu ch", - "o on", - "Ġt ry", - "es p", - "Ġw aste", - "f ace", - "Ġe ar", - "Ġf ru", - "Ġtransport ation", - "ch ool", - "Ġtechn iques", - "Ġprogra mm", - "ĠE arth", - "Ġpredi ct", - "Ġne ver", - "w s", - "u ment", - "imate ly", - "are d", - "Ġpartic ular", - "Ġto wards", - "Ġeconom ic", - "Ġincre asing", - "Ġf ast", - "im ent", - "Ġnetw 
ork", - "Ġcor rect", - "Ġm ight", - "Ġo c", - "Ġbec ause", - "ĠW h", - "a z", - "pl ay", - "Ġresult s", - "Ġmanage ment", - "Ġpur ch", - "Ġs ound", - "Ġp ast", - "Ġtra ining", - "__ __", - "op e", - "Ġeng age", - "oura ge", - "Ġs ense", - "Ġf ree", - "Ġpre f", - "e es", - "Ġcount ries", - "ne y", - "an ies", - "Ġa fter", - "Ġm ind", - "Ġex c", - "ĠO nce", - "ĠĠĠĠ ĠĠĠĠĠĠĠ", - "Ġcomple te", - "Ġim m", - "Ġ est", - "Ġg enerate", - "ver b", - "ĠD e", - "' m", - "Ġtool s", - "redi ents", - "Ġmaj or", - "ent ly", - "Ġcont ribut", - "le ep", - "Ġpoint s", - "dit ions", - "Ġfact ors", - "Ġe l", - "Ġne xt", - "i um", - "ou d", - "Ġc ru", - "Ġre as", - "ri ate", - "ĠI nd", - "Ġprom ot", - "Ġhist ory", - "Ġj our", - "Ġd ue", - "C on", - "Ġve get", - "en cy", - "ĠAm eric", - "Ġf ra", - "Ġdiffere nce", - "o ard", - "le x", - "Ġequ ation", - "irt ual", - "Ġc up", - "Ġfore st", - "Ġneg ative", - "Ġse con", - "on es", - "Ġn ature", - "Ġus es", - "a h", - "p or", - "Ġse c", - "ord ing", - "Ġl ast", - "ĠS ome", - "Ġiss ues", - "Ġsc ient", - "Ġpr int", - "ĠSt ates", - "o ver", - "Ġsat isf", - "Ġdev ices", - "Ġdis e", - "Ġtemper ature", - "Ġfeed back", - "Ġne cessary", - "Ġem issions", - "m b", - "Ġl ow", - "f or", - "t al", - "Ġchalleng es", - "Ġar ray", - "Ġs ide", - "Ġeng ine", - "Ġb oo", - "at a", - "Ġbel ie", - "- m", - "Ġmult iple", - "Ġs ing", - "Ġgovern ment", - "am es", - "if ied", - "Ġminut es", - "Ġsuccess ful", - "Ġm oney", - "Ġquick ly", - "Ġb ir", - "Ġtyp ically", - "Ġp ost", - "Ġpre p", - "Ġknow ledge", - "pp ed", - "a ctions", - "Ġmethod s", - "Ġopt im", - "\\n P", - "Ġout put", - "Ġfiel d", - "Ġt able", - "Ġb al", - "Ġcol l", - "Ġcharact ers", - "v olution", - "or ds", - "il ar", - "ific ation", - "an e", - "Ġc ell", - "Ġm il", - "ĠW hat", - "Ġs qu", - "Ġl ives", - "ĠA r", - "Ġphr ase", - "Ġn ut", - "Ġdig ital", - "Ġintern et", - "l ass", - "u ra", - "omm end", - "Ġt reat", - "Ġappro p", - "res h", - "ur ther", - "ĠO ne", - "Ġvis ual", - "ate gor", - "Ġappro ach", - "Ġc ertain", - "Ġsh o", - "v al", - "Ġtas k", - "i res", - "Ġapprop riate", - "Ġv ie", - "Ġdesign ed", - "p ose", - "** :", - "f ort", - "Ġ| \\", - "Ġapplic ations", - "Ġp ay", - "Ġn ow", - "Ġhe at", - "Ġindust ry", - "p re", - "Ġeffective ly", - "Ġpop ulation", - "Ġopportun ities", - "< /", - "ĠT o", - "Ġup d", - "Ġinclud es", - "ĠE ng", - "Ġtyp es", - "Ġup on", - "Ġconsid er", - "le t", - "Ġg en", - "og raph", - "pl ace", - "Ġt imes", - "Ġar g", - "C omp", - "ĠG o", - "Ġre ce", - "Ġchild ren", - "Ġtra ck", - "Ġsome one", - "w ord", - "Ġyou ng", - "Ġcon ditions", - "Ġtra ditional", - "Ġmodel s", - "I dentify", - "Ġc amp", - "Ġm akes", - "ist ic", - "Ġar r", - "Ġc ard", - "ut ions", - "l t", - "Ġo ld", - "Ġide as", - "Ġe y", - "Ġt ree", - "Ġiss ue", - "Ġh arm", - "Ġavail able", - "Ġc r", - "Ġpower ful", - "n ov", - "Ġmo vie", - "Ġwe ather", - "Ġsk y", - "Ġquest ions", - "e et", - "Ġact ivity", - "Ġbra nd", - "is hed", - "Ġanaly ze", - "ĠS h", - "Ġen h", - "av or", - "Ġbe g", - "Ġs chool", - "i ate", - "Ġeas ier", - "Ġinf lu", - "Ġn on", - "Ġstud y", - "Ġl ook", - "Ġsol ution", - "Ġle g", - "Ġcon st", - "H ow", - "Ġcomp et" - ] - } -} 
+{"version":"1.0","truncation":null,"padding":null,"added_tokens":[{"id":2000,"content":"<|endoftext|>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},{"id":2001,"content":"<|im_start|>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true},{"id":2002,"content":"<|im_end|>","single_word":false,"lstrip":false,"rstrip":false,"normalized":false,"special":true}],"normalizer":{"type":"NFC"},"pre_tokenizer":{"type":"Sequence","pretokenizers":[{"type":"Split","pattern":{"Regex":"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"},"behavior":"Isolated","invert":false},{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":false,"use_regex":false}]},"post_processor":{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":false,"use_regex":false},"decoder":{"type":"ByteLevel","add_prefix_space":false,"trim_offsets":false,"use_regex":false},"model":{"type":"BPE","dropout":null,"unk_token":null,"continuing_subword_prefix":null,"end_of_word_suffix":null,"fuse_unk":false,"byte_fallback":false,"ignore_merges":false,"vocab":{"0":15,"1":16,"2":17,"3":18,"4":19,"5":20,"6":21,"7":22,"8":23,"9":24,"!":0,"\"":1,"#":2,"$":3,"%":4,"&":5,"'":6,"(":7,")":8,"*":9,"+":10,",":11,"-":12,".":13,"/":14,":":25,";":26,"<":27,"=":28,">":29,"?":30,"@":31,"A":32,"B":33,"C":34,"D":35,"E":36,"F":37,"G":38,"H":39,"I":40,"J":41,"K":42,"L":43,"M":44,"N":45,"O":46,"P":47,"Q":48,"R":49,"S":50,"T":51,"U":52,"V":53,"W":54,"X":55,"Y":56,"Z":57,"[":58,"\\":59,"]":60,"^":61,"_":62,"`":63,"a":64,"b":65,"c":66,"d":67,"e":68,"f":69,"g":70,"h":71,"i":72,"j":73,"k":74,"l":75,"m":76,"n":77,"o":78,"p":79,"q":80,"r":81,"s":82,"t":83,"u":84,"v":85,"w":86,"x":87,"y":88,"z":89,"{":90,"|":91,"}":92,"~":93,"Ċ":94,"Ġ":95,"ĠĠ":96,"Ġt":97,"Ġa":98,"in":99,"he":100,"re":101,"on":102,"Ġthe":103,"Ġs":104,"er":105,"at":106,"Ġc":107,"ĠĠĠĠ":108,"en":109,"Ġo":110,"Ġ\"":111,"nd":112,"es":113,"ing":114,"ĠĠĠ":115,"it":116,"Ġp":117,"or":118,"ou":119,"Ġand":120,"Ġw":121,"is":122,"Ġf":123,"an":124,"ion":125,"al":126,"Ġb":127,"Ġto":128,"Ġm":129,"Ġin":130,"Ġof":131,"le":132,"ct":133,"ar":134,"ut":135,"Ġd":136,"st":137,"ed":138,"ĠĠĠĠĠĠĠ":139,"ic":140,"\":":141,",Ċ":142,"ro":143,"ent":144,"\\n":145,"Ġe":146,"put":147,"om":148,"Ġre":149,"as":150,"ve":151,"Ġh":152,"Ġth":153,"\",Ċ":154,"Ġl":155,"Ġis":156,"et":157,"ce":158,"Ġn":159,".\\":160,"im":161,"il":162,"Ġg":163,"Ġu":164,"ction":165,"ru":166,"ation":167,"ol":168,"ch":169,"ĠT":170,"Ġfor":171,"out":172,"ra":173,"ow":174,"id":175,"ly":176,"Ġst":177,"Ġbe":178,"Ġy":179,"Ġpro":180,"ig":181,"se":182,"ate":183,"Ġthat":184,"ith":185,"ir":186,"ur":187,"ot":188,"Ġor":189,"Ġon":190,"Ġyou":191,"ers":192,"stru":193,"Ġan":194,"if":195,"ul":196,"struction":197,"Ġ{":198,"Ġ}":199,"Ġcan":200,"input":201,"output":202,"instruction":203,"Ġ{Ċ":204,"Ġ},Ċ":205,"\"Ċ":206,"Ġhe":207,"Ġcon":208,"Ġit":209,"ay":210,"ess":211,"Ġwith":212,"ver":213,"el":214,"Ġas":215,"am":216,"ĠA":217,"ge":218,"Ġsu":219,"iv":220,".\",Ċ":221,"Ġcom":222,"ĠI":223,"ment":224,"ak":225,"Ġal":226,"\\\"":227,".\"Ċ":228,"ive":229,"Ġare":230,"ab":231,"ad":232,"Ġmo":233,"Ġex":234,"Ġv":235,"ĠS":236,"res":237,"pp":238,"qu":239,"Ġde":240,"Ġwh":241,"ity":242,"Ġen":243,"ĠThe":244,"her":245,"ld":246,"ri":247,"ter":248,"ant":249,"ĠC":250,"ist":251,"Ġ\"\",Ċ":252,"um":253,"Ġus":254,"Ġne":255,"ain":256,"th":257,"ect":258,"Ġle":259,"op":260,"em":261,"ies":262,"Ġch":263,"Ġim":264,"du":265,"od":266,"ort":267,"nt":268,"est":269,"igh":270,"
ere":271,"Ġha":272,"us":273,"ure":274,"ial":275,"oc":276,"Ġwor":277,"Ġtheir":278,"ac":279,"ence":280,"iz":281,"Ġyour":282,"os":283,"Ġimp":284,"ud":285,"Ġby":286,"Ġse":287,"ine":288,"ould":289,"low":290,"ill":291,"age":292,"rom":293,"Ġsp":294,"ĠP":295,"Ġsh":296,"ust":297,"The":298,"un":299,"'s":300,"Ġinc":301,"ide":302,"pl":303,"ight":304,"og":305,"Ġpl":306,"pt":307,"are":308,"Ġte":309,"Ġint":310,"Ġ\\":311,"his":312,"Ġr":313,"ake":314,"per":315,"orm":316,"ag":317,"ff":318,"ĠE":319,"art":320,"Ġk":321,"end":322,"ĠM":323,"Ġwe":324,"ĠB":325,"Ġad":326,"cess":327,"rou":328,"ical":329,"all":330,"able":331,"Ġfrom":332,"and":333,"ĠH":334,"Ġab":335,"act":336,"Ġcomp":337,"ome":338,"ach":339,"ĠThis":340,"Ġhave":341,"form":342,"Ġ\\\"":343,"ast":344,"Ġat":345,"ĠW":346,"Ġres":347,"Ġdat":348,":\\":349,"ther":350,"ions":351,"ore":352,"Ġ(":353,"Ġcont":354,"our":355,"ep":356,"ĠF":357,"Ġac":358,"ance":359,"ĠR":360,"gh":361,"Ġme":362,"ces":363,"Ġwas":364,"ind":365,"vel":366,"ations":367,"Ġhel":368,"Ġmore":369,"ult":370,"ĠD":371,"reat":372,"ign":373,"Ġhelp":374,"ime":375,"ard":376,"Ġcl":377,"Ġapp":378,"ans":379,"ie":380,"Ġdata":381,"ich":382,"ang":383,"ous":384,"ell":385,"ks":386,"ase":387,"ice":388,"ip":389,"ite":390,"Ġsuch":391,"Ġfe":392,"Ġwhe":393,"ib":394,"Ġother":395,"Ġthis":396,"ass":397,"ual":398,"ile":399,"ne":400,"red":401,"Ġhas":402,"oo":403,"ress":404,"ific":405,"ning":406,"Ġ=":407,"Ġup":408,"Ġman":409,"Ġar":410,"ong":411,"ec":412,"Ġtra":413,"av":414,"Ġwhich":415,"Ġgo":416,"Ġprov":417,"Ġdis":418,"**":419,"so":420,"ĠG":421,"one":422,"Ġem":423,"Ġnot":424,"ue":425,"ĠO":426,"Ġj":427,"ace":428,"Ġthey":429,"ame":430,"Ġqu":431,"ĠL":432,"iff":433,"Ġfol":434,"ary":435,"ated":436,"ustom":437,"ition":438,"Ġits":439,"Ġsy":440,"ke":441,"ack":442,"ry":443,"--":444,"Ġtime":445,"Ġdes":446,"Ġnew":447,"ents":448,"ount":449,"Ġfollow":450,"Ġalso":451,"Ġcomm":452,"Ġout":453,"Ġeff":454,"Ġdiff":455,"iven":456,"ap":457,"Ġsent":458,"\\u":459,"Ġso":460,"Ġprodu":461,"Ġuse":462,"Ġsc":463,"Ġ-":464,"Ġun":465,"lud":466,"ĠIt":467,"ener":468,"king":469,"Ġev":470,"Ġabout":471,"Ġthem":472,"ĠU":473,"Ġcustom":474,"Ġro":475,"Ġinclud":476,"les":477,"etw":478,"stem":479,"xt":480,"Ġinto":481,"Ġper":482,"ĠIn":483,"ĠN":484,"Ġwill":485,"Ġlear":486,"ber":487,"Ġall":488,"Ġpe":489,"ds":490,"Ġtw":491,"aking":492,"ark":493,"ful":494,"Ġmake":495,"chn":496,"erv":497,"ost":498,"rough":499,"Ġone":500,"Ġinter":501,"ities":502,"ail":503,"ike":504,"ree":505,"ple":506,"alth":507,"Ġused":508,"ors":509,"Ġover":510,"ility":511,"ments":512,"ange":513,"Ġway":514,"ory":515,"Ġcol":516,"Ġpr":517,"Ġcould":518,"Ġnum":519,"reate":520,"int":521,"Ġredu":522,"erson":523,"Ġrec":524,"Ġher":525,"Ġneed":526,"ms":527,"ater":528,"oy":529,"Ġsystem":530,"Ġinform":531,"Ġtwo":532,"Ġtechn":533,"Ġsentence":534,"ience":535,"ize":536,"get":537,"Ġdiffere":538,"ood":539,"rib":540,"Ġbut":541,"Ġfollowing":542,"ased":543,"olog":544,"erg":545,"led":546,"ures":547,"In":548,"ear":549,"Ġph":550,"own":551,"Ġpre":552,"Ġwould":553,"Ġusing":554,"Ġcons":555,"Ġwork":556,"Ġmod":557,"ating":558,"ia":559,"ire":560,"Ġpos":561,"ient":562,"ob":563,"ject":564,"Ġinv":565,"ons":566,"Ġdo":567,"ular":568,"Ġdec":569,"Ġhealth":570,"Ġimpro":571,"Ġany":572,"Ġthrough":573,"yp":574,"row":575,"velop":576,"Ġprocess":577,"Ġtr":578,"lic":579,"very":580,"als":581,"ify":582,"``":583,"ari":584,"Ġstr":585,"Ġimport":586,"Ġlike":587,"Ġproduct":588,"Ġsome":589,"ph":590,"ential":591,"Ġam":592,"ates":593,"Ġacc":594,"ens":595,"ns":596,"Ġsm":597,"Ġind":598,"een":599,"Ġexper":600,"lect":601,"Ġval":602,"Ġrel":603,"its":604,"Ġinformat
ion":605,"ings":606,"ĠJ":607,"ople":608,"iness":609,"Ġgiven":610,"mm":611,"ices":612,"Ġpart":613,"ild":614,"ys":615,"Ġour":616,"nder":617,"Ġperson":618,"ally":619,"Ġke":620,"etween":621,"ft":622,"oth":623,"Ġspec":624,"Ġbetween":625,"ergy":626,"ĠAI":627,"Ġwho":628,"Ġmay":629,"ef":630,"ative":631,"ise":632,"Ġlist":633,"Ġkn":634,"Ġadd":635,",\\":636,"ord":637,"ics":638,"Ġpeople":639,"ĠSt":640,"Ġhis":641,"Ġexp":642,"ible":643,"Ġthere":644,"Ġserv":645,"Ġincre":646,"Ġdevelop":647,"ound":648,"ower":649,"Ġtrans":650,"bs":651,"Ġenergy":652,"Ġoff":653,"Ġbus":654,"Ġwhile":655,"ose":656,"Ġact":657,"Ġexam":658,"Ġlearning":659,"ctions":660,"con":661,"gor":662,"gan":663,"ution":664,"round":665,"pport":666,"Ġhow":667,"Ġbl":668,"Ġmed":669,"anc":670,"Ġtyp":671,"Ġra":672,"Ġcar":673,"ife":674,"Ġworld":675,"Ġvari":676,"Ġrep":677,"au":678,"Ġsoc":679,"Ġprovid":680,"Ġset":681,"ten":682,"Ġsol":683,"Ġeach":684,"Ġwhen":685,"Ġeffect":686,"Ġpo":687,"Ġshe":688,"ick":689,"Ġwhere":690,"Ġmodel":691,"Ġimportant":692,"Ġunder":693,"Ġprog":694,"enerate":695,"ural":696,"tain":697,"Ġass":698,"ology":699,"Ġhad":700,"ook":701,"gg":702,"Ġcustomer":703,"ting":704,"ving":705,"Ġresp":706,"line":707,"Ġcreat":708,"ll":709,"ily":710,"Ġreg":711,"Ġdet":712,"Ġif":713,"Ġ+":714,"Ġbusiness":715,"\\nIn":716,"ish":717,"Ġmost":718,"ĠĠĠĠĠĠĠĠ":719,"hes":720,"angu":721,"Ġprovide":722,"Ġadv":723,"erm":724,"ub":725,"Ġsk":726,"irst":727,"any":728,"Ġday":729,"ivid":730,"arm":731,"ract":732,"nce":733,"Ġ|":734,"Ġimprove":735,")\\":736,"Ġco":737,"Ġcommun":738,"arket":739,"Ġmet":740,"cy":741,"Ġdifferent":742,"ized":743,"Ġart":744,"\\nThe":745,"rit":746,"Ġcomput":747,"Ġform":748,"ck":749,"Ġhum":750,"Ġchar":751,"ble":752,"Ġlead":753,"iron":754,"Ġrem":755,"Ġshould":756,"te":757,"Ġallow":758,"ness":759,"hat":760,"Ġfun":761,"Ġcomple":762,"Ġlangu":763,"ages":764,"Ġbec":765,"Ġsign":766,"ues":767,"ature":768,"Ġfind":769,"riend":770,"Ġstud":771,"Ġmain":772,"imate":773,"ove":774,"Ġresult":775,"Ġplay":776,"Ġreduce":777,"Ġeng":778,"ware":779,"redi":780,"Ġnumber":781,"Ġlar":782,"Ġpol":783,"Ġpat":784,"Ġwell":785,"ident":786,"viron":787,"rite":788,"crib":789,"Ġbu":790,"Ġhigh":791,"Ġthese":792,"ives":793,"ves":794,"Ġdesign":795,"urn":796,"Ġthan":797,"der":798,"Ġanal":799,"Ġwater":800,"Ġmarket":801,"Ġexample":802,"way":803,"stand":804,"ng":805,"ax":806,"itive":807,"Ġ`":808,"iqu":809,"Ġsim":810,"Ġequ":811,"gorith":812,"Ġtext":813,"resent":814,"Ġmany":815,"uring":816,"----":817,"\\nA":818,"Ġdi":819,"Ġsa":820,"vironment":821,"arch":822,"Ġatt":823,"Ġpot":824,"Ġtas":825,"Ġcreate":826,"ough":827,"Ġfl":828,"Ġmaking":829,"ious":830,"Ġgra":831,"Ġlife":832,"\\nO":833,"Ġalgorith":834,"ality":835,"eng":836,"Ġfin":837,"uc":838,"?\",Ċ":839,"ĠY":840,"Ġret":841,"Ġbeen":842,"Ġtechnology":843,"Ġprogra":844,"Ġhand":845,"hip":846,"wn":847,"Ġcal":848,"Ġwhat":849,"ividual":850,"iss":851,"ety":852,"Ġlanguage":853,"ources":854,"Ġclass":855,"Ġtake":856,"Ġeas":857,"ric":858,"Ġvis":859,"bject":860,"Ġref":861,"Ġenvironment":862,"Ġfirst":863,"eg":864,"Ġindividual":865,"Ġplan":866,"Ġperform":867,"Ġru":868,"ien":869,"Ġimpact":870,"Ġag":871,"ade":872,"Ġcle":873,"Ġrequ":874,"dition":875,"__":876,"Ġche":877,"ption":878,"Ġappro":879,"Ġ**":880,"Ġgreat":881,"ved":882,"Ġexpl":883,"Ġgrow":884,"Generate":885,"Ġmy":886,"Ġincluding":887,"Ġaccess":888,"Ġpop":889,"Ġmin":890,"fore":891,"Ġsocial":892,"ines":893,"Ġcharact":894,"Ġbr":895,"Ġstep":896,"Ġunderstand":897,"Ġorgan":898,"ĠAd":899,"Ġdisc":900,"Ġpower":901,"Ġlong":902,"hed":903,"Ġconc":904,"ward":905,"ited":906,"Ġele":907,"cing":908,"Ġevery":909,"Ġca":910,"Ġoften
":911,"Ġuser":912,"vie":913,"ĠV":914,"Ġfood":915,"Ġinclude":916,"Ġloc":917,"ases":918,"ically":919,"ode":920,"ants":921,"Ġinvol":922,"Ġsmall":923,"Ġsur":924,"achine":925,"Ġbeing":926,"Ġpotential":927,"Ġno":928,"ĠCh":929,"Ġdep":930,"ather":931,"Ġboth":932,"Ġens":933,"Ġposs":934,"Ġed":935,"cribe":936,"ts":937,"ork":938,"ĠThey":939,"Ġpur":940,"ivity":941,"Ġwords":942,"Ġsignific":943,"Ġwere":944,"ĠHow":945,"Ġprom":946,"Ġexperience":947,"ĠK":948,"up":949,"Ġcount":950,"ered":951,"Des":952,"Ġfam":953,"```":954,"akes":955,"Ġgl":956,"ĠHe":957,"Ġfeel":958,"Ġback":959,"Ġfi":960,"Ġproble":961,"ization":962,"ling":963,"Ġcommunic":964,"ploy":965,"Ġaut":966,"Ġfriend":967,"Ġhuman":968,"Ġspe":969,"ew":970,"Ġpersonal":971,"Ġtop":972,"Ġent":973,"other":974,"Ġchang":975,"Ġcor":976,"Ġchange":977,"Ġdecis":978,"ability":979,"hing":980,"atural":981,"ever":982,"Ġcost":983,"Ġgood":984,"ause":985,"Ġident":986,"Ġsoft":987,"ined":988,"Ġpass":989,"'t":990,"atures":991,"Ġben":992,"Ġcompany":993,"Ġstart":994,"Ġsignificant":995,"Ġsumm":996,"ond":997,"old":998,"bers":999,"sel":1000,"?\\":1001,"Ġcur":1002,"Ġlight":1003,"Ġcommon":1004,".\\\"":1005,"Ġcustomers":1006,"iving":1007,"conom":1008,"Ġfunction":1009,"Ġve":1010,"Ġthree":1011,"Ġeven":1012,"ining":1013,"Ġgener":1014,"ries":1015,"Ġlevel":1016,"Ġspecific":1017,"Ġwebs":1018,"Ġthen":1019,"Ġeffective":1020,"cur":1021,"ense":1022,"Ġlarge":1023,"Ġdist":1024,"Ġeffic":1025,"Ġsupport":1026,"Ġget":1027,"Create":1028,"read":1029,"port":1030,"Ġinf":1031,"Ġ'":1032,"Ġyear":1033,"Ġstate":1034,"Ġkey":1035,"ccess":1036,":**":1037,"Ġav":1038,"Ġknow":1039,"Ġbenef":1040,"Ġess":1041,"ables":1042,"ren":1043,"Ġown":1044,"ĠThese":1045,"ock":1046,"-t":1047,"Ġide":1048,"omm":1049,"reen":1050,"ced":1051,"cture":1052,"Ġteam":1053,"Ġris":1054,"Ġtasks":1055,"Ġdown":1056,"Ġstru":1057,"Ġcomputer":1058,"-b":1059,"Ġfact":1060,"Ġmem":1061,"etter":1062,"\\nS":1063,"Ġaround":1064,"Ġword":1065,"Ġbased":1066,"Ġbeh":1067,"Ġright":1068,"Ġdel":1069,"Ġpoint":1070,"Ġnatural":1071,"ss":1072,"Ġeconom":1073,"Ġmade":1074,"Ġins":1075,"Ġinst":1076,"Ġmat":1077,"Ġvalue":1078,"Ġanim":1079,"Ġsever":1080,"\\nT":1081,"ational":1082,"ital":1083,"ze":1084,"ote":1085,"ills":1086,"tern":1087,"Ġread":1088,"Ġcontent":1089,"Ġonline":1090,"Ġend":1091,"ĠUn":1092,"vent":1093,"Ġsee":1094,"ending":1095,"Ġmon":1096,"Ġdr":1097,"Ġkeep":1098,"Ġsystems":1099,"cul":1100,"ven":1101,"Ġstory":1102,"Ġmedia":1103,"Ġseveral":1104,"hen":1105,"ateg":1106,"Ġcontin":1107,"Ġdev":1108,"Ġlearn":1109,"Ġla":1110,"Ġstre":1111,"Ġpartic":1112,"Ġair":1113,"ually":1114,"Ġsuccess":1115,"ouse":1116,"Ġiss":1117,"ied":1118,"Ġmachine":1119,"Ġopt":1120,"Ġx":1121,"Ġop":1122,"Ġprof":1123,"ocus":1124,"chie":1125,"Ġmeth":1126,"ner":1127,"omp":1128,"ron":1129,"Ġhome":1130,"Ġbetter":1131,"ĠPro":1132,"Ġmult":1133,"omet":1134,"Ġincrease":1135,"Ġanaly":1136,"vert":1137,"Ġrele":1138,"Ġbra":1139,"ink":1140,"Ġtem":1141,"Ġpredi":1142,"Ġtre":1143,"Ġservice":1144,"Ġwebsite":1145,"Ġmanage":1146,"Ġsoftware":1147,"here":1148,"Ġprot":1149,"-s":1150,"Ġquest":1151,"ier":1152,"Ġknown":1153,"Ġorder":1154,"Ġphys":1155,"cept":1156,"Ġachie":1157,"Ġinput":1158,"Ġpossible":1159,"ĠIf":1160,"Ġext":1161,"fter":1162,"Ġelect":1163,"Ġmethod":1164,"Ġbre":1165,"ĠAn":1166,"ways":1167,"ering":1168,"ets":1169,"Ġjust":1170,"Ġstore":1171,"Ġdevelopment":1172,"Ġcare":1173,"Ġobject":1174,"Ġtype":1175,"ĠFor":1176,"Ġfocus":1177,"ggest":1178,"Ġonly":1179,"Ġconsid":1180,"ars":1181,"Ġchall":1182,"Ġdeterm":1183,"Ġsal":1184,"ins":1185,"Ġfeatures":1186,"Ġtru":1187,"ody":1188,"Ġtool":1189,">\\":1190,"Ġensure":1191,"oss":1
192,"ublic":1193,"Ġitem":1194,"Here":1195,"ination":1196,"Ġdef":1197,"Describe":1198,"ional":1199,"roup":1200,"Ġconf":1201,"Ġneeds":1202,"Ġcharacter":1203,"Ġvarious":1204,"Ġlet":1205,"Ġapplic":1206,"aut":1207,"Ġjob":1208,"ellig":1209,"ĠCon":1210,"Ġbest":1211,"Ġfore":1212,"Ġamount":1213,"rop":1214,"Ġbuild":1215,"ique":1216,"aging":1217,"Ġemploy":1218,"Ġrest":1219,"air":1220,"What":1221,"Ġtoget":1222,"Ġways":1223,"Ġidentify":1224,"Ġtogether":1225,"Ġreal":1226,"Ġusers":1227,"Ġmean":1228,"asing":1229,"ĠAm":1230,"Ġeduc":1231,"Ġalgorithm":1232,"Ġnetw":1233,"Ġcode":1234,"Write":1235,"ov":1236,"-d":1237,"oura":1238,"ĠHowever":1239,"uture":1240,"view":1241,"Ġindu":1242,"Ġproducts":1243,"ected":1244,"ertain":1245,";\\":1246,"ĠAs":1247,"pr":1248,"aste":1249,"Ġoper":1250,"Ġ$":1251,"avi":1252,"self":1253,"Ġ<":1254,"Ġindust":1255,"Ġgu":1256,"Ġothers":1257,"Ex":1258,"ian":1259,"Ġ\"\\\"":1260,"-f":1261,"nces":1262,"Ġfil":1263,"Ġrespons":1264,"rol":1265,"Ġcap":1266,"Ġbefore":1267,"vern":1268,"Ġcomplex":1269,"lus":1270,"ribut":1271,"ats":1272,"Ġpositive":1273,"oh":1274,"Ġlo":1275,"Ġgroup":1276,"Ġfound":1277,"ee":1278,"ogn":1279,"Ġsw":1280,"Ġindividuals":1281,"Ġpract":1282,"Ġenc":1283,"Ġshare":1284,"raph":1285,"Ġrange":1286,"Ġsun":1287,"\\t":1288,"Ġproviding":1289,"icle":1290,"Ġdem":1291,"Ġplace":1292,"Ġaud":1293,"joy":1294,"Ġmust":1295,"els":1296,"ery":1297,"One":1298,"Ġfamily":1299,"Ġfuture":1300,"less":1301,"rent":1302,"Ġproblem":1303,"Ġessential":1304,"rodu":1305,"ired":1306,"Ġreducing":1307,"ism":1308,"Ġwarm":1309,"ray":1310,"Ġability":1311,"Ġstrong":1312,"Ġalways":1313,"Ġresources":1314,"Ġbenefits":1315,"Ġstrateg":1316,"Ġinvolves":1317,"Ġassist":1318,"erest":1319,"nA":1320,"ression":1321,"Ġ[":1322,"ilities":1323,"Ġsteps":1324,"verall":1325,"Ġshow":1326,"obal":1327,"\\nF":1328,"Ġland":1329,"ĠHere":1330,"Ġbusinesses":1331,"ĠEn":1332,"pportun":1333,"Ġmeas":1334,"Ġreturn":1335,"Ġdig":1336,"Ġhist":1337,"yth":1338,"Ġcent":1339,"Ġable":1340,"Ġwithout":1341,"yc":1342,"plain":1343,"Ġrelations":1344,"Ġservices":1345,"-c":1346,"Ġtest":1347,"arth":1348,"Ġcommunication":1349,"Ġintern":1350,"new":1351,"Ġsit":1352,"Ġinvest":1353,"Ġcaus":1354,"Ġunt":1355,"Ġfriends":1356,"Ġchanges":1357,"cri":1358,"dit":1359,"ĠBy":1360,"ĠYou":1361,"Ġmeans":1362,"Ġrese":1363,"ool":1364,"ted":1365,"elligence":1366,"ains":1367,"pping":1368,"Ġbel":1369,"Ġrepresent":1370,"Ġhapp":1371,"Ġser":1372,"Ġperformance":1373,"Ġopportun":1374,"Ġtemper":1375,"ĠShe":1376,"Ġfu":1377,"ix":1378,"bot":1379,"Ġwrit":1380,"Ġbehavi":1381,"Ġproject":1382,"ĠWith":1383,"ivers":1384,"day":1385,"Ġphysical":1386,"izing":1387,"Ġactiv":1388,"Ġwithin":1389,"Ġinterest":1390,"olution":1391,"wards":1392,"ffic":1393,"Ġquick":1394,"Ġpublic":1395,"Ġgrowth":1396,"Ġcho":1397,"Ġrelationship":1398,"Ġuntil":1399,"Ġhelps":1400,"Ġstudents":1401,"Ġfiel":1402,"imes":1403,"ulation":1404,"ibility":1405,"elf":1406,"Ġful":1407,"Ġsub":1408,"ank":1409,"ides":1410,"Ġskills":1411,"Ġclimate":1412,"Given":1413,"Ġpar":1414,"Ġclear":1415,"irt":1416,"Name":1417,"Ġpresent":1418,"Ġtri":1419,"Ġchalleng":1420,"ream":1421,"Ġlay":1422,"Ġmarketing":1423,"Ġsummary":1424,"Ġchild":1425,"Ġsaf":1426,"Ġsure":1427,"Ġsame":1428,"Ġmu":1429,"Ġemail":1430,"bon":1431,"Ġsomet":1432,"```\\":1433,"Ġcurrent":1434,"amp":1435,"ences":1436,"ĠRe":1437,"Ġtransport":1438,"me":1439,"-p":1440,"action":1441,"ĠEx":1442,"Ġyears":1443,"Ġcomb":1444,"hor":1445,"anced":1446,"ty":1447,"Ġlove":1448,"Ġgreen":1449,"Ġpopular":1450,"Ġless":1451,"Ġdra":1452,"Ġcontrol":1453,"Ġaff":1454,"Ġconsum":1455,"Ġgame":1456,"ental":1457,"ights":1458,"arg
et":1459,"omes":1460,"ox":1461,"icult":1462,"erc":1463,"Ġgoals":1464,"ancial":1465,"tle":1466,"Ġgovern":1467,"Ġnumbers":1468,"Ġfive":1469,"Ġstand":1470,"Ġsearch":1471,"Ġefficient":1472,"Ġwal":1473,"Ġname":1474,"ath":1475,"Ġheart":1476,"Ġduring":1477,"rect":1478,"Ġoverall":1479,"ython":1480,"Ġallows":1481,"Ġcity":1482,"ave":1483,"vant":1484,"aterial":1485,"Ġwide":1486,"Ġmus":1487,"ificial":1488,"Ġhard":1489,"ĠTh":1490,"oose":1491,"Ġglobal":1492,"aj":1493,"Ġter":1494,"Ġdifficult":1495,"Ġline":1496,"ĠAl":1497,"care":1498,"ived":1499,"Ġregular":1500,"Ġgr":1501,"),":1502,"lement":1503,"Ġhim":1504,"Ġunique":1505,"Ġenjoy":1506,"Ġmeaning":1507,"Ġopen":1508,"Ġi":1509,"abor":1510,"Ġarea":1511,"Ġitems":1512,"Ġclean":1513,"ditionally":1514,"oid":1515,"ĠWe":1516,"Ġbeaut":1517,"Ġmeet":1518,"iple":1519,"Ġstatement":1520,"Ġagain":1521,"ysis":1522,"Ġfac":1523,"Ġsources":1524,"Ġbody":1525,"Ġalgorithms":1526,"Ġaudience":1527,"Ġwant":1528,"Ġlog":1529,"Ġmaintain":1530,"Ġactivities":1531,"Ġmove":1532,"Ġcult":1533,"oney":1534,"Ġtarget":1535,"\\nB":1536,"Ġmaterial":1537,"Ġcreating":1538,"Ġstructure":1539,"atform":1540,"ext":1541,"Ġexperien":1542,"Ġvalues":1543,"ead":1544,"ohn":1545,"Ġhealthy":1546,"ross":1547,"Ġinteg":1548,"Ġresearch":1549,"atch":1550,"ooking":1551,"Ġrole":1552,"Ġprovides":1553,"iety":1554,"ists":1555,"Ġfinancial":1556,"ories":1557,"dent":1558,"Ġer":1559,"Ġarticle":1560,"Ġelements":1561,"Ġaddress":1562,"Ġconn":1563,"ĠUse":1564,"mp":1565,"Ġeasy":1566,"Ġneg":1567,"Ġcolor":1568,"Ġcalcul":1569,"Explain":1570,"ĠPl":1571,"pect":1572,"ince":1573,"ale":1574,"Ġrisk":1575,"curity":1576,"ert":1577,"Ġfeed":1578,"Ġevent":1579,"vers":1580,"ples":1581,"Ġlevels":1582,"Ġbi":1583,"Ġstay":1584,"Ġplatform":1585,"Ġbreak":1586,"back":1587,"Ġsat":1588,"\\nOverall":1589,"Ġeducation":1590,"\\nC":1591,"Ġcarbon":1592,"--------":1593,"ape":1594,"Ġprevent":1595,"Ġaddition":1596,"Ġstress":1597,"ral":1598,"ource":1599,"rus":1600,"Ġcome":1601,"Ġrecogn":1602,"ĠUnited":1603,"Ġproper":1604,"Ġpoll":1605,"dentify":1606,"Ġunderstanding":1607,"Ġdecisions":1608,"ict":1609,"Ġdire":1610,"Ġbehavior":1611,"Ġ*":1612,"\\nI":1613,"Ġmess":1614,"Ġanimals":1615,"Ġsl":1616,"Ġwind":1617,"Ġbas":1618,"Ġpain":1619,"Ġleading":1620,"ern":1621,"ger":1622,"Ġpres":1623,"Ġthough":1624,"Ġinteract":1625,"yle":1626,"Ġdoes":1627,"Ġhead":1628,"Ġintelligence":1629,"orts":1630,"Ġbecome":1631,"Ġrun":1632,"aring":1633,"Ġimplement":1634,"Ġaction":1635,"oot":1636,"terns":1637,"Ġprotect":1638,"eric":1639,"Ġflow":1640,"Ġemot":1641,"cessary":1642,"urate":1643,"Ġsuggest":1644,"Ġprogram":1645,"Ġphr":1646,"Ġhealthcare":1647,"ention":1648,"Ġsust":1649,"Ġwhy":1650,"Ġaccurate":1651,"lu":1652,"Ġhig":1653,"Ġreach":1654,"Ġallowing":1655,"Ġtravel":1656,"Ġrequire":1657,"Ġareas":1658,"Ġdeep":1659,"He":1660,"Ġfew":1661,"Ġself":1662,"oun":1663,"Ġ#":1664,"osp":1665,"str":1666,"Ġminut":1667,"Ġdecision":1668,"ĠThere":1669,"ances":1670,"Ġquality":1671,"Ġavail":1672,"Ġspace":1673,"Ġsomething":1674,"Ġweb":1675,"Ġpatterns":1676,"Ġmot":1677,"oring":1678,"isf":1679,"Ġanother":1680,"Ġaccount":1681,"\\nW":1682,"uss":1683,"Ġmaj":1684,"uation":1685,"Ġsustain":1686,"Ġautom":1687,"iques":1688,"issions":1689,"verse":1690,"Ġconcept":1691,"Ġsecurity":1692,"Ġthose":1693,"Ġprofess":1694,"Ġshort":1695,"Ġnight":1696,"ength":1697,"apt":1698,"ex":1699,"ĠAdditionally":1700,"Ġtaking":1701,"Ġtoo":1702,"agn":1703,"Ġsimple":1704,"lusion":1705,"iency":1706,"ash":1707,"ours":1708,"Ġpa":1709,"Ġlit":1710,"ĠSp":1711,"iting":1712,"Ġdon":1713,"Ġlim":1714,"lish":1715,"mat":1716,"aves":1717,"ledge":1718,"ditional":1719,"i
nc":1720,"Ġevents":1721,"Ġoffer":1722,"thing":1723,"Ġworking":1724,"Ġanalysis":1725,"Ġachieve":1726,"Ġpie":1727,"Ġbook":1728,"Ġfre":1729,"Ġmuch":1730,"oon":1731,"Ġtry":1732,"esp":1733,"Ġwaste":1734,"face":1735,"Ġear":1736,"Ġfru":1737,"Ġtransportation":1738,"chool":1739,"Ġtechniques":1740,"Ġprogramm":1741,"ĠEarth":1742,"Ġpredict":1743,"Ġnever":1744,"ws":1745,"ument":1746,"imately":1747,"ared":1748,"Ġparticular":1749,"Ġtowards":1750,"Ġeconomic":1751,"Ġincreasing":1752,"Ġfast":1753,"iment":1754,"Ġnetwork":1755,"Ġcorrect":1756,"Ġmight":1757,"Ġoc":1758,"Ġbecause":1759,"ĠWh":1760,"az":1761,"play":1762,"Ġresults":1763,"Ġmanagement":1764,"Ġpurch":1765,"Ġsound":1766,"Ġpast":1767,"Ġtraining":1768,"____":1769,"ope":1770,"Ġengage":1771,"ourage":1772,"Ġsense":1773,"Ġfree":1774,"Ġpref":1775,"ees":1776,"Ġcountries":1777,"ney":1778,"anies":1779,"Ġafter":1780,"Ġmind":1781,"Ġexc":1782,"ĠOnce":1783,"ĠĠĠĠĠĠĠĠĠĠĠ":1784,"Ġcomplete":1785,"Ġimm":1786,"Ġest":1787,"Ġgenerate":1788,"verb":1789,"ĠDe":1790,"'m":1791,"Ġtools":1792,"redients":1793,"Ġmajor":1794,"ently":1795,"Ġcontribut":1796,"leep":1797,"Ġpoints":1798,"ditions":1799,"Ġfactors":1800,"Ġel":1801,"Ġnext":1802,"ium":1803,"oud":1804,"Ġcru":1805,"Ġreas":1806,"riate":1807,"ĠInd":1808,"Ġpromot":1809,"Ġhistory":1810,"Ġjour":1811,"Ġdue":1812,"Con":1813,"Ġveget":1814,"ency":1815,"ĠAmeric":1816,"Ġfra":1817,"Ġdifference":1818,"oard":1819,"lex":1820,"Ġequation":1821,"irtual":1822,"Ġcup":1823,"Ġforest":1824,"Ġnegative":1825,"Ġsecon":1826,"ones":1827,"Ġnature":1828,"Ġuses":1829,"ah":1830,"por":1831,"Ġsec":1832,"ording":1833,"Ġlast":1834,"ĠSome":1835,"Ġissues":1836,"Ġscient":1837,"Ġprint":1838,"ĠStates":1839,"over":1840,"Ġsatisf":1841,"Ġdevices":1842,"Ġdise":1843,"Ġtemperature":1844,"Ġfeedback":1845,"Ġnecessary":1846,"Ġemissions":1847,"mb":1848,"Ġlow":1849,"for":1850,"tal":1851,"Ġchallenges":1852,"Ġarray":1853,"Ġside":1854,"Ġengine":1855,"Ġboo":1856,"ata":1857,"Ġbelie":1858,"-m":1859,"Ġmultiple":1860,"Ġsing":1861,"Ġgovernment":1862,"ames":1863,"ified":1864,"Ġminutes":1865,"Ġsuccessful":1866,"Ġmoney":1867,"Ġquickly":1868,"Ġbir":1869,"Ġtypically":1870,"Ġpost":1871,"Ġprep":1872,"Ġknowledge":1873,"pped":1874,"actions":1875,"Ġmethods":1876,"Ġoptim":1877,"\\nP":1878,"Ġoutput":1879,"Ġfield":1880,"Ġtable":1881,"Ġbal":1882,"Ġcoll":1883,"Ġcharacters":1884,"volution":1885,"ords":1886,"ilar":1887,"ification":1888,"ane":1889,"Ġcell":1890,"Ġmil":1891,"ĠWhat":1892,"Ġsqu":1893,"Ġlives":1894,"ĠAr":1895,"Ġphrase":1896,"Ġnut":1897,"Ġdigital":1898,"Ġinternet":1899,"lass":1900,"ura":1901,"ommend":1902,"Ġtreat":1903,"Ġapprop":1904,"resh":1905,"urther":1906,"ĠOne":1907,"Ġvisual":1908,"ategor":1909,"Ġapproach":1910,"Ġcertain":1911,"Ġsho":1912,"val":1913,"Ġtask":1914,"ires":1915,"Ġappropriate":1916,"Ġvie":1917,"Ġdesigned":1918,"pose":1919,"**:":1920,"fort":1921,"Ġ|\\":1922,"Ġapplications":1923,"Ġpay":1924,"Ġnow":1925,"Ġheat":1926,"Ġindustry":1927,"pre":1928,"Ġeffectively":1929,"Ġpopulation":1930,"Ġopportunities":1931," \\","Ġens ure","os s","ub lic","Ġit em","H ere","in ation","Ġde f","Des cribe","ion al","rou p","Ġcon f","Ġneed s","Ġcharact er","Ġvari ous","Ġle t","Ġapp lic","a ut","Ġj ob","ell ig","ĠC on","Ġb est","Ġf ore","Ġam ount","ro p","Ġbu ild","iqu e","ag ing","Ġem ploy","Ġre st","a ir","W hat","Ġto get","Ġway s","Ġident ify","Ġtoget her","Ġre al","Ġus ers","Ġme an","as ing","ĠA m","Ġed uc","Ġalgorith m","Ġn etw","Ġc ode","W rite","o v","- d","ou ra","ĠHow ever","ut ure","vie w","Ġin du","Ġproduct s","ect ed","er tain","; \\","ĠA s","p r","ast e","Ġo per","Ġ $","av i","sel f","Ġ 
<","Ġindu st","Ġg u","Ġother s","E x","i an","Ġ\" \\\"","- f","n ces","Ġf il","Ġresp ons","ro l","Ġc ap","Ġbe fore","ver n","Ġcomple x","l us","rib ut","at s","Ġpos itive","o h","Ġl o","Ġg roup","Ġf ound","e e","og n","Ġs w","Ġindividual s","Ġp ract","Ġen c","Ġsh are","ra ph","Ġr ange","Ġsu n","\\ t","Ġprovid ing","ic le","Ġde m","Ġpl ace","Ġa ud","j oy","Ġm ust","el s","er y","O ne","Ġfam ily","Ġf uture","l ess","re nt","Ġproble m","Ġess ential","ro du","i red","Ġredu cing","is m","Ġw arm","ra y","Ġab ility","Ġstr ong","Ġal ways","Ġres ources","Ġbenef its","Ġstr ateg","Ġinvol ves","Ġass ist","ere st","n A","ress ion","Ġ [","il ities","Ġstep s","ver all","Ġsh ow","ob al","\\n F","Ġl and","ĠH ere","Ġbusiness es","ĠE n","pport un","Ġme as","Ġret urn","Ġd ig","Ġh ist","y th","Ġc ent","Ġab le","Ġwith out","y c","pl ain","Ġrel ations","Ġserv ices","- c","Ġt est","ar th","Ġcommunic ation","Ġinter n","ne w","Ġs it","Ġinv est","Ġca us","Ġu nt","Ġfriend s","Ġchang es","c ri","d it","ĠB y","ĠY ou","Ġme ans","Ġre se","o ol","t ed","ellig ence","ain s","pp ing","Ġbe l","Ġrep resent","Ġha pp","Ġs er","Ġperform ance","Ġo pportun","Ġtem per","ĠS he","Ġf u","i x","b ot","Ġw rit","Ġbeh avi","Ġpro ject","ĠW ith","iv ers","d ay","Ġphys ical","iz ing","Ġact iv","Ġwith in","Ġint erest","ol ution","ward s","ff ic","Ġqu ick","Ġp ublic","Ġgrow th","Ġch o","Ġrelations hip","Ġunt il","Ġhelp s","Ġstud ents","Ġfi el","im es","ul ation","ib ility","el f","Ġf ul","Ġsu b","an k","id es","Ġsk ills","Ġcl imate","G iven","Ġp ar","Ġcle ar","ir t","N ame","Ġp resent","Ġt ri","Ġchall eng","re am","Ġl ay","Ġmarket ing","Ġsumm ary","Ġch ild","Ġsa f","Ġsu re","Ġs ame","Ġm u","Ġem ail","b on","Ġs omet","``` \\","Ġcur rent","am p","en ces","ĠR e","Ġtrans port","m e","- p","a ction","ĠE x","Ġyear s","Ġcom b","h or","anc ed","t y","Ġl ove","Ġg reen","Ġpop ular","Ġl ess","Ġd ra","Ġcont rol","Ġa ff","Ġcons um","Ġg ame","ent al","ight s","ar get","om es","o x","ic ult","er c","Ġgo als","anc ial","t le","Ġgo vern","Ġnum bers","Ġf ive","Ġst and","Ġse arch","Ġeffic ient","Ġw al","Ġn ame","at h","Ġhe art","Ġd uring","re ct","Ġover all","yth on","Ġallow s","Ġc ity","a ve","v ant","ater ial","Ġw ide","Ġm us","ific ial","Ġh ard","ĠT h","oo se","Ġgl obal","a j","Ġt er","Ġdiff icult","Ġl ine","ĠA l","c are","iv ed","Ġreg ular","Ġg r",") ,","le ment","Ġh im","Ġun ique","Ġen joy","Ġmean ing","Ġop en","Ġ i","ab or","Ġare a","Ġitem s","Ġcle an","dition ally","o id","ĠW e","Ġbe aut","Ġme et","ip le","Ġstate ment","Ġag ain","ys is","Ġf ac","Ġs ources","Ġb ody","Ġalgorith ms","Ġaud ience","Ġw ant","Ġl og","Ġmain tain","Ġactiv ities","Ġmo ve","Ġc ult","one y","Ġt arget","\\n B","Ġm aterial","Ġcreat ing","Ġstru cture","at form","e xt","Ġexper ien","Ġval ues","e ad","oh n","Ġhealth y","ro ss","Ġint eg","Ġrese arch","at ch","oo king","Ġro le","Ġprovid es","i ety","ist s","Ġfin ancial","or ies","d ent","Ġ er","Ġart icle","Ġele ments","Ġadd ress","Ġcon n","ĠU se","m p","Ġeas y","Ġne g","Ġcol or","Ġcal cul","Ex plain","ĠP l","p ect","in ce","al e","Ġris k","cur ity","er t","Ġfe ed","Ġev ent","v ers","pl es","Ġlevel s","Ġb i","Ġst ay","Ġpl atform","Ġbre ak","b ack","Ġs at","\\nO verall","Ġeduc ation","\\n C","Ġcar bon","---- ----","ap e","Ġpre vent","Ġadd ition","Ġst ress","r al","our ce","ru s","Ġcom e","Ġrec ogn","ĠUn ited","Ġpro per","Ġpol l","dent ify","Ġunderstand ing","Ġdecis ions","i ct","Ġd ire","Ġbehavi or","Ġ *","\\n I","Ġm ess","Ġanim als","Ġs l","Ġw ind","Ġb as","Ġp ain","Ġlead ing","er n","g er","Ġp res","Ġth ough","Ġinter act","y le","Ġdo 
es","Ġhe ad","Ġint elligence","ort s","Ġbec ome","Ġru n","ar ing","Ġimp lement","Ġa ction","o ot","ter ns","Ġprot ect","er ic","Ġf low","Ġem ot","cess ary","ur ate","Ġsu ggest","Ġprogra m","Ġph r","Ġhealth care","ent ion","Ġsu st","Ġwh y","Ġacc urate","l u","Ġh ig","Ġre ach","Ġallow ing","Ġtra vel","Ġrequ ire","Ġare as","Ġde ep","H e","Ġfe w","Ġs elf","ou n","Ġ #","os p","st r","Ġmin ut","Ġdecis ion","ĠThe re","an ces","Ġqu ality","Ġav ail","Ġsp ace","Ġsomet hing","Ġwe b","Ġpat terns","Ġm ot","or ing","is f","Ġan other","Ġacc ount","\\n W","us s","Ġm aj","u ation","Ġsust ain","Ġaut om","iqu es","iss ions","ver se","Ġcon cept","Ġse curity","Ġth ose","Ġprof ess","Ġsh ort","Ġn ight","eng th","a pt","e x","ĠAd ditionally","Ġt aking","Ġto o","ag n","Ġsim ple","lus ion","ien cy","as h","our s","Ġp a","Ġl it","ĠS p","it ing","Ġd on","Ġl im","l ish","m at","av es","led ge","dition al","in c","Ġev ents","Ġoff er","th ing","Ġwor king","Ġanal ysis","Ġachie ve","Ġp ie","Ġb ook","Ġf re","Ġmu ch","o on","Ġt ry","es p","Ġw aste","f ace","Ġe ar","Ġf ru","Ġtransport ation","ch ool","Ġtechn iques","Ġprogra mm","ĠE arth","Ġpredi ct","Ġne ver","w s","u ment","imate ly","are d","Ġpartic ular","Ġto wards","Ġeconom ic","Ġincre asing","Ġf ast","im ent","Ġnetw ork","Ġcor rect","Ġm ight","Ġo c","Ġbec ause","ĠW h","a z","pl ay","Ġresult s","Ġmanage ment","Ġpur ch","Ġs ound","Ġp ast","Ġtra ining","__ __","op e","Ġeng age","oura ge","Ġs ense","Ġf ree","Ġpre f","e es","Ġcount ries","ne y","an ies","Ġa fter","Ġm ind","Ġex c","ĠO nce","ĠĠĠĠ ĠĠĠĠĠĠĠ","Ġcomple te","Ġim m","Ġ est","Ġg enerate","ver b","ĠD e","' m","Ġtool s","redi ents","Ġmaj or","ent ly","Ġcont ribut","le ep","Ġpoint s","dit ions","Ġfact ors","Ġe l","Ġne xt","i um","ou d","Ġc ru","Ġre as","ri ate","ĠI nd","Ġprom ot","Ġhist ory","Ġj our","Ġd ue","C on","Ġve get","en cy","ĠAm eric","Ġf ra","Ġdiffere nce","o ard","le x","Ġequ ation","irt ual","Ġc up","Ġfore st","Ġneg ative","Ġse con","on es","Ġn ature","Ġus es","a h","p or","Ġse c","ord ing","Ġl ast","ĠS ome","Ġiss ues","Ġsc ient","Ġpr int","ĠSt ates","o ver","Ġsat isf","Ġdev ices","Ġdis e","Ġtemper ature","Ġfeed back","Ġne cessary","Ġem issions","m b","Ġl ow","f or","t al","Ġchalleng es","Ġar ray","Ġs ide","Ġeng ine","Ġb oo","at a","Ġbel ie","- m","Ġmult iple","Ġs ing","Ġgovern ment","am es","if ied","Ġminut es","Ġsuccess ful","Ġm oney","Ġquick ly","Ġb ir","Ġtyp ically","Ġp ost","Ġpre p","Ġknow ledge","pp ed","a ctions","Ġmethod s","Ġopt im","\\n P","Ġout put","Ġfiel d","Ġt able","Ġb al","Ġcol l","Ġcharact ers","v olution","or ds","il ar","ific ation","an e","Ġc ell","Ġm il","ĠW hat","Ġs qu","Ġl ives","ĠA r","Ġphr ase","Ġn ut","Ġdig ital","Ġintern et","l ass","u ra","omm end","Ġt reat","Ġappro p","res h","ur ther","ĠO ne","Ġvis ual","ate gor","Ġappro ach","Ġc ertain","Ġsh o","v al","Ġtas k","i res","Ġapprop riate","Ġv ie","Ġdesign ed","p ose","** :","f ort","Ġ| \\","Ġapplic ations","Ġp ay","Ġn ow","Ġhe at","Ġindust ry","p re","Ġeffective ly","Ġpop ulation","Ġopportun ities","< /","ĠT o","Ġup d","Ġinclud es","ĠE ng","Ġtyp es","Ġup on","Ġconsid er","le t","Ġg en","og raph","pl ace","Ġt imes","Ġar g","C omp","ĠG o","Ġre ce","Ġchild ren","Ġtra ck","Ġsome one","w ord","Ġyou ng","Ġcon ditions","Ġtra ditional","Ġmodel s","I dentify","Ġc amp","Ġm akes","ist ic","Ġar r","Ġc ard","ut ions","l t","Ġo ld","Ġide as","Ġe y","Ġt ree","Ġiss ue","Ġh arm","Ġavail able","Ġc r","Ġpower ful","n ov","Ġmo vie","Ġwe ather","Ġsk y","Ġquest ions","e et","Ġact ivity","Ġbra nd","is hed","Ġanaly ze","ĠS h","Ġen h","av or","Ġbe g","Ġs 
chool","i ate","Ġeas ier","Ġinf lu","Ġn on","Ġstud y","Ġl ook","Ġsol ution","Ġle g","Ġcon st","H ow","Ġcomp et"]}} diff --git a/tests/assets/tiny_bpe_vocab.json b/tests/assets/tiny_bpe_vocab.json index 4ddab5d667..a29ae94538 100644 --- a/tests/assets/tiny_bpe_vocab.json +++ b/tests/assets/tiny_bpe_vocab.json @@ -1,2002 +1 @@ -{ - "!": 0, - "\"": 1, - "#": 2, - "$": 3, - "%": 4, - "&": 5, - "'": 6, - "(": 7, - ")": 8, - "*": 9, - "+": 10, - ",": 11, - "-": 12, - ".": 13, - "/": 14, - "0": 15, - "1": 16, - "2": 17, - "3": 18, - "4": 19, - "5": 20, - "6": 21, - "7": 22, - "8": 23, - "9": 24, - ":": 25, - ";": 26, - "<": 27, - "=": 28, - ">": 29, - "?": 30, - "@": 31, - "A": 32, - "B": 33, - "C": 34, - "D": 35, - "E": 36, - "F": 37, - "G": 38, - "H": 39, - "I": 40, - "J": 41, - "K": 42, - "L": 43, - "M": 44, - "N": 45, - "O": 46, - "P": 47, - "Q": 48, - "R": 49, - "S": 50, - "T": 51, - "U": 52, - "V": 53, - "W": 54, - "X": 55, - "Y": 56, - "Z": 57, - "[": 58, - "\\": 59, - "]": 60, - "^": 61, - "_": 62, - "`": 63, - "a": 64, - "b": 65, - "c": 66, - "d": 67, - "e": 68, - "f": 69, - "g": 70, - "h": 71, - "i": 72, - "j": 73, - "k": 74, - "l": 75, - "m": 76, - "n": 77, - "o": 78, - "p": 79, - "q": 80, - "r": 81, - "s": 82, - "t": 83, - "u": 84, - "v": 85, - "w": 86, - "x": 87, - "y": 88, - "z": 89, - "{": 90, - "|": 91, - "}": 92, - "~": 93, - "Ċ": 94, - "Ġ": 95, - "ĠĠ": 96, - "Ġt": 97, - "Ġa": 98, - "in": 99, - "he": 100, - "re": 101, - "on": 102, - "Ġthe": 103, - "Ġs": 104, - "er": 105, - "at": 106, - "Ġc": 107, - "ĠĠĠĠ": 108, - "en": 109, - "Ġo": 110, - "Ġ\"": 111, - "nd": 112, - "es": 113, - "ing": 114, - "ĠĠĠ": 115, - "it": 116, - "Ġp": 117, - "or": 118, - "ou": 119, - "Ġand": 120, - "Ġw": 121, - "is": 122, - "Ġf": 123, - "an": 124, - "ion": 125, - "al": 126, - "Ġb": 127, - "Ġto": 128, - "Ġm": 129, - "Ġin": 130, - "Ġof": 131, - "le": 132, - "ct": 133, - "ar": 134, - "ut": 135, - "Ġd": 136, - "st": 137, - "ed": 138, - "ĠĠĠĠĠĠĠ": 139, - "ic": 140, - "\":": 141, - ",Ċ": 142, - "ro": 143, - "ent": 144, - "\\n": 145, - "Ġe": 146, - "put": 147, - "om": 148, - "Ġre": 149, - "as": 150, - "ve": 151, - "Ġh": 152, - "Ġth": 153, - "\",Ċ": 154, - "Ġl": 155, - "Ġis": 156, - "et": 157, - "ce": 158, - "Ġn": 159, - ".\\": 160, - "im": 161, - "il": 162, - "Ġg": 163, - "Ġu": 164, - "ction": 165, - "ru": 166, - "ation": 167, - "ol": 168, - "ch": 169, - "ĠT": 170, - "Ġfor": 171, - "out": 172, - "ra": 173, - "ow": 174, - "id": 175, - "ly": 176, - "Ġst": 177, - "Ġbe": 178, - "Ġy": 179, - "Ġpro": 180, - "ig": 181, - "se": 182, - "ate": 183, - "Ġthat": 184, - "ith": 185, - "ir": 186, - "ur": 187, - "ot": 188, - "Ġor": 189, - "Ġon": 190, - "Ġyou": 191, - "ers": 192, - "stru": 193, - "Ġan": 194, - "if": 195, - "ul": 196, - "struction": 197, - "Ġ{": 198, - "Ġ}": 199, - "Ġcan": 200, - "input": 201, - "output": 202, - "instruction": 203, - "Ġ{Ċ": 204, - "Ġ},Ċ": 205, - "\"Ċ": 206, - "Ġhe": 207, - "Ġcon": 208, - "Ġit": 209, - "ay": 210, - "ess": 211, - "Ġwith": 212, - "ver": 213, - "el": 214, - "Ġas": 215, - "am": 216, - "ĠA": 217, - "ge": 218, - "Ġsu": 219, - "iv": 220, - ".\",Ċ": 221, - "Ġcom": 222, - "ĠI": 223, - "ment": 224, - "ak": 225, - "Ġal": 226, - "\\\"": 227, - ".\"Ċ": 228, - "ive": 229, - "Ġare": 230, - "ab": 231, - "ad": 232, - "Ġmo": 233, - "Ġex": 234, - "Ġv": 235, - "ĠS": 236, - "res": 237, - "pp": 238, - "qu": 239, - "Ġde": 240, - "Ġwh": 241, - "ity": 242, - "Ġen": 243, - "ĠThe": 244, - "her": 245, - "ld": 246, - "ri": 247, - "ter": 248, - "ant": 249, - "ĠC": 250, - "ist": 251, - "Ġ\"\",Ċ": 252, - "um": 253, 
- "Ġus": 254, - "Ġne": 255, - "ain": 256, - "th": 257, - "ect": 258, - "Ġle": 259, - "op": 260, - "em": 261, - "ies": 262, - "Ġch": 263, - "Ġim": 264, - "du": 265, - "od": 266, - "ort": 267, - "nt": 268, - "est": 269, - "igh": 270, - "ere": 271, - "Ġha": 272, - "us": 273, - "ure": 274, - "ial": 275, - "oc": 276, - "Ġwor": 277, - "Ġtheir": 278, - "ac": 279, - "ence": 280, - "iz": 281, - "Ġyour": 282, - "os": 283, - "Ġimp": 284, - "ud": 285, - "Ġby": 286, - "Ġse": 287, - "ine": 288, - "ould": 289, - "low": 290, - "ill": 291, - "age": 292, - "rom": 293, - "Ġsp": 294, - "ĠP": 295, - "Ġsh": 296, - "ust": 297, - "The": 298, - "un": 299, - "'s": 300, - "Ġinc": 301, - "ide": 302, - "pl": 303, - "ight": 304, - "og": 305, - "Ġpl": 306, - "pt": 307, - "are": 308, - "Ġte": 309, - "Ġint": 310, - "Ġ\\": 311, - "his": 312, - "Ġr": 313, - "ake": 314, - "per": 315, - "orm": 316, - "ag": 317, - "ff": 318, - "ĠE": 319, - "art": 320, - "Ġk": 321, - "end": 322, - "ĠM": 323, - "Ġwe": 324, - "ĠB": 325, - "Ġad": 326, - "cess": 327, - "rou": 328, - "ical": 329, - "all": 330, - "able": 331, - "Ġfrom": 332, - "and": 333, - "ĠH": 334, - "Ġab": 335, - "act": 336, - "Ġcomp": 337, - "ome": 338, - "ach": 339, - "ĠThis": 340, - "Ġhave": 341, - "form": 342, - "Ġ\\\"": 343, - "ast": 344, - "Ġat": 345, - "ĠW": 346, - "Ġres": 347, - "Ġdat": 348, - ":\\": 349, - "ther": 350, - "ions": 351, - "ore": 352, - "Ġ(": 353, - "Ġcont": 354, - "our": 355, - "ep": 356, - "ĠF": 357, - "Ġac": 358, - "ance": 359, - "ĠR": 360, - "gh": 361, - "Ġme": 362, - "ces": 363, - "Ġwas": 364, - "ind": 365, - "vel": 366, - "ations": 367, - "Ġhel": 368, - "Ġmore": 369, - "ult": 370, - "ĠD": 371, - "reat": 372, - "ign": 373, - "Ġhelp": 374, - "ime": 375, - "ard": 376, - "Ġcl": 377, - "Ġapp": 378, - "ans": 379, - "ie": 380, - "Ġdata": 381, - "ich": 382, - "ang": 383, - "ous": 384, - "ell": 385, - "ks": 386, - "ase": 387, - "ice": 388, - "ip": 389, - "ite": 390, - "Ġsuch": 391, - "Ġfe": 392, - "Ġwhe": 393, - "ib": 394, - "Ġother": 395, - "Ġthis": 396, - "ass": 397, - "ual": 398, - "ile": 399, - "ne": 400, - "red": 401, - "Ġhas": 402, - "oo": 403, - "ress": 404, - "ific": 405, - "ning": 406, - "Ġ=": 407, - "Ġup": 408, - "Ġman": 409, - "Ġar": 410, - "ong": 411, - "ec": 412, - "Ġtra": 413, - "av": 414, - "Ġwhich": 415, - "Ġgo": 416, - "Ġprov": 417, - "Ġdis": 418, - "**": 419, - "so": 420, - "ĠG": 421, - "one": 422, - "Ġem": 423, - "Ġnot": 424, - "ue": 425, - "ĠO": 426, - "Ġj": 427, - "ace": 428, - "Ġthey": 429, - "ame": 430, - "Ġqu": 431, - "ĠL": 432, - "iff": 433, - "Ġfol": 434, - "ary": 435, - "ated": 436, - "ustom": 437, - "ition": 438, - "Ġits": 439, - "Ġsy": 440, - "ke": 441, - "ack": 442, - "ry": 443, - "--": 444, - "Ġtime": 445, - "Ġdes": 446, - "Ġnew": 447, - "ents": 448, - "ount": 449, - "Ġfollow": 450, - "Ġalso": 451, - "Ġcomm": 452, - "Ġout": 453, - "Ġeff": 454, - "Ġdiff": 455, - "iven": 456, - "ap": 457, - "Ġsent": 458, - "\\u": 459, - "Ġso": 460, - "Ġprodu": 461, - "Ġuse": 462, - "Ġsc": 463, - "Ġ-": 464, - "Ġun": 465, - "lud": 466, - "ĠIt": 467, - "ener": 468, - "king": 469, - "Ġev": 470, - "Ġabout": 471, - "Ġthem": 472, - "ĠU": 473, - "Ġcustom": 474, - "Ġro": 475, - "Ġinclud": 476, - "les": 477, - "etw": 478, - "stem": 479, - "xt": 480, - "Ġinto": 481, - "Ġper": 482, - "ĠIn": 483, - "ĠN": 484, - "Ġwill": 485, - "Ġlear": 486, - "ber": 487, - "Ġall": 488, - "Ġpe": 489, - "ds": 490, - "Ġtw": 491, - "aking": 492, - "ark": 493, - "ful": 494, - "Ġmake": 495, - "chn": 496, - "erv": 497, - "ost": 498, - "rough": 499, - "Ġone": 500, - "Ġinter": 501, - 
"ities": 502, - "ail": 503, - "ike": 504, - "ree": 505, - "ple": 506, - "alth": 507, - "Ġused": 508, - "ors": 509, - "Ġover": 510, - "ility": 511, - "ments": 512, - "ange": 513, - "Ġway": 514, - "ory": 515, - "Ġcol": 516, - "Ġpr": 517, - "Ġcould": 518, - "Ġnum": 519, - "reate": 520, - "int": 521, - "Ġredu": 522, - "erson": 523, - "Ġrec": 524, - "Ġher": 525, - "Ġneed": 526, - "ms": 527, - "ater": 528, - "oy": 529, - "Ġsystem": 530, - "Ġinform": 531, - "Ġtwo": 532, - "Ġtechn": 533, - "Ġsentence": 534, - "ience": 535, - "ize": 536, - "get": 537, - "Ġdiffere": 538, - "ood": 539, - "rib": 540, - "Ġbut": 541, - "Ġfollowing": 542, - "ased": 543, - "olog": 544, - "erg": 545, - "led": 546, - "ures": 547, - "In": 548, - "ear": 549, - "Ġph": 550, - "own": 551, - "Ġpre": 552, - "Ġwould": 553, - "Ġusing": 554, - "Ġcons": 555, - "Ġwork": 556, - "Ġmod": 557, - "ating": 558, - "ia": 559, - "ire": 560, - "Ġpos": 561, - "ient": 562, - "ob": 563, - "ject": 564, - "Ġinv": 565, - "ons": 566, - "Ġdo": 567, - "ular": 568, - "Ġdec": 569, - "Ġhealth": 570, - "Ġimpro": 571, - "Ġany": 572, - "Ġthrough": 573, - "yp": 574, - "row": 575, - "velop": 576, - "Ġprocess": 577, - "Ġtr": 578, - "lic": 579, - "very": 580, - "als": 581, - "ify": 582, - "``": 583, - "ari": 584, - "Ġstr": 585, - "Ġimport": 586, - "Ġlike": 587, - "Ġproduct": 588, - "Ġsome": 589, - "ph": 590, - "ential": 591, - "Ġam": 592, - "ates": 593, - "Ġacc": 594, - "ens": 595, - "ns": 596, - "Ġsm": 597, - "Ġind": 598, - "een": 599, - "Ġexper": 600, - "lect": 601, - "Ġval": 602, - "Ġrel": 603, - "its": 604, - "Ġinformation": 605, - "ings": 606, - "ĠJ": 607, - "ople": 608, - "iness": 609, - "Ġgiven": 610, - "mm": 611, - "ices": 612, - "Ġpart": 613, - "ild": 614, - "ys": 615, - "Ġour": 616, - "nder": 617, - "Ġperson": 618, - "ally": 619, - "Ġke": 620, - "etween": 621, - "ft": 622, - "oth": 623, - "Ġspec": 624, - "Ġbetween": 625, - "ergy": 626, - "ĠAI": 627, - "Ġwho": 628, - "Ġmay": 629, - "ef": 630, - "ative": 631, - "ise": 632, - "Ġlist": 633, - "Ġkn": 634, - "Ġadd": 635, - ",\\": 636, - "ord": 637, - "ics": 638, - "Ġpeople": 639, - "ĠSt": 640, - "Ġhis": 641, - "Ġexp": 642, - "ible": 643, - "Ġthere": 644, - "Ġserv": 645, - "Ġincre": 646, - "Ġdevelop": 647, - "ound": 648, - "ower": 649, - "Ġtrans": 650, - "bs": 651, - "Ġenergy": 652, - "Ġoff": 653, - "Ġbus": 654, - "Ġwhile": 655, - "ose": 656, - "Ġact": 657, - "Ġexam": 658, - "Ġlearning": 659, - "ctions": 660, - "con": 661, - "gor": 662, - "gan": 663, - "ution": 664, - "round": 665, - "pport": 666, - "Ġhow": 667, - "Ġbl": 668, - "Ġmed": 669, - "anc": 670, - "Ġtyp": 671, - "Ġra": 672, - "Ġcar": 673, - "ife": 674, - "Ġworld": 675, - "Ġvari": 676, - "Ġrep": 677, - "au": 678, - "Ġsoc": 679, - "Ġprovid": 680, - "Ġset": 681, - "ten": 682, - "Ġsol": 683, - "Ġeach": 684, - "Ġwhen": 685, - "Ġeffect": 686, - "Ġpo": 687, - "Ġshe": 688, - "ick": 689, - "Ġwhere": 690, - "Ġmodel": 691, - "Ġimportant": 692, - "Ġunder": 693, - "Ġprog": 694, - "enerate": 695, - "ural": 696, - "tain": 697, - "Ġass": 698, - "ology": 699, - "Ġhad": 700, - "ook": 701, - "gg": 702, - "Ġcustomer": 703, - "ting": 704, - "ving": 705, - "Ġresp": 706, - "line": 707, - "Ġcreat": 708, - "ll": 709, - "ily": 710, - "Ġreg": 711, - "Ġdet": 712, - "Ġif": 713, - "Ġ+": 714, - "Ġbusiness": 715, - "\\nIn": 716, - "ish": 717, - "Ġmost": 718, - "ĠĠĠĠĠĠĠĠ": 719, - "hes": 720, - "angu": 721, - "Ġprovide": 722, - "Ġadv": 723, - "erm": 724, - "ub": 725, - "Ġsk": 726, - "irst": 727, - "any": 728, - "Ġday": 729, - "ivid": 730, - "arm": 731, - "ract": 732, - "nce": 733, - 
"Ġ|": 734, - "Ġimprove": 735, - ")\\": 736, - "Ġco": 737, - "Ġcommun": 738, - "arket": 739, - "Ġmet": 740, - "cy": 741, - "Ġdifferent": 742, - "ized": 743, - "Ġart": 744, - "\\nThe": 745, - "rit": 746, - "Ġcomput": 747, - "Ġform": 748, - "ck": 749, - "Ġhum": 750, - "Ġchar": 751, - "ble": 752, - "Ġlead": 753, - "iron": 754, - "Ġrem": 755, - "Ġshould": 756, - "te": 757, - "Ġallow": 758, - "ness": 759, - "hat": 760, - "Ġfun": 761, - "Ġcomple": 762, - "Ġlangu": 763, - "ages": 764, - "Ġbec": 765, - "Ġsign": 766, - "ues": 767, - "ature": 768, - "Ġfind": 769, - "riend": 770, - "Ġstud": 771, - "Ġmain": 772, - "imate": 773, - "ove": 774, - "Ġresult": 775, - "Ġplay": 776, - "Ġreduce": 777, - "Ġeng": 778, - "ware": 779, - "redi": 780, - "Ġnumber": 781, - "Ġlar": 782, - "Ġpol": 783, - "Ġpat": 784, - "Ġwell": 785, - "ident": 786, - "viron": 787, - "rite": 788, - "crib": 789, - "Ġbu": 790, - "Ġhigh": 791, - "Ġthese": 792, - "ives": 793, - "ves": 794, - "Ġdesign": 795, - "urn": 796, - "Ġthan": 797, - "der": 798, - "Ġanal": 799, - "Ġwater": 800, - "Ġmarket": 801, - "Ġexample": 802, - "way": 803, - "stand": 804, - "ng": 805, - "ax": 806, - "itive": 807, - "Ġ`": 808, - "iqu": 809, - "Ġsim": 810, - "Ġequ": 811, - "gorith": 812, - "Ġtext": 813, - "resent": 814, - "Ġmany": 815, - "uring": 816, - "----": 817, - "\\nA": 818, - "Ġdi": 819, - "Ġsa": 820, - "vironment": 821, - "arch": 822, - "Ġatt": 823, - "Ġpot": 824, - "Ġtas": 825, - "Ġcreate": 826, - "ough": 827, - "Ġfl": 828, - "Ġmaking": 829, - "ious": 830, - "Ġgra": 831, - "Ġlife": 832, - "\\nO": 833, - "Ġalgorith": 834, - "ality": 835, - "eng": 836, - "Ġfin": 837, - "uc": 838, - "?\",Ċ": 839, - "ĠY": 840, - "Ġret": 841, - "Ġbeen": 842, - "Ġtechnology": 843, - "Ġprogra": 844, - "Ġhand": 845, - "hip": 846, - "wn": 847, - "Ġcal": 848, - "Ġwhat": 849, - "ividual": 850, - "iss": 851, - "ety": 852, - "Ġlanguage": 853, - "ources": 854, - "Ġclass": 855, - "Ġtake": 856, - "Ġeas": 857, - "ric": 858, - "Ġvis": 859, - "bject": 860, - "Ġref": 861, - "Ġenvironment": 862, - "Ġfirst": 863, - "eg": 864, - "Ġindividual": 865, - "Ġplan": 866, - "Ġperform": 867, - "Ġru": 868, - "ien": 869, - "Ġimpact": 870, - "Ġag": 871, - "ade": 872, - "Ġcle": 873, - "Ġrequ": 874, - "dition": 875, - "__": 876, - "Ġche": 877, - "ption": 878, - "Ġappro": 879, - "Ġ**": 880, - "Ġgreat": 881, - "ved": 882, - "Ġexpl": 883, - "Ġgrow": 884, - "Generate": 885, - "Ġmy": 886, - "Ġincluding": 887, - "Ġaccess": 888, - "Ġpop": 889, - "Ġmin": 890, - "fore": 891, - "Ġsocial": 892, - "ines": 893, - "Ġcharact": 894, - "Ġbr": 895, - "Ġstep": 896, - "Ġunderstand": 897, - "Ġorgan": 898, - "ĠAd": 899, - "Ġdisc": 900, - "Ġpower": 901, - "Ġlong": 902, - "hed": 903, - "Ġconc": 904, - "ward": 905, - "ited": 906, - "Ġele": 907, - "cing": 908, - "Ġevery": 909, - "Ġca": 910, - "Ġoften": 911, - "Ġuser": 912, - "vie": 913, - "ĠV": 914, - "Ġfood": 915, - "Ġinclude": 916, - "Ġloc": 917, - "ases": 918, - "ically": 919, - "ode": 920, - "ants": 921, - "Ġinvol": 922, - "Ġsmall": 923, - "Ġsur": 924, - "achine": 925, - "Ġbeing": 926, - "Ġpotential": 927, - "Ġno": 928, - "ĠCh": 929, - "Ġdep": 930, - "ather": 931, - "Ġboth": 932, - "Ġens": 933, - "Ġposs": 934, - "Ġed": 935, - "cribe": 936, - "ts": 937, - "ork": 938, - "ĠThey": 939, - "Ġpur": 940, - "ivity": 941, - "Ġwords": 942, - "Ġsignific": 943, - "Ġwere": 944, - "ĠHow": 945, - "Ġprom": 946, - "Ġexperience": 947, - "ĠK": 948, - "up": 949, - "Ġcount": 950, - "ered": 951, - "Des": 952, - "Ġfam": 953, - "```": 954, - "akes": 955, - "Ġgl": 956, - "ĠHe": 957, - "Ġfeel": 958, - "Ġback": 
959, - "Ġfi": 960, - "Ġproble": 961, - "ization": 962, - "ling": 963, - "Ġcommunic": 964, - "ploy": 965, - "Ġaut": 966, - "Ġfriend": 967, - "Ġhuman": 968, - "Ġspe": 969, - "ew": 970, - "Ġpersonal": 971, - "Ġtop": 972, - "Ġent": 973, - "other": 974, - "Ġchang": 975, - "Ġcor": 976, - "Ġchange": 977, - "Ġdecis": 978, - "ability": 979, - "hing": 980, - "atural": 981, - "ever": 982, - "Ġcost": 983, - "Ġgood": 984, - "ause": 985, - "Ġident": 986, - "Ġsoft": 987, - "ined": 988, - "Ġpass": 989, - "'t": 990, - "atures": 991, - "Ġben": 992, - "Ġcompany": 993, - "Ġstart": 994, - "Ġsignificant": 995, - "Ġsumm": 996, - "ond": 997, - "old": 998, - "bers": 999, - "sel": 1000, - "?\\": 1001, - "Ġcur": 1002, - "Ġlight": 1003, - "Ġcommon": 1004, - ".\\\"": 1005, - "Ġcustomers": 1006, - "iving": 1007, - "conom": 1008, - "Ġfunction": 1009, - "Ġve": 1010, - "Ġthree": 1011, - "Ġeven": 1012, - "ining": 1013, - "Ġgener": 1014, - "ries": 1015, - "Ġlevel": 1016, - "Ġspecific": 1017, - "Ġwebs": 1018, - "Ġthen": 1019, - "Ġeffective": 1020, - "cur": 1021, - "ense": 1022, - "Ġlarge": 1023, - "Ġdist": 1024, - "Ġeffic": 1025, - "Ġsupport": 1026, - "Ġget": 1027, - "Create": 1028, - "read": 1029, - "port": 1030, - "Ġinf": 1031, - "Ġ'": 1032, - "Ġyear": 1033, - "Ġstate": 1034, - "Ġkey": 1035, - "ccess": 1036, - ":**": 1037, - "Ġav": 1038, - "Ġknow": 1039, - "Ġbenef": 1040, - "Ġess": 1041, - "ables": 1042, - "ren": 1043, - "Ġown": 1044, - "ĠThese": 1045, - "ock": 1046, - "-t": 1047, - "Ġide": 1048, - "omm": 1049, - "reen": 1050, - "ced": 1051, - "cture": 1052, - "Ġteam": 1053, - "Ġris": 1054, - "Ġtasks": 1055, - "Ġdown": 1056, - "Ġstru": 1057, - "Ġcomputer": 1058, - "-b": 1059, - "Ġfact": 1060, - "Ġmem": 1061, - "etter": 1062, - "\\nS": 1063, - "Ġaround": 1064, - "Ġword": 1065, - "Ġbased": 1066, - "Ġbeh": 1067, - "Ġright": 1068, - "Ġdel": 1069, - "Ġpoint": 1070, - "Ġnatural": 1071, - "ss": 1072, - "Ġeconom": 1073, - "Ġmade": 1074, - "Ġins": 1075, - "Ġinst": 1076, - "Ġmat": 1077, - "Ġvalue": 1078, - "Ġanim": 1079, - "Ġsever": 1080, - "\\nT": 1081, - "ational": 1082, - "ital": 1083, - "ze": 1084, - "ote": 1085, - "ills": 1086, - "tern": 1087, - "Ġread": 1088, - "Ġcontent": 1089, - "Ġonline": 1090, - "Ġend": 1091, - "ĠUn": 1092, - "vent": 1093, - "Ġsee": 1094, - "ending": 1095, - "Ġmon": 1096, - "Ġdr": 1097, - "Ġkeep": 1098, - "Ġsystems": 1099, - "cul": 1100, - "ven": 1101, - "Ġstory": 1102, - "Ġmedia": 1103, - "Ġseveral": 1104, - "hen": 1105, - "ateg": 1106, - "Ġcontin": 1107, - "Ġdev": 1108, - "Ġlearn": 1109, - "Ġla": 1110, - "Ġstre": 1111, - "Ġpartic": 1112, - "Ġair": 1113, - "ually": 1114, - "Ġsuccess": 1115, - "ouse": 1116, - "Ġiss": 1117, - "ied": 1118, - "Ġmachine": 1119, - "Ġopt": 1120, - "Ġx": 1121, - "Ġop": 1122, - "Ġprof": 1123, - "ocus": 1124, - "chie": 1125, - "Ġmeth": 1126, - "ner": 1127, - "omp": 1128, - "ron": 1129, - "Ġhome": 1130, - "Ġbetter": 1131, - "ĠPro": 1132, - "Ġmult": 1133, - "omet": 1134, - "Ġincrease": 1135, - "Ġanaly": 1136, - "vert": 1137, - "Ġrele": 1138, - "Ġbra": 1139, - "ink": 1140, - "Ġtem": 1141, - "Ġpredi": 1142, - "Ġtre": 1143, - "Ġservice": 1144, - "Ġwebsite": 1145, - "Ġmanage": 1146, - "Ġsoftware": 1147, - "here": 1148, - "Ġprot": 1149, - "-s": 1150, - "Ġquest": 1151, - "ier": 1152, - "Ġknown": 1153, - "Ġorder": 1154, - "Ġphys": 1155, - "cept": 1156, - "Ġachie": 1157, - "Ġinput": 1158, - "Ġpossible": 1159, - "ĠIf": 1160, - "Ġext": 1161, - "fter": 1162, - "Ġelect": 1163, - "Ġmethod": 1164, - "Ġbre": 1165, - "ĠAn": 1166, - "ways": 1167, - "ering": 1168, - "ets": 1169, - "Ġjust": 1170, - 
"Ġstore": 1171, - "Ġdevelopment": 1172, - "Ġcare": 1173, - "Ġobject": 1174, - "Ġtype": 1175, - "ĠFor": 1176, - "Ġfocus": 1177, - "ggest": 1178, - "Ġonly": 1179, - "Ġconsid": 1180, - "ars": 1181, - "Ġchall": 1182, - "Ġdeterm": 1183, - "Ġsal": 1184, - "ins": 1185, - "Ġfeatures": 1186, - "Ġtru": 1187, - "ody": 1188, - "Ġtool": 1189, - ">\\": 1190, - "Ġensure": 1191, - "oss": 1192, - "ublic": 1193, - "Ġitem": 1194, - "Here": 1195, - "ination": 1196, - "Ġdef": 1197, - "Describe": 1198, - "ional": 1199, - "roup": 1200, - "Ġconf": 1201, - "Ġneeds": 1202, - "Ġcharacter": 1203, - "Ġvarious": 1204, - "Ġlet": 1205, - "Ġapplic": 1206, - "aut": 1207, - "Ġjob": 1208, - "ellig": 1209, - "ĠCon": 1210, - "Ġbest": 1211, - "Ġfore": 1212, - "Ġamount": 1213, - "rop": 1214, - "Ġbuild": 1215, - "ique": 1216, - "aging": 1217, - "Ġemploy": 1218, - "Ġrest": 1219, - "air": 1220, - "What": 1221, - "Ġtoget": 1222, - "Ġways": 1223, - "Ġidentify": 1224, - "Ġtogether": 1225, - "Ġreal": 1226, - "Ġusers": 1227, - "Ġmean": 1228, - "asing": 1229, - "ĠAm": 1230, - "Ġeduc": 1231, - "Ġalgorithm": 1232, - "Ġnetw": 1233, - "Ġcode": 1234, - "Write": 1235, - "ov": 1236, - "-d": 1237, - "oura": 1238, - "ĠHowever": 1239, - "uture": 1240, - "view": 1241, - "Ġindu": 1242, - "Ġproducts": 1243, - "ected": 1244, - "ertain": 1245, - ";\\": 1246, - "ĠAs": 1247, - "pr": 1248, - "aste": 1249, - "Ġoper": 1250, - "Ġ$": 1251, - "avi": 1252, - "self": 1253, - "Ġ<": 1254, - "Ġindust": 1255, - "Ġgu": 1256, - "Ġothers": 1257, - "Ex": 1258, - "ian": 1259, - "Ġ\"\\\"": 1260, - "-f": 1261, - "nces": 1262, - "Ġfil": 1263, - "Ġrespons": 1264, - "rol": 1265, - "Ġcap": 1266, - "Ġbefore": 1267, - "vern": 1268, - "Ġcomplex": 1269, - "lus": 1270, - "ribut": 1271, - "ats": 1272, - "Ġpositive": 1273, - "oh": 1274, - "Ġlo": 1275, - "Ġgroup": 1276, - "Ġfound": 1277, - "ee": 1278, - "ogn": 1279, - "Ġsw": 1280, - "Ġindividuals": 1281, - "Ġpract": 1282, - "Ġenc": 1283, - "Ġshare": 1284, - "raph": 1285, - "Ġrange": 1286, - "Ġsun": 1287, - "\\t": 1288, - "Ġproviding": 1289, - "icle": 1290, - "Ġdem": 1291, - "Ġplace": 1292, - "Ġaud": 1293, - "joy": 1294, - "Ġmust": 1295, - "els": 1296, - "ery": 1297, - "One": 1298, - "Ġfamily": 1299, - "Ġfuture": 1300, - "less": 1301, - "rent": 1302, - "Ġproblem": 1303, - "Ġessential": 1304, - "rodu": 1305, - "ired": 1306, - "Ġreducing": 1307, - "ism": 1308, - "Ġwarm": 1309, - "ray": 1310, - "Ġability": 1311, - "Ġstrong": 1312, - "Ġalways": 1313, - "Ġresources": 1314, - "Ġbenefits": 1315, - "Ġstrateg": 1316, - "Ġinvolves": 1317, - "Ġassist": 1318, - "erest": 1319, - "nA": 1320, - "ression": 1321, - "Ġ[": 1322, - "ilities": 1323, - "Ġsteps": 1324, - "verall": 1325, - "Ġshow": 1326, - "obal": 1327, - "\\nF": 1328, - "Ġland": 1329, - "ĠHere": 1330, - "Ġbusinesses": 1331, - "ĠEn": 1332, - "pportun": 1333, - "Ġmeas": 1334, - "Ġreturn": 1335, - "Ġdig": 1336, - "Ġhist": 1337, - "yth": 1338, - "Ġcent": 1339, - "Ġable": 1340, - "Ġwithout": 1341, - "yc": 1342, - "plain": 1343, - "Ġrelations": 1344, - "Ġservices": 1345, - "-c": 1346, - "Ġtest": 1347, - "arth": 1348, - "Ġcommunication": 1349, - "Ġintern": 1350, - "new": 1351, - "Ġsit": 1352, - "Ġinvest": 1353, - "Ġcaus": 1354, - "Ġunt": 1355, - "Ġfriends": 1356, - "Ġchanges": 1357, - "cri": 1358, - "dit": 1359, - "ĠBy": 1360, - "ĠYou": 1361, - "Ġmeans": 1362, - "Ġrese": 1363, - "ool": 1364, - "ted": 1365, - "elligence": 1366, - "ains": 1367, - "pping": 1368, - "Ġbel": 1369, - "Ġrepresent": 1370, - "Ġhapp": 1371, - "Ġser": 1372, - "Ġperformance": 1373, - "Ġopportun": 1374, - "Ġtemper": 1375, - 
"ĠShe": 1376, - "Ġfu": 1377, - "ix": 1378, - "bot": 1379, - "Ġwrit": 1380, - "Ġbehavi": 1381, - "Ġproject": 1382, - "ĠWith": 1383, - "ivers": 1384, - "day": 1385, - "Ġphysical": 1386, - "izing": 1387, - "Ġactiv": 1388, - "Ġwithin": 1389, - "Ġinterest": 1390, - "olution": 1391, - "wards": 1392, - "ffic": 1393, - "Ġquick": 1394, - "Ġpublic": 1395, - "Ġgrowth": 1396, - "Ġcho": 1397, - "Ġrelationship": 1398, - "Ġuntil": 1399, - "Ġhelps": 1400, - "Ġstudents": 1401, - "Ġfiel": 1402, - "imes": 1403, - "ulation": 1404, - "ibility": 1405, - "elf": 1406, - "Ġful": 1407, - "Ġsub": 1408, - "ank": 1409, - "ides": 1410, - "Ġskills": 1411, - "Ġclimate": 1412, - "Given": 1413, - "Ġpar": 1414, - "Ġclear": 1415, - "irt": 1416, - "Name": 1417, - "Ġpresent": 1418, - "Ġtri": 1419, - "Ġchalleng": 1420, - "ream": 1421, - "Ġlay": 1422, - "Ġmarketing": 1423, - "Ġsummary": 1424, - "Ġchild": 1425, - "Ġsaf": 1426, - "Ġsure": 1427, - "Ġsame": 1428, - "Ġmu": 1429, - "Ġemail": 1430, - "bon": 1431, - "Ġsomet": 1432, - "```\\": 1433, - "Ġcurrent": 1434, - "amp": 1435, - "ences": 1436, - "ĠRe": 1437, - "Ġtransport": 1438, - "me": 1439, - "-p": 1440, - "action": 1441, - "ĠEx": 1442, - "Ġyears": 1443, - "Ġcomb": 1444, - "hor": 1445, - "anced": 1446, - "ty": 1447, - "Ġlove": 1448, - "Ġgreen": 1449, - "Ġpopular": 1450, - "Ġless": 1451, - "Ġdra": 1452, - "Ġcontrol": 1453, - "Ġaff": 1454, - "Ġconsum": 1455, - "Ġgame": 1456, - "ental": 1457, - "ights": 1458, - "arget": 1459, - "omes": 1460, - "ox": 1461, - "icult": 1462, - "erc": 1463, - "Ġgoals": 1464, - "ancial": 1465, - "tle": 1466, - "Ġgovern": 1467, - "Ġnumbers": 1468, - "Ġfive": 1469, - "Ġstand": 1470, - "Ġsearch": 1471, - "Ġefficient": 1472, - "Ġwal": 1473, - "Ġname": 1474, - "ath": 1475, - "Ġheart": 1476, - "Ġduring": 1477, - "rect": 1478, - "Ġoverall": 1479, - "ython": 1480, - "Ġallows": 1481, - "Ġcity": 1482, - "ave": 1483, - "vant": 1484, - "aterial": 1485, - "Ġwide": 1486, - "Ġmus": 1487, - "ificial": 1488, - "Ġhard": 1489, - "ĠTh": 1490, - "oose": 1491, - "Ġglobal": 1492, - "aj": 1493, - "Ġter": 1494, - "Ġdifficult": 1495, - "Ġline": 1496, - "ĠAl": 1497, - "care": 1498, - "ived": 1499, - "Ġregular": 1500, - "Ġgr": 1501, - "),": 1502, - "lement": 1503, - "Ġhim": 1504, - "Ġunique": 1505, - "Ġenjoy": 1506, - "Ġmeaning": 1507, - "Ġopen": 1508, - "Ġi": 1509, - "abor": 1510, - "Ġarea": 1511, - "Ġitems": 1512, - "Ġclean": 1513, - "ditionally": 1514, - "oid": 1515, - "ĠWe": 1516, - "Ġbeaut": 1517, - "Ġmeet": 1518, - "iple": 1519, - "Ġstatement": 1520, - "Ġagain": 1521, - "ysis": 1522, - "Ġfac": 1523, - "Ġsources": 1524, - "Ġbody": 1525, - "Ġalgorithms": 1526, - "Ġaudience": 1527, - "Ġwant": 1528, - "Ġlog": 1529, - "Ġmaintain": 1530, - "Ġactivities": 1531, - "Ġmove": 1532, - "Ġcult": 1533, - "oney": 1534, - "Ġtarget": 1535, - "\\nB": 1536, - "Ġmaterial": 1537, - "Ġcreating": 1538, - "Ġstructure": 1539, - "atform": 1540, - "ext": 1541, - "Ġexperien": 1542, - "Ġvalues": 1543, - "ead": 1544, - "ohn": 1545, - "Ġhealthy": 1546, - "ross": 1547, - "Ġinteg": 1548, - "Ġresearch": 1549, - "atch": 1550, - "ooking": 1551, - "Ġrole": 1552, - "Ġprovides": 1553, - "iety": 1554, - "ists": 1555, - "Ġfinancial": 1556, - "ories": 1557, - "dent": 1558, - "Ġer": 1559, - "Ġarticle": 1560, - "Ġelements": 1561, - "Ġaddress": 1562, - "Ġconn": 1563, - "ĠUse": 1564, - "mp": 1565, - "Ġeasy": 1566, - "Ġneg": 1567, - "Ġcolor": 1568, - "Ġcalcul": 1569, - "Explain": 1570, - "ĠPl": 1571, - "pect": 1572, - "ince": 1573, - "ale": 1574, - "Ġrisk": 1575, - "curity": 1576, - "ert": 1577, - "Ġfeed": 1578, - 
"Ġevent": 1579, - "vers": 1580, - "ples": 1581, - "Ġlevels": 1582, - "Ġbi": 1583, - "Ġstay": 1584, - "Ġplatform": 1585, - "Ġbreak": 1586, - "back": 1587, - "Ġsat": 1588, - "\\nOverall": 1589, - "Ġeducation": 1590, - "\\nC": 1591, - "Ġcarbon": 1592, - "--------": 1593, - "ape": 1594, - "Ġprevent": 1595, - "Ġaddition": 1596, - "Ġstress": 1597, - "ral": 1598, - "ource": 1599, - "rus": 1600, - "Ġcome": 1601, - "Ġrecogn": 1602, - "ĠUnited": 1603, - "Ġproper": 1604, - "Ġpoll": 1605, - "dentify": 1606, - "Ġunderstanding": 1607, - "Ġdecisions": 1608, - "ict": 1609, - "Ġdire": 1610, - "Ġbehavior": 1611, - "Ġ*": 1612, - "\\nI": 1613, - "Ġmess": 1614, - "Ġanimals": 1615, - "Ġsl": 1616, - "Ġwind": 1617, - "Ġbas": 1618, - "Ġpain": 1619, - "Ġleading": 1620, - "ern": 1621, - "ger": 1622, - "Ġpres": 1623, - "Ġthough": 1624, - "Ġinteract": 1625, - "yle": 1626, - "Ġdoes": 1627, - "Ġhead": 1628, - "Ġintelligence": 1629, - "orts": 1630, - "Ġbecome": 1631, - "Ġrun": 1632, - "aring": 1633, - "Ġimplement": 1634, - "Ġaction": 1635, - "oot": 1636, - "terns": 1637, - "Ġprotect": 1638, - "eric": 1639, - "Ġflow": 1640, - "Ġemot": 1641, - "cessary": 1642, - "urate": 1643, - "Ġsuggest": 1644, - "Ġprogram": 1645, - "Ġphr": 1646, - "Ġhealthcare": 1647, - "ention": 1648, - "Ġsust": 1649, - "Ġwhy": 1650, - "Ġaccurate": 1651, - "lu": 1652, - "Ġhig": 1653, - "Ġreach": 1654, - "Ġallowing": 1655, - "Ġtravel": 1656, - "Ġrequire": 1657, - "Ġareas": 1658, - "Ġdeep": 1659, - "He": 1660, - "Ġfew": 1661, - "Ġself": 1662, - "oun": 1663, - "Ġ#": 1664, - "osp": 1665, - "str": 1666, - "Ġminut": 1667, - "Ġdecision": 1668, - "ĠThere": 1669, - "ances": 1670, - "Ġquality": 1671, - "Ġavail": 1672, - "Ġspace": 1673, - "Ġsomething": 1674, - "Ġweb": 1675, - "Ġpatterns": 1676, - "Ġmot": 1677, - "oring": 1678, - "isf": 1679, - "Ġanother": 1680, - "Ġaccount": 1681, - "\\nW": 1682, - "uss": 1683, - "Ġmaj": 1684, - "uation": 1685, - "Ġsustain": 1686, - "Ġautom": 1687, - "iques": 1688, - "issions": 1689, - "verse": 1690, - "Ġconcept": 1691, - "Ġsecurity": 1692, - "Ġthose": 1693, - "Ġprofess": 1694, - "Ġshort": 1695, - "Ġnight": 1696, - "ength": 1697, - "apt": 1698, - "ex": 1699, - "ĠAdditionally": 1700, - "Ġtaking": 1701, - "Ġtoo": 1702, - "agn": 1703, - "Ġsimple": 1704, - "lusion": 1705, - "iency": 1706, - "ash": 1707, - "ours": 1708, - "Ġpa": 1709, - "Ġlit": 1710, - "ĠSp": 1711, - "iting": 1712, - "Ġdon": 1713, - "Ġlim": 1714, - "lish": 1715, - "mat": 1716, - "aves": 1717, - "ledge": 1718, - "ditional": 1719, - "inc": 1720, - "Ġevents": 1721, - "Ġoffer": 1722, - "thing": 1723, - "Ġworking": 1724, - "Ġanalysis": 1725, - "Ġachieve": 1726, - "Ġpie": 1727, - "Ġbook": 1728, - "Ġfre": 1729, - "Ġmuch": 1730, - "oon": 1731, - "Ġtry": 1732, - "esp": 1733, - "Ġwaste": 1734, - "face": 1735, - "Ġear": 1736, - "Ġfru": 1737, - "Ġtransportation": 1738, - "chool": 1739, - "Ġtechniques": 1740, - "Ġprogramm": 1741, - "ĠEarth": 1742, - "Ġpredict": 1743, - "Ġnever": 1744, - "ws": 1745, - "ument": 1746, - "imately": 1747, - "ared": 1748, - "Ġparticular": 1749, - "Ġtowards": 1750, - "Ġeconomic": 1751, - "Ġincreasing": 1752, - "Ġfast": 1753, - "iment": 1754, - "Ġnetwork": 1755, - "Ġcorrect": 1756, - "Ġmight": 1757, - "Ġoc": 1758, - "Ġbecause": 1759, - "ĠWh": 1760, - "az": 1761, - "play": 1762, - "Ġresults": 1763, - "Ġmanagement": 1764, - "Ġpurch": 1765, - "Ġsound": 1766, - "Ġpast": 1767, - "Ġtraining": 1768, - "____": 1769, - "ope": 1770, - "Ġengage": 1771, - "ourage": 1772, - "Ġsense": 1773, - "Ġfree": 1774, - "Ġpref": 1775, - "ees": 1776, - "Ġcountries": 1777, - "ney": 
1778, - "anies": 1779, - "Ġafter": 1780, - "Ġmind": 1781, - "Ġexc": 1782, - "ĠOnce": 1783, - "ĠĠĠĠĠĠĠĠĠĠĠ": 1784, - "Ġcomplete": 1785, - "Ġimm": 1786, - "Ġest": 1787, - "Ġgenerate": 1788, - "verb": 1789, - "ĠDe": 1790, - "'m": 1791, - "Ġtools": 1792, - "redients": 1793, - "Ġmajor": 1794, - "ently": 1795, - "Ġcontribut": 1796, - "leep": 1797, - "Ġpoints": 1798, - "ditions": 1799, - "Ġfactors": 1800, - "Ġel": 1801, - "Ġnext": 1802, - "ium": 1803, - "oud": 1804, - "Ġcru": 1805, - "Ġreas": 1806, - "riate": 1807, - "ĠInd": 1808, - "Ġpromot": 1809, - "Ġhistory": 1810, - "Ġjour": 1811, - "Ġdue": 1812, - "Con": 1813, - "Ġveget": 1814, - "ency": 1815, - "ĠAmeric": 1816, - "Ġfra": 1817, - "Ġdifference": 1818, - "oard": 1819, - "lex": 1820, - "Ġequation": 1821, - "irtual": 1822, - "Ġcup": 1823, - "Ġforest": 1824, - "Ġnegative": 1825, - "Ġsecon": 1826, - "ones": 1827, - "Ġnature": 1828, - "Ġuses": 1829, - "ah": 1830, - "por": 1831, - "Ġsec": 1832, - "ording": 1833, - "Ġlast": 1834, - "ĠSome": 1835, - "Ġissues": 1836, - "Ġscient": 1837, - "Ġprint": 1838, - "ĠStates": 1839, - "over": 1840, - "Ġsatisf": 1841, - "Ġdevices": 1842, - "Ġdise": 1843, - "Ġtemperature": 1844, - "Ġfeedback": 1845, - "Ġnecessary": 1846, - "Ġemissions": 1847, - "mb": 1848, - "Ġlow": 1849, - "for": 1850, - "tal": 1851, - "Ġchallenges": 1852, - "Ġarray": 1853, - "Ġside": 1854, - "Ġengine": 1855, - "Ġboo": 1856, - "ata": 1857, - "Ġbelie": 1858, - "-m": 1859, - "Ġmultiple": 1860, - "Ġsing": 1861, - "Ġgovernment": 1862, - "ames": 1863, - "ified": 1864, - "Ġminutes": 1865, - "Ġsuccessful": 1866, - "Ġmoney": 1867, - "Ġquickly": 1868, - "Ġbir": 1869, - "Ġtypically": 1870, - "Ġpost": 1871, - "Ġprep": 1872, - "Ġknowledge": 1873, - "pped": 1874, - "actions": 1875, - "Ġmethods": 1876, - "Ġoptim": 1877, - "\\nP": 1878, - "Ġoutput": 1879, - "Ġfield": 1880, - "Ġtable": 1881, - "Ġbal": 1882, - "Ġcoll": 1883, - "Ġcharacters": 1884, - "volution": 1885, - "ords": 1886, - "ilar": 1887, - "ification": 1888, - "ane": 1889, - "Ġcell": 1890, - "Ġmil": 1891, - "ĠWhat": 1892, - "Ġsqu": 1893, - "Ġlives": 1894, - "ĠAr": 1895, - "Ġphrase": 1896, - "Ġnut": 1897, - "Ġdigital": 1898, - "Ġinternet": 1899, - "lass": 1900, - "ura": 1901, - "ommend": 1902, - "Ġtreat": 1903, - "Ġapprop": 1904, - "resh": 1905, - "urther": 1906, - "ĠOne": 1907, - "Ġvisual": 1908, - "ategor": 1909, - "Ġapproach": 1910, - "Ġcertain": 1911, - "Ġsho": 1912, - "val": 1913, - "Ġtask": 1914, - "ires": 1915, - "Ġappropriate": 1916, - "Ġvie": 1917, - "Ġdesigned": 1918, - "pose": 1919, - "**:": 1920, - "fort": 1921, - "Ġ|\\": 1922, - "Ġapplications": 1923, - "Ġpay": 1924, - "Ġnow": 1925, - "Ġheat": 1926, - "Ġindustry": 1927, - "pre": 1928, - "Ġeffectively": 1929, - "Ġpopulation": 1930, - "Ġopportunities": 1931, - 
"":29,"?":30,"@":31,"A":32,"B":33,"C":34,"D":35,"E":36,"F":37,"G":38,"H":39,"I":40,"J":41,"K":42,"L":43,"M":44,"N":45,"O":46,"P":47,"Q":48,"R":49,"S":50,"T":51,"U":52,"V":53,"W":54,"X":55,"Y":56,"Z":57,"[":58,"\\":59,"]":60,"^":61,"_":62,"`":63,"a":64,"b":65,"c":66,"d":67,"e":68,"f":69,"g":70,"h":71,"i":72,"j":73,"k":74,"l":75,"m":76,"n":77,"o":78,"p":79,"q":80,"r":81,"s":82,"t":83,"u":84,"v":85,"w":86,"x":87,"y":88,"z":89,"{":90,"|":91,"}":92,"~":93,"Ċ":94,"Ġ":95,"ĠĠ":96,"Ġt":97,"Ġa":98,"in":99,"he":100,"re":101,"on":102,"Ġthe":103,"Ġs":104,"er":105,"at":106,"Ġc":107,"ĠĠĠĠ":108,"en":109,"Ġo":110,"Ġ\"":111,"nd":112,"es":113,"ing":114,"ĠĠĠ":115,"it":116,"Ġp":117,"or":118,"ou":119,"Ġand":120,"Ġw":121,"is":122,"Ġf":123,"an":124,"ion":125,"al":126,"Ġb":127,"Ġto":128,"Ġm":129,"Ġin":130,"Ġof":131,"le":132,"ct":133,"ar":134,"ut":135,"Ġd":136,"st":137,"ed":138,"ĠĠĠĠĠĠĠ":139,"ic":140,"\":":141,",Ċ":142,"ro":143,"ent":144,"\\n":145,"Ġe":146,"put":147,"om":148,"Ġre":149,"as":150,"ve":151,"Ġh":152,"Ġth":153,"\",Ċ":154,"Ġl":155,"Ġis":156,"et":157,"ce":158,"Ġn":159,".\\":160,"im":161,"il":162,"Ġg":163,"Ġu":164,"ction":165,"ru":166,"ation":167,"ol":168,"ch":169,"ĠT":170,"Ġfor":171,"out":172,"ra":173,"ow":174,"id":175,"ly":176,"Ġst":177,"Ġbe":178,"Ġy":179,"Ġpro":180,"ig":181,"se":182,"ate":183,"Ġthat":184,"ith":185,"ir":186,"ur":187,"ot":188,"Ġor":189,"Ġon":190,"Ġyou":191,"ers":192,"stru":193,"Ġan":194,"if":195,"ul":196,"struction":197,"Ġ{":198,"Ġ}":199,"Ġcan":200,"input":201,"output":202,"instruction":203,"Ġ{Ċ":204,"Ġ},Ċ":205,"\"Ċ":206,"Ġhe":207,"Ġcon":208,"Ġit":209,"ay":210,"ess":211,"Ġwith":212,"ver":213,"el":214,"Ġas":215,"am":216,"ĠA":217,"ge":218,"Ġsu":219,"iv":220,".\",Ċ":221,"Ġcom":222,"ĠI":223,"ment":224,"ak":225,"Ġal":226,"\\\"":227,".\"Ċ":228,"ive":229,"Ġare":230,"ab":231,"ad":232,"Ġmo":233,"Ġex":234,"Ġv":235,"ĠS":236,"res":237,"pp":238,"qu":239,"Ġde":240,"Ġwh":241,"ity":242,"Ġen":243,"ĠThe":244,"her":245,"ld":246,"ri":247,"ter":248,"ant":249,"ĠC":250,"ist":251,"Ġ\"\",Ċ":252,"um":253,"Ġus":254,"Ġne":255,"ain":256,"th":257,"ect":258,"Ġle":259,"op":260,"em":261,"ies":262,"Ġch":263,"Ġim":264,"du":265,"od":266,"ort":267,"nt":268,"est":269,"igh":270,"ere":271,"Ġha":272,"us":273,"ure":274,"ial":275,"oc":276,"Ġwor":277,"Ġtheir":278,"ac":279,"ence":280,"iz":281,"Ġyour":282,"os":283,"Ġimp":284,"ud":285,"Ġby":286,"Ġse":287,"ine":288,"ould":289,"low":290,"ill":291,"age":292,"rom":293,"Ġsp":294,"ĠP":295,"Ġsh":296,"ust":297,"The":298,"un":299,"'s":300,"Ġinc":301,"ide":302,"pl":303,"ight":304,"og":305,"Ġpl":306,"pt":307,"are":308,"Ġte":309,"Ġint":310,"Ġ\\":311,"his":312,"Ġr":313,"ake":314,"per":315,"orm":316,"ag":317,"ff":318,"ĠE":319,"art":320,"Ġk":321,"end":322,"ĠM":323,"Ġwe":324,"ĠB":325,"Ġad":326,"cess":327,"rou":328,"ical":329,"all":330,"able":331,"Ġfrom":332,"and":333,"ĠH":334,"Ġab":335,"act":336,"Ġcomp":337,"ome":338,"ach":339,"ĠThis":340,"Ġhave":341,"form":342,"Ġ\\\"":343,"ast":344,"Ġat":345,"ĠW":346,"Ġres":347,"Ġdat":348,":\\":349,"ther":350,"ions":351,"ore":352,"Ġ(":353,"Ġcont":354,"our":355,"ep":356,"ĠF":357,"Ġac":358,"ance":359,"ĠR":360,"gh":361,"Ġme":362,"ces":363,"Ġwas":364,"ind":365,"vel":366,"ations":367,"Ġhel":368,"Ġmore":369,"ult":370,"ĠD":371,"reat":372,"ign":373,"Ġhelp":374,"ime":375,"ard":376,"Ġcl":377,"Ġapp":378,"ans":379,"ie":380,"Ġdata":381,"ich":382,"ang":383,"ous":384,"ell":385,"ks":386,"ase":387,"ice":388,"ip":389,"ite":390,"Ġsuch":391,"Ġfe":392,"Ġwhe":393,"ib":394,"Ġother":395,"Ġthis":396,"ass":397,"ual":398,"ile":399,"ne":400,"red":401,"Ġhas":402,"oo":403,"ress":404,"ific":405,"ni
ng":406,"Ġ=":407,"Ġup":408,"Ġman":409,"Ġar":410,"ong":411,"ec":412,"Ġtra":413,"av":414,"Ġwhich":415,"Ġgo":416,"Ġprov":417,"Ġdis":418,"**":419,"so":420,"ĠG":421,"one":422,"Ġem":423,"Ġnot":424,"ue":425,"ĠO":426,"Ġj":427,"ace":428,"Ġthey":429,"ame":430,"Ġqu":431,"ĠL":432,"iff":433,"Ġfol":434,"ary":435,"ated":436,"ustom":437,"ition":438,"Ġits":439,"Ġsy":440,"ke":441,"ack":442,"ry":443,"--":444,"Ġtime":445,"Ġdes":446,"Ġnew":447,"ents":448,"ount":449,"Ġfollow":450,"Ġalso":451,"Ġcomm":452,"Ġout":453,"Ġeff":454,"Ġdiff":455,"iven":456,"ap":457,"Ġsent":458,"\\u":459,"Ġso":460,"Ġprodu":461,"Ġuse":462,"Ġsc":463,"Ġ-":464,"Ġun":465,"lud":466,"ĠIt":467,"ener":468,"king":469,"Ġev":470,"Ġabout":471,"Ġthem":472,"ĠU":473,"Ġcustom":474,"Ġro":475,"Ġinclud":476,"les":477,"etw":478,"stem":479,"xt":480,"Ġinto":481,"Ġper":482,"ĠIn":483,"ĠN":484,"Ġwill":485,"Ġlear":486,"ber":487,"Ġall":488,"Ġpe":489,"ds":490,"Ġtw":491,"aking":492,"ark":493,"ful":494,"Ġmake":495,"chn":496,"erv":497,"ost":498,"rough":499,"Ġone":500,"Ġinter":501,"ities":502,"ail":503,"ike":504,"ree":505,"ple":506,"alth":507,"Ġused":508,"ors":509,"Ġover":510,"ility":511,"ments":512,"ange":513,"Ġway":514,"ory":515,"Ġcol":516,"Ġpr":517,"Ġcould":518,"Ġnum":519,"reate":520,"int":521,"Ġredu":522,"erson":523,"Ġrec":524,"Ġher":525,"Ġneed":526,"ms":527,"ater":528,"oy":529,"Ġsystem":530,"Ġinform":531,"Ġtwo":532,"Ġtechn":533,"Ġsentence":534,"ience":535,"ize":536,"get":537,"Ġdiffere":538,"ood":539,"rib":540,"Ġbut":541,"Ġfollowing":542,"ased":543,"olog":544,"erg":545,"led":546,"ures":547,"In":548,"ear":549,"Ġph":550,"own":551,"Ġpre":552,"Ġwould":553,"Ġusing":554,"Ġcons":555,"Ġwork":556,"Ġmod":557,"ating":558,"ia":559,"ire":560,"Ġpos":561,"ient":562,"ob":563,"ject":564,"Ġinv":565,"ons":566,"Ġdo":567,"ular":568,"Ġdec":569,"Ġhealth":570,"Ġimpro":571,"Ġany":572,"Ġthrough":573,"yp":574,"row":575,"velop":576,"Ġprocess":577,"Ġtr":578,"lic":579,"very":580,"als":581,"ify":582,"``":583,"ari":584,"Ġstr":585,"Ġimport":586,"Ġlike":587,"Ġproduct":588,"Ġsome":589,"ph":590,"ential":591,"Ġam":592,"ates":593,"Ġacc":594,"ens":595,"ns":596,"Ġsm":597,"Ġind":598,"een":599,"Ġexper":600,"lect":601,"Ġval":602,"Ġrel":603,"its":604,"Ġinformation":605,"ings":606,"ĠJ":607,"ople":608,"iness":609,"Ġgiven":610,"mm":611,"ices":612,"Ġpart":613,"ild":614,"ys":615,"Ġour":616,"nder":617,"Ġperson":618,"ally":619,"Ġke":620,"etween":621,"ft":622,"oth":623,"Ġspec":624,"Ġbetween":625,"ergy":626,"ĠAI":627,"Ġwho":628,"Ġmay":629,"ef":630,"ative":631,"ise":632,"Ġlist":633,"Ġkn":634,"Ġadd":635,",\\":636,"ord":637,"ics":638,"Ġpeople":639,"ĠSt":640,"Ġhis":641,"Ġexp":642,"ible":643,"Ġthere":644,"Ġserv":645,"Ġincre":646,"Ġdevelop":647,"ound":648,"ower":649,"Ġtrans":650,"bs":651,"Ġenergy":652,"Ġoff":653,"Ġbus":654,"Ġwhile":655,"ose":656,"Ġact":657,"Ġexam":658,"Ġlearning":659,"ctions":660,"con":661,"gor":662,"gan":663,"ution":664,"round":665,"pport":666,"Ġhow":667,"Ġbl":668,"Ġmed":669,"anc":670,"Ġtyp":671,"Ġra":672,"Ġcar":673,"ife":674,"Ġworld":675,"Ġvari":676,"Ġrep":677,"au":678,"Ġsoc":679,"Ġprovid":680,"Ġset":681,"ten":682,"Ġsol":683,"Ġeach":684,"Ġwhen":685,"Ġeffect":686,"Ġpo":687,"Ġshe":688,"ick":689,"Ġwhere":690,"Ġmodel":691,"Ġimportant":692,"Ġunder":693,"Ġprog":694,"enerate":695,"ural":696,"tain":697,"Ġass":698,"ology":699,"Ġhad":700,"ook":701,"gg":702,"Ġcustomer":703,"ting":704,"ving":705,"Ġresp":706,"line":707,"Ġcreat":708,"ll":709,"ily":710,"Ġreg":711,"Ġdet":712,"Ġif":713,"Ġ+":714,"Ġbusiness":715,"\\nIn":716,"ish":717,"Ġmost":718,"ĠĠĠĠĠĠĠĠ":719,"hes":720,"angu":721,"Ġprovide":722,"Ġadv":723,"erm":724,"ub":725
,"Ġsk":726,"irst":727,"any":728,"Ġday":729,"ivid":730,"arm":731,"ract":732,"nce":733,"Ġ|":734,"Ġimprove":735,")\\":736,"Ġco":737,"Ġcommun":738,"arket":739,"Ġmet":740,"cy":741,"Ġdifferent":742,"ized":743,"Ġart":744,"\\nThe":745,"rit":746,"Ġcomput":747,"Ġform":748,"ck":749,"Ġhum":750,"Ġchar":751,"ble":752,"Ġlead":753,"iron":754,"Ġrem":755,"Ġshould":756,"te":757,"Ġallow":758,"ness":759,"hat":760,"Ġfun":761,"Ġcomple":762,"Ġlangu":763,"ages":764,"Ġbec":765,"Ġsign":766,"ues":767,"ature":768,"Ġfind":769,"riend":770,"Ġstud":771,"Ġmain":772,"imate":773,"ove":774,"Ġresult":775,"Ġplay":776,"Ġreduce":777,"Ġeng":778,"ware":779,"redi":780,"Ġnumber":781,"Ġlar":782,"Ġpol":783,"Ġpat":784,"Ġwell":785,"ident":786,"viron":787,"rite":788,"crib":789,"Ġbu":790,"Ġhigh":791,"Ġthese":792,"ives":793,"ves":794,"Ġdesign":795,"urn":796,"Ġthan":797,"der":798,"Ġanal":799,"Ġwater":800,"Ġmarket":801,"Ġexample":802,"way":803,"stand":804,"ng":805,"ax":806,"itive":807,"Ġ`":808,"iqu":809,"Ġsim":810,"Ġequ":811,"gorith":812,"Ġtext":813,"resent":814,"Ġmany":815,"uring":816,"----":817,"\\nA":818,"Ġdi":819,"Ġsa":820,"vironment":821,"arch":822,"Ġatt":823,"Ġpot":824,"Ġtas":825,"Ġcreate":826,"ough":827,"Ġfl":828,"Ġmaking":829,"ious":830,"Ġgra":831,"Ġlife":832,"\\nO":833,"Ġalgorith":834,"ality":835,"eng":836,"Ġfin":837,"uc":838,"?\",Ċ":839,"ĠY":840,"Ġret":841,"Ġbeen":842,"Ġtechnology":843,"Ġprogra":844,"Ġhand":845,"hip":846,"wn":847,"Ġcal":848,"Ġwhat":849,"ividual":850,"iss":851,"ety":852,"Ġlanguage":853,"ources":854,"Ġclass":855,"Ġtake":856,"Ġeas":857,"ric":858,"Ġvis":859,"bject":860,"Ġref":861,"Ġenvironment":862,"Ġfirst":863,"eg":864,"Ġindividual":865,"Ġplan":866,"Ġperform":867,"Ġru":868,"ien":869,"Ġimpact":870,"Ġag":871,"ade":872,"Ġcle":873,"Ġrequ":874,"dition":875,"__":876,"Ġche":877,"ption":878,"Ġappro":879,"Ġ**":880,"Ġgreat":881,"ved":882,"Ġexpl":883,"Ġgrow":884,"Generate":885,"Ġmy":886,"Ġincluding":887,"Ġaccess":888,"Ġpop":889,"Ġmin":890,"fore":891,"Ġsocial":892,"ines":893,"Ġcharact":894,"Ġbr":895,"Ġstep":896,"Ġunderstand":897,"Ġorgan":898,"ĠAd":899,"Ġdisc":900,"Ġpower":901,"Ġlong":902,"hed":903,"Ġconc":904,"ward":905,"ited":906,"Ġele":907,"cing":908,"Ġevery":909,"Ġca":910,"Ġoften":911,"Ġuser":912,"vie":913,"ĠV":914,"Ġfood":915,"Ġinclude":916,"Ġloc":917,"ases":918,"ically":919,"ode":920,"ants":921,"Ġinvol":922,"Ġsmall":923,"Ġsur":924,"achine":925,"Ġbeing":926,"Ġpotential":927,"Ġno":928,"ĠCh":929,"Ġdep":930,"ather":931,"Ġboth":932,"Ġens":933,"Ġposs":934,"Ġed":935,"cribe":936,"ts":937,"ork":938,"ĠThey":939,"Ġpur":940,"ivity":941,"Ġwords":942,"Ġsignific":943,"Ġwere":944,"ĠHow":945,"Ġprom":946,"Ġexperience":947,"ĠK":948,"up":949,"Ġcount":950,"ered":951,"Des":952,"Ġfam":953,"```":954,"akes":955,"Ġgl":956,"ĠHe":957,"Ġfeel":958,"Ġback":959,"Ġfi":960,"Ġproble":961,"ization":962,"ling":963,"Ġcommunic":964,"ploy":965,"Ġaut":966,"Ġfriend":967,"Ġhuman":968,"Ġspe":969,"ew":970,"Ġpersonal":971,"Ġtop":972,"Ġent":973,"other":974,"Ġchang":975,"Ġcor":976,"Ġchange":977,"Ġdecis":978,"ability":979,"hing":980,"atural":981,"ever":982,"Ġcost":983,"Ġgood":984,"ause":985,"Ġident":986,"Ġsoft":987,"ined":988,"Ġpass":989,"'t":990,"atures":991,"Ġben":992,"Ġcompany":993,"Ġstart":994,"Ġsignificant":995,"Ġsumm":996,"ond":997,"old":998,"bers":999,"sel":1000,"?\\":1001,"Ġcur":1002,"Ġlight":1003,"Ġcommon":1004,".\\\"":1005,"Ġcustomers":1006,"iving":1007,"conom":1008,"Ġfunction":1009,"Ġve":1010,"Ġthree":1011,"Ġeven":1012,"ining":1013,"Ġgener":1014,"ries":1015,"Ġlevel":1016,"Ġspecific":1017,"Ġwebs":1018,"Ġthen":1019,"Ġeffective":1020,"cur":1021,"ense":1022,"Ġlarge":1
023,"Ġdist":1024,"Ġeffic":1025,"Ġsupport":1026,"Ġget":1027,"Create":1028,"read":1029,"port":1030,"Ġinf":1031,"Ġ'":1032,"Ġyear":1033,"Ġstate":1034,"Ġkey":1035,"ccess":1036,":**":1037,"Ġav":1038,"Ġknow":1039,"Ġbenef":1040,"Ġess":1041,"ables":1042,"ren":1043,"Ġown":1044,"ĠThese":1045,"ock":1046,"-t":1047,"Ġide":1048,"omm":1049,"reen":1050,"ced":1051,"cture":1052,"Ġteam":1053,"Ġris":1054,"Ġtasks":1055,"Ġdown":1056,"Ġstru":1057,"Ġcomputer":1058,"-b":1059,"Ġfact":1060,"Ġmem":1061,"etter":1062,"\\nS":1063,"Ġaround":1064,"Ġword":1065,"Ġbased":1066,"Ġbeh":1067,"Ġright":1068,"Ġdel":1069,"Ġpoint":1070,"Ġnatural":1071,"ss":1072,"Ġeconom":1073,"Ġmade":1074,"Ġins":1075,"Ġinst":1076,"Ġmat":1077,"Ġvalue":1078,"Ġanim":1079,"Ġsever":1080,"\\nT":1081,"ational":1082,"ital":1083,"ze":1084,"ote":1085,"ills":1086,"tern":1087,"Ġread":1088,"Ġcontent":1089,"Ġonline":1090,"Ġend":1091,"ĠUn":1092,"vent":1093,"Ġsee":1094,"ending":1095,"Ġmon":1096,"Ġdr":1097,"Ġkeep":1098,"Ġsystems":1099,"cul":1100,"ven":1101,"Ġstory":1102,"Ġmedia":1103,"Ġseveral":1104,"hen":1105,"ateg":1106,"Ġcontin":1107,"Ġdev":1108,"Ġlearn":1109,"Ġla":1110,"Ġstre":1111,"Ġpartic":1112,"Ġair":1113,"ually":1114,"Ġsuccess":1115,"ouse":1116,"Ġiss":1117,"ied":1118,"Ġmachine":1119,"Ġopt":1120,"Ġx":1121,"Ġop":1122,"Ġprof":1123,"ocus":1124,"chie":1125,"Ġmeth":1126,"ner":1127,"omp":1128,"ron":1129,"Ġhome":1130,"Ġbetter":1131,"ĠPro":1132,"Ġmult":1133,"omet":1134,"Ġincrease":1135,"Ġanaly":1136,"vert":1137,"Ġrele":1138,"Ġbra":1139,"ink":1140,"Ġtem":1141,"Ġpredi":1142,"Ġtre":1143,"Ġservice":1144,"Ġwebsite":1145,"Ġmanage":1146,"Ġsoftware":1147,"here":1148,"Ġprot":1149,"-s":1150,"Ġquest":1151,"ier":1152,"Ġknown":1153,"Ġorder":1154,"Ġphys":1155,"cept":1156,"Ġachie":1157,"Ġinput":1158,"Ġpossible":1159,"ĠIf":1160,"Ġext":1161,"fter":1162,"Ġelect":1163,"Ġmethod":1164,"Ġbre":1165,"ĠAn":1166,"ways":1167,"ering":1168,"ets":1169,"Ġjust":1170,"Ġstore":1171,"Ġdevelopment":1172,"Ġcare":1173,"Ġobject":1174,"Ġtype":1175,"ĠFor":1176,"Ġfocus":1177,"ggest":1178,"Ġonly":1179,"Ġconsid":1180,"ars":1181,"Ġchall":1182,"Ġdeterm":1183,"Ġsal":1184,"ins":1185,"Ġfeatures":1186,"Ġtru":1187,"ody":1188,"Ġtool":1189,">\\":1190,"Ġensure":1191,"oss":1192,"ublic":1193,"Ġitem":1194,"Here":1195,"ination":1196,"Ġdef":1197,"Describe":1198,"ional":1199,"roup":1200,"Ġconf":1201,"Ġneeds":1202,"Ġcharacter":1203,"Ġvarious":1204,"Ġlet":1205,"Ġapplic":1206,"aut":1207,"Ġjob":1208,"ellig":1209,"ĠCon":1210,"Ġbest":1211,"Ġfore":1212,"Ġamount":1213,"rop":1214,"Ġbuild":1215,"ique":1216,"aging":1217,"Ġemploy":1218,"Ġrest":1219,"air":1220,"What":1221,"Ġtoget":1222,"Ġways":1223,"Ġidentify":1224,"Ġtogether":1225,"Ġreal":1226,"Ġusers":1227,"Ġmean":1228,"asing":1229,"ĠAm":1230,"Ġeduc":1231,"Ġalgorithm":1232,"Ġnetw":1233,"Ġcode":1234,"Write":1235,"ov":1236,"-d":1237,"oura":1238,"ĠHowever":1239,"uture":1240,"view":1241,"Ġindu":1242,"Ġproducts":1243,"ected":1244,"ertain":1245,";\\":1246,"ĠAs":1247,"pr":1248,"aste":1249,"Ġoper":1250,"Ġ$":1251,"avi":1252,"self":1253,"Ġ<":1254,"Ġindust":1255,"Ġgu":1256,"Ġothers":1257,"Ex":1258,"ian":1259,"Ġ\"\\\"":1260,"-f":1261,"nces":1262,"Ġfil":1263,"Ġrespons":1264,"rol":1265,"Ġcap":1266,"Ġbefore":1267,"vern":1268,"Ġcomplex":1269,"lus":1270,"ribut":1271,"ats":1272,"Ġpositive":1273,"oh":1274,"Ġlo":1275,"Ġgroup":1276,"Ġfound":1277,"ee":1278,"ogn":1279,"Ġsw":1280,"Ġindividuals":1281,"Ġpract":1282,"Ġenc":1283,"Ġshare":1284,"raph":1285,"Ġrange":1286,"Ġsun":1287,"\\t":1288,"Ġproviding":1289,"icle":1290,"Ġdem":1291,"Ġplace":1292,"Ġaud":1293,"joy":1294,"Ġmust":1295,"els":1296,"ery":1297,"One":1298,"Ġf
amily":1299,"Ġfuture":1300,"less":1301,"rent":1302,"Ġproblem":1303,"Ġessential":1304,"rodu":1305,"ired":1306,"Ġreducing":1307,"ism":1308,"Ġwarm":1309,"ray":1310,"Ġability":1311,"Ġstrong":1312,"Ġalways":1313,"Ġresources":1314,"Ġbenefits":1315,"Ġstrateg":1316,"Ġinvolves":1317,"Ġassist":1318,"erest":1319,"nA":1320,"ression":1321,"Ġ[":1322,"ilities":1323,"Ġsteps":1324,"verall":1325,"Ġshow":1326,"obal":1327,"\\nF":1328,"Ġland":1329,"ĠHere":1330,"Ġbusinesses":1331,"ĠEn":1332,"pportun":1333,"Ġmeas":1334,"Ġreturn":1335,"Ġdig":1336,"Ġhist":1337,"yth":1338,"Ġcent":1339,"Ġable":1340,"Ġwithout":1341,"yc":1342,"plain":1343,"Ġrelations":1344,"Ġservices":1345,"-c":1346,"Ġtest":1347,"arth":1348,"Ġcommunication":1349,"Ġintern":1350,"new":1351,"Ġsit":1352,"Ġinvest":1353,"Ġcaus":1354,"Ġunt":1355,"Ġfriends":1356,"Ġchanges":1357,"cri":1358,"dit":1359,"ĠBy":1360,"ĠYou":1361,"Ġmeans":1362,"Ġrese":1363,"ool":1364,"ted":1365,"elligence":1366,"ains":1367,"pping":1368,"Ġbel":1369,"Ġrepresent":1370,"Ġhapp":1371,"Ġser":1372,"Ġperformance":1373,"Ġopportun":1374,"Ġtemper":1375,"ĠShe":1376,"Ġfu":1377,"ix":1378,"bot":1379,"Ġwrit":1380,"Ġbehavi":1381,"Ġproject":1382,"ĠWith":1383,"ivers":1384,"day":1385,"Ġphysical":1386,"izing":1387,"Ġactiv":1388,"Ġwithin":1389,"Ġinterest":1390,"olution":1391,"wards":1392,"ffic":1393,"Ġquick":1394,"Ġpublic":1395,"Ġgrowth":1396,"Ġcho":1397,"Ġrelationship":1398,"Ġuntil":1399,"Ġhelps":1400,"Ġstudents":1401,"Ġfiel":1402,"imes":1403,"ulation":1404,"ibility":1405,"elf":1406,"Ġful":1407,"Ġsub":1408,"ank":1409,"ides":1410,"Ġskills":1411,"Ġclimate":1412,"Given":1413,"Ġpar":1414,"Ġclear":1415,"irt":1416,"Name":1417,"Ġpresent":1418,"Ġtri":1419,"Ġchalleng":1420,"ream":1421,"Ġlay":1422,"Ġmarketing":1423,"Ġsummary":1424,"Ġchild":1425,"Ġsaf":1426,"Ġsure":1427,"Ġsame":1428,"Ġmu":1429,"Ġemail":1430,"bon":1431,"Ġsomet":1432,"```\\":1433,"Ġcurrent":1434,"amp":1435,"ences":1436,"ĠRe":1437,"Ġtransport":1438,"me":1439,"-p":1440,"action":1441,"ĠEx":1442,"Ġyears":1443,"Ġcomb":1444,"hor":1445,"anced":1446,"ty":1447,"Ġlove":1448,"Ġgreen":1449,"Ġpopular":1450,"Ġless":1451,"Ġdra":1452,"Ġcontrol":1453,"Ġaff":1454,"Ġconsum":1455,"Ġgame":1456,"ental":1457,"ights":1458,"arget":1459,"omes":1460,"ox":1461,"icult":1462,"erc":1463,"Ġgoals":1464,"ancial":1465,"tle":1466,"Ġgovern":1467,"Ġnumbers":1468,"Ġfive":1469,"Ġstand":1470,"Ġsearch":1471,"Ġefficient":1472,"Ġwal":1473,"Ġname":1474,"ath":1475,"Ġheart":1476,"Ġduring":1477,"rect":1478,"Ġoverall":1479,"ython":1480,"Ġallows":1481,"Ġcity":1482,"ave":1483,"vant":1484,"aterial":1485,"Ġwide":1486,"Ġmus":1487,"ificial":1488,"Ġhard":1489,"ĠTh":1490,"oose":1491,"Ġglobal":1492,"aj":1493,"Ġter":1494,"Ġdifficult":1495,"Ġline":1496,"ĠAl":1497,"care":1498,"ived":1499,"Ġregular":1500,"Ġgr":1501,"),":1502,"lement":1503,"Ġhim":1504,"Ġunique":1505,"Ġenjoy":1506,"Ġmeaning":1507,"Ġopen":1508,"Ġi":1509,"abor":1510,"Ġarea":1511,"Ġitems":1512,"Ġclean":1513,"ditionally":1514,"oid":1515,"ĠWe":1516,"Ġbeaut":1517,"Ġmeet":1518,"iple":1519,"Ġstatement":1520,"Ġagain":1521,"ysis":1522,"Ġfac":1523,"Ġsources":1524,"Ġbody":1525,"Ġalgorithms":1526,"Ġaudience":1527,"Ġwant":1528,"Ġlog":1529,"Ġmaintain":1530,"Ġactivities":1531,"Ġmove":1532,"Ġcult":1533,"oney":1534,"Ġtarget":1535,"\\nB":1536,"Ġmaterial":1537,"Ġcreating":1538,"Ġstructure":1539,"atform":1540,"ext":1541,"Ġexperien":1542,"Ġvalues":1543,"ead":1544,"ohn":1545,"Ġhealthy":1546,"ross":1547,"Ġinteg":1548,"Ġresearch":1549,"atch":1550,"ooking":1551,"Ġrole":1552,"Ġprovides":1553,"iety":1554,"ists":1555,"Ġfinancial":1556,"ories":1557,"dent":1558,"Ġer":1559,"Ġarti
cle":1560,"Ġelements":1561,"Ġaddress":1562,"Ġconn":1563,"ĠUse":1564,"mp":1565,"Ġeasy":1566,"Ġneg":1567,"Ġcolor":1568,"Ġcalcul":1569,"Explain":1570,"ĠPl":1571,"pect":1572,"ince":1573,"ale":1574,"Ġrisk":1575,"curity":1576,"ert":1577,"Ġfeed":1578,"Ġevent":1579,"vers":1580,"ples":1581,"Ġlevels":1582,"Ġbi":1583,"Ġstay":1584,"Ġplatform":1585,"Ġbreak":1586,"back":1587,"Ġsat":1588,"\\nOverall":1589,"Ġeducation":1590,"\\nC":1591,"Ġcarbon":1592,"--------":1593,"ape":1594,"Ġprevent":1595,"Ġaddition":1596,"Ġstress":1597,"ral":1598,"ource":1599,"rus":1600,"Ġcome":1601,"Ġrecogn":1602,"ĠUnited":1603,"Ġproper":1604,"Ġpoll":1605,"dentify":1606,"Ġunderstanding":1607,"Ġdecisions":1608,"ict":1609,"Ġdire":1610,"Ġbehavior":1611,"Ġ*":1612,"\\nI":1613,"Ġmess":1614,"Ġanimals":1615,"Ġsl":1616,"Ġwind":1617,"Ġbas":1618,"Ġpain":1619,"Ġleading":1620,"ern":1621,"ger":1622,"Ġpres":1623,"Ġthough":1624,"Ġinteract":1625,"yle":1626,"Ġdoes":1627,"Ġhead":1628,"Ġintelligence":1629,"orts":1630,"Ġbecome":1631,"Ġrun":1632,"aring":1633,"Ġimplement":1634,"Ġaction":1635,"oot":1636,"terns":1637,"Ġprotect":1638,"eric":1639,"Ġflow":1640,"Ġemot":1641,"cessary":1642,"urate":1643,"Ġsuggest":1644,"Ġprogram":1645,"Ġphr":1646,"Ġhealthcare":1647,"ention":1648,"Ġsust":1649,"Ġwhy":1650,"Ġaccurate":1651,"lu":1652,"Ġhig":1653,"Ġreach":1654,"Ġallowing":1655,"Ġtravel":1656,"Ġrequire":1657,"Ġareas":1658,"Ġdeep":1659,"He":1660,"Ġfew":1661,"Ġself":1662,"oun":1663,"Ġ#":1664,"osp":1665,"str":1666,"Ġminut":1667,"Ġdecision":1668,"ĠThere":1669,"ances":1670,"Ġquality":1671,"Ġavail":1672,"Ġspace":1673,"Ġsomething":1674,"Ġweb":1675,"Ġpatterns":1676,"Ġmot":1677,"oring":1678,"isf":1679,"Ġanother":1680,"Ġaccount":1681,"\\nW":1682,"uss":1683,"Ġmaj":1684,"uation":1685,"Ġsustain":1686,"Ġautom":1687,"iques":1688,"issions":1689,"verse":1690,"Ġconcept":1691,"Ġsecurity":1692,"Ġthose":1693,"Ġprofess":1694,"Ġshort":1695,"Ġnight":1696,"ength":1697,"apt":1698,"ex":1699,"ĠAdditionally":1700,"Ġtaking":1701,"Ġtoo":1702,"agn":1703,"Ġsimple":1704,"lusion":1705,"iency":1706,"ash":1707,"ours":1708,"Ġpa":1709,"Ġlit":1710,"ĠSp":1711,"iting":1712,"Ġdon":1713,"Ġlim":1714,"lish":1715,"mat":1716,"aves":1717,"ledge":1718,"ditional":1719,"inc":1720,"Ġevents":1721,"Ġoffer":1722,"thing":1723,"Ġworking":1724,"Ġanalysis":1725,"Ġachieve":1726,"Ġpie":1727,"Ġbook":1728,"Ġfre":1729,"Ġmuch":1730,"oon":1731,"Ġtry":1732,"esp":1733,"Ġwaste":1734,"face":1735,"Ġear":1736,"Ġfru":1737,"Ġtransportation":1738,"chool":1739,"Ġtechniques":1740,"Ġprogramm":1741,"ĠEarth":1742,"Ġpredict":1743,"Ġnever":1744,"ws":1745,"ument":1746,"imately":1747,"ared":1748,"Ġparticular":1749,"Ġtowards":1750,"Ġeconomic":1751,"Ġincreasing":1752,"Ġfast":1753,"iment":1754,"Ġnetwork":1755,"Ġcorrect":1756,"Ġmight":1757,"Ġoc":1758,"Ġbecause":1759,"ĠWh":1760,"az":1761,"play":1762,"Ġresults":1763,"Ġmanagement":1764,"Ġpurch":1765,"Ġsound":1766,"Ġpast":1767,"Ġtraining":1768,"____":1769,"ope":1770,"Ġengage":1771,"ourage":1772,"Ġsense":1773,"Ġfree":1774,"Ġpref":1775,"ees":1776,"Ġcountries":1777,"ney":1778,"anies":1779,"Ġafter":1780,"Ġmind":1781,"Ġexc":1782,"ĠOnce":1783,"ĠĠĠĠĠĠĠĠĠĠĠ":1784,"Ġcomplete":1785,"Ġimm":1786,"Ġest":1787,"Ġgenerate":1788,"verb":1789,"ĠDe":1790,"'m":1791,"Ġtools":1792,"redients":1793,"Ġmajor":1794,"ently":1795,"Ġcontribut":1796,"leep":1797,"Ġpoints":1798,"ditions":1799,"Ġfactors":1800,"Ġel":1801,"Ġnext":1802,"ium":1803,"oud":1804,"Ġcru":1805,"Ġreas":1806,"riate":1807,"ĠInd":1808,"Ġpromot":1809,"Ġhistory":1810,"Ġjour":1811,"Ġdue":1812,"Con":1813,"Ġveget":1814,"ency":1815,"ĠAmeric":1816,"Ġfra":1817,"Ġdifference":1818,"oard
":1819,"lex":1820,"Ġequation":1821,"irtual":1822,"Ġcup":1823,"Ġforest":1824,"Ġnegative":1825,"Ġsecon":1826,"ones":1827,"Ġnature":1828,"Ġuses":1829,"ah":1830,"por":1831,"Ġsec":1832,"ording":1833,"Ġlast":1834,"ĠSome":1835,"Ġissues":1836,"Ġscient":1837,"Ġprint":1838,"ĠStates":1839,"over":1840,"Ġsatisf":1841,"Ġdevices":1842,"Ġdise":1843,"Ġtemperature":1844,"Ġfeedback":1845,"Ġnecessary":1846,"Ġemissions":1847,"mb":1848,"Ġlow":1849,"for":1850,"tal":1851,"Ġchallenges":1852,"Ġarray":1853,"Ġside":1854,"Ġengine":1855,"Ġboo":1856,"ata":1857,"Ġbelie":1858,"-m":1859,"Ġmultiple":1860,"Ġsing":1861,"Ġgovernment":1862,"ames":1863,"ified":1864,"Ġminutes":1865,"Ġsuccessful":1866,"Ġmoney":1867,"Ġquickly":1868,"Ġbir":1869,"Ġtypically":1870,"Ġpost":1871,"Ġprep":1872,"Ġknowledge":1873,"pped":1874,"actions":1875,"Ġmethods":1876,"Ġoptim":1877,"\\nP":1878,"Ġoutput":1879,"Ġfield":1880,"Ġtable":1881,"Ġbal":1882,"Ġcoll":1883,"Ġcharacters":1884,"volution":1885,"ords":1886,"ilar":1887,"ification":1888,"ane":1889,"Ġcell":1890,"Ġmil":1891,"ĠWhat":1892,"Ġsqu":1893,"Ġlives":1894,"ĠAr":1895,"Ġphrase":1896,"Ġnut":1897,"Ġdigital":1898,"Ġinternet":1899,"lass":1900,"ura":1901,"ommend":1902,"Ġtreat":1903,"Ġapprop":1904,"resh":1905,"urther":1906,"ĠOne":1907,"Ġvisual":1908,"ategor":1909,"Ġapproach":1910,"Ġcertain":1911,"Ġsho":1912,"val":1913,"Ġtask":1914,"ires":1915,"Ġappropriate":1916,"Ġvie":1917,"Ġdesigned":1918,"pose":1919,"**:":1920,"fort":1921,"Ġ|\\":1922,"Ġapplications":1923,"Ġpay":1924,"Ġnow":1925,"Ġheat":1926,"Ġindustry":1927,"pre":1928,"Ġeffectively":1929,"Ġpopulation":1930,"Ġopportunities":1931," TransformerDecoder: +) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: """ Build the decoder associated with the Qwen2 model. This includes: - Token embeddings @@ -161,7 +161,7 @@ def lora_qwen2( lora_dropout: float = 0.0, # Quantization args quantize_base: bool = False, -) -> TransformerDecoder: +) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: """ Return a version of Qwen2 (an instance of :func:`~torchtune.models.qwen2.transformer.Qwen2TransformerDecoder`) with LoRA applied based on the passed in configuration. diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 4d6045c90e..05180d8e04 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -3,13 +3,11 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-from typing import List, Optional -from functools import partial +from typing import List, Optional, Union from torchtune.models.qwen2._component_builders import qwen2, lora_qwen2 from torchtune.models.qwen2._tokenizer import Qwen2Tokenizer - -from torchtune.modules import TransformerDecoder +from torchtune.modules import TransformerDecoder, TiedEmbeddingTransformerDecoder from torchtune.modules.peft import LORA_ATTN_MODULES from torchtune.modules.tokenizers import parse_hf_tokenizer_json @@ -20,13 +18,13 @@ """ -def qwen2_7b() -> TransformerDecoder: +def qwen2_7b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: """ Builder for creating a Qwen2 model initialized w/ the default 7B parameter values from https://huggingface.co/Qwen/Qwen2-7B-Instruct Returns: - TransformerDecoder: Instantiation of Qwen2 7B model + Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 7B model """ return qwen2( vocab_size=152064, @@ -43,15 +41,15 @@ def qwen2_7b() -> TransformerDecoder: def qwen2_tokenizer( - vocab_file: str, merges_file: str, special_tokens_path: Optional[str] = None, + path: str, merges_file: str = None, special_tokens_path: Optional[str] = None, **kwargs, ) -> Qwen2Tokenizer: """ Tokenizer for Qwen2. Args: - vocab_file (str): path to the vocab file. - merges_file (str): path to the merges file. + path (str): path to the vocab.json file. + merges_file (str): path to the merges.txt file. special_tokens_path (Optional[str]): Path to ``tokenizer.json`` from Hugging Face model files that contains all registered special tokens, or a local json file structured similarly. Default is None to use the canonical Qwen2 special tokens. @@ -60,7 +58,7 @@ def qwen2_tokenizer( Llama3Tokenizer: Instantiation of the Qwen2 tokenizer """ special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None - return Qwen2Tokenizer(vocab_file=vocab_file, merges_file=merges_file, special_tokens=special_tokens, **kwargs) + return Qwen2Tokenizer(path=path, merges_file=merges_file, special_tokens=special_tokens, **kwargs) def lora_qwen2_7b( @@ -71,7 +69,7 @@ def lora_qwen2_7b( lora_alpha: float = 16, lora_dropout: float = 0.05, quantize_base: bool = False, -) -> TransformerDecoder: +) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: """ Builder for creating a Qwen2 7B model with LoRA enabled. @@ -92,7 +90,7 @@ def lora_qwen2_7b( quantize_base (bool): Whether to quantize base model weights Returns: - TransformerDecoder: Instantiation of Qwen2 7B model with LoRA applied + Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 7B model with LoRA applied """ return lora_qwen2( lora_attn_modules=lora_attn_modules, @@ -113,12 +111,3 @@ def lora_qwen2_7b( lora_dropout=lora_dropout, quantize_base=quantize_base, ) - - -qlora_qwen2_7b = partial(lora_qwen2_7b, quantize_base=True) - -qlora_qwen2_7b.__doc__ = """ -Builder for creating a Qwen2 7B model with QLoRA enabled. Base model weights in linear layers -that LoRA is applied to are quantized per the QLoRA paper: https://arxiv.org/abs/2305.14314. -Please see `lora_qwen2_7b` for full API arguments. -""" diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py index 35be6312b1..0f01e81b4b 100644 --- a/torchtune/models/qwen2/_tokenizer.py +++ b/torchtune/models/qwen2/_tokenizer.py @@ -4,6 +4,8 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
 import json
+import os.path
+
 import unicodedata
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple
@@ -72,9 +74,9 @@ def get_pairs(word):
 class Qwen2Tokenizer(ModelTokenizer):
-    """This class construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library).
+    """This class constructs a Qwen2 tokenizer, based on GPT-2 byte-level BPE tokenization.
-    See .
+    See .
     Args:
         vocab_file (str): Path to vocab.json file.
@@ -94,18 +96,20 @@ class Qwen2Tokenizer(ModelTokenizer):
     def __init__(
         self,
-        vocab_file: str,
-        merges_file: str,
-        *,
+        path: str,
+        merges_file: str = None,
         special_tokens: Optional[Dict[str, int]] = None,
+        *,
         errors: str = "replace",
         unk_token: Optional[str] = ENDOFTEXT,
         bos_token: Optional[str] = None,
         eos_token: str = ENDOFTEXT,
         pad_token: Optional[str] = ENDOFTEXT,
     ):
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
+        with open(path, encoding="utf-8") as vocab_handle:
             self.encoder = json.load(vocab_handle)
+        if merges_file is None:
+            merges_file = os.path.join(os.path.dirname(path), "merges.txt")
         self.decoder = {v: k for k, v in self.encoder.items()}
         self.errors = errors  # how to handle errors in decoding

diff --git a/torchtune/modules/__init__.py b/torchtune/modules/__init__.py
index 1798f58b4a..5c3b39d228 100644
--- a/torchtune/modules/__init__.py
+++ b/torchtune/modules/__init__.py
@@ -12,7 +12,11 @@
 from .lr_schedulers import get_cosine_schedule_with_warmup  # noqa
 from .position_embeddings import RotaryPositionalEmbeddings  # noqa
 from .rms_norm import RMSNorm  # noqa
-from .transformer import TransformerDecoder, TransformerDecoderLayer  # noqa
+from .transformer import (  # noqa
+    TiedEmbeddingTransformerDecoder,
+    TransformerDecoder,
+    TransformerDecoderLayer,
+)
 from .vision_transformer import VisionTransformer
 __all__ = [
@@ -25,6 +29,7 @@
     "Fp32LayerNorm",
     "VisionTransformer",
     "TransformerDecoder",
+    "TiedEmbeddingTransformerDecoder",
     "TransformerDecoderLayer",
     "reparametrize_as_dtype_state_dict_post_hook",
 ]
diff --git a/torchtune/modules/transformer.py b/torchtune/modules/transformer.py
index 28fd6f9481..957fb6ac15 100644
--- a/torchtune/modules/transformer.py
+++ b/torchtune/modules/transformer.py
@@ -248,7 +248,7 @@ def forward(
         return output
-class TiedEmbeddingTransformerDecoder(TransformerDecoder):
+class TiedEmbeddingTransformerDecoder(nn.Module):
     """
     Transformer Decoder with tied embedding weight.
     A key difference between this class and :class:`~torchtune.modules.TransformerDecoder`
@@ -285,17 +285,48 @@ def __init__(
         head_dim: int,
         norm: nn.Module,
     ) -> None:
-        super().__init__(
-            tok_embeddings=tok_embeddings,
-            layer=layer,
-            num_layers=num_layers,
-            max_seq_len=max_seq_len,
-            num_heads=num_heads,
-            head_dim=head_dim,
-            norm=norm,
-            output=None,
+        super().__init__()
+
+        self.tok_embeddings = tok_embeddings
+        self.layers = _get_clones(layer, num_layers)
+        self.norm = norm
+        self.max_seq_len = max_seq_len
+        self.num_heads = num_heads
+        self.head_dim = head_dim
+        self.causal_mask = None
+
+    def setup_caches(self, batch_size: int, dtype: torch.dtype) -> None:
+        """Setup key value caches for attention calculation.
+
+        Args:
+            batch_size (int): batch size for the caches.
+            dtype (torch.dtype): dtype for the caches.
+        """
+        for layer in self.layers:
+            layer.attn.kv_cache = KVCache(
+                batch_size=batch_size,
+                max_seq_len=self.max_seq_len,
+                num_heads=self.num_heads,
+                head_dim=self.head_dim,
+                dtype=dtype,
+            )
+
+        # causal_mask is used during inference to ensure we're attending
+        # to the right tokens
+        self.causal_mask = torch.tril(
+            torch.ones(self.max_seq_len, self.max_seq_len, dtype=torch.bool)
         )
+    def reset_caches(self):
+        """Reset the key value caches."""
+        if self.layers[0].attn.kv_cache is None:
+            raise RuntimeError(
+                "Key value caches are not setup. Call ``setup_caches()`` first."
+            )
+
+        for layer in self.layers:
+            layer.attn.kv_cache.reset()
+
     def forward(
         self,
         tokens: Tensor,

From 01edc3adbbe632ec7043f1e734729be2cfb1c1c0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?=
Date: Thu, 25 Jul 2024 18:07:54 +0800
Subject: [PATCH 08/14] Update Qwen2Tokenizer according to PR review comments.

---
 torchtune/models/qwen2/_tokenizer.py | 43 ++++++++++++++++------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/torchtune/models/qwen2/_tokenizer.py b/torchtune/models/qwen2/_tokenizer.py
index 0f01e81b4b..4f7a12dd09 100644
--- a/torchtune/models/qwen2/_tokenizer.py
+++ b/torchtune/models/qwen2/_tokenizer.py
@@ -4,8 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 import json
-import os.path
-
 import unicodedata
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple
@@ -31,6 +29,8 @@
 IM_START = "<|im_start|>"
 IM_END = "<|im_end|>"
+DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE = 151646
+
 @lru_cache()
 def bytes_to_unicode():
@@ -79,14 +79,28 @@ class Qwen2Tokenizer(ModelTokenizer):
     See .
     Args:
-        vocab_file (str): Path to vocab.json file.
+        path (str): Path to vocab.json file.
         merges_file (str): Path to merges.txt file.
+            merges.txt contains all BPE merge operations, and this file is required to split a single word into
+            byte-level BPE tokens.
+        errors (str): Paradigm to follow when decoding bytes to UTF-8. Defaults to "replace".
+            See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (Optional[str]): The unknown token. A token that is not in the vocabulary cannot be converted
+            to an ID and is set to be this token instead. Defaults to "<|endoftext|>".
+        bos_token (Optional[str]): The beginning of sequence token. Defaults to None.
+        eos_token (str): The end of sequence token. Defaults to "<|endoftext|>".
+        pad_token (Optional[str]): The token used for padding. Defaults to "<|endoftext|>".
+        bpe_cache_size (int): BPE token cache size in Qwen2Tokenizer.
+            NOTE: a large cache size will speed up tokenization, but the cache object will get really
+            large for long-running processes (esp. for texts in languages that do not use spaces between
+            words, e.g. Chinese); technically not a memory leak, but it appears as one.
+            By default, the cache size is set equal to the size of the official Qwen2 tokenizer.
Example: - >>> tokenizer = Qwen2Tokenizer(vocab_file="/path/to/vocab.json", merges_file="/path/to/merges.txt") + >>> tokenizer = Qwen2Tokenizer(path="/path/to/vocab.json", merges_file="/path/to/merges.txt") >>> tokenized_text = tokenizer.encode("Hello world!") >>> print(tokenized_text) - [] + [39, 385, 78, 675, 0, 2000] """ system = f"{IM_START}system\n{{content}}{IM_END}\n" @@ -97,7 +111,7 @@ class Qwen2Tokenizer(ModelTokenizer): def __init__( self, path: str, - merges_file: str = None, + merges_file: str, special_tokens: Optional[Dict[str, int]] = None, *, errors: str = "replace", @@ -105,11 +119,10 @@ def __init__( bos_token: Optional[str] = None, eos_token: str = ENDOFTEXT, pad_token: Optional[str] = ENDOFTEXT, + bpe_cache_size: int = DEFAULT_QWEN2_TOKENIZER_BPE_CACHE_SIZE, ): with open(path, encoding="utf-8") as vocab_handle: self.encoder = json.load(vocab_handle) - if merges_file is None: - merges_file = os.path.join(os.path.dirname(path), "merges.txt") self.decoder = {v: k for k, v in self.encoder.items()} self.errors = errors # how to handle errors in decoding @@ -123,11 +136,8 @@ def __init__( continue bpe_merges.append(tuple(line.split())) self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) - # NOTE: the cache can grow without bound and will get really large for long running processes - # (esp. for texts of language that do not use space between word, e.g. Chinese); technically - # not a memory leak but appears as one. - # GPT2Tokenizer has the same problem, so let's be consistent. - self.cache = {} + + self._bpe = lru_cache(maxsize=bpe_cache_size)(self._bpe_without_cache) self.pat = re.compile(PRETOKENIZE_REGEX) @@ -144,14 +154,12 @@ def __init__( self.im_end_id = self.special_tokens[IM_END] self.stop_tokens = [self.eos_id, self.im_end_id] - # Tokens trie for special tokens. + # Pattern for special tokens. self._pattern_split_special_tokens = re.compile( r"(\L)", options=self.special_tokens.keys() ) - def _bpe(self, token): - if token in self.cache: - return self.cache[token] + def _bpe_without_cache(self, token): word = tuple(token) pairs = get_pairs(word) @@ -188,7 +196,6 @@ def _bpe(self, token): else: pairs = get_pairs(word) word = " ".join(word) - self.cache[token] = word return word def _tokenize(self, text): From d0671eeacd347c7d1465a545de2f2526d3fbf88e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Mon, 29 Jul 2024 16:06:26 +0800 Subject: [PATCH 09/14] Update _recipe_registry.py. 
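Registering the configs makes them addressable by name through the tune CLI.
For example, the single-device full-finetune entry added here should be runnable
with the same command pattern documented in the config headers (illustrative
invocation, not a new interface):

    tune run full_finetune_single_device --config qwen2/7B_full_low_memory
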
--- torchtune/_recipe_registry.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 9072242186..9f40bf6802 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -51,6 +51,10 @@ class Recipe: name="phi3/mini_full_low_memory", file_path="phi3/mini_full_low_memory.yaml", ), + Config( + name="qwen2/7B_full_low_memory", + file_path="qwen2/7B_full_low_memory.yaml", + ), ], supports_distributed=False, ), @@ -68,6 +72,7 @@ class Recipe: Config(name="gemma/2B_full", file_path="gemma/2B_full.yaml"), Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), Config(name="phi3/mini_full", file_path="phi3/mini_full.yaml"), + Config(name="qwen2/7B_full", file_path="qwen2/7B_full.yaml"), ], supports_distributed=True, ), @@ -143,6 +148,10 @@ class Recipe: name="phi3/mini_qlora_single_device", file_path="phi3/mini_qlora_single_device.yaml", ), + Config( + name="qwen2/7B_lora_single_device", + file_path="qwen2/7B_lora_single_device.yaml", + ), ], supports_distributed=False, ), @@ -183,6 +192,7 @@ class Recipe: Config(name="gemma/2B_lora", file_path="gemma/2B_lora.yaml"), Config(name="gemma/7B_lora", file_path="gemma/7B_lora.yaml"), Config(name="phi3/mini_lora", file_path="phi3/mini_lora.yaml"), + Config(name="qwen2/7B_lora", file_path="qwen2/7B_lora.yaml"), ], supports_distributed=True, ), From 84c26e5602c0b43bfb9e803d01f47b7b86907241 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 30 Jul 2024 03:28:44 +0800 Subject: [PATCH 10/14] Add Qwen2-0.5B and Qwen2-1.5B. Fix some bugs in weight converters. --- recipes/configs/qwen2/0.5B_full.yaml | 75 +++++++++ .../configs/qwen2/0.5B_full_low_memory.yaml | 77 +++++++++ recipes/configs/qwen2/0.5B_lora.yaml | 108 +++++++++++++ .../qwen2/0.5B_lora_single_device.yaml | 106 ++++++++++++ recipes/configs/qwen2/1.5B_full.yaml | 75 +++++++++ .../configs/qwen2/1.5B_full_low_memory.yaml | 77 +++++++++ recipes/configs/qwen2/1.5B_lora.yaml | 108 +++++++++++++ .../qwen2/1.5B_lora_single_device.yaml | 106 ++++++++++++ recipes/configs/qwen2/7B_full.yaml | 10 +- recipes/configs/qwen2/7B_full_low_memory.yaml | 8 +- recipes/configs/qwen2/7B_lora.yaml | 2 +- .../configs/qwen2/7B_lora_single_device.yaml | 2 +- torchtune/models/qwen2/__init__.py | 14 +- torchtune/models/qwen2/_convert_weights.py | 4 +- torchtune/models/qwen2/_model_builders.py | 152 ++++++++++++++++++ torchtune/modules/transformer.py | 6 +- 16 files changed, 914 insertions(+), 16 deletions(-) create mode 100644 recipes/configs/qwen2/0.5B_full.yaml create mode 100644 recipes/configs/qwen2/0.5B_full_low_memory.yaml create mode 100644 recipes/configs/qwen2/0.5B_lora.yaml create mode 100644 recipes/configs/qwen2/0.5B_lora_single_device.yaml create mode 100644 recipes/configs/qwen2/1.5B_full.yaml create mode 100644 recipes/configs/qwen2/1.5B_full_low_memory.yaml create mode 100644 recipes/configs/qwen2/1.5B_lora.yaml create mode 100644 recipes/configs/qwen2/1.5B_lora_single_device.yaml diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml new file mode 100644 index 0000000000..8e31e5cc5e --- /dev/null +++ b/recipes/configs/qwen2/0.5B_full.yaml @@ -0,0 +1,75 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns "" +# +# 
To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/0.5B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/0.5B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. It's +# best to use 0.5B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2.qwen2_0_5b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-0.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-0.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True +memory_efficient_fsdp_wrap: False + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2-0.5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2/0.5B_full_low_memory.yaml b/recipes/configs/qwen2/0.5B_full_low_memory.yaml new file mode 100644 index 0000000000..9b0739d57d --- /dev/null +++ b/recipes/configs/qwen2/0.5B_full_low_memory.yaml @@ -0,0 +1,77 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2 0.5B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns "" +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2/0.5B_full_low_memory +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2/0.5B_full_low_memory checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2.qwen2_0_5b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-0.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-0.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 5e-6 +optimizer_in_bwd: True +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2-0.5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2/0.5B_lora.yaml b/recipes/configs/qwen2/0.5B_lora.yaml new file mode 100644 index 0000000000..f887402fba --- /dev/null +++ b/recipes/configs/qwen2/0.5B_lora.yaml @@ -0,0 +1,108 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns "" +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/0.5B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/0.5B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 0.5B_lora_single_device.yaml +# or 0.5B_qlora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2.lora_qwen2_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-0.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.utils.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2/0.5B_lora_single_device.yaml b/recipes/configs/qwen2/0.5B_lora_single_device.yaml new file mode 100644 index 0000000000..9425d1bf70 --- /dev/null +++ b/recipes/configs/qwen2/0.5B_lora_single_device.yaml @@ -0,0 +1,106 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns "" +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2/0.5B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2/0.5B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2.lora_qwen2_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-0.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.utils.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml new file mode 100644 index 0000000000..5f42c434be --- /dev/null +++ b/recipes/configs/qwen2/1.5B_full.yaml @@ -0,0 +1,75 @@ +# Config for multi-device full finetuning in full_finetune_distributed.py +# using a Qwen2 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns "" +# +# To launch on 4 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/1.5B_full +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/1.5B_full checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. +# Single device full finetuning requires more memory optimizations. 
It's +# best to use 1.5B_full.yaml for those cases + +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-1.5B-Instruct/merges.txt + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2.qwen2_1_5b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-1.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-1.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: torch.optim.AdamW + lr: 5e-6 +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 + + +# Training env +device: cuda + +# Memory management +enable_activation_checkpointing: True +memory_efficient_fsdp_wrap: False + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2-1.5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2/1.5B_full_low_memory.yaml b/recipes/configs/qwen2/1.5B_full_low_memory.yaml new file mode 100644 index 0000000000..a8fd88e086 --- /dev/null +++ b/recipes/configs/qwen2/1.5B_full_low_memory.yaml @@ -0,0 +1,77 @@ +# Config for single device full finetuning in full_finetune_single_device.py +# using a Qwen2 1.5B +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns "" +# +# The default config uses an optimizer from bitsandbytes. If you do not have it installed, +# you can install it with +# pip install bitsandbytes +# +# To launch on a single device, run the following command from root: +# tune run full_finetune_single_device --config qwen2/1.5B_full_low_memory +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run full_finetune_single_device --config qwen2/1.5B_full_low_memory checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ +# Tokenizer +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-1.5B-Instruct/merges.txt + +# Dataset +dataset: + _component_: torchtune.datasets.alpaca_dataset +seed: null +shuffle: True + +# Model Arguments +model: + _component_: torchtune.models.qwen2.qwen2_1_5b + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-1.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-1.5B-Instruct-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Fine-tuning arguments +batch_size: 2 +epochs: 1 +optimizer: + _component_: bitsandbytes.optim.PagedAdamW + lr: 5e-6 +optimizer_in_bwd: True +loss: + _component_: torch.nn.CrossEntropyLoss +max_steps_per_epoch: null +gradient_accumulation_steps: 16 +compile: False + +# Training environment +device: cuda + +# Memory management +enable_activation_checkpointing: True + +# Reduced precision +dtype: bf16 + +# Logging +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +output_dir: /tmp/Qwen2-1.5B-Instruct-finetune +log_every_n_steps: 1 +log_peak_memory_stats: False diff --git a/recipes/configs/qwen2/1.5B_lora.yaml b/recipes/configs/qwen2/1.5B_lora.yaml new file mode 100644 index 0000000000..f887402fba --- /dev/null +++ b/recipes/configs/qwen2/1.5B_lora.yaml @@ -0,0 +1,108 @@ +# Config for multi-device LoRA finetuning in lora_finetune_distributed.py +# using a Qwen2 0.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-0.5B-Instruct --output-dir /tmp/Qwen2-0.5B-Instruct --ignore-patterns "" +# +# To launch on 2 devices, run the following command from root: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/0.5B_lora +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/0.5B_lora checkpointer.checkpoint_dir= +# +# This config works best when the model is being fine-tuned on 2+ GPUs. 
+# For single device LoRA finetuning please use 0.5B_lora_single_device.yaml +# or 0.5B_qlora_single_device.yaml + + +# Model Arguments +model: + _component_: torchtune.models.qwen2.lora_qwen2_0_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-0.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-0.5B-Instruct/merges.txt + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-0.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 32 + +# Logging +output_dir: /tmp/Qwen2-0.5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: False + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.utils.setup_torch_profiler + + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2/1.5B_lora_single_device.yaml b/recipes/configs/qwen2/1.5B_lora_single_device.yaml new file mode 100644 index 0000000000..8d26f4b1bb --- /dev/null +++ b/recipes/configs/qwen2/1.5B_lora_single_device.yaml @@ -0,0 +1,106 @@ +# Config for single device LoRA finetuning in lora_finetune_single_device.py +# using a Qwen2 1.5B model +# +# This config assumes that you've run the following command before launching +# this run: +# tune download Qwen/Qwen2-1.5B-Instruct --output-dir /tmp/Qwen2-1.5B-Instruct --ignore-patterns "" +# +# To launch on a single device, run the following command from root: +# tune run lora_finetune_single_device --config qwen2/1.5B_lora_single_device +# +# You can add specific overrides through the command line. For example +# to override the checkpointer directory while launching training +# you can run: +# tune run lora_finetune_single_device --config qwen2/1.5B_lora_single_device checkpointer.checkpoint_dir= +# +# This config works only for training on single device. 
+ + +# Model Arguments +model: + _component_: torchtune.models.qwen2.lora_qwen2_1_5b + lora_attn_modules: ['q_proj', 'v_proj'] + apply_lora_to_mlp: False + apply_lora_to_output: False + lora_rank: 8 + lora_alpha: 16 + +tokenizer: + _component_: torchtune.models.qwen2.qwen2_tokenizer + path: /tmp/Qwen2-1.5B-Instruct/vocab.json + merges_file: /tmp/Qwen2-1.5B-Instruct/merges.txt + +checkpointer: + _component_: torchtune.utils.FullModelHFCheckpointer + checkpoint_dir: /tmp/Qwen2-1.5B-Instruct + checkpoint_files: [ + model.safetensors + ] + recipe_checkpoint: null + output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune + model_type: QWEN2 +resume_from_checkpoint: False + +# Dataset and Sampler +dataset: + _component_: torchtune.datasets.alpaca_cleaned_dataset +seed: null +shuffle: True +batch_size: 2 + +# Optimizer and Scheduler +optimizer: + _component_: torch.optim.AdamW + weight_decay: 0.01 + lr: 3e-4 +lr_scheduler: + _component_: torchtune.modules.get_cosine_schedule_with_warmup + num_warmup_steps: 100 + +loss: + _component_: torch.nn.CrossEntropyLoss + +# Training +epochs: 1 +max_steps_per_epoch: null +gradient_accumulation_steps: 64 +compile: False + +# Logging +output_dir: /tmp/Qwen2-1.5B-Instruct-lora-finetune +metric_logger: + _component_: torchtune.utils.metric_logging.DiskLogger + log_dir: ${output_dir} +log_every_n_steps: 1 +log_peak_memory_stats: False + +# Environment +device: cuda +dtype: bf16 +enable_activation_checkpointing: True + +# Show case the usage of pytorch profiler +# Set enabled to False as it's only needed for debugging training +profiler: + _component_: torchtune.utils.setup_torch_profiler + enabled: False + + #Output directory of trace artifacts + output_dir: ${output_dir}/profiling_outputs + + #`torch.profiler.ProfilerActivity` types to trace + cpu: True + cuda: True + + #trace options passed to `torch.profiler.profile` + profile_memory: False + with_stack: False + record_shapes: True + with_flops: False + + # `torch.profiler.schedule` options: + # wait_steps -> wait, warmup_steps -> warmup, active_steps -> active, num_cycles -> repeat + wait_steps: 5 + warmup_steps: 5 + active_steps: 2 + num_cycles: 1 diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml index b71e7ec321..47eccf3636 100644 --- a/recipes/configs/qwen2/7B_full.yaml +++ b/recipes/configs/qwen2/7B_full.yaml @@ -3,7 +3,7 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct --ignore-patterns "" # # To launch on 4 devices, run the following command from root: # tune run --nnodes 1 --nproc_per_node 4 full_finetune_distributed --config qwen2/7B_full @@ -15,7 +15,7 @@ # # This config works best when the model is being fine-tuned on 2+ GPUs. # Single device full finetuning requires more memory optimizations. 
It's -# best to use 7B_full_single_device.yaml for those cases +# best to use 7B_full.yaml for those cases # Tokenizer tokenizer: @@ -49,14 +49,14 @@ resume_from_checkpoint: False # Fine-tuning arguments batch_size: 2 -epochs: 3 +epochs: 1 optimizer: _component_: torch.optim.AdamW - lr: 2e-5 + lr: 5e-6 loss: _component_: torch.nn.CrossEntropyLoss max_steps_per_epoch: null -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 16 # Training env diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_low_memory.yaml index a0ff849098..2c1ad3e3e7 100644 --- a/recipes/configs/qwen2/7B_full_low_memory.yaml +++ b/recipes/configs/qwen2/7B_full_low_memory.yaml @@ -3,7 +3,7 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct --ignore-patterns "" # # The default config uses an optimizer from bitsandbytes. If you do not have it installed, # you can install it with @@ -51,15 +51,15 @@ resume_from_checkpoint: False # Fine-tuning arguments batch_size: 2 -epochs: 3 +epochs: 1 optimizer: _component_: bitsandbytes.optim.PagedAdamW - lr: 2e-5 + lr: 5e-6 optimizer_in_bwd: True loss: _component_: torch.nn.CrossEntropyLoss max_steps_per_epoch: null -gradient_accumulation_steps: 1 +gradient_accumulation_steps: 16 compile: False # Training environment diff --git a/recipes/configs/qwen2/7B_lora.yaml b/recipes/configs/qwen2/7B_lora.yaml index 0b529853a0..e5bd742389 100644 --- a/recipes/configs/qwen2/7B_lora.yaml +++ b/recipes/configs/qwen2/7B_lora.yaml @@ -3,7 +3,7 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct --ignore-patterns "" # # To launch on 2 devices, run the following command from root: # tune run --nnodes 1 --nproc_per_node 2 lora_finetune_distributed --config qwen2/7B_lora diff --git a/recipes/configs/qwen2/7B_lora_single_device.yaml b/recipes/configs/qwen2/7B_lora_single_device.yaml index 5f34420dae..cad9182d9f 100644 --- a/recipes/configs/qwen2/7B_lora_single_device.yaml +++ b/recipes/configs/qwen2/7B_lora_single_device.yaml @@ -3,7 +3,7 @@ # # This config assumes that you've run the following command before launching # this run: -# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct +# tune download Qwen/Qwen2-7B-Instruct --output-dir /tmp/Qwen2-7B-Instruct --ignore-patterns "" # # To launch on a single device, run the following command from root: # tune run lora_finetune_single_device --config qwen2/7B_lora_single_device diff --git a/torchtune/models/qwen2/__init__.py b/torchtune/models/qwen2/__init__.py index 3791bbbaef..5fc94bf438 100644 --- a/torchtune/models/qwen2/__init__.py +++ b/torchtune/models/qwen2/__init__.py @@ -6,13 +6,25 @@ from ._component_builders import lora_qwen2, qwen2 # noqa from ._convert_weights import qwen2_hf_to_tune, qwen2_tune_to_hf # noqa -from ._model_builders import lora_qwen2_7b, qwen2_7b, qwen2_tokenizer # noqa +from ._model_builders import ( + lora_qwen2_0_5b, + lora_qwen2_1_5b, + lora_qwen2_7b, + qwen2_0_5b, + qwen2_1_5b, + qwen2_7b, + qwen2_tokenizer, +) from ._positional_embeddings import Qwen2RotaryPositionalEmbeddings __all__ = [ "qwen2_7b", + "qwen2_0_5b", + "qwen2_1_5b", "qwen2_tokenizer", "lora_qwen2_7b", + 
"lora_qwen2_0_5b", + "lora_qwen2_1_5b", "qwen2", "lora_qwen2", "qwen2_hf_to_tune", diff --git a/torchtune/models/qwen2/_convert_weights.py b/torchtune/models/qwen2/_convert_weights.py index 6ef0455ea1..d23f6745cd 100644 --- a/torchtune/models/qwen2/_convert_weights.py +++ b/torchtune/models/qwen2/_convert_weights.py @@ -68,7 +68,7 @@ def qwen2_hf_to_tune( for key, value in state_dict.items(): if ( - tie_word_embeddings and QWEN2_TIED_KEY not in key + tie_word_embeddings and QWEN2_TIED_KEY in key ): # Skip loading the output projection weights continue if "rotary_emb.inv_freq" in key: # Skip loading the position embeddings @@ -112,8 +112,6 @@ def qwen2_tune_to_hf( for key, value in state_dict.items(): new_key = get_mapped_key(key, inverted_mapping_dict) - if "tok_embeddings" in key and tie_word_embeddings: - converted_state_dict[QWEN2_TIED_KEY] = value converted_state_dict[new_key] = value return converted_state_dict diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 05180d8e04..8dc34e5359 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -40,6 +40,52 @@ def qwen2_7b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: ) +def qwen2_0_5b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + """ + Builder for creating a Qwen2 model initialized w/ the default 0.5B parameter values + from https://huggingface.co/Qwen/Qwen2-0.5B-Instruct + + Returns: + Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 0.5B model + """ + return qwen2( + vocab_size=151936, + num_layers=24, + num_heads=14, + num_kv_heads=2, + embed_dim=896, + intermediate_dim=4864, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + +def qwen2_1_5b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + """ + Builder for creating a Qwen2 model initialized w/ the default 1.5B parameter values + from https://huggingface.co/Qwen/Qwen2-1.5B-Instruct + + Returns: + Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 1.5B model + """ + return qwen2( + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-06, + rope_base=1000000.0, + tie_word_embeddings=True, + ) + + def qwen2_tokenizer( path: str, merges_file: str = None, special_tokens_path: Optional[str] = None, **kwargs, @@ -111,3 +157,109 @@ def lora_qwen2_7b( lora_dropout=lora_dropout, quantize_base=quantize_base, ) + + +def lora_qwen2_0_5b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.05, + quantize_base: bool = False, +) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + """ + Builder for creating a Qwen2 0.5B model with LoRA enabled. + + The Qwen2 defaults are the same as in :func:`~torchtune.models.qwen2.qwen2_0_5b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. 
+ apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + quantize_base (bool): Whether to quantize base model weights + + Returns: + Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 0.5B model with LoRA applied + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=24, + num_heads=14, + num_kv_heads=2, + embed_dim=896, + intermediate_dim=4864, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + quantize_base=quantize_base, + ) + + +def lora_qwen2_1_5b( + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.05, + quantize_base: bool = False, +) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + """ + Builder for creating a Qwen2 1.5B model with LoRA enabled. + + The Qwen2 defaults are the same as in :func:`~torchtune.models.qwen2.qwen2_1_5b`, + while LoRA default params are based on + https://github.com/tloen/alpaca-lora/blob/8bb8579e403dc78e37fe81ffbb253c413007323f/finetune.py#L41-L43. + + Args: + lora_attn_modules (List[LORA_ATTN_MODULES]): list of which linear layers + LoRA should be applied to in each self-attention block. Options are + ``{"q_proj", "k_proj", "v_proj", "output_proj"}``. + apply_lora_to_mlp (bool): whether to apply LoRA to the MLP in each transformer layer. + Default: False + apply_lora_to_output (bool): whether to apply LoRA to the model's final output projection. + Default: False + lora_rank (int): rank of each low-rank approximation + lora_alpha (float): scaling factor for the low-rank approximation + quantize_base (bool): Whether to quantize base model weights + + Returns: + Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 1.5B model with LoRA applied + """ + return lora_qwen2( + lora_attn_modules=lora_attn_modules, + apply_lora_to_mlp=apply_lora_to_mlp, + apply_lora_to_output=apply_lora_to_output, + vocab_size=151936, + num_layers=28, + num_heads=12, + num_kv_heads=2, + embed_dim=1536, + intermediate_dim=8960, + max_seq_len=32768, + attn_dropout=0.0, + norm_eps=1e-6, + rope_base=1000000.0, + tie_word_embeddings=True, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + quantize_base=quantize_base, + ) diff --git a/torchtune/modules/transformer.py b/torchtune/modules/transformer.py index 392ed9f4b7..5f3805a9cd 100644 --- a/torchtune/modules/transformer.py +++ b/torchtune/modules/transformer.py @@ -318,9 +318,13 @@ def setup_caches(self, batch_size: int, dtype: torch.dtype) -> None: torch.ones(self.max_seq_len, self.max_seq_len, dtype=torch.bool) ) + def caches_are_enabled(self) -> bool: + """Check if the key value caches are setup.""" + return self.layers[0].attn.kv_cache is not None + def reset_caches(self): """Reset the key value caches.""" - if self.layers[0].attn.kv_cache is None: + if not self.caches_are_enabled(): raise RuntimeError( "Key value caches are not setup. 
Call ``setup_caches()`` first." ) From a610723a0ccc2c34fdb2b732722477c51a048076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Wed, 31 Jul 2024 05:13:14 +0800 Subject: [PATCH 11/14] Update _recipe_registry.py. --- torchtune/_recipe_registry.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index 9f40bf6802..97db9af7f2 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -55,6 +55,14 @@ class Recipe: name="qwen2/7B_full_low_memory", file_path="qwen2/7B_full_low_memory.yaml", ), + Config( + name="qwen2/0.5B_full_low_memory", + file_path="qwen2/0.5B_full_low_memory.yaml", + ), + Config( + name="qwen2/1.5B_full_low_memory", + file_path="qwen2/1.5B_full_low_memory.yaml", + ), ], supports_distributed=False, ), @@ -73,6 +81,8 @@ class Recipe: Config(name="gemma/7B_full", file_path="gemma/7B_full.yaml"), Config(name="phi3/mini_full", file_path="phi3/mini_full.yaml"), Config(name="qwen2/7B_full", file_path="qwen2/7B_full.yaml"), + Config(name="qwen2/0.5B_full", file_path="qwen2/0.5B_full.yaml"), + Config(name="qwen2/1.5B_full", file_path="qwen2/1.5B_full.yaml"), ], supports_distributed=True, ), @@ -152,6 +162,14 @@ class Recipe: name="qwen2/7B_lora_single_device", file_path="qwen2/7B_lora_single_device.yaml", ), + Config( + name="qwen2/0.5B_lora_single_device", + file_path="qwen2/0.5B_lora_single_device.yaml", + ), + Config( + name="qwen2/1.5B_lora_single_device", + file_path="qwen2/1.5B_lora_single_device.yaml", + ), ], supports_distributed=False, ), @@ -193,6 +211,8 @@ class Recipe: Config(name="gemma/7B_lora", file_path="gemma/7B_lora.yaml"), Config(name="phi3/mini_lora", file_path="phi3/mini_lora.yaml"), Config(name="qwen2/7B_lora", file_path="qwen2/7B_lora.yaml"), + Config(name="qwen2/0.5B_lora", file_path="qwen2/0.5B_lora.yaml"), + Config(name="qwen2/1.5B_lora", file_path="qwen2/1.5B_lora.yaml"), ], supports_distributed=True, ), From c59269ae2f437254df08c988ca77c6a73a55deea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Wed, 31 Jul 2024 05:20:54 +0800 Subject: [PATCH 12/14] Fix in recipe configs. 
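A side note on the kv-cache helpers added in the transformer.py hunk earlier in this series: caches_are_enabled() simply centralizes the check that reset_caches() used to do inline. Below is a hypothetical usage sketch, with the Qwen2 builder from this series standing in for any torchtune decoder; it is not taken from a recipe.

    import torch

    from torchtune.models.qwen2 import qwen2_0_5b

    # Build a decoder and allocate kv-caches for incremental decoding.
    model = qwen2_0_5b()
    model.setup_caches(batch_size=1, dtype=torch.bfloat16)

    if model.caches_are_enabled():
        # ... run token-by-token decoding here ...
        model.reset_caches()  # raises RuntimeError only if setup_caches() was never called
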
--- recipes/configs/qwen2/0.5B_full.yaml | 2 +- recipes/configs/qwen2/0.5B_full_low_memory.yaml | 2 +- recipes/configs/qwen2/1.5B_full.yaml | 2 +- recipes/configs/qwen2/1.5B_full_low_memory.yaml | 2 +- recipes/configs/qwen2/7B_full.yaml | 2 +- recipes/configs/qwen2/7B_full_low_memory.yaml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/recipes/configs/qwen2/0.5B_full.yaml b/recipes/configs/qwen2/0.5B_full.yaml index 8e31e5cc5e..5be1a5e363 100644 --- a/recipes/configs/qwen2/0.5B_full.yaml +++ b/recipes/configs/qwen2/0.5B_full.yaml @@ -25,7 +25,7 @@ tokenizer: # Dataset dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True diff --git a/recipes/configs/qwen2/0.5B_full_low_memory.yaml b/recipes/configs/qwen2/0.5B_full_low_memory.yaml index 9b0739d57d..6e4541340e 100644 --- a/recipes/configs/qwen2/0.5B_full_low_memory.yaml +++ b/recipes/configs/qwen2/0.5B_full_low_memory.yaml @@ -27,7 +27,7 @@ tokenizer: # Dataset dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True diff --git a/recipes/configs/qwen2/1.5B_full.yaml b/recipes/configs/qwen2/1.5B_full.yaml index 5f42c434be..6037a03a95 100644 --- a/recipes/configs/qwen2/1.5B_full.yaml +++ b/recipes/configs/qwen2/1.5B_full.yaml @@ -25,7 +25,7 @@ tokenizer: # Dataset dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True diff --git a/recipes/configs/qwen2/1.5B_full_low_memory.yaml b/recipes/configs/qwen2/1.5B_full_low_memory.yaml index a8fd88e086..78784a02ad 100644 --- a/recipes/configs/qwen2/1.5B_full_low_memory.yaml +++ b/recipes/configs/qwen2/1.5B_full_low_memory.yaml @@ -27,7 +27,7 @@ tokenizer: # Dataset dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True diff --git a/recipes/configs/qwen2/7B_full.yaml b/recipes/configs/qwen2/7B_full.yaml index 47eccf3636..0ec686d7b8 100644 --- a/recipes/configs/qwen2/7B_full.yaml +++ b/recipes/configs/qwen2/7B_full.yaml @@ -25,7 +25,7 @@ tokenizer: # Dataset dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_low_memory.yaml index 2c1ad3e3e7..80779f130b 100644 --- a/recipes/configs/qwen2/7B_full_low_memory.yaml +++ b/recipes/configs/qwen2/7B_full_low_memory.yaml @@ -27,7 +27,7 @@ tokenizer: # Dataset dataset: - _component_: torchtune.datasets.alpaca_dataset + _component_: torchtune.datasets.alpaca_cleaned_dataset seed: null shuffle: True From 72ef1ddac2d3d155b54e1c8323a6d865ba0bd0ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Wed, 31 Jul 2024 05:37:44 +0800 Subject: [PATCH 13/14] Fix in _checkpointer_utils.py. 
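For context on the alpaca_dataset to alpaca_cleaned_dataset switch made just above: both builders live in torchtune.datasets and take the tokenizer as an argument, which is how the recipes wire them together. A hedged, stand-alone sketch using the same placeholder paths as the configs; any parameter beyond the tokenizer is left at its defaults and not assumed here.

    from torchtune.datasets import alpaca_cleaned_dataset
    from torchtune.models.qwen2 import qwen2_tokenizer

    # Placeholder paths matching the configs in this series.
    tokenizer = qwen2_tokenizer(
        path="/tmp/Qwen2-0.5B-Instruct/vocab.json",
        merges_file="/tmp/Qwen2-0.5B-Instruct/merges.txt",
    )
    dataset = alpaca_cleaned_dataset(tokenizer=tokenizer)
    print(len(dataset))  # number of training examples
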
--- torchtune/utils/_checkpointing/_checkpointer_utils.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/torchtune/utils/_checkpointing/_checkpointer_utils.py b/torchtune/utils/_checkpointing/_checkpointer_utils.py index 18211635ed..962dab9f49 100644 --- a/torchtune/utils/_checkpointing/_checkpointer_utils.py +++ b/torchtune/utils/_checkpointing/_checkpointer_utils.py @@ -28,6 +28,7 @@ class ModelType(Enum): REWARD (str): A Llama2, Llama3, or Mistral model with a classification head projecting to a single class for reward modelling. See :func:`~torchtune.models.mistral.mistral_reward_7b` or :func:`~torchtune.models.llama2.llama2_reward_7b` + QWEN2 (str): Qwen2 family of models. See :func:`~torchtune.models.qwen2.qwen2` Example: >>> # Usage in a checkpointer class @@ -43,12 +44,7 @@ class ModelType(Enum): MISTRAL: str = "mistral" PHI3_MINI: str = "phi3_mini" REWARD: str = "reward" - - QWEN2 = "qwen2" - """Qwen2 family of models. See :func:`~torchtune.models.qwen2.qwen2`""" - - QWEN2 = "qwen2" - """Qwen2 family of models. See :func:`~torchtune.models.qwen2.qwen2`""" + QWEN2: str = "qwen2" def get_path(input_dir: Path, filename: str, missing_ok: bool = False) -> Path: From 856555b57811c49f59c5bc3824f21cf671d2e2e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=8B=8F=E9=98=B3?= Date: Tue, 6 Aug 2024 14:59:42 +0800 Subject: [PATCH 14/14] Rename Qwen2 recipe files. --- ...mory.yaml => 0.5B_full_single_device.yaml} | 4 +- ...mory.yaml => 1.5B_full_single_device.yaml} | 4 +- ...memory.yaml => 7B_full_single_device.yaml} | 4 +- torchtune/_recipe_registry.py | 12 +-- torchtune/models/qwen2/_model_builders.py | 84 +++++++++++-------- 5 files changed, 62 insertions(+), 46 deletions(-) rename recipes/configs/qwen2/{0.5B_full_low_memory.yaml => 0.5B_full_single_device.yaml} (96%) rename recipes/configs/qwen2/{1.5B_full_low_memory.yaml => 1.5B_full_single_device.yaml} (96%) rename recipes/configs/qwen2/{7B_full_low_memory.yaml => 7B_full_single_device.yaml} (96%) diff --git a/recipes/configs/qwen2/0.5B_full_low_memory.yaml b/recipes/configs/qwen2/0.5B_full_single_device.yaml similarity index 96% rename from recipes/configs/qwen2/0.5B_full_low_memory.yaml rename to recipes/configs/qwen2/0.5B_full_single_device.yaml index 6e4541340e..59fbcb6b55 100644 --- a/recipes/configs/qwen2/0.5B_full_low_memory.yaml +++ b/recipes/configs/qwen2/0.5B_full_single_device.yaml @@ -10,12 +10,12 @@ # pip install bitsandbytes # # To launch on a single device, run the following command from root: -# tune run full_finetune_single_device --config qwen2/0.5B_full_low_memory +# tune run full_finetune_single_device --config qwen2/0.5B_full_single_device # # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune run full_finetune_single_device --config qwen2/0.5B_full_low_memory checkpointer.checkpoint_dir= +# tune run full_finetune_single_device --config qwen2/0.5B_full_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. 
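The *_full_single_device configs renamed in this patch keep the bitsandbytes PagedAdamW plus optimizer_in_bwd: True setup shown earlier in the series. As a rough conceptual sketch of what fusing the optimizer step into the backward pass means (plain PyTorch AdamW here, not torchtune's or bitsandbytes' actual implementation):

    import torch
    from torch import nn

    # One optimizer per parameter; each steps as soon as that parameter's gradient
    # is accumulated, so no gradients have to be kept around for a global step.
    model = nn.Linear(16, 16)
    optimizers = {p: torch.optim.AdamW([p], lr=5e-6) for p in model.parameters()}

    def step_on_grad(param: torch.Tensor) -> None:
        optimizers[param].step()
        optimizers[param].zero_grad()

    for p in model.parameters():
        p.register_post_accumulate_grad_hook(step_on_grad)

    loss = model(torch.randn(4, 16)).sum()
    loss.backward()  # parameters update inside backward; no optimizer.step() afterwards
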
diff --git a/recipes/configs/qwen2/1.5B_full_low_memory.yaml b/recipes/configs/qwen2/1.5B_full_single_device.yaml similarity index 96% rename from recipes/configs/qwen2/1.5B_full_low_memory.yaml rename to recipes/configs/qwen2/1.5B_full_single_device.yaml index 78784a02ad..99408d5114 100644 --- a/recipes/configs/qwen2/1.5B_full_low_memory.yaml +++ b/recipes/configs/qwen2/1.5B_full_single_device.yaml @@ -10,12 +10,12 @@ # pip install bitsandbytes # # To launch on a single device, run the following command from root: -# tune run full_finetune_single_device --config qwen2/1.5B_full_low_memory +# tune run full_finetune_single_device --config qwen2/1.5B_full_single_device # # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune run full_finetune_single_device --config qwen2/1.5B_full_low_memory checkpointer.checkpoint_dir= +# tune run full_finetune_single_device --config qwen2/1.5B_full_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. diff --git a/recipes/configs/qwen2/7B_full_low_memory.yaml b/recipes/configs/qwen2/7B_full_single_device.yaml similarity index 96% rename from recipes/configs/qwen2/7B_full_low_memory.yaml rename to recipes/configs/qwen2/7B_full_single_device.yaml index 80779f130b..075aae8f96 100644 --- a/recipes/configs/qwen2/7B_full_low_memory.yaml +++ b/recipes/configs/qwen2/7B_full_single_device.yaml @@ -10,12 +10,12 @@ # pip install bitsandbytes # # To launch on a single device, run the following command from root: -# tune run full_finetune_single_device --config qwen2/7B_full_low_memory +# tune run full_finetune_single_device --config qwen2/7B_full_single_device # # You can add specific overrides through the command line. For example # to override the checkpointer directory while launching training # you can run: -# tune run full_finetune_single_device --config qwen2/7B_full_low_memory checkpointer.checkpoint_dir= +# tune run full_finetune_single_device --config qwen2/7B_full_single_device checkpointer.checkpoint_dir= # # This config works only for training on single device. 
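All of these configs set log_dir: ${output_dir}. torchtune parses configs with OmegaConf, so the reference resolves to the output_dir defined in the same file; the small stand-alone illustration below treats the exact resolution behavior as an assumption about OmegaConf rather than about torchtune's recipes.

    from omegaconf import OmegaConf

    cfg = OmegaConf.create(
        {
            "output_dir": "/tmp/Qwen2-1.5B-Instruct-finetune",
            "metric_logger": {"log_dir": "${output_dir}"},
        }
    )
    # The interpolation resolves against the root of the config when accessed.
    print(cfg.metric_logger.log_dir)  # /tmp/Qwen2-1.5B-Instruct-finetune
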
diff --git a/torchtune/_recipe_registry.py b/torchtune/_recipe_registry.py index bf6eb071b5..fd53a753ec 100644 --- a/torchtune/_recipe_registry.py +++ b/torchtune/_recipe_registry.py @@ -52,16 +52,16 @@ class Recipe: file_path="phi3/mini_full_low_memory.yaml", ), Config( - name="qwen2/7B_full_low_memory", - file_path="qwen2/7B_full_low_memory.yaml", + name="qwen2/7B_full_single_device", + file_path="qwen2/7B_full_single_device.yaml", ), Config( - name="qwen2/0.5B_full_low_memory", - file_path="qwen2/0.5B_full_low_memory.yaml", + name="qwen2/0.5B_full_single_device", + file_path="qwen2/0.5B_full_single_device.yaml", ), Config( - name="qwen2/1.5B_full_low_memory", - file_path="qwen2/1.5B_full_low_memory.yaml", + name="qwen2/1.5B_full_single_device", + file_path="qwen2/1.5B_full_single_device.yaml", ), ], supports_distributed=False, diff --git a/torchtune/models/qwen2/_model_builders.py b/torchtune/models/qwen2/_model_builders.py index 8dc34e5359..1d4a58324b 100644 --- a/torchtune/models/qwen2/_model_builders.py +++ b/torchtune/models/qwen2/_model_builders.py @@ -18,13 +18,13 @@ """ -def qwen2_7b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: +def qwen2_7b() -> TransformerDecoder: """ Builder for creating a Qwen2 model initialized w/ the default 7B parameter values from https://huggingface.co/Qwen/Qwen2-7B-Instruct Returns: - Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 7B model + TransformerDecoder: Instantiation of Qwen2 7B model """ return qwen2( vocab_size=152064, @@ -40,13 +40,17 @@ def qwen2_7b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: ) -def qwen2_0_5b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: +def qwen2_0_5b() -> TiedEmbeddingTransformerDecoder: """ Builder for creating a Qwen2 model initialized w/ the default 0.5B parameter values from https://huggingface.co/Qwen/Qwen2-0.5B-Instruct Returns: - Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 0.5B model + TiedEmbeddingTransformerDecoder: Instantiation of Qwen2 0.5B model + + Note: + Qwen2 0.5B and Qwen2 1.5B model builders will enable `tie_word_embeddings` by default + and returns an instance of `TiedEmbeddingTransformerDecoder`. """ return qwen2( vocab_size=151936, @@ -63,13 +67,17 @@ def qwen2_0_5b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: ) -def qwen2_1_5b() -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: +def qwen2_1_5b() -> TiedEmbeddingTransformerDecoder: """ Builder for creating a Qwen2 model initialized w/ the default 1.5B parameter values from https://huggingface.co/Qwen/Qwen2-1.5B-Instruct Returns: - Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 1.5B model + TiedEmbeddingTransformerDecoder: Instantiation of Qwen2 1.5B model + + Note: + Qwen2 0.5B and Qwen2 1.5B model builders will enable `tie_word_embeddings` by default + and returns an instance of `TiedEmbeddingTransformerDecoder`. """ return qwen2( vocab_size=151936, @@ -101,21 +109,21 @@ def qwen2_tokenizer( structured similarly. Default is None to use the canonical Qwen2 special tokens. 
Returns: - Llama3Tokenizer: Instantiation of the Qwen2 tokenizer + Qwen2Tokenizer: Instantiation of the Qwen2 tokenizer """ special_tokens = parse_hf_tokenizer_json(special_tokens_path) if special_tokens_path is not None else None return Qwen2Tokenizer(path=path, merges_file=merges_file, special_tokens=special_tokens, **kwargs) def lora_qwen2_7b( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - apply_lora_to_output: bool = False, - lora_rank: int = 8, - lora_alpha: float = 16, - lora_dropout: float = 0.05, - quantize_base: bool = False, -) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.05, + quantize_base: bool = False, +) -> TransformerDecoder: """ Builder for creating a Qwen2 7B model with LoRA enabled. @@ -136,7 +144,7 @@ def lora_qwen2_7b( quantize_base (bool): Whether to quantize base model weights Returns: - Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 7B model with LoRA applied + TransformerDecoder: Instantiation of Qwen2 7B model with LoRA applied """ return lora_qwen2( lora_attn_modules=lora_attn_modules, @@ -160,14 +168,14 @@ def lora_qwen2_7b( def lora_qwen2_0_5b( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - apply_lora_to_output: bool = False, - lora_rank: int = 8, - lora_alpha: float = 16, - lora_dropout: float = 0.05, - quantize_base: bool = False, -) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.05, + quantize_base: bool = False, +) -> TiedEmbeddingTransformerDecoder: """ Builder for creating a Qwen2 0.5B model with LoRA enabled. @@ -188,7 +196,11 @@ def lora_qwen2_0_5b( quantize_base (bool): Whether to quantize base model weights Returns: - Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 0.5B model with LoRA applied + TiedEmbeddingTransformerDecoder: Instantiation of Qwen2 0.5B model with LoRA applied + + Note: + Qwen2 0.5B and Qwen2 1.5B model builders will enable `tie_word_embeddings` by default + and returns an instance of `TiedEmbeddingTransformerDecoder`. """ return lora_qwen2( lora_attn_modules=lora_attn_modules, @@ -213,14 +225,14 @@ def lora_qwen2_0_5b( def lora_qwen2_1_5b( - lora_attn_modules: List[LORA_ATTN_MODULES], - apply_lora_to_mlp: bool = False, - apply_lora_to_output: bool = False, - lora_rank: int = 8, - lora_alpha: float = 16, - lora_dropout: float = 0.05, - quantize_base: bool = False, -) -> Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: + lora_attn_modules: List[LORA_ATTN_MODULES], + apply_lora_to_mlp: bool = False, + apply_lora_to_output: bool = False, + lora_rank: int = 8, + lora_alpha: float = 16, + lora_dropout: float = 0.05, + quantize_base: bool = False, +) -> TiedEmbeddingTransformerDecoder: """ Builder for creating a Qwen2 1.5B model with LoRA enabled. 
@@ -241,7 +253,11 @@ def lora_qwen2_1_5b( quantize_base (bool): Whether to quantize base model weights Returns: - Union[TransformerDecoder, TiedEmbeddingTransformerDecoder]: Instantiation of Qwen2 1.5B model with LoRA applied + TiedEmbeddingTransformerDecoder: Instantiation of Qwen2 1.5B model with LoRA applied + + Note: + Qwen2 0.5B and Qwen2 1.5B model builders will enable `tie_word_embeddings` by default + and returns an instance of `TiedEmbeddingTransformerDecoder`. """ return lora_qwen2( lora_attn_modules=lora_attn_modules,
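Closing note on tie_word_embeddings=True in the 0.5B/1.5B builders: the output projection reuses the token embedding matrix instead of owning separate weights, which is why these builders return a TiedEmbeddingTransformerDecoder. A conceptual sketch in plain PyTorch, using the 0.5B sizes from the builder above; this is an illustration, not torchtune's module.

    import torch
    from torch import nn

    vocab_size, embed_dim = 151936, 896  # qwen2_0_5b values from this patch series
    tok_embeddings = nn.Embedding(vocab_size, embed_dim)

    def tied_output_proj(hidden: torch.Tensor) -> torch.Tensor:
        # The "unembedding" reuses tok_embeddings.weight rather than a separate lm_head.
        return torch.nn.functional.linear(hidden, tok_embeddings.weight)

    hidden = torch.randn(2, 8, embed_dim)
    logits = tied_output_proj(hidden)
    print(logits.shape)  # torch.Size([2, 8, 151936])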