diff --git a/Data/TinyStoriesChinese b/Data/TinyStoriesChinese new file mode 160000 index 0000000..98547ad --- /dev/null +++ b/Data/TinyStoriesChinese @@ -0,0 +1 @@ +Subproject commit 98547ad8ca1205e2e5ce564343cb78f972f72ffa diff --git a/Experiments/chatglm3_6b_tokenizer/special_tokens_map.json b/Experiments/chatglm3_6b_tokenizer/special_tokens_map.json deleted file mode 100644 index 0967ef4..0000000 --- a/Experiments/chatglm3_6b_tokenizer/special_tokens_map.json +++ /dev/null @@ -1 +0,0 @@ -{} diff --git a/Experiments/chatglm3_6b_tokenizer/tokenization_chatglm.py b/Experiments/chatglm3_6b_tokenizer/tokenization_chatglm.py deleted file mode 100644 index be1cbbf..0000000 --- a/Experiments/chatglm3_6b_tokenizer/tokenization_chatglm.py +++ /dev/null @@ -1,328 +0,0 @@ -import json -import os -import re -from typing import List, Optional, Union, Dict -from sentencepiece import SentencePieceProcessor -from transformers import PreTrainedTokenizer -from transformers.utils import logging, PaddingStrategy -from transformers.tokenization_utils_base import EncodedInput, BatchEncoding - - -logger = logging.get_logger(__name__) - - -class SPTokenizer: - def __init__(self, model_path: str): - # reload tokenizer - assert os.path.isfile(model_path), model_path - self.sp_model = SentencePieceProcessor(model_file=model_path) - - # BOS / EOS token IDs - self.n_words: int = self.sp_model.vocab_size() - self.bos_id: int = self.sp_model.bos_id() - self.eos_id: int = self.sp_model.eos_id() - self.pad_id: int = self.sp_model.unk_id() - assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() - - role_special_tokens = ["<|system|>", "<|user|>", "<|assistant|>", "<|observation|>"] - special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "sop", "eop"] + role_special_tokens - self.special_tokens = {} - self.index_special_tokens = {} - for token in special_tokens: - self.special_tokens[token] = self.n_words - self.index_special_tokens[self.n_words] = token - self.n_words += 1 - 
self.role_special_token_expression = "|".join([re.escape(token) for token in special_tokens]) # for apply_chat_template - - def tokenize(self, s: str, encode_special_tokens=False): - if encode_special_tokens: - last_index = 0 - t = [] - for match in re.finditer(self.role_special_token_expression, s): - if last_index < match.start(): - t.extend(self.sp_model.EncodeAsPieces(s[last_index:match.start()])) - t.append(s[match.start():match.end()]) - last_index = match.end() - if last_index < len(s): - t.extend(self.sp_model.EncodeAsPieces(s[last_index:])) - return t - else: - return self.sp_model.EncodeAsPieces(s) - - def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: - assert type(s) is str - t = self.sp_model.encode(s) - if bos: - t = [self.bos_id] + t - if eos: - t = t + [self.eos_id] - return t - - def decode(self, t: List[int]) -> str: - text, buffer = "", [] - for token in t: - if token in self.index_special_tokens: - if buffer: - text += self.sp_model.decode(buffer) - buffer = [] - text += self.index_special_tokens[token] - else: - buffer.append(token) - if buffer: - text += self.sp_model.decode(buffer) - return text - - def decode_tokens(self, tokens: List[str]) -> str: - text = self.sp_model.DecodePieces(tokens) - return text - - def convert_token_to_id(self, token): - """ Converts a token (str) in an id using the vocab. 
""" - if token in self.special_tokens: - return self.special_tokens[token] - return self.sp_model.PieceToId(token) - - def convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - if index in self.index_special_tokens: - return self.index_special_tokens[index] - if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0 or index > self.sp_model.vocab_size(): - return "" - return self.sp_model.IdToPiece(index) - - -class ChatGLMTokenizer(PreTrainedTokenizer): - - vocab_files_names = {"vocab_file": "tokenizer.model"} - model_input_names = ["input_ids", "attention_mask", "position_ids"] - - def __init__( - self, - vocab_file, - padding_side="left", - clean_up_tokenization_spaces=False, - encode_special_tokens=False, - **kwargs - ): - self.name = "GLMTokenizer" - self.vocab_file = vocab_file - self.tokenizer = SPTokenizer(vocab_file) - self.special_tokens = { - "": self.tokenizer.bos_id, - "": self.tokenizer.eos_id, - "": self.tokenizer.pad_id, - "": self.tokenizer.pad_id - } - self.encode_special_tokens = encode_special_tokens - - super().__init__( - padding_side=padding_side, - clean_up_tokenization_spaces=clean_up_tokenization_spaces, - **kwargs - ) - - def get_command(self, token): - if token in self.special_tokens: - return self.special_tokens[token] - assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" - return self.tokenizer.special_tokens[token] - - @property - def unk_token(self) -> str: - return self.tokenizer.sp_model.IdToPiece(self.get_command("")) - - @property - def pad_token(self) -> str: - return self.tokenizer.sp_model.IdToPiece(self.get_command("")) - - @property - def eos_token(self) -> str: - return self.tokenizer.sp_model.IdToPiece(self.get_command("")) - - @property - def unk_token_id(self) -> int: - return self.get_command("") - - @property - def pad_token_id(self) -> int: - return self.get_command("") - - @property - def eos_token_id(self): - 
return self.get_command("") - - @unk_token.setter - def unk_token(self, value): - logger.warning("Setting unk_token is not supported, use the default one.") - - @pad_token.setter - def pad_token(self, value): - logger.warning("Setting pad_token is not supported, use the default one.") - - @eos_token.setter - def eos_token(self, value): - logger.warning("Setting eos_token is not supported, use the default one.") - - @property - def vocab_size(self): - return self.tokenizer.n_words - - def get_vocab(self): - """ Returns vocab as a dict """ - vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text, **kwargs): - return self.tokenizer.tokenize(text, encode_special_tokens=self.encode_special_tokens) - - def _convert_token_to_id(self, token): - """ Converts a token (str) in an id using the vocab. """ - return self.tokenizer.convert_token_to_id(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) in a token (str) using the vocab.""" - return self.tokenizer.convert_id_to_token(index) - - def convert_tokens_to_string(self, tokens: List[str]) -> str: - return self.tokenizer.decode_tokens(tokens) - - def save_vocabulary(self, save_directory, filename_prefix=None): - """ - Save the vocabulary and special tokens file to a directory. - - Args: - save_directory (`str`): - The directory in which to save the vocabulary. - filename_prefix (`str`, *optional*): - An optional prefix to add to the named of the saved files. - - Returns: - `Tuple(str)`: Paths to the files saved. 
- """ - if os.path.isdir(save_directory): - vocab_file = os.path.join( - save_directory, self.vocab_files_names["vocab_file"] - ) - else: - vocab_file = save_directory - - with open(self.vocab_file, 'rb') as fin: - proto_str = fin.read() - - with open(vocab_file, "wb") as writer: - writer.write(proto_str) - - return (vocab_file,) - - def get_prefix_tokens(self): - prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] - return prefix_tokens - - def build_single_message(self, role, metadata, message): - assert role in ["system", "user", "assistant", "observation"], role - role_tokens = [self.get_command(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n") - message_tokens = self.tokenizer.encode(message) - tokens = role_tokens + message_tokens - return tokens - - def build_chat_input(self, query, history=None, role="user"): - if history is None: - history = [] - input_ids = [] - for item in history: - content = item["content"] - if item["role"] == "system" and "tools" in item: - content = content + "\n" + json.dumps(item["tools"], indent=4, ensure_ascii=False) - input_ids.extend(self.build_single_message(item["role"], item.get("metadata", ""), content)) - input_ids.extend(self.build_single_message(role, "", query)) - input_ids.extend([self.get_command("<|assistant|>")]) - return self.batch_encode_plus([input_ids], return_tensors="pt", is_split_into_words=True) - - def build_inputs_with_special_tokens( - self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None - ) -> List[int]: - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. A BERT sequence has the following format: - - - single sequence: `[CLS] X [SEP]` - - pair of sequences: `[CLS] A [SEP] B [SEP]` - - Args: - token_ids_0 (`List[int]`): - List of IDs to which the special tokens will be added. - token_ids_1 (`List[int]`, *optional*): - Optional second list of IDs for sequence pairs. 
- - Returns: - `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. - """ - prefix_tokens = self.get_prefix_tokens() - token_ids_0 = prefix_tokens + token_ids_0 - if token_ids_1 is not None: - token_ids_0 = token_ids_0 + token_ids_1 + [self.get_command("")] - return token_ids_0 - - def _pad( - self, - encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], - max_length: Optional[int] = None, - padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, - pad_to_multiple_of: Optional[int] = None, - return_attention_mask: Optional[bool] = None, - ) -> dict: - """ - Pad encoded inputs (on left/right and up to predefined length or max length in the batch) - - Args: - encoded_inputs: - Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). - max_length: maximum length of the returned list and optionally padding length (see below). - Will truncate by taking into account the special tokens. - padding_strategy: PaddingStrategy to use for padding. - - - PaddingStrategy.LONGEST Pad to the longest sequence in the batch - - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) - - PaddingStrategy.DO_NOT_PAD: Do not pad - The tokenizer padding sides are defined in self.padding_side: - - - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences - pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. - This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability - `>= 7.5` (Volta). 
- return_attention_mask: - (optional) Set to False to avoid returning attention mask (default: set to model specifics) - """ - # Load from model defaults - assert self.padding_side == "left" - - required_input = encoded_inputs[self.model_input_names[0]] - seq_length = len(required_input) - - if padding_strategy == PaddingStrategy.LONGEST: - max_length = len(required_input) - - if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): - max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of - - needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length - - # Initialize attention mask if not present. - if "attention_mask" not in encoded_inputs: - encoded_inputs["attention_mask"] = [1] * seq_length - - if "position_ids" not in encoded_inputs: - encoded_inputs["position_ids"] = list(range(seq_length)) - - if needs_to_be_padded: - difference = max_length - len(required_input) - - if "attention_mask" in encoded_inputs: - encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] - if "position_ids" in encoded_inputs: - encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"] - encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input - - return encoded_inputs diff --git a/Experiments/chatglm3_6b_tokenizer/tokenizer.model b/Experiments/chatglm3_6b_tokenizer/tokenizer.model deleted file mode 100644 index c8336ad..0000000 Binary files a/Experiments/chatglm3_6b_tokenizer/tokenizer.model and /dev/null differ diff --git a/Experiments/chatglm3_6b_tokenizer/tokenizer_config.json b/Experiments/chatglm3_6b_tokenizer/tokenizer_config.json deleted file mode 100644 index fbcb2c8..0000000 --- a/Experiments/chatglm3_6b_tokenizer/tokenizer_config.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "added_tokens_decoder": { - "64790": { - "content": "[gMASK]", - "lstrip": false, - "normalized": 
true, - "rstrip": false, - "single_word": false, - "special": false - }, - "64792": { - "content": "sop", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false, - "special": false - }, - "64795": { - "content": "<|user|>", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false, - "special": false - }, - "64796": { - "content": "<|assistant|>", - "lstrip": false, - "normalized": true, - "rstrip": false, - "single_word": false, - "special": false - } - }, - "auto_map": { - "AutoTokenizer": [ - "tokenization_chatglm.ChatGLMTokenizer", - null - ] - }, - "chat_template": "{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}", - "clean_up_tokenization_spaces": false, - "do_lower_case": false, - "eos_token": "", - "model_max_length": 1000000000000000019884624838656, - "pad_token": "", - "padding_side": "left", - "remove_space": false, - "tokenizer_class": "ChatGLMTokenizer", - "unk_token": "" -} diff --git a/Journey/Day001/Day001.ipynb b/Journey/Day001/Day001.ipynb deleted file mode 100644 index 3779481..0000000 --- a/Journey/Day001/Day001.ipynb +++ /dev/null @@ -1,131 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# 从0️⃣跑通中文大模型构建|🚀Day001" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 参考项目搜集" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- [nano-llama31](https://github.com/karpathy/nano-llama31/tree/master)\n", - "\n", - " - Karpathy大神的项目,用700多行代码完成了一个尽可能低依赖的最简洁llama3.1实现,能够进行训练、微调和推理(跪拜🧎‍♂️)\n", - "\n", - "- [TinyLlama](https://github.com/jzhang38/TinyLlama/blob/main/README_zh-CN.md)\n", - "\n", - " - TinyLlama项目在3万亿tokens上进行预训练,构建一个拥有1.1B参数的Llama模型\n", - " \n", - " - 采用了与Llama 2完全相同的架构和分词器\n", - "\n", - "- 
[baby-llama2-chinese](https://github.com/DLLXW/baby-llama2-chinese?tab=readme-ov-file)\n", - "\n", - " - 中文数据从头训练+SFT的一个小参数实现,可以在24G单卡运行;目前包含:预训练、SFT指令微调,**奖励模型**以及**强化学习**待做(但似乎已经断更3个月)\n", - " \n", - " - 基于llama2架构\n", - " \n", - "- [ChatLM-mini-Chinese](https://github.com/charent/ChatLM-mini-Chinese)\n", - "\n", - " - 中文对话0.2B小模型(ChatLM-Chinese-0.2B),开源所有数据集来源、数据清洗、tokenizer训练、模型预训练、SFT指令微调、RLHF优化等流程的全部代码\n", - " \n", - " - 支持下游任务sft微调,给出了三元组信息抽取任务的微调示例\n", - " \n", - " - 有意思的是作者选的模型架构是T5,还有另一个衍生项目用的是phi-2架构" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 数据集确定" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "这个计划的主要目地是能从头熟悉一遍大模型训练的流程,所以不需要收集大量的文本数据集(我显然也没有那么多的GPU资源能让我在超大的数据集上快速训练🤷‍♂️)\n", - "\n", - "之前看到微软的一篇论文[TinyStories](https://arxiv.org/abs/2305.07759),探索的是语言模型在多小的情况下还能流利地讲故事。为此他们构造了一个小数据集,包含了一些小故事,我觉得这个数据集很适合我这个项目的目标,所以决定使用这个数据集。\n", - "\n", - "由于希望在中文数据集上进行项目,所以原本的打算是通过机器翻译把数据集翻译一遍,结果发现已经有人做了这个工作,所以我打算直接使用这个翻译好的[数据集](https://huggingface.co/datasets/adam89/TinyStoriesChinese)。🙏感谢大佬们的无私奉献。\n", - "\n", - "一个样例如下:\n", - "```json\n", - "{\n", - " \"story\": \"\\n\\nLily and Ben are friends. They like to play in the park. One day, they see a big tree with a swing. Lily wants to try the swing. She runs to the tree and climbs on the swing.\\n\\\"Push me, Ben!\\\" she says. Ben pushes her gently. Lily feels happy. She swings higher and higher. She laughs and shouts.\\nBen watches Lily. He thinks she is cute. He wants to swing too. He waits for Lily to stop. But Lily does not stop. She swings faster and faster. She is having too much fun.\\n\\\"Can I swing too, Lily?\\\" Ben asks. Lily does not hear him. She is too busy swinging. Ben feels sad. He walks away.\\nLily swings so high that she loses her grip. She falls off the swing. She lands on the ground. She hurts her foot. She cries.\\n\\\"Ow, ow, ow!\\\" she says. She looks for Ben. She wants him to help her. But Ben is not there. 
He is gone.\\nLily feels sorry. She wishes she had shared the swing with Ben. She wishes he was there to hug her. She limps to the tree. She sees something hanging from a branch. It is Ben's hat. He left it for her.\\nLily smiles. She thinks Ben is nice. She puts on his hat. She hopes he will come back. She wants to say sorry. She wants to be friends again.\",\n", - " \"instruction\": {\n", - " \"prompt:\": \"Write a short story (3-5 paragraphs) which only uses very simple words that a 3 year old child would understand. The story should use the verb \\\"hang\\\", the noun \\\"foot\\\" and the adjective \\\"cute\\\". The story has the following features: the story should contain at least one dialogue. Remember to only use simple words!\\n\\nPossible story:\",\n", - " \"words\": [\n", - " \"hang\",\n", - " \"foot\",\n", - " \"cute\"\n", - " ],\n", - " \"features\": [\n", - " \"Dialogue\"\n", - " ]\n", - " },\n", - " \"summary\": \"Lily and Ben play in the park and Lily gets too caught up in swinging, causing Ben to leave. 
Lily falls off the swing and hurts herself, but Ben leaves his hat for her as a kind gesture.\",\n", - " \"source\": \"GPT-4\",\n", - " \"story_zh\": \"莉莉和本是朋友。他们喜欢在公园里玩。有一天,他们在一棵大树下看到了一个秋千。莉莉想试试那个秋千。她跑到树下,爬上了秋千。\\n\\\"推我,本!\\\"她说。本轻轻地推了她一下。莉莉感到很开心。她越荡越高,笑着喊叫。\\n本看着莉莉。他觉得她很可爱。他也想荡秋千。他在莉莉停下来之后等着。但是莉莉没有停下来。她越荡越快。她玩得太高兴了。\\n\\\"我也可以荡秋千吗,莉莉?\\\"本问。莉莉没听到他的话。她忙着荡秋千。本觉得很难过。他走开了。\\n莉莉荡得太高,失去了平衡。她从秋千上摔下来,落在地上。她扭伤了脚。她哭了起来。\\n\\\"哎呀,哎呀,哎呀!\\\"她说。她在找本。她希望他能帮助她。但本不在那里。他走了。\\n莉莉感到很抱歉。她希望她能和本分享秋千。她希望他在那里拥抱她。她一瘸一拐地走到树下。她看到有什么东西挂在树枝上。那是本的帽子。他留给她的。\\n莉莉笑了。她觉得本很好。她戴上了他的帽子。她希望他会回来。她想道歉。她想再次成为朋友。\"\n", - "}\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 数据清洗\n", - "在标准的流程里,文本数据量会非常大且来自于不同的来源,无论从质量上和内容重复度上都需要进行清洗。\n", - "\n", - "由于这个数据集是一个单独的小数据集,所以我打算先**跳过这一步**。\n", - "\n", - "不过这里还是简单说一下数据清洗的一些方法:\n", - "- 去除重复数据\n", - " - 当量级巨大的时候一般会采用SimHash或者MinHash的方法\n", - " - 短文本过滤(太短的文本对于模型训练而言意义不大,可以直接去除)\n", - "- 去除低质量数据\n", - " - 一般会采用一些规则或者模型来判断文本的质量,例如:语法错误、拼写错误、不合理内容、无意义数据等\n", - " - 这一步的成本会比较高,因为需要人工标注或者构建模型(但高质量的数据对于模型训练是非常重要的)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 训练框架选择\n", - "\n", - "一个好的训练框架还是很重要的,在上面的参考项目中发现了这个名为[⚡️litgpt](https://github.com/Lightning-AI/litgpt/tree/main)的大模型预训练/微调/部署框架,而且其早期的model代码是基于上面Karpahty大神的nano-llama31项目。\n", - "\n", - "框架已经支持了20+大模型,包括最新的Llama 3.1。大概看了一下代码仓库,结构清晰且简洁。所以我决定花时间熟悉一下这个框架作为后续的训练工具。\n", - "\n", - "个人觉得如今的大模型训练框架已经非常成熟,大家都在追求更高的性能和更好的易用性,所以选择一个好的框架对于后续的工作来说是非常重要的。\n", - "\n", - "也没有必要自己造轮子,毕竟大家都在造了,我们只需要选择一个适合自己的就好了。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Journey/Day002/Day002.ipynb b/Journey/Day002/Day002.ipynb deleted file mode 100644 index 5cec828..0000000 --- a/Journey/Day002/Day002.ipynb +++ /dev/null @@ -1,191 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ 
- "# 从0️⃣跑通中文大模型构建|🚀Day002" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Tokenizer选择" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**tokenization**是大模型训练的第一步,是将文本转换为模型可以理解的数字表示(后面也能反向decode回来)。\n", - "\n", - "其中目前比较主流的是[BPE(Byte Pair Encoding)](https://zhuanlan.zhihu.com/p/424631681)(详细的介绍可以参考链接文章,下面只进行一些简单的介绍)。\n", - "\n", - "**BPE**是一种简单的数据压缩形式,这种方法用数据中不存在的一个字节表示最常出现的连续字节数据。这样的替换需要重建全部原始数据。\n", - "\n", - "### BPE简介\n", - "\n", - "假设我们要编码如下数据\n", - "\n", - "> aaabdaaabac\n", - "\n", - "字节对“aa”出现次数最多,所以我们用数据中没有出现的字节“Z”替换“aa”得到替换表\n", - "\n", - "> Z <- aa\n", - "\n", - "数据转变为\n", - "\n", - "> ZabdZabac\n", - "\n", - "在这个数据中,字节对“Za”出现的次数最多,我们用另外一个字节“Y”来替换它(这种情况下由于所有的“Z”都将被替换,所以也可以用“Z”来替换“Za”),得到替换表以及数据\n", - "\n", - "> Z <- aa\n", - "> Y <- Za\n", - "\n", - "> YbdYbac\n", - "\n", - "我们再次替换最常出现的字节对得到:\n", - "\n", - "> Z <- aa\n", - "> Y <- Za\n", - "> X <- Yb\n", - "\n", - "> XdXac\n", - "\n", - "由于不再有重复出现的字节对,所以这个数据不能再被进一步压缩。\n", - "\n", - "解压的时候,就是按照相反的顺序执行替换过程。\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 以ChatGLM3-6B的tokenizer为例" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Setting eos_token is not supported, use the default one.\n", - "Setting pad_token is not supported, use the default one.\n", - "Setting unk_token is not supported, use the default one.\n" - ] - } - ], - "source": [ - "from transformers import AutoTokenizer\n", - "\n", - "tokenizer = AutoTokenizer.from_pretrained(\"THUDM/chatglm3-6b\", trust_remote_code=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input_ids': [64790, 64792, 30910, 36037, 32882], 'attention_mask': [1, 1, 1, 1, 1], 'position_ids': [0, 1, 2, 3, 4]}" - ] - }, - "execution_count": 13, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "tokenizer(\"这是一个测试\")" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'[gMASK] sop 这是一个测试'" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.decode(tokenizer(\"这是一个测试\")[\"input_ids\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "64798" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tokenizer.vocab_size" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### 选择ChatGLM3-6B的tokenizer的原因\n", - "\n", - "该词表大小为64798,值得注意的是:这是一个很妙的数字,因为它**刚好在uint16的表示范围(0~65535的无符号整数)**,每一个token只需要两个字节即可表示。\n", - "\n", - "当我们的语料较大时候,相比常用的int32可以**节省一半的存储空间**。\n", - "\n", - "另外这里选择一个小尺寸的词表还有一个更重要的原因:我们后面的模型会选择一个小参数量的,如果词表过大,会导致**大部分参数被embedding层占用**,而无法训练出更好的模型。" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "bigmodel", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/Journey/Day002/Day002.md b/Journey/Day002/Day002.md deleted file mode 100644 index b4e2915..0000000 --- a/Journey/Day002/Day002.md +++ /dev/null @@ -1,101 +0,0 @@ -# 从0️⃣跑通中文大模型构建|🚀Day002 - -## Tokenizer选择 - -**tokenization**是大模型训练的第一步,是将文本转换为模型可以理解的数字表示(后面也能反向decode回来)。 - -其中目前比较主流的是[BPE(Byte Pair Encoding)](https://zhuanlan.zhihu.com/p/424631681)(详细的介绍可以参考链接文章,下面只进行一些简单的介绍)。 - 
-**BPE**是一种简单的数据压缩形式,这种方法用数据中不存在的一个字节表示最常出现的连续字节数据。这样的替换需要重建全部原始数据。 - -### BPE简介 - -假设我们要编码如下数据 - -> aaabdaaabac - -字节对“aa”出现次数最多,所以我们用数据中没有出现的字节“Z”替换“aa”得到替换表 - -> Z <- aa - -数据转变为 - -> ZabdZabac - -在这个数据中,字节对“Za”出现的次数最多,我们用另外一个字节“Y”来替换它(这种情况下由于所有的“Z”都将被替换,所以也可以用“Z”来替换“Za”),得到替换表以及数据 - -> Z <- aa -> Y <- Za - -> YbdYbac - -我们再次替换最常出现的字节对得到: - -> Z <- aa -> Y <- Za -> X <- Yb - -> XdXac - -由于不再有重复出现的字节对,所以这个数据不能再被进一步压缩。 - -解压的时候,就是按照相反的顺序执行替换过程。 - - - - -### 以ChatGLM-6B的tokenizer为例 - - -```python -from transformers import AutoTokenizer - -tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True) -``` - - Setting eos_token is not supported, use the default one. - Setting pad_token is not supported, use the default one. - Setting unk_token is not supported, use the default one. - - - -```python -tokenizer("这是一个测试") -# fsdfdsf -``` - - - - - {'input_ids': [64790, 64792, 30910, 36037, 32882], 'attention_mask': [1, 1, 1, 1, 1], 'position_ids': [0, 1, 2, 3, 4]} - - - - -```python -tokenizer.decode(tokenizer("这是一个测试")["input_ids"]) -``` - - - - - '[gMASK] sop 这是一个测试' - - - - -```python -tokenizer.vocab_size -``` - - - - - 64798 - - - - -```python - -``` diff --git a/Journey/Day01/Day01.ipynb b/Journey/Day01/Day01.ipynb new file mode 100644 index 0000000..97e8f70 --- /dev/null +++ b/Journey/Day01/Day01.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 从零手搓中文大模型|🚀Day01" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 参考项目搜集" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- [nano-llama31](https://github.com/karpathy/nano-llama31/tree/master)\n", + "\n", + " - Karpathy大神的项目,用700多行代码完成了一个尽可能低依赖的最简洁`Llama3.1`实现,能够进行训练、微调和推理(跪拜🧎‍♂️)\n", + "\n", + "- [TinyLlama](https://github.com/jzhang38/TinyLlama/blob/main/README_zh-CN.md)\n", + "\n", + " - `TinyLlama`项目在3万亿tokens上进行预训练,构建一个拥有`1.1B`参数的`Llama`模型\n", + " \n", + " - 采用了与`Llama 
2`完全相同的架构和分词器\n", + "\n", + "- [baby-llama2-chinese](https://github.com/DLLXW/baby-llama2-chinese?tab=readme-ov-file)\n", + "\n", + " - 中文数据从头训练+SFT的一个小参数实现,可以在`24G`单卡运行;目前包含:**预训练**、**SFT指令微调**,**奖励模型**以及**强化学习**待做(但似乎已经断更3个月)\n", + " \n", + " - 基于`Llama 2`架构\n", + " \n", + "- [ChatLM-mini-Chinese](https://github.com/charent/ChatLM-mini-Chinese)\n", + "\n", + " - 中文对话0.2B小模型(ChatLM-Chinese-0.2B),开源所有**数据集来源、数据清洗、tokenizer训练、模型预训练、SFT指令微调、RLHF优化**等流程的全部代码\n", + " \n", + " - 支持下游任务`sft`微调,给出了三元组信息抽取任务的微调示例\n", + " \n", + " - 有意思的是作者选的模型架构是`T5`,还有另一个衍生项目用的是`phi-2`架构\n", + "\n", + "除了第一个K神的项目是关于模型实现,后面三个项目的`README.md`文件看一遍下来,对于后续整体的实现流程心里便能有了一个大致的概念。\n", + "\n", + "之后遇到了实际问题时再去里面寻找细节来参考。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据集确定" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这个计划的主要目的是能从头熟悉一遍大模型训练的流程,所以不需要收集大量的文本数据集(我显然也没有那么多的GPU资源能让我在超大的数据集上快速训练🤷‍♂️)\n", + "\n", + "之前看到微软的一篇论文[TinyStories](https://arxiv.org/abs/2305.07759),探索的是语言模型在多小的情况下还能流利地讲故事。\n", + "\n", + "为此他们构造了一个小数据集,包含了一些小故事,我觉得这个数据集很适合我这个项目的目标,所以决定使用这个数据集。\n", + "\n", + "由于希望在中文数据集上进行实践,所以原本的打算是通过机器翻译把数据集翻译一遍。\n", + "\n", + "结果发现已经有人做了这个工作,所以我打算直接使用这个翻译好的[数据集](https://huggingface.co/datasets/adam89/TinyStoriesChinese)。🙏感谢大佬们的无私奉献。\n", + "\n", + "一个样例如下:\n", + "```json\n", + "{\n", + " \"story\": \"\\n\\nLily and Ben are friends. They like to play in the park. One day, they see a big tree with a swing. Lily wants to try the swing. She runs to the tree and climbs on the swing.\\n\\\"Push me, Ben!\\\" she says. Ben pushes her gently. Lily feels happy. She swings higher and higher. She laughs and shouts.\\nBen watches Lily. He thinks she is cute. He wants to swing too. He waits for Lily to stop. But Lily does not stop. She swings faster and faster. She is having too much fun.\\n\\\"Can I swing too, Lily?\\\" Ben asks. Lily does not hear him. She is too busy swinging. Ben feels sad. 
He walks away.\\nLily swings so high that she loses her grip. She falls off the swing. She lands on the ground. She hurts her foot. She cries.\\n\\\"Ow, ow, ow!\\\" she says. She looks for Ben. She wants him to help her. But Ben is not there. He is gone.\\nLily feels sorry. She wishes she had shared the swing with Ben. She wishes he was there to hug her. She limps to the tree. She sees something hanging from a branch. It is Ben's hat. He left it for her.\\nLily smiles. She thinks Ben is nice. She puts on his hat. She hopes he will come back. She wants to say sorry. She wants to be friends again.\",\n", + " \"instruction\": {\n", + " \"prompt:\": \"Write a short story (3-5 paragraphs) which only uses very simple words that a 3 year old child would understand. The story should use the verb \\\"hang\\\", the noun \\\"foot\\\" and the adjective \\\"cute\\\". The story has the following features: the story should contain at least one dialogue. Remember to only use simple words!\\n\\nPossible story:\",\n", + " \"words\": [\n", + " \"hang\",\n", + " \"foot\",\n", + " \"cute\"\n", + " ],\n", + " \"features\": [\n", + " \"Dialogue\"\n", + " ]\n", + " },\n", + " \"summary\": \"Lily and Ben play in the park and Lily gets too caught up in swinging, causing Ben to leave. 
Lily falls off the swing and hurts herself, but Ben leaves his hat for her as a kind gesture.\",\n", + " \"source\": \"GPT-4\",\n", + " \"story_zh\": \"莉莉和本是朋友。他们喜欢在公园里玩。有一天,他们在一棵大树下看到了一个秋千。莉莉想试试那个秋千。她跑到树下,爬上了秋千。\\n\\\"推我,本!\\\"她说。本轻轻地推了她一下。莉莉感到很开心。她越荡越高,笑着喊叫。\\n本看着莉莉。他觉得她很可爱。他也想荡秋千。他在莉莉停下来之后等着。但是莉莉没有停下来。她越荡越快。她玩得太高兴了。\\n\\\"我也可以荡秋千吗,莉莉?\\\"本问。莉莉没听到他的话。她忙着荡秋千。本觉得很难过。他走开了。\\n莉莉荡得太高,失去了平衡。她从秋千上摔下来,落在地上。她扭伤了脚。她哭了起来。\\n\\\"哎呀,哎呀,哎呀!\\\"她说。她在找本。她希望他能帮助她。但本不在那里。他走了。\\n莉莉感到很抱歉。她希望她能和本分享秋千。她希望他在那里拥抱她。她一瘸一拐地走到树下。她看到有什么东西挂在树枝上。那是本的帽子。他留给她的。\\n莉莉笑了。她觉得本很好。她戴上了他的帽子。她希望他会回来。她想道歉。她想再次成为朋友。\"\n", + "}\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据清洗\n", + "在标准的流程里,**文本数据量会非常大**且来自于不同的来源,无论从**质量**上和**内容重复度**上都需要进行清洗。\n", + "\n", + "由于这个数据集是一个单独的小数据集,所以我打算先**跳过这一步**。\n", + "\n", + "不过这里还是简单说一下数据清洗的一些方法:\n", + "- 去除重复数据\n", + " - 当量级巨大的时候一般会采用`SimHash`或者`MinHash`的方法\n", + " - 短文本过滤(太短的文本对于模型训练而言意义不大,可以直接去除)\n", + "- 去除低质量数据\n", + " - 一般会采用一些规则或者模型来判断文本的质量,例如:**语法错误、拼写错误、不合理内容、无意义数据**等\n", + " - 这一步的成本会比较高,因为需要人工标注或者构建模型(但高质量的数据对于模型训练是非常重要的)" + ] + }, + { + "attachments": { + "image-2.png": { + "image/png": "" + }, + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 训练框架选择\n", + "\n", + "一个好的训练框架还是很重要的。\n", + "\n", + "在上面的参考项目中发现了这个名为[⚡️litgpt](https://github.com/Lightning-AI/litgpt/tree/main)的大模型预训练/微调/部署框架,而且其模型实现代码是基于上面Karpathy大神的`nano-llama31`项目。\n", + "\n", + "框架已经支持了`20+`大模型,包括最新的`Llama 3.1`。大概看了一下代码仓库,结构清晰且简洁。\n", + "\n", + "![image.png](attachment:image.png)\n", + "\n", + "另外一个框架是[Llama-Factory](https://github.com/hiyouga/LLaMA-Factory/blob/main/README_zh.md),已经有29k的stars,支持`100+`大模型(尤其是对国内的开源模型支持很好)。\n", + "\n", + "![image-2.png](attachment:image-2.png)\n", + "\n", + "所以我决定花时间简单了解一下这两个框架然后选一个作为后续的训练工具。\n", + "\n", + "个人觉得如今的大模型训练框架已经非常成熟,大家都在追求**更高的性能和更好的易用性**,所以**选择一个好的框架**对于后续的工作来说是非常重要的。\n", + "\n", + "也没有必要自己造轮子,毕竟大家都在造了,我们只需要选择一个适合自己的就好了。\n", + "\n", + "> 
经过调研发现`Llama-Factory`似乎~~只能支持和发布的开源模型参数一致的模型(也可能是我看的不仔细没有找到修改配置的地方,如果是这样欢迎大家指出)~~。\n", + "> \n", + "> 而litgpt则支持自定义模型参数,\n", + "> 由于我后面是希望限制参数量级在一个很小的范畴,所以决定使用litgpt作为后续的训练工具。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "⚠️更新:\n", + "\n", + "后续也找到了`Llama-Factory`里从0训练并且修改参数的方法(修改`config.json`的配置,并且添加`train_from_scratch=True`参数),\n", + "\n", + "但`litgpt`的代码架构更加简洁,没有过多的抽象和封装,并且有一些训练加速的优化(`Llama-Factory`是基于`transformers`库),\n", + "\n", + "因此最后还是决定使用`litgpt`。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 小结\n", + "1. 确定了项目的数据集来源\n", + "2. 了解了一些数据清洗的方法\n", + "3. 选择了litgpt作为后续的训练工具\n", + "4. 了解了一些参考项目,对后续的实现流程有了一个大致的概念" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Journey/Day01/Day01_files/image-2.png b/Journey/Day01/Day01_files/image-2.png new file mode 100644 index 0000000..aef6e82 Binary files /dev/null and b/Journey/Day01/Day01_files/image-2.png differ diff --git a/Journey/Day01/Day01_files/image.png b/Journey/Day01/Day01_files/image.png new file mode 100644 index 0000000..20911f1 Binary files /dev/null and b/Journey/Day01/Day01_files/image.png differ diff --git a/Journey/Day02/Day02.ipynb b/Journey/Day02/Day02.ipynb new file mode 100644 index 0000000..8865826 --- /dev/null +++ b/Journey/Day02/Day02.ipynb @@ -0,0 +1,282 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 从零手搓中文大模型|🚀Day02" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Tokenizer选择" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**tokenization**是大模型训练的第一步,是将文本转换为模型可以理解的数字表示(后面也能反向decode回来)。\n", + "\n", + "其中目前比较主流的是[BPE(Byte Pair Encoding)](https://zhuanlan.zhihu.com/p/424631681)(详细的介绍可以参考链接文章,下面只进行一些简单的介绍)。\n", + "\n", + "**BPE**是一种简单的数据压缩形式,这种方法用数据中不存在的一个字节表示最常出现的连续字节数据。这样的替换需要重建全部原始数据。\n", + "\n", + "### BPE简介\n", + "\n", + "假设我们要编码如下数据\n", + "\n", + "> 
aaabdaaabac\n", + "\n", + "字节对“aa”出现次数最多,所以我们用数据中没有出现的字节“Z”替换“aa”得到替换表\n", + "\n", + "> Z <- aa\n", + "\n", + "数据转变为\n", + "\n", + "> ZabdZabac\n", + "\n", + "在这个数据中,字节对“Za”出现的次数最多,我们用另外一个字节“Y”来替换它(这种情况下由于所有的“Z”都将被替换,所以也可以用“Z”来替换“Za”),得到替换表以及数据\n", + "\n", + "> Z <- aa\n", + "> Y <- Za\n", + "\n", + "> YbdYbac\n", + "\n", + "我们再次替换最常出现的字节对得到:\n", + "\n", + "> Z <- aa\n", + "> Y <- Za\n", + "> X <- Yb\n", + "\n", + "> XdXac\n", + "\n", + "由于不再有重复出现的字节对,所以这个数据不能再被进一步压缩。\n", + "\n", + "解压的时候,就是按照相反的顺序执行替换过程。\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 测试Tokenizer(以ChatGLM3-6B的tokenizer为例)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting eos_token is not supported, use the default one.\n", + "Setting pad_token is not supported, use the default one.\n", + "Setting unk_token is not supported, use the default one.\n" + ] + } + ], + "source": [ + "from transformers import AutoTokenizer\n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(\"THUDM/chatglm3-6b\", trust_remote_code=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input_ids': [64790, 64792, 30910, 36037, 32882], 'attention_mask': [1, 1, 1, 1, 1], 'position_ids': [0, 1, 2, 3, 4]}" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer(\"这是一个测试\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[gMASK] sop 这是一个测试'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.decode(tokenizer(\"这是一个测试\")[\"input_ids\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ 
+ "64798" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenizer.vocab_size" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "\n", + "def process_line(line, tokenizer, add_eos=True, dtype=np.uint16):\n", + " js = json.loads(line)\n", + " story = js[\"story_zh\"]\n", + " story = tokenizer.encode(story, add_special_tokens=False)\n", + " if add_eos:\n", + " story.append(tokenizer.eos_token_id)\n", + " # 还记得么,这里可以用np.uint16,因为我们的vocab_size是小于65536的\n", + " arr = np.array(story, dtype=dtype)\n", + " return arr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "❗️这里有几个需要注意的点:\n", + "- `add_special_tokens`参数的作用是添加特殊token。\n", + " 是chatglm自定义的例如[gMASK]/sop,属于glm架构里特有的(可以参考[这里](https://github.com/THUDM/ChatGLM3/issues/183))。\n", + " \n", + " 由于我们后续并不使用glm的架构,因此这里不需要添加,**直接设置为False**。\n", + "- 需要在末尾加上`eos`标记对应的token_id。\n", + "- chatglm3-6b使用的词表大小为`64798`,刚好在uint16的表示范围内,所以上面我们给numpy.array设置了`dtype=np.uint16`。\n", + "\n", + "拿一行测试一下:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[30910 56623 56623 54542 50154 31761 31155 31633 31815 54534 32693 54662\n", + " 55409 31155 35632 31123 31633 34383 57427 47658 54578 34518 31623 55567\n", + " 55226 31155 56623 56623 54695 39887 32437 55567 55226 31155 54790 41309\n", + " 52624 31123 56856 32660 55567 55226 31155 13 30955 54834 54546 31123\n", + " 54613 31404 30955 36213 31155 54613 36660 54563 54834 43881 32024 31155\n", + " 56623 56623 32707 54657 33436 31155 54790 54937 56567 40714 31123 38502\n", + " 56653 55483 31155 13 54613 32984 56623 56623 31155 54572 31897 54790\n", + " 54657 35245 31155 36551 54695 56567 55567 55226 31155 33152 56623 56623\n", + " 51556 31797 39055 31155 31694 56623 56623 31631 51556 31155 54790 54937\n", +
" 56567 54937 54929 31155 54790 55409 40915 34492 54537 31155 13 30955\n", + " 54546 32591 56567 55567 55226 55398 31123 56623 56623 31514 30955 54613\n", + " 54761 31155 56623 56623 54721 33906 31804 54887 31155 54790 46977 56567\n", + " 55567 55226 31155 54613 31897 32960 54597 31155 54572 54942 34675 31155\n", + " 13 56623 56623 56567 40915 54589 31123 36467 33501 31155 54790 54708\n", + " 55567 55226 54547 57456 32246 31123 36712 34245 31155 54790 56901 55328\n", + " 54537 55673 31155 54790 56399 37247 31155 13 30955 58394 56657 31123\n", + " 58394 56657 31123 58394 56657 31404 30955 36213 31155 35957 55227 54613\n", + " 31155 54790 31772 47554 31934 54790 31155 54688 54613 33551 33892 31155\n", + " 54572 34247 31155 13 56623 56623 32707 54657 52992 31155 54790 31772\n", + " 54790 54558 54542 54613 32097 55567 55226 31155 54790 31772 33152 33892\n", + " 37322 54790 31155 54790 54531 60337 54531 57635 54563 35220 52624 31155\n", + " 54790 31857 33277 32086 44829 49102 54547 31155 35328 43352 41147 31155\n", + " 54572 42393 32233 31155 13 56623 56623 40466 31155 54790 31897 54613\n", + " 33058 31155 54790 55947 32660 31804 41147 31155 54790 31772 38711 33857\n", + " 31155 54790 54695 37300 31155 54790 54695 32462 31705 31761 31155 2]\n", + "莉莉和本是朋友。他们喜欢在公园里玩。有一天,他们在一棵大树下看到了一个秋千。莉莉想试试那个秋千。她跑到树下,爬上了秋千。\n", + "\"推我,本!\"她说。本轻轻地推了她一下。莉莉感到很开心。她越荡越高,笑着喊叫。\n", + "本看着莉莉。他觉得她很可爱。他也想荡秋千。他在莉莉停下来之后等着。但是莉莉没有停下来。她越荡越快。她玩得太高兴了。\n", + "\"我也可以荡秋千吗,莉莉?\"本问。莉莉没听到他的话。她忙着荡秋千。本觉得很难过。他走开了。\n", + "莉莉荡得太高,失去了平衡。她从秋千上摔下来,落在地上。她扭伤了脚。她哭了起来。\n", + "\"哎呀,哎呀,哎呀!\"她说。她在找本。她希望他能帮助她。但本不在那里。他走了。\n", + "莉莉感到很抱歉。她希望她能和本分享秋千。她希望他在那里拥抱她。她一瘸一拐地走到树下。她看到有什么东西挂在树枝上。那是本的帽子。他留给她的。\n", + "莉莉笑了。她觉得本很好。她戴上了他的帽子。她希望他会回来。她想道歉。她想再次成为朋友。\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "with open(\"../../Data/TinyStoriesChinese/train/data00_zh.jsonl\", \"r\") as f:\n", + " for line in f.readlines():\n", + " data = process_line(line, tokenizer)\n", + " print(data)\n", + " print(tokenizer.decode(data))\n", 
+ " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 选择ChatGLM3-6B的tokenizer的原因\n", + "\n", + "该词表大小为64798,值得注意的是:这是一个很妙的数字,因为它**刚好在uint16的表示范围(0~65535的无符号整数)**,每一个token只需要两个字节即可表示。\n", + "\n", + "当我们的语料较大时候,相比常用的int32可以**节省一半的存储空间**。\n", + "\n", + "另外这里选择一个小尺寸的词表还有一个更重要的原因:我们后面的模型会选择一个小参数量的,如果词表过大,会导致**大部分参数被embedding层占用**,而无法训练出更好的模型。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bigmodel", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Journey/Day03/Day03.ipynb b/Journey/Day03/Day03.ipynb new file mode 100644 index 0000000..9f1b64e --- /dev/null +++ b/Journey/Day03/Day03.ipynb @@ -0,0 +1,383 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 从零手搓中文大模型|🚀Day03" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 数据预处理" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "虽然省略了数据清洗的逻辑,但是我们还是需要对数据进行预处理,以便于后续的模型训练。\n", + "\n", + "包括以下两个细节:\n", + "\n", + "1. 在每个文本后添加`eos`标记,以便于模型识别句子的结束。\n", + "2. 
将文本转换为`数字序列`,以便于模型处理。\n", + " \n", + " 这一步其实也可以放到模型训练的时候进行,但提前处理可以减少训练时的计算量。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 数据集划分\n", + "\n", + "解压数据集,得到`48`个jsonl文件,共计`3952863`行json数据。\n", + "\n", + "我之前已经解压过了,这里把命令贴出来做个记录。" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# !mkdir -p ../../Data/TinyStoriesChinese/raw_data/train\n", + "# !mkdir -p ../../Data/TinyStoriesChinese/raw_data/eval\n", + "# !mkdir -p ../../Data/TinyStoriesChinese/processed_data/train\n", + "# !mkdir -p ../../Data/TinyStoriesChinese/processed_data/eval\n", + "\n", + "# !tar zxvf ../../Data/TinyStoriesChinese/TinyStories_all_data_zh.tar.gz -C ../../Data/TinyStoriesChinese/raw_data/train" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "我把最后一个文件`data47_zh.jsonl`(共计78538行)里切分出来4w行作为`eval`数据。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "# !mv ../../Data/TinyStoriesChinese/raw_data/train/data47_zh.jsonl ../../Data/TinyStoriesChinese/raw_data/eval/\n", + "# !head -n 40000 ../../Data/TinyStoriesChinese/raw_data/eval/data47_zh.jsonl > ../../Data/TinyStoriesChinese/raw_data/eval/eval.jsonl\n", + "# !tail -n +40001 ../../Data/TinyStoriesChinese/raw_data/eval/data47_zh.jsonl > ../../Data/TinyStoriesChinese/raw_data/train/data47_zh.jsonl\n", + "# !rm ../../Data/TinyStoriesChinese/raw_data/eval/data47_zh.jsonl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 先看一条数据" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "莉莉和本是朋友。他们喜欢在公园里玩。有一天,他们在一棵大树下看到了一个秋千。莉莉想试试那个秋千。她跑到树下,爬上了秋千。\n", + "\"推我,本!\"她说。本轻轻地推了她一下。莉莉感到很开心。她越荡越高,笑着喊叫。\n", +
"本看着莉莉。他觉得她很可爱。他也想荡秋千。他在莉莉停下来之后等着。但是莉莉没有停下来。她越荡越快。她玩得太高兴了。\n", + "\"我也可以荡秋千吗,莉莉?\"本问。莉莉没听到他的话。她忙着荡秋千。本觉得很难过。他走开了。\n", + "莉莉荡得太高,失去了平衡。她从秋千上摔下来,落在地上。她扭伤了脚。她哭了起来。\n", + "\"哎呀,哎呀,哎呀!\"她说。她在找本。她希望他能帮助她。但本不在那里。他走了。\n", + "莉莉感到很抱歉。她希望她能和本分享秋千。她希望他在那里拥抱她。她一瘸一拐地走到树下。她看到有什么东西挂在树枝上。那是本的帽子。他留给她的。\n", + "莉莉笑了。她觉得本很好。她戴上了他的帽子。她希望他会回来。她想道歉。她想再次成为朋友。\n" + ] + } + ], + "source": [ + "import json\n", + "\n", + "with open(\"../../Data/TinyStoriesChinese/raw_data/train/data00_zh.jsonl\", \"r\") as f:\n", + " for line in f.readlines():\n", + " js = json.loads(line)\n", + " print(js[\"story_zh\"])\n", + " break" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 适配框架API" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "由于选择了使用[⚡️litgpt](https://github.com/Lightning-AI/litgpt/tree/main)框架进行训练,所以需要引入框架相关的`Class`和`API`来封装我们的数据准备逻辑。\n", + "\n", + "这里我们可以参考[源码里集成的Tinyllama的数据预处理代码](https://github.com/Lightning-AI/litgpt/blob/main/litgpt/data/prepare_slimpajama.py)里的代码,稍作修改。\n", + "\n", + "主要是需要将[Day02](../Day02/Day02.ipynb)里的`line`处理逻辑封装到`litgpt`的`API`中。\n", + "\n", + "但在此之前我们先熟悉一下`litgpt`的Tokenizer的使用方法:" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from litgpt import Tokenizer\n", + "\n", + "litgpt_tokenizer = Tokenizer(\"../../References/chatglm3-6b\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "这里也实验了一下结果,对比发现和上面咱们之前用原生Tokenizer处理的**结果一致**。\n", + "\n", + "(不过需要注意litgpt的`Tokenizer.encode`返回的是一个torch的`Tensor`)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([30910, 56623, 56623, 54542, 50154, 31761, 31155, 31633, 31815, 54534,\n", + " 32693, 54662, 55409, 31155, 35632, 31123, 31633, 34383, 57427, 47658,\n", + " 54578, 34518, 31623, 55567, 55226, 31155, 56623, 56623, 54695, 39887,\n", + " 32437, 
55567, 55226, 31155, 54790, 41309, 52624, 31123, 56856, 32660,\n", + " 55567, 55226, 31155, 13, 30955, 54834, 54546, 31123, 54613, 31404,\n", + " 30955, 36213, 31155, 54613, 36660, 54563, 54834, 43881, 32024, 31155,\n", + " 56623, 56623, 32707, 54657, 33436, 31155, 54790, 54937, 56567, 40714,\n", + " 31123, 38502, 56653, 55483, 31155, 13, 54613, 32984, 56623, 56623,\n", + " 31155, 54572, 31897, 54790, 54657, 35245, 31155, 36551, 54695, 56567,\n", + " 55567, 55226, 31155, 33152, 56623, 56623, 51556, 31797, 39055, 31155,\n", + " 31694, 56623, 56623, 31631, 51556, 31155, 54790, 54937, 56567, 54937,\n", + " 54929, 31155, 54790, 55409, 40915, 34492, 54537, 31155, 13, 30955,\n", + " 54546, 32591, 56567, 55567, 55226, 55398, 31123, 56623, 56623, 31514,\n", + " 30955, 54613, 54761, 31155, 56623, 56623, 54721, 33906, 31804, 54887,\n", + " 31155, 54790, 46977, 56567, 55567, 55226, 31155, 54613, 31897, 32960,\n", + " 54597, 31155, 54572, 54942, 34675, 31155, 13, 56623, 56623, 56567,\n", + " 40915, 54589, 31123, 36467, 33501, 31155, 54790, 54708, 55567, 55226,\n", + " 54547, 57456, 32246, 31123, 36712, 34245, 31155, 54790, 56901, 55328,\n", + " 54537, 55673, 31155, 54790, 56399, 37247, 31155, 13, 30955, 58394,\n", + " 56657, 31123, 58394, 56657, 31123, 58394, 56657, 31404, 30955, 36213,\n", + " 31155, 35957, 55227, 54613, 31155, 54790, 31772, 47554, 31934, 54790,\n", + " 31155, 54688, 54613, 33551, 33892, 31155, 54572, 34247, 31155, 13,\n", + " 56623, 56623, 32707, 54657, 52992, 31155, 54790, 31772, 54790, 54558,\n", + " 54542, 54613, 32097, 55567, 55226, 31155, 54790, 31772, 33152, 33892,\n", + " 37322, 54790, 31155, 54790, 54531, 60337, 54531, 57635, 54563, 35220,\n", + " 52624, 31155, 54790, 31857, 33277, 32086, 44829, 49102, 54547, 31155,\n", + " 35328, 43352, 41147, 31155, 54572, 42393, 32233, 31155, 13, 56623,\n", + " 56623, 40466, 31155, 54790, 31897, 54613, 33058, 31155, 54790, 55947,\n", + " 32660, 31804, 41147, 31155, 54790, 31772, 38711, 33857, 31155, 54790,\n", + " 
54695, 37300, 31155, 54790, 54695, 32462, 31705, 31761, 31155, 2],\n", + " dtype=torch.int32)\n", + "莉莉和本是朋友。他们喜欢在公园里玩。有一天,他们在一棵大树下看到了一个秋千。莉莉想试试那个秋千。她跑到树下,爬上了秋千。\n", + "\"推我,本!\"她说。本轻轻地推了她一下。莉莉感到很开心。她越荡越高,笑着喊叫。\n", + "本看着莉莉。他觉得她很可爱。他也想荡秋千。他在莉莉停下来之后等着。但是莉莉没有停下来。她越荡越快。她玩得太高兴了。\n", + "\"我也可以荡秋千吗,莉莉?\"本问。莉莉没听到他的话。她忙着荡秋千。本觉得很难过。他走开了。\n", + "莉莉荡得太高,失去了平衡。她从秋千上摔下来,落在地上。她扭伤了脚。她哭了起来。\n", + "\"哎呀,哎呀,哎呀!\"她说。她在找本。她希望他能帮助她。但本不在那里。他走了。\n", + "莉莉感到很抱歉。她希望她能和本分享秋千。她希望他在那里拥抱她。她一瘸一拐地走到树下。她看到有什么东西挂在树枝上。那是本的帽子。他留给她的。\n", + "莉莉笑了。她觉得本很好。她戴上了他的帽子。她希望他会回来。她想道歉。她想再次成为朋友。\n" + ] + } + ], + "source": [ + "litgpt_encoded = litgpt_tokenizer.encode(\n", + " json.loads(line)[\"story_zh\"], eos=True\n", + ") # 记得设置eos=True\n", + "print(litgpt_encoded)\n", + "print(litgpt_tokenizer.decode(litgpt_encoded))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "数据处理参考上面给出的[链接](https://github.com/Lightning-AI/litgpt/blob/main/litgpt/data/prepare_slimpajama.py),我们需要实现`prepare_slimpajama.py`里的相关函数。" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright Lightning AI. 
Licensed under the Apache License 2.0, see LICENSE file.\n", + "\n", + "import json\n", + "import os\n", + "import time\n", + "import numpy as np\n", + "from pathlib import Path\n", + "\n", + "from litgpt.tokenizer import Tokenizer\n", + "from litgpt.data.prepare_starcoder import DataChunkRecipe\n", + "from litgpt.utils import extend_checkpoint_dir\n", + "\n", + "\n", + "class TinyStoriesZhDataRecipe(DataChunkRecipe):\n", + " is_generator = True\n", + "\n", + " def __init__(self, tokenizer: Tokenizer, chunk_size: int):\n", + " super().__init__(chunk_size)\n", + " self.tokenizer = tokenizer\n", + "\n", + " def prepare_structure(self, input_dir):\n", + " files = Path(input_dir).rglob(\"*.jsonl\")\n", + " return [str(file) for file in files]\n", + "\n", + " def prepare_item(self, filepath):\n", + "\n", + " with open(filepath, \"rb\") as f:\n", + " for line in f.readlines():\n", + " js = json.loads(line)\n", + " story = js[\"story_zh\"]\n", + " # 注意这里要添加eos\n", + " # 另外还记得吗:我们的vocab size在uint16范围内,所以可以转换为uint16来节省内存\n", + " story_ids = np.array(\n", + " self.tokenizer.encode(story, eos=True), dtype=np.uint16\n", + " )\n", + " yield story_ids\n", + "\n", + "\n", + "def prepare(\n", + " input_dir: Path = Path(\"../../Data/TinyStoriesChinese/raw_data/train\"),\n", + " output_dir: Path = Path(\"../../Data/TinyStoriesChinese/processed_data/train\"),\n", + " tokenizer_path: Path = Path(\"../../References/chatglm3-6b\"),\n", + " chunk_size: int = (2049 * 16384),\n", + " fast_dev_run: bool = False,\n", + ") -> None:\n", + " from litdata.processing.data_processor import DataProcessor\n", + "\n", + " tokenizer_path = extend_checkpoint_dir(tokenizer_path)\n", + " tokenizer = Tokenizer(tokenizer_path)\n", + " data_recipe = TinyStoriesZhDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)\n", + " data_processor = DataProcessor(\n", + " input_dir=str(input_dir),\n", + " output_dir=str(output_dir),\n", + " fast_dev_run=fast_dev_run,\n", + " num_workers=os.cpu_count(),\n", +
num_downloaders=1,\n", + " )\n", + "\n", + " start_time = time.time()\n", + " data_processor.run(data_recipe)\n", + " elapsed_time = time.time() - start_time\n", + " print(f\"Time taken: {elapsed_time:.2f} seconds\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 先用eval数据集测试\n", + "\n", + "(也可以设置`fast_dev_run=True`来处理更少的数据,尤其是debug时)\n", + "\n", + "执行完可以在`processed_data/eval`目录下看到生成的chunk文件。\n", + "\n", + "比较一下可以发现从原先的`83m`的`.jsonl`文件压缩到了`13m`的`.bin`," + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prepare(\n", + " input_dir=Path(\"../../Data/TinyStoriesChinese/raw_data/eval\"),\n", + " output_dir=Path(\"../../Data/TinyStoriesChinese/processed_data/eval\"),\n", + " tokenizer_path=Path(\"../../References/chatglm3-6b\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 处理train数据集\n", + "在32核的CPU上处理`train`数据集耗时不到`1min`。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "prepare(\n", + " input_dir=Path(\"../../Data/TinyStoriesChinese/raw_data/train\"),\n", + " output_dir=Path(\"../../Data/TinyStoriesChinese/processed_data/train\"),\n", + " tokenizer_path=Path(\"../../References/chatglm3-6b\"),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 小结\n", + "\n", + "1. 数据预处理的逻辑主要是将文本转换为数字序列,以便于模型处理。\n", + "2. 通过litgpt的Tokenizer可以方便的实现文本到数字序列的转换。\n", + "3. litgpt提供了数据处理的API,可以方便的封装我们的数据处理逻辑。\n", + "4. 
数据处理的结果可以通过压缩文件的方式减少存储空间。" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "bigmodel", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Journey/Day03/__pycache__/packed_dataset.cpython-310.pyc b/Journey/Day03/__pycache__/packed_dataset.cpython-310.pyc new file mode 100644 index 0000000..dbf5a2c Binary files /dev/null and b/Journey/Day03/__pycache__/packed_dataset.cpython-310.pyc differ diff --git a/Journey/Day03/packed_dataset.py b/Journey/Day03/packed_dataset.py new file mode 100644 index 0000000..ab9a2e5 --- /dev/null +++ b/Journey/Day03/packed_dataset.py @@ -0,0 +1,260 @@ +# Very loosely inspired by indexed_dataset in Fairseq, Megatron +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/data/indexed_dataset.py + + +import os +import random +import struct + +import numpy as np +import torch +from torch.utils.data import IterableDataset, get_worker_info + +dtypes = { + 1: np.uint8, + 2: np.int8, + 3: np.int16, + 4: np.int32, + 5: np.int64, + 6: np.float32, + 7: np.float64, + 8: np.uint16, +} + + +def code(dtype): + for k in dtypes: + if dtypes[k] == dtype: + return k + raise ValueError(dtype) + + +HDR_MAGIC = b"LITPKDS" +HDR_SIZE = 24 # bytes + + +class PackedDataset(IterableDataset): + def __init__( + self, + filenames, + n_chunks, + block_size, + seed=12345, + shuffle=True, + wrap=False, + num_processes=1, + process_rank=0, + ): + self._filenames = filenames + self._n_chunks = n_chunks + self._block_size = block_size + self._seed = seed + self._shuffle = shuffle + self._wrap = wrap + self._num_processes = num_processes + self._process_rank = process_rank + + 
def __iter__(self): + worker_info = get_worker_info() + num_workers = worker_info.num_workers if worker_info is not None else 1 + worker_id = worker_info.id if worker_info is not None else 0 + num_shards = num_workers * self._num_processes + shard_id = self._process_rank * num_workers + worker_id + + max_num_files = len(self._filenames) // num_shards * num_shards + filenames = self._filenames[shard_id:max_num_files:num_shards] + + return PackedDatasetIterator( + filenames=filenames, + n_chunks=self._n_chunks, + block_size=self._block_size, + seed=self._seed, + shuffle=self._shuffle, + wrap=self._wrap, + ) + + +class PackedDatasetBuilder(object): + def __init__( + self, outdir, prefix, chunk_size, sep_token, dtype="auto", vocab_size=None + ): + if dtype == "auto": + if vocab_size is None: + raise ValueError("vocab_size cannot be None when dtype='auto'") + if vocab_size is not None and vocab_size < 65500: + self._dtype = np.uint16 + else: + self._dtype = np.int32 + else: + self._dtype = dtype + self._counter = 0 + self._chunk_size = chunk_size + self._outdir = outdir + self._prefix = prefix + self._sep_token = sep_token + self._arr = np.zeros(self._chunk_size, dtype=self._dtype) + self._arr.fill(self._sep_token) + self._idx = 0 + self._version = 1 + self._filenames = [] + + def _write_chunk(self): + filename = f"{self._prefix}_{self._counter:010d}.bin" + filename = os.path.join(self._outdir, filename) + + with open(filename, "wb") as f: + f.write(HDR_MAGIC) + f.write(struct.pack(" self._chunk_size: + part_len = self._chunk_size - self._idx + self._arr[self._idx : self._idx + part_len] = arr[:part_len] + self._write_chunk() + arr = arr[part_len:] + + arr_len = arr.shape[0] + self._arr[self._idx : self._idx + arr_len] = arr + self._idx += arr_len + + def write_reminder(self): + self._write_chunk() + + +class PackedDatasetIterator: + def __init__(self, filenames, n_chunks, block_size, seed, shuffle, wrap): + self._seed = seed + self._shuffle = shuffle + self._rng = 
np.random.default_rng(seed) if shuffle else None + self._block_idxs = None + + self._wrap = wrap + + # TODO: instead of filenames, we could have a single text stream + # (or text file) with the sequence of all files to be + # fetched/loaded. + self._filenames = filenames + self._file_idx = 0 + + self._n_chunks = n_chunks + + self._dtype = None + self._block_size = block_size + self._n_blocks = None + + self._mmaps = [] + self._buffers = [] + + self._block_idxs = [] + self._curr_idx = 0 + + self._load_n_chunks() + + def _read_header(self, path): + with open(path, "rb") as f: + magic = f.read(len(HDR_MAGIC)) + assert magic == HDR_MAGIC, "File doesn't match expected format." + version = struct.unpack(" len(self._filenames[self._file_idx :]): + # if not self._wrap: + # raise StopIteration + self._file_idx = 0 + + for i in range(self._n_chunks): + filename = self._filenames[self._file_idx + i] + if self._dtype is None: + self._dtype, self._chunk_size = self._read_header(filename) + self._n_blocks = self._chunk_size // self._block_size + # TODO: check header matches with previous files + mmap = np.memmap(filename, mode="r", order="C", offset=HDR_SIZE) + self._mmaps.append(mmap) + self._buffers.append(memoryview(mmap)) + + self._file_idx += self._n_chunks + n_all_blocks = self._n_chunks * self._n_blocks + + self._block_idxs = ( + self._rng.permutation(n_all_blocks) + if self._shuffle + else range(n_all_blocks) + ) + + self._curr_idx = 0 + + def __del__(self): + self._close_mmaps() + del self._mmaps + del self._buffers + + def __iter__(self): + return self + + def __next__(self): + if self._curr_idx >= len(self._block_idxs): + self._load_n_chunks() + # TODO: trigger fetching next next n_chunks if remote + block_idx = self._block_idxs[self._curr_idx] + chunk_id = block_idx // self._n_blocks + buffer = self._buffers[chunk_id] + elem_id = (block_idx % self._n_blocks) * self._block_size + offset = np.dtype(self._dtype).itemsize * elem_id + arr = np.frombuffer( + buffer, 
dtype=self._dtype, count=self._block_size, offset=offset + ) + self._curr_idx += 1 + return torch.from_numpy(arr.astype(np.int64)) + + +class CombinedDataset(IterableDataset): + def __init__(self, datasets, seed, weights=None): + self._seed = seed + self._datasets = datasets + self._weights = weights + n_datasets = len(datasets) + if weights is None: + self._weights = [1 / n_datasets] * n_datasets + + def __iter__(self): + return CombinedDatasetIterator(self._datasets, self._seed, self._weights) + + +class CombinedDatasetIterator: + def __init__(self, datasets, seed, weights): + self._datasets = [iter(el) for el in datasets] + self._weights = weights + self._rng = random.Random(seed) + + def __next__(self): + (dataset,) = self._rng.choices(self._datasets, weights=self._weights, k=1) + return next(dataset) diff --git a/Journey/Day03/prepare_tinystories_zh.py b/Journey/Day03/prepare_tinystories_zh.py new file mode 100644 index 0000000..5570d45 --- /dev/null +++ b/Journey/Day03/prepare_tinystories_zh.py @@ -0,0 +1,66 @@ +import glob +import os +import json +from litgpt import Tokenizer +from pathlib import Path +from packed_dataset import PackedDatasetBuilder + + +def prepare_dataset( + source_path: Path, + tokenizer_dir: Path, + destination_path: Path, + chunk_size: int, + match: str = "", +) -> None: + """Prepare the "Red Pajama" dataset using the original tokenizer.""" + import zstandard as zstd + + destination_path.mkdir(parents=True, exist_ok=True) + + tokenizer = Tokenizer(tokenizer_dir) + + for set_name, pattern in filename_sets.items(): + if match and match not in set_name: + continue + + is_cc = set_name == "common_crawl" + + filenames = glob.glob(os.path.join(source_path, pattern), recursive=True) + + if not filenames: + raise RuntimeError( + f"No files matching {pattern} found at {source_path}. \nMake sure you download the data, e.g. 
wget -i" + " https://data.together.xyz/redpajama-data-1T/v1.0.0/urls.txt or through" + " \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T" + " \nhttps://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample \n" + ) + + builder = PackedDatasetBuilder( + outdir=destination_path, + prefix=set_name, + chunk_size=chunk_size, + sep_token=tokenizer.eos_id, + dtype="auto", + vocab_size=tokenizer.vocab_size, + ) + + for name in filenames: + filepath = source_path / name + + print(f"Processing {name}") + + if is_cc: + with zstd.open(open(filepath, "rb"), "rt", encoding="utf-8") as f: + for row in tqdm(f): + text = json.loads(row)["text"] + text_ids = tokenizer.encode(text) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) + else: + with open(filepath, encoding="utf-8") as f: + for row in tqdm(f): + text = json.loads(row)["text"] + text_ids = tokenizer.encode(text) + builder.add_array(np.array(text_ids, dtype=builder.dtype)) + + builder.write_reminder() diff --git a/References/chatglm3-6b b/References/chatglm3-6b new file mode 160000 index 0000000..91a0561 --- /dev/null +++ b/References/chatglm3-6b @@ -0,0 +1 @@ +Subproject commit 91a0561caa089280e94bf26a9fc3530482f0fe60