fix typos (#3543)
RainRat authored Jul 19, 2023
1 parent ddb852f commit e421d1d
Showing 28 changed files with 50 additions and 50 deletions.
2 changes: 1 addition & 1 deletion data/datasets/nsfw_selfharm_reddit/dataset-cookbook.ipynb
@@ -38,7 +38,7 @@
"\n",
"this list was build from https://anvaka.github.io/redsim. Can be used to expand the list of favourable subreddits.\n",
"\n",
"takeing these for now"
"taking these for now"
]
},
{
2 changes: 1 addition & 1 deletion data/datasets/oa_leet10k/oa_leet10k.ipynb
@@ -7,7 +7,7 @@
"source": [
"Takes this Kaggle dataset 'leetcode-solutions'\n",
"https://www.kaggle.com/datasets/erichartford/leetcode-solutions, and turns them into basic\n",
"dialogue using a preset list of user prompt tempaltes."
"dialogue using a preset list of user prompt templates."
]
},
{
2 changes: 1 addition & 1 deletion data/datasets/poetry_instruction/README.md
@@ -12,7 +12,7 @@ Dataset Structure This dataset follows the OA format, which is:

INSTRUCTION (string): The user asks for a poem (from a variety of premade
prompts) with topics (tags). If the given poem has no tags, the user asks for a
-poem on it's own.
+poem on its own.

RESPONSE (string): The assistant replies with the poem and title (from a variety
of premade prompts).
2 changes: 1 addition & 1 deletion data/datasets/recipes/tasty_recipes.ipynb
@@ -5,7 +5,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Takes this Kaggle dataset 'Recipes from Tasty' https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty?select=ingredient_and_instructions.json, and turns them into basic dialogue using a preset list of user prompt tempaltes."
"Takes this Kaggle dataset 'Recipes from Tasty' https://www.kaggle.com/datasets/zeeenb/recipes-from-tasty?select=ingredient_and_instructions.json, and turns them into basic dialogue using a preset list of user prompt templates."
]
},
{
2 changes: 1 addition & 1 deletion data/datasets/safety_directory/child_help/child_help.py
@@ -951,7 +951,7 @@
"Parent-Child Support Line": {
"region": "Hong Kong (China)",
"page": "https://childhelplineinternational.org/hong-kong-china-parent-child-support-line/",
"description": "Operated by Action Againt Abuse (ACA), the Parent-Child Support Line provides service where parents, children, professionals and the public can call the hotline 2755 1122, or go to the ACA centre to report suspected child abuse cases or ask questions about any issues they are facing. It is also a support and hotline for children to express their voices and opinions. The personal data and case content of the data provider/reporter are kept strictly confidential.",
"description": "Operated by Action Against Abuse (ACA), the Parent-Child Support Line provides service where parents, children, professionals and the public can call the hotline 2755 1122, or go to the ACA centre to report suspected child abuse cases or ask questions about any issues they are facing. It is also a support and hotline for children to express their voices and opinions. The personal data and case content of the data provider/reporter are kept strictly confidential.",
"contacts": {
"Website": {"type": "website", "link": "https://www.aca.org.hk/index.php#.YmRbANNBw-Q"},
"116 111": {"type": "phone", "link": "tel:"},
6 changes: 3 additions & 3 deletions data/datasets/tv_dialogue/imsdb.ipynb
@@ -231,9 +231,9 @@
" text += f\"{speaker}\\r\\n\"\n",
" if not re.findall(r\"\\[.+?\\] .+?\\r\\n\\r\\n\\[.+?\\] .+?\\r\\n\\r\\n\", text):\n",
" return \"\"\n",
" first_occurance = re.findall(r\"\\[.+?\\] \", text)[0]\n",
" if len(re.findall(re.escape(first_occurance), text)) == 1:\n",
" text = re.sub(re.escape(first_occurance), f\"{first_occurance[1:-2]}\\r\\n\", text)\n",
" first_occurrence = re.findall(r\"\\[.+?\\] \", text)[0]\n",
" if len(re.findall(re.escape(first_occurrence), text)) == 1:\n",
" text = re.sub(re.escape(first_occurrence), f\"{first_occurrence[1:-2]}\\r\\n\", text)\n",
"\n",
" text = text.replace(\"&\", \"&\")\n",
" text = \"\\r\\n\".join(text.splitlines())\n",
4 changes: 2 additions & 2 deletions data/datasets/zhihu-kol/convert_parquet.py
@@ -3,7 +3,7 @@
import pandas as pd


-def reformat_csv_to_openassitant(df: pd.DataFrame) -> pd.DataFrame:
+def reformat_csv_to_openassistant(df: pd.DataFrame) -> pd.DataFrame:
"""
Reformat the downloaded CSV into either Instruction or Text format
so that it could be directly ingested into the training pipeline.
@@ -44,6 +44,6 @@ def reformat_csv_to_openassitant(df: pd.DataFrame) -> pd.DataFrame:
input_csv = "zhihu.csv"
# Create a pandas dataframe from your dataset file(s)
df = pd.read_csv(input_csv) # or any other way
-df = reformat_csv_to_openassitant(df)
+df = reformat_csv_to_openassistant(df)
# Save the file in the Parquet format
df.to_parquet("dataset.parquet", row_group_size=100, engine="pyarrow", index=False)
4 changes: 2 additions & 2 deletions data/datasets/zhihu-kol/main.py
@@ -155,7 +155,7 @@ def get_answer_content(qid: str, aid) -> str:
return content


-def reformat_csv_to_openassitant(df: pd.DataFrame) -> pd.DataFrame:
+def reformat_csv_to_openassistant(df: pd.DataFrame) -> pd.DataFrame:
"""
Reformat the downloaded CSV into either Instruction or Text format
so that it could be directly ingested into the training pipeline.
@@ -226,7 +226,7 @@ def start(qid: str, aid: str):
start(qid, aid)
multitasking.wait_for_tasks()
df["回答内容"] = df["问题ID"].apply(lambda x: content_list[x])
-updated_df = reformat_csv_to_openassitant(df)
+updated_df = reformat_csv_to_openassistant(df)
updated_df.to_csv(csv_path, encoding="utf-8-sig", index=None)
bar.close()
print(f"url_token 为 {url_token} 的用户回答数据已存储到文件:{csv_path}")
@@ -8,7 +8,7 @@ image: https://img.youtube.com/vi/5IymlBZDw-0/0.jpg

import ReactPlayer from "react-player";

-Livestream playing around with Open Assistant and AI allignement :)
+Livestream playing around with Open Assistant and AI alignment :)

https://open-assistant.io/chat

2 changes: 1 addition & 1 deletion docs/docs/research/retrieval.md
@@ -139,7 +139,7 @@ i.e. the 7B can utilize 40 nearest neighbor chunks, a 172M model only 10 NNs.
### Bertsch et al. 2023: Unlimiformer: Long-Range Transformers with Unlimited Length Input

Idea: Use retrieval to actually maximize overlap of "query embeddings" with
-embeddings from an encoder (in a encoder-decoder architecture). Essentially it
+embeddings from an encoder (in an encoder-decoder architecture). Essentially it
is an ideal approximation of the softmax in the Cross-Attention over all
previous tokens (in the encoder inputs).

2 changes: 1 addition & 1 deletion inference/worker/chat_chain.py
@@ -440,7 +440,7 @@ def handle_conversation(
id="1",
chat_id="1",
parent_id=None,
content="Hello, my name is Open Assisstant, how i can help you today?",
content="Hello, my name is Open Assistant, how i can help you today?",
created_at=datetime.datetime.now(),
role="assistant",
state=inference.MessageState.complete,
2 changes: 1 addition & 1 deletion model/model_training/configs/ppo_config.yaml
@@ -18,7 +18,7 @@ model:
tokenizer:
tokenizer_path:
truncation_side: "left" # AKo: changed this to "left", otherwise it sees only the beginning of conversations
-padding_side: "left" # use with rotary positional embedidngs
+padding_side: "left" # use with rotary positional embeddings

optimizer:
name: "adamw"
4 changes: 2 additions & 2 deletions model/model_training/tests/test_dialogue_data_collator.py
@@ -53,7 +53,7 @@ def test_dataset_entry_no_context(pythia_tokenizer):
# check if targets are as expected
assert pythia_tokenizer.decode(batch.targets[0]) == expected_decoded_targets[0]

-# check if masking is correct. Note that we mask the system aswell
+# check if masking is correct. Note that we mask the system as well
assert pythia_tokenizer.decode(batch.input_ids[0][batch.label_masks[0]]) == expected_masked[0]


@@ -114,7 +114,7 @@ def test_dataset_entry(pythia_tokenizer):
assert pythia_tokenizer.decode(batch.targets[0]) == expected_decoded_targets[0]
assert pythia_tokenizer.decode(batch.targets[1]) == expected_decoded_targets[1]

-# check if masking is correct. Note that we mask the system aswell
+# check if masking is correct. Note that we mask the system as well
assert pythia_tokenizer.decode(batch.input_ids[0][batch.label_masks[0]]) == expected_masked[0]
assert pythia_tokenizer.decode(batch.input_ids[1][batch.label_masks[1]]) == expected_masked[1]

2 changes: 1 addition & 1 deletion model/model_training/tools/sample_rm_data.py
@@ -1,5 +1,5 @@
"""
-Recursive method to travese down the the conversation tree
+Recursive method to traverse down the conversation tree
Use fastlangid for language identification :
>> pip install fastlangid
@@ -271,7 +271,7 @@
"metadata": {},
"source": [
"#### Convert red-teaming data to prosocial format\n",
"- Red-teaming data is laballed for the whole conversation\n",
"- Red-teaming data is labelled for the whole conversation\n",
"- Pro-social data is labelled per dialog \n",
"- The `episode_done` flag indicates the end of a conversation"
]
@@ -544,7 +544,7 @@
}
],
"source": [
"# Test to see if it was sucessful\n",
"# Test to see if it was successful\n",
"table = pq.read_table(\"output.parquet\")\n",
"table.to_pandas()"
],
@@ -23,7 +23,7 @@
"id": "o0lAqmWhsiUe"
},
"source": [
"The goal of this notebook is to use data argumentation to have data on improving essays. The way this is done is by taking a template \"good\" essay and making step by step changes that make it worse and add intructions on how to fix it."
"The goal of this notebook is to use data argumentation to have data on improving essays. The way this is done is by taking a template \"good\" essay and making step by step changes that make it worse and add instructions on how to fix it."
]
},
{
@@ -165,7 +165,7 @@
},
"outputs": [],
"source": [
"# Make stucture error (shuffle one paragraph with another)\n",
"# Make structure error (shuffle one paragraph with another)\n",
"essay_paragraphs = essay.split(\"\\n\\n\") # Splitting a String by newline character (\\n)\n",
"\n",
"rand1 = random.randint(0, len(essay_paragraphs) - 1)\n",
@@ -286,7 +286,7 @@
}
],
"source": [
"# Prints intrcutions (final step)\n",
"# Prints instructions (final step)\n",
"for i in instructions:\n",
" print(i)\n",
"instructions.clear()"
@@ -198,7 +198,7 @@
" ## make sure that characters are the same\n",
" if char_id_new == char_ids:\n",
" lineids_new = [int(lineid[1:]) for lineid in data[\"utterance\"][\"LineID\"]]\n",
" if lineids_new[0] == (lineids[-1] + 1): ##ensure continuety\n",
" if lineids_new[0] == (lineids[-1] + 1): ##ensure continuity\n",
" lineids.extend(lineids_new)\n",
" else:\n",
" break\n",
2 changes: 1 addition & 1 deletion notebooks/data-augmentation/unified-qa/unified-qa.ipynb
@@ -737,7 +737,7 @@
" conv.append(\"I have some common sense questions for you to answer.\")\n",
" conv.append(\"Okay, I can try to answer your questions while using common sense. Please provide the question.\")\n",
" conv.append(q)\n",
" conv.append(\"The commmon sense answer would be: \" + a)\n",
" conv.append(\"The common sense answer would be: \" + a)\n",
" return conv\n",
"\n",
"\n",
2 changes: 1 addition & 1 deletion notebooks/data-augmentation/wikidata-qa/README.md
@@ -41,7 +41,7 @@ print(wg.generate("Q115564437"))

>>> Questions and Answers on ChatGPT (also known as GPT-3.5, Generative Pre-trained Transformer):

-Q: What is the offical website for Generative Pre-trained Transformer?
+Q: What is the official website for Generative Pre-trained Transformer?
A: Its web address is: https://chat.openai.com/chat

Q: What's ChatGPT's license?
12 changes: 6 additions & 6 deletions notebooks/data-augmentation/wikidata-qa/wikidata.ipynb
@@ -318,7 +318,7 @@
" \"What's {name}'s capital city? Thank you in advance!\",\n",
" ],\n",
" \"P37\": [\n",
" \"Tell me what the offical language of {name} is?\",\n",
" \"Tell me what the official language of {name} is?\",\n",
" \"What language do they speak in {name}?\",\n",
" \"How do they speak in {name}?\",\n",
" \"What languages they understand in {name}?\",\n",
@@ -368,7 +368,7 @@
" \"Who crated {name}?\",\n",
" \"Who is {name}'s creator?\",\n",
" \"Who made {name}?\",\n",
" \"Who is reponsible for {name}?\",\n",
" \"Who is responsible for {name}?\",\n",
" ],\n",
" \"P225\": [\n",
" \"Describe {name} to me in latin.\",\n",
@@ -477,7 +477,7 @@
" \"Give me the URL for {name}.\",\n",
" \"What's the URL for {name}? Thanks!\",\n",
" \"What's {name}'s website?\",\n",
" \"What is the offical website for {name}?\",\n",
" \"What is the official website for {name}?\",\n",
" \"Can you tell me the link to {name}?\",\n",
" ],\n",
" \"P973\": [\n",
@@ -587,7 +587,7 @@
" \"P36\": [\"Do you know {pos} capital?\", \"What is {pos} capital called?\", \"What's the name of {pos} capital?\"],\n",
" \"P37\": [\n",
" \"Describe {pos} official language.\",\n",
" \"What is {pos} offical language?\",\n",
" \"What is {pos} official language?\",\n",
" \"What language do they speak there?\",\n",
" ],\n",
" \"P38\": [\n",
@@ -713,7 +713,7 @@
" \"P856\": [\n",
" \"Send me {pos} web address.\",\n",
" \"What's the address of {pos} website?\",\n",
" \"What is {pos} offical website?\",\n",
" \"What is {pos} official website?\",\n",
" \"Can you tell me the link to {obj}?\",\n",
" ],\n",
" \"P973\": [\n",
@@ -1180,7 +1180,7 @@
"Q: How big is Budim?\r\n",
"A: Budim's area is 52514 hectare\n",
"\n",
"Q: What is its offical website?\r\n",
"Q: What is its official website?\r\n",
"A: The URLs for Budapest are: https://budapest.hu and https://budapest.hu/sites/english/\n",
"\n",
"Q: Tell me who its governor is!\r\n",
10 changes: 5 additions & 5 deletions notebooks/data-augmentation/writing-prompt/writing_prompt.ipynb
@@ -38,7 +38,7 @@
"where `stripped_constraint` is the constraint found.\n",
"\n",
"* Answer beginning constraints: this constraint was imposed by the way the answer should start. \n",
"> Base template, starting with: {beggining} -> Rosey: Sure, here's a story about: {stripped_prompt}, starting with: {beggining}:\\n{story}\n",
"> Base template, starting with: {beginning} -> Rosey: Sure, here's a story about: {stripped_prompt}, starting with: {beginning}:\\n{story}\n",
"\n",
"where `beginning` is the first sentence of a story.\n",
"\n",
@@ -1009,7 +1009,7 @@
" dialogBase = \"\"\"User: write me a story about: {stripped_prompt}\"\"\"\n",
" dialog1 = \"\"\" -> Rosey: Sure, here's a story about: {stripped_prompt}:\\n{story}\"\"\"\n",
" dialog2 = \"\"\", {stripped_constraint} -> Rosey: Sure, here's a story about: {stripped_prompt}, {stripped_constraint}:\\n{story}\"\"\"\n",
" dialog3 = \"\"\", starting with: {beggining} -> Rosey: Sure, here's a story about: {stripped_prompt}, starting with: {beggining}:\\n{story}\"\"\"\n",
" dialog3 = \"\"\", starting with: {beginning} -> Rosey: Sure, here's a story about: {stripped_prompt}, starting with: {beginning}:\\n{story}\"\"\"\n",
" dialog4 = \"\"\", ending with: {ending} -> Rosey: Sure, here's a story about {stripped_prompt}: ending with: {ending}\\n{story}\"\"\"\n",
" dialog5 = \"\"\", where the middle of the story is about: {middle} -> Rosey: Sure, here's a story about: {stripped_prompt}, where the middle of the story is about: {middle}:\\n{story}\"\"\"\n",
"\n",
@@ -1041,7 +1041,7 @@
" stripped_prompt=strippedPrompt, stripped_constraint=strippedConstraint, story=story\n",
" )\n",
" dialogs.append(get_sample_dict(row.split, row.splitIndex, dialog))\n",
" dialog = dialogBeg + dialog3.format(stripped_prompt=strippedPrompt, story=story, beggining=beginning)\n",
" dialog = dialogBeg + dialog3.format(stripped_prompt=strippedPrompt, story=story, beginning=beginning)\n",
" dialogs.append(get_sample_dict(row.split, row.splitIndex, dialog))\n",
" dialog = dialogBeg + dialog4.format(stripped_prompt=strippedPrompt, story=story, ending=ending)\n",
" dialogs.append(get_sample_dict(row.split, row.splitIndex, dialog))\n",
@@ -1087,7 +1087,7 @@
" dialogBase = \"\"\"User: write me a story about: {stripped_prompt}\"\"\"\n",
" dialog1 = \"\"\" -> Rosey: Sure, here's a story about: {stripped_prompt}:\\n{story}\"\"\"\n",
" dialog2 = \"\"\", {stripped_constraint} -> Rosey: Sure, here's a story about: {stripped_prompt}, {stripped_constraint}:\\n{story}\"\"\"\n",
" dialog3 = \"\"\", starting with: {beggining} -> Rosey: Sure, here's a story about: {stripped_prompt}, starting with: {beggining}:\\n{story}\"\"\"\n",
" dialog3 = \"\"\", starting with: {beginning} -> Rosey: Sure, here's a story about: {stripped_prompt}, starting with: {beginning}:\\n{story}\"\"\"\n",
" dialog4 = \"\"\", ending with: {ending} -> Rosey: Sure, here's a story about {stripped_prompt}: ending with: {ending}\\n{story}\"\"\"\n",
" dialog5 = \"\"\", where the middle of the story is about: {middle} -> Rosey: Sure, here's a story about: {stripped_prompt}, where the middle of the story is about: {middle}:\\n{story}\"\"\"\n",
"\n",
@@ -1117,7 +1117,7 @@
" if beginning is not None:\n",
" beginning, middles, ending = extract_story_parts(story)\n",
" dialog = dialogBeg + dialog3.format(\n",
" stripped_prompt=strippedPrompt, story=story, beggining=beginning\n",
" stripped_prompt=strippedPrompt, story=story, beginning=beginning\n",
" )\n",
" dialogs.append(get_sample_dict(row.split, row.splitLineIndex, dialog))\n",
" dialog = dialogBeg + dialog4.format(stripped_prompt=strippedPrompt, story=story, ending=ending)\n",
4 changes: 2 additions & 2 deletions notebooks/detoxify-evaluation/README.md
@@ -1,7 +1,7 @@
# Detoxify evaluation

-[Detoxify](https://github.com/unitaryai/detoxify) is a open source model used to
-identify prompts as toxic
+[Detoxify](https://github.com/unitaryai/detoxify) is an open source model used
+to identify prompts as toxic

<img src="https://raw.githubusercontent.com/unitaryai/detoxify/master/examples.png" alt="Image from detoxify github that shows the example input/output of their model" />

2 changes: 1 addition & 1 deletion notebooks/detoxify-evaluation/detoxify-evaluation.ipynb
@@ -31,7 +31,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"[Detoxify](https://github.com/unitaryai/detoxify) is a open source model used to identify prompts as toxic\n",
"[Detoxify](https://github.com/unitaryai/detoxify) is an open source model used to identify prompts as toxic\n",
"\n",
"<img src=\"https://raw.githubusercontent.com/unitaryai/detoxify/master/examples.png\" alt=\"Image from detoxify github that shows the example input/output of their model\" />\n",
"\n",
8 changes: 4 additions & 4 deletions oasst-data/README.md
@@ -145,10 +145,10 @@ messages are those which have a `review_result` that is `false`.

Conversation threads are a linear lists of messages. THese objects can be
identified by the presence of the `"thread_id"` property which contains the UUID
-of the last message of the the thread (which can be used to reconstruct the
-thread by returning the list of ancestor messages up to the prompt root
-message). The message_id of the first message is normally also the id of the
-message-tree that contains the thread.
+of the last message of the thread (which can be used to reconstruct the thread
+by returning the list of ancestor messages up to the prompt root message). The
+message_id of the first message is normally also the id of the message-tree that
+contains the thread.

```json
{
```