
Create notebooks locally #745

Open

Seneko opened this issue Sep 22, 2024 · 0 comments

Creating the notebooks locally failed because some characters in the .yml and .mdx files could not be decoded with the default encoding. Error: UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 126: character maps to <undefined>
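
The 'charmap' codec in the traceback is Windows' default locale encoding (cp1252), which open() uses when no encoding is passed; byte 0x81 is unmapped in cp1252, hence the failure on UTF-8 course files. A minimal sketch of the failure and the fix (the file path is hypothetical):

# Platform-default read fails on Windows for files containing bytes
# that are unmapped in cp1252:
#     with open("chapters/de/chapter1/1.mdx") as f:  # UnicodeDecodeError
#         content = f.read()
# Requesting UTF-8 explicitly decodes the file correctly:
with open("chapters/de/chapter1/1.mdx", encoding="utf-8") as f:
    content = f.read()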

A slight modification of the code (passing encoding="utf-8" explicitly whenever a file is opened) resolved the errors and created the notebooks:

import argparse
import os
import re
import nbformat
import shutil
import yaml

from pathlib import Path

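# Markers used in the course MDX files to gate framework-specific content:
#     {#if fw === 'pt'} ... {:else} ... {/if}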
re_framework_test = re.compile(r"^{#if\s+fw\s+===\s+'([^']+)'}\s*$")
re_framework_else = re.compile(r"^{:else}\s*$")
re_framework_end = re.compile(r"^{/if}\s*$")

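# HTML tag patterns (carried over from the original script; not referenced
# elsewhere in this version).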
re_html_line = re.compile(r"^<[^>]*/>\s*$")
re_html_tag = re.compile(r"<([^/>]*)>\s*$")

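# Fenced code blocks in the MDX: ```py / ```python (optionally "no-format")
# open a Python cell, ```py out / ```python out marks its expected output,
# and a bare ``` closes the block.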
re_python_code = re.compile(r"^```(?:py|python|py no\-format|python no\-format)\s*$")
re_output_code = re.compile(r"^```(?:py|python)\s+out\s*$")
re_end_code = re.compile(r"^```\s*$")

frameworks = {"pt": "PyTorch", "tf": "TensorFlow"}

PATH_TO_COURSE = Path("chapters/")
# Languages to exclude from the notebook generation because their notebooks
# were created manually
LANGS_TO_EXCLUDE = ["fr"]


def read_and_split_frameworks(fname):
    """
    Read the MDX in fname and create two versions (one per framework) when the
    file contains framework-specific blocks; otherwise return the content as is.
    """
    with open(fname, "r", encoding="utf-8") as f:
        content = f.readlines()

    contents = {"pt": [], "tf": []}

    differences = False
    current_content = []
    for line in content:
        if (match := re_framework_test.search(line)) is not None:
            differences = True
            framework = match.groups()[0]
            for key in contents:
                contents[key].extend(current_content)
            current_content = []
        elif re_framework_else.search(line) is not None:
            contents[framework].extend(current_content)
            current_content = []
            framework = "pt" if framework == "tf" else "tf"
        elif re_framework_end.search(line) is not None:
            contents[framework].extend(current_content)
            current_content = []
        else:
            current_content.append(line)

    if len(current_content) > 0:
        for key in contents:
            contents[key].extend(current_content)

    if differences:
        return {k: "".join(content) for k, content in contents.items()}
    else:
        return "".join(content)


def extract_cells(content):
    """
    Extract the code/output cells from content.
    """
    cells = []
    current_cell = None
    is_output = False
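    # current_cell is None outside fenced blocks; is_output records whether the
    # open block holds expected output, which is attached to the previous code
    # cell as a (code, output) tuple instead of becoming a new cell.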
    for line in content.split("\n"):
        if re_python_code.search(line) is not None:
            is_output = False
            current_cell = []
        elif re_output_code.search(line) is not None:
            is_output = True
            current_cell = []
        elif re_end_code.search(line) is not None and current_cell is not None:
            cell = "\n".join(current_cell)
            if is_output:
                if not isinstance(cells[-1], tuple):
                    cells[-1] = (cells[-1], cell)
            else:
                cells.append(cell)
            current_cell = None
        elif current_cell is not None:
            current_cell.append(line)

    return cells


def convert_to_nb_cell(cell):
    """
    Convert a cell (either plain code or a (code, output) tuple) into a proper notebook cell.
    """
    nb_cell = {"cell_type": "code", "execution_count": None, "metadata": {}}
    if isinstance(cell, tuple):
        nb_cell["source"] = cell[0]
        nb_cell["outputs"] = [
            nbformat.notebooknode.NotebookNode(
                {
                    "data": {"text/plain": cell[1]},
                    "execution_count": None,
                    "metadata": {},
                    "output_type": "execute_result",
                }
            )
        ]
    else:
        nb_cell["source"] = cell
        nb_cell["outputs"] = []
    return nbformat.notebooknode.NotebookNode(nb_cell)


def nb_cell(source, code=True):
    if not code:
        return nbformat.notebooknode.NotebookNode({"cell_type": "markdown", "source": source, "metadata": {}})
    return nbformat.notebooknode.NotebookNode(
        {"cell_type": "code", "metadata": {}, "source": source, "execution_count": None, "outputs": []}
    )


def build_notebook(fname, title, output_dir="."):
    """
    Build the notebook for fname with a given title in output_dir.
    """
    sections = read_and_split_frameworks(fname)
    sections_with_accelerate = [
        "chapter3/3",  # "Fine-tuning a model with the Trainer API or Keras",
        "chapter3/4",  # "A full training",
        "chapter7/2_pt",  # "Token classification (PyTorch)",
        "chapter7/3_pt",  # "Fine-tuning a masked language model (PyTorch)"
        "chapter7/4_pt",  # "Translation (PyTorch)"
        "chapter7/5_pt",  # "Summarization (PyTorch)",
        "chapter7/6_pt",  # "Training a causal language model from scratch (PyTorch)"
        "chapter7/7_pt",  # "Question answering (PyTorch)"
    ]
    sections_with_hf_hub = [
        "chapter4/3_pt",  # "Sharing pretrained models (PyTorch)"
        "chapter4/3_tf",  # "Sharing pretrained models (TensorFlow)"
        "chapter5/5",  # "Creating your own dataset"
        "chapter7/2_pt",  # "Token classification (PyTorch)"
        "chapter7/2_tf",  # "Token classification (TensorFlow)"
        "chapter6/2",  # "Training a new tokenizer from an old one"
        "chapter7/3_pt",  # "Fine-tuning a masked language model (PyTorch)"
        "chapter7/3_tf",  # "Fine-tuning a masked language model (TensorFlow)"
        "chapter7/4_pt",  # "Translation (PyTorch)"
        "chapter7/4_tf",  # "Translation (TensorFlow)"
        "chapter7/5_pt",  # "Summarization (PyTorch)"
        "chapter7/5_tf",  # "Summarization (TensorFlow)"
        "chapter7/6_pt",  # "Training a causal language model from scratch (PyTorch)"
        "chapter7/6_tf",  # "Training a causal language model from scratch (TensorFlow)"
        "chapter7/7_pt",  # "Question answering (PyTorch)"
        "chapter7/7_tf",  # "Question answering (TensorFlow)"
        "chapter8/2",  # "What to do when you get an error"
    ]
    sections_with_faiss = [
        "chapter5/6_pt",  # "Semantic search with FAISS (PyTorch)"
        "chapter5/6_tf",  # "Semantic search with FAISS (TensorFlow)"
    ]
    sections_with_gradio = [
        "chapter9/2",  # "Building your first demo"
        "chapter9/3",  # "Understanding the Interface class"
        "chapter9/4",  # "Sharing demos with others"
        "chapter9/5",  # "Integrations with the Hugging Face Hub"
        "chapter9/6",  # "Advanced Interface features"
        "chapter9/7",  # "Introduction to Blocks"
    ]
    stem = Path(fname).stem
    if not isinstance(sections, dict):
        contents = [sections]
        titles = [title]
        fnames = [f"section{stem}.ipynb"]
        section_names = [f"{Path(fname).parent.stem}/{stem}"]
    else:
        contents = []
        titles = []
        fnames = []
        section_names = []
        for key, section in sections.items():
            contents.append(section)
            titles.append(f"{title} ({frameworks[key]})")
            fnames.append(f"section{stem}_{key}.ipynb")
            section_names.append(f"{Path(fname).parent.stem}/{stem}_{key}")

    for title, content, fname, section_name in zip(titles, contents, fnames, section_names):
        cells = extract_cells(content)
        if len(cells) == 0:
            continue

        nb_cells = [
            nb_cell(f"# {title}", code=False),
            nb_cell("Install the Transformers, Datasets, and Evaluate libraries to run this notebook.", code=False),
        ]

        # Install cell
        installs = ["!pip install datasets evaluate transformers[sentencepiece]"]
        if section_name in sections_with_accelerate:
            installs.append("!pip install accelerate")
            installs.append("# To run the training on TPU, you will need to uncomment the following line:")
            installs.append(
                "# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl"
            )
        if section_name in sections_with_hf_hub:
            installs.append("!apt install git-lfs")
        if section_name in sections_with_faiss:
            installs.append("!pip install faiss-gpu")
        if section_name in sections_with_gradio:
            installs.append("!pip install gradio")

        nb_cells.append(nb_cell("\n".join(installs)))

        if section_name in sections_with_hf_hub:
            nb_cells.extend(
                [
                    nb_cell(
                        "You will need to setup git, adapt your email and name in the following cell.", code=False
                    ),
                    nb_cell(
                        '!git config --global user.email "you@example.com"\n!git config --global user.name "Your Name"'
                    ),
                    nb_cell(
                        "You will also need to be logged in to the Hugging Face Hub. Execute the following and enter your credentials.",
                        code=False,
                    ),
                    nb_cell("from huggingface_hub import notebook_login\n\nnotebook_login()"),
                ]
            )
        nb_cells += [convert_to_nb_cell(cell) for cell in cells]
        metadata = {"colab": {"name": title, "provenance": []}}
        nb_dict = {"cells": nb_cells, "metadata": metadata, "nbformat": 4, "nbformat_minor": 4}
        notebook = nbformat.notebooknode.NotebookNode(nb_dict)
        os.makedirs(output_dir, exist_ok=True)
        nbformat.write(notebook, os.path.join(output_dir, fname), version=4)


def get_titles(language):
    """
    Parse the _toctree.yml file to get the mapping from section filename to title.
    """
    # Specify utf-8 encoding explicitly when opening the YAML file
    yaml_path = os.path.join(f"chapters/{language}", "_toctree.yml")
    with open(yaml_path, "r", encoding="utf-8") as f:  # Ensure utf-8 encoding
        table = yaml.safe_load(f)
    result = {}
    for entry in table:
        for section in entry["sections"]:
            section_title = section["title"]
            if "local_fw" in section:
                section_names = section["local_fw"]
                result[section_names["pt"]] = section_title
                result[section_names["tf"]] = section_title
            else:
                section_name = section["local"]
                result[section_name] = section_title
    return {k: v for k, v in result.items() if "quiz" not in v}


def create_notebooks(language, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    for folder in os.listdir(output_dir):
        if folder.startswith("chapter"):
            shutil.rmtree(os.path.join(output_dir, folder))
    titles = get_titles(language)
    for fname, title in titles.items():
        build_notebook(
            os.path.join(f"chapters/{language}", f"{fname}.mdx"),
            title,
            os.path.join(output_dir, Path(fname).parent),
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", type=str, help="Where to output the notebooks")
    args = parser.parse_args()

    languages = [f.stem for f in PATH_TO_COURSE.iterdir() if f.is_dir()]

    for language in languages:
        if language in LANGS_TO_EXCLUDE:
            continue
        language_output_dir = f"{args.output_dir}/{language}"
        create_notebooks(language, language_output_dir)
        # Remove empty notebook folders
        if not any(Path(language_output_dir).iterdir()):
            shutil.rmtree(language_output_dir)
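
With explicit utf-8 encodings in place, the script runs end to end. Assuming it is saved as utils/generate_notebooks.py in the course repository and run from the repo root (so that the chapters/ path resolves), the invocation would look like:

python utils/generate_notebooks.py --output_dir notebooks

The output folder name is arbitrary; the script creates one subfolder per language inside it.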