From 0a863af8ebb315b1455300210c2027049302d2af Mon Sep 17 00:00:00 2001 From: Mishig Davaadorj Date: Tue, 25 Jun 2024 16:18:20 +0100 Subject: [PATCH] add transformers --- .github/workflows/build_embeddings.yml | 2 ++ src/doc_builder/build_embeddings.py | 18 +++++------------- src/doc_builder/utils.py | 7 +++++++ 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/.github/workflows/build_embeddings.yml b/.github/workflows/build_embeddings.yml index 6a9b9d4f..e428d7dc 100644 --- a/.github/workflows/build_embeddings.yml +++ b/.github/workflows/build_embeddings.yml @@ -24,6 +24,8 @@ jobs: doc_folder: docs/source - repo_id: huggingface/huggingface_hub doc_folder: docs/source/en + - repo_id: huggingface/transformers + doc_folder: docs/source/en concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true diff --git a/src/doc_builder/build_embeddings.py b/src/doc_builder/build_embeddings.py index e8645ab7..923a7693 100644 --- a/src/doc_builder/build_embeddings.py +++ b/src/doc_builder/build_embeddings.py @@ -29,14 +29,8 @@ from .autodoc import autodoc_markdown, resolve_links_in_text from .convert_md_to_mdx import process_md from .convert_rst_to_mdx import find_indent, is_empty_line -from .meilisearch_helper import ( - add_embeddings_to_db, - create_embedding_db, - delete_embedding_db, - get_meili_chunks, - swap_indexes, -) -from .utils import read_doc_config +from .meilisearch_helper import add_embeddings_to_db, create_embedding_db, delete_embedding_db, swap_indexes +from .utils import chunk_list, read_doc_config Chunk = namedtuple("Chunk", "text source package_name") @@ -467,11 +461,9 @@ def build_embeddings( # Step 3: push embeddings to vector database (meilisearch) client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key) - - payloads_embeddings = get_meili_chunks(embeddings) - - for payload_embeddings in tqdm(payloads_embeddings): - add_embeddings_to_db(client, MEILI_INDEX_TEMP, payload_embeddings) + ITEMS_PER_CHUNK = 5000 # a value that was found experimentally + for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading data to meilisearch"): + add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings) def clean_meilisearch(meilisearch_key: str): diff --git a/src/doc_builder/utils.py b/src/doc_builder/utils.py index cd0d1048..25637e61 100644 --- a/src/doc_builder/utils.py +++ b/src/doc_builder/utils.py @@ -193,3 +193,10 @@ def sveltify_file_route(filename): # Replace the '{name}.mdx' with '{name}/+page.svelte' return filename.rsplit(".", 1)[0] + "/+page.svelte" return filename + + +def chunk_list(lst, n): + """ + Create a list of chunks + """ + return [lst[i : i + n] for i in range(0, len(lst), n)]