Skip to content

Commit

Permalink
add transformers
Browse files Browse the repository at this point in the history
  • Loading branch information
mishig25 committed Jun 26, 2024
1 parent 25a9a59 commit 0a863af
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 13 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build_embeddings.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ jobs:
doc_folder: docs/source
- repo_id: huggingface/huggingface_hub
doc_folder: docs/source/en
- repo_id: huggingface/transformers
doc_folder: docs/source/en
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
Expand Down
18 changes: 5 additions & 13 deletions src/doc_builder/build_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,8 @@
from .autodoc import autodoc_markdown, resolve_links_in_text
from .convert_md_to_mdx import process_md
from .convert_rst_to_mdx import find_indent, is_empty_line
from .meilisearch_helper import (
add_embeddings_to_db,
create_embedding_db,
delete_embedding_db,
get_meili_chunks,
swap_indexes,
)
from .utils import read_doc_config
from .meilisearch_helper import add_embeddings_to_db, create_embedding_db, delete_embedding_db, swap_indexes
from .utils import chunk_list, read_doc_config


Chunk = namedtuple("Chunk", "text source package_name")
Expand Down Expand Up @@ -467,11 +461,9 @@ def build_embeddings(

# Step 3: push embeddings to vector database (meilisearch)
client = meilisearch.Client("https://edge.meilisearch.com", meilisearch_key)

payloads_embeddings = get_meili_chunks(embeddings)

for payload_embeddings in tqdm(payloads_embeddings):
add_embeddings_to_db(client, MEILI_INDEX_TEMP, payload_embeddings)
ITEMS_PER_CHUNK = 5000 # a value that was found experimentally
for chunk_embeddings in tqdm(chunk_list(embeddings, ITEMS_PER_CHUNK), desc="Uploading data to meilisearch"):
add_embeddings_to_db(client, MEILI_INDEX_TEMP, chunk_embeddings)


def clean_meilisearch(meilisearch_key: str):
Expand Down
7 changes: 7 additions & 0 deletions src/doc_builder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,3 +193,10 @@ def sveltify_file_route(filename):
# Replace the '{name}.mdx' with '{name}/+page.svelte'
return filename.rsplit(".", 1)[0] + "/+page.svelte"
return filename


def chunk_list(lst, n):
    """
    Split `lst` into consecutive chunks of at most `n` items.

    Args:
        lst: A sliceable sequence (e.g. a list) to split.
        n (`int`): Maximum number of items per chunk. Must be a positive integer.

    Returns:
        `list`: Slices of `lst` in their original order. Every chunk holds exactly `n` items except possibly
        the last one, which holds the remainder. An empty `lst` yields an empty list.

    Raises:
        ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        # A negative step makes `range` empty, which would silently drop every item, and a zero step
        # fails with an unrelated-looking `range()` error; reject both with a clear message instead.
        raise ValueError(f"Chunk size `n` must be a positive integer, got {n}.")
    return [lst[i : i + n] for i in range(0, len(lst), n)]

0 comments on commit 0a863af

Please sign in to comment.