From a8a83fffff17c882c554e1a6ea95481de773c6b1 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Fri, 20 Dec 2024 10:53:57 +0100 Subject: [PATCH 01/17] Ingest non-code files --- cognee/api/v1/cognify/code_graph_pipeline.py | 43 ++++++++++++++++--- cognee/tasks/repo_processor/__init__.py | 1 + .../repo_processor/get_non_code_files.py | 36 ++++++++++++++++ examples/python/code_graph_example.py | 11 ++--- 4 files changed, 80 insertions(+), 11 deletions(-) create mode 100644 cognee/tasks/repo_processor/get_non_code_files.py diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index c35f9719f..8e92d08e0 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -2,29 +2,37 @@ import logging from pathlib import Path +from cognee.base_config import get_base_config +from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task +from cognee.modules.users.methods import get_default_user +from cognee.shared.data_models import KnowledgeGraph, MonitoringTool +from cognee.tasks.documents import (classify_documents, + extract_chunks_from_documents) +from cognee.tasks.graph import extract_graph_from_data +from cognee.tasks.ingestion import ingest_data_with_metadata from cognee.tasks.repo_processor import (enrich_dependency_graph, expand_dependency_graph, + get_data_list_for_user, + get_non_code_files, get_repo_file_dependencies) from cognee.tasks.storage import add_data_points -from cognee.base_config import get_base_config -from cognee.shared.data_models import MonitoringTool - monitoring = get_base_config().monitoring_tool if monitoring == MonitoringTool.LANGFUSE: from langfuse.decorators import observe -from cognee.tasks.summarization import summarize_code +from cognee.tasks.summarization import summarize_code, summarize_text logger = logging.getLogger("code_graph_pipeline") update_status_lock = asyncio.Lock() @observe -async def run_code_graph_pipeline(repo_path): +async def run_code_graph_pipeline(repo_path, include_docs=True): import os import pathlib + import cognee from cognee.infrastructure.databases.relational import create_db_and_tables @@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() + cognee_config = get_cognify_config() + user = await get_default_user() + tasks = [ Task(get_repo_file_dependencies), Task(enrich_dependency_graph, task_config={"batch_size": 50}), @@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path): Task(add_data_points, task_config={"batch_size": 50}), ] - return run_tasks(tasks, repo_path, "cognify_code_pipeline") + if include_docs: + non_code_tasks = [ + Task(get_non_code_files, task_config={"batch_size": 50}), + Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), + Task(get_data_list_for_user, dataset_name="repo_docs", user=user), + Task(classify_documents), + Task(extract_chunks_from_documents), + Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), + Task( + summarize_text, + summarization_model=cognee_config.summarization_model, + task_config={"batch_size": 50} + ), + ] + + if include_docs: + async for result in run_tasks(non_code_tasks, repo_path): + yield result + + async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"): + yield result \ No newline at end of file diff 
--git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 05e111b29..fa754028e 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -4,4 +4,5 @@ from .enrich_dependency_graph import enrich_dependency_graph from .expand_dependency_graph import expand_dependency_graph +from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py new file mode 100644 index 000000000..5a8a34f64 --- /dev/null +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -0,0 +1,36 @@ +import os + +import aiofiles + +import cognee.modules.ingestion as ingestion +from cognee.infrastructure.engine import DataPoint +from cognee.modules.data.methods import get_datasets +from cognee.modules.data.methods.get_dataset_data import get_dataset_data +from cognee.modules.data.methods.get_datasets_by_name import \ get_datasets_by_name +from cognee.modules.data.models import Data +from cognee.modules.data.operations.write_metadata import write_metadata +from cognee.modules.ingestion.data_types import BinaryData +from cognee.modules.users.methods import get_default_user +from cognee.shared.CodeGraphEntities import Repository + + +async def get_non_py_files(repo_path): + """Get the paths of files in the repository that are not .py files""" + if not os.path.exists(repo_path): + return {} + + non_py_files_paths = [ + os.path.join(root, file) + for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py") + ] + return non_py_files_paths + + +async def get_data_list_for_user(_, dataset_name, user): + datasets = await get_datasets_by_name(dataset_name, user.id) + data_documents: list[Data] = [] + for dataset in datasets: + data_docs: list[Data] = await get_dataset_data(dataset_id=dataset.id) + data_documents.extend(data_docs) + return data_documents \ No newline at end of file diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 9189de46c..c0b91972b 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,15 +1,16 @@ import argparse import asyncio + from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline -async def main(repo_path): - async for result in await run_code_graph_pipeline(repo_path): +async def main(repo_path, include_docs): - async for result in run_code_graph_pipeline(repo_path, include_docs): print(result) if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository") + parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") + parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files") args = parser.parse_args() - asyncio.run(main(args.repo_path)) - + asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file From 399faf9ca0e445e477e7c8e2201ec3478d62e928 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Fri, 20 Dec 2024 13:37:24 +0100 Subject: [PATCH 02/17] Fixing review findings --- cognee/tasks/repo_processor/get_non_code_files.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py
b/cognee/tasks/repo_processor/get_non_code_files.py index 5a8a34f64..671b998d9 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -20,14 +20,26 @@ async def get_non_py_files(repo_path): if not os.path.exists(repo_path): return {} + IGNORED_PATTERNS = { + '.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd', + 'node_modules', '*.egg-info' + } + + def should_process(path): + return not any(pattern in path for pattern in IGNORED_PATTERNS) + non_py_files_paths = [ os.path.join(root, file) - for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py") + for root, _, files in os.walk(repo_path) for file in files + if not file.endswith(".py") and should_process(os.path.join(root, file)) ] return non_py_files_paths async def get_data_list_for_user(_, dataset_name, user): + # Note: This method is meant to be used as a Task in a pipeline. + # By the nature of pipelines, the output of the previous Task will be passed as the first argument here, + # but it is not needed here, hence the "_" input. datasets = await get_datasets_by_name(dataset_name, user.id) data_documents: list[Data] = [] for dataset in datasets: From 4cee9a16ce16048aedf4678201ed16b5ce22273b Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:22:45 +0100 Subject: [PATCH 03/17] fix: add allowed extensions --- .../repo_processor/get_non_code_files.py | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py index 671b998d9..f060782b6 100644 --- a/cognee/tasks/repo_processor/get_non_code_files.py +++ b/cognee/tasks/repo_processor/get_non_code_files.py @@ -25,8 +25,27 @@ async def get_non_py_files(repo_path): 'node_modules', '*.egg-info' } + ALLOWED_EXTENSIONS = { + '.txt', '.md', '.csv', '.json', '.xml', '.yaml', '.yml', '.html', + '.css', '.js', '.ts', '.jsx', '.tsx', '.sql', '.log', '.ini', + '.toml', '.properties', '.sh', '.bash', '.dockerfile', '.gitignore', + '.gitattributes', '.makefile', '.pyproject', '.requirements', + '.env', '.pdf', '.doc', '.docx', '.dot', '.dotx', '.rtf', + '.wps', '.wpd', '.odt', '.ott', '.ottx', '.txt', '.wp', + '.sdw', '.sdx', '.docm', '.dotm', + # Additional extensions for other programming languages + '.java', '.c', '.cpp', '.h', '.cs', '.go', '.php', '.rb', + '.swift', '.pl', '.lua', '.rs', '.scala', '.kt', '.sh', + '.sql', '.v', '.asm', '.pas', '.d', '.ml', '.clj', '.cljs', + '.erl', '.ex', '.exs', '.f', '.fs', '.r', '.pyi', + '.pdb', '.ipynb', '.rmd', '.cabal', '.hs', '.nim', + '.vhdl', '.verilog', '.svelte', '.html', '.css', '.scss', + '.less', '.json5', '.yaml', '.yml' + } + def should_process(path): - return not any(pattern in path for pattern in IGNORED_PATTERNS) + _, ext = os.path.splitext(path) + return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS) non_py_files_paths = [ os.path.join(root, file) From dbc33a6478944991f93223a3e443c3acf12460a7 Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:23:55 +0100 Subject: [PATCH 04/17] fix: adhere UnstructuredDocument.read() to Document --- .../data/processing/document_types/UnstructuredDocument.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py 
b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 62632cd08..8da065ff5 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -8,7 +8,7 @@ class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int): + def read(self, chunk_size: int, chunker = str) -> str: def get_text(): try: from unstructured.partition.auto import partition From 5e79dc53c55405217925c5a511a64efccb67578c Mon Sep 17 00:00:00 2001 From: lxobr <122801072+lxobr@users.noreply.github.com> Date: Mon, 6 Jan 2025 11:25:04 +0100 Subject: [PATCH 05/17] feat: time code graph run and add mock support --- examples/python/code_graph_example.py | 62 ++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 5 deletions(-) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 44ab33aad..9cd9f99c4 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -8,9 +8,61 @@ async def main(repo_path, include_docs): async for result in run_code_graph_pipeline(repo_path, include_docs): print(result) -if __name__ == "__main__": +def parse_args(): parser = argparse.ArgumentParser() - parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository") - parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files") - args = parser.parse_args() - asyncio.run(main(args.repo_path, args.include_docs)) \ No newline at end of file + parser.add_argument( + "--repo_path", + type=str, + required=True, + help="Path to the repository" + ) + parser.add_argument( + "--include_docs", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to process non-code files" + ) + parser.add_argument( + "--mock_embedding", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to mock embedding and code summary" + ) + parser.add_argument( + "--mock_code_summary", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to mock code summary" + ) + parser.add_argument( + "--time", + type=lambda x: x.lower() in ("true", "1"), + default=True, + help="Whether or not to time the pipeline run" + ) + return parser.parse_args() + +if __name__ == "__main__": + import os + + args = parse_args() + + if args.mock_embedding: + os.environ["MOCK_EMBEDDING"] = "true" + print("Mocking embedding.") + + if args.mock_code_summary: + os.environ["MOCK_CODE_SUMMARY"] = "true" + print("Mocking code summary.") + + if args.time: + import time + start_time = time.time() + asyncio.run(main(args.repo_path, args.include_docs)) + end_time = time.time() + print("\n" + "="*50) + print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds") + print("="*50 + "\n") + else: + asyncio.run(main(args.repo_path, args.include_docs)) + \ No newline at end of file From 4802567871d34b3926f2e3dfd33ea9655dedcb44 Mon Sep 17 00:00:00 2001 From: alekszievr <44192193+alekszievr@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:46:46 +0100 Subject: [PATCH 06/17] Overcome ContextWindowExceededError by checking token count while chunking (#413) --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/chunking/TextChunker.py | 23 +++++++++++--- .../processing/document_types/Document.py | 3 +- .../document_types/ImageDocument.py | 10 ++++-- 
.../processing/document_types/PdfDocument.py | 10 ++++-- .../processing/document_types/TextDocument.py | 9 ++++-- .../document_types/UnstructuredDocument.py | 8 +++-- cognee/tasks/chunks/chunk_by_paragraph.py | 31 ++++++++++++++++--- .../extract_chunks_from_documents.py | 12 +++++-- 9 files changed, 84 insertions(+), 24 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 3d31b4000..2648d0731 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents), + Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192), Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), Task( summarize_text, diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 64c7aae5c..8ef4bfda9 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -1,31 +1,45 @@ -from uuid import uuid5, NAMESPACE_OID +from typing import Optional +from uuid import NAMESPACE_OID, uuid5 -from .models.DocumentChunk import DocumentChunk from cognee.tasks.chunks import chunk_by_paragraph +from .models.DocumentChunk import DocumentChunk + + class TextChunker(): document = None max_chunk_size: int chunk_index = 0 chunk_size = 0 + token_count = 0 - def __init__(self, document, get_text: callable, chunk_size: int = 1024): + def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text + self.max_tokens = max_tokens if max_tokens else float("inf") + self.embedding_model = embedding_model + + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): + word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size + token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens + return word_count_fits and token_count_fits def read(self): paragraph_chunks = [] for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, + self.embedding_model, + self.max_tokens, self.max_chunk_size, batch_paragraphs = True, ): - if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size: + if self.check_word_count_and_token_count(self.chunk_size, self.token_count, chunk_data): paragraph_chunks.append(chunk_data) self.chunk_size += chunk_data["word_count"] + self.token_count += chunk_data["token_count"] else: if len(paragraph_chunks) == 0: yield DocumentChunk( @@ -63,6 +77,7 @@ def read(self): print(e) paragraph_chunks = [chunk_data] self.chunk_size = chunk_data["word_count"] + self.token_count = chunk_data["token_count"] self.chunk_index += 1 diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 8d6a3dafb..6712175fb 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -1,3 +1,4 @@ +from typing import Optional from uuid import UUID from cognee.infrastructure.engine 
import DataPoint @@ -13,5 +14,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int, chunker = str) -> str: + def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 352486bd8..1f4f281f8 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -1,6 +1,10 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class ImageDocument(Document): type: str = "image" @@ -10,11 +14,11 @@ def transcribe_image(self): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 361214718..27dadda33 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -1,11 +1,15 @@ +from typing import Optional + from pypdf import PdfReader -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): file = PdfReader(self.raw_data_location) def get_text(): @@ -14,7 +18,7 @@ def get_text(): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 3952d9845..895a6f8b6 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -1,10 +1,13 @@ -from .Document import Document +from typing import Optional + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -17,6 +20,6 @@ def get_text(): chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text) + chunker 
= chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index 8da065ff5..c94ca4a25 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -1,14 +1,16 @@ from io import StringIO +from typing import Optional from cognee.modules.chunking.TextChunker import TextChunker -from .Document import Document from cognee.modules.data.exceptions import UnstructuredLibraryImportError +from .Document import Document + class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker = str) -> str: + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -27,6 +29,6 @@ def get_text(): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 00bb5670c..546d4a1a7 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -1,8 +1,18 @@ -from uuid import uuid5, NAMESPACE_OID -from typing import Dict, Any, Iterator +from typing import Any, Dict, Iterator, Optional, Union +from uuid import NAMESPACE_OID, uuid5 + +import tiktoken + from .chunk_by_sentence import chunk_by_sentence -def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]: + +def chunk_by_paragraph( + data: str, + embedding_model: Optional[str], + max_tokens: Optional[Union[int, float]], + paragraph_length: int = 1024, + batch_paragraphs: bool = True + ) -> Iterator[Dict[str, Any]]: """ Chunks text by paragraph while preserving exact text reconstruction capability. When chunks are joined with empty string "", they reproduce the original text exactly. 
@@ -12,14 +22,22 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_index = 0 paragraph_ids = [] last_cut_type = None + current_token_count = 0 for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - if current_word_count > 0 and current_word_count + word_count > paragraph_length: + if embedding_model: + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + else: + token_count = 0 + + if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens): # Yield current chunk chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, @@ -32,11 +50,13 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 paragraph_ids.append(paragraph_id) current_chunk += sentence current_word_count += word_count + current_token_count += token_count # Handle end of paragraph if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs: @@ -44,6 +64,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "paragraph_ids": paragraph_ids, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "chunk_index": chunk_index, @@ -53,6 +74,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs paragraph_ids = [] current_chunk = "" current_word_count = 0 + current_token_count = 0 chunk_index += 1 last_cut_type = end_type @@ -62,6 +84,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs chunk_dict = { "text": current_chunk, "word_count": current_word_count, + "token_count": current_token_count, "chunk_id": uuid5(NAMESPACE_OID, current_chunk), "paragraph_ids": paragraph_ids, "chunk_index": chunk_index, diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index 423b87b69..ddcdb8765 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -1,7 +1,15 @@ +from typing import Optional + from cognee.modules.data.processing.document_types.Document import Document -async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'): +async def extract_chunks_from_documents( + documents: list[Document], + chunk_size: int = 1024, + chunker='text_chunker', + embedding_model: Optional[str] = None, + max_tokens: Optional[int] = None, + ): for document in documents: - for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens): yield document_chunk From a774191ed3153442bbdc29a79e90f45c51bc5cc5 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Tue, 7 Jan 2025 13:38:23 +0100 Subject: [PATCH 07/17] Adjust AudioDocument and handle None token limit --- .../data/processing/document_types/AudioDocument.py | 10 
+++++++--- cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index 268338703..a59064674 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -1,6 +1,10 @@ +from typing import Optional + from cognee.infrastructure.llm.get_llm_client import get_llm_client -from .Document import Document + from .ChunkerMapping import ChunkerConfig +from .Document import Document + class AudioDocument(Document): type: str = "audio" @@ -9,12 +13,12 @@ def create_transcript(self): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int, chunker: str): + def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text]) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 546d4a1a7..2bbd9689f 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -23,6 +23,8 @@ def chunk_by_paragraph( paragraph_ids = [] last_cut_type = None current_token_count = 0 + if not max_tokens: + max_tokens = float("inf") for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit From fb13a1b61a42c6b02ad85e70644c73aef722c1d7 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Tue, 7 Jan 2025 15:00:58 +0100 Subject: [PATCH 08/17] Handle azure models as well --- cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 2bbd9689f..b3c191e29 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -29,6 +29,8 @@ def chunk_by_paragraph( for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit if embedding_model: + if embedding_model.startswith("azure/"): + embedding_model = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(embedding_model) token_count = len(tokenizer.encode(sentence)) else: From 8ffef5034ae560c7514d21ae2b58d1f30013354d Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Wed, 8 Jan 2025 12:25:31 +0100 Subject: [PATCH 09/17] Add clean logging to code graph example --- examples/python/code_graph_example.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index 9cd9f99c4..afc83beb0 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -1,7 +1,9 @@ import argparse import asyncio +import logging from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline +from cognee.shared.utils import setup_logging async def 
main(repo_path, include_docs): @@ -43,6 +45,8 @@ def parse_args(): return parser.parse_args() if __name__ == "__main__": + setup_logging(logging.ERROR) + import os args = parse_args() From f4397bf940e3a54a745ac1be19cadb9e33a28ae4 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Wed, 8 Jan 2025 12:33:14 +0100 Subject: [PATCH 10/17] Remove setting envvars from arg --- examples/python/code_graph_example.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py index afc83beb0..16eafb024 100644 --- a/examples/python/code_graph_example.py +++ b/examples/python/code_graph_example.py @@ -24,18 +24,6 @@ def parse_args(): default=True, help="Whether or not to process non-code files" ) - parser.add_argument( - "--mock_embedding", - type=lambda x: x.lower() in ("true", "1"), - default=True, - help="Whether or not to mock embedding and code summary" - ) - parser.add_argument( - "--mock_code_summary", - type=lambda x: x.lower() in ("true", "1"), - default=True, - help="Whether or not to mock code summary" - ) parser.add_argument( "--time", type=lambda x: x.lower() in ("true", "1"), @@ -47,18 +35,8 @@ def parse_args(): if __name__ == "__main__": setup_logging(logging.ERROR) - import os - args = parse_args() - if args.mock_embedding: - os.environ["MOCK_EMBEDDING"] = "true" - print("Mocking embedding.") - - if args.mock_code_summary: - os.environ["MOCK_CODE_SUMMARY"] = "true" - print("Mocking code summary.") - if args.time: import time start_time = time.time() From 34a9267f414efc9553509bfdbf63bbee6aa5be69 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Wed, 8 Jan 2025 13:23:17 +0100 Subject: [PATCH 11/17] Get embedding engine instead of passing it. Get it from vector engine instead of direct getter. 
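The refactor below removes embedding_model from the chunking call chain; consumers now resolve the model from the configured vector engine at the point of use. A minimal sketch of that lookup pattern, assuming only names this series already uses (the count_tokens helper itself is illustrative, not part of the codebase):

    import tiktoken

    from cognee.infrastructure.databases.vector import get_vector_engine

    def count_tokens(text: str) -> int:
        # Resolve the embedding model from the vector engine instead of
        # threading it through every read()/chunker signature.
        vector_engine = get_vector_engine()
        # Strip provider prefixes such as "azure/..." before asking tiktoken.
        model_name = vector_engine.embedding_engine.model.split("/")[-1]
        tokenizer = tiktoken.encoding_for_model(model_name)
        return len(tokenizer.encode(text))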
--- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/chunking/TextChunker.py | 6 ++---- .../document_types/AudioDocument.py | 4 ++-- .../processing/document_types/Document.py | 2 +- .../document_types/ImageDocument.py | 4 ++-- .../processing/document_types/PdfDocument.py | 4 ++-- .../processing/document_types/TextDocument.py | 4 ++-- .../document_types/UnstructuredDocument.py | 4 ++-- cognee/tasks/chunks/chunk_by_paragraph.py | 19 ++++++++++--------- .../extract_chunks_from_documents.py | 3 +-- 10 files changed, 25 insertions(+), 27 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 2648d0731..7ba461f88 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192), + Task(extract_chunks_from_documents, max_tokens=8192), Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}), Task( summarize_text, diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py index 8ef4bfda9..a9cb52bf0 100644 --- a/cognee/modules/chunking/TextChunker.py +++ b/cognee/modules/chunking/TextChunker.py @@ -14,13 +14,12 @@ class TextChunker(): chunk_size = 0 token_count = 0 - def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024): + def __init__(self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024): self.document = document self.max_chunk_size = chunk_size self.get_text = get_text self.max_tokens = max_tokens if max_tokens else float("inf") - self.embedding_model = embedding_model - + def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data): word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens @@ -31,7 +30,6 @@ def read(self): for content_text in self.get_text(): for chunk_data in chunk_by_paragraph( content_text, - self.embedding_model, self.max_tokens, self.max_chunk_size, batch_paragraphs = True, diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index a59064674..c4e6ae87c 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -13,12 +13,12 @@ def create_transcript(self): result = get_llm_client().create_transcript(self.raw_data_location) return(result.text) - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): # Transcribe the audio file text = self.create_transcript() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens) yield from chunker.read() diff 
--git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 6712175fb..7c76d3f23 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -14,5 +14,5 @@ class Document(DataPoint): "type": "Document" } - def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str: + def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index 1f4f281f8..ffe8ff3f9 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -14,11 +14,11 @@ def transcribe_image(self): result = get_llm_client().transcribe_image(self.raw_data_location) return(result.choices[0].message.content) - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): # Transcribe the image file text = self.transcribe_image() chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 27dadda33..463911d5b 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -9,7 +9,7 @@ class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): file = PdfReader(self.raw_data_location) def get_text(): @@ -18,7 +18,7 @@ def get_text(): yield page_text chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 895a6f8b6..582f47737 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -7,7 +7,7 @@ class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): def get_text(): with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file: while True: @@ -20,6 +20,6 @@ def get_text(): chunker_func = ChunkerConfig.get_chunker(chunker) - chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from 
chunker.read() diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index c94ca4a25..6c70744a0 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -10,7 +10,7 @@ class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str: + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str: def get_text(): try: from unstructured.partition.auto import partition @@ -29,6 +29,6 @@ def get_text(): yield text - chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens) + chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens) yield from chunker.read() diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index b3c191e29..8ab66bd7f 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -3,12 +3,13 @@ import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine + from .chunk_by_sentence import chunk_by_sentence def chunk_by_paragraph( data: str, - embedding_model: Optional[str], max_tokens: Optional[Union[int, float]], paragraph_length: int = 1024, batch_paragraphs: bool = True @@ -26,16 +27,16 @@ def chunk_by_paragraph( if not max_tokens: max_tokens = float("inf") + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - if embedding_model: - if embedding_model.startswith("azure/"): - embedding_model = embedding_model.split("/")[-1] - tokenizer = tiktoken.encoding_for_model(embedding_model) - token_count = len(tokenizer.encode(sentence)) - else: - token_count = 0 - + + embedding_model = embedding_model.split("/")[-1] + tokenizer = tiktoken.encoding_for_model(embedding_model) + token_count = len(tokenizer.encode(sentence)) + if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens): # Yield current chunk chunk_dict = { diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py index ddcdb8765..e647afbef 100644 --- a/cognee/tasks/documents/extract_chunks_from_documents.py +++ b/cognee/tasks/documents/extract_chunks_from_documents.py @@ -7,9 +7,8 @@ async def extract_chunks_from_documents( documents: list[Document], chunk_size: int = 1024, chunker='text_chunker', - embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, ): for document in documents: - for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens): + for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens): yield document_chunk From 97814e334f282b344cb0357df387b70cbf801397 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Wed, 8 Jan 2025 13:45:04 +0100 Subject: [PATCH 12/17] Get embedding engine instead of passing it in code chunking. 
--- cognee/api/v1/cognify/code_graph_pipeline.py | 6 +----- cognee/tasks/chunks/chunk_by_paragraph.py | 4 ++-- cognee/tasks/repo_processor/get_source_code_chunks.py | 9 ++++++--- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 7ba461f88..6e06edfa3 100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -3,8 +3,6 @@ from pathlib import Path from cognee.base_config import get_base_config -from cognee.infrastructure.databases.vector.embeddings import \ - get_embedding_engine from cognee.modules.cognify.config import get_cognify_config from cognee.modules.pipelines import run_tasks from cognee.modules.pipelines.tasks.Task import Task @@ -51,8 +49,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): await cognee.prune.prune_system(metadata=True) await create_db_and_tables() - embedding_engine = get_embedding_engine() - cognee_config = get_cognify_config() user = await get_default_user() @@ -60,7 +56,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(get_repo_file_dependencies), Task(enrich_dependency_graph), Task(expand_dependency_graph, task_config={"batch_size": 50}), - Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}), + Task(get_source_code_chunks, task_config={"batch_size": 50}), Task(summarize_code, task_config={"batch_size": 50}), Task(add_data_points, task_config={"batch_size": 50}), ] diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py index 8ab66bd7f..44355a1ad 100644 --- a/cognee/tasks/chunks/chunk_by_paragraph.py +++ b/cognee/tasks/chunks/chunk_by_paragraph.py @@ -29,11 +29,11 @@ def chunk_by_paragraph( vector_engine = get_vector_engine() embedding_model = vector_engine.embedding_engine.model - + embedding_model = embedding_model.split("/")[-1] + for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length): # Check if this sentence would exceed length limit - embedding_model = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(embedding_model) token_count = len(tokenizer.encode(sentence)) diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py index 4d0ce3200..0bf7ebe32 100644 --- a/cognee/tasks/repo_processor/get_source_code_chunks.py +++ b/cognee/tasks/repo_processor/get_source_code_chunks.py @@ -5,6 +5,7 @@ import parso import tiktoken +from cognee.infrastructure.databases.vector import get_vector_engine from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk @@ -115,13 +116,15 @@ def get_source_code_chunks_from_code_part( max_tokens: int = 8192, overlap: float = 0.25, granularity: float = 0.1, - model_name: str = "text-embedding-3-large" ) -> Generator[SourceCodeChunk, None, None]: """Yields source code chunks from a CodePart object, with configurable token limits and overlap.""" if not code_file_part.source_code: logger.error(f"No source code in CodeFile {code_file_part.id}") return + vector_engine = get_vector_engine() + embedding_model = vector_engine.embedding_engine.model + model_name = embedding_model.split("/")[-1] tokenizer = tiktoken.encoding_for_model(model_name) max_subchunk_tokens = max(1, int(granularity * max_tokens)) subchunk_token_counts = 
_get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens) @@ -141,7 +144,7 @@ def get_source_code_chunks_from_code_part( previous_chunk = current_chunk -async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \ AsyncGenerator[list[DataPoint], None]: """Processes code graph datapoints, creating SourceCodeChunk datapoints.""" # TODO: Add support for other embedding models, with max_token mapping @@ -156,7 +159,7 @@ async def get_source_code_chunks(data_points: list[DataPoint], embedding_model=" for code_part in data_point.contains: try: yield code_part - for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model): + for source_code_chunk in get_source_code_chunks_from_code_part(code_part): yield source_code_chunk except Exception as e: logger.error(f"Error processing code part: {e}") From abb3ea6d219f8221500fb7a7e7f6cc404cf75b08 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Thu, 9 Jan 2025 11:31:16 +0100 Subject: [PATCH 13/17] Adjust integration tests --- .../data/processing/document_types/AudioDocument.py | 2 +- cognee/modules/data/processing/document_types/Document.py | 2 +- .../data/processing/document_types/ImageDocument.py | 2 +- .../modules/data/processing/document_types/PdfDocument.py | 2 +- .../data/processing/document_types/TextDocument.py | 2 +- .../processing/document_types/UnstructuredDocument.py | 2 +- .../integration/documents/UnstructuredDocument_test.py | 8 ++++---- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py index faace056b..b7d2476b4 100644 --- a/cognee/modules/data/processing/document_types/AudioDocument.py +++ b/cognee/modules/data/processing/document_types/AudioDocument.py @@ -13,7 +13,7 @@ def create_transcript(self): result = get_llm_client().create_transcript(self.raw_data_location) return result.text - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): # Transcribe the audio file text = self.create_transcript() diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py index 9a29e7797..7ecdf289e 100644 --- a/cognee/modules/data/processing/document_types/Document.py +++ b/cognee/modules/data/processing/document_types/Document.py @@ -11,5 +11,5 @@ class Document(DataPoint): mime_type: str _metadata: dict = {"index_fields": ["name"], "type": "Document"} - def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str: + def read(self, chunk_size: int, chunker=str, max_tokens: Optional[int] = None) -> str: pass diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py index f0c7a6d61..c055b8253 100644 --- a/cognee/modules/data/processing/document_types/ImageDocument.py +++ b/cognee/modules/data/processing/document_types/ImageDocument.py @@ -13,7 +13,7 @@ def transcribe_image(self): result = get_llm_client().transcribe_image(self.raw_data_location) return result.choices[0].message.content - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] =
None): # Transcribe the image file text = self.transcribe_image() diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py index 56969c7f8..768f91264 100644 --- a/cognee/modules/data/processing/document_types/PdfDocument.py +++ b/cognee/modules/data/processing/document_types/PdfDocument.py @@ -9,7 +9,7 @@ class PdfDocument(Document): type: str = "pdf" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): file = PdfReader(self.raw_data_location) def get_text(): diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py index 11dc798aa..b62ccd56e 100644 --- a/cognee/modules/data/processing/document_types/TextDocument.py +++ b/cognee/modules/data/processing/document_types/TextDocument.py @@ -7,7 +7,7 @@ class TextDocument(Document): type: str = "text" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]): + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None): def get_text(): with open(self.raw_data_location, mode="r", encoding="utf-8") as file: while True: diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py index d6b64498c..1c291d0dc 100644 --- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py +++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py @@ -10,7 +10,7 @@ class UnstructuredDocument(Document): type: str = "unstructured" - def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str: + def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str: def get_text(): try: from unstructured.partition.auto import partition diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py index 03b8deb49..e0278de81 100644 --- a/cognee/tests/integration/documents/UnstructuredDocument_test.py +++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py @@ -68,7 +68,7 @@ def test_UnstructuredDocument(): ) # Test PPTX - for paragraph_data in pptx_document.read(chunk_size=1024): + for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"): assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }" assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }" assert ( @@ -76,7 +76,7 @@ def test_UnstructuredDocument(): ), f" sentence_cut != {paragraph_data.cut_type = }" # Test DOCX - for paragraph_data in docx_document.read(chunk_size=1024): + for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"): assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }" assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }" assert ( @@ -84,7 +84,7 @@ def test_UnstructuredDocument(): ), f" sentence_end != {paragraph_data.cut_type = }" # TEST CSV - for paragraph_data in csv_document.read(chunk_size=1024): + for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"): assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }" assert ( "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text @@ -94,7 +94,7 @@ def test_UnstructuredDocument(): ), f" 
sentence_cut != {paragraph_data.cut_type = }" # Test XLSX - for paragraph_data in xlsx_document.read(chunk_size=1024): + for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"): assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }" assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }" assert ( From cdaae161a8e006415988a45ca529d5bc71f9d632 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Thu, 9 Jan 2025 12:08:42 +0100 Subject: [PATCH 14/17] Handle circular import --- cognee/tasks/repo_processor/__init__.py | 3 --- cognee/tasks/repo_processor/expand_dependency_graph.py | 5 ++++- cognee/tasks/repo_processor/extract_code_parts.py | 4 +++- cognee/tasks/repo_processor/get_local_dependencies.py | 4 +++- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py index 6dc032547..8f0df23d8 100644 --- a/cognee/tasks/repo_processor/__init__.py +++ b/cognee/tasks/repo_processor/__init__.py @@ -2,6 +2,3 @@ from .expand_dependency_graph import expand_dependency_graph from .get_non_code_files import get_data_list_for_user, get_non_py_files from .get_repo_file_dependencies import get_repo_file_dependencies -import logging - -logger = logging.getLogger("task:repo_processor") diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py index de26fe8d4..d3f5d1b07 100644 --- a/cognee/tasks/repo_processor/expand_dependency_graph.py +++ b/cognee/tasks/repo_processor/expand_dependency_graph.py @@ -5,7 +5,10 @@ from cognee.infrastructure.engine import DataPoint from cognee.shared.CodeGraphEntities import CodeFile, CodePart from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts -from cognee.tasks.repo_processor import logger + +import logging + +logger = logging.getLogger("task:repo_processor") def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None: diff --git a/cognee/tasks/repo_processor/extract_code_parts.py b/cognee/tasks/repo_processor/extract_code_parts.py index 76cfef538..c181a87d9 100644 --- a/cognee/tasks/repo_processor/extract_code_parts.py +++ b/cognee/tasks/repo_processor/extract_code_parts.py @@ -1,7 +1,9 @@ from typing import Dict, List import parso -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger("task:repo_processor") def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]: diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py index b443829c9..92b50cd0b 100644 --- a/cognee/tasks/repo_processor/get_local_dependencies.py +++ b/cognee/tasks/repo_processor/get_local_dependencies.py @@ -10,7 +10,9 @@ import parso from parso.tree import BaseNode -from cognee.tasks.repo_processor import logger +import logging + +logger = logging.getLogger("task:repo_processor") @contextmanager From 626bc76f5ccdb830e741cf74464e1e3a967dec75 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Thu, 9 Jan 2025 12:53:26 +0100 Subject: [PATCH 15/17] Set max_tokens in config --- cognee/api/v1/cognify/code_graph_pipeline.py | 2 +- cognee/modules/cognify/config.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py index 53c41d43b..2d077f39b 
100644 --- a/cognee/api/v1/cognify/code_graph_pipeline.py +++ b/cognee/api/v1/cognify/code_graph_pipeline.py @@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True): Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user), Task(get_data_list_for_user, dataset_name="repo_docs", user=user), Task(classify_documents), - Task(extract_chunks_from_documents, max_tokens=8192), + Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens), Task( extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50} ), diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py index d40410bfc..dd94d8b41 100644 --- a/cognee/modules/cognify/config.py +++ b/cognee/modules/cognify/config.py @@ -1,12 +1,14 @@ from functools import lru_cache from pydantic_settings import BaseSettings, SettingsConfigDict from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent +from typing import Optional +import os class CognifyConfig(BaseSettings): classification_model: object = DefaultContentPrediction summarization_model: object = SummarizedContent - + max_tokens: Optional[int] = os.getenv("MAX_TOKENS") model_config = SettingsConfigDict(env_file=".env", extra="allow") def to_dict(self) -> dict: From d7b2186300db585ee5dd1affb20e85e3017a1606 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Thu, 9 Jan 2025 14:27:37 +0100 Subject: [PATCH 16/17] Adjust SWE-bench script to code graph pipeline call --- evals/eval_swe_bench.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..509530685 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -34,9 +34,8 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - async for result in pipeline: + async for result in run_code_graph_pipeline(repo_path, include_docs=True): print(result) print("Here we have the repo under the repo_path") @@ -47,7 +46,16 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp instructions = read_query_prompt("patch_gen_kg_instructions.txt") retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] + problem_statement, + top_k=3, + collections=[ + "code_summary_text", + "data_point_name", + "document_chunk_text", + "entity_name", + "entity_type_name", + "sourcecodechunk_source_code", + ], ) retrieved_edges_str = retrieved_edges_to_string(retrieved_edges) From 18bb282fbc7fb3c7e770641bd0b64fa38af7dd92 Mon Sep 17 00:00:00 2001 From: Rita Aleksziev <alekszievr@gmail.com> Date: Thu, 9 Jan 2025 14:27:37 +0100 Subject: [PATCH 17/17] Adjust SWE-bench script to code graph pipeline call --- evals/eval_swe_bench.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py index 789c95ab4..20e005751 100644 --- a/evals/eval_swe_bench.py +++ b/evals/eval_swe_bench.py @@ -34,9 +34,8 @@ def check_install_package(package_name): async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS): repo_path = download_github_repo(instance, "../RAW_GIT_REPOS") - pipeline = await run_code_graph_pipeline(repo_path) - async for result in 
pipeline: + async for result in run_code_graph_pipeline(repo_path, include_docs=True): print(result) print("Here we have the repo under the repo_path") @@ -47,7 +46,9 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp instructions = read_query_prompt("patch_gen_kg_instructions.txt") retrieved_edges = await brute_force_triplet_search( - problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"] + problem_statement, + top_k=3, + collections=["code_summary_text"], ) retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
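Taken end to end, the series leaves run_code_graph_pipeline as an async generator that optionally ingests non-code files, so callers iterate it directly rather than awaiting it first. A minimal consumer, mirroring examples/python/code_graph_example.py (the repository path is illustrative; per PATCH 15, MAX_TOKENS can additionally be set in the environment to bound chunk sizes):

    import asyncio

    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

    async def main():
        # include_docs=True also runs the non-code branch: file collection,
        # document classification and chunking, graph extraction, and text summarization.
        async for result in run_code_graph_pipeline("/path/to/repo", include_docs=True):
            print(result)

    asyncio.run(main())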