From a8a83fffff17c882c554e1a6ea95481de773c6b1 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Fri, 20 Dec 2024 10:53:57 +0100
Subject: [PATCH 01/17] Ingest non-code files

---
 cognee/api/v1/cognify/code_graph_pipeline.py  | 43 ++++++++++++++++---
 cognee/tasks/repo_processor/__init__.py       |  1 +
 .../repo_processor/get_non_code_files.py      | 36 ++++++++++++++++
 examples/python/code_graph_example.py         | 11 ++---
 4 files changed, 80 insertions(+), 11 deletions(-)
 create mode 100644 cognee/tasks/repo_processor/get_non_code_files.py

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index c35f9719f..8e92d08e0 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -2,29 +2,37 @@
 import logging
 from pathlib import Path
 
+from cognee.base_config import get_base_config
+from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.pipelines import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.data_models import KnowledgeGraph, MonitoringTool
+from cognee.tasks.documents import (classify_documents,
+                                    extract_chunks_from_documents)
+from cognee.tasks.graph import extract_graph_from_data
+from cognee.tasks.ingestion import ingest_data_with_metadata
 from cognee.tasks.repo_processor import (enrich_dependency_graph,
                                          expand_dependency_graph,
+                                         get_data_list_for_user,
+                                         get_non_py_files,
                                          get_repo_file_dependencies)
 from cognee.tasks.storage import add_data_points
 
-from cognee.base_config import get_base_config
-from cognee.shared.data_models import MonitoringTool
-
 monitoring = get_base_config().monitoring_tool
 if monitoring == MonitoringTool.LANGFUSE:
     from langfuse.decorators import observe
 
-from cognee.tasks.summarization import summarize_code
+from cognee.tasks.summarization import summarize_code, summarize_text
 
 logger = logging.getLogger("code_graph_pipeline")
 update_status_lock = asyncio.Lock()
 
 @observe
-async def run_code_graph_pipeline(repo_path):
+async def run_code_graph_pipeline(repo_path, include_docs=True):
     import os
     import pathlib
+
     import cognee
     from cognee.infrastructure.databases.relational import create_db_and_tables
 
@@ -38,6 +46,9 @@ async def run_code_graph_pipeline(repo_path):
     await cognee.prune.prune_system(metadata=True)
     await create_db_and_tables()
 
+    cognee_config = get_cognify_config()
+    user = await get_default_user()
+
     tasks = [
         Task(get_repo_file_dependencies),
         Task(enrich_dependency_graph, task_config={"batch_size": 50}),
@@ -46,4 +57,24 @@ async def run_code_graph_pipeline(repo_path):
         Task(add_data_points, task_config={"batch_size": 50}),
     ]
 
-    return run_tasks(tasks, repo_path, "cognify_code_pipeline")
+    if include_docs:
+        non_code_tasks = [
+            Task(get_non_py_files, task_config={"batch_size": 50}),
+            Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
+            Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
+            Task(classify_documents),
+            Task(extract_chunks_from_documents),
+            Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
+            Task(
+                summarize_text,
+                summarization_model=cognee_config.summarization_model,
+                task_config={"batch_size": 50}
+            ),
+        ]
+
+        # Stream the document-ingestion results before the code tasks run.
+        async for result in run_tasks(non_code_tasks, repo_path):
+            yield result
+
+    async for result in run_tasks(tasks, repo_path, "cognify_code_pipeline"):
+        yield result
\ No newline at end of file
diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py
index 05e111b29..fa754028e 100644
--- a/cognee/tasks/repo_processor/__init__.py
+++ b/cognee/tasks/repo_processor/__init__.py
@@ -4,4 +4,5 @@
 
 from .enrich_dependency_graph import enrich_dependency_graph
 from .expand_dependency_graph import expand_dependency_graph
+from .get_non_code_files import get_data_list_for_user, get_non_py_files
 from .get_repo_file_dependencies import get_repo_file_dependencies
diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py
new file mode 100644
index 000000000..5a8a34f64
--- /dev/null
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@@ -0,0 +1,36 @@
+import os
+
+import aiofiles
+
+import cognee.modules.ingestion as ingestion
+from cognee.infrastructure.engine import DataPoint
+from cognee.modules.data.methods import get_datasets
+from cognee.modules.data.methods.get_dataset_data import get_dataset_data
+from cognee.modules.data.methods.get_datasets_by_name import \
+    get_datasets_by_name
+from cognee.modules.data.models import Data
+from cognee.modules.data.operations.write_metadata import write_metadata
+from cognee.modules.ingestion.data_types import BinaryData
+from cognee.modules.users.methods import get_default_user
+from cognee.shared.CodeGraphEntities import Repository
+
+
+async def get_non_py_files(repo_path):
+    """Return the paths of files in the repository that are not .py files."""
+    if not os.path.exists(repo_path):
+        return []
+
+    non_py_files_paths = [
+        os.path.join(root, file)
+        for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py")
+    ]
+    return non_py_files_paths
+
+
+async def get_data_list_for_user(_, dataset_name, user):
+    datasets = await get_datasets_by_name(dataset_name, user.id)
+    data_documents: list[Data] = []
+    for dataset in datasets:
+        data_docs: list[Data] = await get_dataset_data(dataset_id=dataset.id)
+        data_documents.extend(data_docs)
+    return data_documents
\ No newline at end of file
diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py
index 9189de46c..c0b91972b 100644
--- a/examples/python/code_graph_example.py
+++ b/examples/python/code_graph_example.py
@@ -1,15 +1,16 @@
 import argparse
 import asyncio
+
 from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
 
 
-async def main(repo_path):
-    async for result in await run_code_graph_pipeline(repo_path):
+async def main(repo_path, include_docs):
+    async for result in run_code_graph_pipeline(repo_path, include_docs):
         print(result)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--repo-path", type=str, required=True, help="Path to the repository")
+    parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
+    parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files")
     args = parser.parse_args()
-    asyncio.run(main(args.repo_path))
-
+    asyncio.run(main(args.repo_path, args.include_docs))
\ No newline at end of file
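
With this patch run_code_graph_pipeline becomes an async generator, so callers iterate it directly instead of awaiting it first; a minimal consumption sketch (the repository path is a placeholder):

    import asyncio

    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

    async def main():
        # Document-ingestion results stream first, then the code-graph tasks.
        async for result in run_code_graph_pipeline("/path/to/repo", include_docs=True):
            print(result)

    asyncio.run(main())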

From 399faf9ca0e445e477e7c8e2201ec3478d62e928 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Fri, 20 Dec 2024 13:37:24 +0100
Subject: [PATCH 02/17] Fix review findings

---
 cognee/tasks/repo_processor/get_non_code_files.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py
index 5a8a34f64..671b998d9 100644
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@@ -20,14 +20,26 @@ async def get_non_py_files(repo_path):
     if not os.path.exists(repo_path):
         return []
 
+    IGNORED_PATTERNS = {
+        '.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd',
+        'node_modules', '*.egg-info'
+    }
+
+    def should_process(path):
+        return not any(pattern in path for pattern in IGNORED_PATTERNS)
+
     non_py_files_paths = [
         os.path.join(root, file)
-        for root, _, files in os.walk(repo_path) for file in files if not file.endswith(".py")
+        for root, _, files in os.walk(repo_path) for file in files 
+        if not file.endswith(".py") and should_process(os.path.join(root, file))
     ]
     return non_py_files_paths
 
 
 async def get_data_list_for_user(_, dataset_name, user):
+    # Note: This method is meant to be used as a Task in a pipeline.
+    # By the nature of pipelines, the output of the previous Task is passed as the first argument,
+    # but it is not needed here, hence the "_" parameter.
     datasets = await get_datasets_by_name(dataset_name, user.id)
     data_documents: list[Data] = []
     for dataset in datasets:
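
The substring test `pattern in path` treats entries like '*.pyc' literally, so the wildcard patterns in IGNORED_PATTERNS never match a real path. A glob-aware variant (a sketch, not what this patch ships) could match path components with fnmatch:

    import os
    from fnmatch import fnmatch

    IGNORED_PATTERNS = {
        '.git', '__pycache__', '*.pyc', '*.pyo', '*.pyd',
        'node_modules', '*.egg-info'
    }

    def should_process(path):
        # Compare every path component against every glob pattern,
        # so "*.pyc" excludes build/x.pyc without relying on substrings.
        return not any(
            fnmatch(part, pattern)
            for part in path.split(os.sep)
            for pattern in IGNORED_PATTERNS
        )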

From 4cee9a16ce16048aedf4678201ed16b5ce22273b Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:22:45 +0100
Subject: [PATCH 03/17] fix: add allowed extensions

---
 .../repo_processor/get_non_code_files.py      | 21 ++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/cognee/tasks/repo_processor/get_non_code_files.py b/cognee/tasks/repo_processor/get_non_code_files.py
index 671b998d9..f060782b6 100644
--- a/cognee/tasks/repo_processor/get_non_code_files.py
+++ b/cognee/tasks/repo_processor/get_non_code_files.py
@@ -25,8 +25,27 @@ async def get_non_py_files(repo_path):
         'node_modules', '*.egg-info'
     }
 
+    ALLOWED_EXTENSIONS = {
+        '.txt', '.md', '.csv', '.json', '.xml', '.yaml', '.yml', '.html',
+        '.css', '.js', '.ts', '.jsx', '.tsx', '.sql', '.log', '.ini',
+        '.toml', '.properties', '.sh', '.bash', '.dockerfile', '.gitignore',
+        '.gitattributes', '.makefile', '.pyproject', '.requirements',
+        '.env', '.pdf', '.doc', '.docx', '.dot', '.dotx', '.rtf',
+        '.wps', '.wpd', '.odt', '.ott', '.ottx', '.wp',
+        '.sdw', '.sdx', '.docm', '.dotm',
+        # Additional extensions for other programming languages
+        '.java', '.c', '.cpp', '.h', '.cs', '.go', '.php', '.rb',
+        '.swift', '.pl', '.lua', '.rs', '.scala', '.kt',
+        '.v', '.asm', '.pas', '.d', '.ml', '.clj', '.cljs',
+        '.erl', '.ex', '.exs', '.f', '.fs', '.r', '.pyi',
+        '.pdb', '.ipynb', '.rmd', '.cabal', '.hs', '.nim',
+        '.vhdl', '.verilog', '.svelte', '.scss',
+        '.less', '.json5'
+    }
+
     def should_process(path):
-        return not any(pattern in path for pattern in IGNORED_PATTERNS)
+        _, ext = os.path.splitext(path)
+        return ext in ALLOWED_EXTENSIONS and not any(pattern in path for pattern in IGNORED_PATTERNS)
 
     non_py_files_paths = [
         os.path.join(root, file)
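
A quick spot-check of the patched should_process (hypothetical paths, for illustration):

    print(should_process("docs/README.md"))        # True: ".md" is allowed
    print(should_process("src/__pycache__/a.md"))  # False: ignored directory
    print(should_process("assets/logo.png"))       # False: ".png" is not allowed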

From dbc33a6478944991f93223a3e443c3acf12460a7 Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:23:55 +0100
Subject: [PATCH 04/17] fix: align UnstructuredDocument.read() signature with Document

---
 .../data/processing/document_types/UnstructuredDocument.py      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index 62632cd08..8da065ff5 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -8,7 +8,7 @@
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int):
+    def read(self, chunk_size: int, chunker = str) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition

From 5e79dc53c55405217925c5a511a64efccb67578c Mon Sep 17 00:00:00 2001
From: lxobr <122801072+lxobr@users.noreply.github.com>
Date: Mon, 6 Jan 2025 11:25:04 +0100
Subject: [PATCH 05/17] feat: time code graph run and add mock support

---
 examples/python/code_graph_example.py | 62 ++++++++++++++++++++++++---
 1 file changed, 57 insertions(+), 5 deletions(-)

diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py
index 44ab33aad..9cd9f99c4 100644
--- a/examples/python/code_graph_example.py
+++ b/examples/python/code_graph_example.py
@@ -8,9 +8,61 @@ async def main(repo_path, include_docs):
     async for result in run_code_graph_pipeline(repo_path, include_docs):
         print(result)
 
-if __name__ == "__main__":
+def parse_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--repo_path", type=str, required=True, help="Path to the repository")
-    parser.add_argument("--include_docs", type=lambda x: x.lower() in ("true", "1"), default=True, help="Whether or not to process non-code files")
-    args = parser.parse_args()
-    asyncio.run(main(args.repo_path, args.include_docs))
\ No newline at end of file
+    parser.add_argument(
+        "--repo_path", 
+        type=str, 
+        required=True, 
+        help="Path to the repository"
+    )
+    parser.add_argument(
+        "--include_docs",
+        type=lambda x: x.lower() in ("true", "1"),
+        default=True,
+        help="Whether or not to process non-code files"
+    )
+    parser.add_argument(
+        "--mock_embedding",
+        type=lambda x: x.lower() in ("true", "1"), 
+        default=True,
+        help="Whether or not to mock embedding and code summary"
+    )
+    parser.add_argument(
+        "--mock_code_summary",
+        type=lambda x: x.lower() in ("true", "1"),
+        default=True, 
+        help="Whether or not to mock code summary"
+    )
+    parser.add_argument(
+        "--time",
+        type=lambda x: x.lower() in ("true", "1"),
+        default=True,
+        help="Whether or not to time the pipeline run"
+    )
+    return parser.parse_args()
+
+if __name__ == "__main__":
+    import os
+
+    args = parse_args()
+    
+    if args.mock_embedding:
+        os.environ["MOCK_EMBEDDING"] = "true"
+        print("Mocking embedding.")
+    
+    if args.mock_code_summary:
+        os.environ["MOCK_CODE_SUMMARY"] = "true"
+        print("Mocking code summary.")
+    
+    if args.time:
+        import time
+        start_time = time.time()
+        asyncio.run(main(args.repo_path, args.include_docs))
+        end_time = time.time()
+        print("\n" + "="*50)
+        print(f"Pipeline Execution Time: {end_time - start_time:.2f} seconds")
+        print("="*50 + "\n")
+    else:
+        asyncio.run(main(args.repo_path, args.include_docs))
+        
\ No newline at end of file
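
With these flags the example can be run end to end; a typical invocation (the repository path is a placeholder):

    python examples/python/code_graph_example.py --repo_path /path/to/repo --mock_embedding true --mock_code_summary true --time true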

From 4802567871d34b3926f2e3dfd33ea9655dedcb44 Mon Sep 17 00:00:00 2001
From: alekszievr <44192193+alekszievr@users.noreply.github.com>
Date: Tue, 7 Jan 2025 11:46:46 +0100
Subject: [PATCH 06/17] Overcome ContextWindowExceededError by checking token
 count while chunking (#413)

---
 cognee/api/v1/cognify/code_graph_pipeline.py  |  2 +-
 cognee/modules/chunking/TextChunker.py        | 23 +++++++++++---
 .../processing/document_types/Document.py     |  3 +-
 .../document_types/ImageDocument.py           | 10 ++++--
 .../processing/document_types/PdfDocument.py  | 10 ++++--
 .../processing/document_types/TextDocument.py |  9 ++++--
 .../document_types/UnstructuredDocument.py    |  8 +++--
 cognee/tasks/chunks/chunk_by_paragraph.py     | 31 ++++++++++++++++---
 .../extract_chunks_from_documents.py          | 12 +++++--
 9 files changed, 84 insertions(+), 24 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 3d31b4000..2648d0731 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
             Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
             Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
             Task(classify_documents),
-            Task(extract_chunks_from_documents),
+            Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192),
             Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
             Task(
                 summarize_text,
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
index 64c7aae5c..8ef4bfda9 100644
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -1,31 +1,45 @@
-from uuid import uuid5, NAMESPACE_OID
+from typing import Optional
+from uuid import NAMESPACE_OID, uuid5
 
-from .models.DocumentChunk import DocumentChunk
 from cognee.tasks.chunks import chunk_by_paragraph
 
+from .models.DocumentChunk import DocumentChunk
+
+
 class TextChunker():
     document = None
     max_chunk_size: int
 
     chunk_index = 0
     chunk_size = 0
+    token_count = 0
 
-    def __init__(self, document, get_text: callable, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text
+        self.max_tokens = max_tokens if max_tokens else float("inf")
+        self.embedding_model = embedding_model
+
+    def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
+        word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
+        token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
+        return word_count_fits and token_count_fits
 
     def read(self):
         paragraph_chunks = []
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
+                self.embedding_model,
+                self.max_tokens,
                 self.max_chunk_size,
                 batch_paragraphs = True,
             ):
-                if self.chunk_size + chunk_data["word_count"] <= self.max_chunk_size:
+                if self.check_word_count_and_token_count(self.chunk_size, self.token_count, chunk_data):
                     paragraph_chunks.append(chunk_data)
                     self.chunk_size += chunk_data["word_count"]
+                    self.token_count += chunk_data["token_count"]
                 else:
                     if len(paragraph_chunks) == 0:
                         yield DocumentChunk(
@@ -63,6 +77,7 @@ def read(self):
                             print(e)
                         paragraph_chunks = [chunk_data]
                         self.chunk_size = chunk_data["word_count"]
+                        self.token_count = chunk_data["token_count"]
 
                     self.chunk_index += 1
 
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 8d6a3dafb..6712175fb 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -1,3 +1,4 @@
+from typing import Optional
 from uuid import UUID
 
 from cognee.infrastructure.engine import DataPoint
@@ -13,5 +14,5 @@ class Document(DataPoint):
         "type": "Document"
     }
 
-    def read(self, chunk_size: int, chunker = str) -> str:
+    def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index 352486bd8..1f4f281f8 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -1,6 +1,10 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
+
 
 class ImageDocument(Document):
     type: str = "image"
@@ -10,11 +14,11 @@ def transcribe_image(self):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return(result.choices[0].message.content)
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
         # Transcribe the image file
         text = self.transcribe_image()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 361214718..27dadda33 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -1,11 +1,15 @@
+from typing import Optional
+
 from pypdf import PdfReader
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
+
 
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -14,7 +18,7 @@ def get_text():
                 yield page_text
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 3952d9845..895a6f8b6 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -1,10 +1,13 @@
-from .Document import Document
+from typing import Optional
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
+
 
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
         def get_text():
             with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                 while True:
@@ -17,6 +20,6 @@ def get_text():
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
 
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index 8da065ff5..c94ca4a25 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -1,14 +1,16 @@
 from io import StringIO
+from typing import Optional
 
 from cognee.modules.chunking.TextChunker import TextChunker
-from .Document import Document
 from cognee.modules.data.exceptions import UnstructuredLibraryImportError
 
+from .Document import Document
+
 
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker = str) -> str:
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -27,6 +29,6 @@ def get_text():
 
                 yield text
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text)
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 00bb5670c..546d4a1a7 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -1,8 +1,18 @@
-from uuid import uuid5, NAMESPACE_OID
-from typing import Dict, Any, Iterator
+from typing import Any, Dict, Iterator, Optional, Union
+from uuid import NAMESPACE_OID, uuid5
+
+import tiktoken
+
 from .chunk_by_sentence import chunk_by_sentence
 
-def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs: bool = True) -> Iterator[Dict[str, Any]]:
+
+def chunk_by_paragraph(
+        data: str, 
+        embedding_model: Optional[str], 
+        max_tokens: Optional[Union[int, float]], 
+        paragraph_length: int = 1024, 
+        batch_paragraphs: bool = True
+    ) -> Iterator[Dict[str, Any]]:
     """
     Chunks text by paragraph while preserving exact text reconstruction capability.
     When chunks are joined with empty string "", they reproduce the original text exactly.
@@ -12,14 +22,22 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
     chunk_index = 0
     paragraph_ids = []
     last_cut_type = None
+    current_token_count = 0
     
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
-        if current_word_count > 0 and current_word_count + word_count > paragraph_length:
+        if embedding_model:
+            tokenizer = tiktoken.encoding_for_model(embedding_model)
+            token_count = len(tokenizer.encode(sentence))
+        else:
+            token_count = 0
+
+        if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
             # Yield current chunk
             chunk_dict = {
                 "text": current_chunk,
                 "word_count": current_word_count,
+                "token_count": current_token_count,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "paragraph_ids": paragraph_ids,
                 "chunk_index": chunk_index,
@@ -32,11 +50,13 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
             paragraph_ids = []
             current_chunk = ""
             current_word_count = 0
+            current_token_count = 0
             chunk_index += 1
 
         paragraph_ids.append(paragraph_id)
         current_chunk += sentence
         current_word_count += word_count
+        current_token_count += token_count
         
         # Handle end of paragraph
         if end_type in ("paragraph_end", "sentence_cut") and not batch_paragraphs:
@@ -44,6 +64,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
             chunk_dict = {
                 "text": current_chunk,
                 "word_count": current_word_count,
+                "token_count": current_token_count,
                 "paragraph_ids": paragraph_ids,
                 "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
                 "chunk_index": chunk_index,
@@ -53,6 +74,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
             paragraph_ids = []
             current_chunk = ""
             current_word_count = 0
+            current_token_count = 0
             chunk_index += 1
         
         last_cut_type = end_type
@@ -62,6 +84,7 @@ def chunk_by_paragraph(data: str, paragraph_length: int = 1024, batch_paragraphs
         chunk_dict = {
             "text": current_chunk,
             "word_count": current_word_count,
+            "token_count": current_token_count,
             "chunk_id": uuid5(NAMESPACE_OID, current_chunk),
             "paragraph_ids": paragraph_ids,
             "chunk_index": chunk_index,
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index 423b87b69..ddcdb8765 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -1,7 +1,15 @@
+from typing import Optional
+
 from cognee.modules.data.processing.document_types.Document import Document
 
 
-async def extract_chunks_from_documents(documents: list[Document], chunk_size: int = 1024, chunker = 'text_chunker'):
+async def extract_chunks_from_documents(
+        documents: list[Document], 
+        chunk_size: int = 1024, 
+        chunker='text_chunker', 
+        embedding_model: Optional[str] = None, 
+        max_tokens: Optional[int] = None,
+        ):
     for document in documents:
-        for document_chunk in document.read(chunk_size = chunk_size, chunker = chunker):
+        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens):
             yield document_chunk
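
The token accounting introduced here can be sketched in isolation. Assuming tiktoken recognizes the embedding model name, the per-sentence count that feeds the max_tokens check is:

    import tiktoken

    # chunk_by_paragraph adds this per-sentence count to a running total and
    # starts a new chunk once the total would exceed max_tokens (8192 above).
    tokenizer = tiktoken.encoding_for_model("text-embedding-3-large")
    sentence = "Chunking now respects the embedding model's context window."
    print(len(tokenizer.encode(sentence)))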

From a774191ed3153442bbdc29a79e90f45c51bc5cc5 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Tue, 7 Jan 2025 13:38:23 +0100
Subject: [PATCH 07/17] Adjust AudioDocument and handle None token limit

---
 .../data/processing/document_types/AudioDocument.py    | 10 +++++++---
 cognee/tasks/chunks/chunk_by_paragraph.py              |  2 ++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index 268338703..a59064674 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -1,6 +1,10 @@
+from typing import Optional
+
 from cognee.infrastructure.llm.get_llm_client import get_llm_client
-from .Document import Document
+
 from .ChunkerMapping import ChunkerConfig
+from .Document import Document
+
 
 class AudioDocument(Document):
     type: str = "audio"
@@ -9,12 +13,12 @@ def create_transcript(self):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str):
+    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
         # Transcribe the audio file
         
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text])
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 546d4a1a7..2bbd9689f 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -23,6 +23,8 @@ def chunk_by_paragraph(
     paragraph_ids = []
     last_cut_type = None
     current_token_count = 0
+    if not max_tokens:
+        max_tokens = float("inf")
     
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
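
Normalizing an unset limit to infinity keeps the size comparison valid without a separate branch; a minimal illustration:

    max_tokens = None
    if not max_tokens:
        max_tokens = float("inf")
    # With no limit configured, the token check can never force a new chunk.
    print(100_000 > max_tokens)  # False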

From fb13a1b61a42c6b02ad85e70644c73aef722c1d7 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Tue, 7 Jan 2025 15:00:58 +0100
Subject: [PATCH 08/17] Handle Azure models as well

---
 cognee/tasks/chunks/chunk_by_paragraph.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 2bbd9689f..b3c191e29 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -29,6 +29,8 @@ def chunk_by_paragraph(
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
         if embedding_model:
+            if embedding_model.startswith("azure/"):
+                embedding_model = embedding_model.split("/")[-1]
             tokenizer = tiktoken.encoding_for_model(embedding_model)
             token_count = len(tokenizer.encode(sentence))
         else:

From 8ffef5034ae560c7514d21ae2b58d1f30013354d Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Wed, 8 Jan 2025 12:25:31 +0100
Subject: [PATCH 09/17] Add clean logging to code graph example

---
 examples/python/code_graph_example.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py
index 9cd9f99c4..afc83beb0 100644
--- a/examples/python/code_graph_example.py
+++ b/examples/python/code_graph_example.py
@@ -1,7 +1,9 @@
 import argparse
 import asyncio
+import logging
 
 from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline
+from cognee.shared.utils import setup_logging
 
 
 async def main(repo_path, include_docs):
@@ -43,6 +45,8 @@ def parse_args():
     return parser.parse_args()
 
 if __name__ == "__main__":
+    setup_logging(logging.ERROR)
+
     import os
 
     args = parse_args()

From f4397bf940e3a54a745ac1be19cadb9e33a28ae4 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Wed, 8 Jan 2025 12:33:14 +0100
Subject: [PATCH 10/17] Remove setting env vars from args

---
 examples/python/code_graph_example.py | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/examples/python/code_graph_example.py b/examples/python/code_graph_example.py
index afc83beb0..16eafb024 100644
--- a/examples/python/code_graph_example.py
+++ b/examples/python/code_graph_example.py
@@ -24,18 +24,6 @@ def parse_args():
         default=True,
         help="Whether or not to process non-code files"
     )
-    parser.add_argument(
-        "--mock_embedding",
-        type=lambda x: x.lower() in ("true", "1"), 
-        default=True,
-        help="Whether or not to mock embedding and code summary"
-    )
-    parser.add_argument(
-        "--mock_code_summary",
-        type=lambda x: x.lower() in ("true", "1"),
-        default=True, 
-        help="Whether or not to mock code summary"
-    )
     parser.add_argument(
         "--time",
         type=lambda x: x.lower() in ("true", "1"),
@@ -47,18 +35,8 @@ def parse_args():
 if __name__ == "__main__":
     setup_logging(logging.ERROR)
 
-    import os
-
     args = parse_args()
     
-    if args.mock_embedding:
-        os.environ["MOCK_EMBEDDING"] = "true"
-        print("Mocking embedding.")
-    
-    if args.mock_code_summary:
-        os.environ["MOCK_CODE_SUMMARY"] = "true"
-        print("Mocking code summary.")
-    
     if args.time:
         import time
         start_time = time.time()

From 34a9267f414efc9553509bfdbf63bbee6aa5be69 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Wed, 8 Jan 2025 13:23:17 +0100
Subject: [PATCH 11/17] Get the embedding engine from the vector engine instead
 of passing it in

---
 cognee/api/v1/cognify/code_graph_pipeline.py  |  2 +-
 cognee/modules/chunking/TextChunker.py        |  6 ++----
 .../document_types/AudioDocument.py           |  4 ++--
 .../processing/document_types/Document.py     |  2 +-
 .../document_types/ImageDocument.py           |  4 ++--
 .../processing/document_types/PdfDocument.py  |  4 ++--
 .../processing/document_types/TextDocument.py |  4 ++--
 .../document_types/UnstructuredDocument.py    |  4 ++--
 cognee/tasks/chunks/chunk_by_paragraph.py     | 19 ++++++++++---------
 .../extract_chunks_from_documents.py          |  3 +--
 10 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 2648d0731..7ba461f88 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
             Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
             Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
             Task(classify_documents),
-            Task(extract_chunks_from_documents, embedding_model=embedding_engine.model, max_tokens=8192),
+            Task(extract_chunks_from_documents, max_tokens=8192),
             Task(extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}),
             Task(
                 summarize_text,
diff --git a/cognee/modules/chunking/TextChunker.py b/cognee/modules/chunking/TextChunker.py
index 8ef4bfda9..a9cb52bf0 100644
--- a/cognee/modules/chunking/TextChunker.py
+++ b/cognee/modules/chunking/TextChunker.py
@@ -14,13 +14,12 @@ class TextChunker():
     chunk_size = 0
     token_count = 0
 
-    def __init__(self, document, get_text: callable, embedding_model: Optional[str] = None, max_tokens: Optional[int] = None, chunk_size: int = 1024):
+    def __init__(self, document, get_text: callable, max_tokens: Optional[int] = None, chunk_size: int = 1024):
         self.document = document
         self.max_chunk_size = chunk_size
         self.get_text = get_text
         self.max_tokens = max_tokens if max_tokens else float("inf")
-        self.embedding_model = embedding_model
-
+    
     def check_word_count_and_token_count(self, word_count_before, token_count_before, chunk_data):
         word_count_fits = word_count_before + chunk_data["word_count"] <= self.max_chunk_size
         token_count_fits = token_count_before + chunk_data["token_count"] <= self.max_tokens
@@ -31,7 +30,6 @@ def read(self):
         for content_text in self.get_text():
             for chunk_data in chunk_by_paragraph(
                 content_text,
-                self.embedding_model,
                 self.max_tokens,
                 self.max_chunk_size,
                 batch_paragraphs = True,
diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index a59064674..c4e6ae87c 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,12 +13,12 @@ def create_transcript(self):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return(result.text)
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         # Transcribe the audio file
         
         text = self.create_transcript()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 6712175fb..7c76d3f23 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -14,5 +14,5 @@ class Document(DataPoint):
         "type": "Document"
     }
 
-    def read(self, chunk_size: int, embedding_model: Optional[str], max_tokens: Optional[int], chunker = str) -> str:
+    def read(self, chunk_size: int, max_tokens: Optional[int], chunker = str) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index 1f4f281f8..ffe8ff3f9 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -14,11 +14,11 @@ def transcribe_image(self):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return(result.choices[0].message.content)
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         # Transcribe the image file
         text = self.transcribe_image()
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = lambda: [text], max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 27dadda33..463911d5b 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
@@ -18,7 +18,7 @@ def get_text():
                 yield page_text
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
 
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 895a6f8b6..582f47737 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
         def get_text():
             with open(self.raw_data_location, mode = "r", encoding = "utf-8") as file:
                 while True:
@@ -20,6 +20,6 @@ def get_text():
 
         chunker_func = ChunkerConfig.get_chunker(chunker)
 
-        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = chunker_func(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index c94ca4a25..6c70744a0 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, embedding_model:Optional[str], max_tokens: Optional[int]) -> str:
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
@@ -29,6 +29,6 @@ def get_text():
 
                 yield text
 
-        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, embedding_model=embedding_model, max_tokens=max_tokens)
+        chunker = TextChunker(self, chunk_size = chunk_size, get_text = get_text, max_tokens=max_tokens)
 
         yield from chunker.read()
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index b3c191e29..8ab66bd7f 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -3,12 +3,13 @@
 
 import tiktoken
 
+from cognee.infrastructure.databases.vector import get_vector_engine
+
 from .chunk_by_sentence import chunk_by_sentence
 
 
 def chunk_by_paragraph(
         data: str, 
-        embedding_model: Optional[str], 
         max_tokens: Optional[Union[int, float]], 
         paragraph_length: int = 1024, 
         batch_paragraphs: bool = True
@@ -26,16 +27,16 @@ def chunk_by_paragraph(
     if not max_tokens:
         max_tokens = float("inf")
     
+    vector_engine = get_vector_engine()
+    embedding_model = vector_engine.embedding_engine.model
+
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
-        if embedding_model:
-            if embedding_model.startswith("azure/"):
-                embedding_model = embedding_model.split("/")[-1]
-            tokenizer = tiktoken.encoding_for_model(embedding_model)
-            token_count = len(tokenizer.encode(sentence))
-        else:
-            token_count = 0
-
+        
+        embedding_model = embedding_model.split("/")[-1]
+        tokenizer = tiktoken.encoding_for_model(embedding_model)
+        token_count = len(tokenizer.encode(sentence))
+    
         if current_word_count > 0 and (current_word_count + word_count > paragraph_length or current_token_count + token_count > max_tokens):
             # Yield current chunk
             chunk_dict = {
diff --git a/cognee/tasks/documents/extract_chunks_from_documents.py b/cognee/tasks/documents/extract_chunks_from_documents.py
index ddcdb8765..e647afbef 100644
--- a/cognee/tasks/documents/extract_chunks_from_documents.py
+++ b/cognee/tasks/documents/extract_chunks_from_documents.py
@@ -7,9 +7,8 @@ async def extract_chunks_from_documents(
         documents: list[Document], 
         chunk_size: int = 1024, 
         chunker='text_chunker', 
-        embedding_model: Optional[str] = None, 
         max_tokens: Optional[int] = None,
         ):
     for document in documents:
-        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, embedding_model=embedding_model, max_tokens=max_tokens):
+        for document_chunk in document.read(chunk_size=chunk_size, chunker=chunker, max_tokens=max_tokens):
             yield document_chunk
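
After this patch the chunker resolves the model itself rather than threading it through every Document.read() signature; a sketch of the lookup it now performs internally:

    from cognee.infrastructure.databases.vector import get_vector_engine

    vector_engine = get_vector_engine()
    embedding_model = vector_engine.embedding_engine.model
    # Drop any provider prefix such as "azure/" before handing it to tiktoken.
    print(embedding_model.split("/")[-1])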

From 97814e334f282b344cb0357df387b70cbf801397 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Wed, 8 Jan 2025 13:45:04 +0100
Subject: [PATCH 12/17] Get the embedding engine inside code chunking instead
 of passing it in

---
 cognee/api/v1/cognify/code_graph_pipeline.py          | 6 +-----
 cognee/tasks/chunks/chunk_by_paragraph.py             | 4 ++--
 cognee/tasks/repo_processor/get_source_code_chunks.py | 9 ++++++---
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 7ba461f88..6e06edfa3 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -3,8 +3,6 @@
 from pathlib import Path
 
 from cognee.base_config import get_base_config
-from cognee.infrastructure.databases.vector.embeddings import \
-    get_embedding_engine
 from cognee.modules.cognify.config import get_cognify_config
 from cognee.modules.pipelines import run_tasks
 from cognee.modules.pipelines.tasks.Task import Task
@@ -51,8 +49,6 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
     await cognee.prune.prune_system(metadata=True)
     await create_db_and_tables()
 
-    embedding_engine = get_embedding_engine()
-
     cognee_config = get_cognify_config()
     user = await get_default_user()
 
@@ -60,7 +56,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
         Task(get_repo_file_dependencies),
         Task(enrich_dependency_graph),
         Task(expand_dependency_graph, task_config={"batch_size": 50}),
-        Task(get_source_code_chunks, embedding_model=embedding_engine.model, task_config={"batch_size": 50}),
+        Task(get_source_code_chunks, task_config={"batch_size": 50}),
         Task(summarize_code, task_config={"batch_size": 50}),
         Task(add_data_points, task_config={"batch_size": 50}),
     ]
diff --git a/cognee/tasks/chunks/chunk_by_paragraph.py b/cognee/tasks/chunks/chunk_by_paragraph.py
index 8ab66bd7f..44355a1ad 100644
--- a/cognee/tasks/chunks/chunk_by_paragraph.py
+++ b/cognee/tasks/chunks/chunk_by_paragraph.py
@@ -29,11 +29,11 @@ def chunk_by_paragraph(
     
     vector_engine = get_vector_engine()
     embedding_model = vector_engine.embedding_engine.model
-
+    embedding_model = embedding_model.split("/")[-1]
+        
     for paragraph_id, sentence, word_count, end_type in chunk_by_sentence(data, maximum_length=paragraph_length):
         # Check if this sentence would exceed length limit
         
-        embedding_model = embedding_model.split("/")[-1]
         tokenizer = tiktoken.encoding_for_model(embedding_model)
         token_count = len(tokenizer.encode(sentence))
     
diff --git a/cognee/tasks/repo_processor/get_source_code_chunks.py b/cognee/tasks/repo_processor/get_source_code_chunks.py
index 4d0ce3200..0bf7ebe32 100644
--- a/cognee/tasks/repo_processor/get_source_code_chunks.py
+++ b/cognee/tasks/repo_processor/get_source_code_chunks.py
@@ -5,6 +5,7 @@
 import parso
 import tiktoken
 
+from cognee.infrastructure.databases.vector import get_vector_engine
 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, CodePart, SourceCodeChunk
 
@@ -115,13 +116,15 @@ def get_source_code_chunks_from_code_part(
         max_tokens: int = 8192,
         overlap: float = 0.25,
         granularity: float = 0.1,
-        model_name: str = "text-embedding-3-large"
 ) -> Generator[SourceCodeChunk, None, None]:
     """Yields source code chunks from a CodePart object, with configurable token limits and overlap."""
     if not code_file_part.source_code:
         logger.error(f"No source code in CodeFile {code_file_part.id}")
         return
 
+    vector_engine = get_vector_engine()
+    embedding_model = vector_engine.embedding_engine.model
+    model_name = embedding_model.split("/")[-1]
     tokenizer = tiktoken.encoding_for_model(model_name)
     max_subchunk_tokens = max(1, int(granularity * max_tokens))
     subchunk_token_counts = _get_subchunk_token_counts(tokenizer, code_file_part.source_code, max_subchunk_tokens)
@@ -141,7 +144,7 @@ def get_source_code_chunks_from_code_part(
         previous_chunk = current_chunk
 
 
-async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="text-embedding-3-large") -> \
+async def get_source_code_chunks(data_points: list[DataPoint]) -> \
         AsyncGenerator[list[DataPoint], None]:
     """Processes code graph datapoints, create SourceCodeChink datapoints."""
     # TODO: Add support for other embedding models, with max_token mapping
@@ -156,7 +159,7 @@ async def get_source_code_chunks(data_points: list[DataPoint], embedding_model="
             for code_part in data_point.contains:
                 try:
                     yield code_part
-                    for source_code_chunk in get_source_code_chunks_from_code_part(code_part, model_name=embedding_model):
+                    for source_code_chunk in get_source_code_chunks_from_code_part(code_part):
                         yield source_code_chunk
                 except Exception as e:
                     logger.error(f"Error processing code part: {e}")
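
The subchunk budget in get_source_code_chunks_from_code_part follows directly from the defaults above; worked through with those numbers:

    max_tokens = 8192
    granularity = 0.1

    # Each subchunk may hold at most granularity * max_tokens tokens.
    max_subchunk_tokens = max(1, int(granularity * max_tokens))
    print(max_subchunk_tokens)  # 819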

From abb3ea6d219f8221500fb7a7e7f6cc404cf75b08 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Thu, 9 Jan 2025 11:31:16 +0100
Subject: [PATCH 13/17] Adjust integration tests

---
 .../data/processing/document_types/AudioDocument.py       | 2 +-
 cognee/modules/data/processing/document_types/Document.py | 2 +-
 .../data/processing/document_types/ImageDocument.py       | 2 +-
 .../modules/data/processing/document_types/PdfDocument.py | 2 +-
 .../data/processing/document_types/TextDocument.py        | 2 +-
 .../processing/document_types/UnstructuredDocument.py     | 2 +-
 .../integration/documents/UnstructuredDocument_test.py    | 8 ++++----
 7 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cognee/modules/data/processing/document_types/AudioDocument.py b/cognee/modules/data/processing/document_types/AudioDocument.py
index faace056b..b7d2476b4 100644
--- a/cognee/modules/data/processing/document_types/AudioDocument.py
+++ b/cognee/modules/data/processing/document_types/AudioDocument.py
@@ -13,7 +13,7 @@ def create_transcript(self):
         result = get_llm_client().create_transcript(self.raw_data_location)
         return result.text
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the audio file
 
         text = self.create_transcript()
diff --git a/cognee/modules/data/processing/document_types/Document.py b/cognee/modules/data/processing/document_types/Document.py
index 9a29e7797..7ecdf289e 100644
--- a/cognee/modules/data/processing/document_types/Document.py
+++ b/cognee/modules/data/processing/document_types/Document.py
@@ -11,5 +11,5 @@ class Document(DataPoint):
     mime_type: str
     _metadata: dict = {"index_fields": ["name"], "type": "Document"}
 
-    def read(self, chunk_size: int, max_tokens: Optional[int], chunker=str) -> str:
+    def read(self, chunk_size: int, chunker: str = "text_chunker", max_tokens: Optional[int] = None) -> str:
         pass
diff --git a/cognee/modules/data/processing/document_types/ImageDocument.py b/cognee/modules/data/processing/document_types/ImageDocument.py
index f0c7a6d61..c055b8253 100644
--- a/cognee/modules/data/processing/document_types/ImageDocument.py
+++ b/cognee/modules/data/processing/document_types/ImageDocument.py
@@ -13,7 +13,7 @@ def transcribe_image(self):
         result = get_llm_client().transcribe_image(self.raw_data_location)
         return result.choices[0].message.content
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         # Transcribe the image file
         text = self.transcribe_image()
 
diff --git a/cognee/modules/data/processing/document_types/PdfDocument.py b/cognee/modules/data/processing/document_types/PdfDocument.py
index 56969c7f8..768f91264 100644
--- a/cognee/modules/data/processing/document_types/PdfDocument.py
+++ b/cognee/modules/data/processing/document_types/PdfDocument.py
@@ -9,7 +9,7 @@
 class PdfDocument(Document):
     type: str = "pdf"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         file = PdfReader(self.raw_data_location)
 
         def get_text():
diff --git a/cognee/modules/data/processing/document_types/TextDocument.py b/cognee/modules/data/processing/document_types/TextDocument.py
index 11dc798aa..b62ccd56e 100644
--- a/cognee/modules/data/processing/document_types/TextDocument.py
+++ b/cognee/modules/data/processing/document_types/TextDocument.py
@@ -7,7 +7,7 @@
 class TextDocument(Document):
     type: str = "text"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]):
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None):
         def get_text():
             with open(self.raw_data_location, mode="r", encoding="utf-8") as file:
                 while True:
diff --git a/cognee/modules/data/processing/document_types/UnstructuredDocument.py b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
index d6b64498c..1c291d0dc 100644
--- a/cognee/modules/data/processing/document_types/UnstructuredDocument.py
+++ b/cognee/modules/data/processing/document_types/UnstructuredDocument.py
@@ -10,7 +10,7 @@
 class UnstructuredDocument(Document):
     type: str = "unstructured"
 
-    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int]) -> str:
+    def read(self, chunk_size: int, chunker: str, max_tokens: Optional[int] = None) -> str:
         def get_text():
             try:
                 from unstructured.partition.auto import partition
diff --git a/cognee/tests/integration/documents/UnstructuredDocument_test.py b/cognee/tests/integration/documents/UnstructuredDocument_test.py
index 03b8deb49..e0278de81 100644
--- a/cognee/tests/integration/documents/UnstructuredDocument_test.py
+++ b/cognee/tests/integration/documents/UnstructuredDocument_test.py
@@ -68,7 +68,7 @@ def test_UnstructuredDocument():
     )
 
     # Test PPTX
-    for paragraph_data in pptx_document.read(chunk_size=1024):
+    for paragraph_data in pptx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 19 == paragraph_data.word_count, f" 19 != {paragraph_data.word_count = }"
         assert 104 == len(paragraph_data.text), f" 104 != {len(paragraph_data.text) = }"
         assert (
@@ -76,7 +76,7 @@ def test_UnstructuredDocument():
         ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test DOCX
-    for paragraph_data in docx_document.read(chunk_size=1024):
+    for paragraph_data in docx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 16 == paragraph_data.word_count, f" 16 != {paragraph_data.word_count = }"
         assert 145 == len(paragraph_data.text), f" 145 != {len(paragraph_data.text) = }"
         assert (
@@ -84,7 +84,7 @@ def test_UnstructuredDocument():
         ), f" sentence_end != {paragraph_data.cut_type = }"
 
     # TEST CSV
-    for paragraph_data in csv_document.read(chunk_size=1024):
+    for paragraph_data in csv_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 15 == paragraph_data.word_count, f" 15 != {paragraph_data.word_count = }"
         assert (
             "A A A A A A A A A,A A A A A A,A A" == paragraph_data.text
@@ -94,7 +94,7 @@ def test_UnstructuredDocument():
         ), f" sentence_cut != {paragraph_data.cut_type = }"
 
     # Test XLSX
-    for paragraph_data in xlsx_document.read(chunk_size=1024):
+    for paragraph_data in xlsx_document.read(chunk_size=1024, chunker="text_chunker"):
         assert 36 == paragraph_data.word_count, f" 36 != {paragraph_data.word_count = }"
         assert 171 == len(paragraph_data.text), f" 171 != {len(paragraph_data.text) = }"
         assert (

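Taken together, these signature changes make max_tokens keyword-optional across every document type, so existing call sites keep working while new ones can cap chunk size. A minimal sketch of both call forms, assuming a TextDocument; the constructor fields shown are placeholders and may not match the model's full required field set:

    from cognee.modules.data.processing.document_types.TextDocument import TextDocument

    # Field values are illustrative only.
    document = TextDocument(
        name="notes",
        raw_data_location="notes.txt",
        mime_type="text/plain",
    )

    # max_tokens can now be omitted entirely...
    for chunk in document.read(chunk_size=1024, chunker="text_chunker"):
        print(chunk.text)

    # ...or passed explicitly to cap each chunk's token count.
    for chunk in document.read(chunk_size=1024, chunker="text_chunker", max_tokens=8192):
        print(chunk.text)
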
From cdaae161a8e006415988a45ca529d5bc71f9d632 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Thu, 9 Jan 2025 12:08:42 +0100
Subject: [PATCH 14/17] Handle circular import

---
 cognee/tasks/repo_processor/__init__.py                | 3 ---
 cognee/tasks/repo_processor/expand_dependency_graph.py | 5 ++++-
 cognee/tasks/repo_processor/extract_code_parts.py      | 4 +++-
 cognee/tasks/repo_processor/get_local_dependencies.py  | 4 +++-
 4 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/cognee/tasks/repo_processor/__init__.py b/cognee/tasks/repo_processor/__init__.py
index 6dc032547..8f0df23d8 100644
--- a/cognee/tasks/repo_processor/__init__.py
+++ b/cognee/tasks/repo_processor/__init__.py
@@ -2,6 +2,3 @@
 from .expand_dependency_graph import expand_dependency_graph
 from .get_non_code_files import get_data_list_for_user, get_non_py_files
 from .get_repo_file_dependencies import get_repo_file_dependencies
-import logging
-
-logger = logging.getLogger("task:repo_processor")
diff --git a/cognee/tasks/repo_processor/expand_dependency_graph.py b/cognee/tasks/repo_processor/expand_dependency_graph.py
index de26fe8d4..d3f5d1b07 100644
--- a/cognee/tasks/repo_processor/expand_dependency_graph.py
+++ b/cognee/tasks/repo_processor/expand_dependency_graph.py
@@ -5,7 +5,10 @@
 from cognee.infrastructure.engine import DataPoint
 from cognee.shared.CodeGraphEntities import CodeFile, CodePart
 from cognee.tasks.repo_processor.extract_code_parts import extract_code_parts
-from cognee.tasks.repo_processor import logger
+
+import logging
+
+logger = logging.getLogger("task:repo_processor")
 
 
 def _add_code_parts_nodes_and_edges(code_file: CodeFile, part_type, code_parts) -> None:
diff --git a/cognee/tasks/repo_processor/extract_code_parts.py b/cognee/tasks/repo_processor/extract_code_parts.py
index 76cfef538..c181a87d9 100644
--- a/cognee/tasks/repo_processor/extract_code_parts.py
+++ b/cognee/tasks/repo_processor/extract_code_parts.py
@@ -1,7 +1,9 @@
 from typing import Dict, List
 import parso
 
-from cognee.tasks.repo_processor import logger
+import logging
+
+logger = logging.getLogger("task:repo_processor")
 
 
 def _extract_parts_from_module(module, parts_dict: Dict[str, List[str]]) -> Dict[str, List[str]]:
diff --git a/cognee/tasks/repo_processor/get_local_dependencies.py b/cognee/tasks/repo_processor/get_local_dependencies.py
index b443829c9..92b50cd0b 100644
--- a/cognee/tasks/repo_processor/get_local_dependencies.py
+++ b/cognee/tasks/repo_processor/get_local_dependencies.py
@@ -10,7 +10,9 @@
 import parso
 from parso.tree import BaseNode
 
-from cognee.tasks.repo_processor import logger
+import logging
+
+logger = logging.getLogger("task:repo_processor")
 
 
 @contextmanager

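The fix replaces the logger exported from the package __init__ with a per-module logger. Since logging.getLogger returns the same object for a given name, behaviour is unchanged, but no module needs to import from the package __init__ anymore, which is what closed the cycle (module -> package __init__ -> module). A minimal sketch of the pattern each task module now follows; the function name is a placeholder:

    import logging

    # Same logger name as before, so any handlers or levels configured
    # for "task:repo_processor" still apply to this module's output.
    logger = logging.getLogger("task:repo_processor")

    def some_repo_task() -> None:
        logger.info("running")
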
From 626bc76f5ccdb830e741cf74464e1e3a967dec75 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Thu, 9 Jan 2025 12:53:26 +0100
Subject: [PATCH 15/17] Set max_tokens in config

---
 cognee/api/v1/cognify/code_graph_pipeline.py | 2 +-
 cognee/modules/cognify/config.py             | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/cognee/api/v1/cognify/code_graph_pipeline.py b/cognee/api/v1/cognify/code_graph_pipeline.py
index 53c41d43b..2d077f39b 100644
--- a/cognee/api/v1/cognify/code_graph_pipeline.py
+++ b/cognee/api/v1/cognify/code_graph_pipeline.py
@@ -71,7 +71,7 @@ async def run_code_graph_pipeline(repo_path, include_docs=True):
             Task(ingest_data_with_metadata, dataset_name="repo_docs", user=user),
             Task(get_data_list_for_user, dataset_name="repo_docs", user=user),
             Task(classify_documents),
-            Task(extract_chunks_from_documents, max_tokens=8192),
+            Task(extract_chunks_from_documents, max_tokens=cognee_config.max_tokens),
             Task(
                 extract_graph_from_data, graph_model=KnowledgeGraph, task_config={"batch_size": 50}
             ),
diff --git a/cognee/modules/cognify/config.py b/cognee/modules/cognify/config.py
index d40410bfc..dd94d8b41 100644
--- a/cognee/modules/cognify/config.py
+++ b/cognee/modules/cognify/config.py
@@ -1,12 +1,13 @@
 from functools import lru_cache
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from cognee.shared.data_models import DefaultContentPrediction, SummarizedContent
+from typing import Optional
 
 
 class CognifyConfig(BaseSettings):
     classification_model: object = DefaultContentPrediction
     summarization_model: object = SummarizedContent
-
+    max_tokens: Optional[int] = None
     model_config = SettingsConfigDict(env_file=".env", extra="allow")
 
     def to_dict(self) -> dict:

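Declaring max_tokens as a plain Optional[int] field lets pydantic-settings resolve it from the environment or from .env (matching is case-insensitive, so MAX_TOKENS works) and coerce the string to an int; a hand-rolled os.getenv default would have left an unvalidated string on the field. A short sketch, assuming get_cognify_config is the usual lru_cache-decorated accessor, so the variable must be set before the first call:

    import os

    from cognee.modules.cognify.config import get_cognify_config

    # Set before the first get_cognify_config() call: the cached accessor
    # will not pick up later environment changes.
    os.environ["MAX_TOKENS"] = "8192"

    config = get_cognify_config()
    assert config.max_tokens == 8192  # coerced from str to int by pydantic
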
From d7b2186300db585ee5dd1affb20e85e3017a1606 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Thu, 9 Jan 2025 14:27:37 +0100
Subject: [PATCH 16/17] Adjust SWE-bench script to the code graph pipeline call

---
 evals/eval_swe_bench.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py
index 789c95ab4..509530685 100644
--- a/evals/eval_swe_bench.py
+++ b/evals/eval_swe_bench.py
@@ -34,9 +34,8 @@ def check_install_package(package_name):
 
 async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
     repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")
-    pipeline = await run_code_graph_pipeline(repo_path)
 
-    async for result in pipeline:
+    async for result in run_code_graph_pipeline(repo_path, include_docs=True):
         print(result)
 
     print("Here we have the repo under the repo_path")
@@ -47,7 +46,16 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp
     instructions = read_query_prompt("patch_gen_kg_instructions.txt")
 
     retrieved_edges = await brute_force_triplet_search(
-        problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"]
+        problem_statement,
+        top_k=3,
+        collections=[
+            "code_summary_text",
+            "data_point_name",
+            "document_chunk_text",
+            "entity_name",
+            "entity_type_name",
+            "sourcecodechunk_source_code",
+        ],
     )
 
     retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)

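The call-site change assumes run_code_graph_pipeline now yields results directly (i.e., it is an async generator rather than a coroutine returning one), so the correct pattern is to iterate it without a prior await; awaiting an async generator object would raise a TypeError. A minimal standalone sketch, with the repo path as a placeholder:

    import asyncio

    from cognee.api.v1.cognify.code_graph_pipeline import run_code_graph_pipeline

    async def main() -> None:
        # Iterate the async generator directly; do not `await` it first.
        async for result in run_code_graph_pipeline("/path/to/repo", include_docs=True):
            print(result)

    asyncio.run(main())
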
From 18bb282fbc7fb3c7e770641bd0b64fa38af7dd92 Mon Sep 17 00:00:00 2001
From: Rita Aleksziev <alekszievr@gmail.com>
Date: Thu, 9 Jan 2025 14:27:37 +0100
Subject: [PATCH 17/17] Adjust SWE-bench script to the code graph pipeline call

---
 evals/eval_swe_bench.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/evals/eval_swe_bench.py b/evals/eval_swe_bench.py
index 789c95ab4..20e005751 100644
--- a/evals/eval_swe_bench.py
+++ b/evals/eval_swe_bench.py
@@ -34,9 +34,8 @@ def check_install_package(package_name):
 
 async def generate_patch_with_cognee(instance, llm_client, search_type=SearchType.CHUNKS):
     repo_path = download_github_repo(instance, "../RAW_GIT_REPOS")
-    pipeline = await run_code_graph_pipeline(repo_path)
 
-    async for result in pipeline:
+    async for result in run_code_graph_pipeline(repo_path, include_docs=True):
         print(result)
 
     print("Here we have the repo under the repo_path")
@@ -47,7 +46,9 @@ async def generate_patch_with_cognee(instance, llm_client, search_type=SearchTyp
     instructions = read_query_prompt("patch_gen_kg_instructions.txt")
 
     retrieved_edges = await brute_force_triplet_search(
-        problem_statement, top_k=3, collections=["data_point_source_code", "data_point_text"]
+        problem_statement,
+        top_k=3,
+        collections=["code_summary_text"],
     )
 
     retrieved_edges_str = retrieved_edges_to_string(retrieved_edges)
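
After this final revision the retrieval step queries a single collection. A sketch of the resulting flow, usable inside evals/eval_swe_bench.py where brute_force_triplet_search and retrieved_edges_to_string are already imported (their import paths are not shown here); the wrapper function name is hypothetical:

    async def retrieve_context(problem_statement: str) -> str:
        # Narrowing from the six collections tried in the previous patch to
        # code summaries alone trades recall for precision in the triplets.
        retrieved_edges = await brute_force_triplet_search(
            problem_statement,
            top_k=3,
            collections=["code_summary_text"],
        )
        return retrieved_edges_to_string(retrieved_edges)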