Skip to content

Commit

Permalink
Merge pull request #80 from bhavnicksm/development
Browse files Browse the repository at this point in the history
[Fix] Unify dataclasses under a types.py for ease
  • Loading branch information
bhavnicksm authored Dec 6, 2024
2 parents 1e784c2 + 837e41a commit 98af028
Show file tree
Hide file tree
Showing 18 changed files with 321 additions and 282 deletions.
14 changes: 8 additions & 6 deletions src/chonkie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,12 @@

from .chunker import (
BaseChunker,
Chunk,
SDPMChunker,
SemanticChunk,
SemanticChunker,
SemanticSentence,
Sentence,
SentenceChunk,
SentenceChunker,
TokenChunker,
WordChunker,
)
from .context import Context
from .embeddings import (
AutoEmbeddings,
BaseEmbeddings,
Expand All @@ -25,6 +19,14 @@
BaseRefinery,
OverlapRefinery,
)
from .types import (
Chunk,
Context,
SemanticChunk,
SemanticSentence,
Sentence,
SentenceChunk,
)

__version__ = "0.2.1.post1"
__name__ = "chonkie"
Expand Down
13 changes: 5 additions & 8 deletions src/chonkie/chunker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
from .base import BaseChunker, Chunk
"""Module for chunkers."""

from .base import BaseChunker
from .sdpm import SDPMChunker
from .semantic import SemanticChunk, SemanticChunker, SemanticSentence
from .sentence import Sentence, SentenceChunk, SentenceChunker
from .semantic import SemanticChunker
from .sentence import SentenceChunker
from .token import TokenChunker
from .word import WordChunker

__all__ = [
"Chunk",
"BaseChunker",
"TokenChunker",
"WordChunker",
"Sentence",
"SentenceChunk",
"SentenceChunker",
"SemanticSentence",
"SemanticChunk",
"SemanticChunker",
"SDPMChunker",
]
66 changes: 2 additions & 64 deletions src/chonkie/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,72 +4,10 @@
import inspect
import warnings
from abc import ABC, abstractmethod
from dataclasses import dataclass
from multiprocessing import Pool, cpu_count
from typing import Any, Callable, List, Optional, Union
from typing import Any, Callable, List, Union

from chonkie.context import Context


@dataclass
class Chunk:
    """Dataclass representing a text chunk with metadata.

    Attributes:
        text: The text content of the chunk
        start_index: The starting index of the chunk in the original text
        end_index: The ending index of the chunk in the original text
        token_count: The number of tokens in the chunk
        context: The context of the chunk, useful for refinery classes

    """

    text: str
    start_index: int
    end_index: int
    token_count: int
    # Forward reference: Context is imported at module level from chonkie.context.
    context: Optional["Context"] = None

    def __str__(self) -> str:
        """Return the chunk's text content."""
        return self.text

    def __len__(self) -> int:
        """Return the length of the chunk text."""
        return len(self.text)

    def __repr__(self) -> str:
        """Return a debug representation of the chunk.

        The ``context`` field is included only when a context is attached.

        BUG FIX: the original condition was inverted — it omitted the context
        when one was present, and printed ``context=None`` when absent.
        """
        if self.context is None:
            return (
                f"Chunk(text={self.text}, start_index={self.start_index}, "
                f"end_index={self.end_index}, token_count={self.token_count})"
            )
        return (
            f"Chunk(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"context={self.context})"
        )

    def __iter__(self):
        """Return an iterator over the characters of the chunk text."""
        return iter(self.text)

    def __getitem__(self, index: int):
        """Return the character (or slice) of the chunk text at ``index``."""
        return self.text[index]

    def copy(self) -> "Chunk":
        """Return a copy of the chunk.

        BUG FIX: the original dropped ``context`` from the copy despite its
        docstring promising a deep copy; all fields are now carried over.
        """
        return Chunk(
            text=self.text,
            start_index=self.start_index,
            end_index=self.end_index,
            token_count=self.token_count,
            context=self.context,
        )
from chonkie.types import Chunk


class BaseChunker(ABC):
Expand Down
9 changes: 6 additions & 3 deletions src/chonkie/chunker/sdpm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union

from .semantic import SemanticChunk, SemanticChunker, Sentence
from chonkie.types import SemanticChunk, Sentence

from .semantic import SemanticChunker


class SDPMChunker(SemanticChunker):
Expand All @@ -23,7 +26,7 @@ class SDPMChunker(SemanticChunker):
Methods:
chunk: Split text into chunks using the SDPM approach.
"""

def __init__(
Expand Down Expand Up @@ -133,7 +136,7 @@ def chunk(self, text: str) -> List[SemanticChunk]:
sentences = self._prepare_sentences(text)
if len(sentences) <= self.min_sentences:
return [self._create_chunk(sentences)]

# Calculate similarity threshold
self.similarity_threshold = self._calculate_similarity_threshold(sentences)

Expand Down
Loading

0 comments on commit 98af028

Please sign in to comment.