Skip to content

Commit

Permalink
Merge pull request #80 from bhavnicksm/development
Browse files Browse the repository at this point in the history
[Fix] Unify dataclasses under a types.py for ease
  • Loading branch information
bhavnicksm authored Dec 6, 2024
2 parents 1e784c2 + 837e41a commit 98af028
Show file tree
Hide file tree
Showing 18 changed files with 321 additions and 282 deletions.
14 changes: 8 additions & 6 deletions src/chonkie/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,12 @@

from .chunker import (
BaseChunker,
Chunk,
SDPMChunker,
SemanticChunk,
SemanticChunker,
SemanticSentence,
Sentence,
SentenceChunk,
SentenceChunker,
TokenChunker,
WordChunker,
)
from .context import Context
from .embeddings import (
AutoEmbeddings,
BaseEmbeddings,
Expand All @@ -25,6 +19,14 @@
BaseRefinery,
OverlapRefinery,
)
from .types import (
Chunk,
Context,
SemanticChunk,
SemanticSentence,
Sentence,
SentenceChunk,
)

__version__ = "0.2.1.post1"
__name__ = "chonkie"
Expand Down
13 changes: 5 additions & 8 deletions src/chonkie/chunker/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
from .base import BaseChunker, Chunk
"""Module for chunkers."""

from .base import BaseChunker
from .sdpm import SDPMChunker
from .semantic import SemanticChunk, SemanticChunker, SemanticSentence
from .sentence import Sentence, SentenceChunk, SentenceChunker
from .semantic import SemanticChunker
from .sentence import SentenceChunker
from .token import TokenChunker
from .word import WordChunker

__all__ = [
"Chunk",
"BaseChunker",
"TokenChunker",
"WordChunker",
"Sentence",
"SentenceChunk",
"SentenceChunker",
"SemanticSentence",
"SemanticChunk",
"SemanticChunker",
"SDPMChunker",
]
66 changes: 2 additions & 64 deletions src/chonkie/chunker/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,72 +4,10 @@
import inspect
import warnings
from abc import ABC, abstractmethod
from dataclasses import dataclass
from multiprocessing import Pool, cpu_count
from typing import Any, Callable, List, Optional, Union
from typing import Any, Callable, List, Union

from chonkie.context import Context


@dataclass
class Chunk:
    """Dataclass representing a text chunk with metadata.

    Attributes:
        text: The text content of the chunk
        start_index: The starting index of the chunk in the original text
        end_index: The ending index of the chunk in the original text
        token_count: The number of tokens in the chunk
        context: The context of the chunk, useful for refinery classes

    """

    text: str
    start_index: int
    end_index: int
    token_count: int
    # Forward reference: Context is imported at module level from chonkie.context.
    context: Optional["Context"] = None

    def __str__(self) -> str:
        """Return the chunk's text content."""
        return self.text

    def __len__(self) -> int:
        """Return the length of the chunk text."""
        return len(self.text)

    def __repr__(self) -> str:
        """Return a debug representation of the chunk.

        The ``context`` field is included only when a context is attached.

        BUG FIX: the original condition was inverted — it omitted the context
        when one was present, and printed ``context=None`` when absent.
        """
        if self.context is None:
            return (
                f"Chunk(text={self.text}, start_index={self.start_index}, "
                f"end_index={self.end_index}, token_count={self.token_count})"
            )
        return (
            f"Chunk(text={self.text}, start_index={self.start_index}, "
            f"end_index={self.end_index}, token_count={self.token_count}, "
            f"context={self.context})"
        )

    def __iter__(self):
        """Return an iterator over the characters of the chunk text."""
        return iter(self.text)

    def __getitem__(self, index: int):
        """Return the character (or slice) of the chunk text at ``index``."""
        return self.text[index]

    def copy(self) -> "Chunk":
        """Return a copy of the chunk.

        BUG FIX: the original dropped ``context`` from the copy despite its
        docstring promising a deep copy; all fields are now carried over.
        """
        return Chunk(
            text=self.text,
            start_index=self.start_index,
            end_index=self.end_index,
            token_count=self.token_count,
            context=self.context,
        )
from chonkie.types import Chunk


class BaseChunker(ABC):
Expand Down
9 changes: 6 additions & 3 deletions src/chonkie/chunker/sdpm.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
"""Semantic Double Pass Merge chunking using sentence embeddings."""

from typing import Any, List, Union

from .semantic import SemanticChunk, SemanticChunker, Sentence
from chonkie.types import SemanticChunk, Sentence

from .semantic import SemanticChunker


class SDPMChunker(SemanticChunker):
Expand All @@ -23,7 +26,7 @@ class SDPMChunker(SemanticChunker):
Methods:
chunk: Split text into chunks using the SDPM approach.
"""

def __init__(
Expand Down Expand Up @@ -133,7 +136,7 @@ def chunk(self, text: str) -> List[SemanticChunk]:
sentences = self._prepare_sentences(text)
if len(sentences) <= self.min_sentences:
return [self._create_chunk(sentences)]

# Calculate similarity threshold
self.similarity_threshold = self._calculate_similarity_threshold(sentences)

Expand Down
Loading

0 comments on commit 98af028

Please sign in to comment.