From b62d5570235032f6d39b04c77a33fe3b87e657ee Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:13:08 +0530
Subject: [PATCH 1/4] [Fix] indexing logic for TokenChunker for fn chunk

---
 src/chonkie/chunker/token.py | 42 +++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index df889a9..78f8691 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -48,6 +48,32 @@ def __init__(
             if isinstance(chunk_overlap, int)
             else int(chunk_overlap * chunk_size)
         )
+
+    def _create_chunks(
+        self,
+        chunk_texts: List[str],
+        token_counts: List[int],
+        decoded_text: str,
+    ) -> List[Chunk]:
+        """Create chunks from a list of texts."""
+        # package everything as Chunk objects and send out the result
+        chunks = []
+        current_index = 0
+        for chunk_text, token_count in zip(chunk_texts, token_counts):
+            start_index = decoded_text.find(
+                chunk_text, current_index
+            )  # Find needs to be run every single time because of unknown overlap length
+            end_index = start_index + len(chunk_text)
+            chunks.append(
+                Chunk(
+                    text=chunk_text,
+                    start_index=start_index,
+                    end_index=end_index,
+                    token_count=token_count,
+                )
+            )
+            current_index = end_index
+        return chunks
 
     def chunk(self, text: str) -> List[Chunk]:
         """Split text into overlapping chunks of specified token size.
@@ -85,21 +111,7 @@ def chunk(self, text: str) -> List[Chunk]:
             token_groups
         )  # decrease the time by decoding in one go (?)
 
-        # package everything as Chunk objects and send out the result
-        chunks = []
-        for chunk_text, token_count in zip(chunk_texts, token_counts):
-            start_index = decoded_text.find(
-                chunk_text
-            )  # Find needs to be run every single time because of unknown overlap length
-            end_index = start_index + len(chunk_text)
-            chunks.append(
-                Chunk(
-                    text=chunk_text,
-                    start_index=start_index,
-                    end_index=end_index,
-                    token_count=token_count,
-                )
-            )
+        chunks = self._process_batch(token_groups, token_counts, decoded_text)
 
         return chunks


From 23f72b0948ae2e3b14cf3aa7827eef2b14fce20e Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:15:48 +0530
Subject: [PATCH 2/4] Refactor TokenChunker to use _create_chunks method

- Updated the TokenChunker class to replace the leftover _process_batch call
  with the new _create_chunks method, fixing the call introduced in the
  previous commit.
- This change enhances the overall structure of the code and aligns with
  recent refactoring efforts in the chunking classes.
---
 src/chonkie/chunker/token.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/chonkie/chunker/token.py b/src/chonkie/chunker/token.py
index 78f8691..b6f9ee0 100644
--- a/src/chonkie/chunker/token.py
+++ b/src/chonkie/chunker/token.py
@@ -111,7 +111,7 @@ def chunk(self, text: str) -> List[Chunk]:
             token_groups
         )  # decrease the time by decoding in one go (?)
 
-        chunks = self._process_batch(token_groups, token_counts, decoded_text)
+        chunks = self._create_chunks(chunk_texts, token_counts, decoded_text)
 
         return chunks
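
The bug the first two patches address is easiest to see in isolation. Below is a minimal, self-contained sketch, not chonkie code: the text and chunk strings are hypothetical, and the chunks are contiguous rather than overlapping for simplicity. Without a start offset, str.find pins every repeated chunk text to its first occurrence; carrying a search floor forward resolves each chunk to its own position.

    # Standalone sketch of the indexing bug (hypothetical text and chunks;
    # only str.find semantics matter here).
    text = "spam spam spam spam"
    chunk_texts = ["spam spam", "spam spam"]  # identical texts, different positions

    # Before the fix: every search restarts at index 0, so both chunks
    # resolve to the first occurrence.
    print([text.find(c) for c in chunk_texts])  # [0, 0]

    # After the fix: carry the previous chunk's end forward as a search floor.
    positions, current_index = [], 0
    for c in chunk_texts:
        start = text.find(c, current_index)
        positions.append(start)
        current_index = start + len(c)
    print(positions)  # [0, 10]

One caveat worth noting: when chunks genuinely overlap, the next chunk's true start precedes the previous chunk's end, so an end_index search floor can overshoot; the sketch stays on the contiguous, well-defined case.
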
From 5760031a26aa19e9d47d2062ebb2c3b32df3f13f Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:30:14 +0530
Subject: [PATCH 3/4] Refactor WordChunker to improve chunk creation logic

- Updated the _create_chunk method to accept current_index as a parameter
  for better control over the chunk's starting index.
- Adjusted the chunking loop to pass the new current_index parameter,
  improving the accuracy of chunk creation.
- This refactor improves code clarity and keeps WordChunker consistent with
  recent changes in the other chunking classes.
---
 src/chonkie/chunker/word.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/src/chonkie/chunker/word.py b/src/chonkie/chunker/word.py
index f60d2f8..8649853 100644
--- a/src/chonkie/chunker/word.py
+++ b/src/chonkie/chunker/word.py
@@ -51,21 +51,26 @@ def _split_into_words(self, text: str) -> List[str]:
         return words
 
     def _create_chunk(
-        self, words: List[str], text: str, token_count: int
+        self,
+        words: List[str],
+        text: str,
+        token_count: int,
+        current_index: int = 0,
     ) -> Tuple[Chunk, int]:
         """Create a chunk from a list of words.
 
         Args:
             words: List of words to create chunk from
-            start_idx: Starting index in original text
-            end_idx: Ending index in original text
+            text: The original text
+            token_count: Number of tokens in the chunk
+            current_index: The index in the text to start searching from
 
         Returns:
             Tuple of (Chunk object, number of tokens in chunk)
 
         """
         chunk_text = "".join(words)
-        start_index = text.find(chunk_text)
+        start_index = text.find(chunk_text, current_index)
         return Chunk(
             text=chunk_text,
             start_index=start_index,
@@ -110,19 +115,23 @@ def chunk(self, text: str) -> List[Chunk]:
 
         current_chunk = []
         current_chunk_length = 0
+        current_index = 0
+
         for i, (word, length) in enumerate(zip(words, lengths)):
             if current_chunk_length + length <= self.chunk_size:
                 current_chunk.append(word)
                 current_chunk_length += length
             else:
-                chunk = self._create_chunk(current_chunk, text, current_chunk_length)
+                chunk = self._create_chunk(
+                    current_chunk,
+                    text,
+                    current_chunk_length,
+                    current_index,
+                )
                 chunks.append(chunk)
 
-                # update the current_chunk and previous chunk
                 previous_chunk_length = current_chunk_length
-
-                current_chunk = []
-                current_chunk_length = 0
+                current_index = chunk.end_index
 
                 overlap = []
                 overlap_length = 0
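
The same pattern now threads through WordChunker. A minimal stand-alone sketch of the idea, assuming a hypothetical Chunk dataclass in place of chonkie's own and contiguous word groups for simplicity: each chunk's search begins where the previous chunk ended, and the invariant the fix establishes is that start_index and end_index slice the original text back to the chunk text.

    from dataclasses import dataclass

    @dataclass
    class Chunk:  # hypothetical stand-in for chonkie's Chunk
        text: str
        start_index: int
        end_index: int
        token_count: int

    def create_chunk(words, text, token_count, current_index=0):
        # Same idea as the patched WordChunker._create_chunk: start the search
        # at current_index so repeated chunk texts map to the right occurrence.
        chunk_text = "".join(words)
        start_index = text.find(chunk_text, current_index)
        return Chunk(chunk_text, start_index, start_index + len(chunk_text), token_count)

    text = "one two one two "
    groups = (["one ", "two "], ["one ", "two "])  # both join to "one two "

    chunks, current_index = [], 0
    for group in groups:
        chunk = create_chunk(group, text, token_count=len(group), current_index=current_index)
        chunks.append(chunk)
        current_index = chunk.end_index  # mirrors the patched loop above

    # Without the search floor both chunks would land at index 0; with it,
    # the indices always slice back to the chunk text.
    assert [c.start_index for c in chunks] == [0, 8]
    assert all(text[c.start_index:c.end_index] == c.text for c in chunks)
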
From 289cdc2155d40c12692e1b123673e8ce52078013 Mon Sep 17 00:00:00 2001
From: bhavnicksm
Date: Thu, 12 Dec 2024 03:30:22 +0530
Subject: [PATCH 4/4] Refactor SentenceChunker to improve sentence position
 calculation

- Removed the unnecessary space adjustment in the position calculation for
  sentences, as they are already separated by spaces.
- Commented out the duplicate _prepare_sentences definition, which shadowed
  the corrected one above it, to streamline the class and focus on the
  essential functionality.
- This change enhances code clarity and prepares for future improvements in
  sentence processing.
---
 src/chonkie/chunker/sentence.py | 58 ++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/chonkie/chunker/sentence.py b/src/chonkie/chunker/sentence.py
index 8c0b6d2..4936918 100644
--- a/src/chonkie/chunker/sentence.py
+++ b/src/chonkie/chunker/sentence.py
@@ -236,7 +236,7 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
         current_pos = 0
         for sent in sentence_texts:
             positions.append(current_pos)
-            current_pos += len(sent) + 1  # +1 for space/separator
+            current_pos += len(sent)  # No +1 space because sentences are already separated by spaces
 
         if not self.approximate:
             # Get accurate token counts in batch
@@ -253,34 +253,34 @@ def _prepare_sentences(self, text: str) -> List[Sentence]:
             for sent, pos, count in zip(sentence_texts, positions, token_counts)
         ]
 
-    def _prepare_sentences(self, text: str) -> List[Sentence]:
-        """Prepare sentences with either estimated or accurate token counts."""
-        # Split text into sentences
-        sentence_texts = self._split_sentences(text)
-        if not sentence_texts:
-            return []
-
-        # Calculate positions once
-        positions = []
-        current_pos = 0
-        for sent in sentence_texts:
-            positions.append(current_pos)
-            current_pos += len(sent) + 1  # +1 for space/separator
-
-        if not self.approximate:
-            # Get accurate token counts in batch
-            token_counts = self._get_token_counts(sentence_texts)
-        else:
-            # Estimate token counts using character length
-            token_counts = self._estimate_token_counts(sentence_texts)
-
-        # Create sentence objects
-        return [
-            Sentence(
-                text=sent, start_index=pos, end_index=pos + len(sent), token_count=count
-            )
-            for sent, pos, count in zip(sentence_texts, positions, token_counts)
-        ]
+    # def _prepare_sentences(self, text: str) -> List[Sentence]:
+    #     """Prepare sentences with either estimated or accurate token counts."""
+    #     # Split text into sentences
+    #     sentence_texts = self._split_sentences(text)
+    #     if not sentence_texts:
+    #         return []
+
+    #     # Calculate positions once
+    #     positions = []
+    #     current_pos = 0
+    #     for sent in sentence_texts:
+    #         positions.append(current_pos)
+    #         current_pos += len(sent) + 1  # +1 for space/separator
+
+    #     if not self.approximate:
+    #         # Get accurate token counts in batch
+    #         token_counts = self._get_token_counts(sentence_texts)
+    #     else:
+    #         # Estimate token counts using character length
+    #         token_counts = self._estimate_token_counts(sentence_texts)
+
+    #     # Create sentence objects
+    #     return [
+    #         Sentence(
+    #             text=sent, start_index=pos, end_index=pos + len(sent), token_count=count
+    #         )
+    #         for sent, pos, count in zip(sentence_texts, positions, token_counts)
+    #     ]
 
     def _create_chunk(self, sentences: List[Sentence], token_count: int) -> Chunk:
         """Create a chunk from a list of sentences.
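
The position fix in this last patch is easy to verify in isolation. A small sketch, under the assumption stated in the commit message that the split sentences already carry their separators (the sample text and splitter output below are hypothetical): the old +1 double-counts the separator and shifts every later start position by one per sentence.

    # Illustration of the off-by-one drift removed in patch 4. Assumption
    # (inferred from the commit message): _split_sentences returns sentence
    # strings that already include their trailing separator.
    text = "One. Two. Three."
    sentences = ["One. ", "Two. ", "Three."]  # hypothetical splitter output

    def positions(sents, extra):
        pos, out = 0, []
        for s in sents:
            out.append(pos)
            pos += len(s) + extra
        return out

    old = positions(sentences, extra=1)  # [0, 6, 12] -- drifts one per sentence
    new = positions(sentences, extra=0)  # [0, 5, 10]
    assert all(text[p:p + len(s)] == s for p, s in zip(new, sentences))
    assert not all(text[p:p + len(s)] == s for p, s in zip(old, sentences))
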