Merge 41c034a96f into 242eeb537f

2 weeks ago · 0ab3d07659
parent 242eeb537f 41c034a96f
commit 0ab3d07659
1 changed file with 7 additions and 4 deletions
--- a/libs/text-splitters/langchain_text_splitters/base.py
+++ b/libs/text-splitters/langchain_text_splitters/base.py
@@ -73,13 +73,16 @@ class TextSplitter(BaseDocumentTransformer, ABC):
        _metadatas = metadatas or [{}] * len(texts)
        documents = []
        for i, text in enumerate(texts):
-            index = 0
+            index = -1
            previous_chunk_len = 0
-            for chunk in self.split_text(text):
+            for j, chunk in enumerate(self.split_text(text)):
                metadata = copy.deepcopy(_metadatas[i])
                if self._add_start_index:
-                    offset = index + previous_chunk_len - self._chunk_overlap
-                    index = text.find(chunk, max(0, offset))
+                    if j > 0:
+                        minimum_index_offset = max(0, previous_chunk_len - self._chunk_overlap, previous_chunk_len - len(chunk))
+                    else:
+                        minimum_index_offset = 1
+                    index = text.find(chunk, index + minimum_index_offset)
                    metadata["start_index"] = index
                    previous_chunk_len = len(chunk)
                new_doc = Document(page_content=chunk, metadata=metadata)