Ravi Maggon 2 weeks ago committed by GitHub
commit 0ab3d07659
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -73,13 +73,16 @@ class TextSplitter(BaseDocumentTransformer, ABC):
_metadatas = metadatas or [{}] * len(texts)
documents = []
for i, text in enumerate(texts):
index = 0
index = -1
previous_chunk_len = 0
for chunk in self.split_text(text):
for j, chunk in enumerate(self.split_text(text)):
metadata = copy.deepcopy(_metadatas[i])
if self._add_start_index:
offset = index + previous_chunk_len - self._chunk_overlap
index = text.find(chunk, max(0, offset))
if j > 0:
minimum_index_offset = max(0, previous_chunk_len - self._chunk_overlap, previous_chunk_len - len(chunk))
else:
minimum_index_offset = 1
index = text.find(chunk, index + minimum_index_offset)
metadata["start_index"] = index
previous_chunk_len = len(chunk)
new_doc = Document(page_content=chunk, metadata=metadata)

Loading…
Cancel
Save