Merge 2bb2c2ea0c into 242eeb537f

2 weeks ago · f3c1b8157f
parent 242eeb537f 2bb2c2ea0c
commit f3c1b8157f
5 changed files with 727 additions and 6 deletions
--- a/docs/docs/integrations/vectorstores/qdrant.ipynb
+++ b/docs/docs/integrations/vectorstores/qdrant.ipynb
@ -307,6 +307,39 @@
    ")"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a8360f3f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"<---qdrant url here --->\"\n",
+    "qdrant = Qdrant.from_texts(\n",
+    "    texts=[\"abc\", \"def\"],\n",
+    "    embedding=embeddings,\n",
+    "    metadatas=[{'page_number': 1}, {'page_number': 2}],\n",
+    "    ids=[1,2], #integer ids\n",
+    "    url=url,\n",
+    "    prefer_grpc=True,\n",
+    "    collection_name=\"my_documents\",\n",
+    "    force_recreate=True,\n",
+    ")\n",
+    "\n",
+    "# OR\n",
+    "\n",
+    "qdrant = Qdrant.from_texts(\n",
+    "    texts=[\"abc\", \"def\"],\n",
+    "    embedding=embeddings,\n",
+    "    metadatas=[{'page_number': 1}, {'page_number': 2}],\n",
+    "    ids=['fa38d572-4c31-4579-aedc-1960d79df6df','cdc1aa36-d6ab-4fb2-8a94-56674fd27484'], #string based UUID ids\n",
+    "    url=url,\n",
+    "    prefer_grpc=True,\n",
+    "    collection_name=\"my_documents\",\n",
+    "    force_recreate=True,\n",
+    ")"
+   ]
+  },
  {
   "attachments": {},
   "cell_type": "markdown",
@ -537,6 +570,37 @@
    "    print(f\"{i + 1}.\", doc.page_content, \"\\n\")"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "id": "91b3f6a5",
+   "metadata": {},
+   "source": [
+    "## Chunk Window Retrieval with Similarity search\n",
+    "\n",
+    "Based on the notion of sentence window retrieval. The idea is to get the context that surrounds the results of a similarity search, i.e. fetch additional documents (both before and after) for each of the results returned by the similarity search. This way, we provide more supporting information for each document and pass that on to the LLM for better decision making. This also helps alleviate some issues around chunking and the disproportionate splitting of information that should ideally be looked at together, in a more holistic way."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8128c09f",
+   "metadata": {},
+   "source": [
+    "### Caveat\n",
+    "\n",
+    "To use chunk window retrieval, the prerequisite is that the documents are stored with sequential integer ids, e.g. by passing `ids` to `Qdrant.from_texts`. Please refer to the 'Recreating the collection' section above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4fec30f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"What did the president say about Ketanji Brown Jackson\"\n",
+    "found_docs = qdrant.chunk_window_retrieval_similarity_search(query, k=3, window_size=2)"
+   ]
+  },
  {
   "attachments": {},
   "cell_type": "markdown",
--- a/libs/community/langchain_community/vectorstores/qdrant.py
+++ b/libs/community/langchain_community/vectorstores/qdrant.py
@ -168,7 +168,7 @@ class Qdrant(VectorStore):
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
-        ids: Optional[Sequence[str]] = None,
+        ids: Optional[Union[Sequence[str], Sequence[int]]] = None,
        batch_size: int = 64,
        **kwargs: Any,
    ) -> List[str]:
@ -203,7 +203,7 @@ class Qdrant(VectorStore):
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
-        ids: Optional[Sequence[str]] = None,
+        ids: Optional[Union[Sequence[str], Sequence[int]]] = None,
        batch_size: int = 64,
        **kwargs: Any,
    ) -> List[str]:
@ -1147,6 +1147,388 @@ class Qdrant(VectorStore):
            for i in mmr_selected
        ]

+    def chunk_window_retrieval_similarity_search(
+        self,
+        query: str,
+        k: int = 4,
+        window_size: int = 2,
+        filter: Optional[MetadataFilter] = None,
+        search_params: Optional[common_types.SearchParams] = None,
+        offset: int = 0,
+        score_threshold: Optional[float] = None,
+        consistency: Optional[common_types.ReadConsistency] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Similarity search that also returns the chunks around each hit.
+
+        The query text is embedded and the work is delegated to
+        ``chunk_window_retrieval_similarity_search_by_vector``. Retrieval is a
+        two-step process: a regular similarity search fetches ``k`` documents,
+        then the documents stored directly before and after each of them are
+        fetched as extra context. ``window_size`` controls how many neighbours
+        are fetched on each side.
+
+        For this to work as expected the point ids must be integers that were
+        assigned in sequence when the texts were added (e.g. through
+        ``Qdrant.add_texts``).
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of documents fetched by the similarity search step.
+                Defaults to 4.
+            window_size:
+                Number of additional documents to fetch on each side (before
+                and after) of every similar document. Defaults to 2.
+            filter: Filter by metadata. Defaults to None.
+            search_params: Additional search params.
+            offset:
+                Offset of the first result to return.
+                May be used to paginate results.
+                Note: large offset values may cause performance issues.
+            score_threshold:
+                Minimal score threshold for the similarity search results.
+                If defined, less similar results will not be returned.
+                The score of a returned result might be higher or smaller than
+                the threshold depending on the Distance function used.
+                E.g. for cosine similarity only higher scores will be returned.
+            consistency:
+                Read consistency of the search. Defines how many replicas
+                should be queried before returning the result.
+                Values:
+                - int - number of replicas to query, values should be present
+                        in all queried replicas
+                - 'majority' - query all replicas, but return values present
+                            in the majority of replicas
+                - 'quorum' - query the majority of replicas, return values
+                            present in all of them
+                - 'all' - query all replicas, and return values present in all
+                            replicas
+            **kwargs:
+                Any other named arguments to pass through to
+                QdrantClient.search()
+
+        Returns:
+            The documents most similar to the query together with the extra
+            context documents that surround them.
+        """
+        return self.chunk_window_retrieval_similarity_search_by_vector(
+            self._embed_query(query),
+            k=k,
+            window_size=window_size,
+            filter=filter,
+            search_params=search_params,
+            offset=offset,
+            score_threshold=score_threshold,
+            consistency=consistency,
+            **kwargs,
+        )
+
+    @sync_call_fallback
+    async def achunk_window_retrieval_similarity_search(
+        self,
+        query: str,
+        k: int = 4,
+        window_size: int = 2,
+        filter: Optional[MetadataFilter] = None,
+        search_params: Optional[common_types.SearchParams] = None,
+        offset: int = 0,
+        score_threshold: Optional[float] = None,
+        consistency: Optional[common_types.ReadConsistency] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Async similarity search that also returns the chunks around each hit.
+
+        The query text is embedded with ``_aembed_query`` and the work is
+        delegated to ``achunk_window_retrieval_similarity_search_by_vector``.
+        Retrieval is a two-step process: a regular similarity search fetches
+        ``k`` documents, then the documents stored directly before and after
+        each of them are fetched as extra context. ``window_size`` controls how
+        many neighbours are fetched on each side.
+
+        For this to work as expected the point ids must be integers that were
+        assigned in sequence when the texts were added (e.g. through
+        ``Qdrant.add_texts``).
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of documents fetched by the similarity search step.
+                Defaults to 4.
+            window_size:
+                Number of additional documents to fetch on each side (before
+                and after) of every similar document. Defaults to 2.
+            filter: Filter by metadata. Defaults to None.
+            search_params: Additional search params.
+            offset:
+                Offset of the first result to return.
+                May be used to paginate results.
+                Note: large offset values may cause performance issues.
+            score_threshold:
+                Minimal score threshold for the similarity search results.
+                If defined, less similar results will not be returned.
+                The score of a returned result might be higher or smaller than
+                the threshold depending on the Distance function used.
+                E.g. for cosine similarity only higher scores will be returned.
+            consistency:
+                Read consistency of the search. Defines how many replicas
+                should be queried before returning the result.
+                Values:
+                - int - number of replicas to query, values should be present
+                        in all queried replicas
+                - 'majority' - query all replicas, but return values present
+                            in the majority of replicas
+                - 'quorum' - query the majority of replicas, return values
+                            present in all of them
+                - 'all' - query all replicas, and return values present in all
+                            replicas
+            **kwargs:
+                Any other named arguments to pass through to
+                AsyncQdrantClient.search()
+
+        Returns:
+            The documents most similar to the query together with the extra
+            context documents that surround them.
+        """
+        embedding = await self._aembed_query(query)
+        return await self.achunk_window_retrieval_similarity_search_by_vector(
+            embedding,
+            k,
+            window_size=window_size,
+            filter=filter,
+            search_params=search_params,
+            offset=offset,
+            score_threshold=score_threshold,
+            consistency=consistency,
+            # Forward the extra options unpacked, exactly like the sync variant.
+            # Passing ``kwargs=kwargs`` would hand them over nested under a
+            # single "kwargs" key instead of as individual search arguments.
+            **kwargs,
+        )
+
+    def chunk_window_retrieval_similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        window_size: int = 2,
+        filter: Optional[MetadataFilter] = None,
+        search_params: Optional[common_types.SearchParams] = None,
+        offset: int = 0,
+        score_threshold: Optional[float] = None,
+        consistency: Optional[common_types.ReadConsistency] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Similarity search by vector that also returns the chunks around each hit.
+
+        Retrieval happens in two steps. First a regular similarity search
+        fetches up to ``k`` points. Then, for every hit, the points whose ids
+        lie within ``window_size`` of the hit id (before and after) are
+        retrieved by id. Hits and neighbours are merged, each point appearing
+        once, and returned ordered by point id (not by score).
+
+        This only yields meaningful context when the point ids are integers
+        that were assigned in sequence when the texts were added (e.g. through
+        ``Qdrant.add_texts``).
+
+        Args:
+            embedding: Embedding vector to look up documents similar to.
+            k: Number of documents fetched by the similarity search step.
+                The returned list can hold up to ``k * (2 * window_size + 1)``
+                documents. Defaults to 4.
+            window_size:
+                Number of additional documents to fetch on each side (before
+                and after) of every similar document. Defaults to 2.
+            filter: Filter by metadata. Only passed to the similarity search;
+                the neighbouring documents are fetched by id. Defaults to None.
+            search_params: Additional search params.
+            offset:
+                Offset of the first result to return.
+                May be used to paginate results.
+                Note: large offset values may cause performance issues.
+            score_threshold:
+                Minimal score threshold for the similarity search results.
+                If defined, less similar results will not be returned.
+                The score of a returned result might be higher or smaller than
+                the threshold depending on the Distance function used.
+                E.g. for cosine similarity only higher scores will be returned.
+            consistency:
+                Read consistency of the search. Defines how many replicas
+                should be queried before returning the result.
+                Values:
+                - int - number of replicas to query, values should be present
+                        in all queried replicas
+                - 'majority' - query all replicas, but return values present
+                            in the majority of replicas
+                - 'quorum' - query the majority of replicas, return values
+                            present in all of them
+                - 'all' - query all replicas, and return values present in all
+                            replicas
+            **kwargs:
+                Any other named arguments to pass through to
+                QdrantClient.search()
+
+        Returns:
+            The documents most similar to the query together with the extra
+            context documents that surround them, ordered by point id.
+
+        Raises:
+            QdrantException: If a similarity search hit has a point id that is
+                not an integer (e.g. a UUID string).
+        """
+        query_vector = embedding
+        if self.vector_name is not None:
+            query_vector = (self.vector_name, embedding)  # type: ignore[assignment]
+
+        results = self.client.search(
+            collection_name=self.collection_name,
+            query_vector=query_vector,
+            query_filter=filter,
+            search_params=search_params,
+            limit=k,
+            offset=offset,
+            with_payload=True,
+            with_vectors=False,  # Langchain does not expect vectors to be returned
+            score_threshold=score_threshold,
+            consistency=consistency,
+            **kwargs,
+        )
+
+        hit_ids: List[int] = []
+        for point in results:
+            # Neighbours are located by id arithmetic, which only works for
+            # integer ids and not for UUID strings.
+            if not isinstance(point.id, int):
+                raise QdrantException(
+                    "Chunk Window Retrieval needs PointId to be integer. "
+                    f"PointId found was '{point.id}'"
+                )
+            hit_ids.append(point.id)
+
+        # Collect every id inside each hit's window exactly once. Clamping the
+        # lower bound at 0 avoids requesting negative ids (integer point ids
+        # are unsigned), and removing the hit ids avoids fetching a point
+        # again when it is both a hit and a neighbour of another hit.
+        window_point_ids = sorted(
+            {
+                neighbour_id
+                for hit_id in hit_ids
+                for neighbour_id in range(
+                    max(hit_id - window_size, 0), hit_id + window_size + 1
+                )
+            }
+            - set(hit_ids)
+        )
+
+        window_points: List[Any] = []
+        if window_point_ids:
+            # Second round trip: fetch the neighbours by id. Ids that do not
+            # exist in the collection are simply absent from the response.
+            window_points = self.client.retrieve(
+                collection_name=self.collection_name,
+                ids=window_point_ids,
+                with_payload=True,
+                with_vectors=False,
+            )
+
+        # Order by id so that every hit sits between its neighbouring chunks.
+        ordered_points = sorted(
+            [*results, *window_points], key=lambda point: point.id
+        )
+        return [
+            self._document_from_scored_point(
+                point,
+                self.collection_name,
+                self.content_payload_key,
+                self.metadata_payload_key,
+            )
+            for point in ordered_points
+        ]
+
+    @sync_call_fallback
+    async def achunk_window_retrieval_similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        window_size: int = 2,
+        filter: Optional[MetadataFilter] = None,
+        search_params: Optional[common_types.SearchParams] = None,
+        offset: int = 0,
+        score_threshold: Optional[float] = None,
+        consistency: Optional[common_types.ReadConsistency] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Async similarity search by vector that also returns surrounding chunks.
+
+        Retrieval happens in two steps. First a regular similarity search
+        fetches up to ``k`` points. Then, for every hit, the points whose ids
+        lie within ``window_size`` of the hit id (before and after) are
+        retrieved by id. Hits and neighbours are merged, each point appearing
+        once, and returned ordered by point id (not by score).
+
+        This only yields meaningful context when the point ids are integers
+        that were assigned in sequence when the texts were added (e.g. through
+        ``Qdrant.add_texts``).
+
+        Args:
+            embedding: Embedding vector to look up documents similar to.
+            k: Number of documents fetched by the similarity search step.
+                The returned list can hold up to ``k * (2 * window_size + 1)``
+                documents. Defaults to 4.
+            window_size:
+                Number of additional documents to fetch on each side (before
+                and after) of every similar document. Defaults to 2.
+            filter: Filter by metadata. Only passed to the similarity search;
+                the neighbouring documents are fetched by id. Defaults to None.
+            search_params: Additional search params.
+            offset:
+                Offset of the first result to return.
+                May be used to paginate results.
+                Note: large offset values may cause performance issues.
+            score_threshold:
+                Minimal score threshold for the similarity search results.
+                If defined, less similar results will not be returned.
+                The score of a returned result might be higher or smaller than
+                the threshold depending on the Distance function used.
+                E.g. for cosine similarity only higher scores will be returned.
+            consistency:
+                Read consistency of the search. Defines how many replicas
+                should be queried before returning the result.
+                Values:
+                - int - number of replicas to query, values should be present
+                        in all queried replicas
+                - 'majority' - query all replicas, but return values present
+                            in the majority of replicas
+                - 'quorum' - query the majority of replicas, return values
+                            present in all of them
+                - 'all' - query all replicas, and return values present in all
+                            replicas
+            **kwargs:
+                Any other named arguments to pass through to
+                AsyncQdrantClient.search()
+
+        Returns:
+            The documents most similar to the query together with the extra
+            context documents that surround them, ordered by point id.
+
+        Raises:
+            NotImplementedError: If no async client is configured or the async
+                client is backed by ``AsyncQdrantLocal``.
+                NOTE(review): presumably ``sync_call_fallback`` intercepts this
+                and runs the sync variant instead - confirm against its
+                definition.
+            QdrantException: If a similarity search hit has a point id that is
+                not an integer (e.g. a UUID string).
+        """
+        from qdrant_client.local.async_qdrant_local import AsyncQdrantLocal
+
+        if self.async_client is None or isinstance(
+            self.async_client._client, AsyncQdrantLocal
+        ):
+            raise NotImplementedError(
+                "QdrantLocal cannot interoperate with sync and async clients"
+            )
+        query_vector = embedding
+        if self.vector_name is not None:
+            query_vector = (self.vector_name, embedding)  # type: ignore[assignment]
+
+        results = await self.async_client.search(
+            collection_name=self.collection_name,
+            query_vector=query_vector,
+            query_filter=filter,
+            search_params=search_params,
+            limit=k,
+            offset=offset,
+            with_payload=True,
+            with_vectors=False,  # Langchain does not expect vectors to be returned
+            score_threshold=score_threshold,
+            consistency=consistency,
+            **kwargs,
+        )
+
+        hit_ids: List[int] = []
+        for point in results:
+            # Neighbours are located by id arithmetic, which only works for
+            # integer ids and not for UUID strings.
+            if not isinstance(point.id, int):
+                raise QdrantException(
+                    "Chunk Window Retrieval needs PointId to be integer. "
+                    f"PointId found was '{point.id}'"
+                )
+            hit_ids.append(point.id)
+
+        # Collect every id inside each hit's window exactly once. Clamping the
+        # lower bound at 0 avoids requesting negative ids (integer point ids
+        # are unsigned), and removing the hit ids avoids fetching a point
+        # again when it is both a hit and a neighbour of another hit.
+        window_point_ids = sorted(
+            {
+                neighbour_id
+                for hit_id in hit_ids
+                for neighbour_id in range(
+                    max(hit_id - window_size, 0), hit_id + window_size + 1
+                )
+            }
+            - set(hit_ids)
+        )
+
+        window_points: List[Any] = []
+        if window_point_ids:
+            # Second round trip: fetch the neighbours by id. Ids that do not
+            # exist in the collection are simply absent from the response.
+            window_points = await self.async_client.retrieve(
+                collection_name=self.collection_name,
+                ids=window_point_ids,
+                with_payload=True,
+                with_vectors=False,
+            )
+
+        # Order by id so that every hit sits between its neighbouring chunks.
+        ordered_points = sorted(
+            [*results, *window_points], key=lambda point: point.id
+        )
+        return [
+            self._document_from_scored_point(
+                point,
+                self.collection_name,
+                self.content_payload_key,
+                self.metadata_payload_key,
+            )
+            for point in ordered_points
+        ]
+
    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete by vector ID or other criteria.

@ -1202,7 +1584,7 @@ class Qdrant(VectorStore):
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
-        ids: Optional[Sequence[str]] = None,
+        ids: Optional[Union[Sequence[str], Sequence[int]]] = None,
        location: Optional[str] = None,
        url: Optional[str] = None,
        port: Optional[int] = 6333,
@ -1423,7 +1805,7 @@ class Qdrant(VectorStore):
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
-        ids: Optional[Sequence[str]] = None,
+        ids: Optional[Union[Sequence[str], Sequence[int]]] = None,
        location: Optional[str] = None,
        url: Optional[str] = None,
        port: Optional[int] = 6333,
@ -2150,7 +2532,7 @@ class Qdrant(VectorStore):
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
-        ids: Optional[Sequence[str]] = None,
+        ids: Optional[Union[Sequence[str], Sequence[int]]] = None,
        batch_size: int = 64,
    ) -> Generator[Tuple[List[str], List[rest.PointStruct]], None, None]:
        from qdrant_client.http import models as rest
@ -2192,7 +2574,7 @@ class Qdrant(VectorStore):
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
-        ids: Optional[Sequence[str]] = None,
+        ids: Optional[Union[Sequence[str], Sequence[int]]] = None,
        batch_size: int = 64,
    ) -> AsyncGenerator[Tuple[List[str], List[rest.PointStruct]], None]:
        from qdrant_client.http import models as rest
--- a/libs/community/tests/integration_tests/vectorstores/qdrant/test_add_texts.py
+++ b/libs/community/tests/integration_tests/vectorstores/qdrant/test_add_texts.py
@ -104,6 +104,35 @@ def test_qdrant_add_texts_stores_ids(batch_size: int) -> None:
    assert set(ids) == set(stored_ids)


+@pytest.mark.parametrize("batch_size", [1, 64])
+def test_qdrant_add_texts_stores_int_ids(batch_size: int) -> None:
+    """Qdrant.add_texts must keep caller-supplied integer ids as integers."""
+    from qdrant_client import QdrantClient
+    from qdrant_client.http import models as rest
+
+    expected_ids = [1, 2]
+
+    # Fresh in-memory collection sized for the fake 10-dimensional embeddings.
+    collection_name = uuid.uuid4().hex
+    client = QdrantClient(":memory:")
+    client.recreate_collection(
+        collection_name,
+        vectors_config=rest.VectorParams(size=10, distance=rest.Distance.COSINE),
+    )
+    store = Qdrant(client, collection_name, ConsistentFakeEmbeddings())
+
+    returned_ids = store.add_texts(
+        ["abc", "def"], ids=expected_ids, batch_size=batch_size
+    )
+
+    # The ids handed back must match the ones passed in, position by position.
+    assert all(
+        given == returned for given, returned in zip(expected_ids, returned_ids)
+    )
+    assert 2 == client.count(collection_name).count
+
+    # The ids read back from storage must still be ints, not stringified.
+    points, _ = client.scroll(collection_name)
+    stored_ids = [point.id for point in points]
+    assert all(isinstance(stored_id, int) for stored_id in stored_ids)
+    assert set(expected_ids) == set(stored_ids)
+
+
@pytest.mark.parametrize("vector_name", ["custom-vector"])
 def test_qdrant_add_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None:
    """Test end to end Qdrant.add_texts stores named vectors if name is provided."""
--- a/libs/community/tests/integration_tests/vectorstores/qdrant/test_chunk_window_retrieval_search.py
+++ b/libs/community/tests/integration_tests/vectorstores/qdrant/test_chunk_window_retrieval_search.py
@ -0,0 +1,213 @@
+from typing import Optional
+
+import pytest
+from langchain_core.documents import Document
+
+from langchain_community.vectorstores import Qdrant
+from tests.integration_tests.vectorstores.fake_embeddings import (
+    ConsistentFakeEmbeddings,
+)
+from tests.integration_tests.vectorstores.qdrant.common import assert_documents_equals
+
+
+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
+@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
+@pytest.mark.parametrize("vector_name", [None, "my-vector"])
+def test_qdrant_chunk_window_search(
+    batch_size: int,
+    content_payload_key: str,
+    metadata_payload_key: str,
+    vector_name: Optional[str],
+) -> None:
+    """Test end to end construction and chunk window search by query text."""
+    texts = [
+        "Dogs are known for their loyalty and companionship",
+        "Dogs have temperaments ranging from energetic and playful to calm and gentle",
+        "Dogs are highly intelligent animals",
+        "Cars come in various shapes, sizes, and colors",
+        "Drivers often customize cars to reflect their personality",
+        "Cars enable mobility, facilitating travel and exploration",
+    ]
+    # Point ids 1..6 map to texts[0]..texts[5].
+    docsearch = Qdrant.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        ids=[1, 2, 3, 4, 5, 6],
+        location=":memory:",
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        batch_size=batch_size,
+        vector_name=vector_name,
+    )
+
+    # The hit is id 2, so a window of 1 adds ids 1 and 3. The expected content
+    # is taken from ``texts``: a backslash continuation inside a string literal
+    # would embed the next line's indentation in the expected text.
+    output = docsearch.chunk_window_retrieval_similarity_search(
+        texts[1],
+        k=1,
+        window_size=1,
+    )
+    assert_documents_equals(
+        actual=output,
+        expected=[Document(page_content=text) for text in texts[0:3]],
+    )
+
+    # below assert highlights that even though the chunks are not semantically
+    # similar, they will still be returned based on window_size
+    # we are not reranking the chunks in this approach.
+    output = docsearch.chunk_window_retrieval_similarity_search(
+        texts[2], k=1, window_size=1
+    )
+    assert_documents_equals(
+        actual=output,
+        expected=[Document(page_content=text) for text in texts[1:4]],
+    )
+
+
+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("content_payload_key", [Qdrant.CONTENT_KEY, "foo"])
+@pytest.mark.parametrize("metadata_payload_key", [Qdrant.METADATA_KEY, "bar"])
+@pytest.mark.parametrize("vector_name", [None, "my-vector"])
+def test_qdrant_chunk_window_search_by_vector(
+    batch_size: int,
+    content_payload_key: str,
+    metadata_payload_key: str,
+    vector_name: Optional[str],
+) -> None:
+    """Test end to end construction and chunk window search by vector."""
+    texts = [
+        "Dogs are known for their loyalty and companionship",
+        "Dogs have temperaments ranging from energetic and playful to calm and gentle",
+        "Dogs are highly intelligent animals",
+        "Cars come in various shapes, sizes, and colors",
+        "Drivers often customize cars to reflect their personality",
+        "Cars enable mobility, facilitating travel and exploration",
+    ]
+    embeddings = ConsistentFakeEmbeddings()
+    # Point ids 1..6 map to texts[0]..texts[5].
+    docsearch = Qdrant.from_texts(
+        texts,
+        embeddings,
+        ids=[1, 2, 3, 4, 5, 6],
+        location=":memory:",
+        content_payload_key=content_payload_key,
+        metadata_payload_key=metadata_payload_key,
+        batch_size=batch_size,
+        vector_name=vector_name,
+    )
+
+    # The hit is id 2, so a window of 1 adds ids 1 and 3. The expected content
+    # is taken from ``texts``: a backslash continuation inside a string literal
+    # would embed the next line's indentation in the expected text.
+    embed_query = embeddings.embed_query(texts[1])
+    output = docsearch.chunk_window_retrieval_similarity_search_by_vector(
+        embed_query, k=1, window_size=1
+    )
+    assert_documents_equals(
+        actual=output,
+        expected=[Document(page_content=text) for text in texts[0:3]],
+    )
+
+    # below assert highlights that even though the chunks are not semantically
+    # similar, they will still be returned based on window_size
+    # we are not reranking the chunks in this approach.
+    embed_query = embeddings.embed_query(texts[2])
+    output = docsearch.chunk_window_retrieval_similarity_search_by_vector(
+        embed_query, k=1, window_size=1
+    )
+    assert_documents_equals(
+        actual=output,
+        expected=[Document(page_content=text) for text in texts[1:4]],
+    )
+
+
+@pytest.mark.parametrize("vector_name", [None, "my-vector"])
+def test_qdrant_chunk_window_search_filters_with_qdrant_filters(
+    vector_name: Optional[str],
+) -> None:
+    """Test chunk window search when the similarity step uses a Qdrant filter."""
+    from qdrant_client.http import models as rest
+
+    texts = [
+        "Dogs are known for their loyalty and companionship",
+        "Dogs have temperaments ranging from energetic and playful to calm and gentle",
+        "Dogs are highly intelligent animals",
+        "Cars come in various shapes, sizes, and colors",
+        "Drivers often customize cars to reflect their personality",
+        "Cars enable mobility, facilitating travel and exploration",
+    ]
+    metadatas = [
+        {"page": i, "details": {"page": i + 1, "pages": [i + 2, -1]}}
+        for i in range(len(texts))
+    ]
+    # Point ids 1..6 map to texts[0]..texts[5] (metadata pages 0..5).
+    docsearch = Qdrant.from_texts(
+        texts,
+        ConsistentFakeEmbeddings(),
+        metadatas=metadatas,
+        ids=[1, 2, 3, 4, 5, 6],
+        location=":memory:",
+        vector_name=vector_name,
+    )
+
+    # Only the document with page == 2 (texts[2], id 3) satisfies the filter.
+    qdrant_filter = rest.Filter(
+        must=[
+            rest.FieldCondition(
+                key="metadata.page",
+                match=rest.MatchValue(value=2),
+            ),
+            rest.FieldCondition(
+                key="metadata.details.page",
+                match=rest.MatchValue(value=3),
+            ),
+            rest.FieldCondition(
+                key="metadata.details.pages",
+                match=rest.MatchAny(any=[4]),
+            ),
+        ]
+    )
+    output = docsearch.chunk_window_retrieval_similarity_search(
+        texts[1],
+        k=1,
+        window_size=1,
+        filter=qdrant_filter,
+    )
+    # The filter forces the hit to id 3; its neighbours (ids 2 and 4) are
+    # expected even though they do not match the filter. The expected content
+    # is taken from ``texts``: a backslash continuation inside a string literal
+    # would embed the next line's indentation in the expected text.
+    assert_documents_equals(
+        actual=output,
+        expected=[
+            Document(
+                page_content=texts[1],
+                metadata={"page": 1, "details": {"page": 2, "pages": [3, -1]}},
+            ),
+            Document(
+                page_content=texts[2],
+                metadata={"page": 2, "details": {"page": 3, "pages": [4, -1]}},
+            ),
+            Document(
+                page_content=texts[3],
+                metadata={"page": 3, "details": {"page": 4, "pages": [5, -1]}},
+            ),
+        ],
+    )
--- a/libs/community/tests/integration_tests/vectorstores/qdrant/test_from_texts.py
+++ b/libs/community/tests/integration_tests/vectorstores/qdrant/test_from_texts.py
@ -66,6 +66,39 @@ def test_qdrant_from_texts_stores_ids(
        assert set(ids) == set(stored_ids)


+@pytest.mark.parametrize("batch_size", [1, 64])
+@pytest.mark.parametrize("vector_name", [None, "my-vector"])
+def test_qdrant_from_texts_stores_int_ids(
+    batch_size: int, vector_name: Optional[str]
+) -> None:
+    """Qdrant.from_texts must keep caller-supplied integer ids as integers."""
+    from qdrant_client import QdrantClient
+
+    collection_name = uuid.uuid4().hex
+    with tempfile.TemporaryDirectory() as tmpdir:
+        storage_path = str(tmpdir)
+        expected_ids = [1, 2]
+        store = Qdrant.from_texts(
+            ["abc", "def"],
+            ConsistentFakeEmbeddings(),
+            ids=expected_ids,
+            collection_name=collection_name,
+            path=storage_path,
+            batch_size=batch_size,
+            vector_name=vector_name,
+        )
+        # Drop the store before a second client opens the same directory.
+        del store
+
+        client = QdrantClient(path=storage_path)
+        assert 2 == client.count(collection_name).count
+
+        # The ids read back from storage must still be ints, not stringified.
+        points, _ = client.scroll(collection_name)
+        stored_ids = [point.id for point in points]
+        assert all(isinstance(stored_id, int) for stored_id in stored_ids)
+        assert set(expected_ids) == set(stored_ids)
+
+
@pytest.mark.parametrize("vector_name", ["custom-vector"])
 def test_qdrant_from_texts_stores_embeddings_as_named_vectors(vector_name: str) -> None:
    """Test end to end Qdrant.from_texts stores named vectors if name is provided."""