Upgrade the AwaDB from 0.3.5 to 0.3.6 (#7363)

11 months ago · fb6e63dc36
parent c5edbea34a
commit fb6e63dc36
2 changed files with 814 additions and 72 deletions
--- a/langchain/vectorstores/awadb.py
+++ b/langchain/vectorstores/awadb.py
@ -3,11 +3,14 @@ from __future__ import annotations

 import logging
 import uuid
-from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type
+
+import numpy as np

 from langchain.docstore.document import Document
 from langchain.embeddings.base import Embeddings
 from langchain.vectorstores.base import VectorStore
+from langchain.vectorstores.utils import maximal_marginal_relevance

 # from pydantic import BaseModel, Field, root_validator

@ -30,9 +33,19 @@ class AwaDB(VectorStore):
        embedding: Optional[Embeddings] = None,
        log_and_data_dir: Optional[str] = None,
        client: Optional[awadb.Client] = None,
+        **kwargs: Any,
    ) -> None:
-        """Initialize with AwaDB client."""
+        """Initialize with AwaDB client.
+        Args:
+            table_name: Name of the table to use.
+            embedding: Optional embedding function.
+            log_and_data_dir: Optional directory for the logs and data.
+            client: Optional AwaDB client.
+            kwargs: any possible extend parameters in the future.

+        Returns:
+            None.
+        """
        try:
            import awadb
        except ImportError:
@ -71,7 +84,7 @@ class AwaDB(VectorStore):
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            is_duplicate_texts: Optional whether to duplicate texts.
-            kwargs: vectorstore specific parameters.
+            kwargs: any possible extend parameters in the future.

        Returns:
            List of ids from adding the texts into the vectorstore.
@ -99,6 +112,16 @@ class AwaDB(VectorStore):
        table_name: str,
        **kwargs: Any,
    ) -> bool:
+        """Load the local specified table.
+
+        Args:
+            table_name: Table name
+            kwargs: Any possible extend parameters in the future.
+
+        Returns:
+            Success or failure of loading the local specified table
+        """
+
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

@ -110,7 +133,17 @@ class AwaDB(VectorStore):
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
-        """Return docs most similar to query."""
+        """Return docs most similar to query.
+
+        Args:
+            query: Text query.
+            k: The maximum number of documents to return.
+            kwargs: Any possible extend parameters in the future.
+
+        Returns:
+            Returns the k most similar documents to the specified text query.
+        """
+
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

@ -123,7 +156,10 @@ class AwaDB(VectorStore):
            llm = llm_embedding.LLMEmbedding()
            embedding = llm.Embedding(query)

-        return self.similarity_search_by_vector(embedding, k)
+        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        return self.similarity_search_by_vector(
+            embedding, k, not_include_fields_in_metadata=not_include_fields
+        )

    def similarity_search_with_score(
        self,
@ -131,9 +167,16 @@ class AwaDB(VectorStore):
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
-        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+        """The most k similar documents and scores of the specified query.
+
+        Args:
+            query: Text query.
+            k: The k most similar documents to the text query.
+            kwargs: Any possible extend parameters in the future.

-        0 is dissimilar, 1 is most similar.
+        Returns:
+            The k most similar documents to the specified text query.
+            0 is dissimilar, 1 is the most similar.
        """

        if self.awadb_client is None:
@ -150,17 +193,18 @@ class AwaDB(VectorStore):

        results: List[Tuple[Document, float]] = []

-        scores: List[float] = []
-        retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
-
-        L2_Norm = 0.0
-        for score in scores:
-            L2_Norm = L2_Norm + score * score
+        dists: List[float] = []
+        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        retrieval_docs = self.similarity_search_by_vector(
+            embedding,
+            k,
+            scores=dists,
+            not_include_fields_in_metadata=not_include_fields,
+        )

-        L2_Norm = pow(L2_Norm, 0.5)
        doc_no = 0
        for doc in retrieval_docs:
-            doc_tuple = (doc, 1 - (scores[doc_no] / L2_Norm))
+            doc_tuple = (doc, dists[doc_no])
            results.append(doc_tuple)
            doc_no = doc_no + 1

@ -172,9 +216,17 @@ class AwaDB(VectorStore):
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
-        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+        """Return docs and relevance scores
+           which denote the InnerProduct distance, range from 0 to 1.
+
+        Args:
+            query: Text query.
+            k: Number of the most similar documents to return. Defaults to 4.

-        0 is dissimilar, 1 is most similar.
+        Returns:
+            List of (Document, relevance_score) tuples similar to the text query.
+            Note that relevance_score ranged from 0 to 1.
+            0 is dissimilar, 1 is the most similar.
        """

        if self.awadb_client is None:
@ -191,17 +243,18 @@ class AwaDB(VectorStore):
        if show_results.__len__() == 0:
            return results

-        scores: List[float] = []
-        retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)
-
-        L2_Norm = 0.0
-        for score in scores:
-            L2_Norm = L2_Norm + score * score
+        dists: List[float] = []
+        not_include_fields: Set[str] = {"text_embedding", "_id", "score"}
+        retrieval_docs = self.similarity_search_by_vector(
+            embedding,
+            k,
+            scores=dists,
+            not_include_fields_in_metadata=not_include_fields,
+        )

-        L2_Norm = pow(L2_Norm, 0.5)
        doc_no = 0
        for doc in retrieval_docs:
-            doc_tuple = (doc, 1 - scores[doc_no] / L2_Norm)
+            doc_tuple = (doc, dists[doc_no])
            results.append(doc_tuple)
            doc_no = doc_no + 1

@ -212,6 +265,7 @@ class AwaDB(VectorStore):
        embedding: Optional[List[float]] = None,
        k: int = DEFAULT_TOPN,
        scores: Optional[list] = None,
+        not_include_fields_in_metadata: Optional[Set[str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.
@ -219,9 +273,11 @@ class AwaDB(VectorStore):
        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
+            scores: Scores for retrieved docs.
+            not_include_fields_in_metadata: Fields to leave out of each
+                document's metadata.

        Returns:
-            List of Documents most similar to the query vector.
+            List of Documents which are the most similar to the query vector.
        """

        if self.awadb_client is None:
@ -232,7 +288,9 @@ class AwaDB(VectorStore):
        if embedding is None:
            return results

-        show_results = self.awadb_client.Search(embedding, k)
+        show_results = self.awadb_client.Search(
+            embedding, k, not_include_fields=not_include_fields_in_metadata
+        )

        if show_results.__len__() == 0:
            return results
@ -241,26 +299,200 @@ class AwaDB(VectorStore):
            content = ""
            meta_data = {}
            for item_key in item_detail:
-                if (
-                    item_key == "Field@0"
-                    and self.using_table_name in self.table2embeddings
-                ):  # text for the document
-                    content = item_detail[item_key]
-                elif item_key == "embedding_text":
+                if item_key == "embedding_text":
                    content = item_detail[item_key]
-                elif (
-                    item_key == "Field@1" or item_key == "text_embedding"
-                ):  # embedding field for the document
                    continue
-                elif item_key == "score":  # L2 distance
+                elif item_key == "score":
                    if scores is not None:
-                        score = item_detail[item_key]
-                        scores.append(score)
-                else:
-                    meta_data[item_key] = item_detail[item_key]
+                        scores.append(item_detail[item_key])
+                        continue
+                elif not_include_fields_in_metadata is not None:
+                    if item_key in not_include_fields_in_metadata:
+                        continue
+                meta_data[item_key] = item_detail[item_key]
            results.append(Document(page_content=content, metadata=meta_data))
        return results

+    def max_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Embed a text query and run a maximal marginal relevance search.
+
+        Maximal marginal relevance balances similarity to the query against
+        diversity among the selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                        of diversity among the results with 0 corresponding
+                        to maximum diversity and 1 to minimum diversity.
+                        Defaults to 0.5.
+            kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance, or an
+            empty list when the query embedding is empty.
+
+        Raises:
+            ValueError: If the AwaDB client is None.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        if self.using_table_name not in self.table2embeddings:
+            # No embedding registered for the current table: fall back to the
+            # embedding shipped with awadb.
+            from awadb import llm_embedding
+
+            query_vector = llm_embedding.LLMEmbedding().Embedding(query)
+        else:
+            embedder = self.table2embeddings[self.using_table_name]
+            query_vector = embedder.embed_query(query)
+
+        if len(query_vector) == 0:
+            return []
+
+        return self.max_marginal_relevance_search_by_vector(
+            query_vector, k, fetch_k, lambda_mult=lambda_mult
+        )
+
+    def max_marginal_relevance_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult: Number between 0 and 1 that determines the degree
+                        of diversity among the results with 0 corresponding
+                        to maximum diversity and 1 to minimum diversity.
+                        Defaults to 0.5.
+            kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance. Empty when
+            ``embedding`` is None or no candidate documents are retrieved.
+
+        Raises:
+            ValueError: If the AwaDB client is None.
+        """
+
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        if embedding is None:
+            return []
+
+        # "text_embedding" is deliberately NOT excluded: the stored vectors of
+        # the candidates are the input of the MMR re-ranking below.
+        not_include_fields: Set[str] = {"_id", "score"}
+        retrieved_docs = self.similarity_search_by_vector(
+            embedding, fetch_k, not_include_fields_in_metadata=not_include_fields
+        )
+        if not retrieved_docs:
+            return []
+
+        top_embeddings = [doc.metadata["text_embedding"] for doc in retrieved_docs]
+
+        # Pass k and lambda_mult through so the caller's settings take effect.
+        selected_indices = maximal_marginal_relevance(
+            np.array(embedding, dtype=np.float32),
+            embedding_list=top_embeddings,
+            lambda_mult=lambda_mult,
+            k=k,
+        )
+
+        results: List[Document] = []
+        for index in selected_indices:
+            doc = retrieved_docs[index]
+            # Remove the raw vector so it is not returned in the metadata.
+            doc.metadata.pop("text_embedding", None)
+            results.append(doc)
+        return results
+
+    def get(
+        self,
+        ids: List[str],
+        not_include_fields: Optional[Set[str]] = None,
+        **kwargs: Any,
+    ) -> Dict[str, Document]:
+        """Return the documents that have the specified ids.
+
+        Args:
+            ids: The ids of the embedding vectors.
+            not_include_fields: Optional set of field names, forwarded to the
+                AwaDB client's ``Get`` call as ``not_include_fields``.
+            kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            A dict mapping each returned document's "_id" to a Document. The
+            page content is taken from the "embedding_text" field; the metadata
+            holds every other field except "text_embedding" and "_id".
+
+        Raises:
+            ValueError: If the AwaDB client is None.
+        """
+
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        docs_detail = self.awadb_client.Get(ids, not_include_fields=not_include_fields)
+
+        results: Dict[str, Document] = {}
+        for doc_detail in docs_detail:
+            content = ""
+            meta_info = {}
+            for field in doc_detail:
+                # "embedding_text" is the text field name used by the search
+                # and update paths of this class; it carries the page content.
+                if field == "embedding_text":
+                    content = doc_detail[field]
+                    continue
+                elif field == "text_embedding" or field == "_id":
+                    continue
+
+                meta_info[field] = doc_detail[field]
+
+            doc = Document(page_content=content, metadata=meta_info)
+            results[doc_detail["_id"]] = doc
+        return results
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> Optional[bool]:
+        """Delete the documents that have the specified ids.
+
+        Args:
+            ids: The ids of the embedding vectors.
+            **kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            None when ``ids`` is None or empty; otherwise the value returned
+            by the AwaDB client's ``Delete`` call.
+
+        Raises:
+            ValueError: If the AwaDB client is None.
+        """
+        if self.awadb_client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        # Nothing to delete: report "not performed" rather than calling the client.
+        if ids is None or len(ids) == 0:
+            return None
+
+        return self.awadb_client.Delete(ids)
+
+    def update(
+        self,
+        ids: List[str],
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Update the documents that have the specified ids.
+
+        Args:
+            ids: The id list of the updating embedding vector.
+            texts: The texts of the updating documents.
+            metadatas: The metadatas of the updating documents.
+            kwargs: Accepted for interface compatibility; not used here.
+
+        Returns:
+            The value returned by the AwaDB client's ``UpdateTexts`` call,
+            annotated as the ids of the updated documents.
+
+        Raises:
+            ValueError: If the AwaDB client is None.
+        """
+
+        client = self.awadb_client
+        if client is None:
+            raise ValueError("AwaDB client is None!!!")
+
+        updated_ids = client.UpdateTexts(
+            ids=ids,
+            text_field_name="embedding_text",
+            texts=texts,
+            metadatas=metadatas,
+        )
+        return updated_ids
+
    def create_table(
        self,
        table_name: str,
@ -364,7 +596,8 @@ class AwaDB(VectorStore):
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
            table_name (str): Name of the table to create.
            log_and_data_dir (Optional[str]): Directory to persist the table.
-            client (Optional[awadb.Client]): AwaDB client
+            client (Optional[awadb.Client]): AwaDB client.
+            Any: Any possible parameters in the future

        Returns:
            AwaDB: AwaDB vectorstore.
--- a/poetry.lock
+++ b/poetry.lock