Andreas Varotsis 2 weeks ago committed by GitHub
commit 301793aea4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -281,6 +281,28 @@
"docs = docsearch.max_marginal_relevance_search(query, k=2, fetch_k=10, lambda_param=0.5)"
]
},
{
"cell_type": "markdown",
"id": "483a280e",
"metadata": {},
"source": [
"## Hybrid Search\n",
"If you'd like to combine KNN search with more traditional BM25 keyword search (for example, in new domains where vector search may underperform), you can use the `hybrid_search` search type. Optionally, you can also provide a tuple of weights for the keyword and KNN searches, respectively."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "085c898e",
"metadata": {},
"outputs": [],
"source": [
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"hybrid_search = docsearch.similarity_search(\n",
" query, search_type=\"hybrid_search\", hybrid_search_weights=(0.5, 0.5)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "73264864",

@ -20,6 +20,7 @@ Could not import AsyncOpenSearch.
Please install it with `pip install opensearch-py`."""
# Values accepted for the `search_type` keyword argument.
SCRIPT_SCORING_SEARCH = "script_scoring"
HYBRID_SEARCH = "hybrid_search"
PAINLESS_SCRIPTING_SEARCH = "painless_scripting"
# OpenSearch `match_all` query; the documented default for `pre_filter`.
MATCH_ALL_QUERY = {"match_all": {}}  # type: Dict
@ -428,6 +429,7 @@ class OpenSearchVectorSearch(VectorStore):
self.index_name = index_name
http_auth = kwargs.get("http_auth")
self.is_aoss = _is_aoss_enabled(http_auth=http_auth)
self.hybrid_search_weights = kwargs.get("hybrid_search_weights", (0.7, 0.3))
self.client = _get_opensearch_client(opensearch_url, **kwargs)
self.async_client = _get_async_opensearch_client(opensearch_url, **kwargs)
self.engine = kwargs.get("engine")
@ -755,6 +757,12 @@ class OpenSearchVectorSearch(VectorStore):
List of Documents most similar to the query.
Optional Args:
search_type: The type of search to perform. Can be one of:
- "approximate_search" (default)
- "script_scoring"
- "painless_scripting"
- "hybrid_search"
vector_field: Document field embeddings are stored in. Defaults to
"vector_field".
@ -797,6 +805,9 @@ class OpenSearchVectorSearch(VectorStore):
pre_filter: script_score query to pre-filter documents before identifying
nearest neighbors; default: {"match_all": {}}
Optional Args for Hybrid Search:
hybrid_search_weights: tuple of (keyword_weight, knn_weight) applied to the keyword search and the KNN search, respectively; default: (0.7, 0.3)
"""
docs_with_scores = self.similarity_search_with_score(
query, k, score_threshold, **kwargs
@ -823,27 +834,48 @@ class OpenSearchVectorSearch(VectorStore):
score_threshold: Optional[float] = 0.0,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and it's scores most similar to query.
By default, supports Approximate Search.
Also supports Script Scoring and Painless Scripting.
"""Return docs and their similarity scores based on the input query.
Args:
query: Text to look up documents similar to.
query: The query text to search for.
k: Number of Documents to return. Defaults to 4.
score_threshold: Specify a score threshold to return only documents
above the threshold. Defaults to 0.0.
Returns:
List of Documents along with its scores most similar to the query.
List of tuples containing the Document and its similarity score.
Optional Args:
same as `similarity_search`
search_type: The type of search to perform. Can be one of:
- "approximate_search" (default)
- "script_scoring"
- "painless_scripting"
- "hybrid_search"
query_text: The query text to use for keyword search in hybrid search.
Other optional arguments are the same as `similarity_search`.
"""
embedding = self.embedding_function.embed_query(query)
return self.similarity_search_with_score_by_vector(
embedding, k, score_threshold, **kwargs
hits = self._raw_similarity_search_with_score_by_vector(
embedding=embedding, k=k, query_text=query, **kwargs
)
text_field = kwargs.get("text_field", "text")
metadata_field = kwargs.get("metadata_field", "metadata")
documents_with_scores = [
(
Document(
page_content=hit["_source"][text_field],
metadata=(
hit["_source"]
if metadata_field == "*" or metadata_field not in hit["_source"]
else hit["_source"][metadata_field]
),
),
hit["_score"],
)
for hit in hits
]
return documents_with_scores
def similarity_search_with_score_by_vector(
self,
@ -852,28 +884,31 @@ class OpenSearchVectorSearch(VectorStore):
score_threshold: Optional[float] = 0.0,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and it's scores most similar to the embedding vector.
By default, supports Approximate Search.
Also supports Script Scoring and Painless Scripting.
"""Return docs and their similarity scores based on the input vector.
Args:
embedding: Embedding vector to look up documents similar to.
embedding: The embedding vector to search for.
k: Number of Documents to return. Defaults to 4.
score_threshold: Specify a score threshold to return only documents
above the threshold. Defaults to 0.0.
Returns:
List of Documents along with its scores most similar to the query.
List of tuples containing the Document and its similarity score.
Optional Args:
same as `similarity_search`
search_type: The type of search to perform. Can be one of:
- "approximate_search" (default)
- "script_scoring"
- "painless_scripting"
- "hybrid_search"
query_text: The query text to use for keyword search in hybrid search.
Other optional arguments are the same as `similarity_search`.
"""
text_field = kwargs.get("text_field", "text")
metadata_field = kwargs.get("metadata_field", "metadata")
hits = self._raw_similarity_search_with_score_by_vector(
embedding=embedding, k=k, score_threshold=score_threshold, **kwargs
embedding=embedding, k=k, query_text=kwargs.get("query_text", ""), **kwargs
)
documents_with_scores = [
@ -1026,6 +1061,44 @@ class OpenSearchVectorSearch(VectorStore):
vector_field,
score_threshold=score_threshold,
)
elif search_type == HYBRID_SEARCH:
keyword_weight, vector_weight = self.hybrid_search_weights
query_text = kwargs.get("query_text", "")
search_query = {
"size": k,
"query": {
"function_score": {
"query": {
"bool": {
"should": [
{
"match": {
"text": query_text
}
},
{
"knn": {
vector_field: {
"vector": embedding,
"k": k
}
}
}
]
}
},
"functions": [
{
"weight": keyword_weight
},
{
"weight": vector_weight
}
],
"score_mode": "sum"
}
}
}
else:
raise ValueError("Invalid `search_type` provided as an argument")

Loading…
Cancel
Save