You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
38 lines
987 B
Python
38 lines
987 B
Python
r"""°°°
|
|
## Loading PDF
|
|
°°°"""
|
|
#|%%--%%| <ut22SE2PmJ|EQL3ZDG6Dt>
|
|
|
|
from langchain.document_loaders import PagedPDFSplitter
|
|
|
|
# Location of the PDF this notebook works with.
pdf_path = "./documents/layout-parser-paper.pdf"

# Create the loader for that file; load_and_split() yields the documents
# that are later handed to the vector store.
loader = PagedPDFSplitter(pdf_path)
pages = loader.load_and_split()
|
|
|
|
#|%%--%%| <EQL3ZDG6Dt|6LWg1c7vN6>
|
|
r"""°°°
|
|
Documents can be retrieved with page numbers
|
|
°°°"""
|
|
#|%%--%%| <6LWg1c7vN6|0kFnbEI7yL>
|
|
|
|
from langchain.vectorstores import FAISS
|
|
from langchain.embeddings.openai import OpenAIEmbeddings
|
|
|
|
#|%%--%%| <0kFnbEI7yL|KkXwCS4JHN>
|
|
|
|
# Build a FAISS index over the loaded pages using OpenAI embeddings.
embeddings = OpenAIEmbeddings()
faiss_index = FAISS.from_documents(pages, embeddings)

# Find the docs (i.e. pages) most similar to the query.
# k: number of similar docs requested.
query = "How will the community be engaged ?"
docs = faiss_index.similarity_search(query, k=2)
|
|
|
|
#|%%--%%| <KkXwCS4JHN|RDajVoEdqh>
|
|
# Print the page number and content of each doc returned for the query.
for doc in docs:
    print("\n----\n")
    # The stored "page" metadata is incremented by one before display.
    page_number = doc.metadata["page"] + 1
    print(f"page: {page_number}")
    print("content:")
    print(doc.page_content)
|
|
|
|
#|%%--%%| <RDajVoEdqh|cqoPocvVBS>
|
|
|