feat: #2 Provide option to just create a model for provided book

doc-sources
namuan 1 year ago
parent db9e458e04
commit c1fddfb986

poetry.lock (generated)

@@ -2009,6 +2009,17 @@ files = [
     {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]
+[[package]]
+name = "slug"
+version = "2.0"
+description = "Python module to convert str to slug"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "slug-2.0-py3-none-any.whl", hash = "sha256:f55b24a0563428ce56243079f6aeb79c0dd1cc5bbf8b39b10bd6e5263ba8be5f"},
+]
 [[package]]
 name = "smmap"
 version = "5.0.0"
@@ -2352,4 +2363,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9.0, <4.0"
-content-hash = "63ba1df6e953119e1330dc91b626d073f20c59e8df8a9afeca312563c003f20a"
+content-hash = "1b4573d056d857cacbbe6ee164af7190b06b11bc2cd794326a54cfbd0f592896"

pyproject.toml
@@ -40,6 +40,7 @@ langchain = "^0.0.57"
 faiss-cpu = "^1.7.3"
 python-dotenv = "^0.21.0"
 panel = "^0.14.2"
+slug = "^2.0"
 [tool.poetry.group.dev.dependencies]
 autoflake = "*"

@@ -9,7 +9,7 @@ from rich import print
 from doc_search import setup_logging
 from doc_search.web import run_web
-from doc_search.workflow import workflow_steps
+from doc_search.workflow import training_workflow_steps, workflow_steps
 def parse_args() -> Namespace:
@@ -24,6 +24,7 @@ def parse_args() -> Namespace:
         "-q", "--input-question", default="Can you summarize the lessons from this book?", help="Question to ask"
     )
     parser.add_argument("-w", "--overwrite-index", action="store_true", help="Overwrite existing index")
+    parser.add_argument("-t", "--train", action="store_true", help="Train and index the PDF file")
     parser.add_argument("-a", "--web-app", action="store_true", help="Start WebApp")
     parser.add_argument(
@@ -43,6 +44,8 @@ def main() -> None: # pragma: no cover
     context = args.__dict__
     if args.web_app:
         run_web(context)
+    elif args.train:
+        run_workflow(context, training_workflow_steps())
     else:
         run_workflow(context, workflow_steps())
         print("[bold]Question: " + context["input_question"] + "[/bold]")

@@ -1,5 +1,6 @@
 import logging
 import pickle
+import shutil
 from pathlib import Path
 from typing import Any
@@ -12,24 +13,41 @@ from langchain.vectorstores.faiss import FAISS
 from py_executable_checklist.workflow import WorkflowBase, run_command
 from pypdf import PdfReader
 from rich import print
+from slug import slug  # type: ignore
 from doc_search import retry
+def slugify_pdf_name(input_pdf_path: Path) -> str:
+    return str(slug(input_pdf_path.stem))
 def pdf_name_from(input_pdf_path: Path) -> str:
-    return input_pdf_path.stem
+    return slugify_pdf_name(input_pdf_path)
-def pdf_to_faiss_db_path(app_dir: Path, input_pdf_path: Path) -> Path:
+def output_directory_for_pdf(app_dir: Path, input_pdf_path: Path) -> Path:
     pdf_file_name = pdf_name_from(input_pdf_path)
-    output_dir = app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "index"
+    return app_dir / "OutputDir/dr-doc-search" / pdf_file_name
+def copy_raw_pdf_file(app_dir: Path, input_pdf_path: Path) -> Path:
+    pdf_file_name = pdf_name_from(input_pdf_path)
+    output_dir = app_dir / "OutputDir/dr-doc-search" / pdf_file_name
+    output_dir.mkdir(parents=True, exist_ok=True)
+    new_input_pdf_path = output_dir / f"{pdf_file_name}.pdf"
+    shutil.copy2(input_pdf_path, new_input_pdf_path)
+    return new_input_pdf_path
+def pdf_to_faiss_db_path(app_dir: Path, input_pdf_path: Path) -> Path:
+    output_dir = output_directory_for_pdf(app_dir, input_pdf_path) / "index"
     output_dir.mkdir(parents=True, exist_ok=True)
     return output_dir / "index.pkl"
 def pdf_to_index_path(app_dir: Path, input_pdf_path: Path) -> Path:
-    pdf_file_name = input_pdf_path.stem
-    output_dir = app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "index"
+    output_dir = output_directory_for_pdf(app_dir, input_pdf_path) / "index"
     output_dir.mkdir(parents=True, exist_ok=True)
     return output_dir / "docsearch.index"
@@ -39,17 +57,24 @@ class VerifyInputFile(WorkflowBase):
     Verify input file and return pdf stats
     """
+    app_dir: Path
     input_pdf_path: Path
     start_page: int
     end_page: int
     def execute(self) -> dict:
-        reader = PdfReader(self.input_pdf_path)
+        new_pdf_file_path = copy_raw_pdf_file(self.app_dir, self.input_pdf_path)
+        reader = PdfReader(new_pdf_file_path)
         total_pages = len(reader.pages)
         start_page = self.start_page if self.start_page != -1 else 1
         end_page = self.end_page if self.end_page != -1 else total_pages
-        return {"start_page": start_page, "end_page": end_page, "total_pages": total_pages}
+        return {
+            "input_pdf_path": new_pdf_file_path,
+            "start_page": start_page,
+            "end_page": end_page,
+            "total_pages": total_pages,
+        }
 class ConvertPDFToImages(WorkflowBase):
@@ -63,8 +88,7 @@ class ConvertPDFToImages(WorkflowBase):
     end_page: int
     def execute(self) -> dict:
-        pdf_file_name = self.input_pdf_path.stem
-        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "images"
+        output_dir = output_directory_for_pdf(self.app_dir, self.input_pdf_path) / "images"
         output_dir.mkdir(parents=True, exist_ok=True)
         for i in range(self.start_page, self.end_page):
@@ -88,8 +112,7 @@ class ConvertImagesToText(WorkflowBase):
     app_dir: Path
     def execute(self) -> dict:
-        pdf_file_name = self.input_pdf_path.stem
-        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "scanned"
+        output_dir = output_directory_for_pdf(self.app_dir, self.input_pdf_path) / "scanned"
         output_dir.mkdir(parents=True, exist_ok=True)
         for image_path in self.pdf_images_path.glob("*.png"):
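
Note: taken together, the helpers and steps above give each book one slugged output tree. A sketch of the resulting layout for a single book follows; the slugged directory name is hypothetical, while the structure comes from the paths in this diff:

<app_dir>/OutputDir/dr-doc-search/<slugged-name>/
    <slugged-name>.pdf       copy made by copy_raw_pdf_file (VerifyInputFile)
    images/                  page images written by ConvertPDFToImages
    scanned/                 text extracted from the page images by ConvertImagesToText
    index/
        index.pkl            FAISS store path from pdf_to_faiss_db_path
        docsearch.index      index file path from pdf_to_index_path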
@@ -243,7 +266,6 @@ AI:"""
         return qa.run(prompt)
-# TODO: Only run when --train argument is passed from command line
 def training_workflow_steps() -> list:
     return [
         VerifyInputFile,

@@ -8,6 +8,7 @@ from doc_search.workflow import VerifyInputFile
 def test_return_pdf_properties() -> None:
     context: dict[str, Any] = {
+        "app_dir": Path.cwd(),
         "input_pdf_path": Path("tests/data/input.pdf"),
         "start_page": -1,
         "end_page": -1,
@@ -22,6 +23,7 @@ def test_return_pdf_properties() -> None:
 def test_override_start_and_end_pages() -> None:
     context: dict[str, Any] = {
+        "app_dir": Path.cwd(),
         "input_pdf_path": Path("tests/data/input.pdf"),
         "start_page": 2,
         "end_page": 2,
