feat: #2 Provide option to just create a model for provided book

1 year ago · c1fddfb986
parent db9e458e04
commit c1fddfb986
5 changed files with 53 additions and 14 deletions
--- a/poetry.lock
+++ b/poetry.lock
@@ -2009,6 +2009,17 @@ files = [
    {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"},
 ]

+[[package]]
+name = "slug"
+version = "2.0"
+description = "Python module to convert str to slug"
+category = "main"
+optional = false
+python-versions = "*"
+files = [
+    {file = "slug-2.0-py3-none-any.whl", hash = "sha256:f55b24a0563428ce56243079f6aeb79c0dd1cc5bbf8b39b10bd6e5263ba8be5f"},
+]
+
 [[package]]
 name = "smmap"
 version = "5.0.0"
@@ -2352,4 +2363,4 @@ testing = ["flake8 (<5)", "func-timeout", "jaraco.functools", "jaraco.itertools"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9.0, <4.0"
-content-hash = "63ba1df6e953119e1330dc91b626d073f20c59e8df8a9afeca312563c003f20a"
+content-hash = "1b4573d056d857cacbbe6ee164af7190b06b11bc2cd794326a54cfbd0f592896"
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,7 @@ langchain = "^0.0.57"
 faiss-cpu = "^1.7.3"
 python-dotenv = "^0.21.0"
 panel = "^0.14.2"
+slug = "^2.0"

 [tool.poetry.group.dev.dependencies]
 autoflake = "*"
--- a/src/doc_search/app.py
+++ b/src/doc_search/app.py
@@ -9,7 +9,7 @@ from rich import print

 from doc_search import setup_logging
 from doc_search.web import run_web
-from doc_search.workflow import workflow_steps
+from doc_search.workflow import training_workflow_steps, workflow_steps


 def parse_args() -> Namespace:
@@ -24,6 +24,7 @@ def parse_args() -> Namespace:
        "-q", "--input-question", default="Can you summarize the lessons from this book?", help="Question to ask"
    )
    parser.add_argument("-w", "--overwrite-index", action="store_true", help="Overwrite existing index")
+    parser.add_argument("-t", "--train", action="store_true", help="Train and index the PDF file")
    parser.add_argument("-a", "--web-app", action="store_true", help="Start WebApp")

    parser.add_argument(
@@ -43,6 +44,8 @@ def main() -> None:  # pragma: no cover
    context = args.__dict__
    if args.web_app:
        run_web(context)
+    elif args.train:
+        run_workflow(context, training_workflow_steps())
    else:
        run_workflow(context, workflow_steps())
        print("[bold]Question: " + context["input_question"] + "[/bold]")
--- a/src/doc_search/workflow/__init__.py
+++ b/src/doc_search/workflow/__init__.py
@@ -1,5 +1,6 @@
 import logging
 import pickle
+import shutil
 from pathlib import Path
 from typing import Any

@@ -12,24 +13,41 @@ from langchain.vectorstores.faiss import FAISS
 from py_executable_checklist.workflow import WorkflowBase, run_command
 from pypdf import PdfReader
 from rich import print
+from slug import slug  # type: ignore

 from doc_search import retry


+def slugify_pdf_name(input_pdf_path: Path) -> str:
+    return str(slug(input_pdf_path.stem))
+
+
 def pdf_name_from(input_pdf_path: Path) -> str:
-    return input_pdf_path.stem
+    return slugify_pdf_name(input_pdf_path)


-def pdf_to_faiss_db_path(app_dir: Path, input_pdf_path: Path) -> Path:
+def output_directory_for_pdf(app_dir: Path, input_pdf_path: Path) -> Path:
    pdf_file_name = pdf_name_from(input_pdf_path)
-    output_dir = app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "index"
+    return app_dir / "OutputDir/dr-doc-search" / pdf_file_name
+
+
+def copy_raw_pdf_file(app_dir: Path, input_pdf_path: Path) -> Path:
+    pdf_file_name = pdf_name_from(input_pdf_path)
+    output_dir = app_dir / "OutputDir/dr-doc-search" / pdf_file_name
+    output_dir.mkdir(parents=True, exist_ok=True)
+    new_input_pdf_path = output_dir / f"{pdf_file_name}.pdf"
+    shutil.copy2(input_pdf_path, new_input_pdf_path)
+    return new_input_pdf_path
+
+
+def pdf_to_faiss_db_path(app_dir: Path, input_pdf_path: Path) -> Path:
+    output_dir = output_directory_for_pdf(app_dir, input_pdf_path) / "index"
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir / "index.pkl"


 def pdf_to_index_path(app_dir: Path, input_pdf_path: Path) -> Path:
-    pdf_file_name = input_pdf_path.stem
-    output_dir = app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "index"
+    output_dir = output_directory_for_pdf(app_dir, input_pdf_path) / "index"
    output_dir.mkdir(parents=True, exist_ok=True)
    return output_dir / "docsearch.index"

@@ -39,17 +57,24 @@ class VerifyInputFile(WorkflowBase):
    Verify input file and return pdf stats
    """

+    app_dir: Path
    input_pdf_path: Path
    start_page: int
    end_page: int

    def execute(self) -> dict:
-        reader = PdfReader(self.input_pdf_path)
+        new_pdf_file_path = copy_raw_pdf_file(self.app_dir, self.input_pdf_path)
+        reader = PdfReader(new_pdf_file_path)
        total_pages = len(reader.pages)
        start_page = self.start_page if self.start_page != -1 else 1
        end_page = self.end_page if self.end_page != -1 else total_pages

-        return {"start_page": start_page, "end_page": end_page, "total_pages": total_pages}
+        return {
+            "input_pdf_path": new_pdf_file_path,
+            "start_page": start_page,
+            "end_page": end_page,
+            "total_pages": total_pages,
+        }


 class ConvertPDFToImages(WorkflowBase):
@@ -63,8 +88,7 @@ class ConvertPDFToImages(WorkflowBase):
    end_page: int

    def execute(self) -> dict:
-        pdf_file_name = self.input_pdf_path.stem
-        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "images"
+        output_dir = output_directory_for_pdf(self.app_dir, self.input_pdf_path) / "images"
        output_dir.mkdir(parents=True, exist_ok=True)

        for i in range(self.start_page, self.end_page):
@@ -88,8 +112,7 @@ class ConvertImagesToText(WorkflowBase):
    app_dir: Path

    def execute(self) -> dict:
-        pdf_file_name = self.input_pdf_path.stem
-        output_dir = self.app_dir / "OutputDir/dr-doc-search" / pdf_file_name / "scanned"
+        output_dir = output_directory_for_pdf(self.app_dir, self.input_pdf_path) / "scanned"
        output_dir.mkdir(parents=True, exist_ok=True)

        for image_path in self.pdf_images_path.glob("*.png"):
@@ -243,7 +266,6 @@ AI:"""
        return qa.run(prompt)


-# TODO: Only run when --train argument is passed from command line
 def training_workflow_steps() -> list:
    return [
        VerifyInputFile,
--- a/tests/verify_input_file_test.py
+++ b/tests/verify_input_file_test.py
@@ -8,6 +8,7 @@ from doc_search.workflow import VerifyInputFile

 def test_return_pdf_properties() -> None:
    context: dict[str, Any] = {
+        "app_dir": Path.cwd(),
        "input_pdf_path": Path("tests/data/input.pdf"),
        "start_page": -1,
        "end_page": -1,
@@ -22,6 +23,7 @@ def test_return_pdf_properties() -> None:

 def test_override_start_and_end_pages() -> None:
    context: dict[str, Any] = {
+        "app_dir": Path.cwd(),
        "input_pdf_path": Path("tests/data/input.pdf"),
        "start_page": 2,
        "end_page": 2,