|
|
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
import pickle
|
|
|
|
|
import platform
|
|
|
|
|
import shutil
|
|
|
|
|
import warnings
|
|
|
|
|
from pathlib import Path
|
|
|
|
@ -99,11 +100,30 @@ class VerifyInputFile(WorkflowBase):
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ImageMagickCommand(WorkflowBase):
    """
    Select the ImageMagick ``convert`` command for the current OS.

    On Windows the command is prefixed with the value of the ``IMCONV``
    environment variable; on every other platform the bare ``convert``
    command is used.
    """

    def execute(self) -> dict:
        """
        Build the command prefix used to invoke ImageMagick.

        Returns:
            A dict with the single key ``convert_command`` whose value is
            ``"convert"``, or ``"<IMCONV> convert"`` on Windows.

        Raises:
            AssertionError: on Windows when ``IMCONV`` is not set.
        """
        command = "convert"

        if platform.system() == "Windows":
            image_magick_path = os.getenv("IMCONV")
            # Raise explicitly rather than via an ``assert`` statement:
            # asserts are removed under ``python -O``, which would let an
            # unset variable produce the bogus command "None convert".
            # AssertionError is kept so existing callers see the same type.
            if image_magick_path is None:
                raise AssertionError(
                    "IMCONV environment variable not set. It should point to location of ImageMagick's magick.exe"
                )
            command = f"{image_magick_path} {command}"

        return {"convert_command": command}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ConvertPDFToImages(WorkflowBase):
|
|
|
|
|
"""
|
|
|
|
|
Convert PDF to images using ImageMagick
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
convert_command: str
|
|
|
|
|
input_pdf_path: Path
|
|
|
|
|
app_dir: Path
|
|
|
|
|
start_page: int
|
|
|
|
@ -118,7 +138,7 @@ class ConvertPDFToImages(WorkflowBase):
|
|
|
|
|
image_path = output_dir / f"output-{i}.png"
|
|
|
|
|
if image_path.exists():
|
|
|
|
|
continue
|
|
|
|
|
convert_command = f"""convert -density 150 -trim -background white -alpha remove -quality 100 -sharpen 0x1.0 {input_file_page} -quality 100 {image_path}"""
|
|
|
|
|
convert_command = f"""{self.convert_command} -density 150 -trim -background white -alpha remove -quality 100 -sharpen 0x1.0 {input_file_page} -quality 100 {image_path}"""
|
|
|
|
|
run_command(convert_command)
|
|
|
|
|
|
|
|
|
|
return {"pdf_images_path": output_dir}
|
|
|
|
@ -294,6 +314,7 @@ ${question}
|
|
|
|
|
def training_workflow_steps() -> list:
|
|
|
|
|
return [
|
|
|
|
|
VerifyInputFile,
|
|
|
|
|
ImageMagickCommand,
|
|
|
|
|
ConvertPDFToImages,
|
|
|
|
|
ConvertImagesToText,
|
|
|
|
|
CombineAllText,
|
|
|
|
@ -304,6 +325,7 @@ def training_workflow_steps() -> list:
|
|
|
|
|
def pre_process_workflow_steps() -> list:
    """
    Return the workflow step classes for pre-processing, in list order.

    The list holds the step classes themselves, not instances.
    """
    # NOTE(review): ImageMagickCommand presumably has to precede
    # ConvertPDFToImages so that "convert_command" is available to it;
    # confirm against the workflow runner.
    steps = [
        VerifyInputFile,
        ImageMagickCommand,
        ConvertPDFToImages,
        ConvertImagesToText,
    ]
    return steps
|
|
|
|
|