-
Notifications
You must be signed in to change notification settings - Fork 0
/
ocr_module.py
79 lines (68 loc) · 2.6 KB
/
ocr_module.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# ocr_module.py
import pytesseract
from pdf2image import convert_from_path
import json
from pathlib import Path
import tempfile
import logging
from PIL import Image
from dataclasses import dataclass
from typing import List, Optional
@dataclass
class OCRResult:
    """Outcome of one OCR extraction run over a single PDF.

    Attributes:
        filename: Name of the processed PDF.
        total_lines: Number of entries in ``lines``.
        lines: Extracted text lines.
        success: ``True`` when extraction completed, ``False`` otherwise.
        error: Error message for a failed run; ``None`` by default.
    """

    filename: str
    total_lines: int
    lines: List[str]
    success: bool
    error: Optional[str] = None
class OCRPDFExtractor:
    """Extract text lines from a PDF by running Tesseract OCR on each page.

    Pages are rendered with ``convert_from_path``, binarised by
    ``preprocess_image`` and recognised with ``pytesseract``. Errors raised
    during processing are logged and reported through ``OCRResult`` instead
    of propagating to the caller.
    """

    def __init__(self, tesseract_path: Optional[str] = None):
        """Set the Tesseract binary location (if given) and set up logging.

        Args:
            tesseract_path: Path to the ``tesseract`` executable. When falsy,
                pytesseract's current ``tesseract_cmd`` is left untouched.
        """
        if tesseract_path:
            pytesseract.pytesseract.tesseract_cmd = tesseract_path
        # basicConfig does nothing when the root logger already has handlers,
        # so an application's own logging configuration takes precedence.
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def preprocess_image(self, image: "Image.Image") -> "Image.Image":
        """Return a 1-bit black/white version of ``image``.

        The image is converted to mode ``'L'`` and thresholded at 128:
        values below 128 map to 0, everything else to 255.
        """
        return image.convert('L').point(lambda x: 0 if x < 128 else 255, '1')

    @staticmethod
    def _source_name(pdf_file) -> str:
        """Return the name to report for ``pdf_file``.

        Paths yield their final component; any other object yields its
        ``filename`` attribute, or ``'unknown'`` when it has none.
        """
        if isinstance(pdf_file, (str, Path)):
            return Path(pdf_file).name
        return getattr(pdf_file, 'filename', 'unknown')

    @staticmethod
    def _load_images(pdf_file):
        """Render every page of ``pdf_file`` and return the page images."""
        if isinstance(pdf_file, (str, Path)):
            return convert_from_path(str(pdf_file))
        # Uploaded file object: persist it so it can be converted by path.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir) / "temp.pdf"
            pdf_file.save(temp_path)
            return convert_from_path(str(temp_path))

    @staticmethod
    def _page_lines(text: str, page_number: int, multi_page: bool) -> List[str]:
        """Split one page of OCR text into stripped, non-empty lines.

        Blank lines are dropped *before* the ``[Page N]`` prefix is applied;
        prefixing first would turn every blank line (and a trailing form
        feed) into a bogus ``[Page N]`` entry.
        """
        stripped = (raw.strip() for raw in text.split('\n'))
        kept = [line for line in stripped if line]
        if multi_page:
            return [f"[Page {page_number}] {line}" for line in kept]
        return kept

    def extract_lines(self, pdf_file) -> "OCRResult":
        """Run OCR over ``pdf_file`` and collect its non-empty text lines.

        Args:
            pdf_file: Either a filesystem path (``str`` or ``Path``) or an
                uploaded-file object exposing ``save(path)`` and ``filename``.

        Returns:
            An ``OCRResult``. On success ``lines`` holds the stripped,
            non-empty lines of every page, prefixed with ``[Page N]`` when
            the document has more than one page. On failure ``success`` is
            ``False``, ``lines`` is empty and ``error`` carries the message.
        """
        # Resolved up front so the failure result can name the file too.
        filename = self._source_name(pdf_file)
        try:
            images = self._load_images(pdf_file)
            multi_page = len(images) > 1
            all_lines: List[str] = []
            for page_number, image in enumerate(images, 1):
                self.logger.info("Processing page %d", page_number)
                processed_image = self.preprocess_image(image)
                text = pytesseract.image_to_string(
                    processed_image, lang='eng')
                all_lines.extend(
                    self._page_lines(text, page_number, multi_page))
        except Exception as e:
            # Deliberate catch-all: callers get a failed OCRResult, not a raise.
            self.logger.error("Error in OCR processing: %s", e)
            return OCRResult(
                filename=filename,
                total_lines=0,
                lines=[],
                success=False,
                error=str(e)
            )
        return OCRResult(
            filename=filename,
            total_lines=len(all_lines),
            lines=all_lines,
            success=True
        )