from pathlib import Path
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd
from paddleocr import PaddleOCR
from PIL import Image


class TextRecognizer:
    """
    A class for performing OCR on detected tables using PaddleOCR.

    Attributes:
        models_dir (Path): Directory containing OCR model files.
    """
    def __init__(self, models_dir: Optional[Union[str, Path]] = None) -> None:
        """
        Initialize the TextRecognizer with a model directory.

        Args:
            models_dir: Directory containing the OCR model files. Defaults to
                a 'paddleocr_models' directory next to this module.
        """
        self.models_dir = Path(models_dir) if models_dir else Path(__file__).parent / 'paddleocr_models'
        self._setup_model_dirs()
        # English detection + recognition models; angle classification is
        # disabled since table text is assumed to be upright.
        self.model = PaddleOCR(
            use_angle_cls=False,
            lang='en',
            det_model_dir=str(self.models_dir / 'det'),
            rec_model_dir=str(self.models_dir / 'rec')
        )

    def _setup_model_dirs(self) -> None:
        """Create the detection and recognition model directories if needed."""
        (self.models_dir / 'det').mkdir(parents=True, exist_ok=True)
        (self.models_dir / 'rec').mkdir(parents=True, exist_ok=True)

    def recognize(
        self,
        image_path: Union[str, Path],
        table_boxes: Optional[np.ndarray] = None,
        padding: tuple = (0, 0)
    ) -> List[pd.DataFrame]:
        """
        Perform OCR on the image within the specified table regions.

        If exactly one table box is given, the image is cropped to that box
        (plus padding) before OCR, so the returned coordinates are relative
        to the crop. With multiple boxes, OCR runs on the full image and the
        results are split per table.

        Args:
            image_path: Path to the input image.
            table_boxes: Array of table bounding boxes as
                [x_min, y_min, x_max, y_max] pixel coordinates.
            padding: Padding to add around table regions as (x, y) pixels.

        Returns:
            List of DataFrames containing the extracted text and positions,
            one per table.
        """
        with Image.open(image_path) as img:
            img_array = np.array(img.convert('RGB'))

        if table_boxes is not None and len(table_boxes) == 1:
            # Crop to the single table region; numpy slicing clips the upper
            # bounds to the image size automatically.
            pad_x, pad_y = padding
            box = np.asarray(table_boxes[0]).astype(int)
            img_array = img_array[
                max(box[1] - pad_y, 0):box[3] + pad_y,
                max(box[0] - pad_x, 0):box[2] + pad_x
            ]

        ocr_result = self.model.ocr(img_array)
        # PaddleOCR may return an empty or None page when no text is found.
        if not ocr_result or not ocr_result[0]:
            return []

        if table_boxes is not None and len(table_boxes) > 1:
            return self._process_multiple_tables(ocr_result[0], table_boxes)
        return self._process_single_table(ocr_result[0])

    def _process_multiple_tables(
        self,
        ocr_data: List,
        table_boxes: np.ndarray
    ) -> List[pd.DataFrame]:
        """Process OCR results for multiple tables, grouping words by table."""
        result: Dict[int, List] = {}
        for item in ocr_data:
            # Collapse the four-point text polygon into an axis-aligned box.
            bbox = np.array(item[0]).astype(int)
            word = item[1][0]
            bbox = [bbox[:, 0].min(), bbox[:, 1].min(), bbox[:, 0].max(), bbox[:, 1].max()]
            # Assign the word to any table whose box contains its top-left corner.
            for idx, table_box in enumerate(table_boxes):
                if (bbox[0] >= table_box[0] and bbox[1] >= table_box[1] and
                        bbox[0] <= table_box[2] and bbox[1] <= table_box[3]):
                    result.setdefault(idx, []).append((word, bbox))
        # One DataFrame per table, sorted top-to-bottom, then left-to-right.
        return [
            pd.DataFrame(
                sorted(table_data, key=lambda x: (x[1][1], x[1][0])),
                columns=['text', 'boundingBox']
            )
            for table_data in result.values()
        ]

    def _process_single_table(self, ocr_data: List) -> List[pd.DataFrame]:
        """Process OCR results for a single table."""
        processed_data = []
        for item in ocr_data:
            pts = np.array(item[0])
            word = item[1][0]
            bbox = [pts[:, 0].min(), pts[:, 1].min(), pts[:, 0].max(), pts[:, 1].max()]
            processed_data.append((word, bbox))
        return [pd.DataFrame(
            sorted(processed_data, key=lambda x: (x[1][1], x[1][0])),
            columns=['text', 'boundingBox']
        )]
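

# --- Usage sketch -----------------------------------------------------------
# A minimal example of how the class above might be driven, assuming table
# boxes come from an upstream table-detection step as [x_min, y_min, x_max,
# y_max] pixel coordinates. The image path and box values are placeholders,
# not part of the module.
if __name__ == '__main__':
    recognizer = TextRecognizer()

    # Hypothetical inputs: replace with a real scan and detected table boxes.
    image_path = Path('sample_page.png')
    table_boxes = np.array([[50, 100, 800, 600]])

    tables = recognizer.recognize(image_path, table_boxes, padding=(10, 10))
    for df in tables:
        # Each DataFrame has 'text' and 'boundingBox' columns, sorted
        # top-to-bottom, then left-to-right.
        print(df.head())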