Multicentury-HTR-Demo

Sleeping

App Files Files Community

MikkoLipsanen committed 22 days ago

Commit

1cf6710

verified ·

1 Parent(s): 7f74e3e

Create image_processing.py

Browse files

Files changed (1) hide show

image_processing.py +145 -0

image_processing.py ADDED Viewed

	@@ -0,0 +1,145 @@

+from torchvision.transforms import v2 as transforms_v2
+from torchvision.io import read_image, ImageReadMode
+import numpy as np
+import torch
+import cv2
def load_with_torchvision(img_path):
    """
    Read an image file with torchvision and hand it back as a numpy array.
    Args:
        img_path (str or Path): Location of the image file; converted with
            ``str()`` before being passed to torchvision
    Returns:
        numpy.ndarray: RGB image laid out channels-last, shape (H, W, C)
    """
    # ImageReadMode.RGB forces a 3-channel result regardless of the file's mode
    chw_tensor = read_image(str(img_path), mode=ImageReadMode.RGB)
    # torchvision yields (C, H, W); reorder the axes to (H, W, C) for numpy users
    return chw_tensor.permute(1, 2, 0).numpy()
def preprocess_resize_torch_transform(image, max_size=1024, normalize=True):
    """
    Shrink an image so neither side exceeds ``max_size`` and optionally
    convert it to float32, using torchvision.transforms.v2.
    Args:
        image: torch.Tensor (C, H, W), numpy.ndarray or PIL Image.
            Numpy arrays are converted to tensors first; an (H, W, C) array
            with 1 or 3 channels is permuted to (C, H, W). A 2-D (H, W)
            array or tensor gets a leading channel axis.
        max_size: maximum size for the longer dimension; a resize step is
            only added when the height or the width is larger than this
        normalize: whether to append ``ToDtype(torch.float32, scale=True)``,
            which rescales integer tensors to the [0, 1] range
    Returns:
        torch.Tensor for tensor and numpy input. For PIL input the result is
        whatever the transform pipeline returns.
        NOTE(review): torchvision's ToDtype appears to leave PIL images
        untouched, so ``normalize`` likely has no effect on them - confirm.
    """
    # Numpy input is always turned into a tensor (and returned as one).
    if isinstance(image, np.ndarray):
        image = torch.from_numpy(image)
        # Channels-last arrays are recognised by a trailing axis of size 1 or 3;
        # any other 3-D layout is assumed to be channels-first already.
        if image.ndim == 3 and image.shape[2] in (1, 3):
            image = image.permute(2, 0, 1)

    if isinstance(image, torch.Tensor):
        # Grayscale (H, W) input: add the channel axis the transforms expect.
        if image.ndim == 2:
            image = image.unsqueeze(0)
        h, w = image.shape[-2:]
    else:
        # PIL reports size as (width, height).
        w, h = image.size

    # Build transform pipeline
    transform_list = []
    if h > max_size or w > max_size:
        # size=None lets max_size alone drive the resize: the longer edge
        # becomes max_size and the aspect ratio is kept.
        transform_list.append(
            transforms_v2.Resize(size=None, max_size=max_size, antialias=True)
        )
    if normalize:
        transform_list.append(transforms_v2.ToDtype(torch.float32, scale=True))

    # Nothing to do: return the (possibly converted) image unchanged.
    if not transform_list:
        return image
    return transforms_v2.Compose(transform_list)(image)
def upscale_mask_opencv(mask, bbox, upscaled_bbox_shape):
    """
    Crop ``mask`` to ``bbox`` and enlarge the crop with nearest-neighbour
    interpolation.
    Args:
        mask: 2-D array indexed as [row, column]
            (presumably a 0/1 or boolean mask - verify against caller)
        bbox: four values (x1, y1, x2, y2); each is converted with ``int()``
        upscaled_bbox_shape: target size handed straight to ``cv2.resize``,
            which reads it as (width, height)
    Returns:
        numpy.ndarray: the resized uint8 crop multiplied by 255
    """
    left, top, right, bottom = (int(coord) for coord in bbox)
    # cv2.resize cannot handle boolean arrays, hence the cast to uint8
    region = mask[top:bottom, left:right].astype(np.uint8)
    # Nearest neighbour keeps the mask values discrete (no blended edges)
    enlarged = cv2.resize(
        region,
        upscaled_bbox_shape,
        interpolation=cv2.INTER_NEAREST,
    )
    return enlarged * 255
def upscale_bbox(bbox, original_shape, mask_shape):
    """
    Upscale bounding box coordinates from mask resolution to original image resolution.
    Parameters:
    -----------
    bbox : np.ndarray or list
        Bounding box coordinates in format [x_min, y_min, x_max, y_max]
        in the mask's coordinate system
    original_shape : tuple
        Original image shape (H, W) or (H, W, C) - e.g., (4545, 5527, 3)
    mask_shape : tuple
        Mask shape (H, W) - e.g., (631, 768)
    Returns:
    --------
    np.ndarray
        Upscaled bounding box as int32 coordinates [x_min, y_min, x_max, y_max].
        Every coordinate is clamped to [0, original width] (x) or
        [0, original height] (y); fractional parts are truncated.
    """
    bbox = np.asarray(bbox)
    # Only the first two entries are used, so (H, W) and (H, W, C) both work
    original_h, original_w = original_shape[0], original_shape[1]
    mask_h, mask_w = mask_shape[0], mask_shape[1]
    # Calculate scale factors
    scale_x = original_w / mask_w  # Width scaling
    scale_y = original_h / mask_h  # Height scaling
    # Per-coordinate factors and upper limits, in [x, y, x, y] order
    scale = np.array([scale_x, scale_y, scale_x, scale_y])
    upper = np.array([original_w, original_h, original_w, original_h])
    # Clamp all four coordinates on both sides so a box lying partly or
    # entirely outside the image can never produce out-of-range indices
    scaled = np.clip(bbox * scale, 0, upper)
    # astype truncates; values are non-negative here, so this rounds down
    return scaled.astype(np.int32)
def crop_line(image, mask, upscaledbbox):
    """
    Cut the bounding-box region out of ``image`` and paint every pixel
    outside ``mask`` white.
    Args:
        image: array indexed as [row, column, channel]
        mask: OpenCV operation mask for the cropped region; non-zero entries
            mark pixels to keep (assumes uint8 with the same height/width
            as the crop - confirm against caller)
        upscaledbbox: (x1, y1, x2, y2) in ``image`` coordinates
    Returns:
        numpy.ndarray: the crop with masked-in pixels preserved and all
        other pixels set to 255
    """
    left, top, right, bottom = upscaledbbox
    line_region = image[top:bottom, left:right, :]
    # Keep only the pixels selected by the mask; everything else becomes 0
    foreground = cv2.bitwise_and(line_region, line_region, mask=mask)
    # Start from an all-white canvas and invert it in place under the mask,
    # so it ends up 0 where the mask is set and 255 everywhere else
    background = np.full_like(line_region, 255, dtype=np.uint8)
    cv2.bitwise_not(background, background, mask=mask)
    # The two layers never overlap, so adding them composites text on white
    return background + foreground