MikkoLipsanen committed on
Commit
1cf6710
·
verified ·
1 Parent(s): 7f74e3e

Create image_processing.py

Browse files
Files changed (1) hide show
  1. image_processing.py +145 -0
image_processing.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from torchvision.transforms import v2 as transforms_v2
2
+ from torchvision.io import read_image, ImageReadMode
3
+ import numpy as np
4
+ import torch
5
+ import cv2
6
+
7
def load_with_torchvision(img_path):
    """
    Read an image file with torchvision and hand it back as a numpy array.

    Args:
        img_path (str or Path): Location of the image file; converted with str()

    Returns:
        numpy.ndarray: The image, read in RGB mode, laid out as (H, W, C)
    """
    # The tensor from read_image is channel-first (C, H, W); move the channel
    # axis last before exposing the data as a numpy array.
    chw_tensor = read_image(str(img_path), mode=ImageReadMode.RGB)
    return chw_tensor.permute(1, 2, 0).numpy()
22
+
23
def preprocess_resize_torch_transform(image, max_size=1024, normalize=True):
    """
    Resize and optionally rescale an image using torchvision.transforms.v2.

    Args:
        image: torch.Tensor laid out as (..., H, W), numpy.ndarray, or PIL Image.
            NumPy arrays are converted to tensors; a 3-D array whose last axis
            has 1 or 3 entries is treated as (H, W, C) and permuted to (C, H, W).
            2-D tensors/arrays get a leading channel axis and become (1, H, W).
        max_size: maximum size for the longer dimension; a resize step is only
            added when the height or the width exceeds this value
        normalize: whether to append ToDtype(torch.float32, scale=True)
            (intended to bring the values into the [0, 1] range)

    Returns:
        torch.Tensor for tensor and numpy inputs. Any other input (e.g. a PIL
        Image) is passed to the transforms unchanged, so the result is whatever
        torchvision returns for it. When no transform applies, the (possibly
        converted) image is returned as is.
    """
    # Convert to a channel-first tensor if numpy
    if isinstance(image, np.ndarray):
        image = torch.from_numpy(image)
        if image.ndim == 3 and image.shape[2] in [1, 3]:
            image = image.permute(2, 0, 1)

    if isinstance(image, torch.Tensor):
        # Grayscale (H, W) input: add the channel axis so the output is always
        # channel-first instead of failing on the shape unpacking below.
        if image.ndim == 2:
            image = image.unsqueeze(0)
        # Only the two trailing axes matter, so batched tensors work as well.
        h, w = image.shape[-2:]
    else:
        # PIL reports size as (W, H)
        h, w = image.size[::-1]

    # Build transform pipeline
    transform_list = []

    # Add resize if needed
    if h > max_size or w > max_size:
        transform_list.append(
            transforms_v2.Resize(size=None, max_size=max_size, antialias=True)
        )

    # Add normalization
    if normalize:
        transform_list.append(transforms_v2.ToDtype(torch.float32, scale=True))

    # Nothing to do: hand the image back untouched
    if not transform_list:
        return image

    return transforms_v2.Compose(transform_list)(image)
63
+
64
def upscale_mask_opencv(mask, bbox, upscaled_bbox_shape):
    """
    Crop the mask to a bounding box and resize the crop with OpenCV
    nearest-neighbour interpolation.

    Args:
        mask: array indexed as mask[y1:y2, x1:x2]
        bbox: four values (x1, y1, x2, y2); each one is cast to int
        upscaled_bbox_shape: target size handed straight to cv2.resize
            (NOTE(review): cv2.resize takes its size as (width, height) --
            confirm that callers pass the values in that order)

    Returns:
        The resized crop, cast to uint8 and multiplied by 255
    """
    left, top, right, bottom = (int(value) for value in bbox)
    crop_u8 = mask[top:bottom, left:right].astype(np.uint8)
    resized = cv2.resize(
        crop_u8,
        upscaled_bbox_shape,
        interpolation=cv2.INTER_NEAREST,
    )
    # Presumably the mask holds 0/1 values, giving 0/255 here; uint8
    # arithmetic wraps around for anything larger -- verify against caller.
    return resized * 255
74
+
75
def upscale_bbox(bbox, original_shape, mask_shape):
    """
    Upscale bounding box coordinates from mask resolution to original image resolution.

    Parameters:
    -----------
    bbox : np.ndarray or list
        Bounding box coordinates in format [x_min, y_min, x_max, y_max]
        in the mask's coordinate system
    original_shape : tuple
        Original image shape (H, W) or (H, W, C) - e.g., (4545, 5527, 3)
    mask_shape : tuple
        Mask shape (H, W) - e.g., (631, 768)

    Returns:
    --------
    np.ndarray
        Upscaled bounding box as int32 coordinates [x_min, y_min, x_max, y_max].
        Every coordinate is clamped to the image (x to [0, W], y to [0, H])
        and then truncated to an integer.
    """
    # Extract height and width from shapes (a trailing channel axis is ignored)
    original_h, original_w = original_shape[0], original_shape[1]
    mask_h, mask_w = mask_shape[0], mask_shape[1]

    # Calculate scale factors
    scale_x = original_w / mask_w  # Width scaling
    scale_y = original_h / mask_h  # Height scaling

    # Unpack bbox coordinates and scale each one along its own axis
    x_min, y_min, x_max, y_max = np.asarray(bbox, dtype=np.float64)
    scaled = np.array([
        x_min * scale_x,
        y_min * scale_y,
        x_max * scale_x,
        y_max * scale_y,
    ])

    # Clamp all four coordinates on both sides. A box lying outside the image
    # must not yield negative or over-range values, because a negative index
    # would silently count from the end when the box is used for slicing.
    upper = np.array([original_w, original_h, original_w, original_h])
    clamped = np.clip(scaled, 0, upper)

    # Convert to integers (astype truncates the fractional part)
    return clamped.astype(np.int32)
134
+
135
def crop_line(image, mask, upscaledbbox):
    """Cut the bounding-box region out of the image and return it with the
    masked pixels kept and everything else painted white.

    Args:
        image: array indexed as image[y1:y2, x1:x2, :]
        mask: OpenCV mask applied to the cropped region
            (presumably the same height/width as the crop -- verify against caller)
        upscaledbbox: four values (x1, y1, x2, y2) used directly as slice bounds

    Returns:
        The white background with the masked crop added on top
    """
    left, top, right, bottom = upscaledbbox
    region = image[top:bottom, left:right, :]

    # Keep the region's pixels only where the mask is set
    foreground = cv2.bitwise_and(region, region, mask=mask)

    # Start from an all-white canvas and invert it in place under the mask,
    # which drops those pixels to 0 so the foreground can be added there.
    background = np.full_like(region, 255, dtype=np.uint8)
    cv2.bitwise_not(background, background, mask=mask)

    # Overlay the masked crop on the white background
    return background + foreground