testing dataset

This commit is contained in:
2025-07-10 19:42:57 +08:00
commit 185959cf2a
316 changed files with 19605393 additions and 0 deletions

View File

@@ -0,0 +1,65 @@
import math
import numpy as np
def pointAngle(Apoint, Bpoint):
    """Return the slope (dy / dx) of the line from ``Apoint`` to ``Bpoint``.

    Despite the name, the result is a slope rather than an angle. A small
    epsilon (10e-8) is added to dx so that a vertical segment does not
    divide by zero.
    """
    rise = Bpoint[1] - Apoint[1]
    run = (Bpoint[0] - Apoint[0]) + 10e-8
    return rise / run
def pointDistance(Apoint, Bpoint):
    """Return the Euclidean distance between two (x, y) points."""
    dx = Bpoint[0] - Apoint[0]
    dy = Bpoint[1] - Apoint[1]
    return math.sqrt(dy ** 2 + dx ** 2)
def lineBiasAndK(Apoint, Bpoint):
    """Return ``(K, B)`` of the line ``y = K * x + B`` through both points.

    The slope comes from ``pointAngle``; the intercept is then solved from
    ``Apoint``.
    """
    slope = pointAngle(Apoint, Bpoint)
    intercept = Apoint[1] - slope * Apoint[0]
    return slope, intercept
def getX(K, B, Ypoint):
    """Solve ``y = K * x + B`` for x at ``Ypoint`` and truncate to ``int``."""
    x_value = (Ypoint - B) / K
    return int(x_value)
def sidePoint(Apoint, Bpoint, h, w, placehold, enlarge_size):
    """Push one end of segment A->B outward and clamp it to the image.

    The outward offset is proportional to the segment length, split into x
    and y components by the segment's inclination and scaled by
    ``enlarge_size``.

    :param Apoint: (x, y) start of the segment; moved for the 'left*' corners.
    :param Bpoint: (x, y) end of the segment; moved for the 'right*' corners.
    :param h: upper clamp for the y coordinate.
    :param w: upper clamp for the x coordinate.
    :param placehold: one of 'leftTop', 'rightTop', 'rightBottom',
        'leftBottom'.
    :param enlarge_size: pair ``(x_enlarge_size, y_enlarge_size)``.
    :return: the moved corner as ``(int x, int y)``.
    :raises ValueError: if ``placehold`` is not one of the four corner names
        (previously this surfaced as an UnboundLocalError).
    """
    angle = abs(math.atan(pointAngle(Apoint, Bpoint)))
    distance = pointDistance(Apoint, Bpoint)
    x_enlarge_size, y_enlarge_size = enlarge_size
    XaxisIncreaseDistance = abs(math.cos(angle) * x_enlarge_size * distance)
    YaxisIncreaseDistance = abs(math.sin(angle) * y_enlarge_size * distance)

    if placehold == 'leftTop':
        x1 = max(0, Apoint[0] - XaxisIncreaseDistance)
        y1 = max(0, Apoint[1] - YaxisIncreaseDistance)
    elif placehold == 'rightTop':
        x1 = min(w, Bpoint[0] + XaxisIncreaseDistance)
        y1 = max(0, Bpoint[1] - YaxisIncreaseDistance)
    elif placehold == 'rightBottom':
        x1 = min(w, Bpoint[0] + XaxisIncreaseDistance)
        y1 = min(h, Bpoint[1] + YaxisIncreaseDistance)
    elif placehold == 'leftBottom':
        x1 = max(0, Apoint[0] - XaxisIncreaseDistance)
        y1 = min(h, Apoint[1] + YaxisIncreaseDistance)
    else:
        raise ValueError("Unknown placehold: {!r}".format(placehold))
    return int(x1), int(y1)
def enlargebox(box, h, w, enlarge_size, horizontal_text_bool):
    """Enlarge a quadrilateral by pushing each corner away from its centre.

    The corners are first rotated so the one with the smallest ``x + y``
    comes first. The centre is the intersection of the two diagonals; each
    corner is then moved outward with ``sidePoint`` and clamped to the image.

    :param box: ndarray of four (x, y) corners.
    :param h: image height used for clamping.
    :param w: image width used for clamping.
    :param enlarge_size: pair ``(x, y)`` of enlargement factors; swapped when
        ``horizontal_text_bool`` is false.
    :param horizontal_text_bool: whether the text is treated as horizontal.
    :return: ``np.ndarray`` of the four enlarged integer corners.
    """
    if not horizontal_text_bool:
        enlarge_size = (enlarge_size[1], enlarge_size[0])

    box = np.roll(box, -np.argmin(box.sum(axis=1)), axis=0)
    Apoint, Bpoint, Cpoint, Dpoint = box

    K1, B1 = lineBiasAndK(box[0], box[2])
    K2, B2 = lineBiasAndK(box[3], box[1])
    slope_gap = K1 - K2
    if slope_gap == 0:
        # Parallel diagonals (degenerate box) have no intersection; dividing
        # would yield NaN/inf that the clamping in sidePoint turns into image
        # corners. Fall back to the mean of the four corners instead.
        center = list(box.mean(axis=0))
    else:
        X = (B2 - B1) / slope_gap
        Y = K1 * X + B1
        center = [X, Y]

    x1, y1 = sidePoint(Apoint, center, h, w, 'leftTop', enlarge_size)
    x2, y2 = sidePoint(center, Bpoint, h, w, 'rightTop', enlarge_size)
    x3, y3 = sidePoint(center, Cpoint, h, w, 'rightBottom', enlarge_size)
    x4, y4 = sidePoint(Dpoint, center, h, w, 'leftBottom', enlarge_size)
    newcharbox = np.array([[x1, y1], [x2, y2], [x3, y3], [x4, y4]])
    return newcharbox

View File

@@ -0,0 +1,542 @@
import os
import re
import itertools
import random
import numpy as np
import scipy.io as scio
from PIL import Image
import cv2
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from data import imgproc
from data.gaussian import GaussianBuilder
from data.imgaug import (
rescale,
random_resize_crop_synth,
random_resize_crop,
random_horizontal_flip,
random_rotate,
random_scale,
random_crop,
)
from data.pseudo_label.make_charbox import PseudoCharBoxBuilder
from utils.util import saveInput, saveImage
class CraftBaseDataset(Dataset):
    """Shared ground-truth loading and augmentation for CRAFT training data.

    Subclasses are expected to set ``self.img_names`` and to implement
    ``make_gt_score(index)``; ``load_saved_gt_score(index)`` is additionally
    needed when ``saved_gt_dir`` is not ``None``.
    """

    def __init__(
        self,
        output_size,
        data_dir,
        saved_gt_dir,
        mean,
        variance,
        gauss_init_size,
        gauss_sigma,
        enlarge_region,
        enlarge_affinity,
        aug,
        vis_test_dir,
        vis_opt,
        sample,
    ):
        """Store the configuration and build the Gaussian heat-map generator.

        ``sample`` is the size of a fixed pseudo-random subset of the data to
        use; ``-1`` means the whole dataset.
        """
        self.output_size = output_size
        self.data_dir = data_dir
        self.saved_gt_dir = saved_gt_dir
        self.mean, self.variance = mean, variance
        self.gaussian_builder = GaussianBuilder(
            gauss_init_size, gauss_sigma, enlarge_region, enlarge_affinity
        )
        self.aug = aug
        self.vis_test_dir = vis_test_dir
        self.vis_opt = vis_opt
        self.sample = sample
        # The subset is drawn lazily (see _sampled_indices): the subclasses in
        # this file assign ``img_names`` only after super().__init__() returns,
        # so sampling here would fail with AttributeError.
        self.idx = None
        self.pre_crop_area = []

    def _sampled_indices(self):
        """Return the sampled subset of indices, drawing it on first use.

        A dedicated ``random.Random(0)`` keeps the subset reproducible without
        reseeding the global random generator.
        """
        if getattr(self, "idx", None) is None:
            self.idx = random.Random(0).sample(
                range(0, len(self.img_names)), self.sample
            )
        return self.idx

    def augment_image(
        self, image, region_score, affinity_score, confidence_mask, word_level_char_bbox
    ):
        """Apply the augmentations enabled in ``self.aug`` to image and maps.

        :return: ``(image, region_score, affinity_score, confidence_mask)``
            with the image converted to ``np.ndarray``.
        :raises ValueError: if random crop is enabled with an unknown version.
        """
        augment_targets = [image, region_score, affinity_score, confidence_mask]

        if self.aug.random_scale.option:
            # random_scale rescales the boxes in place and returns only the
            # list of images, so there is nothing to unpack into the bboxes.
            augment_targets = random_scale(
                augment_targets, word_level_char_bbox, self.aug.random_scale.range
            )

        if self.aug.random_rotate.option:
            augment_targets = random_rotate(
                augment_targets, self.aug.random_rotate.max_angle
            )

        if self.aug.random_crop.option:
            crop_version = self.aug.random_crop.version
            if crop_version == "random_crop_with_bbox":
                # NOTE(review): random_crop_with_bbox is not among the names
                # imported from data.imgaug in this module — confirm where it
                # is supposed to come from.
                augment_targets = random_crop_with_bbox(
                    augment_targets, word_level_char_bbox, self.output_size
                )
            elif crop_version == "random_resize_crop_synth":
                augment_targets = random_resize_crop_synth(
                    augment_targets, self.output_size
                )
            elif crop_version == "random_resize_crop":
                if len(self.pre_crop_area) > 0:
                    pre_crop_area = self.pre_crop_area
                else:
                    pre_crop_area = None
                augment_targets = random_resize_crop(
                    augment_targets,
                    self.aug.random_crop.scale,
                    self.aug.random_crop.ratio,
                    self.output_size,
                    self.aug.random_crop.rnd_threshold,
                    pre_crop_area,
                )
            elif crop_version == "random_crop":
                augment_targets = random_crop(augment_targets, self.output_size,)
            else:
                # The original `assert "<string>"` was always true and let an
                # unknown version pass silently.
                raise ValueError(
                    "Undefined RandomCrop version: {}".format(crop_version)
                )

        if self.aug.random_horizontal_flip.option:
            augment_targets = random_horizontal_flip(augment_targets)

        image, region_score, affinity_score, confidence_mask = augment_targets
        if self.aug.random_colorjitter.option:
            image = Image.fromarray(image)
            image = transforms.ColorJitter(
                brightness=self.aug.random_colorjitter.brightness,
                contrast=self.aug.random_colorjitter.contrast,
                saturation=self.aug.random_colorjitter.saturation,
                hue=self.aug.random_colorjitter.hue,
            )(image)

        return np.array(image), region_score, affinity_score, confidence_mask

    def resize_to_half(self, ground_truth, interpolation):
        """Resize a map to ``(output_size // 2, output_size // 2)``."""
        return cv2.resize(
            ground_truth,
            (self.output_size // 2, self.output_size // 2),
            interpolation=interpolation,
        )

    def __len__(self):
        """Return the subset size when sampling, else the number of images."""
        if self.sample != -1:
            return len(self._sampled_indices())
        return len(self.img_names)

    def __getitem__(self, index):
        """Return ``(image, region_score, affinity_score, confidence_mask)``.

        The image is mean/variance normalised and transposed to channel-first;
        the three maps are resized to half of ``output_size``.
        """
        if self.sample != -1:
            index = self._sampled_indices()[index]

        if self.saved_gt_dir is None:
            (
                image,
                region_score,
                affinity_score,
                confidence_mask,
                word_level_char_bbox,
                all_affinity_bbox,
                words,
            ) = self.make_gt_score(index)
        else:
            (
                image,
                region_score,
                affinity_score,
                confidence_mask,
                word_level_char_bbox,
                words,
            ) = self.load_saved_gt_score(index)
            all_affinity_bbox = []

        if self.vis_opt:
            saveImage(
                self.img_names[index],
                self.vis_test_dir,
                image.copy(),
                word_level_char_bbox.copy(),
                all_affinity_bbox.copy(),
                region_score.copy(),
                affinity_score.copy(),
                confidence_mask.copy(),
            )

        image, region_score, affinity_score, confidence_mask = self.augment_image(
            image, region_score, affinity_score, confidence_mask, word_level_char_bbox
        )

        if self.vis_opt:
            saveInput(
                self.img_names[index],
                self.vis_test_dir,
                image,
                region_score,
                affinity_score,
                confidence_mask,
            )

        region_score = self.resize_to_half(region_score, interpolation=cv2.INTER_CUBIC)
        affinity_score = self.resize_to_half(
            affinity_score, interpolation=cv2.INTER_CUBIC
        )
        # Nearest-neighbour keeps the mask values from being blended.
        confidence_mask = self.resize_to_half(
            confidence_mask, interpolation=cv2.INTER_NEAREST
        )

        image = imgproc.normalizeMeanVariance(
            np.array(image), mean=self.mean, variance=self.variance
        )
        image = image.transpose(2, 0, 1)

        return image, region_score, affinity_score, confidence_mask
class SynthTextDataSet(CraftBaseDataset):
    """Dataset backed by a ``gt.mat`` file with character-level boxes.

    ``gt.mat`` is read from ``data_dir`` and must provide the ``imnames``,
    ``txt`` and ``charBB`` (or ``wordBB``) entries.
    """

    def __init__(
        self,
        output_size,
        data_dir,
        saved_gt_dir,
        mean,
        variance,
        gauss_init_size,
        gauss_sigma,
        enlarge_region,
        enlarge_affinity,
        aug,
        vis_test_dir,
        vis_opt,
        sample,
    ):
        """Forward the configuration to the base class and load ``gt.mat``."""
        super().__init__(
            output_size,
            data_dir,
            saved_gt_dir,
            mean,
            variance,
            gauss_init_size,
            gauss_sigma,
            enlarge_region,
            enlarge_affinity,
            aug,
            vis_test_dir,
            vis_opt,
            sample,
        )
        self.img_names, self.char_bbox, self.img_words = self.load_data()
        self.vis_index = list(range(1000))

    def load_data(self, bbox="char"):
        """Read image names, boxes and texts from ``<data_dir>/gt.mat``.

        :param bbox: ``"char"`` selects ``charBB``; anything else selects
            ``wordBB``.
        :return: ``(img_names, img_bbox, img_words)``.
        """
        gt = scio.loadmat(os.path.join(self.data_dir, "gt.mat"))
        img_names = gt["imnames"][0]
        img_words = gt["txt"][0]

        if bbox == "char":
            img_bbox = gt["charBB"][0]
        else:
            img_bbox = gt["wordBB"][0]  # word bbox needed for test

        return img_names, img_bbox, img_words

    def dilate_img_to_output_size(self, image, char_bbox):
        """Upscale the image so its short side reaches ``output_size``.

        Images whose short side already exceeds ``output_size`` keep scale
        1.0. The boxes are scaled by the same factor into a new array, so the
        caller's array is left untouched and integer box arrays are accepted.
        """
        h, w, _ = image.shape
        if min(h, w) <= self.output_size:
            scale = float(self.output_size) / min(h, w)
        else:
            scale = 1.0
        image = cv2.resize(
            image, dsize=None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
        )
        char_bbox = char_bbox * scale
        return image, char_bbox

    def make_gt_score(self, index):
        """Build region/affinity maps for one image from its character boxes.

        :return: ``(image, region_score, affinity_score, confidence_mask,
            word_level_char_bbox, all_affinity_bbox, words)``.
        :raises FileNotFoundError: if the image cannot be read.
        """
        img_path = os.path.join(self.data_dir, self.img_names[index][0])
        image = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals a missing/undecodable file by returning None.
            raise FileNotFoundError("Could not read image: {}".format(img_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        all_char_bbox = self.char_bbox[index].transpose(
            (2, 1, 0)
        )  # shape : (Number of characters in image, 4, 2)

        img_h, img_w, _ = image.shape
        confidence_mask = np.ones((img_h, img_w), dtype=np.float32)

        # Split every text entry on spaces/newlines and drop empty pieces.
        words = [
            re.split(" \n|\n |\n| ", word.strip()) for word in self.img_words[index]
        ]
        words = list(itertools.chain(*words))
        words = [word for word in words if len(word) > 0]

        # Character boxes are stored flat, in word order; slice them per word.
        word_level_char_bbox = []
        char_idx = 0
        for word in words:
            length_of_word = len(word)
            word_bbox = all_char_bbox[char_idx : char_idx + length_of_word]
            assert len(word_bbox) == length_of_word
            char_idx += length_of_word
            word_level_char_bbox.append(np.array(word_bbox))

        horizontal_text_bools = [True] * len(words)
        region_score = self.gaussian_builder.generate_region(
            img_h,
            img_w,
            word_level_char_bbox,
            horizontal_text_bools=horizontal_text_bools,
        )
        affinity_score, all_affinity_bbox = self.gaussian_builder.generate_affinity(
            img_h,
            img_w,
            word_level_char_bbox,
            horizontal_text_bools=list(horizontal_text_bools),
        )

        return (
            image,
            region_score,
            affinity_score,
            confidence_mask,
            word_level_char_bbox,
            all_affinity_bbox,
            words,
        )
class CustomDataset(CraftBaseDataset):
    """Word-level annotated dataset with pseudo character boxes.

    Images are read from ``<data_dir>/ch4_training_images`` and word boxes
    from ``<data_dir>/ch4_training_localization_transcription_gt``.
    ``update_model`` and ``update_device`` must be called before items are
    generated through ``make_gt_score``, because ``load_data`` reads
    ``self.net`` and ``self.gpu``.
    """

    def __init__(
        self,
        output_size,
        data_dir,
        saved_gt_dir,
        mean,
        variance,
        gauss_init_size,
        gauss_sigma,
        enlarge_region,
        enlarge_affinity,
        aug,
        vis_test_dir,
        vis_opt,
        sample,
        watershed_param,
        pseudo_vis_opt,
        do_not_care_label,
    ):
        """Forward the shared configuration and index the image directory."""
        super().__init__(
            output_size,
            data_dir,
            saved_gt_dir,
            mean,
            variance,
            gauss_init_size,
            gauss_sigma,
            enlarge_region,
            enlarge_affinity,
            aug,
            vis_test_dir,
            vis_opt,
            sample,
        )
        self.pseudo_vis_opt = pseudo_vis_opt
        self.do_not_care_label = do_not_care_label
        self.pseudo_charbox_builder = PseudoCharBoxBuilder(
            watershed_param, vis_test_dir, pseudo_vis_opt, self.gaussian_builder
        )
        self.vis_index = list(range(1000))
        self.img_dir = os.path.join(data_dir, "ch4_training_images")
        self.img_gt_box_dir = os.path.join(
            data_dir, "ch4_training_localization_transcription_gt"
        )
        # os.listdir returns entries in arbitrary order; sorting makes an
        # index refer to the same image on every run.
        self.img_names = sorted(os.listdir(self.img_dir))

    @staticmethod
    def _read_image(path, flags=None):
        """Read an image with OpenCV, raising if it cannot be loaded."""
        image = cv2.imread(path) if flags is None else cv2.imread(path, flags)
        if image is None:
            # cv2.imread signals a missing/undecodable file by returning None.
            raise FileNotFoundError("Could not read image: {}".format(path))
        return image

    def update_model(self, net):
        """Set the network used to infer pseudo character boxes."""
        self.net = net

    def update_device(self, gpu):
        """Set the device passed on to the pseudo character box builder."""
        self.gpu = gpu

    def load_img_gt_box(self, img_gt_box_path):
        """Parse a ground-truth file of ``x1,y1,...,x4,y4,text`` lines.

        Blank lines are skipped. A text found in ``do_not_care_label`` is
        replaced by ``do_not_care_label[0]``.

        :return: ``(word_bboxes, words)``; the boxes are float32 arrays of
            shape (4, 2) stacked with ``np.array``.
        """
        with open(img_gt_box_path, encoding="utf-8") as gt_file:
            lines = gt_file.readlines()

        word_bboxes = []
        words = []
        for line in lines:
            # Round-trip through utf-8-sig to drop a leading byte-order mark.
            text = line.strip().encode("utf-8").decode("utf-8-sig")
            if not text:
                continue
            box_info = text.split(",")
            box_points = [int(box_info[i]) for i in range(8)]
            box_points = np.array(box_points, np.float32).reshape(4, 2)
            # The transcription itself may contain commas.
            word = ",".join(box_info[8:])
            if word in self.do_not_care_label:
                word = self.do_not_care_label[0]
            word_bboxes.append(box_points)
            words.append(word)
        return np.array(word_bboxes), words

    def load_data(self, index):
        """Load one image and build pseudo character boxes for its words.

        Do-not-care words are zeroed in the confidence mask; every other word
        region is filled with the confidence returned by the box builder.

        :return: ``(image, word_level_char_bbox, do_care_words,
            confidence_mask, horizontal_text_bools)``.
        :raises FileNotFoundError: if the image cannot be read.
        """
        img_name = self.img_names[index]
        img_path = os.path.join(self.img_dir, img_name)
        image = self._read_image(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        img_gt_box_path = os.path.join(
            self.img_gt_box_dir, "gt_%s.txt" % os.path.splitext(img_name)[0]
        )
        word_bboxes, words = self.load_img_gt_box(
            img_gt_box_path
        )  # shape : (Number of word bbox, 4, 2)
        confidence_mask = np.ones((image.shape[0], image.shape[1]), np.float32)

        word_level_char_bbox = []
        do_care_words = []
        horizontal_text_bools = []

        if len(word_bboxes) == 0:
            return (
                image,
                word_level_char_bbox,
                do_care_words,
                confidence_mask,
                horizontal_text_bools,
            )
        _word_bboxes = word_bboxes.copy()
        for i in range(len(word_bboxes)):
            if words[i] in self.do_not_care_label:
                cv2.fillPoly(confidence_mask, [np.int32(_word_bboxes[i])], 0)
                continue

            (
                pseudo_char_bbox,
                confidence,
                horizontal_text_bool,
            ) = self.pseudo_charbox_builder.build_char_box(
                self.net, self.gpu, image, word_bboxes[i], words[i], img_name=img_name
            )

            cv2.fillPoly(confidence_mask, [np.int32(_word_bboxes[i])], confidence)
            do_care_words.append(words[i])
            word_level_char_bbox.append(pseudo_char_bbox)
            horizontal_text_bools.append(horizontal_text_bool)

        return (
            image,
            word_level_char_bbox,
            do_care_words,
            confidence_mask,
            horizontal_text_bools,
        )

    def make_gt_score(self, index):
        """
        Make region, affinity scores using pseudo character-level GT bounding box
        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]
        :rtype region_score: np.float32
        :rtype affinity_score: np.float32
        :rtype confidence_mask: np.float32
        :rtype word_level_char_bbox: np.float32
        :rtype words: list
        """
        (
            image,
            word_level_char_bbox,
            words,
            confidence_mask,
            horizontal_text_bools,
        ) = self.load_data(index)
        img_h, img_w, _ = image.shape

        if len(word_level_char_bbox) == 0:
            # No usable word: both score maps stay empty.
            region_score = np.zeros((img_h, img_w), dtype=np.float32)
            affinity_score = np.zeros((img_h, img_w), dtype=np.float32)
            all_affinity_bbox = []
        else:
            region_score = self.gaussian_builder.generate_region(
                img_h, img_w, word_level_char_bbox, horizontal_text_bools
            )
            affinity_score, all_affinity_bbox = self.gaussian_builder.generate_affinity(
                img_h, img_w, word_level_char_bbox, horizontal_text_bools
            )

        return (
            image,
            region_score,
            affinity_score,
            confidence_mask,
            word_level_char_bbox,
            all_affinity_bbox,
            words,
        )

    def load_saved_gt_score(self, index):
        """
        Load pre-saved official CRAFT model's region, affinity scores to train
        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]
        :rtype region_score: np.float32
        :rtype affinity_score: np.float32
        :rtype confidence_mask: np.float32
        :rtype word_level_char_bbox: np.float32
        :rtype words: list
        :raises FileNotFoundError: if the image or a saved map cannot be read
        """
        img_name = self.img_names[index]
        img_path = os.path.join(self.img_dir, img_name)
        image = self._read_image(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        img_gt_box_path = os.path.join(
            self.img_gt_box_dir, "gt_%s.txt" % os.path.splitext(img_name)[0]
        )
        word_bboxes, words = self.load_img_gt_box(img_gt_box_path)
        image, word_bboxes = rescale(image, word_bboxes)
        img_h, img_w, _ = image.shape

        # The saved maps are keyed by the number after the first underscore
        # of the image file name.
        query_idx = int(img_name.split(".")[0].split("_")[1])

        saved_region_scores_path = os.path.join(
            self.saved_gt_dir, f"res_img_{query_idx}_region.jpg"
        )
        saved_affi_scores_path = os.path.join(
            self.saved_gt_dir, f"res_img_{query_idx}_affi.jpg"
        )
        saved_cf_mask_path = os.path.join(
            self.saved_gt_dir, f"res_img_{query_idx}_cf_mask_thresh_0.6.jpg"
        )
        region_score = self._read_image(saved_region_scores_path, cv2.IMREAD_GRAYSCALE)
        affinity_score = self._read_image(saved_affi_scores_path, cv2.IMREAD_GRAYSCALE)
        confidence_mask = self._read_image(saved_cf_mask_path, cv2.IMREAD_GRAYSCALE)

        region_score = cv2.resize(region_score, (img_w, img_h))
        affinity_score = cv2.resize(affinity_score, (img_w, img_h))
        confidence_mask = cv2.resize(
            confidence_mask, (img_w, img_h), interpolation=cv2.INTER_NEAREST
        )

        # Saved maps are 8-bit images; bring them back to the [0, 1] range.
        region_score = region_score.astype(np.float32) / 255
        affinity_score = affinity_score.astype(np.float32) / 255
        confidence_mask = confidence_mask.astype(np.float32) / 255

        # NOTE : Even though word_level_char_bbox is not necessary, align bbox format with make_gt_score()
        word_level_char_bbox = [
            np.expand_dims(word_bbox, 0) for word_bbox in word_bboxes
        ]

        return (
            image,
            region_score,
            affinity_score,
            confidence_mask,
            word_level_char_bbox,
            words,
        )

View File

@@ -0,0 +1,192 @@
import numpy as np
import cv2
from data.boxEnlarge import enlargebox
class GaussianBuilder(object):
    """Render 2D Gaussian heat maps onto character and affinity boxes."""

    def __init__(self, init_size, sigma, enlarge_region, enlarge_affinity):
        """Store the settings and pre-compute the template Gaussian map.

        :param init_size: side length of the square template map.
        :param sigma: standard deviation of the template Gaussian.
        :param enlarge_region: enlarge size used for region (character) boxes.
        :param enlarge_affinity: enlarge size used for affinity boxes.
        """
        self.init_size = init_size
        self.sigma = sigma
        self.enlarge_region = enlarge_region
        self.enlarge_affinity = enlarge_affinity
        self.gaussian_map, self.gaussian_map_color = self.generate_gaussian_map()

    def generate_gaussian_map(self):
        """Build the template Gaussian, masked to a circle and scaled to max 1.

        :return: ``(gaussian_map, gaussian_map_color)``: the float32 map and
            its JET colour-mapped uint8 rendering.
        """
        circle_mask = self.generate_circle_mask()

        # Same formula as a per-pixel loop, evaluated with broadcasting:
        # 1 / (2*pi*sigma^2) * exp(-((i-c)^2 + (j-c)^2) / (2*sigma^2)).
        offsets = np.arange(self.init_size, dtype=np.float64) - self.init_size / 2
        squared = offsets ** 2 / (self.sigma ** 2)
        exponent = -1 / 2 * (squared[:, None] + squared[None, :])
        gaussian_map = (1 / 2 / np.pi / (self.sigma ** 2) * np.exp(exponent)).astype(
            np.float32
        )

        gaussian_map = gaussian_map * circle_mask
        gaussian_map = (gaussian_map / np.max(gaussian_map)).astype(np.float32)

        gaussian_map_color = (gaussian_map * 255).astype(np.uint8)
        gaussian_map_color = cv2.applyColorMap(gaussian_map_color, cv2.COLORMAP_JET)
        return gaussian_map, gaussian_map_color

    def generate_circle_mask(self):
        """Return a float32 mask with a filled circle of ones in the centre."""
        zero_arr = np.zeros((self.init_size, self.init_size), np.float32)
        circle_mask = cv2.circle(
            img=zero_arr,
            center=(self.init_size // 2, self.init_size // 2),
            radius=self.init_size // 2,
            color=1,
            thickness=-1,
        )
        return circle_mask

    def four_point_transform(self, bbox):
        """
        Using the bbox, standard 2D gaussian map, returns Transformed 2d Gaussian map

        ``bbox`` is expected to be shifted so that its top-left is at the
        origin; the output size is the largest x and y coordinate of the box.
        :return: ``(warped_gaussian_map, width, height)``
        """
        width, height = (
            np.max(bbox[:, 0]).astype(np.int32),
            np.max(bbox[:, 1]).astype(np.int32),
        )
        init_points = np.array(
            [
                [0, 0],
                [self.init_size, 0],
                [self.init_size, self.init_size],
                [0, self.init_size],
            ],
            dtype="float32",
        )

        M = cv2.getPerspectiveTransform(init_points, bbox)
        warped_gaussian_map = cv2.warpPerspective(self.gaussian_map, M, (width, height))
        return warped_gaussian_map, width, height

    def add_gaussian_map_to_score_map(
        self, score_map, bbox, enlarge_size, horizontal_text_bool, map_type=None
    ):
        """
        Mapping 2D Gaussian to the character box coordinates of the score_map.

        :param score_map: Target map to put 2D gaussian on character box
        :type score_map: np.float32
        :param bbox: character boxes
        :type bbox: np.float32
        :param enlarge_size: Enlarge size of gaussian map to fit character shape
        :type enlarge_size: list of enlarge size [x dim, y dim]
        :param horizontal_text_bool: Flag that bbox is horizontal text or not
        :type horizontal_text_bool: bool
        :param map_type: Whether map's type is "region" | "affinity"
        :type map_type: str
        :return score_map: score map that all 2D gaussian put on character box
        :rtype: np.float32
        """
        map_h, map_w = score_map.shape
        bbox = enlargebox(bbox, map_h, map_w, enlarge_size, horizontal_text_bool)

        # If any one point of character bbox is out of range, don't put in on map
        if np.any(bbox < 0) or np.any(bbox[:, 0] > map_w) or np.any(bbox[:, 1] > map_h):
            return score_map

        # Work in box-local coordinates so the warp output is only box-sized.
        bbox_left, bbox_top = np.array([np.min(bbox[:, 0]), np.min(bbox[:, 1])]).astype(
            np.int32
        )
        bbox -= (bbox_left, bbox_top)
        warped_gaussian_map, width, height = self.four_point_transform(
            bbox.astype(np.float32)
        )

        rows = slice(bbox_top, bbox_top + height)
        cols = slice(bbox_left, bbox_left + width)
        try:
            bbox_area_of_image = score_map[rows, cols]
            # Keep the larger value where Gaussians of neighbours overlap.
            high_value_score = np.where(
                warped_gaussian_map > bbox_area_of_image,
                warped_gaussian_map,
                bbox_area_of_image,
            )
            score_map[rows, cols] = high_value_score
        except Exception as e:
            # Best effort: a box that cannot be pasted is reported and skipped.
            print("Error : {}".format(e))
            print(
                "On generating {} map, strange box came out. (width: {}, height: {})".format(
                    map_type, width, height
                )
            )

        return score_map

    def calculate_affinity_box_points(self, bbox_1, bbox_2, vertical=False):
        """Return the affinity box linking two neighbouring character boxes.

        Each corner is the mean of two adjacent corners of a character box
        and that box's centre. With ``vertical=False`` the corners come from
        the top/bottom edges of the boxes, otherwise from the left/right ones.

        :return: float32 array ``[tl, tr, br, bl]`` of shape (4, 2).
        """
        center_1, center_2 = np.mean(bbox_1, axis=0), np.mean(bbox_2, axis=0)
        if vertical:
            tl = (bbox_1[0] + bbox_1[-1] + center_1) / 3
            tr = (bbox_1[1:3].sum(0) + center_1) / 3
            br = (bbox_2[1:3].sum(0) + center_2) / 3
            bl = (bbox_2[0] + bbox_2[-1] + center_2) / 3
        else:
            tl = (bbox_1[0:2].sum(0) + center_1) / 3
            tr = (bbox_2[0:2].sum(0) + center_2) / 3
            br = (bbox_2[2:4].sum(0) + center_2) / 3
            bl = (bbox_1[2:4].sum(0) + center_1) / 3
        affinity_box = np.array([tl, tr, br, bl]).astype(np.float32)
        return affinity_box

    def generate_region(
        self, img_h, img_w, word_level_char_bbox, horizontal_text_bools
    ):
        """Draw one Gaussian per character box and return the region map.

        ``word_level_char_bbox`` has shape
        [word_num, [char_num_in_one_word, 4, 2]].
        """
        region_map = np.zeros([img_h, img_w], dtype=np.float32)
        for i, word_char_bbox in enumerate(word_level_char_bbox):
            for char_bbox in word_char_bbox:
                region_map = self.add_gaussian_map_to_score_map(
                    region_map,
                    char_bbox.copy(),
                    self.enlarge_region,
                    horizontal_text_bools[i],
                    map_type="region",
                )
        return region_map

    def generate_affinity(
        self, img_h, img_w, word_level_char_bbox, horizontal_text_bools
    ):
        """Draw one Gaussian between each pair of consecutive characters.

        :return: ``(affinity_map, all_affinity_bbox)``; the boxes are
            concatenated into one array, or an empty list when there are none.
        """
        affinity_map = np.zeros([img_h, img_w], dtype=np.float32)
        all_affinity_bbox = []
        for i, word_char_bbox in enumerate(word_level_char_bbox):
            for j in range(len(word_char_bbox) - 1):
                # NOTE(review): the affinity box is always computed with
                # vertical=False, regardless of horizontal_text_bools[i].
                affinity_bbox = self.calculate_affinity_box_points(
                    word_char_bbox[j], word_char_bbox[j + 1]
                )

                affinity_map = self.add_gaussian_map_to_score_map(
                    affinity_map,
                    affinity_bbox.copy(),
                    self.enlarge_affinity,
                    horizontal_text_bools[i],
                    map_type="affinity",
                )
                all_affinity_bbox.append(np.expand_dims(affinity_bbox, axis=0))

        if len(all_affinity_bbox) > 0:
            all_affinity_bbox = np.concatenate(all_affinity_bbox, axis=0)
        return affinity_map, all_affinity_bbox

View File

@@ -0,0 +1,175 @@
import random
import cv2
import numpy as np
from PIL import Image
from torchvision.transforms.functional import resized_crop, crop
from torchvision.transforms import RandomResizedCrop, RandomCrop
from torchvision.transforms import InterpolationMode
def rescale(img, bboxes, target_size=2240):
    """Scale the image so its longer side equals ``target_size``.

    The boxes are multiplied by the same factor.

    :return: ``(resized_image, scaled_bboxes)``.
    """
    height, width = img.shape[0:2]
    factor = target_size / max(height, width)
    resized = cv2.resize(
        img, dsize=None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC
    )
    return resized, bboxes * factor
def random_resize_crop_synth(augment_targets, size):
    """Crop a random square of short-side length and resize it to ``size``.

    The same crop window is applied to the image and the three maps. The
    confidence mask is resized with nearest-neighbour interpolation, the
    others with bicubic.

    :param augment_targets: ``[image, region_score, affinity_score,
        confidence_mask]`` as arrays.
    :return: the four cropped and resized arrays, in the same order.
    """
    image, region_score, affinity_score, confidence_mask = augment_targets
    pil_targets = [
        Image.fromarray(target)
        for target in (image, region_score, affinity_score, confidence_mask)
    ]

    short_side = min(pil_targets[0].size)
    top, left, height, width = RandomCrop.get_params(
        pil_targets[0], output_size=(short_side, short_side)
    )

    interpolations = (
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.NEAREST,
    )
    return [
        np.array(
            resized_crop(
                target, top, left, height, width, size=(size, size), interpolation=mode
            )
        )
        for target, mode in zip(pil_targets, interpolations)
    ]
def random_resize_crop(
    augment_targets, scale, ratio, size, threshold, pre_crop_area=None
):
    """Crop one window from the image and maps and resize it to ``size``.

    :param augment_targets: ``[image, region_score, affinity_score,
        confidence_mask]`` as arrays.
    :param scale: ``scale`` argument for ``RandomResizedCrop.get_params``.
    :param ratio: ``ratio`` argument for ``RandomResizedCrop.get_params``.
    :param size: side length of the square output.
    :param threshold: probability of drawing the crop with ``scale`` and
        ``ratio``; otherwise it is drawn with scale (1.0, 1.0) and ratio
        (1.0, 1.0).
    :param pre_crop_area: optional fixed ``(i, j, h, w)`` window that
        bypasses the random draw.
    :return: the four cropped and resized arrays, in the same order.
    """
    image, region_score, affinity_score, confidence_mask = augment_targets
    pil_targets = [
        Image.fromarray(target)
        for target in (image, region_score, affinity_score, confidence_mask)
    ]

    # Identity check: `!= None` is ambiguous when an array is passed in.
    if pre_crop_area is not None:
        i, j, h, w = pre_crop_area
    elif random.random() < threshold:
        i, j, h, w = RandomResizedCrop.get_params(
            pil_targets[0], scale=scale, ratio=ratio
        )
    else:
        i, j, h, w = RandomResizedCrop.get_params(
            pil_targets[0], scale=(1.0, 1.0), ratio=(1.0, 1.0)
        )

    # Nearest-neighbour for the confidence mask, bicubic for everything else.
    interpolations = (
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.NEAREST,
    )
    return [
        np.array(
            resized_crop(target, i, j, h, w, size=(size, size), interpolation=mode)
        )
        for target, mode in zip(pil_targets, interpolations)
    ]
def random_crop(augment_targets, size):
    """Cut the same random ``size`` x ``size`` window from image and maps.

    :param augment_targets: ``[image, region_score, affinity_score,
        confidence_mask]`` as arrays.
    :return: the four cropped arrays, in the same order.
    """
    image, region_score, affinity_score, confidence_mask = augment_targets
    pil_targets = [
        Image.fromarray(target)
        for target in (image, region_score, affinity_score, confidence_mask)
    ]

    top, left, height, width = RandomCrop.get_params(
        pil_targets[0], output_size=(size, size)
    )
    return [np.array(crop(target, top, left, height, width)) for target in pil_targets]
def random_horizontal_flip(imgs):
    """Mirror every array left-right with probability 0.5.

    The list is modified in place and also returned.
    """
    if random.random() >= 0.5:
        return imgs
    for position, img in enumerate(imgs):
        imgs[position] = np.flip(img, axis=1).copy()
    return imgs
def random_scale(images, word_level_char_bbox, scale_range):
    """Resize all images by one factor drawn from ``scale_range``.

    Both ``images`` and ``word_level_char_bbox`` are updated in place; only
    the list of images is returned.
    """
    factor = random.sample(scale_range, 1)[0]

    for position, img in enumerate(images):
        images[position] = cv2.resize(img, dsize=None, fx=factor, fy=factor)

    for position in range(len(word_level_char_bbox)):
        word_level_char_bbox[position] *= factor

    return images
def random_rotate(images, max_angle):
    """Rotate every image about its centre by one shared random angle.

    The angle is drawn uniformly from ``[-max_angle, max_angle)``. The last
    entry of ``images`` is warped with nearest-neighbour interpolation, the
    others with OpenCV's default. The list is updated in place and returned.
    """
    angle = random.random() * 2 * max_angle - max_angle
    last_position = len(images) - 1

    for position, img in enumerate(images):
        rows, cols = img.shape[:2]
        # OpenCV takes the centre and the output size as (x, y) / (width, height).
        rotation_matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
        if position == last_position:
            rotated = cv2.warpAffine(
                img, M=rotation_matrix, dsize=(cols, rows), flags=cv2.INTER_NEAREST
            )
        else:
            rotated = cv2.warpAffine(img, rotation_matrix, (cols, rows))
        images[position] = rotated

    return images

View File

@@ -0,0 +1,91 @@
"""
Copyright (c) 2019-present NAVER Corp.
MIT License
"""
# -*- coding: utf-8 -*-
import numpy as np
import cv2
from skimage import io
def loadImage(img_file):
    """Read an image file and return it as a 3-channel array.

    When the first axis has length 2 only its first element is kept; 2D
    images are expanded to RGB and a fourth channel is dropped.
    """
    loaded = io.imread(img_file)  # RGB order
    if loaded.shape[0] == 2:
        loaded = loaded[0]
    if len(loaded.shape) == 2:
        loaded = cv2.cvtColor(loaded, cv2.COLOR_GRAY2RGB)
    if loaded.shape[2] == 4:
        loaded = loaded[:, :, :3]
    return np.array(loaded)
def normalizeMeanVariance(
    in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)
):
    """Return ``(in_img - mean * 255) / (variance * 255)`` as float32.

    ``mean`` and ``variance`` are per-channel values (RGB order) expressed
    for a [0, 1] pixel range. The input array is not modified.
    """
    # should be RGB order
    offset = np.array(
        [mean[0] * 255.0, mean[1] * 255.0, mean[2] * 255.0], dtype=np.float32
    )
    divisor = np.array(
        [variance[0] * 255.0, variance[1] * 255.0, variance[2] * 255.0],
        dtype=np.float32,
    )
    pixels = in_img.copy().astype(np.float32)
    return (pixels - offset) / divisor
def denormalizeMeanVariance(
    in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)
):
    """Undo ``normalizeMeanVariance`` and return a uint8 image.

    Computes ``(in_img * variance + mean) * 255`` on a copy of the input and
    clips the result to [0, 255].
    """
    # should be RGB order
    restored = in_img.copy()
    np.multiply(restored, variance, out=restored)
    np.add(restored, mean, out=restored)
    np.multiply(restored, 255.0, out=restored)
    return np.clip(restored, 0, 255).astype(np.uint8)
def resize_aspect_ratio(img, square_size, interpolation, mag_ratio=1):
    """Resize keeping the aspect ratio and pad each side to a multiple of 32.

    The longer side becomes ``mag_ratio`` times its length, capped at
    ``square_size``. The resized image is pasted into the top-left corner of
    a zero-filled float32 canvas.

    :return: ``(canvas, ratio, valid_size_heatmap)`` where ``ratio`` is the
        applied scale and ``valid_size_heatmap`` is half of the resized
        (unpadded) height and width.
    """
    height, width, channel = img.shape
    longest_side = max(height, width)

    # magnify image size, but never beyond square_size
    target_size = min(mag_ratio * longest_side, square_size)
    ratio = target_size / longest_side

    target_h, target_w = int(height * ratio), int(width * ratio)

    # NOTE
    valid_size_heatmap = (int(target_h / 2), int(target_w / 2))

    proc = cv2.resize(img, (target_w, target_h), interpolation=interpolation)

    # make canvas and paste image
    target_h32 = target_h + (32 - target_h % 32) % 32
    target_w32 = target_w + (32 - target_w % 32) % 32
    canvas = np.zeros((target_h32, target_w32, channel), dtype=np.float32)
    canvas[0:target_h, 0:target_w, :] = proc

    return canvas, ratio, valid_size_heatmap
def cvt2HeatmapImg(img):
    """Clip a map to [0, 1] and render it as a JET colour-mapped uint8 image."""
    gray_levels = (np.clip(img, 0, 1) * 255).astype(np.uint8)
    return cv2.applyColorMap(gray_levels, cv2.COLORMAP_JET)

View File

@@ -0,0 +1,263 @@
import os
import random
import math
import numpy as np
import cv2
import torch
from data import imgproc
from data.pseudo_label.watershed import exec_watershed_by_version
class PseudoCharBoxBuilder:
    """Derive pseudo character boxes for a word from the network's region score."""

    def __init__(self, watershed_param, vis_test_dir, pseudo_vis_opt, gaussian_builder):
        """Store the watershed settings, visualisation options and Gaussian builder."""
        self.watershed_param = watershed_param
        self.vis_test_dir = vis_test_dir
        self.pseudo_vis_opt = pseudo_vis_opt
        self.gaussian_builder = gaussian_builder
        self.cnt = 0
        # True while the most recently cropped word was classified as vertical.
        self.flag = False

    def crop_image_by_bbox(self, image, box, word):
        """Warp the word quadrilateral into an upright rectangle.

        A word is treated as vertical when its height/width ratio (and the
        estimated per-character ratio) exceed fixed thresholds; it is then
        warped so that the crop reads along its long side.

        :return: ``(warped, M, horizontal_text_bool)`` where ``M`` is the
            perspective matrix that was used.
        :raises ValueError: if the box has zero width.
        """
        w = max(
            int(np.linalg.norm(box[0] - box[1])), int(np.linalg.norm(box[2] - box[3]))
        )
        h = max(
            int(np.linalg.norm(box[0] - box[3])), int(np.linalg.norm(box[1] - box[2]))
        )
        if w == 0:
            # Previously this dropped into an ipdb debugger session.
            raise ValueError(
                "Degenerate word box with zero width: {}".format(np.asarray(box).tolist())
            )
        word_ratio = h / w

        one_char_ratio = min(h, w) / (max(h, w) / len(word))

        # NOTE: criterion to split vertical word in here is set to work properly on IC15 dataset
        if word_ratio > 2 or (word_ratio > 1.6 and one_char_ratio > 2.4):
            # warping method of vertical word (classified by upper condition)
            horizontal_text_bool = False
            long_side = h
            short_side = w
            target_points = [
                [long_side, 0],
                [long_side, short_side],
                [0, short_side],
                [0, 0],
            ]
        else:
            # warping method of horizontal word
            horizontal_text_bool = True
            long_side = w
            short_side = h
            target_points = [
                [0, 0],
                [long_side, 0],
                [long_side, short_side],
                [0, short_side],
            ]

        M = cv2.getPerspectiveTransform(
            np.float32(box), np.float32(np.array(target_points))
        )
        self.flag = not horizontal_text_bool

        warped = cv2.warpPerspective(image, M, (long_side, short_side))
        return warped, M, horizontal_text_bool

    def inference_word_box(self, net, gpu, word_image):
        """Run ``net`` on one word crop and return its first output.

        The network is switched to eval mode if it was training; the mode is
        not switched back here.
        """
        if net.training:
            net.eval()

        with torch.no_grad():
            word_img_torch = torch.from_numpy(
                imgproc.normalizeMeanVariance(
                    word_image,
                    mean=(0.485, 0.456, 0.406),
                    variance=(0.229, 0.224, 0.225),
                )
            )
            word_img_torch = word_img_torch.permute(2, 0, 1).unsqueeze(0)
            word_img_torch = word_img_torch.type(torch.FloatTensor).cuda(gpu)
            with torch.cuda.amp.autocast():
                word_img_scores, _ = net(word_img_torch)
        return word_img_scores

    def visualize_pseudo_label(
        self, word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
    ):
        """Write a side-by-side debug image of the pseudo-labelling steps.

        The panels are: the word crop, the region score, the watershed boxes,
        the final pseudo boxes, the generated Gaussian map and an overlay.
        """
        word_img_h, word_img_w, _ = word_image.shape
        # Draw on uint8 copies that are kept; drawing on a throw-away
        # `np.uint8(...)` result would leave the panels without boxes.
        word_img_cp1 = word_image.astype(np.uint8)
        word_img_cp2 = word_image.astype(np.uint8)
        _watershed_box = np.int32(watershed_box)
        _pseudo_char_bbox = np.int32(pseudo_char_bbox)

        region_score_color = cv2.applyColorMap(np.uint8(region_score), cv2.COLORMAP_JET)
        region_score_color = cv2.resize(region_score_color, (word_img_w, word_img_h))

        for box in _watershed_box:
            cv2.polylines(
                word_img_cp1, [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0),
            )

        for box in _pseudo_char_bbox:
            cv2.polylines(
                word_img_cp2, [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0)
            )

        # NOTE: Just for visualize, put gaussian map on char box
        pseudo_gt_region_score = self.gaussian_builder.generate_region(
            word_img_h, word_img_w, [_pseudo_char_bbox], [True]
        )

        pseudo_gt_region_score = cv2.applyColorMap(
            (pseudo_gt_region_score * 255).astype("uint8"), cv2.COLORMAP_JET
        )

        overlay_img = cv2.addWeighted(
            word_image[:, :, ::-1], 0.7, pseudo_gt_region_score, 0.3, 5
        )
        vis_result = np.hstack(
            [
                word_image[:, :, ::-1],
                region_score_color,
                word_img_cp1[:, :, ::-1],
                word_img_cp2[:, :, ::-1],
                pseudo_gt_region_score,
                overlay_img,
            ]
        )

        # Create the output directory itself, not only its parent.
        os.makedirs(self.vis_test_dir, exist_ok=True)
        cv2.imwrite(
            os.path.join(
                self.vis_test_dir,
                "{}_{}".format(
                    img_name, f"pseudo_char_bbox_{random.randint(0,100)}.jpg"
                ),
            ),
            vis_result,
        )

    def clip_into_boundary(self, box, bound):
        """Clip box coordinates in place to ``bound`` given as (height, width, ...)."""
        if len(box) == 0:
            return box
        box[:, :, 0] = np.clip(box[:, :, 0], 0, bound[1])
        box[:, :, 1] = np.clip(box[:, :, 1], 0, bound[0])
        return box

    def get_confidence(self, real_len, pseudo_len):
        """Return how well the number of pseudo boxes matches the word length.

        The value is ``(real_len - min(real_len, |real_len - pseudo_len|)) /
        real_len`` and 0.0 when either length is zero.
        """
        if pseudo_len == 0 or real_len == 0:
            return 0.0
        return (real_len - min(real_len, abs(real_len - pseudo_len))) / real_len

    def split_word_equal_gap(self, word_img_w, word_img_h, word):
        """Split the word image into equal-width boxes, one per character.

        Space characters keep their slot but produce no box.

        :return: float32 array of boxes with shape (n, 4, 2); empty when the
            word has no non-space character.
        """
        if len(word) == 0:
            return np.array([], np.float32)

        width_per_char = word_img_w / len(word)
        bboxes = []
        for j, char in enumerate(word):
            if char == " ":
                continue
            left = j * width_per_char
            right = (j + 1) * width_per_char
            bboxes.append(
                np.array(
                    [[left, 0], [right, 0], [right, word_img_h], [left, word_img_h]]
                )
            )
        return np.array(bboxes, np.float32)

    def cal_angle(self, v1):
        """Return the angle of 2D vector ``v1`` in radians within [0, 2*pi)."""
        theta = np.arccos(min(1, v1[0] / (np.linalg.norm(v1) + 10e-8)))
        return 2 * math.pi - theta if v1[1] < 0 else theta

    def clockwise_sort(self, points):
        """Order four points by increasing angle around their centroid."""
        # returns 4x2 [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] ndarray
        v1, v2, v3, v4 = points
        center = (v1 + v2 + v3 + v4) / 4
        theta = np.array([self.cal_angle(v - center) for v in (v1, v2, v3, v4)])
        index = np.argsort(theta)
        return np.array([v1, v2, v3, v4])[index, :]

    def build_char_box(self, net, gpu, image, word_bbox, word, img_name=""):
        """Estimate character boxes for one word and map them back to the image.

        The word is cropped and resized to a height of 128, the network's
        region score is segmented by watershed, and the resulting boxes are
        transformed back with the inverse of the crop's perspective matrix.
        When the confidence is at most 0.5 the word is split into equal-width
        boxes instead and the confidence is set to 0.5.

        :return: ``(pseudo_char_bbox, confidence, horizontal_text_bool)``.
        """
        word_image, M, horizontal_text_bool = self.crop_image_by_bbox(
            image, word_bbox, word
        )
        # str.replace("\s", "") matched a literal backslash-s and never
        # removed whitespace; split/join drops every whitespace character.
        real_word_without_space = "".join(word.split())
        real_char_len = len(real_word_without_space)

        scale = 128.0 / word_image.shape[0]

        word_image = cv2.resize(word_image, None, fx=scale, fy=scale)
        word_img_h, word_img_w, _ = word_image.shape

        scores = self.inference_word_box(net, gpu, word_image)
        region_score = scores[0, :, :, 0].cpu().data.numpy()
        region_score = np.uint8(np.clip(region_score, 0, 1) * 255)

        region_score_rgb = cv2.resize(region_score, (word_img_w, word_img_h))
        region_score_rgb = cv2.cvtColor(region_score_rgb, cv2.COLOR_GRAY2RGB)

        pseudo_char_bbox = exec_watershed_by_version(
            self.watershed_param, region_score, word_image, self.pseudo_vis_opt
        )

        # Used for visualize only
        watershed_box = pseudo_char_bbox.copy()

        pseudo_char_bbox = self.clip_into_boundary(
            pseudo_char_bbox, region_score_rgb.shape
        )

        confidence = self.get_confidence(real_char_len, len(pseudo_char_bbox))

        if confidence <= 0.5:
            pseudo_char_bbox = self.split_word_equal_gap(word_img_w, word_img_h, word)
            confidence = 0.5

        if self.pseudo_vis_opt and self.flag:
            self.visualize_pseudo_label(
                word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
            )

        if len(pseudo_char_bbox) != 0:
            # Order the boxes by the x coordinate of their first corner.
            index = np.argsort(pseudo_char_bbox[:, 0, 0])
            pseudo_char_bbox = pseudo_char_bbox[index]

        # Undo the resize, then the perspective crop.
        pseudo_char_bbox /= scale

        M_inv = np.linalg.pinv(M)
        for i in range(len(pseudo_char_bbox)):
            pseudo_char_bbox[i] = cv2.perspectiveTransform(
                pseudo_char_bbox[i][None, :, :], M_inv
            )

        pseudo_char_bbox = self.clip_into_boundary(pseudo_char_bbox, image.shape)

        return pseudo_char_bbox, confidence, horizontal_text_bool

View File

@@ -0,0 +1,45 @@
import cv2
import numpy as np
from skimage.segmentation import watershed
def segment_region_score(watershed_param, region_score, word_image, pseudo_vis_opt):
    """Split a region score map into per-blob boxes with watershed.

    Pixels above 0.75 (after dividing by 255) seed the foreground markers,
    pixels below 0.05 are background and the rest is left for the watershed
    to assign. Each foreground label yields an axis-aligned box whose
    coordinates are doubled.

    ``watershed_param``, ``word_image`` and ``pseudo_vis_opt`` are accepted
    but not used by this implementation.

    :return: float32 array of boxes, each ``[[x_min, y_min], [x_max, y_min],
        [x_max, y_max], [x_min, y_max]]``.
    """
    score = np.float32(region_score) / 255
    foreground = np.uint8(score > 0.75)
    background = np.uint8(score < 0.05)
    undecided = 1 - (foreground + background)

    num_labels, markers = cv2.connectedComponents(foreground)
    # Shift so 0 can mark the undecided pixels the watershed has to fill in.
    markers += 1
    markers[undecided == 1] = 0

    labels = watershed(-score, markers)

    boxes = []
    for label in range(2, num_labels + 1):
        ys, xs = np.where(labels == label)
        x_min, x_max = xs.min(), xs.max()
        y_min, y_max = ys.min(), ys.max()
        corners = np.array(
            [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]
        )
        boxes.append(corners * 2)
    return np.array(boxes, dtype=np.float32)
def exec_watershed_by_version(
    watershed_param, region_score, word_image, pseudo_vis_opt
):
    """Run the watershed implementation named by ``watershed_param.version``.

    :return: whatever the selected implementation returns.
    :raises ValueError: if the version has no registered implementation.
        Errors raised by the implementation itself are propagated instead of
        being reported as an unknown version.
    """
    func_name_map_dict = {
        "skimage": segment_region_score,
    }

    version = watershed_param.version
    try:
        segment_func = func_name_map_dict[version]
    except KeyError as err:
        raise ValueError(
            f"Watershed version {version} does not exist in func_name_map_dict."
        ) from err

    return segment_func(watershed_param, region_score, word_image, pseudo_vis_opt)