testing dataset
This commit is contained in:
65
trainer/craft/data/boxEnlarge.py
Normal file
65
trainer/craft/data/boxEnlarge.py
Normal file
@@ -0,0 +1,65 @@
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
|
||||
def pointAngle(Apoint, Bpoint):
    """Return the slope (dy / dx) of the line from ``Apoint`` to ``Bpoint``.

    Despite the name, the result is a slope, not an angle. A small constant
    is added to dx so a vertical segment does not divide by zero.
    """
    rise = Bpoint[1] - Apoint[1]
    run = (Bpoint[0] - Apoint[0]) + 10e-8
    return rise / run
|
||||
|
||||
def pointDistance(Apoint, Bpoint):
    """Return the Euclidean distance between two 2-D points."""
    dy = Bpoint[1] - Apoint[1]
    dx = Bpoint[0] - Apoint[0]
    return math.sqrt(dy**2 + dx**2)
|
||||
|
||||
def lineBiasAndK(Apoint, Bpoint):
    """Return ``(K, B)`` such that ``y = K * x + B`` passes through both points.

    ``K`` is the slope computed by ``pointAngle``; ``B`` is the intercept
    derived from ``Apoint``.
    """
    slope = pointAngle(Apoint, Bpoint)
    intercept = Apoint[1] - slope * Apoint[0]
    return slope, intercept
|
||||
|
||||
def getX(K, B, Ypoint):
    """Solve ``Ypoint = K * x + B`` for x and truncate the result to an int.

    No guard is applied for ``K == 0``; the division is performed as-is.
    """
    x = (Ypoint - B) / K
    return int(x)
|
||||
|
||||
def sidePoint(Apoint, Bpoint, h, w, placehold, enlarge_size):
    """Move one end of segment A-B outward and clamp it to the image bounds.

    The shift on each axis is proportional to the segment length:
    ``|cos(angle) * x_enlarge * distance|`` on x and
    ``|sin(angle) * y_enlarge * distance|`` on y, where ``angle`` is the
    absolute arctangent of the segment slope.

    :param Apoint: first endpoint ``(x, y)``; moved for the ``left*`` cases
    :param Bpoint: second endpoint ``(x, y)``; moved for the ``right*`` cases
    :param h: upper clamp for y
    :param w: upper clamp for x
    :param placehold: one of ``'leftTop'``, ``'rightTop'``, ``'rightBottom'``,
        ``'leftBottom'``; selects which endpoint moves and in which direction
    :param enlarge_size: pair ``(x_enlarge_size, y_enlarge_size)``
    :return: ``(x, y)`` of the moved point, truncated to ints
    :raises ValueError: if ``placehold`` is not one of the four known names
    """
    angle = abs(math.atan(pointAngle(Apoint, Bpoint)))
    distance = pointDistance(Apoint, Bpoint)

    x_enlarge_size, y_enlarge_size = enlarge_size

    XaxisIncreaseDistance = abs(math.cos(angle) * x_enlarge_size * distance)
    YaxisIncreaseDistance = abs(math.sin(angle) * y_enlarge_size * distance)

    if placehold == 'leftTop':
        x1 = max(0, Apoint[0] - XaxisIncreaseDistance)
        y1 = max(0, Apoint[1] - YaxisIncreaseDistance)
    elif placehold == 'rightTop':
        x1 = min(w, Bpoint[0] + XaxisIncreaseDistance)
        y1 = max(0, Bpoint[1] - YaxisIncreaseDistance)
    elif placehold == 'rightBottom':
        x1 = min(w, Bpoint[0] + XaxisIncreaseDistance)
        y1 = min(h, Bpoint[1] + YaxisIncreaseDistance)
    elif placehold == 'leftBottom':
        x1 = max(0, Apoint[0] - XaxisIncreaseDistance)
        y1 = min(h, Apoint[1] + YaxisIncreaseDistance)
    else:
        # Previously an unknown name fell through and failed at the return
        # statement with an UnboundLocalError on x1.
        raise ValueError("Unknown placehold: {!r}".format(placehold))
    return int(x1), int(y1)
|
||||
|
||||
def enlargebox(box, h, w, enlarge_size, horizontal_text_bool):
    """Return a 4x2 integer array with each corner of ``box`` pushed outward.

    The corners are first rolled so the one with the smallest ``x + y`` comes
    first. The intersection of the two diagonals is then used as the centre,
    and every corner is moved away from it by ``sidePoint`` (clamped to
    ``[0, w]`` / ``[0, h]``).

    :param box: array of four ``(x, y)`` corners
    :param h: upper clamp for y
    :param w: upper clamp for x
    :param enlarge_size: pair ``(x, y)``; swapped when the text is not horizontal
    :param horizontal_text_bool: False to swap the two enlarge factors
    """
    if not horizontal_text_bool:
        enlarge_size = (enlarge_size[1], enlarge_size[0])

    # Roll the corner order so the corner with the smallest coordinate sum leads.
    first_corner = np.argmin(box.sum(axis=1))
    box = np.roll(box, -first_corner, axis=0)
    Apoint, Bpoint, Cpoint, Dpoint = box

    # Centre = intersection of diagonal A-C with diagonal D-B.
    K1, B1 = lineBiasAndK(box[0], box[2])
    K2, B2 = lineBiasAndK(box[3], box[1])
    X = (B2 - B1) / (K1 - K2)
    center = [X, K1 * X + B1]

    top_left = sidePoint(Apoint, center, h, w, 'leftTop', enlarge_size)
    top_right = sidePoint(center, Bpoint, h, w, 'rightTop', enlarge_size)
    bottom_right = sidePoint(center, Cpoint, h, w, 'rightBottom', enlarge_size)
    bottom_left = sidePoint(Dpoint, center, h, w, 'leftBottom', enlarge_size)

    return np.array(
        [list(top_left), list(top_right), list(bottom_right), list(bottom_left)]
    )
|
||||
542
trainer/craft/data/dataset.py
Normal file
542
trainer/craft/data/dataset.py
Normal file
@@ -0,0 +1,542 @@
|
||||
import os
|
||||
import re
|
||||
import itertools
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import scipy.io as scio
|
||||
from PIL import Image
|
||||
import cv2
|
||||
from torch.utils.data import Dataset
|
||||
import torchvision.transforms as transforms
|
||||
|
||||
from data import imgproc
|
||||
from data.gaussian import GaussianBuilder
|
||||
from data.imgaug import (
|
||||
rescale,
|
||||
random_resize_crop_synth,
|
||||
random_resize_crop,
|
||||
random_horizontal_flip,
|
||||
random_rotate,
|
||||
random_scale,
|
||||
random_crop,
|
||||
)
|
||||
from data.pseudo_label.make_charbox import PseudoCharBoxBuilder
|
||||
from utils.util import saveInput, saveImage
|
||||
|
||||
|
||||
class CraftBaseDataset(Dataset):
    """Common sample pipeline shared by the CRAFT training datasets.

    Subclasses are expected to provide ``self.img_names`` and
    ``make_gt_score(index)`` (plus ``load_saved_gt_score(index)`` when
    ``saved_gt_dir`` is not None). This class handles optional subsampling,
    augmentation, optional visualisation dumps and the final conversion done
    in ``__getitem__``.
    """

    def __init__(
        self,
        output_size,
        data_dir,
        saved_gt_dir,
        mean,
        variance,
        gauss_init_size,
        gauss_sigma,
        enlarge_region,
        enlarge_affinity,
        aug,
        vis_test_dir,
        vis_opt,
        sample,
    ):
        """Store the configuration and build the Gaussian heat-map generator.

        :param output_size: side length used for crops and for ``resize_to_half``
        :param data_dir: dataset root directory
        :param saved_gt_dir: directory of pre-saved score maps, or None to
            build them with ``make_gt_score``
        :param mean: per-channel mean passed to ``imgproc.normalizeMeanVariance``
        :param variance: per-channel variance passed to the same function
        :param gauss_init_size: forwarded to ``GaussianBuilder``
        :param gauss_sigma: forwarded to ``GaussianBuilder``
        :param enlarge_region: forwarded to ``GaussianBuilder``
        :param enlarge_affinity: forwarded to ``GaussianBuilder``
        :param aug: augmentation config with ``random_scale``, ``random_rotate``,
            ``random_crop``, ``random_horizontal_flip`` and ``random_colorjitter``
            entries, each exposing an ``option`` flag
        :param vis_test_dir: output directory for visualisation dumps
        :param vis_opt: when truthy, dump inputs/targets via ``saveImage``/``saveInput``
        :param sample: number of images to subsample, or -1 to use all of them
        """
        self.output_size = output_size
        self.data_dir = data_dir
        self.saved_gt_dir = saved_gt_dir
        self.mean, self.variance = mean, variance
        self.gaussian_builder = GaussianBuilder(
            gauss_init_size, gauss_sigma, enlarge_region, enlarge_affinity
        )
        self.aug = aug
        self.vis_test_dir = vis_test_dir
        self.vis_opt = vis_opt
        self.sample = sample

        # The subclasses in this file assign ``self.img_names`` only after this
        # constructor returns, so sampling here unconditionally raised
        # AttributeError. Sample eagerly only when the names already exist
        # (keeps the historical behaviour, including the global seed);
        # otherwise ``_sampled_indices`` resolves the subset on first use.
        self.idx = None
        if self.sample != -1 and hasattr(self, "img_names"):
            random.seed(0)
            self.idx = random.sample(range(0, len(self.img_names)), self.sample)

        self.pre_crop_area = []

    def _sampled_indices(self):
        """Return the fixed random subset of indices used when ``sample != -1``."""
        if getattr(self, "idx", None) is None:
            # A private RNG seeded with 0 yields the same subset as
            # ``random.seed(0); random.sample(...)`` without reseeding the
            # global generator in the middle of training.
            rng = random.Random(0)
            self.idx = rng.sample(range(0, len(self.img_names)), self.sample)
        return self.idx

    def augment_image(
        self, image, region_score, affinity_score, confidence_mask, word_level_char_bbox
    ):
        """Apply the configured augmentations to the image and its three maps.

        :return: ``(image, region_score, affinity_score, confidence_mask)``
            with ``image`` converted through ``np.array``
        :raises NotImplementedError: for crop version ``"random_crop_with_bbox"``
        :raises ValueError: for any other unknown crop version
        """
        augment_targets = [image, region_score, affinity_score, confidence_mask]

        if self.aug.random_scale.option:
            # random_scale rescales the boxes in place and returns only the
            # list of images, so there is nothing to unpack here.
            augment_targets = random_scale(
                augment_targets, word_level_char_bbox, self.aug.random_scale.range
            )

        if self.aug.random_rotate.option:
            augment_targets = random_rotate(
                augment_targets, self.aug.random_rotate.max_angle
            )

        if self.aug.random_crop.option:
            version = self.aug.random_crop.version
            if version == "random_crop_with_bbox":
                # This module never imports a function of that name, so the
                # original call could only fail with NameError.
                raise NotImplementedError(
                    "random_crop_with_bbox is not provided by data.imgaug"
                )
            elif version == "random_resize_crop_synth":
                augment_targets = random_resize_crop_synth(
                    augment_targets, self.output_size
                )
            elif version == "random_resize_crop":
                if len(self.pre_crop_area) > 0:
                    pre_crop_area = self.pre_crop_area
                else:
                    pre_crop_area = None

                augment_targets = random_resize_crop(
                    augment_targets,
                    self.aug.random_crop.scale,
                    self.aug.random_crop.ratio,
                    self.output_size,
                    self.aug.random_crop.rnd_threshold,
                    pre_crop_area,
                )
            elif version == "random_crop":
                augment_targets = random_crop(augment_targets, self.output_size,)
            else:
                # ``assert "<message>"`` is always true; fail loudly instead.
                raise ValueError(f"Undefined RandomCrop version: {version!r}")

        if self.aug.random_horizontal_flip.option:
            augment_targets = random_horizontal_flip(augment_targets)

        image, region_score, affinity_score, confidence_mask = augment_targets

        if self.aug.random_colorjitter.option:
            image = Image.fromarray(image)
            image = transforms.ColorJitter(
                brightness=self.aug.random_colorjitter.brightness,
                contrast=self.aug.random_colorjitter.contrast,
                saturation=self.aug.random_colorjitter.saturation,
                hue=self.aug.random_colorjitter.hue,
            )(image)

        return np.array(image), region_score, affinity_score, confidence_mask

    def resize_to_half(self, ground_truth, interpolation):
        """Resize a map to ``(output_size // 2, output_size // 2)``."""
        return cv2.resize(
            ground_truth,
            (self.output_size // 2, self.output_size // 2),
            interpolation=interpolation,
        )

    def __len__(self):
        """Return the subsample size when sampling is on, else the image count."""
        if self.sample != -1:
            return len(self._sampled_indices())
        return len(self.img_names)

    def __getitem__(self, index):
        """Build one training sample.

        :return: ``(image, region_score, affinity_score, confidence_mask)``;
            the image is normalised and transposed with ``(2, 0, 1)``, and
            the three maps are resized by ``resize_to_half``
        """
        if self.sample != -1:
            # Map the position inside the subset to a real dataset index.
            index = self._sampled_indices()[index]

        if self.saved_gt_dir is None:
            (
                image,
                region_score,
                affinity_score,
                confidence_mask,
                word_level_char_bbox,
                all_affinity_bbox,
                words,
            ) = self.make_gt_score(index)
        else:
            (
                image,
                region_score,
                affinity_score,
                confidence_mask,
                word_level_char_bbox,
                words,
            ) = self.load_saved_gt_score(index)
            all_affinity_bbox = []

        if self.vis_opt:
            saveImage(
                self.img_names[index],
                self.vis_test_dir,
                image.copy(),
                word_level_char_bbox.copy(),
                all_affinity_bbox.copy(),
                region_score.copy(),
                affinity_score.copy(),
                confidence_mask.copy(),
            )

        image, region_score, affinity_score, confidence_mask = self.augment_image(
            image, region_score, affinity_score, confidence_mask, word_level_char_bbox
        )

        if self.vis_opt:
            saveInput(
                self.img_names[index],
                self.vis_test_dir,
                image,
                region_score,
                affinity_score,
                confidence_mask,
            )

        region_score = self.resize_to_half(region_score, interpolation=cv2.INTER_CUBIC)
        affinity_score = self.resize_to_half(
            affinity_score, interpolation=cv2.INTER_CUBIC
        )
        # Nearest-neighbour is used for the confidence mask, unlike the two
        # score maps above.
        confidence_mask = self.resize_to_half(
            confidence_mask, interpolation=cv2.INTER_NEAREST
        )

        image = imgproc.normalizeMeanVariance(
            np.array(image), mean=self.mean, variance=self.variance
        )
        image = image.transpose(2, 0, 1)

        return image, region_score, affinity_score, confidence_mask
|
||||
|
||||
|
||||
class SynthTextDataSet(CraftBaseDataset):
    """Dataset whose image names, boxes and texts are read from ``gt.mat``."""

    def __init__(
        self,
        output_size,
        data_dir,
        saved_gt_dir,
        mean,
        variance,
        gauss_init_size,
        gauss_sigma,
        enlarge_region,
        enlarge_affinity,
        aug,
        vis_test_dir,
        vis_opt,
        sample,
    ):
        """Forward every argument to the base class, then load ``gt.mat``."""
        super().__init__(
            output_size,
            data_dir,
            saved_gt_dir,
            mean,
            variance,
            gauss_init_size,
            gauss_sigma,
            enlarge_region,
            enlarge_affinity,
            aug,
            vis_test_dir,
            vis_opt,
            sample,
        )
        self.img_names, self.char_bbox, self.img_words = self.load_data()
        self.vis_index = list(range(1000))

    def load_data(self, bbox="char"):
        """Read ``gt.mat`` from ``data_dir``.

        :param bbox: ``"char"`` selects the ``charBB`` entry; any other value
            selects ``wordBB``
        :return: ``(img_names, img_bbox, img_words)``
        """
        annotations = scio.loadmat(os.path.join(self.data_dir, "gt.mat"))
        img_names = annotations["imnames"][0]
        img_words = annotations["txt"][0]

        # word bbox needed for test
        bbox_key = "charBB" if bbox == "char" else "wordBB"
        img_bbox = annotations[bbox_key][0]

        return img_names, img_bbox, img_words

    def dilate_img_to_output_size(self, image, char_bbox):
        """Upscale the image when its short side is at most ``output_size``.

        ``char_bbox`` is multiplied by the same factor in place and returned.
        """
        h, w, _ = image.shape
        short_side = min(h, w)
        scale = 1.0
        if short_side <= self.output_size:
            scale = float(self.output_size) / short_side
        image = cv2.resize(
            image, dsize=None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
        )
        char_bbox *= scale
        return image, char_bbox

    def make_gt_score(self, index):
        """Build region/affinity score maps from the stored character boxes.

        :return: ``(image, region_score, affinity_score, confidence_mask,
            word_level_char_bbox, all_affinity_bbox, words)``
        """
        img_path = os.path.join(self.data_dir, self.img_names[index][0])
        bgr = cv2.imread(img_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        # shape : (Number of characters in image, 4, 2)
        all_char_bbox = self.char_bbox[index].transpose((2, 1, 0))

        img_h, img_w, _ = image.shape
        confidence_mask = np.ones((img_h, img_w), dtype=np.float32)

        # Split every text entry on spaces/newlines and drop empty pieces.
        words = []
        for entry in self.img_words[index]:
            for piece in re.split(" \n|\n |\n| ", entry.strip()):
                if len(piece) > 0:
                    words.append(piece)

        # Consume the flat character-box array word by word.
        word_level_char_bbox = []
        char_idx = 0
        for word in words:
            length_of_word = len(word)
            word_bbox = all_char_bbox[char_idx : char_idx + length_of_word]
            assert len(word_bbox) == length_of_word
            char_idx += length_of_word
            word_level_char_bbox.append(np.array(word_bbox))

        region_score = self.gaussian_builder.generate_region(
            img_h,
            img_w,
            word_level_char_bbox,
            horizontal_text_bools=[True] * len(words),
        )
        affinity_score, all_affinity_bbox = self.gaussian_builder.generate_affinity(
            img_h,
            img_w,
            word_level_char_bbox,
            horizontal_text_bools=[True] * len(words),
        )

        return (
            image,
            region_score,
            affinity_score,
            confidence_mask,
            word_level_char_bbox,
            all_affinity_bbox,
            words,
        )
|
||||
|
||||
|
||||
class CustomDataset(CraftBaseDataset):
    """Word-annotated dataset that derives character boxes with a model.

    Images are read from ``<data_dir>/ch4_training_images`` and annotations
    from ``<data_dir>/ch4_training_localization_transcription_gt``. Character
    boxes are produced by ``PseudoCharBoxBuilder`` using the network and
    device supplied through ``update_model`` / ``update_device``.
    """

    def __init__(
        self,
        output_size,
        data_dir,
        saved_gt_dir,
        mean,
        variance,
        gauss_init_size,
        gauss_sigma,
        enlarge_region,
        enlarge_affinity,
        aug,
        vis_test_dir,
        vis_opt,
        sample,
        watershed_param,
        pseudo_vis_opt,
        do_not_care_label,
    ):
        """Initialise the base dataset and list the training images.

        :param watershed_param: forwarded to ``PseudoCharBoxBuilder``
        :param pseudo_vis_opt: forwarded to ``PseudoCharBoxBuilder``
        :param do_not_care_label: transcriptions whose boxes are masked out;
            the first entry is the canonical label stored for them
        """
        super().__init__(
            output_size,
            data_dir,
            saved_gt_dir,
            mean,
            variance,
            gauss_init_size,
            gauss_sigma,
            enlarge_region,
            enlarge_affinity,
            aug,
            vis_test_dir,
            vis_opt,
            sample,
        )
        self.pseudo_vis_opt = pseudo_vis_opt
        self.do_not_care_label = do_not_care_label
        self.pseudo_charbox_builder = PseudoCharBoxBuilder(
            watershed_param, vis_test_dir, pseudo_vis_opt, self.gaussian_builder
        )
        self.vis_index = list(range(1000))
        self.img_dir = os.path.join(data_dir, "ch4_training_images")
        self.img_gt_box_dir = os.path.join(
            data_dir, "ch4_training_localization_transcription_gt"
        )
        self.img_names = os.listdir(self.img_dir)

    def update_model(self, net):
        """Set the network passed to the pseudo character-box builder."""
        self.net = net

    def update_device(self, gpu):
        """Set the device identifier passed to the pseudo character-box builder."""
        self.gpu = gpu

    def load_img_gt_box(self, img_gt_box_path):
        """Parse one annotation file.

        Each non-blank line holds eight comma-separated integers (four x, y
        corner pairs) followed by the transcription, which may itself contain
        commas.

        :return: ``(word_bboxes, words)`` where ``word_bboxes`` is a float32
            array built from the 4x2 boxes and ``words`` the transcriptions,
            with do-not-care entries replaced by ``do_not_care_label[0]``
        """
        # ``with`` closes the handle deterministically; it used to be leaked.
        with open(img_gt_box_path, encoding="utf-8") as gt_file:
            lines = gt_file.readlines()

        word_bboxes = []
        words = []
        for line in lines:
            # Round-tripping through utf-8-sig drops a leading byte-order mark.
            line = line.strip().encode("utf-8").decode("utf-8-sig")
            if not line:
                # Blank lines carry no annotation and used to crash int().
                continue
            box_info = line.split(",")
            box_points = [int(box_info[i]) for i in range(8)]
            box_points = np.array(box_points, np.float32).reshape(4, 2)
            word = ",".join(box_info[8:])
            if word in self.do_not_care_label:
                word = self.do_not_care_label[0]
            word_bboxes.append(box_points)
            words.append(word)
        return np.array(word_bboxes), words

    def load_data(self, index):
        """Load one image and build pseudo character boxes for its words.

        Do-not-care words are zeroed in the confidence mask and skipped; every
        other word's polygon is filled with the confidence returned by the
        pseudo character-box builder.

        :return: ``(image, word_level_char_bbox, do_care_words,
            confidence_mask, horizontal_text_bools)``
        """
        img_name = self.img_names[index]
        img_path = os.path.join(self.img_dir, img_name)
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        img_gt_box_path = os.path.join(
            self.img_gt_box_dir, "gt_%s.txt" % os.path.splitext(img_name)[0]
        )
        # shape : (Number of word bbox, 4, 2)
        word_bboxes, words = self.load_img_gt_box(img_gt_box_path)
        confidence_mask = np.ones((image.shape[0], image.shape[1]), np.float32)

        word_level_char_bbox = []
        do_care_words = []
        horizontal_text_bools = []

        if len(word_bboxes) == 0:
            return (
                image,
                word_level_char_bbox,
                do_care_words,
                confidence_mask,
                horizontal_text_bools,
            )

        # Mask polygons are drawn from a copy taken before the builder sees
        # the boxes.
        _word_bboxes = word_bboxes.copy()
        for i, word in enumerate(words):
            if word in self.do_not_care_label:
                cv2.fillPoly(confidence_mask, [np.int32(_word_bboxes[i])], 0)
                continue

            (
                pseudo_char_bbox,
                confidence,
                horizontal_text_bool,
            ) = self.pseudo_charbox_builder.build_char_box(
                self.net, self.gpu, image, word_bboxes[i], word, img_name=img_name
            )

            cv2.fillPoly(confidence_mask, [np.int32(_word_bboxes[i])], confidence)
            do_care_words.append(word)
            word_level_char_bbox.append(pseudo_char_bbox)
            horizontal_text_bools.append(horizontal_text_bool)

        return (
            image,
            word_level_char_bbox,
            do_care_words,
            confidence_mask,
            horizontal_text_bools,
        )

    def make_gt_score(self, index):
        """
        Make region, affinity scores using pseudo character-level GT bounding box
        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]

        :return: ``(image, region_score, affinity_score, confidence_mask,
            word_level_char_bbox, all_affinity_bbox, words)``
        :rtype region_score: np.float32
        :rtype affinity_score: np.float32
        :rtype confidence_mask: np.float32
        :rtype word_level_char_bbox: np.float32
        :rtype words: list
        """
        (
            image,
            word_level_char_bbox,
            words,
            confidence_mask,
            horizontal_text_bools,
        ) = self.load_data(index)
        img_h, img_w, _ = image.shape

        if len(word_level_char_bbox) == 0:
            # No usable word: both score maps are all zeros.
            region_score = np.zeros((img_h, img_w), dtype=np.float32)
            affinity_score = np.zeros((img_h, img_w), dtype=np.float32)
            all_affinity_bbox = []
        else:
            region_score = self.gaussian_builder.generate_region(
                img_h, img_w, word_level_char_bbox, horizontal_text_bools
            )
            affinity_score, all_affinity_bbox = self.gaussian_builder.generate_affinity(
                img_h, img_w, word_level_char_bbox, horizontal_text_bools
            )

        return (
            image,
            region_score,
            affinity_score,
            confidence_mask,
            word_level_char_bbox,
            all_affinity_bbox,
            words,
        )

    def load_saved_gt_score(self, index):
        """
        Load pre-saved official CRAFT model's region, affinity scores to train
        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]

        The image index is parsed from the file name (the part between the
        first ``_`` and the first ``.``) and used to locate the
        ``res_img_<idx>_*.jpg`` files inside ``saved_gt_dir``.

        :return: ``(image, region_score, affinity_score, confidence_mask,
            word_level_char_bbox, words)``
        :rtype region_score: np.float32
        :rtype affinity_score: np.float32
        :rtype confidence_mask: np.float32
        :rtype word_level_char_bbox: np.float32
        :rtype words: list
        """
        img_name = self.img_names[index]
        img_path = os.path.join(self.img_dir, img_name)
        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        img_gt_box_path = os.path.join(
            self.img_gt_box_dir, "gt_%s.txt" % os.path.splitext(img_name)[0]
        )
        word_bboxes, words = self.load_img_gt_box(img_gt_box_path)
        image, word_bboxes = rescale(image, word_bboxes)
        img_h, img_w, _ = image.shape

        query_idx = int(self.img_names[index].split(".")[0].split("_")[1])

        saved_region_scores_path = os.path.join(
            self.saved_gt_dir, f"res_img_{query_idx}_region.jpg"
        )
        saved_affi_scores_path = os.path.join(
            self.saved_gt_dir, f"res_img_{query_idx}_affi.jpg"
        )
        saved_cf_mask_path = os.path.join(
            self.saved_gt_dir, f"res_img_{query_idx}_cf_mask_thresh_0.6.jpg"
        )
        region_score = cv2.imread(saved_region_scores_path, cv2.IMREAD_GRAYSCALE)
        affinity_score = cv2.imread(saved_affi_scores_path, cv2.IMREAD_GRAYSCALE)
        confidence_mask = cv2.imread(saved_cf_mask_path, cv2.IMREAD_GRAYSCALE)

        region_score = cv2.resize(region_score, (img_w, img_h))
        affinity_score = cv2.resize(affinity_score, (img_w, img_h))
        confidence_mask = cv2.resize(
            confidence_mask, (img_w, img_h), interpolation=cv2.INTER_NEAREST
        )

        # Stored as 8-bit images; bring them back to the 0..1 range.
        region_score = region_score.astype(np.float32) / 255
        affinity_score = affinity_score.astype(np.float32) / 255
        confidence_mask = confidence_mask.astype(np.float32) / 255

        # NOTE : Even though word_level_char_bbox is not necessary, align bbox format with make_gt_score()
        word_level_char_bbox = [
            np.expand_dims(word_bboxes[i], 0) for i in range(len(word_bboxes))
        ]

        return (
            image,
            region_score,
            affinity_score,
            confidence_mask,
            word_level_char_bbox,
            words,
        )
|
||||
192
trainer/craft/data/gaussian.py
Normal file
192
trainer/craft/data/gaussian.py
Normal file
@@ -0,0 +1,192 @@
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
from data.boxEnlarge import enlargebox
|
||||
|
||||
|
||||
class GaussianBuilder(object):
    """Builds score maps by warping a template 2-D Gaussian onto boxes."""

    def __init__(self, init_size, sigma, enlarge_region, enlarge_affinity):
        """Store the settings and precompute the template Gaussian.

        :param init_size: side length of the square template map
        :param sigma: standard deviation used in the Gaussian formula
        :param enlarge_region: enlarge sizes used for region boxes
        :param enlarge_affinity: enlarge sizes used for affinity boxes
        """
        self.init_size = init_size
        self.sigma = sigma
        self.enlarge_region = enlarge_region
        self.enlarge_affinity = enlarge_affinity
        self.gaussian_map, self.gaussian_map_color = self.generate_gaussian_map()

    def generate_gaussian_map(self):
        """Return ``(gaussian_map, gaussian_map_color)``.

        ``gaussian_map`` is the circle-masked Gaussian divided by its maximum
        (float32); ``gaussian_map_color`` is its JET colour-mapped uint8 form.
        """
        circle_mask = self.generate_circle_mask()

        size = self.init_size
        half = size / 2
        variance = self.sigma ** 2
        gaussian_map = np.zeros((size, size), np.float32)

        for row in range(size):
            for col in range(size):
                exponent = -1 / 2 * (
                    (row - half) ** 2 / variance + (col - half) ** 2 / variance
                )
                gaussian_map[row, col] = 1 / 2 / np.pi / variance * np.exp(exponent)

        gaussian_map = gaussian_map * circle_mask
        gaussian_map = (gaussian_map / np.max(gaussian_map)).astype(np.float32)

        gaussian_map_color = cv2.applyColorMap(
            (gaussian_map * 255).astype(np.uint8), cv2.COLORMAP_JET
        )
        return gaussian_map, gaussian_map_color

    def generate_circle_mask(self):
        """Return a float32 square map with a filled circle of ones."""
        size = self.init_size
        canvas = np.zeros((size, size), np.float32)
        return cv2.circle(
            img=canvas,
            center=(size // 2, size // 2),
            radius=size // 2,
            color=1,
            thickness=-1,
        )

    def four_point_transform(self, bbox):
        """
        Using the bbox, standard 2D gaussian map, returns Transformed 2d Gaussian map

        :return: ``(warped_gaussian_map, width, height)`` where width/height
            are the maximum x / y of ``bbox`` cast to int32
        """
        width = np.max(bbox[:, 0]).astype(np.int32)
        height = np.max(bbox[:, 1]).astype(np.int32)

        side = self.init_size
        init_points = np.array(
            [[0, 0], [side, 0], [side, side], [0, side]], dtype="float32",
        )

        M = cv2.getPerspectiveTransform(init_points, bbox)
        warped_gaussian_map = cv2.warpPerspective(self.gaussian_map, M, (width, height))
        return warped_gaussian_map, width, height

    def add_gaussian_map_to_score_map(
        self, score_map, bbox, enlarge_size, horizontal_text_bool, map_type=None
    ):
        """
        Mapping 2D Gaussian to the character box coordinates of the score_map.

        :param score_map: Target map to put 2D gaussian on character box
        :type score_map: np.float32
        :param bbox: character boxes
        :type bbox: np.float32
        :param enlarge_size: Enlarge size of gaussian map to fit character shape
        :type enlarge_size: list of enlarge size [x dim, y dim]
        :param horizontal_text_bool: Flag that bbox is horizontal text or not
        :type horizontal_text_bool: bool
        :param map_type: Whether map's type is "region" | "affinity"
        :type map_type: str
        :return score_map: score map that all 2D gaussian put on character box
        :rtype: np.float32
        """
        map_h, map_w = score_map.shape
        bbox = enlargebox(bbox, map_h, map_w, enlarge_size, horizontal_text_bool)

        # If any one point of character bbox is out of range, don't put in on map
        outside = (
            np.any(bbox < 0)
            or np.any(bbox[:, 0] > map_w)
            or np.any(bbox[:, 1] > map_h)
        )
        if outside:
            return score_map

        # Shift the box so its top-left bounding corner sits at the origin.
        bbox_left, bbox_top = np.array([np.min(bbox[:, 0]), np.min(bbox[:, 1])]).astype(
            np.int32
        )
        bbox -= (bbox_left, bbox_top)
        warped_gaussian_map, width, height = self.four_point_transform(
            bbox.astype(np.float32)
        )

        rows = slice(bbox_top, bbox_top + height)
        cols = slice(bbox_left, bbox_left + width)
        try:
            current = score_map[rows, cols]
            # Keep the per-pixel larger value of the existing map and the new Gaussian.
            score_map[rows, cols] = np.where(
                warped_gaussian_map > current, warped_gaussian_map, current,
            )
        except Exception as e:
            print("Error : {}".format(e))
            print(
                "On generating {} map, strange box came out. (width: {}, height: {})".format(
                    map_type, width, height
                )
            )

        return score_map

    def calculate_affinity_box_points(self, bbox_1, bbox_2, vertical=False):
        """Return the 4x2 float32 affinity box linking two character boxes.

        Each output corner is the mean of two corners of one box and that
        box's centre; ``vertical`` selects which corner pairs are used.
        """
        center_1 = np.mean(bbox_1, axis=0)
        center_2 = np.mean(bbox_2, axis=0)
        if vertical:
            corners = [
                (bbox_1[0] + bbox_1[-1] + center_1) / 3,
                (bbox_1[1:3].sum(0) + center_1) / 3,
                (bbox_2[1:3].sum(0) + center_2) / 3,
                (bbox_2[0] + bbox_2[-1] + center_2) / 3,
            ]
        else:
            corners = [
                (bbox_1[0:2].sum(0) + center_1) / 3,
                (bbox_2[0:2].sum(0) + center_2) / 3,
                (bbox_2[2:4].sum(0) + center_2) / 3,
                (bbox_1[2:4].sum(0) + center_1) / 3,
            ]
        return np.array(corners).astype(np.float32)

    def generate_region(
        self, img_h, img_w, word_level_char_bbox, horizontal_text_bools
    ):
        """Return the region score map for all character boxes.

        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]
        """
        region_map = np.zeros([img_h, img_w], dtype=np.float32)
        for word_idx, char_boxes in enumerate(word_level_char_bbox):
            for char_box in char_boxes:
                region_map = self.add_gaussian_map_to_score_map(
                    region_map,
                    char_box.copy(),
                    self.enlarge_region,
                    horizontal_text_bools[word_idx],
                    map_type="region",
                )
        return region_map

    def generate_affinity(
        self, img_h, img_w, word_level_char_bbox, horizontal_text_bools
    ):
        """Return ``(affinity_map, all_affinity_bbox)``.

        One affinity box is built for every pair of consecutive characters in
        a word. ``all_affinity_bbox`` is the concatenated boxes, or an empty
        list when there are none.
        """
        affinity_map = np.zeros([img_h, img_w], dtype=np.float32)
        all_affinity_bbox = []
        for word_idx, char_boxes in enumerate(word_level_char_bbox):
            for left_box, right_box in zip(char_boxes[:-1], char_boxes[1:]):
                affinity_bbox = self.calculate_affinity_box_points(left_box, right_box)

                affinity_map = self.add_gaussian_map_to_score_map(
                    affinity_map,
                    affinity_bbox.copy(),
                    self.enlarge_affinity,
                    horizontal_text_bools[word_idx],
                    map_type="affinity",
                )
                all_affinity_bbox.append(np.expand_dims(affinity_bbox, axis=0))

        if len(all_affinity_bbox) > 0:
            all_affinity_bbox = np.concatenate(all_affinity_bbox, axis=0)
        return affinity_map, all_affinity_bbox
|
||||
175
trainer/craft/data/imgaug.py
Normal file
175
trainer/craft/data/imgaug.py
Normal file
@@ -0,0 +1,175 @@
|
||||
import random
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from torchvision.transforms.functional import resized_crop, crop
|
||||
from torchvision.transforms import RandomResizedCrop, RandomCrop
|
||||
from torchvision.transforms import InterpolationMode
|
||||
|
||||
|
||||
def rescale(img, bboxes, target_size=2240):
    """Scale image and boxes by ``target_size / max(height, width)``.

    :return: ``(resized_img, scaled_bboxes)``; ``bboxes`` itself is not
        modified, a scaled copy is returned
    """
    height, width = img.shape[0:2]
    factor = target_size / max(height, width)
    resized = cv2.resize(
        img, dsize=None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC
    )
    return resized, bboxes * factor
|
||||
|
||||
|
||||
def random_resize_crop_synth(augment_targets, size):
    """Take one random square crop (short-side sized) and resize to ``size``.

    The same crop window is applied to the image and all three maps. The
    image and both score maps use bicubic interpolation; the confidence mask
    uses nearest-neighbour.

    :param augment_targets: ``[image, region_score, affinity_score, confidence_mask]``
    :return: a new list with the four cropped arrays in the same order
    """
    image, region_score, affinity_score, confidence_mask = augment_targets
    pil_targets = [
        Image.fromarray(target)
        for target in (image, region_score, affinity_score, confidence_mask)
    ]

    short_side = min(pil_targets[0].size)
    i, j, h, w = RandomCrop.get_params(
        pil_targets[0], output_size=(short_side, short_side)
    )

    modes = (
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.NEAREST,
    )
    cropped = [
        resized_crop(target, i, j, h, w, (size, size), interpolation=mode)
        for target, mode in zip(pil_targets, modes)
    ]
    return [np.array(target) for target in cropped]
|
||||
|
||||
|
||||
def random_resize_crop(
    augment_targets, scale, ratio, size, threshold, pre_crop_area=None
):
    """Crop the image and its maps with one shared window, then resize.

    The crop window is chosen as follows:

    * ``pre_crop_area`` when it is given (``(i, j, h, w)``);
    * otherwise, with probability ``threshold``, a window drawn by
      ``RandomResizedCrop.get_params`` with ``scale`` / ``ratio``;
    * otherwise a window drawn with scale ``(1.0, 1.0)`` and ratio
      ``(1.0, 1.0)``.

    The image and both score maps use bicubic interpolation; the confidence
    mask uses nearest-neighbour.

    :param augment_targets: ``[image, region_score, affinity_score, confidence_mask]``
    :param size: output side length; results are ``(size, size)``
    :return: a new list with the four cropped arrays in the same order
    """
    image, region_score, affinity_score, confidence_mask = augment_targets
    pil_targets = [
        Image.fromarray(target)
        for target in (image, region_score, affinity_score, confidence_mask)
    ]

    # Identity test: ``!= None`` is evaluated element-wise on array-like
    # input and then cannot be used as a condition.
    if pre_crop_area is not None:
        i, j, h, w = pre_crop_area
    elif random.random() < threshold:
        i, j, h, w = RandomResizedCrop.get_params(
            pil_targets[0], scale=scale, ratio=ratio
        )
    else:
        i, j, h, w = RandomResizedCrop.get_params(
            pil_targets[0], scale=(1.0, 1.0), ratio=(1.0, 1.0)
        )

    modes = (
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.BICUBIC,
        InterpolationMode.NEAREST,
    )
    cropped = [
        resized_crop(target, i, j, h, w, (size, size), interpolation=mode)
        for target, mode in zip(pil_targets, modes)
    ]
    return [np.array(target) for target in cropped]
|
||||
|
||||
|
||||
def random_crop(augment_targets, size):
    """Cut the same random ``size`` x ``size`` window from image and maps.

    :param augment_targets: ``[image, region_score, affinity_score, confidence_mask]``
    :return: a new list with the four cropped arrays in the same order
    """
    image, region_score, affinity_score, confidence_mask = augment_targets
    pil_targets = [
        Image.fromarray(target)
        for target in (image, region_score, affinity_score, confidence_mask)
    ]

    i, j, h, w = RandomCrop.get_params(pil_targets[0], output_size=(size, size))

    cropped = [crop(target, i, j, h, w) for target in pil_targets]
    return [np.array(target) for target in cropped]
|
||||
|
||||
|
||||
def random_horizontal_flip(imgs):
    """Flip every array along axis 1 when one random draw is below 0.5.

    The list is updated in place and also returned.
    """
    if random.random() >= 0.5:
        return imgs
    for position, img in enumerate(imgs):
        imgs[position] = np.flip(img, axis=1).copy()
    return imgs
|
||||
|
||||
|
||||
def random_scale(images, word_level_char_bbox, scale_range):
    """Resize all images by one factor picked at random from ``scale_range``.

    Both ``images`` and ``word_level_char_bbox`` are updated in place; the
    boxes are multiplied by the same factor. Only ``images`` is returned.
    """
    (factor,) = random.sample(scale_range, 1)

    for position, img in enumerate(images):
        images[position] = cv2.resize(img, dsize=None, fx=factor, fy=factor)

    for position in range(len(word_level_char_bbox)):
        word_level_char_bbox[position] *= factor

    return images
|
||||
|
||||
|
||||
def random_rotate(images, max_angle):
    """Rotate every array about its centre by one shared random angle.

    The angle is drawn uniformly from ``[-max_angle, max_angle)``. The last
    entry of ``images`` is warped with nearest-neighbour interpolation; the
    others use ``cv2.warpAffine``'s default. The list is updated in place and
    returned.
    """
    angle = random.random() * 2 * max_angle - max_angle
    last = len(images) - 1
    for position, img in enumerate(images):
        rows, cols = img.shape[:2]
        # OpenCV takes the centre and output size as (x, y) / (width, height).
        rotation_matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
        if position == last:
            rotated = cv2.warpAffine(
                img, M=rotation_matrix, dsize=(cols, rows), flags=cv2.INTER_NEAREST
            )
        else:
            rotated = cv2.warpAffine(img, rotation_matrix, (cols, rows))
        images[position] = rotated
    return images
|
||||
91
trainer/craft/data/imgproc.py
Normal file
91
trainer/craft/data/imgproc.py
Normal file
@@ -0,0 +1,91 @@
|
||||
"""
|
||||
Copyright (c) 2019-present NAVER Corp.
|
||||
MIT License
|
||||
"""
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
import numpy as np
|
||||
|
||||
import cv2
|
||||
from skimage import io
|
||||
|
||||
|
||||
def loadImage(img_file):
    """Read ``img_file`` with skimage and return it as a 3-channel array.

    Grayscale images are expanded to three channels and a fourth channel
    (e.g. alpha) is dropped.
    """
    img = io.imread(img_file)  # RGB order

    # NOTE(review): a leading dimension of 2 is treated as "two stacked
    # images, keep the first" - presumably multi-frame files; confirm.
    if img.shape[0] == 2:
        img = img[0]

    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)

    if img.shape[2] == 4:
        img = img[:, :, :3]

    return np.array(img)
|
||||
|
||||
|
||||
def normalizeMeanVariance(
    in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)
):
    """Return a float32 copy of ``in_img`` normalised per channel.

    Each channel ``c`` becomes ``(x - mean[c] * 255) / (variance[c] * 255)``.
    The input array is left untouched.
    """
    # should be RGB order
    offset = np.array([mean[c] * 255.0 for c in range(3)], dtype=np.float32)
    divisor = np.array([variance[c] * 255.0 for c in range(3)], dtype=np.float32)

    img = in_img.copy().astype(np.float32)
    img = (img - offset) / divisor
    return img
|
||||
|
||||
|
||||
def denormalizeMeanVariance(
    in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)
):
    """Map a normalised image back to uint8 pixel values.

    Computes ``(x * variance + mean) * 255`` per channel on a copy of
    ``in_img``, clips the result to ``[0, 255]`` and casts to uint8.
    """
    # should be RGB order
    img = in_img.copy()
    # Arithmetic is done in place on the copy so the working dtype stays
    # that of the input.
    np.multiply(img, variance, out=img)
    np.add(img, mean, out=img)
    np.multiply(img, 255.0, out=img)
    return np.clip(img, 0, 255).astype(np.uint8)
|
||||
|
||||
|
||||
def resize_aspect_ratio(img, square_size, interpolation, mag_ratio=1):
    """Resize ``img`` keeping its aspect ratio and pad it to multiples of 32.

    The longer side is scaled to ``mag_ratio * max(height, width)``, capped at
    ``square_size``. The resized image is pasted into the top-left corner of a
    zero-filled float32 canvas whose height and width are rounded up to the
    next multiple of 32.

    Returns:
        (canvas, ratio, valid_size_heatmap) where ``ratio`` is the applied
        scale factor and ``valid_size_heatmap`` is half the resized
        (height, width), truncated to int.
    """
    height, width, channel = img.shape
    longest_side = max(height, width)

    # magnify image size, but never beyond square_size
    target_size = min(mag_ratio * longest_side, square_size)
    ratio = target_size / longest_side

    target_h = int(height * ratio)
    target_w = int(width * ratio)

    # NOTE(review): half the resized size - presumably the score map is
    # produced at half resolution; confirm against the network.
    valid_size_heatmap = (int(target_h / 2), int(target_w / 2))

    proc = cv2.resize(img, (target_w, target_h), interpolation=interpolation)

    # make canvas and paste image; (-x) % 32 is the padding up to the next
    # multiple of 32 (0 when x is already a multiple)
    target_h32 = target_h + (-target_h) % 32
    target_w32 = target_w + (-target_w) % 32
    resized = np.zeros((target_h32, target_w32, channel), dtype=np.float32)
    resized[0:target_h, 0:target_w, :] = proc

    return resized, ratio, valid_size_heatmap
|
||||
|
||||
|
||||
def cvt2HeatmapImg(img):
    """Render a score map as a JET-coloured uint8 image.

    Values are clipped to ``[0, 1]`` and scaled to ``0..255`` before the
    colour map is applied.
    """
    clipped = np.clip(img, 0, 1)
    gray = (clipped * 255).astype(np.uint8)
    return cv2.applyColorMap(gray, cv2.COLORMAP_JET)
|
||||
263
trainer/craft/data/pseudo_label/make_charbox.py
Normal file
263
trainer/craft/data/pseudo_label/make_charbox.py
Normal file
@@ -0,0 +1,263 @@
|
||||
import os
|
||||
import random
|
||||
import math
|
||||
|
||||
import numpy as np
|
||||
import cv2
|
||||
import torch
|
||||
|
||||
from data import imgproc
|
||||
from data.pseudo_label.watershed import exec_watershed_by_version
|
||||
|
||||
|
||||
class PseudoCharBoxBuilder:
    """Build pseudo character boxes for a word from a network's region score.

    ``build_char_box`` is the entry point: it warps the word out of the image,
    runs the network on the crop, segments the region score into per-character
    boxes and maps those boxes back into the original image coordinates.
    """

    def __init__(self, watershed_param, vis_test_dir, pseudo_vis_opt, gaussian_builder):
        self.watershed_param = watershed_param
        self.vis_test_dir = vis_test_dir
        self.pseudo_vis_opt = pseudo_vis_opt
        self.gaussian_builder = gaussian_builder
        self.cnt = 0
        # Set by crop_image_by_bbox: True when the last cropped word was
        # classified as vertical. build_char_box only visualises when set.
        self.flag = False

    def crop_image_by_bbox(self, image, box, word):
        """Warp the quadrilateral ``box`` out of ``image`` into an upright strip.

        The word is classified as vertical or horizontal from the box's
        height/width ratio and the per-character aspect ratio; vertical words
        are mapped onto rotated target corners so the output strip has the
        long side as its width.

        Returns:
            (warped, M, horizontal_text_bool): the warped crop, the 3x3
            perspective matrix used, and whether the word was treated as
            horizontal.

        Raises:
            ValueError: if the box has zero width, so no ratio can be computed.
        """
        w = max(
            int(np.linalg.norm(box[0] - box[1])), int(np.linalg.norm(box[2] - box[3]))
        )
        h = max(
            int(np.linalg.norm(box[0] - box[3])), int(np.linalg.norm(box[1] - box[2]))
        )
        if w == 0:
            raise ValueError(
                f"Cannot crop word {word!r}: bounding box has zero width."
            )

        word_ratio = h / w
        one_char_ratio = min(h, w) / (max(h, w) / len(word))

        # NOTE: criterion to split vertical word in here is set to work properly on IC15 dataset
        is_vertical = word_ratio > 2 or (word_ratio > 1.6 and one_char_ratio > 2.4)

        if is_vertical:
            # warping method of vertical word (classified by upper condition)
            long_side, short_side = h, w
            target_corners = [
                [long_side, 0],
                [long_side, short_side],
                [0, short_side],
                [0, 0],
            ]
        else:
            # warping method of horizontal word
            long_side, short_side = w, h
            target_corners = [
                [0, 0],
                [long_side, 0],
                [long_side, short_side],
                [0, short_side],
            ]

        M = cv2.getPerspectiveTransform(
            np.float32(box), np.float32(np.array(target_corners))
        )
        self.flag = is_vertical
        horizontal_text_bool = not is_vertical

        warped = cv2.warpPerspective(image, M, (long_side, short_side))
        return warped, M, horizontal_text_bool

    def inference_word_box(self, net, gpu, word_image):
        """Run ``net`` on a single word crop and return its first output.

        The network is switched to eval mode if it was training; it is not
        switched back here. The crop is normalised, moved to CUDA device
        ``gpu`` and evaluated under autocast without gradients.
        """
        if net.training:
            net.eval()

        with torch.no_grad():
            word_img_torch = torch.from_numpy(
                imgproc.normalizeMeanVariance(
                    word_image,
                    mean=(0.485, 0.456, 0.406),
                    variance=(0.229, 0.224, 0.225),
                )
            )
            # HWC -> 1 x C x H x W
            word_img_torch = word_img_torch.permute(2, 0, 1).unsqueeze(0)
            word_img_torch = word_img_torch.type(torch.FloatTensor).cuda(gpu)
            with torch.cuda.amp.autocast():
                word_img_scores, _ = net(word_img_torch)
        return word_img_scores

    def visualize_pseudo_label(
        self, word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
    ):
        """Write a side-by-side debug image of the pseudo-label stages.

        The panels are: the crop, the colour-mapped region score, the crop
        with watershed boxes, the crop with final pseudo boxes, a Gaussian
        region map generated from the pseudo boxes, and an overlay of that map
        on the crop. The file is written into ``self.vis_test_dir``.
        """
        word_img_h, word_img_w, _ = word_image.shape
        word_img_cp1 = word_image.copy()
        word_img_cp2 = word_image.copy()
        _watershed_box = np.int32(watershed_box)
        _pseudo_char_bbox = np.int32(pseudo_char_bbox)

        region_score_color = cv2.applyColorMap(np.uint8(region_score), cv2.COLORMAP_JET)
        region_score_color = cv2.resize(region_score_color, (word_img_w, word_img_h))

        # NOTE(review): polylines draws on np.uint8(copy); if word_image is not
        # already uint8 that may be a fresh array and the drawing would not
        # show up in the panel - confirm the crop dtype.
        for box in _watershed_box:
            cv2.polylines(
                np.uint8(word_img_cp1),
                [np.reshape(box, (-1, 1, 2))],
                True,
                (255, 0, 0),
            )

        for box in _pseudo_char_bbox:
            cv2.polylines(
                np.uint8(word_img_cp2), [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0)
            )

        # NOTE: Just for visualize, put gaussian map on char box
        pseudo_gt_region_score = self.gaussian_builder.generate_region(
            word_img_h, word_img_w, [_pseudo_char_bbox], [True]
        )

        pseudo_gt_region_score = cv2.applyColorMap(
            (pseudo_gt_region_score * 255).astype("uint8"), cv2.COLORMAP_JET
        )

        overlay_img = cv2.addWeighted(
            word_image[:, :, ::-1], 0.7, pseudo_gt_region_score, 0.3, 5
        )
        vis_result = np.hstack(
            [
                word_image[:, :, ::-1],
                region_score_color,
                word_img_cp1[:, :, ::-1],
                word_img_cp2[:, :, ::-1],
                pseudo_gt_region_score,
                overlay_img,
            ]
        )

        # Create the directory the file is actually written to (not merely
        # its parent), otherwise the write fails when vis_test_dir has no
        # trailing separator.
        os.makedirs(self.vis_test_dir, exist_ok=True)
        cv2.imwrite(
            os.path.join(
                self.vis_test_dir,
                "{}_{}".format(
                    img_name, f"pseudo_char_bbox_{random.randint(0,100)}.jpg"
                ),
            ),
            vis_result,
        )

    def clip_into_boundary(self, box, bound):
        """Clip box coordinates in place to ``bound`` and return ``box``.

        ``box[..., 0]`` is limited to ``[0, bound[1]]`` and ``box[..., 1]`` to
        ``[0, bound[0]]``; an empty ``box`` is returned unchanged.
        """
        if len(box) == 0:
            return box

        box[:, :, 0] = np.clip(box[:, :, 0], 0, bound[1])
        box[:, :, 1] = np.clip(box[:, :, 1], 0, bound[0])
        return box

    def get_confidence(self, real_len, pseudo_len):
        """Score how well the predicted character count matches the real one.

        Returns ``(real_len - min(real_len, |real_len - pseudo_len|)) / real_len``,
        or 0.0 when either count is zero.
        """
        if pseudo_len == 0 or real_len == 0:
            # Nothing predicted, or nothing to compare against (also avoids
            # dividing by zero for words without countable characters).
            return 0.0
        return (real_len - min(real_len, abs(real_len - pseudo_len))) / real_len

    def split_word_equal_gap(self, word_img_w, word_img_h, word):
        """Split a word crop into equal-width boxes, one per non-space character.

        Returns a float32 array of shape (n, 4, 2) with corners ordered
        top-left, top-right, bottom-right, bottom-left; it is empty when the
        word has no non-space characters.
        """
        if not word:
            return np.array([], np.float32)

        width_per_char = word_img_w / len(word)
        bboxes = []
        for j, char in enumerate(word):
            if char == " ":
                # Spaces keep their slot but do not get a box.
                continue
            left = j * width_per_char
            right = (j + 1) * width_per_char
            bboxes.append(
                np.array(
                    [[left, 0], [right, 0], [right, word_img_h], [left, word_img_h]]
                )
            )

        return np.array(bboxes, np.float32)

    def cal_angle(self, v1):
        """Return the angle of 2-D vector ``v1`` in ``[0, 2*pi)`` radians."""
        # The small epsilon avoids dividing by zero for a zero-length vector.
        theta = np.arccos(min(1, v1[0] / (np.linalg.norm(v1) + 10e-8)))
        return 2 * math.pi - theta if v1[1] < 0 else theta

    def clockwise_sort(self, points):
        """Order four points by increasing angle around their centroid."""
        # returns 4x2 [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] ndarray
        v1, v2, v3, v4 = points
        center = (v1 + v2 + v3 + v4) / 4
        theta = np.array([self.cal_angle(v - center) for v in (v1, v2, v3, v4)])
        index = np.argsort(theta)
        return np.array([v1, v2, v3, v4])[index, :]

    def build_char_box(self, net, gpu, image, word_bbox, word, img_name=""):
        """Produce pseudo character boxes for one word of ``image``.

        The word is cropped and resized to a height of 128, scored by ``net``,
        and its region score is segmented into character boxes. If the
        resulting confidence is at most 0.5 the word is instead split into
        equal-width boxes and the confidence is set to 0.5. Boxes are sorted by
        their first corner's x coordinate, mapped back through the inverse
        perspective transform and clipped to the image.

        Returns:
            (pseudo_char_bbox, confidence, horizontal_text_bool)
        """
        word_image, M, horizontal_text_bool = self.crop_image_by_bbox(
            image, word_bbox, word
        )
        # Count real characters without spaces, consistent with
        # split_word_equal_gap which emits no box for a space.
        real_word_without_space = word.replace(" ", "")
        real_char_len = len(real_word_without_space)

        scale = 128.0 / word_image.shape[0]

        word_image = cv2.resize(word_image, None, fx=scale, fy=scale)
        word_img_h, word_img_w, _ = word_image.shape

        scores = self.inference_word_box(net, gpu, word_image)
        region_score = scores[0, :, :, 0].cpu().data.numpy()
        region_score = np.uint8(np.clip(region_score, 0, 1) * 255)

        pseudo_char_bbox = exec_watershed_by_version(
            self.watershed_param, region_score, word_image, self.pseudo_vis_opt
        )

        # Used for visualize only
        watershed_box = pseudo_char_bbox.copy()

        # Boxes must stay inside the resized word crop.
        pseudo_char_bbox = self.clip_into_boundary(
            pseudo_char_bbox, (word_img_h, word_img_w)
        )

        confidence = self.get_confidence(real_char_len, len(pseudo_char_bbox))

        if confidence <= 0.5:
            pseudo_char_bbox = self.split_word_equal_gap(word_img_w, word_img_h, word)
            confidence = 0.5

        if self.pseudo_vis_opt and self.flag:
            self.visualize_pseudo_label(
                word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
            )

        if len(pseudo_char_bbox) != 0:
            index = np.argsort(pseudo_char_bbox[:, 0, 0])
            pseudo_char_bbox = pseudo_char_bbox[index]

        # Undo the resize, then the perspective warp, to get back to the
        # coordinate frame of the original image.
        pseudo_char_bbox /= scale

        M_inv = np.linalg.pinv(M)
        for i in range(len(pseudo_char_bbox)):
            pseudo_char_bbox[i] = cv2.perspectiveTransform(
                pseudo_char_bbox[i][None, :, :], M_inv
            )

        pseudo_char_bbox = self.clip_into_boundary(pseudo_char_bbox, image.shape)

        return pseudo_char_bbox, confidence, horizontal_text_bool
|
||||
45
trainer/craft/data/pseudo_label/watershed.py
Normal file
45
trainer/craft/data/pseudo_label/watershed.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import cv2
|
||||
import numpy as np
|
||||
from skimage.segmentation import watershed
|
||||
|
||||
|
||||
def segment_region_score(watershed_param, region_score, word_image, pseudo_vis_opt):
    """Split a 0-255 region score map into per-character boxes via watershed.

    Pixels scoring above 0.75 (after scaling to 0-1) seed the markers, pixels
    below 0.05 are background, and everything in between is left for the
    watershed to assign. One axis-aligned box is returned per seed component,
    with coordinates doubled.

    Returns:
        float32 array of boxes, each as four corners ordered top-left,
        top-right, bottom-right, bottom-left.
    """
    score = np.float32(region_score) / 255
    sure_fg = np.uint8(score > 0.75)
    sure_bg = np.uint8(score < 0.05)
    undecided = 1 - (sure_fg + sure_bg)

    num_labels, markers = cv2.connectedComponents(sure_fg)
    # Shift labels so background becomes 1 and 0 is free to mean "unknown".
    markers += 1
    markers[undecided == 1] = 0

    labels = watershed(-score, markers)

    boxes = []
    # After the shift, foreground components carry labels 2..num_labels.
    for label in range(2, num_labels + 1):
        ys, xs = np.where(labels == label)
        left, right = xs.min(), xs.max()
        top, bottom = ys.min(), ys.max()
        corners = np.array(
            [[left, top], [right, top], [right, bottom], [left, bottom]]
        )
        # NOTE(review): doubling presumably maps half-resolution score
        # coordinates back to the word image - confirm.
        boxes.append(corners * 2)

    return np.array(boxes, dtype=np.float32)
|
||||
|
||||
|
||||
def exec_watershed_by_version(
    watershed_param, region_score, word_image, pseudo_vis_opt
):
    """Dispatch to the watershed implementation named by ``watershed_param.version``.

    Only the version lookup is guarded: errors raised by the selected
    segmentation function propagate unchanged instead of being reported as a
    missing version.

    Raises:
        ValueError: if ``watershed_param.version`` has no registered
            implementation.
    """
    func_name_map_dict = {
        "skimage": segment_region_score,
    }

    version = watershed_param.version
    try:
        segment_fn = func_name_map_dict[version]
    except KeyError:
        raise ValueError(
            f"Watershed version {version} does not exist in func_name_map_dict."
        ) from None

    return segment_fn(watershed_param, region_score, word_image, pseudo_vis_opt)
|
||||
Reference in New Issue
Block a user