Files
easyocr/trainer/craft/data/pseudo_label/make_charbox.py
2025-07-10 19:42:57 +08:00

264 lines
8.8 KiB
Python

import os
import random
import math
import numpy as np
import cv2
import torch
from data import imgproc
from data.pseudo_label.watershed import exec_watershed_by_version
class PseudoCharBoxBuilder:
    """Generate character-level pseudo boxes for an annotated word box.

    The word region is rectified with a perspective warp, scored by the given
    network, split into character boxes by the watershed routine, and the
    resulting boxes are mapped back into the coordinates of the source image.
    """

    def __init__(self, watershed_param, vis_test_dir, pseudo_vis_opt, gaussian_builder):
        """Store the configuration used by the other methods.

        Args:
            watershed_param: forwarded untouched to ``exec_watershed_by_version``.
            vis_test_dir: directory that visualization images are written into.
            pseudo_vis_opt: truthy to enable visualization output.
            gaussian_builder: object providing ``generate_region``; only used
                when rendering visualizations.
        """
        self.watershed_param = watershed_param
        self.vis_test_dir = vis_test_dir
        self.pseudo_vis_opt = pseudo_vis_opt
        self.gaussian_builder = gaussian_builder
        # Not read anywhere inside this class.
        self.cnt = 0
        # True when the most recent crop was classified as a vertical word;
        # build_char_box only visualizes when this is set.
        self.flag = False

    def crop_image_by_bbox(self, image, box, word):
        """Rectify the quadrilateral ``box`` of ``image`` into an upright crop.

        Args:
            image: source image handed to ``cv2.warpPerspective``.
            box: four corner points supporting numpy arithmetic
                (``box[i] - box[j]``).
            word: transcription of the box; its length feeds the
                vertical-word heuristic.

        Returns:
            ``(warped, M, horizontal_text_bool)`` where ``M`` is the 3x3
            perspective matrix used for the warp and ``horizontal_text_bool``
            is False when the word was classified as vertical.

        Raises:
            ValueError: if the box has zero width or height after integer
                truncation, which makes the warp undefined.
        """
        w = max(
            int(np.linalg.norm(box[0] - box[1])), int(np.linalg.norm(box[2] - box[3]))
        )
        h = max(
            int(np.linalg.norm(box[0] - box[3])), int(np.linalg.norm(box[1] - box[2]))
        )
        if w == 0 or h == 0:
            # Previously this case dropped into an interactive debugger and
            # then failed on an undefined variable; fail loudly instead.
            raise ValueError(
                "Degenerate word box (width={}, height={}) for word {!r}".format(
                    w, h, word
                )
            )

        word_ratio = h / w
        # Guard against an empty transcription so the ratio stays defined.
        num_chars = max(len(word), 1)
        one_char_ratio = min(h, w) / (max(h, w) / num_chars)

        # NOTE: criterion to split vertical word in here is set to work properly on IC15 dataset
        is_vertical = word_ratio > 2 or (word_ratio > 1.6 and one_char_ratio > 2.4)
        if is_vertical:
            # Vertical word: the destination corners are rotated so the long
            # side of the box lies along the x axis of the crop.
            long_side, short_side = h, w
            target = [
                [long_side, 0],
                [long_side, short_side],
                [0, short_side],
                [0, 0],
            ]
        else:
            # Horizontal word: corners map straight onto the crop rectangle.
            long_side, short_side = w, h
            target = [
                [0, 0],
                [long_side, 0],
                [long_side, short_side],
                [0, short_side],
            ]

        M = cv2.getPerspectiveTransform(np.float32(box), np.float32(np.array(target)))
        self.flag = is_vertical
        horizontal_text_bool = not is_vertical

        warped = cv2.warpPerspective(image, M, (long_side, short_side))
        return warped, M, horizontal_text_bool

    def inference_word_box(self, net, gpu, word_image):
        """Run ``net`` on a single word crop and return its first output.

        Side effect: ``net`` is switched to eval mode if it was training and
        is NOT switched back.

        Args:
            net: callable module returning a 2-tuple; only the first element
                is used.
            gpu: device argument passed to ``Tensor.cuda``.
            word_image: 3-D array whose last axis is moved to the front before
                inference.
        """
        if net.training:
            net.eval()

        with torch.no_grad():
            word_img_torch = torch.from_numpy(
                imgproc.normalizeMeanVariance(
                    word_image,
                    mean=(0.485, 0.456, 0.406),
                    variance=(0.229, 0.224, 0.225),
                )
            )
            # Last axis first, then add a leading batch axis.
            word_img_torch = word_img_torch.permute(2, 0, 1).unsqueeze(0)
            word_img_torch = word_img_torch.type(torch.FloatTensor).cuda(gpu)
            with torch.cuda.amp.autocast():
                word_img_scores, _ = net(word_img_torch)
        return word_img_scores

    def visualize_pseudo_label(
        self, word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
    ):
        """Write a side-by-side debug image for one word crop.

        The panels are, left to right: the crop, the colorized region score,
        the crop with watershed boxes, the crop with final pseudo boxes, a
        gaussian map generated from the pseudo boxes, and that map blended
        over the crop. The file is written under ``self.vis_test_dir``.
        """
        word_img_h, word_img_w, _ = word_image.shape
        word_img_cp1 = word_image.copy()
        word_img_cp2 = word_image.copy()
        _watershed_box = np.int32(watershed_box)
        _pseudo_char_bbox = np.int32(pseudo_char_bbox)

        region_score_color = cv2.applyColorMap(np.uint8(region_score), cv2.COLORMAP_JET)
        region_score_color = cv2.resize(region_score_color, (word_img_w, word_img_h))

        # NOTE(review): polylines draws on np.uint8(copy); the boxes only show
        # up in the saved image if that call returns the same buffer, i.e. the
        # crop is already uint8 - confirm against callers.
        for box in _watershed_box:
            cv2.polylines(
                np.uint8(word_img_cp1),
                [np.reshape(box, (-1, 1, 2))],
                True,
                (255, 0, 0),
            )
        for box in _pseudo_char_bbox:
            cv2.polylines(
                np.uint8(word_img_cp2), [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0)
            )

        # NOTE: Just for visualize, put gaussian map on char box
        pseudo_gt_region_score = self.gaussian_builder.generate_region(
            word_img_h, word_img_w, [_pseudo_char_bbox], [True]
        )
        pseudo_gt_region_score = cv2.applyColorMap(
            (pseudo_gt_region_score * 255).astype("uint8"), cv2.COLORMAP_JET
        )

        overlay_img = cv2.addWeighted(
            word_image[:, :, ::-1], 0.7, pseudo_gt_region_score, 0.3, 5
        )
        vis_result = np.hstack(
            [
                word_image[:, :, ::-1],
                region_score_color,
                word_img_cp1[:, :, ::-1],
                word_img_cp2[:, :, ::-1],
                pseudo_gt_region_score,
                overlay_img,
            ]
        )

        # Create the directory the file is written into (not merely its
        # parent); exist_ok avoids a race between checking and creating.
        if self.vis_test_dir:
            os.makedirs(self.vis_test_dir, exist_ok=True)

        cv2.imwrite(
            os.path.join(
                self.vis_test_dir,
                "{}_{}".format(
                    img_name, f"pseudo_char_bbox_{random.randint(0,100)}.jpg"
                ),
            ),
            vis_result,
        )

    def clip_into_boundary(self, box, bound):
        """Clamp box coordinates in place to ``[0, bound[1]]`` x ``[0, bound[0]]``.

        Args:
            box: array indexed as ``box[:, :, 0]`` (x) and ``box[:, :, 1]``
                (y), or any empty sequence, which is returned untouched.
            bound: sequence whose first two entries are the y and x limits.

        Returns:
            The same ``box`` object, modified in place.
        """
        if len(box) == 0:
            return box
        box[:, :, 0] = np.clip(box[:, :, 0], 0, bound[1])
        box[:, :, 1] = np.clip(box[:, :, 1], 0, bound[0])
        return box

    def get_confidence(self, real_len, pseudo_len):
        """Score how well the predicted box count matches the character count.

        Returns ``(real_len - min(real_len, |real_len - pseudo_len|)) / real_len``,
        and 0.0 when either count is zero (the latter also avoids a division
        by zero for words without any non-space character).
        """
        if pseudo_len == 0 or real_len == 0:
            return 0.0
        return (real_len - min(real_len, abs(real_len - pseudo_len))) / real_len

    def split_word_equal_gap(self, word_img_w, word_img_h, word):
        """Split the crop into equal-width boxes, one per non-space character.

        Every character of ``word`` (spaces included) occupies one slot of
        width ``word_img_w / len(word)``; slots belonging to a space produce
        no box.

        Returns:
            float32 array of shape ``(num_boxes, 4, 2)`` with corners ordered
            top-left, top-right, bottom-right, bottom-left. ``num_boxes`` is 0
            for an empty or all-space word.
        """
        if not word:
            return np.zeros((0, 4, 2), np.float32)

        width_per_char = word_img_w / len(word)
        bboxes = []
        for j, char in enumerate(word):
            if char == " ":
                continue
            left = j * width_per_char
            right = (j + 1) * width_per_char
            bboxes.append(
                [[left, 0], [right, 0], [right, word_img_h], [left, word_img_h]]
            )
        # reshape keeps the (N, 4, 2) layout even when no box was produced.
        return np.array(bboxes, np.float32).reshape(-1, 4, 2)

    def cal_angle(self, v1):
        """Return the angle of 2-D vector ``v1`` in ``[0, 2*pi)``.

        The angle is measured from the +x axis and grows toward +y. The small
        epsilon keeps the division defined for a zero vector.
        """
        theta = np.arccos(min(1, v1[0] / (np.linalg.norm(v1) + 10e-8)))
        return 2 * math.pi - theta if v1[1] < 0 else theta

    def clockwise_sort(self, points):
        """Order four points by ascending ``cal_angle`` around their centroid.

        Returns:
            4x2 ndarray ``[[x1, y1], [x2, y2], [x3, y3], [x4, y4]]``.
        """
        v1, v2, v3, v4 = points
        center = (v1 + v2 + v3 + v4) / 4
        theta = np.array([self.cal_angle(v - center) for v in (v1, v2, v3, v4)])
        index = np.argsort(theta)
        return np.array([v1, v2, v3, v4])[index, :]

    def build_char_box(self, net, gpu, image, word_bbox, word, img_name=""):
        """Produce pseudo character boxes for one word of ``image``.

        Args:
            net: network passed to ``inference_word_box``.
            gpu: device argument passed to ``inference_word_box``.
            image: full source image.
            word_bbox: four corner points of the word (see
                ``crop_image_by_bbox``).
            word: transcription of the word.
            img_name: prefix for the visualization file name.

        Returns:
            ``(pseudo_char_bbox, confidence, horizontal_text_bool)``. The boxes
            are in ``image`` coordinates, sorted by the x of their first
            corner in the rectified crop. When the watershed result scores a
            confidence of 0.5 or less, equal-width boxes are used instead and
            the confidence is fixed at 0.5.

        Raises:
            ValueError: propagated from ``crop_image_by_bbox`` for a
                degenerate word box.
        """
        word_image, M, horizontal_text_bool = self.crop_image_by_bbox(
            image, word_bbox, word
        )
        # Count only non-space characters, matching split_word_equal_gap.
        # (The former "\s" pattern was a literal backslash-s and removed nothing.)
        real_word_without_space = word.replace(" ", "")
        real_char_len = len(real_word_without_space)

        # Resize the crop so its height is about 128 before inference.
        scale = 128.0 / word_image.shape[0]
        word_image = cv2.resize(word_image, None, fx=scale, fy=scale)
        word_img_h, word_img_w, _ = word_image.shape

        scores = self.inference_word_box(net, gpu, word_image)
        region_score = scores[0, :, :, 0].cpu().data.numpy()
        region_score = np.uint8(np.clip(region_score, 0, 1) * 255)

        # Only the shape of this resized map is used, as the clipping bound.
        region_score_rgb = cv2.resize(region_score, (word_img_w, word_img_h))
        region_score_rgb = cv2.cvtColor(region_score_rgb, cv2.COLOR_GRAY2RGB)

        pseudo_char_bbox = exec_watershed_by_version(
            self.watershed_param, region_score, word_image, self.pseudo_vis_opt
        )

        # Used for visualize only
        watershed_box = pseudo_char_bbox.copy()

        pseudo_char_bbox = self.clip_into_boundary(
            pseudo_char_bbox, region_score_rgb.shape
        )

        confidence = self.get_confidence(real_char_len, len(pseudo_char_bbox))
        if confidence <= 0.5:
            pseudo_char_bbox = self.split_word_equal_gap(word_img_w, word_img_h, word)
            confidence = 0.5

        # self.flag was set by crop_image_by_bbox: only vertical words are drawn.
        if self.pseudo_vis_opt and self.flag:
            self.visualize_pseudo_label(
                word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
            )

        if len(pseudo_char_bbox) != 0:
            index = np.argsort(pseudo_char_bbox[:, 0, 0])
            pseudo_char_bbox = pseudo_char_bbox[index]

        # Undo the resize, then undo the perspective warp.
        pseudo_char_bbox /= scale

        M_inv = np.linalg.pinv(M)
        for i in range(len(pseudo_char_bbox)):
            pseudo_char_bbox[i] = cv2.perspectiveTransform(
                pseudo_char_bbox[i][None, :, :], M_inv
            )

        pseudo_char_bbox = self.clip_into_boundary(pseudo_char_bbox, image.shape)

        return pseudo_char_bbox, confidence, horizontal_text_bool