import math
import os
import random
import re

import cv2
import numpy as np
import torch

from data import imgproc
from data.pseudo_label.watershed import exec_watershed_by_version


class PseudoCharBoxBuilder:
    """Build pseudo character-level boxes from word-level annotations.

    Each word quadrilateral is warped out of the full image, the current
    network predicts a region-score map for the crop, and a watershed
    transform splits that map into per-character boxes.  When the
    watershed result disagrees too much with the known word length, the
    word is split into equal-width boxes as a low-confidence fallback.
    """

    def __init__(self, watershed_param, vis_test_dir, pseudo_vis_opt, gaussian_builder):
        self.watershed_param = watershed_param
        self.vis_test_dir = vis_test_dir
        self.pseudo_vis_opt = pseudo_vis_opt
        self.gaussian_builder = gaussian_builder
        self.cnt = 0
        # Set by crop_image_by_bbox: True when the word was classified vertical.
        self.flag = False

    def crop_image_by_bbox(self, image, box, word):
        """Warp the quadrilateral ``box`` out of ``image`` into an upright crop.

        Vertical words (detected by aspect-ratio heuristics) are rotated
        during the warp so characters run left-to-right in the crop.

        Returns:
            (warped_image, perspective_matrix, horizontal_text_bool)

        Raises:
            ValueError: if the box has zero width and cannot be warped.
        """
        w = max(
            int(np.linalg.norm(box[0] - box[1])), int(np.linalg.norm(box[2] - box[3]))
        )
        h = max(
            int(np.linalg.norm(box[0] - box[3])), int(np.linalg.norm(box[1] - box[2]))
        )
        if w == 0:
            # A degenerate (zero-width) box cannot be rectified; fail loudly
            # instead of dropping into an interactive debugger as before.
            raise ValueError(f"Degenerate word box with zero width: {box}")
        word_ratio = h / w
        one_char_ratio = min(h, w) / (max(h, w) / len(word))

        # NOTE: criterion to split vertical word in here is set to work properly
        # on IC15 dataset.
        if word_ratio > 2 or (word_ratio > 1.6 and one_char_ratio > 2.4):
            # Warping method of vertical word (classified by upper condition):
            # destination corners are rotated so the text reads horizontally.
            horizontal_text_bool = False
            long_side, short_side = h, w
            dst = np.float32(
                [
                    [long_side, 0],
                    [long_side, short_side],
                    [0, short_side],
                    [0, 0],
                ]
            )
            self.flag = True
        else:
            # Warping method of horizontal word: plain rectification.
            horizontal_text_bool = True
            long_side, short_side = w, h
            dst = np.float32(
                [
                    [0, 0],
                    [long_side, 0],
                    [long_side, short_side],
                    [0, short_side],
                ]
            )
            self.flag = False

        M = cv2.getPerspectiveTransform(np.float32(box), dst)
        warped = cv2.warpPerspective(image, M, (long_side, short_side))
        return warped, M, horizontal_text_bool

    def inference_word_box(self, net, gpu, word_image):
        """Run ``net`` on a single word crop and return the raw score tensor."""
        if net.training:
            net.eval()
        with torch.no_grad():
            word_img_torch = torch.from_numpy(
                imgproc.normalizeMeanVariance(
                    word_image,
                    mean=(0.485, 0.456, 0.406),
                    variance=(0.229, 0.224, 0.225),
                )
            )
            word_img_torch = word_img_torch.permute(2, 0, 1).unsqueeze(0)
            word_img_torch = word_img_torch.type(torch.FloatTensor).cuda(gpu)
            with torch.cuda.amp.autocast():
                word_img_scores, _ = net(word_img_torch)
        return word_img_scores

    def visualize_pseudo_label(
        self,
        word_image,
        region_score,
        watershed_box,
        pseudo_char_bbox,
        img_name,
    ):
        """Save a side-by-side debug image of the pseudo-labeling stages."""
        word_img_h, word_img_w, _ = word_image.shape
        word_img_cp1 = word_image.copy()
        word_img_cp2 = word_image.copy()

        _watershed_box = np.int32(watershed_box)
        _pseudo_char_bbox = np.int32(pseudo_char_bbox)

        region_score_color = cv2.applyColorMap(np.uint8(region_score), cv2.COLORMAP_JET)
        region_score_color = cv2.resize(region_score_color, (word_img_w, word_img_h))

        # Draw directly on the copies; the previous code drew on throwaway
        # np.uint8(...) temporaries, so the boxes never showed up in the output.
        for box in _watershed_box:
            cv2.polylines(
                word_img_cp1,
                [np.reshape(box, (-1, 1, 2))],
                True,
                (255, 0, 0),
            )
        for box in _pseudo_char_bbox:
            cv2.polylines(
                word_img_cp2, [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0)
            )

        # NOTE: Just for visualize, put gaussian map on char box
        pseudo_gt_region_score = self.gaussian_builder.generate_region(
            word_img_h, word_img_w, [_pseudo_char_bbox], [True]
        )
        pseudo_gt_region_score = cv2.applyColorMap(
            (pseudo_gt_region_score * 255).astype("uint8"), cv2.COLORMAP_JET
        )
        overlay_img = cv2.addWeighted(
            word_image[:, :, ::-1], 0.7, pseudo_gt_region_score, 0.3, 5
        )

        vis_result = np.hstack(
            [
                word_image[:, :, ::-1],
                region_score_color,
                word_img_cp1[:, :, ::-1],
                word_img_cp2[:, :, ::-1],
                pseudo_gt_region_score,
                overlay_img,
            ]
        )

        # The output file is written *into* vis_test_dir, so that directory
        # itself (not merely its parent) must exist.
        os.makedirs(self.vis_test_dir, exist_ok=True)
        cv2.imwrite(
            os.path.join(
                self.vis_test_dir,
                "{}_{}".format(
                    img_name, f"pseudo_char_bbox_{random.randint(0,100)}.jpg"
                ),
            ),
            vis_result,
        )

    def clip_into_boundary(self, box, bound):
        """Clamp box coordinates into the image bound (bound = (h, w, ...))."""
        if len(box) == 0:
            return box
        box[:, :, 0] = np.clip(box[:, :, 0], 0, bound[1])  # x into [0, w]
        box[:, :, 1] = np.clip(box[:, :, 1], 0, bound[0])  # y into [0, h]
        return box

    def get_confidence(self, real_len, pseudo_len):
        """Confidence in [0, 1] of the pseudo split, per the CRAFT paper's
        length-agreement formula; 0.0 when no pseudo boxes were found."""
        if pseudo_len == 0:
            return 0.0
        return (real_len - min(real_len, abs(real_len - pseudo_len))) / real_len

    def split_word_equal_gap(self, word_img_w, word_img_h, word):
        """Fallback split: one equal-width, full-height box per non-space char."""
        width = word_img_w
        height = word_img_h
        width_per_char = width / len(word)
        bboxes = []
        for j, char in enumerate(word):
            if char == " ":
                continue
            left = j * width_per_char
            right = (j + 1) * width_per_char
            bbox = np.array([[left, 0], [right, 0], [right, height], [left, height]])
            bboxes.append(bbox)
        return np.array(bboxes, np.float32)

    def cal_angle(self, v1):
        """Angle of vector ``v1`` in [0, 2*pi), measured clockwise from +x."""
        theta = np.arccos(min(1, v1[0] / (np.linalg.norm(v1) + 10e-8)))
        return 2 * math.pi - theta if v1[1] < 0 else theta

    def clockwise_sort(self, points):
        """Sort the 4 corner points clockwise around their centroid.

        ``points`` is a 4x2 [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] ndarray.
        """
        v1, v2, v3, v4 = points
        center = (v1 + v2 + v3 + v4) / 4
        theta = np.array(
            [
                self.cal_angle(v1 - center),
                self.cal_angle(v2 - center),
                self.cal_angle(v3 - center),
                self.cal_angle(v4 - center),
            ]
        )
        index = np.argsort(theta)
        return np.array([v1, v2, v3, v4])[index, :]

    def build_char_box(self, net, gpu, image, word_bbox, word, img_name=""):
        """Produce pseudo char boxes for one word.

        Returns (pseudo_char_bbox in original-image coords, confidence,
        horizontal_text_bool).
        """
        word_image, M, horizontal_text_bool = self.crop_image_by_bbox(
            image, word_bbox, word
        )

        # Strip whitespace before counting characters; the old
        # word.replace("\s", "") removed the literal backslash-s sequence
        # (never present), so spaces were wrongly counted as characters.
        real_word_without_space = re.sub(r"\s", "", word)
        real_char_len = len(real_word_without_space)

        # Normalize crop height to 128 px before inference.
        scale = 128.0 / word_image.shape[0]
        word_image = cv2.resize(word_image, None, fx=scale, fy=scale)
        word_img_h, word_img_w, _ = word_image.shape

        scores = self.inference_word_box(net, gpu, word_image)
        region_score = scores[0, :, :, 0].cpu().data.numpy()
        region_score = np.uint8(np.clip(region_score, 0, 1) * 255)
        region_score_rgb = cv2.resize(region_score, (word_img_w, word_img_h))
        region_score_rgb = cv2.cvtColor(region_score_rgb, cv2.COLOR_GRAY2RGB)

        pseudo_char_bbox = exec_watershed_by_version(
            self.watershed_param, region_score, word_image, self.pseudo_vis_opt
        )

        # Used for visualize only
        watershed_box = pseudo_char_bbox.copy()

        pseudo_char_bbox = self.clip_into_boundary(
            pseudo_char_bbox, region_score_rgb.shape
        )

        confidence = self.get_confidence(real_char_len, len(pseudo_char_bbox))
        if confidence <= 0.5:
            # Watershed disagrees with the word length; fall back to an
            # equal-gap split with the paper's floor confidence of 0.5.
            pseudo_char_bbox = self.split_word_equal_gap(word_img_w, word_img_h, word)
            confidence = 0.5

        if self.pseudo_vis_opt and self.flag:
            self.visualize_pseudo_label(
                word_image,
                region_score,
                watershed_box,
                pseudo_char_bbox,
                img_name,
            )

        if len(pseudo_char_bbox) != 0:
            # Order boxes left-to-right in crop space before unwarping.
            index = np.argsort(pseudo_char_bbox[:, 0, 0])
            pseudo_char_bbox = pseudo_char_bbox[index]

        # Undo the 128-px scaling and the perspective warp to map boxes back
        # into original-image coordinates.
        pseudo_char_bbox /= scale
        M_inv = np.linalg.pinv(M)
        for i in range(len(pseudo_char_bbox)):
            pseudo_char_bbox[i] = cv2.perspectiveTransform(
                pseudo_char_bbox[i][None, :, :], M_inv
            )
        pseudo_char_bbox = self.clip_into_boundary(pseudo_char_bbox, image.shape)

        return pseudo_char_bbox, confidence, horizontal_text_bool