testing dataset

2025-07-10 19:42:57 +08:00
commit 185959cf2a
316 changed files with 19605393 additions and 0 deletions
--- a/trainer/craft/data/boxEnlarge.py
+++ b/trainer/craft/data/boxEnlarge.py
@@ -0,0 +1,65 @@
+import math
+import numpy as np
+
+
+def pointAngle(Apoint, Bpoint):
+    """Return the slope (dy / dx) of line AB; 10e-8 is added to dx so dx == 0 is safe."""
+    return (Bpoint[1] - Apoint[1]) / ((Bpoint[0] - Apoint[0]) + 10e-8)
+
+def pointDistance(Apoint, Bpoint):  # Euclidean distance between two (x, y) points
+    return math.sqrt((Apoint[0] - Bpoint[0]) ** 2 + (Apoint[1] - Bpoint[1]) ** 2)
+
+def lineBiasAndK(Apoint, Bpoint):
+    """Return (K, B): slope and y-intercept of the line through Apoint and Bpoint."""
+    slope = pointAngle(Apoint, Bpoint)
+    intercept = Apoint[1] - slope * Apoint[0]
+    return slope, intercept
+
+def getX(K, B, Ypoint):  # x on the line y = K * x + B at y = Ypoint, truncated to int
+    return int((Ypoint-B)/K)
+
+def sidePoint(Apoint, Bpoint, h, w, placehold, enlarge_size):
+    """Move one box corner outward along segment AB, clamped to the w x h image.
+
+    `placehold` selects the corner ('leftTop', 'rightTop', 'rightBottom' or
+    'leftBottom'); `enlarge_size` is the (x, y) factor applied to |AB|.
+    Returns integer (x, y). Raises ValueError for an unknown `placehold`.
+    """
+    angle = abs(math.atan(pointAngle(Apoint, Bpoint)))
+    distance = pointDistance(Apoint, Bpoint)
+    x_enlarge_size, y_enlarge_size = enlarge_size
+    dx = abs(math.cos(angle) * x_enlarge_size * distance)
+    dy = abs(math.sin(angle) * y_enlarge_size * distance)
+    # Left corners are offset from Apoint, right corners from Bpoint.
+    if placehold == 'leftTop':
+        x1, y1 = max(0, Apoint[0] - dx), max(0, Apoint[1] - dy)
+    elif placehold == 'rightTop':
+        x1, y1 = min(w, Bpoint[0] + dx), max(0, Bpoint[1] - dy)
+    elif placehold == 'rightBottom':
+        x1, y1 = min(w, Bpoint[0] + dx), min(h, Bpoint[1] + dy)
+    elif placehold == 'leftBottom':
+        x1, y1 = max(0, Apoint[0] - dx), min(h, Apoint[1] + dy)
+    else:
+        raise ValueError("Unknown placehold: {!r}".format(placehold))
+    return int(x1), int(y1)
+
+def enlargebox(box, h, w, enlarge_size, horizontal_text_bool):
+    """Enlarge a 4-point box away from the intersection of its two diagonals.
+
+    The (x, y) enlarge factors are swapped when `horizontal_text_bool` is false,
+    and the corners are rolled so the one with the smallest x + y comes first.
+    Returns a (4, 2) integer array of the new corners.
+    """
+    if not horizontal_text_bool:
+        enlarge_size = (enlarge_size[1], enlarge_size[0])
+    Apoint, Bpoint, Cpoint, Dpoint = np.roll(box, -np.argmin(box.sum(axis=1)), axis=0)
+    # Diagonals A-C and D-B as y = K * x + B; `center` is where they cross.
+    K1, B1 = lineBiasAndK(Apoint, Cpoint)
+    K2, B2 = lineBiasAndK(Dpoint, Bpoint)
+    X = (B2 - B1) / (K1 - K2)
+    center = [X, K1 * X + B1]
+    anchors = (
+        (Apoint, center, 'leftTop'), (center, Bpoint, 'rightTop'),
+        (center, Cpoint, 'rightBottom'), (Dpoint, center, 'leftBottom'),
+    )
+    return np.array([sidePoint(a, b, h, w, tag, enlarge_size) for a, b, tag in anchors])
--- a/trainer/craft/data/dataset.py
+++ b/trainer/craft/data/dataset.py
@@ -0,0 +1,542 @@
+import os
+import re
+import itertools
+import random
+
+import numpy as np
+import scipy.io as scio
+from PIL import Image
+import cv2
+from torch.utils.data import Dataset
+import torchvision.transforms as transforms
+
+from data import imgproc
+from data.gaussian import GaussianBuilder
+from data.imgaug import (
+    rescale,
+    random_resize_crop_synth,
+    random_resize_crop,
+    random_horizontal_flip,
+    random_rotate,
+    random_scale,
+    random_crop,
+)
+from data.pseudo_label.make_charbox import PseudoCharBoxBuilder
+from utils.util import saveInput, saveImage
+
+
+class CraftBaseDataset(Dataset):
+    """Shared sample pipeline; subclasses supply img_names and the GT-building methods."""
+
+    def __init__(
+        self,
+        output_size,
+        data_dir,
+        saved_gt_dir,
+        mean,
+        variance,
+        gauss_init_size,
+        gauss_sigma,
+        enlarge_region,
+        enlarge_affinity,
+        aug,
+        vis_test_dir,
+        vis_opt,
+        sample,
+    ):
+        self.output_size = output_size
+        self.data_dir = data_dir
+        self.saved_gt_dir = saved_gt_dir
+        self.mean, self.variance = mean, variance
+        self.gaussian_builder = GaussianBuilder(
+            gauss_init_size, gauss_sigma, enlarge_region, enlarge_affinity
+        )
+        self.aug = aug
+        self.vis_test_dir = vis_test_dir
+        self.vis_opt = vis_opt
+        self.sample = sample  # -1 uses every image, otherwise the size of the subset
+        # Subclasses assign self.img_names only after this constructor returns,
+        # so the sampled subset is drawn lazily by the `idx` property.
+        self._idx = None
+        self.pre_crop_area = []
+
+    @property
+    def idx(self):
+        """Fixed-seed random subset of image indices, drawn on first access."""
+        if getattr(self, "_idx", None) is None:
+            random.seed(0)
+            self._idx = random.sample(range(0, len(self.img_names)), self.sample)
+        return self._idx
+
+    def augment_image(
+        self, image, region_score, affinity_score, confidence_mask, word_level_char_bbox
+    ):
+        """Run the augmentations enabled in self.aug; return image and the three maps."""
+        augment_targets = [image, region_score, affinity_score, confidence_mask]
+
+        if self.aug.random_scale.option:
+            # random_scale returns only the image list (boxes are scaled in place),
+            # so its result must not be unpacked into two names.
+            augment_targets = random_scale(
+                augment_targets, word_level_char_bbox, self.aug.random_scale.range
+            )
+
+        if self.aug.random_rotate.option:
+            augment_targets = random_rotate(
+                augment_targets, self.aug.random_rotate.max_angle
+            )
+
+        if self.aug.random_crop.option:
+            version = self.aug.random_crop.version
+            if version == "random_crop_with_bbox":
+                # This module never imports a function of that name.
+                raise NotImplementedError("random_crop_with_bbox is not available")
+            elif version == "random_resize_crop_synth":
+                augment_targets = random_resize_crop_synth(
+                    augment_targets, self.output_size
+                )
+            elif version == "random_resize_crop":
+                pre_crop_area = self.pre_crop_area
+                if len(pre_crop_area) == 0:
+                    pre_crop_area = None
+                augment_targets = random_resize_crop(
+                    augment_targets,
+                    self.aug.random_crop.scale,
+                    self.aug.random_crop.ratio,
+                    self.output_size,
+                    self.aug.random_crop.rnd_threshold,
+                    pre_crop_area,
+                )
+            elif version == "random_crop":
+                augment_targets = random_crop(augment_targets, self.output_size)
+            else:
+                raise ValueError("Undefined RandomCrop version: {}".format(version))
+
+        if self.aug.random_horizontal_flip.option:
+            augment_targets = random_horizontal_flip(augment_targets)
+
+        image, region_score, affinity_score, confidence_mask = augment_targets
+        if self.aug.random_colorjitter.option:
+            jitter = transforms.ColorJitter(
+                brightness=self.aug.random_colorjitter.brightness,
+                contrast=self.aug.random_colorjitter.contrast,
+                saturation=self.aug.random_colorjitter.saturation,
+                hue=self.aug.random_colorjitter.hue,
+            )
+            image = jitter(Image.fromarray(image))
+        return np.array(image), region_score, affinity_score, confidence_mask
+
+    def resize_to_half(self, ground_truth, interpolation):
+        half = self.output_size // 2  # target side: half of output_size
+        return cv2.resize(ground_truth, (half, half), interpolation=interpolation)
+
+    def __len__(self):
+        return len(self.idx) if self.sample != -1 else len(self.img_names)
+
+    def __getitem__(self, index):
+        """Return (normalised image transposed by (2, 0, 1), region, affinity, confidence)."""
+        if self.sample != -1:
+            index = self.idx[index]
+        if self.saved_gt_dir is None:
+            (
+                image,
+                region_score,
+                affinity_score,
+                confidence_mask,
+                word_level_char_bbox,
+                all_affinity_bbox,
+                words,
+            ) = self.make_gt_score(index)
+        else:
+            (
+                image,
+                region_score,
+                affinity_score,
+                confidence_mask,
+                word_level_char_bbox,
+                words,
+            ) = self.load_saved_gt_score(index)
+            all_affinity_bbox = []
+
+        if self.vis_opt:
+            saveImage(
+                self.img_names[index],
+                self.vis_test_dir,
+                image.copy(),
+                word_level_char_bbox.copy(),
+                all_affinity_bbox.copy(),
+                region_score.copy(),
+                affinity_score.copy(),
+                confidence_mask.copy(),
+            )
+
+        image, region_score, affinity_score, confidence_mask = self.augment_image(
+            image, region_score, affinity_score, confidence_mask, word_level_char_bbox
+        )
+
+        if self.vis_opt:
+            saveInput(
+                self.img_names[index],
+                self.vis_test_dir,
+                image,
+                region_score,
+                affinity_score,
+                confidence_mask,
+            )
+
+        # The three maps are resized to half of output_size; the mask uses INTER_NEAREST.
+        region_score = self.resize_to_half(region_score, interpolation=cv2.INTER_CUBIC)
+        affinity_score = self.resize_to_half(
+            affinity_score, interpolation=cv2.INTER_CUBIC
+        )
+        confidence_mask = self.resize_to_half(
+            confidence_mask, interpolation=cv2.INTER_NEAREST
+        )
+
+        image = imgproc.normalizeMeanVariance(
+            np.array(image), mean=self.mean, variance=self.variance
+        )
+        image = image.transpose(2, 0, 1)
+
+        return image, region_score, affinity_score, confidence_mask
+
+
+class SynthTextDataSet(CraftBaseDataset):
+    """Dataset whose character-level boxes are read from the gt.mat file in data_dir."""
+
+    def __init__(
+        self,
+        output_size,
+        data_dir,
+        saved_gt_dir,
+        mean,
+        variance,
+        gauss_init_size,
+        gauss_sigma,
+        enlarge_region,
+        enlarge_affinity,
+        aug,
+        vis_test_dir,
+        vis_opt,
+        sample,
+    ):
+        super().__init__(
+            output_size,
+            data_dir,
+            saved_gt_dir,
+            mean,
+            variance,
+            gauss_init_size,
+            gauss_sigma,
+            enlarge_region,
+            enlarge_affinity,
+            aug,
+            vis_test_dir,
+            vis_opt,
+            sample,
+        )
+        self.img_names, self.char_bbox, self.img_words = self.load_data()
+        self.vis_index = list(range(1000))
+
+    def load_data(self, bbox="char"):
+        """Read gt.mat from data_dir; return (image names, boxes, texts).
+
+        Boxes are "charBB" when bbox == "char", else "wordBB" (needed for test).
+        """
+        gt = scio.loadmat(os.path.join(self.data_dir, "gt.mat"))
+        img_bbox = gt["charBB"][0] if bbox == "char" else gt["wordBB"][0]
+        return gt["imnames"][0], img_bbox, gt["txt"][0]
+
+    def dilate_img_to_output_size(self, image, char_bbox):
+        """Upscale image and boxes when the shorter image side is <= output_size.
+
+        Returns (image, scaled boxes); the caller's `char_bbox` is left unmodified.
+        """
+        h, w, _ = image.shape
+        if min(h, w) <= self.output_size:
+            scale = float(self.output_size) / min(h, w)
+        else:
+            scale = 1.0
+        image = cv2.resize(
+            image, dsize=None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC
+        )
+        # Not `*=`: in-place scaling would rescale the caller's array on every call.
+        return image, char_bbox * scale
+
+    def make_gt_score(self, index):
+        """Build region/affinity maps for image `index` from its character boxes."""
+        img_path = os.path.join(self.data_dir, self.img_names[index][0])
+        image = cv2.imread(img_path, cv2.IMREAD_COLOR)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        all_char_bbox = self.char_bbox[index].transpose(
+            (2, 1, 0)
+        )  # shape : (Number of characters in image, 4, 2)
+
+        img_h, img_w, _ = image.shape
+        confidence_mask = np.ones((img_h, img_w), dtype=np.float32)
+
+        # One annotation string can hold several words separated by spaces/newlines.
+        words = [
+            re.split(" \n|\n |\n| ", word.strip()) for word in self.img_words[index]
+        ]
+        words = [word for word in itertools.chain(*words) if len(word) > 0]
+
+        # Character boxes are stored flat; cut them into one array per word.
+        word_level_char_bbox, char_idx = [], 0
+        for word in words:
+            word_bbox = all_char_bbox[char_idx : char_idx + len(word)]
+            assert len(word_bbox) == len(word)
+            char_idx += len(word)
+            word_level_char_bbox.append(np.array(word_bbox))
+
+        horizontal_text_bools = [True] * len(words)
+        region_score = self.gaussian_builder.generate_region(
+            img_h,
+            img_w,
+            word_level_char_bbox,
+            horizontal_text_bools=horizontal_text_bools,
+        )
+        affinity_score, all_affinity_bbox = self.gaussian_builder.generate_affinity(
+            img_h,
+            img_w,
+            word_level_char_bbox,
+            horizontal_text_bools=horizontal_text_bools,
+        )
+
+        return (
+            image,
+            region_score,
+            affinity_score,
+            confidence_mask,
+            word_level_char_bbox,
+            all_affinity_bbox,
+            words,
+        )
+
+
+class CustomDataset(CraftBaseDataset):  # word-level GT; char boxes come from PseudoCharBoxBuilder
+    def __init__(
+        self,
+        output_size,
+        data_dir,
+        saved_gt_dir,
+        mean,
+        variance,
+        gauss_init_size,
+        gauss_sigma,
+        enlarge_region,
+        enlarge_affinity,
+        aug,
+        vis_test_dir,
+        vis_opt,
+        sample,
+        watershed_param,
+        pseudo_vis_opt,
+        do_not_care_label,
+    ):
+        super().__init__(
+            output_size,
+            data_dir,
+            saved_gt_dir,
+            mean,
+            variance,
+            gauss_init_size,
+            gauss_sigma,
+            enlarge_region,
+            enlarge_affinity,
+            aug,
+            vis_test_dir,
+            vis_opt,
+            sample,
+        )
+        self.pseudo_vis_opt = pseudo_vis_opt
+        self.do_not_care_label = do_not_care_label
+        self.pseudo_charbox_builder = PseudoCharBoxBuilder(
+            watershed_param, vis_test_dir, pseudo_vis_opt, self.gaussian_builder
+        )
+        self.vis_index = list(range(1000))
+        self.img_dir = os.path.join(data_dir, "ch4_training_images")
+        self.img_gt_box_dir = os.path.join(
+            data_dir, "ch4_training_localization_transcription_gt"
+        )
+        self.img_names = os.listdir(self.img_dir)
+
+    def update_model(self, net):
+        self.net = net  # network later passed to build_char_box in load_data
+
+    def update_device(self, gpu):
+        self.gpu = gpu  # device argument later passed to build_char_box in load_data
+
+    def load_img_gt_box(self, img_gt_box_path):
+        """Parse a ground-truth file made of "x1,y1,...,x4,y4,text" lines.
+
+        Returns (array of (4, 2) float32 word boxes, list of words); any word found
+        in self.do_not_care_label is stored as self.do_not_care_label[0].
+        """
+        word_bboxes = []
+        words = []
+        # `with` closes the file even when a line fails to parse.
+        with open(img_gt_box_path, encoding="utf-8") as gt_file:
+            for line in gt_file:
+                box_info = line.strip().encode("utf-8").decode("utf-8-sig").split(",")
+                box_points = [int(box_info[i]) for i in range(8)]
+                word = ",".join(box_info[8:])  # the text itself may contain commas
+                if word in self.do_not_care_label:
+                    word = self.do_not_care_label[0]
+                word_bboxes.append(np.array(box_points, np.float32).reshape(4, 2))
+                words.append(word)
+        return np.array(word_bboxes), words
+
+    def load_data(self, index):
+        """Load image `index` and build pseudo character boxes for its words.
+
+        Regions of do-not-care words are zeroed in the confidence mask; other word
+        regions are filled with the confidence returned by the pseudo-box builder.
+        """
+        img_name = self.img_names[index]
+        img_path = os.path.join(self.img_dir, img_name)
+        image = cv2.imread(img_path)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        img_gt_box_path = os.path.join(
+            self.img_gt_box_dir, "gt_%s.txt" % os.path.splitext(img_name)[0]
+        )
+        word_bboxes, words = self.load_img_gt_box(
+            img_gt_box_path
+        )  # shape : (Number of word bbox, 4, 2)
+        confidence_mask = np.ones((image.shape[0], image.shape[1]), np.float32)
+
+        word_level_char_bbox = []
+        do_care_words = []
+        horizontal_text_bools = []
+
+        # With no word boxes the loop body never runs and the empty lists are returned.
+        _word_bboxes = word_bboxes.copy()
+        for i in range(len(word_bboxes)):
+            if words[i] in self.do_not_care_label:
+                cv2.fillPoly(confidence_mask, [np.int32(_word_bboxes[i])], 0)
+                continue
+
+            (
+                pseudo_char_bbox,
+                confidence,
+                horizontal_text_bool,
+            ) = self.pseudo_charbox_builder.build_char_box(
+                self.net, self.gpu, image, word_bboxes[i], words[i], img_name=img_name
+            )
+
+            cv2.fillPoly(confidence_mask, [np.int32(_word_bboxes[i])], confidence)
+            do_care_words.append(words[i])
+            word_level_char_bbox.append(pseudo_char_bbox)
+            horizontal_text_bools.append(horizontal_text_bool)
+
+        return (
+            image,
+            word_level_char_bbox,
+            do_care_words,
+            confidence_mask,
+            horizontal_text_bools,
+        )
+
+    def make_gt_score(self, index):
+        """
+        Make region, affinity scores using pseudo character-level GT bounding box
+        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]
+        :rtype region_score: np.float32
+        :rtype affinity_score: np.float32
+        :rtype confidence_mask: np.float32
+        :rtype word_level_char_bbox: np.float32
+        :rtype words: list
+        """
+        (
+            image,
+            word_level_char_bbox,
+            words,
+            confidence_mask,
+            horizontal_text_bools,
+        ) = self.load_data(index)
+        img_h, img_w, _ = image.shape
+
+        if len(word_level_char_bbox) == 0:
+            region_score = np.zeros((img_h, img_w), dtype=np.float32)
+            affinity_score = np.zeros((img_h, img_w), dtype=np.float32)
+            all_affinity_bbox = []
+        else:
+            region_score = self.gaussian_builder.generate_region(
+                img_h, img_w, word_level_char_bbox, horizontal_text_bools
+            )
+            affinity_score, all_affinity_bbox = self.gaussian_builder.generate_affinity(
+                img_h, img_w, word_level_char_bbox, horizontal_text_bools
+            )
+
+        return (
+            image,
+            region_score,
+            affinity_score,
+            confidence_mask,
+            word_level_char_bbox,
+            all_affinity_bbox,
+            words,
+        )
+
+    def load_saved_gt_score(self, index):
+        """
+        Load pre-saved official CRAFT model's region, affinity scores to train
+        word_level_char_bbox's shape : [word_num, [char_num_in_one_word, 4, 2]]
+        :rtype region_score: np.float32
+        :rtype affinity_score: np.float32
+        :rtype confidence_mask: np.float32
+        :rtype word_level_char_bbox: np.float32
+        :rtype words: list
+        """
+        img_name = self.img_names[index]
+        img_path = os.path.join(self.img_dir, img_name)
+        image = cv2.imread(img_path)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        img_gt_box_path = os.path.join(
+            self.img_gt_box_dir, "gt_%s.txt" % os.path.splitext(img_name)[0]
+        )
+        word_bboxes, words = self.load_img_gt_box(img_gt_box_path)
+        image, word_bboxes = rescale(image, word_bboxes)
+        img_h, img_w, _ = image.shape
+
+        query_idx = int(self.img_names[index].split(".")[0].split("_")[1])
+
+        saved_region_scores_path = os.path.join(
+            self.saved_gt_dir, f"res_img_{query_idx}_region.jpg"
+        )
+        saved_affi_scores_path = os.path.join(
+            self.saved_gt_dir, f"res_img_{query_idx}_affi.jpg"
+        )
+        saved_cf_mask_path = os.path.join(
+            self.saved_gt_dir, f"res_img_{query_idx}_cf_mask_thresh_0.6.jpg"
+        )
+        region_score = cv2.imread(saved_region_scores_path, cv2.IMREAD_GRAYSCALE)
+        affinity_score = cv2.imread(saved_affi_scores_path, cv2.IMREAD_GRAYSCALE)
+        confidence_mask = cv2.imread(saved_cf_mask_path, cv2.IMREAD_GRAYSCALE)
+
+        region_score = cv2.resize(region_score, (img_w, img_h))
+        affinity_score = cv2.resize(affinity_score, (img_w, img_h))
+        confidence_mask = cv2.resize(
+            confidence_mask, (img_w, img_h), interpolation=cv2.INTER_NEAREST
+        )
+
+        region_score = region_score.astype(np.float32) / 255
+        affinity_score = affinity_score.astype(np.float32) / 255
+        confidence_mask = confidence_mask.astype(np.float32) / 255
+
+        # NOTE : Even though word_level_char_bbox is not necessary, align bbox format with make_gt_score()
+        word_level_char_bbox = []
+
+        for i in range(len(word_bboxes)):
+            word_level_char_bbox.append(np.expand_dims(word_bboxes[i], 0))
+
+        return (
+            image,
+            region_score,
+            affinity_score,
+            confidence_mask,
+            word_level_char_bbox,
+            words,
+        )
--- a/trainer/craft/data/gaussian.py
+++ b/trainer/craft/data/gaussian.py
@@ -0,0 +1,192 @@
+import numpy as np
+import cv2
+
+from data.boxEnlarge import enlargebox
+
+
+class GaussianBuilder(object):  # renders Gaussian region/affinity score maps for character boxes
+    def __init__(self, init_size, sigma, enlarge_region, enlarge_affinity):
+        self.init_size = init_size  # side length of the square Gaussian template
+        self.sigma = sigma
+        self.enlarge_region = enlarge_region  # enlarge_size passed on by generate_region
+        self.enlarge_affinity = enlarge_affinity  # enlarge_size passed on by generate_affinity
+        self.gaussian_map, self.gaussian_map_color = self.generate_gaussian_map()
+
+    def generate_gaussian_map(self):
+        circle_mask = self.generate_circle_mask()
+
+        gaussian_map = np.zeros((self.init_size, self.init_size), np.float32)
+        # Evaluate an isotropic 2D Gaussian centred at (init_size / 2, init_size / 2).
+        for i in range(self.init_size):
+            for j in range(self.init_size):
+                gaussian_map[i, j] = (
+                    1
+                    / 2
+                    / np.pi
+                    / (self.sigma ** 2)
+                    * np.exp(
+                        -1
+                        / 2
+                        * (
+                            (i - self.init_size / 2) ** 2 / (self.sigma ** 2)
+                            + (j - self.init_size / 2) ** 2 / (self.sigma ** 2)
+                        )
+                    )
+                )
+        # Zero the map outside the circle mask, then rescale so the maximum is 1.
+        gaussian_map = gaussian_map * circle_mask
+        gaussian_map = (gaussian_map / np.max(gaussian_map)).astype(np.float32)
+        # uint8 copy rendered with OpenCV's JET colour map.
+        gaussian_map_color = (gaussian_map * 255).astype(np.uint8)
+        gaussian_map_color = cv2.applyColorMap(gaussian_map_color, cv2.COLORMAP_JET)
+        return gaussian_map, gaussian_map_color
+
+    def generate_circle_mask(self):
+        """Return a float32 (init_size, init_size) array: 1 in a filled centred circle, else 0."""
+        zero_arr = np.zeros((self.init_size, self.init_size), np.float32)
+        circle_mask = cv2.circle(
+            img=zero_arr,
+            center=(self.init_size // 2, self.init_size // 2),
+            radius=self.init_size // 2,
+            color=1,
+            thickness=-1,
+        )
+
+        return circle_mask
+
+    def four_point_transform(self, bbox):
+        """
+        Using the bbox, standard 2D gaussian map, returns Transformed 2d Gaussian map
+        """
+        width, height = (
+            np.max(bbox[:, 0]).astype(np.int32),
+            np.max(bbox[:, 1]).astype(np.int32),
+        )
+        init_points = np.array(
+            [
+                [0, 0],
+                [self.init_size, 0],
+                [self.init_size, self.init_size],
+                [0, self.init_size],
+            ],
+            dtype="float32",
+        )
+        # Perspective transform that maps the square template corners onto bbox.
+        M = cv2.getPerspectiveTransform(init_points, bbox)
+        warped_gaussian_map = cv2.warpPerspective(self.gaussian_map, M, (width, height))
+        return warped_gaussian_map, width, height
+
+    def add_gaussian_map_to_score_map(
+        self, score_map, bbox, enlarge_size, horizontal_text_bool, map_type=None
+    ):
+        """
+        Mapping 2D Gaussian to the character box coordinates of the score_map.
+
+        :param score_map: Target map to put 2D gaussian on character box
+        :type score_map: np.float32
+        :param bbox: character boxes
+        :type bbox: np.float32
+        :param enlarge_size: Enlarge size of gaussian map to fit character shape
+        :type enlarge_size: list of enlarge size [x dim, y dim]
+        :param horizontal_text_bool: Flag that bbox is horizontal text or not
+        :type horizontal_text_bool: bool
+        :param map_type: Whether map's type is "region" | "affinity"
+        :type map_type: str
+        :return score_map: score map that all 2D gaussian put on character box
+        :rtype: np.float32
+        """
+
+        map_h, map_w = score_map.shape
+        bbox = enlargebox(bbox, map_h, map_w, enlarge_size, horizontal_text_bool)
+
+        # If any one point of character bbox is out of range, don't put it on map
+        if np.any(bbox < 0) or np.any(bbox[:, 0] > map_w) or np.any(bbox[:, 1] > map_h):
+            return score_map
+
+        bbox_left, bbox_top = np.array([np.min(bbox[:, 0]), np.min(bbox[:, 1])]).astype(
+            np.int32
+        )
+        bbox -= (bbox_left, bbox_top)  # make bbox relative to its own top-left corner
+        warped_gaussian_map, width, height = self.four_point_transform(
+            bbox.astype(np.float32)
+        )
+        # Merge into score_map by taking the per-pixel maximum with the warped Gaussian.
+        try:
+            bbox_area_of_image = score_map[
+                bbox_top : bbox_top + height, bbox_left : bbox_left + width,
+            ]
+            high_value_score = np.where(
+                warped_gaussian_map > bbox_area_of_image,
+                warped_gaussian_map,
+                bbox_area_of_image,
+            )
+            score_map[
+                bbox_top : bbox_top + height, bbox_left : bbox_left + width,
+            ] = high_value_score
+
+        except Exception as e:  # report the failure and skip this box
+            print("Error : {}".format(e))
+            print(
+                "On generating {} map, strange box came out. (width: {}, height: {})".format(
+                    map_type, width, height
+                )
+            )
+
+        return score_map
+
+    def calculate_affinity_box_points(self, bbox_1, bbox_2, vertical=False):
+        center_1, center_2 = np.mean(bbox_1, axis=0), np.mean(bbox_2, axis=0)
+        if vertical:
+            tl = (bbox_1[0] + bbox_1[-1] + center_1) / 3
+            tr = (bbox_1[1:3].sum(0) + center_1) / 3
+            br = (bbox_2[1:3].sum(0) + center_2) / 3
+            bl = (bbox_2[0] + bbox_2[-1] + center_2) / 3
+        else:
+            tl = (bbox_1[0:2].sum(0) + center_1) / 3
+            tr = (bbox_2[0:2].sum(0) + center_2) / 3
+            br = (bbox_2[2:4].sum(0) + center_2) / 3
+            bl = (bbox_1[2:4].sum(0) + center_1) / 3
+        affinity_box = np.array([tl, tr, br, bl]).astype(np.float32)  # each corner averages two bbox corners and the box centre
+        return affinity_box
+
+    def generate_region(
+        self, img_h, img_w, word_level_char_bbox, horizontal_text_bools
+    ):
+        region_map = np.zeros([img_h, img_w], dtype=np.float32)
+        for i in range(
+            len(word_level_char_bbox)
+        ):  # shape : [word_num, [char_num_in_one_word, 4, 2]]
+            for j in range(len(word_level_char_bbox[i])):  # one Gaussian per character box
+                region_map = self.add_gaussian_map_to_score_map(
+                    region_map,
+                    word_level_char_bbox[i][j].copy(),
+                    self.enlarge_region,
+                    horizontal_text_bools[i],
+                    map_type="region",
+                )
+        return region_map
+
+    def generate_affinity(
+        self, img_h, img_w, word_level_char_bbox, horizontal_text_bools
+    ):
+        """Return (affinity_map, all_affinity_bbox) from each pair of consecutive characters in a word."""
+        affinity_map = np.zeros([img_h, img_w], dtype=np.float32)
+        all_affinity_bbox = []
+        for i in range(len(word_level_char_bbox)):
+            for j in range(len(word_level_char_bbox[i]) - 1):
+                affinity_bbox = self.calculate_affinity_box_points(
+                    word_level_char_bbox[i][j], word_level_char_bbox[i][j + 1]
+                )
+
+                affinity_map = self.add_gaussian_map_to_score_map(
+                    affinity_map,
+                    affinity_bbox.copy(),
+                    self.enlarge_affinity,
+                    horizontal_text_bools[i],
+                    map_type="affinity",
+                )
+                all_affinity_bbox.append(np.expand_dims(affinity_bbox, axis=0))
+
+        if len(all_affinity_bbox) > 0:
+            all_affinity_bbox = np.concatenate(all_affinity_bbox, axis=0)
+        return affinity_map, all_affinity_bbox
--- a/trainer/craft/data/imgaug.py
+++ b/trainer/craft/data/imgaug.py
@@ -0,0 +1,175 @@
+import random
+
+import cv2
+import numpy as np
+from PIL import Image
+from torchvision.transforms.functional import resized_crop, crop
+from torchvision.transforms import RandomResizedCrop, RandomCrop
+from torchvision.transforms import InterpolationMode
+
+
+def rescale(img, bboxes, target_size=2240):
+    """Scale image and boxes by target_size / max(height, width)."""
+    height, width = img.shape[:2]
+    factor = target_size / max(height, width)
+    resized = cv2.resize(img, dsize=None, fx=factor, fy=factor, interpolation=cv2.INTER_CUBIC)
+    return resized, bboxes * factor
+
+
+def random_resize_crop_synth(augment_targets, size):
+    """Crop one random square from all four targets and resize it to (size, size).
+
+    The square's side is the shorter side of the image; its position is drawn once
+    with RandomCrop.get_params, so every target receives the same window.
+    The confidence mask is resized with NEAREST interpolation, the rest with BICUBIC.
+
+    :param augment_targets: [image, region_score, affinity_score, confidence_mask]
+    :param size: side length of the square output
+    :return: list of the four cropped and resized numpy arrays, in the same order
+    """
+    image, region_score, affinity_score, confidence_mask = augment_targets
+
+    # (PIL image, interpolation) pairs; only the confidence mask uses NEAREST.
+    pil_targets = [
+        (Image.fromarray(image), InterpolationMode.BICUBIC),
+        (Image.fromarray(region_score), InterpolationMode.BICUBIC),
+        (Image.fromarray(affinity_score), InterpolationMode.BICUBIC),
+        (Image.fromarray(confidence_mask), InterpolationMode.NEAREST),
+    ]
+    pil_image = pil_targets[0][0]
+
+    # The window is computed from the image and reused for every target.
+    short_side = min(pil_image.size)
+    top, left, height, width = RandomCrop.get_params(
+        pil_image, output_size=(short_side, short_side)
+    )
+
+    resized = [
+        resized_crop(
+            target,
+            top,
+            left,
+            height,
+            width,
+            (size, size),
+            interpolation=mode,
+        )
+        for target, mode in pil_targets
+    ]
+
+    # Convert back to numpy arrays.
+    return [np.array(target) for target in resized]
+
+
+def random_resize_crop(
+    augment_targets, scale, ratio, size, threshold, pre_crop_area=None
+):
+    """Crop the same window from all four targets and resize it to (size, size).
+
+    :param augment_targets: [image, region_score, affinity_score, confidence_mask]
+    :param scale: scale range passed to RandomResizedCrop.get_params
+    :param ratio: ratio range passed to RandomResizedCrop.get_params
+    :param size: side length of the square output
+    :param threshold: probability of sampling with `scale`/`ratio`; otherwise the
+        window is sampled with scale=(1.0, 1.0) and ratio=(1.0, 1.0)
+    :param pre_crop_area: optional fixed (i, j, h, w) window that bypasses sampling
+    :return: list of the four cropped and resized numpy arrays, in the same order
+    """
+    image, region_score, affinity_score, confidence_mask = augment_targets
+
+    # (PIL image, interpolation) pairs; only the confidence mask uses NEAREST.
+    pil_targets = [
+        (Image.fromarray(image), InterpolationMode.BICUBIC),
+        (Image.fromarray(region_score), InterpolationMode.BICUBIC),
+        (Image.fromarray(affinity_score), InterpolationMode.BICUBIC),
+        (Image.fromarray(confidence_mask), InterpolationMode.NEAREST),
+    ]
+    pil_image = pil_targets[0][0]
+
+    # `is not None` rather than `!= None`: comparing an array-like crop area
+    # with `!=` is element-wise and cannot be used as an `if` condition.
+    if pre_crop_area is not None:
+        i, j, h, w = pre_crop_area
+    elif random.random() < threshold:
+        i, j, h, w = RandomResizedCrop.get_params(
+            pil_image, scale=scale, ratio=ratio
+        )
+    else:
+        i, j, h, w = RandomResizedCrop.get_params(
+            pil_image, scale=(1.0, 1.0), ratio=(1.0, 1.0)
+        )
+
+    resized = [
+        resized_crop(
+            target,
+            i,
+            j,
+            h,
+            w,
+            (size, size),
+            interpolation=mode,
+        )
+        for target, mode in pil_targets
+    ]
+
+    # Convert back to numpy arrays.
+    return [np.array(target) for target in resized]
+
+
+def random_crop(augment_targets, size):
+    """Cut the same random size x size window out of all four targets.
+
+    :param augment_targets: [image, region_score, affinity_score, confidence_mask]
+    :param size: side length of the square crop
+    :return: list of the four cropped numpy arrays, in the same order
+    """
+    image, region_score, affinity_score, confidence_mask = augment_targets
+    pil_targets = [
+        Image.fromarray(image),
+        Image.fromarray(region_score),
+        Image.fromarray(affinity_score),
+        Image.fromarray(confidence_mask),
+    ]
+
+    # One window is drawn from the image and reused for the score maps and mask.
+    top, left, height, width = RandomCrop.get_params(
+        pil_targets[0], output_size=(size, size)
+    )
+
+    cropped = [crop(target, top, left, height, width) for target in pil_targets]
+    return [np.array(target) for target in cropped]
+
+
+def random_horizontal_flip(imgs):
+    """With probability 0.5, mirror each array in the list `imgs` left-right in place."""
+    if random.random() < 0.5:
+        imgs[:] = [np.flip(img, axis=1).copy() for img in imgs]
+    return imgs
+
+
+def random_scale(images, word_level_char_bbox, scale_range):
+    """Resize every image by one factor drawn from `scale_range`.
+    The entries of `word_level_char_bbox` are multiplied by the same factor in place.
+    Returns only the (also in-place updated) `images` list.
+    """
+    factor = random.sample(scale_range, 1)[0]
+    images[:] = [cv2.resize(img, dsize=None, fx=factor, fy=factor) for img in images]
+    for k in range(len(word_level_char_bbox)):
+        word_level_char_bbox[k] *= factor
+    return images
+
+
+def random_rotate(images, max_angle):
+    """Rotate every image about its centre by one angle from [-max_angle, max_angle).
+
+    The last entry is warped with INTER_NEAREST; the others use cv2's default.
+    """
+    angle = random.random() * 2 * max_angle - max_angle
+    last = len(images) - 1
+    for pos, img in enumerate(images):
+        rows, cols = img.shape[:2]
+        matrix = cv2.getRotationMatrix2D((cols / 2, rows / 2), angle, 1)
+        # Only the final target is given an explicit interpolation flag.
+        extra = {"flags": cv2.INTER_NEAREST} if pos == last else {}
+        images[pos] = cv2.warpAffine(img, matrix, (cols, rows), **extra)
+    return images
--- a/trainer/craft/data/imgproc.py
+++ b/trainer/craft/data/imgproc.py
@@ -0,0 +1,91 @@
+"""  
+Copyright (c) 2019-present NAVER Corp.
+MIT License
+"""
+
+# -*- coding: utf-8 -*-
+import numpy as np
+
+import cv2
+from skimage import io
+
+
+def loadImage(img_file):
+    """Read ``img_file`` with ``skimage.io.imread`` and return a 3-channel array.
+
+    Post-processing applied to whatever ``imread`` returns:
+      * if the first dimension has length 2, only the first entry is kept;
+      * a 2-D (single channel) result is expanded with ``COLOR_GRAY2RGB``;
+      * a fourth channel, if present, is dropped.
+    """
+    image = io.imread(img_file)  # RGB order
+
+    if image.shape[0] == 2:
+        image = image[0]
+    if image.ndim == 2:
+        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+    if image.shape[2] == 4:
+        image = image[:, :, :3]
+
+    return np.array(image)
+
+
+def normalizeMeanVariance(
+    in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)
+):
+    """Return a float32 copy of ``in_img`` standardised per channel.
+
+    For each of the three channels, ``mean * 255`` is subtracted and the result
+    is divided by ``variance * 255``. The input array is left untouched.
+    """
+    # should be RGB order
+    normalized = in_img.copy().astype(np.float32)
+
+    offsets = np.array([mean[c] * 255.0 for c in range(3)], dtype=np.float32)
+    scales = np.array([variance[c] * 255.0 for c in range(3)], dtype=np.float32)
+
+    normalized -= offsets
+    normalized /= scales
+    return normalized
+
+
+def denormalizeMeanVariance(
+    in_img, mean=(0.485, 0.456, 0.406), variance=(0.229, 0.224, 0.225)
+):
+    """Undo the per-channel standardisation and return a uint8 image.
+
+    Works on a copy of ``in_img``: multiplies by ``variance``, adds ``mean``,
+    scales by 255, clips to ``[0, 255]`` and casts to ``np.uint8``.
+    """
+    # should be RGB order
+    restored = in_img.copy()
+    # In-place arithmetic keeps the dtype of the copy through every step.
+    restored *= variance
+    restored += mean
+    restored *= 255.0
+    return np.clip(restored, 0, 255).astype(np.uint8)
+
+
+def resize_aspect_ratio(img, square_size, interpolation, mag_ratio=1):
+    """Resize ``img`` keeping its aspect ratio and zero-pad it to multiples of 32.
+
+    Args:
+        img: array of shape ``(height, width, channel)``.
+        square_size: upper bound for the longer side after resizing.
+        interpolation: interpolation flag forwarded to ``cv2.resize``.
+        mag_ratio: magnification applied to the longer side before the
+            ``square_size`` cap.
+
+    Returns:
+        ``(resized, ratio, valid_size_heatmap)`` where ``resized`` is a float32
+        canvas with the resized image in its top-left corner, ``ratio`` is the
+        applied scale factor, and ``valid_size_heatmap`` is half of the
+        resized (unpadded) height and width.
+    """
+    height, width, channel = img.shape
+    longest_side = max(height, width)
+
+    # magnify image size, but never beyond square_size
+    target_size = min(mag_ratio * longest_side, square_size)
+    ratio = target_size / longest_side
+
+    target_h = int(height * ratio)
+    target_w = int(width * ratio)
+
+    # NOTE
+    valid_size_heatmap = (int(target_h / 2), int(target_w / 2))
+
+    proc = cv2.resize(img, (target_w, target_h), interpolation=interpolation)
+
+    # make canvas and paste image; each side is rounded up to a multiple of 32
+    padded_h = target_h + (-target_h) % 32
+    padded_w = target_w + (-target_w) % 32
+    canvas = np.zeros((padded_h, padded_w, channel), dtype=np.float32)
+    canvas[:target_h, :target_w, :] = proc
+
+    return canvas, ratio, valid_size_heatmap
+
+
+def cvt2HeatmapImg(img):
+    """Render a score map as a JET colour-mapped uint8 image.
+
+    Values are clipped to ``[0, 1]`` and scaled to ``[0, 255]`` before the
+    colour map is applied.
+    """
+    scaled = np.clip(img, 0, 1) * 255
+    return cv2.applyColorMap(scaled.astype(np.uint8), cv2.COLORMAP_JET)
--- a/trainer/craft/data/pseudo_label/make_charbox.py
+++ b/trainer/craft/data/pseudo_label/make_charbox.py
@@ -0,0 +1,263 @@
+import os
+import random
+import math
+
+import numpy as np
+import cv2
+import torch
+
+from data import imgproc
+from data.pseudo_label.watershed import exec_watershed_by_version
+
+
+class PseudoCharBoxBuilder:
+    """Builds pseudo character boxes for a word from the network's own region score.
+
+    A word is warped out of the image, run through the network, the predicted
+    region score is split into characters by watershed, and the resulting boxes
+    are mapped back to image coordinates together with a confidence value.
+    """
+
+    def __init__(self, watershed_param, vis_test_dir, pseudo_vis_opt, gaussian_builder):
+        # Settings object handed to exec_watershed_by_version.
+        self.watershed_param = watershed_param
+        # Directory that receives the debug visualisations.
+        self.vis_test_dir = vis_test_dir
+        # Truthy to enable visualisation; also forwarded to the watershed step.
+        self.pseudo_vis_opt = pseudo_vis_opt
+        # Provides generate_region(); used here only for visualisation.
+        self.gaussian_builder = gaussian_builder
+        # Not read by any method of this class.
+        self.cnt = 0
+        # True when the most recently cropped word was classified as vertical.
+        self.flag = False
+
+    def crop_image_by_bbox(self, image, box, word):
+        """Warp the quadrilateral ``box`` out of ``image`` into an upright rectangle.
+
+        Words classified as vertical are warped so that their long side lies
+        along the x axis of the crop.
+
+        Args:
+            image: source image array.
+            box: array of four ``[x, y]`` corner points.
+            word: transcription of the word inside ``box``.
+
+        Returns:
+            ``(warped, M, horizontal_text_bool)``: the cropped word image, the
+            perspective matrix that produced it, and whether the word was
+            treated as horizontal.
+
+        Raises:
+            ValueError: if the box has zero width or height (after truncation
+                to whole pixels) or ``word`` is empty.
+        """
+        w = max(
+            int(np.linalg.norm(box[0] - box[1])), int(np.linalg.norm(box[2] - box[3]))
+        )
+        h = max(
+            int(np.linalg.norm(box[0] - box[3])), int(np.linalg.norm(box[1] - box[2]))
+        )
+        # Fail loudly on degenerate input instead of dividing by zero below.
+        if w <= 0 or h <= 0 or len(word) == 0:
+            raise ValueError(
+                f"Cannot crop word {word!r}: degenerate box (w={w}, h={h}) or empty word."
+            )
+
+        word_ratio = h / w
+        one_char_ratio = min(h, w) / (max(h, w) / len(word))
+
+        # NOTE: criterion to split vertical word in here is set to work properly on IC15 dataset
+        if word_ratio > 2 or (word_ratio > 1.6 and one_char_ratio > 2.4):
+            # warping method of vertical word (classified by upper condition)
+            horizontal_text_bool = False
+            long_side = h
+            short_side = w
+            # The first box corner is sent to the top-right of the crop.
+            dst_points = [
+                [long_side, 0],
+                [long_side, short_side],
+                [0, short_side],
+                [0, 0],
+            ]
+        else:
+            # warping method of horizontal word
+            horizontal_text_bool = True
+            long_side = w
+            short_side = h
+            dst_points = [
+                [0, 0],
+                [long_side, 0],
+                [long_side, short_side],
+                [0, short_side],
+            ]
+
+        M = cv2.getPerspectiveTransform(
+            np.float32(box), np.float32(np.array(dst_points))
+        )
+        self.flag = not horizontal_text_bool
+
+        warped = cv2.warpPerspective(image, M, (long_side, short_side))
+        return warped, M, horizontal_text_bool
+
+    def inference_word_box(self, net, gpu, word_image):
+        """Run ``net`` on one word crop and return the first of its two outputs.
+
+        The network is switched to eval mode if it was training (it is not
+        switched back here). The crop is normalised, moved to CUDA device
+        ``gpu`` and evaluated under autocast without gradients.
+        """
+        if net.training:
+            net.eval()
+
+        with torch.no_grad():
+            word_img_torch = torch.from_numpy(
+                imgproc.normalizeMeanVariance(
+                    word_image,
+                    mean=(0.485, 0.456, 0.406),
+                    variance=(0.229, 0.224, 0.225),
+                )
+            )
+            # HWC -> 1xCxHxW
+            word_img_torch = word_img_torch.permute(2, 0, 1).unsqueeze(0)
+            word_img_torch = word_img_torch.type(torch.FloatTensor).cuda(gpu)
+            with torch.cuda.amp.autocast():
+                word_img_scores, _ = net(word_img_torch)
+        return word_img_scores
+
+    def visualize_pseudo_label(
+        self, word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
+    ):
+        """Write a side-by-side debug image for one word into ``self.vis_test_dir``.
+
+        Panels, left to right: the word crop, the colour-mapped region score,
+        the watershed boxes, the final pseudo boxes, the gaussian region map
+        generated from the pseudo boxes, and that map blended over the crop.
+        """
+        word_img_h, word_img_w, _ = word_image.shape
+        # np.uint8() is applied once here so that polylines below is handed the
+        # very arrays that end up in the stacked result.
+        word_img_cp1 = np.uint8(word_image.copy())
+        word_img_cp2 = np.uint8(word_image.copy())
+        _watershed_box = np.int32(watershed_box)
+        _pseudo_char_bbox = np.int32(pseudo_char_bbox)
+
+        region_score_color = cv2.applyColorMap(np.uint8(region_score), cv2.COLORMAP_JET)
+        region_score_color = cv2.resize(region_score_color, (word_img_w, word_img_h))
+
+        for box in _watershed_box:
+            cv2.polylines(
+                word_img_cp1, [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0),
+            )
+
+        for box in _pseudo_char_bbox:
+            cv2.polylines(
+                word_img_cp2, [np.reshape(box, (-1, 1, 2))], True, (255, 0, 0)
+            )
+
+        # NOTE: Just for visualize, put gaussian map on char box
+        pseudo_gt_region_score = self.gaussian_builder.generate_region(
+            word_img_h, word_img_w, [_pseudo_char_bbox], [True]
+        )
+
+        pseudo_gt_region_score = cv2.applyColorMap(
+            (pseudo_gt_region_score * 255).astype("uint8"), cv2.COLORMAP_JET
+        )
+
+        # The channel order of the crop is reversed for every panel taken from it.
+        overlay_img = cv2.addWeighted(
+            word_image[:, :, ::-1], 0.7, pseudo_gt_region_score, 0.3, 5
+        )
+        vis_result = np.hstack(
+            [
+                word_image[:, :, ::-1],
+                region_score_color,
+                word_img_cp1[:, :, ::-1],
+                word_img_cp2[:, :, ::-1],
+                pseudo_gt_region_score,
+                overlay_img,
+            ]
+        )
+
+        # Create the output directory itself; dirname() would only create the
+        # parent when the configured path has no trailing separator.
+        os.makedirs(self.vis_test_dir, exist_ok=True)
+        cv2.imwrite(
+            os.path.join(
+                self.vis_test_dir,
+                "{}_{}".format(
+                    img_name, f"pseudo_char_bbox_{random.randint(0,100)}.jpg"
+                ),
+            ),
+            vis_result,
+        )
+
+    def clip_into_boundary(self, box, bound):
+        """Clip box coordinates in place to ``[0, bound[1]]`` (x) and ``[0, bound[0]]`` (y).
+
+        Args:
+            box: array indexed as ``[box, point, (x, y)]``; an empty ``box`` is
+                returned unchanged.
+            bound: shape-like sequence whose first two entries are used as the
+                y and x limits respectively.
+
+        Returns:
+            The same ``box`` object.
+        """
+        if len(box) == 0:
+            return box
+        else:
+            box[:, :, 0] = np.clip(box[:, :, 0], 0, bound[1])
+            box[:, :, 1] = np.clip(box[:, :, 1], 0, bound[0])
+            return box
+
+    def get_confidence(self, real_len, pseudo_len):
+        """Score how well the number of predicted boxes matches the word length.
+
+        Returns ``(real_len - min(real_len, |real_len - pseudo_len|)) / real_len``,
+        i.e. 1.0 for an exact match, falling to 0.0 as the counts diverge.
+        Returns 0.0 when either count is zero.
+        """
+        if pseudo_len == 0 or real_len <= 0:
+            return 0.0
+        return (real_len - min(real_len, abs(real_len - pseudo_len))) / real_len
+
+    def split_word_equal_gap(self, word_img_w, word_img_h, word):
+        """Split a ``word_img_w`` x ``word_img_h`` crop into equal-width character boxes.
+
+        Every character of ``word`` is allotted the same width; slots that hold
+        a space character produce no box.
+
+        Returns:
+            float32 array of boxes, each ``[[l, 0], [r, 0], [r, h], [l, h]]``
+            (an empty array when no box is produced).
+        """
+        width = word_img_w
+        height = word_img_h
+
+        if len(word) == 0:
+            # Nothing to split; avoids dividing by zero below.
+            return np.array([], np.float32)
+
+        width_per_char = width / len(word)
+        bboxes = []
+        for j, char in enumerate(word):
+            if char == " ":
+                continue
+            left = j * width_per_char
+            right = (j + 1) * width_per_char
+            bbox = np.array([[left, 0], [right, 0], [right, height], [left, height]])
+            bboxes.append(bbox)
+
+        bboxes = np.array(bboxes, np.float32)
+        return bboxes
+
+    def cal_angle(self, v1):
+        """Return the angle of vector ``v1`` from the +x axis, in ``[0, 2*pi)`` radians."""
+        # The small constant keeps the division defined for a zero-length vector.
+        theta = np.arccos(min(1, v1[0] / (np.linalg.norm(v1) + 10e-8)))
+        return 2 * math.pi - theta if v1[1] < 0 else theta
+
+    def clockwise_sort(self, points):
+        """Order four points by ascending ``cal_angle`` around their centroid."""
+        # returns 4x2 [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] ndarray
+        v1, v2, v3, v4 = points
+        center = (v1 + v2 + v3 + v4) / 4
+        theta = np.array(
+            [
+                self.cal_angle(v1 - center),
+                self.cal_angle(v2 - center),
+                self.cal_angle(v3 - center),
+                self.cal_angle(v4 - center),
+            ]
+        )
+        index = np.argsort(theta)
+        return np.array([v1, v2, v3, v4])[index, :]
+
+    def build_char_box(self, net, gpu, image, word_bbox, word, img_name=""):
+        """Produce pseudo character boxes for one word of ``image``.
+
+        Args:
+            net: network passed to ``inference_word_box``.
+            gpu: CUDA device passed to ``inference_word_box``.
+            image: full source image.
+            word_bbox: array of the word's four corner points in ``image``.
+            word: transcription of the word.
+            img_name: prefix for the visualisation file name.
+
+        Returns:
+            ``(pseudo_char_bbox, confidence, horizontal_text_bool)``: character
+            boxes in ``image`` coordinates sorted by the x of their first
+            corner in the word crop, the confidence of the split (0.5 when the
+            equal-width fallback was used), and whether the word was treated
+            as horizontal.
+
+        Raises:
+            ValueError: propagated from ``crop_image_by_bbox`` for a degenerate
+                box or an empty word.
+        """
+        word_image, M, horizontal_text_bool = self.crop_image_by_bbox(
+            image, word_bbox, word
+        )
+        # Count only non-whitespace characters; str.split() with no argument
+        # splits on any run of whitespace.
+        real_word_without_space = "".join(word.split())
+        real_char_len = len(real_word_without_space)
+
+        # The word crop is resized to a fixed height of 128 px for inference.
+        scale = 128.0 / word_image.shape[0]
+
+        word_image = cv2.resize(word_image, None, fx=scale, fy=scale)
+        word_img_h, word_img_w, _ = word_image.shape
+
+        scores = self.inference_word_box(net, gpu, word_image)
+        region_score = scores[0, :, :, 0].cpu().data.numpy()
+        region_score = np.uint8(np.clip(region_score, 0, 1) * 255)
+
+        # Only the shape of this resized map is used (as the clipping bound).
+        region_score_rgb = cv2.resize(region_score, (word_img_w, word_img_h))
+        region_score_rgb = cv2.cvtColor(region_score_rgb, cv2.COLOR_GRAY2RGB)
+
+        pseudo_char_bbox = exec_watershed_by_version(
+            self.watershed_param, region_score, word_image, self.pseudo_vis_opt
+        )
+
+        # Used for visualize only
+        watershed_box = pseudo_char_bbox.copy()
+
+        pseudo_char_bbox = self.clip_into_boundary(
+            pseudo_char_bbox, region_score_rgb.shape
+        )
+
+        confidence = self.get_confidence(real_char_len, len(pseudo_char_bbox))
+
+        # Fall back to an equal-width split when the watershed result is poor.
+        if confidence <= 0.5:
+            pseudo_char_bbox = self.split_word_equal_gap(word_img_w, word_img_h, word)
+            confidence = 0.5
+
+        # self.flag is set by crop_image_by_bbox: only vertical words are drawn.
+        if self.pseudo_vis_opt and self.flag:
+            self.visualize_pseudo_label(
+                word_image, region_score, watershed_box, pseudo_char_bbox, img_name,
+            )
+
+        if len(pseudo_char_bbox) != 0:
+            index = np.argsort(pseudo_char_bbox[:, 0, 0])
+            pseudo_char_bbox = pseudo_char_bbox[index]
+
+        # Undo the resize, then undo the perspective warp.
+        pseudo_char_bbox /= scale
+
+        M_inv = np.linalg.pinv(M)
+        for i in range(len(pseudo_char_bbox)):
+            pseudo_char_bbox[i] = cv2.perspectiveTransform(
+                pseudo_char_bbox[i][None, :, :], M_inv
+            )
+
+        pseudo_char_bbox = self.clip_into_boundary(pseudo_char_bbox, image.shape)
+
+        return pseudo_char_bbox, confidence, horizontal_text_bool
--- a/trainer/craft/data/pseudo_label/watershed.py
+++ b/trainer/craft/data/pseudo_label/watershed.py
@@ -0,0 +1,45 @@
+import cv2
+import numpy as np
+from skimage.segmentation import watershed
+
+
+def segment_region_score(watershed_param, region_score, word_image, pseudo_vis_opt):
+    """Split a region score map into per-component boxes using watershed.
+
+    Pixels scoring above 0.75 (after dividing by 255) seed the components,
+    pixels below 0.05 are marked as background, and everything in between is
+    left for the watershed to assign. ``watershed_param``, ``word_image`` and
+    ``pseudo_vis_opt`` are accepted but not used here.
+
+    Returns:
+        float32 array with one ``[[x_min, y_min], [x_max, y_min],
+        [x_max, y_max], [x_min, y_max]]`` box per seeded component, with all
+        coordinates doubled.
+    """
+    score = np.float32(region_score) / 255
+    sure_fg = np.uint8(score > 0.75)
+    sure_bg = np.uint8(score < 0.05)
+    undecided = 1 - (sure_fg + sure_bg)
+
+    num_labels, markers = cv2.connectedComponents(sure_fg)
+    # Shift all labels up by one so that 0 is free to mean "undecided".
+    markers += 1
+    markers[undecided == 1] = 0
+
+    labels = watershed(-score, markers)
+
+    boxes = []
+    # After the shift, label 1 is the non-seed area; seeded components start at 2.
+    for label in range(2, num_labels + 1):
+        ys, xs = np.where(labels == label)
+        x_min, x_max = xs.min(), xs.max()
+        y_min, y_max = ys.min(), ys.max()
+        corners = np.array(
+            [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]
+        )
+        # NOTE(review): presumably the score map is half the resolution of the
+        # word image, hence the doubling -- confirm against the caller.
+        boxes.append(corners * 2)
+    return np.array(boxes, dtype=np.float32)
+
+
+def exec_watershed_by_version(
+    watershed_param, region_score, word_image, pseudo_vis_opt
+):
+    """Run the watershed implementation selected by ``watershed_param.version``.
+
+    Args:
+        watershed_param: settings object; its ``version`` attribute picks the
+            implementation and the object itself is forwarded to it.
+        region_score: region score map forwarded to the implementation.
+        word_image: word crop forwarded to the implementation.
+        pseudo_vis_opt: visualisation switch forwarded to the implementation.
+
+    Returns:
+        Whatever the selected implementation returns, or ``None`` (after
+        printing a message) when the requested version is not registered.
+        Errors raised by the implementation itself are propagated.
+    """
+
+    func_name_map_dict = {
+        "skimage": segment_region_score,
+    }
+
+    # Guard only the lookup: a failure inside the segmentation must not be
+    # reported as an unknown version.
+    try:
+        segment_fn = func_name_map_dict[watershed_param.version]
+    except (KeyError, TypeError):
+        print(
+            f"Watershed version {watershed_param.version} does not exist in func_name_map_dict."
+        )
+        return None
+
+    return segment_fn(watershed_param, region_score, word_image, pseudo_vis_opt)