4digit_training done

This commit is contained in:
HT
2025-07-11 12:21:59 +08:00
parent cfa52e5f2e
commit c10b0719c7
11 changed files with 5770 additions and 12 deletions

View File

@@ -121,6 +121,9 @@ def hierarchical_dataset(root, opt, select_data='/'):
print(dataset_log)
dataset_log += '\n'
for dirpath, dirnames, filenames in os.walk(root+'/'):
print(f"dirpath : {dirpath}")
print(f"dirnames : {dirnames}")
if not dirnames:
select_flag = False
for selected_d in select_data:
@@ -146,7 +149,7 @@ class OCRDataset(Dataset):
self.root = root
self.opt = opt
print(root)
self.df = pd.read_csv(os.path.join(root,'labels.csv'), sep='^([^,]+),', engine='python', usecols=['filename', 'words'], keep_default_na=False)
self.df = pd.read_csv(os.path.join(root,'labels.csv'), sep='^([^,]+),',dtype={'words': str}, engine='python', usecols=['filename', 'words'], keep_default_na=False)
self.nSamples = len(self.df)
if self.opt.data_filtering_off:
@@ -159,7 +162,7 @@ class OCRDataset(Dataset):
if len(label) > self.opt.batch_max_length:
continue
except:
print(label)
print(f"type of label {type(label)} \n {label}")
out_of_char = f'[^{self.opt.character}]'
if re.search(out_of_char, label.lower()):
continue