testing dataset

2025-07-10 19:42:57 +08:00
commit 185959cf2a
316 changed files with 19605393 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,110 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# macOS
+*.DS_Store
+
+# IDEs
+.vscode/
+.vs/
+.idea/
+trainer/all_data/**
--- a/32
+++ b/32
@@ -0,0 +1,32 @@
+# pytorch_tag pins the base image (default keeps "latest"); gh_username selects a fork, e.g. gh_username=myname
+ARG pytorch_tag=latest
+FROM docker.io/pytorch/pytorch:${pytorch_tag}
+
+ARG gh_username=JaidedAI
+ARG service_home="/home/EasyOCR"
+
+# Install git (with CA certificates for the HTTPS clone) and system libraries; clean apt state in the same layer
+RUN apt-get update -y \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+    ca-certificates \
+    git \
+    libgl1-mesa-dev \
+    libglib2.0-0 \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    && apt-get autoremove -y \
+    && apt-get clean -y \
+    && rm -rf /var/lib/apt/lists/*
+
+# Clone the chosen fork into $service_home, then merge in the latest upstream master
+RUN mkdir -p "$service_home" \
+    && git clone "https://github.com/$gh_username/EasyOCR.git" "$service_home" \
+    && cd "$service_home" \
+    && git remote add upstream "https://github.com/JaidedAI/EasyOCR.git" \
+    && git pull upstream master
+
+# Build extensions in place and install in editable mode; cd (not WORKDIR) leaves the image's working directory untouched
+RUN cd "$service_home" \
+    && python setup.py build_ext --inplace -j 4 \
+    && python -m pip install --no-cache-dir -e .
--- a/201
+++ b/201
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -0,0 +1,8 @@
+include LICENSE.txt
+include README.md
+
+include easyocr/model/*
+include easyocr/character/*
+include easyocr/dict/*
+include easyocr/scripts/compile_dbnet_dcn.py
+recursive-include easyocr/DBNet *
--- a/README.md
+++ b/README.md
@@ -0,0 +1,178 @@
+# EasyOCR
+
+[![PyPI Status](https://badge.fury.io/py/easyocr.svg)](https://badge.fury.io/py/easyocr)
+[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/JaidedAI/EasyOCR/blob/master/LICENSE)
+[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.to/easyocr)
+[![Tweet](https://img.shields.io/twitter/url/https/github.com/JaidedAI/EasyOCR.svg?style=social)](https://twitter.com/intent/tweet?text=Check%20out%20this%20awesome%20library:%20EasyOCR%20https://github.com/JaidedAI/EasyOCR)
+[![Twitter](https://img.shields.io/badge/twitter-@JaidedAI-blue.svg?style=flat)](https://twitter.com/JaidedAI)
+
+Ready-to-use OCR with 80+ [supported languages](https://www.jaided.ai/easyocr) and all popular writing scripts including: Latin, Chinese, Arabic, Devanagari, Cyrillic, etc.
+
+[Try Demo on our website](https://www.jaided.ai/easyocr)
+
+Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/tomofi/EasyOCR)
+
+
+## What's new
+- 24 September 2024 - Version 1.7.2
+    - Fix several compatibility issues
+
+- [Read all release notes](https://github.com/JaidedAI/EasyOCR/blob/master/releasenotes.md)
+
+## What's coming next
+- Handwritten text support
+
+## Examples
+
+![example](examples/example.png)
+
+![example2](examples/example2.png)
+
+![example3](examples/example3.png)
+
+
+## Installation
+
+Install using `pip`
+
+For the latest stable release:
+
+``` bash
+pip install easyocr
+```
+
+For the latest development release:
+
+``` bash
+pip install git+https://github.com/JaidedAI/EasyOCR.git
+```
+
+Note 1: For Windows, please install torch and torchvision first by following the official instructions here https://pytorch.org. On the pytorch website, be sure to select the right CUDA version you have. If you intend to run on CPU mode only, select `CUDA = None`.
+
+Note 2: We also provide a Dockerfile [here](https://github.com/JaidedAI/EasyOCR/blob/master/Dockerfile).
+
+## Usage
+
+``` python
+import easyocr
+reader = easyocr.Reader(['ch_sim','en']) # this needs to run only once to load the model into memory
+result = reader.readtext('chinese.jpg')
+```
+
+The output will be in a list format; each item represents a bounding box, the detected text, and the confidence level, respectively.
+
+``` bash
+[([[189, 75], [469, 75], [469, 165], [189, 165]], '愚园路', 0.3754989504814148),
+ ([[86, 80], [134, 80], [134, 128], [86, 128]], '西', 0.40452659130096436),
+ ([[517, 81], [565, 81], [565, 123], [517, 123]], '东', 0.9989598989486694),
+ ([[78, 126], [136, 126], [136, 156], [78, 156]], '315', 0.8125889301300049),
+ ([[514, 126], [574, 126], [574, 156], [514, 156]], '309', 0.4971577227115631),
+ ([[226, 170], [414, 170], [414, 220], [226, 220]], 'Yuyuan Rd.', 0.8261902332305908),
+ ([[79, 173], [125, 173], [125, 213], [79, 213]], 'W', 0.9848111271858215),
+ ([[529, 173], [569, 173], [569, 213], [529, 213]], 'E', 0.8405593633651733)]
+```
+Note 1: `['ch_sim','en']` is the list of languages you want to read. You can pass
+several languages at once but not all languages can be used together.
+English is compatible with every language and languages that share common characters are usually compatible with each other.
+
+Note 2: Instead of the filepath `chinese.jpg`, you can also pass an OpenCV image object (numpy array) or an image file as bytes. A URL to a raw image is also acceptable.
+
+Note 3: The line `reader = easyocr.Reader(['ch_sim','en'])` is for loading a model into memory. It takes some time but it needs to be run only once.
+
+You can also set `detail=0` for simpler output.
+
+``` python
+reader.readtext('chinese.jpg', detail = 0)
+```
+Result:
+``` bash
+['愚园路', '西', '东', '315', '309', 'Yuyuan Rd.', 'W', 'E']
+```
+
+Model weights for the chosen language will be automatically downloaded or you can
+download them manually from the [model hub](https://www.jaided.ai/easyocr/modelhub) and put them in the '~/.EasyOCR/model' folder.
+
+In case you do not have a GPU, or your GPU has low memory, you can run the model in CPU-only mode by adding `gpu=False`.
+
+``` python
+reader = easyocr.Reader(['ch_sim','en'], gpu=False)
+```
+
+For more information, read the [tutorial](https://www.jaided.ai/easyocr/tutorial) and [API Documentation](https://www.jaided.ai/easyocr/documentation).
+
+#### Run on command line
+
+```shell
+$ easyocr -l ch_sim en -f chinese.jpg --detail=1 --gpu=True
+```
+
+## Train/use your own model
+
+For recognition model, [Read here](https://github.com/JaidedAI/EasyOCR/blob/master/custom_model.md).
+
+For detection model (CRAFT), [Read here](https://github.com/JaidedAI/EasyOCR/blob/master/trainer/craft/README.md).
+
+## Implementation Roadmap
+
+- Handwritten support
+- Restructure code to support swappable detection and recognition algorithms
+The API should be as easy as:
+``` python
+reader = easyocr.Reader(['en'], detection='DB', recognition = 'Transformer')
+```
+The idea is to be able to plug in any state-of-the-art model into EasyOCR. There are a lot of geniuses trying to make better detection/recognition models, but we are not trying to be geniuses here. We just want to make their works quickly accessible to the public ... for free. (well, we believe most geniuses want their work to create a positive impact as fast/big as possible) The pipeline should be something like the below diagram. Grey slots are placeholders for changeable light blue modules.
+
+![plan](examples/easyocr_framework.jpeg)
+
+## Acknowledgement and References
+
+This project is based on research and code from several papers and open-source repositories.
+
+All deep learning execution is based on [Pytorch](https://pytorch.org). :heart:
+
+Detection execution uses the CRAFT algorithm from this [official repository](https://github.com/clovaai/CRAFT-pytorch) and their [paper](https://arxiv.org/abs/1904.01941) (Thanks @YoungminBaek from [@clovaai](https://github.com/clovaai)). We also use their pretrained model. Training script is provided by [@gmuffiness](https://github.com/gmuffiness).
+
+The recognition model is a CRNN ([paper](https://arxiv.org/abs/1507.05717)). It is composed of 3 main components: feature extraction (we are currently using [Resnet](https://arxiv.org/abs/1512.03385) and VGG), sequence labeling ([LSTM](https://www.bioinf.jku.at/publications/older/2604.pdf)), and decoding ([CTC](https://www.cs.toronto.edu/~graves/icml_2006.pdf)). The training pipeline for recognition execution is a modified version of the [deep-text-recognition-benchmark](https://github.com/clovaai/deep-text-recognition-benchmark) framework. (Thanks [@ku21fan](https://github.com/ku21fan) from [@clovaai](https://github.com/clovaai)) This repository is a gem that deserves more recognition.
+
+Beam search code is based on this [repository](https://github.com/githubharald/CTCDecoder) and his [blog](https://towardsdatascience.com/beam-search-decoding-in-ctc-trained-neural-networks-5a889a3d85a7). (Thanks [@githubharald](https://github.com/githubharald))
+
+Data synthesis is based on [TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator). (Thanks [@Belval](https://github.com/Belval))
+
+And a good read about CTC from distill.pub [here](https://distill.pub/2017/ctc/).
+
+## Want To Contribute?
+
+Let's advance humanity together by making AI available to everyone!
+
+3 ways to contribute:
+
+**Coder:** Please send a PR for small bugs/improvements. For bigger ones, discuss with us by opening an issue first. There is a list of possible bug/improvement issues tagged with ['PR WELCOME'](https://github.com/JaidedAI/EasyOCR/issues?q=is%3Aissue+is%3Aopen+label%3A%22PR+WELCOME%22).
+
+**User:** Tell us how EasyOCR benefits you/your organization to encourage further development. Also post failure cases in the [Issues section](https://github.com/JaidedAI/EasyOCR/issues) to help improve future models.
+
+**Tech leader/Guru:** If you found this library useful, please spread the word! (See [Yann Lecun's post](https://www.facebook.com/yann.lecun/posts/10157018122787143) about EasyOCR)
+
+## Guideline for new language request
+
+To request a new language, we need you to send a PR with the 2 following files:
+
+1. In folder [easyocr/character](https://github.com/JaidedAI/EasyOCR/tree/master/easyocr/character),
+we need 'yourlanguagecode_char.txt' that contains a list of all characters. Please see the format examples in other files in that folder.
+2. In folder [easyocr/dict](https://github.com/JaidedAI/EasyOCR/tree/master/easyocr/dict),
+we need 'yourlanguagecode.txt' that contains a list of words in your language.
+On average, we have ~30000 words per language with more than 50000 words for more popular ones.
+More is better in this file.
+
+If your language has unique elements (such as 1. Arabic: characters change form when attached to each other + write from right to left 2. Thai: Some characters need to be above the line and some below), please educate us to the best of your ability and/or give useful links. It is important to take care of the detail to achieve a system that really works.
+
+Lastly, please understand that our priority will have to go to popular languages or sets of languages that share large portions of their characters with each other (also tell us if this is the case for your language). It takes us at least a week to develop a new model, so you may have to wait a while for the new model to be released.
+
+See [List of languages in development](https://github.com/JaidedAI/EasyOCR/issues/91)
+
+## Github Issues
+
+Due to limited resources, an issue older than 6 months will be automatically closed. Please open an issue again if it is critical.
+
+## Business Inquiries
+
+For Enterprise Support, [Jaided AI](https://www.jaided.ai/) offers full service for custom OCR/AI systems from implementation, training/finetuning and deployment. Click [here](https://www.jaided.ai/contactus?ref=github) to contact us.
--- a/custom_model.md
+++ b/custom_model.md
@@ -0,0 +1,24 @@
+# Custom recognition models
+
+## How to train your custom model
+
+You can use your own data or generate your own dataset. To generate your own data, we recommend using
+[TextRecognitionDataGenerator](https://github.com/Belval/TextRecognitionDataGenerator). We provide an example of a dataset [here](https://jaided.ai/easyocr/modelhub/).
+After you have a dataset, you can train your own model by following this repository
+[deep-text-recognition-benchmark](https://github.com/clovaai/deep-text-recognition-benchmark).
+The network needs to be fully convolutional in order to predict flexible text length. Our current network is 'None-VGG-BiLSTM-CTC'.
+Once you have your trained model (a `.pth` file), you need 2 additional files describing recognition network architecture and model configuration.
+An example is provided in `custom_example.zip` file [here](https://jaided.ai/easyocr/modelhub/).
+
+Please do not create an issue about data generation and model training in this repository. If you have any questions regarding data generation and model training, please ask in the respective repositories.
+
+Note: We also provide our version of a training script [here](https://github.com/JaidedAI/EasyOCR/tree/master/trainer). It is a modified version from [deep-text-recognition-benchmark](https://github.com/clovaai/deep-text-recognition-benchmark).
+
+## How to use your custom model
+
+To use your own recognition model, you need the three files as explained above. These three files have to share the same name (e.g. `yourmodel.pth`, `yourmodel.yaml`, `yourmodel.py`) that you will then use to call your model with the EasyOCR API.
+
+We provide [custom_example.zip](https://jaided.ai/easyocr/modelhub/)
+as an example. Please download and extract it, then place `custom_example.py` and `custom_example.yaml` in the `user_network_directory` (default = `~/.EasyOCR/user_network`) and place `custom_example.pth` in the model directory (default = `~/.EasyOCR/model`).
+Once you place all 3 files in their respective places, you can use `custom_example` by
+specifying `recog_network` like this `reader = easyocr.Reader(['en'], recog_network='custom_example')`.
--- a/easyocr/DBNet/DBNet.py
+++ b/easyocr/DBNet/DBNet.py
@@ -0,0 +1,766 @@
+'''
+Created by Jaided AI
+Released Date: 18/08/2022
+Description:
+DBNet text detection module. 
+Many parts of the codes are adapted from https://github.com/MhLiao/DB
+'''
+import os
+import math
+import yaml
+from shapely.geometry import Polygon
+import PIL.Image
+import numpy as np
+import cv2
+import pyclipper
+import torch
+
+from .model.constructor import Configurable
+# %%
+class DBNet:
+    def __init__(self, 
+                 backbone = "resnet18",
+                 weight_dir = None,
+                 weight_name = 'pretrained',
+                 initialize_model = True,
+                 dynamic_import_relative_path = None,
+                 device = 'cuda', 
+                 verbose = 0):
+        '''
+        DBNet text detector class
+
+        Parameters
+        ----------
+        backbone : str, optional
+            Backbone to use. Options are "resnet18" and "resnet50". The default is "resnet18".
+        weight_dir : str, optional
+            Path to directory that contains weight files. If set to None, the path will be set
+            to "../weights/". The default is None.
+        weight_name : str, optional
+            Name of the weight to use as specified in DBNet_inference.yaml or a filename 
+            in weight_dir. The default is 'pretrained'.
+        initialize_model : Boolean, optional
+            If True, construct the model and load weight at class initialization.
+            Otherwise, only initial the class without constructing the model.
+            The default is True.
+        dynamic_import_relative_path : str, optional
+            Relative path to 'model/detector.py'. This option is for supporting
+            integrating this module into other modules. For example, easyocr/DBNet
+            This should be left as None when calling this module as a standalone. 
+            The default is None.
+        device : str, optional
+            Device to use. Options are "cuda" and "cpu". The default is 'cuda'.
+        verbose : int, optional
+            Verbosity level. The default is 0.
+
+        Raises
+        ------
+        ValueError
+            Raised when backbone is invalid.
+        FileNotFoundError
+            Raised when weight file is not found.
+
+        Returns
+        -------
+        None.
+        '''
+        self.device = device
+        
+        config_path = os.path.join(os.path.dirname(__file__), "configs", "DBNet_inference.yaml")
+        with open(config_path, 'r') as fid:
+            self.configs = yaml.safe_load(fid)
+
+        if dynamic_import_relative_path is not None:
+            self.configs = self.set_relative_import_path(self.configs, dynamic_import_relative_path)
+
+        if backbone in self.configs.keys():
+            self.backbone = backbone
+        else:
+            raise ValueError("Invalid backbone. Current support backbone are {}.".format(",".join(self.configs.keys())))
+
+        if weight_dir is not None:
+            self.weight_dir = weight_dir
+        else:    
+            self.weight_dir = os.path.join(os.path.dirname(__file__), 'weights')
+
+        if initialize_model:
+            if weight_name in self.configs[backbone]['weight'].keys():
+                weight_path = os.path.join(self.weight_dir, self.configs[backbone]['weight'][weight_name])
+                error_message = "A weight with a name {} is found in DBNet_inference.yaml but cannot be find file: {}."
+            else:
+                weight_path = os.path.join(self.weight_dir, weight_name)
+                error_message = "A weight with a name {} is not found in DBNet_inference.yaml and cannot be find file: {}."
+                
+            if not os.path.isfile(weight_path):
+                raise FileNotFoundError(error_message.format(weight_name, weight_path))
+                
+            self.initialize_model(self.configs[backbone]['model'], weight_path)
+        
+        else:
+            self.model = None
+
+        self.BGR_MEAN = np.array(self.configs['BGR_MEAN'])
+        self.min_detection_size = self.configs['min_detection_size']
+        self.max_detection_size = self.configs['max_detection_size']
+
+    def set_relative_import_path(self, configs, dynamic_import_relative_path):
+        '''
+        Create relative import paths for modules specified in class. This method
+        is recursive.
+
+        Parameters
+        ----------
+        configs : dict
+            Configuration dictionary from .yaml file.
+        dynamic_import_relative_path : str, optional
+            Relative path to 'model/detector/'. This option is for supporting
+            integrating this module into other modules. For example, easyocr/DBNet
+            This should be left as None when calling this module as a standalone. 
+            The default is None.
+        
+        Returns
+        -------
+        configs : dict
+            Configuration dictionary with correct relative path.
+        '''
+        assert dynamic_import_relative_path is not None
+        prefices = dynamic_import_relative_path.split(os.sep)
+        for key,value in configs.items():
+            if key == 'class':
+                configs.update({key: ".".join(prefices + value.split("."))})
+            else:
+                if isinstance(value, dict):
+                    value = self.set_relative_import_path(value, dynamic_import_relative_path)
+                else:
+                    pass
+        return configs
+
+    def load_weight(self, weight_path):
+        '''
+        Load weight to model.
+
+        Parameters
+        ----------
+        weight_path : str
+            Path to trained weight.
+
+        Raises
+        ------
+        RuntimeError
+            Raised when the model has not yet been contructed.
+
+        Returns
+        -------
+        None.
+        '''
+        if self.model is None:
+            raise RuntimeError("model has not yet been constructed.")
+        self.model.load_state_dict(torch.load(weight_path, map_location=self.device), strict=False)
+        self.model.eval()
+        
+    def construct_model(self, config):
+        '''
+        Contruct text detection model based on the configuration in .yaml file.
+
+        Parameters
+        ----------
+        config : dict
+            Configuration dictionary.
+
+        Returns
+        -------
+        None.
+        '''
+        self.model = Configurable.construct_class_from_config(config).structure.builder.build(self.device)
+
+    def initialize_model(self, model_config, weight_path):
+        '''
+        Wrapper to initialize text detection model. This model includes contructing
+        and weight loading.
+
+        Parameters
+        ----------
+        model_config : dict
+            Configuration dictionary.
+        weight_path : str
+            Path to trained weight.
+
+        Returns
+        -------
+        None.
+        '''
+        self.construct_model(model_config)
+        self.load_weight(weight_path)
+        if isinstance(self.model.model, torch.nn.DataParallel) and self.device == 'cpu':
+            self.model.model = self.model.model.module.to(self.device)    
+
+    def get_cv2_image(self, image):
+        '''
+        Load or convert input to OpenCV BGR image numpy array.
+
+        Parameters
+        ----------
+        image : str, PIL.Image, or np.ndarray
+            Image to load or convert.
+
+        Raises
+        ------
+        FileNotFoundError
+            Raised when the input is a path to file (str), but the file is not found.
+        TypeError
+            Raised when the data type of the input is not supported.
+
+        Returns
+        -------
+        image : np.ndarray
+            OpenCV BGR image.
+        '''
+        if isinstance(image, str):
+            if os.path.isfile(image):
+                image = cv2.imread(image, cv2.IMREAD_COLOR).astype('float32')
+            else:
+                raise FileNotFoundError("Cannot find {}".format(image))
+        elif isinstance(image, np.ndarray):
+            image = image.astype('float32')
+        elif isinstance(image, PIL.Image.Image):
+            image = np.asarray(image)[:, :, ::-1]
+        else:
+            raise TypeError("Unsupport image format. Only path-to-file, opencv BGR image, and PIL image are supported.")
+
+        return image
+
+    def resize_image(self, img, detection_size = None):
+        '''
+        Resize image such that the shorter side of the image is equal to the 
+        closest multiple of 32 to the provided detection_size. If detection_size
+        is not provided, it will be resized to the closest multiple of 32 each
+        side. If the original size exceeds the min-/max-detection sizes 
+        (specified in configs.yaml), it will be resized to be within the 
+        min-/max-sizes.
+
+        Parameters
+        ----------
+        img : np.ndarray
+            OpenCV BGR image.
+        detection_size : int, optional
+            Target detection size. The default is None.
+
+        Returns
+        -------
+        np.ndarray
+            Resized OpenCV BGR image. The width and height of this image should
+            be multiple of 32.
+        '''
+        height, width, _ = img.shape
+        if detection_size is None:
+            detection_size = max(self.min_detection_size, min(height, width, self.max_detection_size))
+        
+        if height < width:
+            new_height = int(math.ceil(detection_size / 32) * 32)
+            new_width = int(math.ceil(new_height / height * width / 32) * 32)
+        else:
+            new_width = int(math.ceil(detection_size / 32) * 32)
+            new_height = int(math.ceil(new_width / width * height / 32) * 32)
+        resized_img = cv2.resize(img, (new_width, new_height))
+
+        return resized_img, (height, width)
+
+    def image_array2tensor(self, image):
+        '''
+        Convert image array (assuming OpenCV BGR format) to image tensor.
+
+        Parameters
+        ----------
+        image : np.ndarray
+            OpenCV BGR image.
+
+        Returns
+        -------
+        torch.tensor
+            Tensor image with 4 dimension [batch, channel, width, height].
+        '''
+        return torch.from_numpy(image).permute(2, 0, 1).float().unsqueeze(0)
+
+    def normalize_image(self, image):
+        '''
+        Normalize image by substracting BGR mean and divided by 255
+
+        Parameters
+        ----------
+        image : np.ndarray
+            OpenCV BGR image.
+
+        Returns
+        -------
+        np.ndarray
+            OpenCV BGR image.
+        '''
+        return (image - self.BGR_MEAN)/255.0    
+       
+    def load_image(self, image_path, detection_size = 0):
+        '''
+        Wrapper to load and convert an image to an image tensor
+
+        Parameters
+        ----------
+        image : path-to-file, PIL.Image, or np.ndarray
+            Image to load or convert.
+        detection_size : int, optional
+            Target detection size. The default is None.
+
+        Returns
+        -------
+        img : torch.tensor
+            Tensor image with 4 dimension [batch, channel, width, height]..
+        original_shape : tuple
+            A tuple (height, width) of the original input image before resizing.
+        '''
+        img =self.get_cv2_image(image_path)
+        img, original_shape = self.resize_image(img, detection_size = detection_size)
+        img = self.normalize_image(img)
+        img = self.image_array2tensor(img)
+
+        return img, original_shape
+    
+    def load_images(self, images, detection_size = None):
+        '''
+        Wrapper to load or convert list of multiple images to a single image 
+        tensor. Multiple images are concatenated together on the first dimension.
+        
+        Parameters
+        ----------
+        images : a list of path-to-file, PIL.Image, or np.ndarray
+            Image to load or convert.
+        detection_size : int, optional
+            Target detection size. The default is None.
+
+        Returns
+        -------
+        img : torch.tensor
+            A single tensor image with 4 dimension [batch, channel, width, height].
+        original_shape : tuple
+            A list of tuples (height, width) of the original input image before resizing.
+        '''
+        images, original_shapes = zip(*[self.load_image(image, detection_size = detection_size) 
+                                        for image in images])
+        return torch.cat(images, dim = 0), original_shapes
+    
+    def hmap2bbox(self, 
+                  image_tensor, 
+                  original_shapes,
+                  hmap, 
+                  text_threshold = 0.2, 
+                  bbox_min_score = 0.2, 
+                  bbox_min_size = 3, 
+                  max_candidates = 0, 
+                  as_polygon=False):
+        '''
+        Translate probability heatmap tensor to text region boudning boxes.
+
+        Parameters
+        ----------
+        image_tensor : torch.tensor
+            Image tensor.
+        original_shapes : tuple
+            Original size of the image (height, width) of the input image (before
+            rounded to the closest multiple of 32).
+        hmap : torch.tensor
+            Probability heatmap tensor.
+        text_threshold : float, optional
+            Minimum probability for each pixel of heatmap tensor to be considered
+            as a valid text pixel. The default is 0.2.
+        bbox_min_score : float, optional
+            Minimum score for each detected bounding box to be considered as a
+            valid text bounding box. The default is 0.2.
+        bbox_min_size : int, optional
+            Minimum size for each detected bounding box to be considered as a
+            valid text bounding box. The default is 3.
+        max_candidates : int, optional
+            Maximum number of detected bounding boxes to be considered as 
+            candidates for valid text bounding box. Setting it to 0 implies
+            no maximum. The default is 0.
+        as_polygon : boolean, optional
+            If True, return the bounding box as polygon (fine vertrices), 
+            otherwise return as rectangular. The default is False.
+
+        Returns
+        -------
+        boxes_batch : list of lists
+            Bounding boxes of each text box.
+        scores_batch : list of floats
+            Confidence scores of each text box.
+
+        '''
+        segmentation = self.binarize(hmap, threshold = text_threshold)
+        boxes_batch = []
+        scores_batch = []
+        for batch_index in range(image_tensor.size(0)):
+            height, width = original_shapes[batch_index]
+            if as_polygon:
+                boxes, scores = self.polygons_from_bitmap(
+                                        hmap[batch_index],
+                                        segmentation[batch_index], 
+                                        width, 
+                                        height, 
+                                        bbox_min_score = bbox_min_score, 
+                                        bbox_min_size = bbox_min_size, 
+                                        max_candidates = max_candidates)
+            else:
+                boxes, scores = self.boxes_from_bitmap(
+                                        hmap[batch_index],
+                                        segmentation[batch_index], 
+                                        width, 
+                                        height, 
+                                        bbox_min_score = bbox_min_score, 
+                                        bbox_min_size = bbox_min_size, 
+                                        max_candidates = max_candidates)
+
+            boxes_batch.append(boxes)
+            scores_batch.append(scores)
+            
+        boxes_batch, scores_batch = zip(*[zip(*[(box, score) 
+                                                for (box,score) in zip(boxes, scores) if score > 0]
+                                             ) if any(scores > 0) else [(),()]
+                                         for (boxes, scores) in zip(boxes_batch, scores_batch)]
+                                       )
+            
+        return boxes_batch, scores_batch
+    
+    def binarize(self, tensor, threshold):
+        '''
+        Apply threshold to return boolean tensor.
+
+        Parameters
+        ----------
+        tensor : torch.tensor
+            input tensor.
+        threshold : float
+            Threshold.
+
+        Returns
+        -------
+        torch.tensor
+            Boolean tensor.
+
+        '''
+        return tensor > threshold
+    
+    def polygons_from_bitmap(self, 
+                             hmap,
+                             segmentation,
+                             dest_width, 
+                             dest_height, 
+                             bbox_min_score = 0.2, 
+                             bbox_min_size = 3, 
+                             max_candidates = 0):
+        '''
+        Translate boolean tensor to fine polygon indicating text bounding boxes
+
+        Parameters
+        ----------
+        hmap : torch.tensor
+            Probability heatmap tensor.
+        segmentation : torch.tensor
+            Segmentataion tensor.
+        dest_width : TYPE
+            target width of the output.
+        dest_height : TYPE
+            target width of the output.
+        bbox_min_score : float, optional
+            Minimum score for each detected bounding box to be considered as a
+            valid text bounding box. The default is 0.2.
+        bbox_min_size : int, optional
+            Minimum size for each detected bounding box to be considered as a
+            valid text bounding box. The default is 3.
+        max_candidates : int, optional
+            Maximum number of detected bounding boxes to be considered as 
+            candidates for valid text bounding box. Setting it to 0 implies
+            no maximum. The default is 0.
+        
+        Returns
+        -------
+        boxes_batch : list of lists
+            Polygon bounding boxes of each text box.
+        scores_batch : list of floats
+            Confidence scores of each text box.
+
+        '''
+        assert segmentation.size(0) == 1
+        bitmap = segmentation.cpu().numpy()[0]  # The first channel
+        hmap = hmap.cpu().detach().numpy()[0]
+        height, width = bitmap.shape
+        boxes = []
+        scores = []
+    
+        contours, _ = cv2.findContours(
+            (bitmap*255).astype(np.uint8),
+            cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+        
+        if max_candidates > 0:
+            contours = contours[:max_candidates]
+        
+        for contour in contours:
+            epsilon = 0.002 * cv2.arcLength(contour, True)
+            approx = cv2.approxPolyDP(contour, epsilon, True)
+            points = approx.reshape((-1, 2))
+            if points.shape[0] < 4:
+                continue
+
+            score = self.box_score_fast(hmap, points.reshape(-1, 2))
+            if score < bbox_min_score:
+                continue
+            
+            if points.shape[0] > 2:
+                box = self.unclip(points, unclip_ratio=2.0)
+                if len(box) > 1:
+                    continue
+
+            else:
+                continue
+
+            box = box.reshape(-1, 2)
+            _, sside = self.get_mini_boxes(box.reshape((-1, 1, 2)))
+            if sside < bbox_min_size + 2:
+                continue
+    
+            if not isinstance(dest_width, int):
+                dest_width = dest_width.item()
+                dest_height = dest_height.item()
+            
+            box[:, 0] = np.clip(
+                np.round(box[:, 0] / width * dest_width), 0, dest_width)
+            box[:, 1] = np.clip(
+                np.round(box[:, 1] / height * dest_height), 0, dest_height)
+            boxes.append(box.tolist())
+            scores.append(score)
+
+        return boxes, scores
+    
+    def boxes_from_bitmap(self, 
+                          hmap,
+                          segmentation,
+                          dest_width, 
+                          dest_height, 
+                          bbox_min_score = 0.2, 
+                          bbox_min_size = 3, 
+                          max_candidates = 0):
+        '''
+        Translate boolean tensor to fine polygon indicating text bounding boxes
+
+        Parameters
+        ----------
+        hmap : torch.tensor
+            Probability heatmap tensor.
+        segmentation : torch.tensor
+            Segmentataion tensor.
+        dest_width : TYPE
+            target width of the output.
+        dest_height : TYPE
+            target width of the output.
+        bbox_min_score : float, optional
+            Minimum score for each detected bounding box to be considered as a
+            valid text bounding box. The default is 0.2.
+        bbox_min_size : int, optional
+            Minimum size for each detected bounding box to be considered as a
+            valid text bounding box. The default is 3.
+        max_candidates : int, optional
+            Maximum number of detected bounding boxes to be considered as 
+            candidates for valid text bounding box. Setting it to 0 implies
+            no maximum. The default is 0.
+        
+        Returns
+        -------
+        boxes_batch : list of lists
+            Polygon bounding boxes of each text box.
+        scores_batch : list of floats
+            Confidence scores of each text box.
+        '''        
+        assert segmentation.size(0) == 1
+        bitmap = segmentation.cpu().numpy()[0]  # The first channel
+        hmap = hmap.cpu().detach().numpy()[0]
+        height, width = bitmap.shape
+        contours, _ = cv2.findContours(
+                            (bitmap*255).astype(np.uint8),
+                            cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+        if max_candidates > 0:
+            num_contours = min(len(contours), max_candidates)
+        else:
+            num_contours = len(contours)
+
+        boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
+        scores = np.zeros((num_contours,), dtype=np.float32)
+    
+        for index in range(num_contours):
+            contour = contours[index]
+            points, sside = self.get_mini_boxes(contour)
+            if sside < bbox_min_size:
+                continue
+
+            points = np.array(points)
+            score = self.box_score_fast(hmap, points.reshape(-1, 2))
+            if score < bbox_min_score:
+                continue
+        
+            box = self.unclip(points).reshape(-1, 1, 2)
+            box, sside = self.get_mini_boxes(box)
+            if sside < bbox_min_size + 2:
+                continue
+
+            box = np.array(box)
+            if not isinstance(dest_width, int):
+                dest_width = dest_width.item()
+                dest_height = dest_height.item()
+            
+            box[:, 0] = np.clip(
+                np.round(box[:, 0] / width * dest_width), 0, dest_width)
+            box[:, 1] = np.clip(
+                np.round(box[:, 1] / height * dest_height), 0, dest_height)
+            boxes[index, :, :] = box.astype(np.int16)
+            scores[index] = score
+
+        return boxes.tolist(), scores
+    
+    def unclip(self, box, unclip_ratio=1.5):
+        poly = Polygon(box)
+        distance = poly.area * unclip_ratio / poly.length
+        offset = pyclipper.PyclipperOffset()
+        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+        expanded = np.array(offset.Execute(distance))
+
+        return expanded
+    
+    def get_mini_boxes(self, contour):
+        bounding_box = cv2.minAreaRect(contour)
+        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])
+    
+        index_1, index_2, index_3, index_4 = 0, 1, 2, 3
+        if points[1][1] > points[0][1]:
+            index_1 = 0
+            index_4 = 1
+        else:
+            index_1 = 1
+            index_4 = 0
+        if points[3][1] > points[2][1]:
+            index_2 = 2
+            index_3 = 3
+        else:
+            index_2 = 3
+            index_3 = 2
+    
+        box = [points[index_1], points[index_2],
+               points[index_3], points[index_4]]
+
+        return box, min(bounding_box[1])
+    
+    def box_score_fast(self, hmap, box_):
+        '''
+        Calculate total score of each bounding box
+
+        Parameters
+        ----------
+        hmap : torch.tensor
+            Probability heatmap tensor.
+        box_ : list
+            Rectanguar bounding box.
+
+        Returns
+        -------
+        float
+            Confidence score.
+        '''
+        h, w = hmap.shape[:2]
+        box = box_.copy()
+        xmin = np.clip(np.floor(box[:, 0].min()).astype(np.int32), 0, w - 1)
+        xmax = np.clip(np.ceil(box[:, 0].max()).astype(np.int32), 0, w - 1)
+        ymin = np.clip(np.floor(box[:, 1].min()).astype(np.int32), 0, h - 1)
+        ymax = np.clip(np.ceil(box[:, 1].max()).astype(np.int32), 0, h - 1)
+    
+        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
+        box[:, 0] = box[:, 0] - xmin
+        box[:, 1] = box[:, 1] - ymin
+        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
+
+        return cv2.mean(hmap[ymin:ymax+1, xmin:xmax+1], mask)[0]
+    
+    def image2hmap(self, image_tensor):
+        '''
+        Run the model to obtain a heatmap tensor from a image tensor. The heatmap
+        tensor indicates the probability of each pixel being a part of text area.
+
+        Parameters
+        ----------
+        image_tensor : torch.tensor
+            Image tensor.
+
+        Returns
+        -------
+        torch.tensor
+            Probability heatmap tensor.
+        '''
+        return self.model.forward(image_tensor, training=False)
+        
+    def inference(self, 
+                  image,
+                  text_threshold = 0.2, 
+                  bbox_min_score = 0.2, 
+                  bbox_min_size = 3, 
+                  max_candidates = 0, 
+                  detection_size = None,
+                  as_polygon = False,
+                  return_scores = False):
+        '''
+        Wrapper to run the model on an input image to get text bounding boxes.
+
+        Parameters
+        ----------
+        image : path-to-file, PIL.Image, or np.ndarray
+            Image to load or convert.
+        text_threshold : float, optional
+            Minimum probability for each pixel of heatmap tensor to be considered
+            as a valid text pixel. The default is 0.2.
+        bbox_min_score : float, optional
+            Minimum score for each detected bounding box to be considered as a
+            valid text bounding box. The default is 0.2.
+        bbox_min_size : int, optional
+            Minimum size for each detected bounding box to be considered as a
+            valid text bounding box. The default is 3.
+        max_candidates : int, optional
+            Maximum number of detected bounding boxes to be considered as 
+            candidates for valid text bounding box. Setting it to 0 implies
+            no maximum. The default is 0.
+        detection_size : int, optional
+            Target detection size. Please see docstring under method resize_image()
+            for explanation. The default is None.
+        as_polygon : boolean, optional
+            If true, return the bounding boxes as find polygons, otherwise, return
+            as rectagular. The default is False.
+        return_scores : boolean, optional
+            If true, return confidence score along with the text bounding boxes.
+            The default is False.
+
+        Returns
+        -------
+        list of lists
+            Text bounding boxes. If return_scores is set to true, another list
+            of lists will also be returned.
+
+        '''
+        if not isinstance(image, list):
+            image = [image]
+
+        image_tensor, original_shapes = self.load_images(image, detection_size = detection_size)
+        with torch.no_grad():
+            hmap = self.image2hmap(image_tensor)
+            batch_boxes, batch_scores = self.hmap2bbox(image_tensor, 
+                                                       original_shapes,
+                                                       hmap, 
+                                                       text_threshold = text_threshold, 
+                                                       bbox_min_score = bbox_min_score, 
+                                                       bbox_min_size = bbox_min_size, 
+                                                       max_candidates = max_candidates, 
+                                                       as_polygon=as_polygon) 
+        
+        if return_scores:
+            return batch_boxes, batch_scores
+        else:
+            return batch_boxes
--- a/easyocr/DBNet/README.md
+++ b/easyocr/DBNet/README.md
@@ -0,0 +1,172 @@
+# DBNet - Inference Only
+This text detection module is adapted from [DBNet++](https://github.com/MhLiao/DB).
+
+## 1. Overview
+DBNet works as an image segmentation model that performs classification at the pixel level. The model classifies whether each pixel of the input image is part of a text region. This module uses dynamic import and class construction from a config file. Config files are expected to be found in `./configs/`. The input image is expected to have a width and height that are multiples of 32. Input images that do not have these dimensions will be resized accordingly. In addition, minimum and maximum sizes can be specified in the config file.
+
+### 1.1) Terminology
+  * Probability Heatmap: A tensor that represents the classification confidence of each pixel being part of a text region.
+  * Segmentation: A boolean-like tensor that represents the region determined to be a text region.
+  * text_threshold: A threshold for each element of the probability heatmap to be considered as a text region.
+  * detection_size: This term is used to refer to the size of the image on which the detection routine will be performed. Input images that are not of this size will be resized accordingly.
+
+### 1.2) Changes from the original repo
+  1. Scripts inside `./concerns/` and multiple `.yaml` files are consolidated and pruned for inference-only implementation and dependencies reduction.
+  2. DCN operators, which are required to be compiled Ahead-of-Time (AoT) in the original repo, are changed to compile with the Just-in-Time (JIT) approach by default. The AoT approach is still supported.
+  3. DCN CPU version is provided in addition to the CUDA version from the original repo.
+  4. Pretrained weights are renamed for easier reference, and a file extension is added.
+   
+  | Original name                                       | New name                  |
+  |-----------------------------------------------------|---------------------------|
+  |synthtext_finetune_ic15_res18_dcn_fpn_dbv2           |pretrained_ic15_resnet18.pt|
+  |synthtext_finetune_ic15_res50_dcn_fpn_dbv2_thresh0.25|pretrained_ic15_resnet50.pt|
+  
+
+## 2. Using and Compiling DCN operators
+DBNet requires DCN operators to be compiled. Two versions of DCN are provided: a CPU version and a CUDA version (original). The CUDA version works significantly faster, but requires a CUDA-capable GPU and the CUDA developer toolkit. The CPU version can work without a GPU and CUDA. The compilation prerequisites and instructions can be found below.
+
+Please note that EasyOCR **can work** without DBNet and DCN operators by using CRAFT text detection (the default detector module).
+
+### 2.1) Prerequisites
+##### CPU version
+ * GCC compiler > 4.9
+
+##### CUDA version
+ * GCC compiler > 4.9
+ * [CUDA Developer Toolkits](https://developer.nvidia.com/cuda-toolkit) > 9.0 (Tested on 11.3). 
+
+### 2.2) Installing Dependencies
+
+Some step-by-step procedure to install the prerequisites is listed below. Please note that there are other methods that work as well. These methods are listed only to serve as a guideline.
+
+#### Installing GCC Compiler
+*Step 1*: Check if your machine already has GCC installed.
+
+On command line terminal (Linux/Mac/Windows);
+```
+> gcc --version
+```
+If you already have GCC installed, it will report the version of GCC on your machine. If the command gives an error message along the line of command not found, it implies you do not have GCC installed. 
+
+*Step 2*: Install GCC.
+
+To install GCC, you can do one of the following commands, depending on the privileges of your user account on your machine (Linux/Debian/Ubuntu)
+```
+> apt-get install build-essential
+```
+or
+```
+> sudo apt-get install build-essential
+```
+For Mac and Windows users, please follow the respective official instructions.
+
+*Step 3*: Verification
+Repeat Step 1 to make sure that you now have GCC installed.
+
+#### Installing CUDA and NVCC Compiler
+*Step 1*: Check if your machine already has NVCC and CUDA toolkit installed.
+On command line terminal (Linux/Mac/Windows);
+```
+> nvcc --version
+```
+If you already have NVCC installed, it will report the NVCC version on your machine. If the command gives an error message along the line of command not found, it implies you do not have NVCC installed.
+
+*Step 2*: Install NVCC and CUDA developer toolkit.
+
+Option 1: The official instruction can be found [here](https://developer.nvidia.com/cuda-downloads).
+
+Option 2:
+Alternatively, you can try install NVCC with [conda](https://docs.conda.io/projects/conda/en/latest/index.html) (package management system and environment management system).
+
+To use conda to install NVCC, you can do;
+*Linux/Mac/Windows*
+```
+> conda install -c conda-forge cudatoolkit-dev 
+```
+Note that the above command may fail if your machine is missing some library, such as libxml2. If such error occurs, please install the missing libraries and try again.
+
+*Step 3*: Verification
+Repeat Step 1 to make sure that you now have NVCC installed.
+
+#### Installing conda
+Step 1: Check if your machine already has conda installed.
+On command line terminal, (Linux/Mac/Windows)
+```
+> conda --version
+```
+If you already have conda installed, it will report the version of conda on your machine. If the command gives an error message along the line of command not found, it implies you do not have conda installed.
+
+Step 2: Install conda
+Please follow the [official instruction](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html) to install it according to your OS.
+
+Step 3: Verification
+Repeat Step 1 to make sure that you now have conda installed.
+
+#### Using Docker image
+For Docker users, please use development level images. For example, pytorch/pytorch:x.xx.x-cudax.x-cudnnx-devel. You can verify if all prerequisites are provided by the image by checking
+```
+gcc --version
+```
+and
+```
+nvcc --version
+```
+
+If you already have GCC/NVCC installed, each command will report the version of GCC/NVCC on your machine. If the command gives an error message along the line of command not found, it implies you do not have GCC/NVCC installed. 
+
+### 2.3 Compiling DCN Just-in-Time (JiT)
+Once all of the prerequisites have been installed, you can start using `dbnet18` as the detect_network for EasyOCR. The module will compile the source codes automatically when needed. The compilation may take a while if the modules are being loaded and compiled for the first time.
+
+### 2.4 Compiling DCN Ahead-of-Time (AoT)
+You can also try compiling DCN with Ahead-of-Time approach. The following procedure may serve as a guideline.
+
+#### 2.4.1 Locate EasyOCR and DBNet module inside it
+Start python console environment of your choice, such as Jupyter notebook and Spyder IDE. You can also start one from command line interface (Linux/Mac terminal, etc.) by calling `python` or `python3`;
+
+In python console environment;
+```
+> import os
+> import easyocr
+> print(os.path.dirname(easyocr.__file__))
+```
+This should show the installation location of easyocr on your machine.
+
+The exact output of the above command depends on many factors and will likely be unique for each user, especially the `username`. As an example, let's assume the command above returns something like;
+```
+> /home/username/anaconda3/lib/python3.8/site-packages/easyocr
+```
+
+We want to go into the directory where `DBNet` and the DCN source files are located within EasyOCR which can be done by appending `DBNet/assets/ops/dcn` to the path obtained above. For example;
+```
+/home/username/anaconda3/lib/python3.8/site-packages/easyocr/DBNet/assets/ops/dcn
+```
+Access the above directory with any File Manager app on your machine of your choice, for example, Explorer (Windows), Nautilus (Linux/GNOME), Finder (MAC). Or use the following command in the command line interface;
+```
+> cd /home/username/anaconda3/lib/python3.8/site-packages/easyocr/DBNet/assets/ops/dcn
+```
+
+#### 2.4.2 Compiling DCN operator manually with setup.py script
+
+First, go to the DCN operator subdirectory of the DBNet module inside the EasyOCR directory (e.g. `/home/username/anaconda3/lib/python3.8/site-packages/easyocr/DBNet/assets/ops/dcn`), as described in Section 2.4.1.
+
+Verify that a script `setup.py` is found in that directory. (This version of `setup.py` script is different from the original version from [DBNet++](https://github.com/MhLiao/DB) since the support for CPU has been added.) Once the script is located, run the following command;
+```
+> python setup.py build_ext --inplace
+```
+This will start the compilation process and you can monitor the progress, including error messages, if any, on the command line interface. If there is any error, please resolve them, and try again. Once the compilation has been completed, new files will be added to the current directory (i.e. `/home/username/anaconda3/lib/python3.8/site-packages/easyocr/DBNet/assets/ops/dcn`). If your machine has only CPU, but no CUDA device (GPU), two files will be added to the directory. **Please note that the exact names of the files will be different depending on the configuration of your machine.** The file names should look like;
+```
+deform_conv_cpu.******.so
+deform_pool_cpu.******.so
+```
+If your machine also has CUDA device, two additional files will be added (4 files in total). The file names of these files should look like;
+```
+deform_conv_cuda.******.so
+deform_pool_cuda.******.so
+```
+
+### 3. Using DBNet Detector
+When initializing EasyOCR with DBNet as the detect network for the first time in the current working session, messages will be printed to indicate whether the DCN operators are loaded from objects compiled with the AoT approach (pre-compiled) or the source code is being compiled with the JiT approach.
+
+
+
+
--- a/easyocr/DBNet/assets/ops/dcn/init.py
+++ b/easyocr/DBNet/assets/ops/dcn/init.py
@@ -0,0 +1,13 @@
+from .functions.deform_conv import deform_conv, modulated_deform_conv
+from .functions.deform_pool import deform_roi_pooling
+from .modules.deform_conv import (DeformConv, ModulatedDeformConv,
+                                  DeformConvPack, ModulatedDeformConvPack)
+from .modules.deform_pool import (DeformRoIPooling, DeformRoIPoolingPack,
+                                  ModulatedDeformRoIPoolingPack)
+
+__all__ = [
+    'DeformConv', 'DeformConvPack', 'ModulatedDeformConv',
+    'ModulatedDeformConvPack', 'DeformRoIPooling', 'DeformRoIPoolingPack',
+    'ModulatedDeformRoIPoolingPack', 'deform_conv', 'modulated_deform_conv',
+    'deform_roi_pooling'
+]
--- a/easyocr/DBNet/assets/ops/dcn/functions/init.py
+++ b/easyocr/DBNet/assets/ops/dcn/functions/init.py
--- a/easyocr/DBNet/assets/ops/dcn/functions/deform_conv.py
+++ b/easyocr/DBNet/assets/ops/dcn/functions/deform_conv.py
@@ -0,0 +1,271 @@
+'''
+Modified by Jaided AI
+Released Date: 31/08/2022
+Description:
+- Add support for Deformable convolution operator on CPU for forward propagation.
+- Change to Just-in-Time loading approach
+'''
+import os
+import torch
+import warnings
+from torch.autograd import Function
+from torch.nn.modules.utils import _pair
+from torch.utils import cpp_extension
+
+# TODO - Jaided AI: 
+# 1. Find a better way to handle and support both Ahead-of-Time (AoT) and Just-in-Time (JiT) compilation.
+# 2. Find a better way to report error to help pinpointing issues if there is any.
+# Note on JiT and AoT compilation:
+# This module supports both AoT and JiT compilation approaches. JiT is hardcoded as the default. If AoT compiled objects are present, they will supersede JiT compilation.
+ 
+def custom_formatwarning(msg, *args, **kwargs):
+    # ignore everything except the message
+    return str(msg) + '\n'
+
+warnings.formatwarning = custom_formatwarning
+dcn_dir = os.path.dirname(os.path.dirname(__file__))
+try:
+    from .. import deform_conv_cpu
+    warnings.warn("Using precompiled deform_conv_cpu from {}".format(deform_conv_cpu.__file__))
+    dcn_cpu_ready = True
+except:
+    try:
+        warnings.warn("Compiling deform_conv_cpu ...")
+        warnings.warn("(This may take a while if this module is loaded for the first time.)")
+        deform_conv_cpu = cpp_extension.load(
+                            name="deform_conv_cpu", 
+                            sources=[os.path.join(dcn_dir, 'src', "deform_conv_cpu.cpp"),
+                                     os.path.join(dcn_dir, 'src', "deform_conv_cpu_kernel.cpp")])
+        warnings.warn("Done.")
+        dcn_cpu_ready = True
+    except Exception as error:
+        warnings.warn(' '.join([
+            "Failed to import and/or compile 'deform_conv_cpu' with the following error",
+            "{}".format(error),
+            "Deformable convulution and DBNet will not be able to run on CPU."
+            ]))
+        dcn_cpu_ready = False
+
+if torch.cuda.is_available():
+    try:
+        from .. import deform_conv_cuda
+        warnings.warn("Using precompiled deform_conv_cuda from {}".format(deform_conv_cuda.__file__))
+        dcn_cuda_ready = True
+    except:
+        try:
+            warnings.warn("Compiling deform_conv_cuda ...")
+            warnings.warn("(This may take a while if this module is loaded for the first time.)")
+            cuda_sources = [os.path.join(dcn_dir, 'src', src_file) 
+                           for src_file in ["deform_conv_cuda.cpp",
+                                            "deform_conv_cuda_kernel.cu"]
+                           ]
+            deform_conv_cuda = cpp_extension.load(
+                                name="deform_conv_cuda", 
+                                sources=[os.path.join(dcn_dir, 'src', "deform_conv_cuda.cpp"),
+                                         os.path.join(dcn_dir, 'src', "deform_conv_cuda_kernel.cu")])
+            warnings.warn("Done.")
+            dcn_cuda_ready = True
+        except Exception as error:
+            warnings.warn(' '.join([
+                "Failed to import or compile 'deform_conv_cuda' with the following error",
+                "{}".format(error),
+                "Deformable convulution and DBNet will not be able to run on GPU."
+                ]))
+            dcn_cuda_ready = False
+
+class DeformConvFunction(Function):
+    
+    @staticmethod
+    def forward(ctx,
+                input,
+                offset,
+                weight,
+                stride=1,
+                padding=0,
+                dilation=1,
+                groups=1,
+                deformable_groups=1,
+                im2col_step=64):
+        if input is not None and input.dim() != 4:
+            raise ValueError(
+                "Expected 4D tensor as input, got {}D tensor instead.".format(
+                    input.dim()))
+        ctx.stride = _pair(stride)
+        ctx.padding = _pair(padding)
+        ctx.dilation = _pair(dilation)
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.im2col_step = im2col_step
+
+        ctx.save_for_backward(input, offset, weight)
+
+        output = input.new_empty(
+            DeformConvFunction._output_size(input, weight, ctx.padding,
+                                            ctx.dilation, ctx.stride))
+
+        ctx.bufs_ = [input.new_empty(0), input.new_empty(0)]  # columns, ones
+
+        cur_im2col_step = min(ctx.im2col_step, input.shape[0])
+        assert (input.shape[0] %
+                cur_im2col_step) == 0, 'im2col step must divide batchsize'
+        if not input.is_cuda and dcn_cpu_ready:
+            deform_conv_cpu.deform_conv_forward_cpu(
+                input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1],
+                weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0],
+                ctx.padding[1], ctx.padding[0], ctx.dilation[1],
+                ctx.dilation[0], ctx.groups, ctx.deformable_groups,
+                cur_im2col_step)
+        elif input.is_cuda and dcn_cuda_ready:
+            deform_conv_cuda.deform_conv_forward_cuda(
+                input, weight, offset, output, ctx.bufs_[0], ctx.bufs_[1],
+                weight.size(3), weight.size(2), ctx.stride[1], ctx.stride[0],
+                ctx.padding[1], ctx.padding[0], ctx.dilation[1],
+                ctx.dilation[0], ctx.groups, ctx.deformable_groups,
+                cur_im2col_step)
+        else:
+            device_ = input.device.type
+            raise RuntimeError(
+                "Input type is {}, but 'deform_conv_{}.*.so' is not imported successfully.".format(device_, device_),
+                )
+             
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input, offset, weight = ctx.saved_tensors
+
+        grad_input = grad_offset = grad_weight = None
+
+        if not grad_output.is_cuda:
+            raise NotImplementedError("DCN operator for cpu for backward propagation is not implemented.")
+        else:
+            cur_im2col_step = min(ctx.im2col_step, input.shape[0])
+            assert (input.shape[0] %
+                    cur_im2col_step) == 0, 'im2col step must divide batchsize'
+
+            if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]:
+                grad_input = torch.zeros_like(input)
+                grad_offset = torch.zeros_like(offset)
+                deform_conv_cuda.deform_conv_backward_input_cuda(
+                    input, offset, grad_output, grad_input,
+                    grad_offset, weight, ctx.bufs_[0], weight.size(3),
+                    weight.size(2), ctx.stride[1], ctx.stride[0],
+                    ctx.padding[1], ctx.padding[0], ctx.dilation[1],
+                    ctx.dilation[0], ctx.groups, ctx.deformable_groups,
+                    cur_im2col_step)
+
+            if ctx.needs_input_grad[2]:
+                grad_weight = torch.zeros_like(weight)
+                deform_conv_cuda.deform_conv_backward_parameters_cuda(
+                    input, offset, grad_output,
+                    grad_weight, ctx.bufs_[0], ctx.bufs_[1], weight.size(3),
+                    weight.size(2), ctx.stride[1], ctx.stride[0],
+                    ctx.padding[1], ctx.padding[0], ctx.dilation[1],
+                    ctx.dilation[0], ctx.groups, ctx.deformable_groups, 1,
+                    cur_im2col_step)
+
+        return (grad_input, grad_offset, grad_weight, None, None, None, None,
+                None)
+
+    @staticmethod
+    def _output_size(input, weight, padding, dilation, stride):
+        channels = weight.size(0)
+        output_size = (input.size(0), channels)
+        for d in range(input.dim() - 2):
+            in_size = input.size(d + 2)
+            pad = padding[d]
+            kernel = dilation[d] * (weight.size(d + 2) - 1) + 1
+            stride_ = stride[d]
+            output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, )
+        if not all(map(lambda s: s > 0, output_size)):
+            raise ValueError(
+                "convolution input is too small (output would be {})".format(
+                    'x'.join(map(str, output_size))))
+        return output_size
+
+
+class ModulatedDeformConvFunction(Function):
+    """Autograd function for modulated deformable convolution.
+
+    Forward is dispatched to the ``deform_conv_cpu`` or ``deform_conv_cuda``
+    extension depending on the input's device; backward is CUDA only.
+    """
+
+    @staticmethod
+    def forward(ctx,
+                input,
+                offset,
+                mask,
+                weight,
+                bias=None,
+                stride=1,
+                padding=0,
+                dilation=1,
+                groups=1,
+                deformable_groups=1):
+        """Apply the modulated deformable convolution.
+
+        ``stride``, ``padding`` and ``dilation`` are stored as given and the
+        same value is passed for both spatial dimensions.
+
+        Raises:
+            RuntimeError: if the extension matching the input's device was
+                not loaded successfully.
+        """
+        ctx.stride = stride
+        ctx.padding = padding
+        ctx.dilation = dilation
+        ctx.groups = groups
+        ctx.deformable_groups = deformable_groups
+        ctx.with_bias = bias is not None
+        if not ctx.with_bias:
+            bias = input.new_empty(1)  # fake tensor
+        
+        # Tensors are only saved when at least one of them requires grad.
+        if weight.requires_grad or mask.requires_grad or offset.requires_grad \
+                or input.requires_grad:
+            ctx.save_for_backward(input, offset, mask, weight, bias)
+        output = input.new_empty(
+            ModulatedDeformConvFunction._infer_shape(ctx, input, weight))
+        # Two empty scratch tensors handed to the extension and reused in backward.
+        ctx._bufs = [input.new_empty(0), input.new_empty(0)]
+        if not input.is_cuda and dcn_cpu_ready:
+            deform_conv_cpu.modulated_deform_conv_cpu_forward(
+                input, weight, bias, ctx._bufs[0], offset, mask, output,
+                ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride,
+                ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
+                ctx.groups, ctx.deformable_groups, ctx.with_bias)
+        elif input.is_cuda and dcn_cuda_ready:
+            deform_conv_cuda.modulated_deform_conv_cuda_forward(
+                input, weight, bias, ctx._bufs[0], offset, mask, output,
+                ctx._bufs[1], weight.shape[2], weight.shape[3], ctx.stride,
+                ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
+                ctx.groups, ctx.deformable_groups, ctx.with_bias)
+        else:
+            device_ = input.device.type
+            raise RuntimeError(
+                "Input type is {}, but 'deform_conv_{}.*.so' is not imported successfully.".format(device_, device_),
+                )
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Compute gradients for input, offset, mask, weight and bias (CUDA only).
+
+        Returns ten slots, one per ``forward`` argument; the five non-tensor
+        arguments get ``None``, as does the bias when none was supplied.
+
+        Raises:
+            NotImplementedError: if ``grad_output`` is not a CUDA tensor.
+        """
+        if not grad_output.is_cuda:
+            raise NotImplementedError("DCN operator for CPU for backward propagation is not implemented.")
+        input, offset, mask, weight, bias = ctx.saved_tensors
+        grad_input = torch.zeros_like(input)
+        grad_offset = torch.zeros_like(offset)
+        grad_mask = torch.zeros_like(mask)
+        grad_weight = torch.zeros_like(weight)
+        grad_bias = torch.zeros_like(bias)
+        deform_conv_cuda.modulated_deform_conv_cuda_backward(
+            input, weight, bias, ctx._bufs[0], offset, mask, ctx._bufs[1],
+            grad_input, grad_weight, grad_bias, grad_offset, grad_mask,
+            grad_output, weight.shape[2], weight.shape[3], ctx.stride,
+            ctx.stride, ctx.padding, ctx.padding, ctx.dilation, ctx.dilation,
+            ctx.groups, ctx.deformable_groups, ctx.with_bias)
+        # The bias was a placeholder tensor, so it has no gradient to report.
+        if not ctx.with_bias:
+            grad_bias = None
+
+        return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias,
+                None, None, None, None, None)
+
+    @staticmethod
+    def _infer_shape(ctx, input, weight):
+        """Return the output shape ``(n, channels_out, height_out, width_out)``.
+
+        Uses the scalar ``ctx.padding``, ``ctx.dilation`` and ``ctx.stride``
+        for both spatial dimensions.
+        """
+        n = input.size(0)
+        channels_out = weight.size(0)
+        height, width = input.shape[2:4]
+        kernel_h, kernel_w = weight.shape[2:4]
+        height_out = (height + 2 * ctx.padding -
+                      (ctx.dilation * (kernel_h - 1) + 1)) // ctx.stride + 1
+        width_out = (width + 2 * ctx.padding -
+                     (ctx.dilation * (kernel_w - 1) + 1)) // ctx.stride + 1
+        return n, channels_out, height_out, width_out
+
+
+# Functional entry points: the autograd Functions above are invoked through
+# their ``.apply`` method.
+deform_conv = DeformConvFunction.apply
+modulated_deform_conv = ModulatedDeformConvFunction.apply
--- a/easyocr/DBNet/assets/ops/dcn/functions/deform_pool.py
+++ b/easyocr/DBNet/assets/ops/dcn/functions/deform_pool.py
@@ -0,0 +1,142 @@
+'''
+Modified by Jaided AI
+Released Date: 31/08/2022
+Description:
+- Add support for Deformable convolution operator on CPU for forward propagation.
+- Change to Just-in-Time loading approach
+'''
+import os
+import warnings
+import torch
+from torch.autograd import Function
+from torch.utils import cpp_extension
+
+# TODO - Jaided AI: 
+# 1. Find a better way to handle and support both Ahead-of-Time (AoT) and Just-in-Time (JiT) compilation.
+# 2. Find a better way to report error to help pinpointing issues if there is any.
+# Note on JiT and AoT compilation:
+# This module supports both AoT and JiT compilation approaches. JiT is hardcoded as the default. If AoT compiled objects are present, they will supersede JiT compilation.
+ 
+def custom_formatwarning(msg, *args, **kwargs):
+    # ignore everything except the message
+    return str(msg) + '\n'
+
+warnings.formatwarning = custom_formatwarning
+dcn_dir = os.path.dirname(os.path.dirname(__file__))
+try:
+    from .. import deform_pool_cpu
+    warnings.warn("Using precompiled deform_pool_cpu from {}".format(deform_pool_cpu.__file__))
+    dcn_cpu_ready = True
+except:
+    try:
+        warnings.warn("Compiling deform_pool_cpu ...")
+        warnings.warn("(This may take a while if this module is loaded for the first time.)")
+        deform_pool_cpu = cpp_extension.load(
+                            name="deform_pool_cpu", 
+                            sources=[os.path.join(dcn_dir, 'src', "deform_pool_cpu.cpp"),
+                                     os.path.join(dcn_dir, 'src', "deform_pool_cpu_kernel.cpp")])
+        warnings.warn("Done.")
+        dcn_cpu_ready = True
+    except Exception as error:
+        warnings.warn(' '.join([
+            "Failed to import or compile 'deform_pool_cpu' with the following error",
+            "{}".format(error),
+            "Deformable convulution and DBNet will not be able to run on CPU."
+            ]))
+        dcn_cpu_ready = False
+
+if torch.cuda.is_available():
+    try:
+        from .. import deform_pool_cuda
+        warnings.warn("Using precompiled deform_pool_cuda from {}".format(deform_pool_cuda.__file__))
+        dcn_cuda_ready = True
+    except:
+        try:
+            warnings.warn("Compiling deform_pool_cuda ...")
+            warnings.warn("(This may take a while if this module is loaded for the first time.)")
+            deform_pool_cuda = cpp_extension.load(
+                                name="deform_pool_cuda", 
+                                sources=[os.path.join(dcn_dir, 'src', "deform_pool_cuda.cpp"),
+                                         os.path.join(dcn_dir, 'src', "deform_pool_cuda_kernel.cu")])
+            warnings.warn("Done.")
+            dcn_cuda_ready = True
+        except Exception as error:
+            warnings.warn(' '.join([
+                "Failed to import or compile 'deform_pool_cuda' with the following error",
+                "{}".format(error),
+                "Deformable convulution and DBNet will not be able to run on GPU."
+                ]))
+            dcn_cuda_ready = False
+
+class DeformRoIPoolingFunction(Function):
+
+    @staticmethod
+    def forward(ctx,
+                data,
+                rois,
+                offset,
+                spatial_scale,
+                out_size,
+                out_channels,
+                no_trans,
+                group_size=1,
+                part_size=None,
+                sample_per_part=4,
+                trans_std=.0):
+        ctx.spatial_scale = spatial_scale
+        ctx.out_size = out_size
+        ctx.out_channels = out_channels
+        ctx.no_trans = no_trans
+        ctx.group_size = group_size
+        ctx.part_size = out_size if part_size is None else part_size
+        ctx.sample_per_part = sample_per_part
+        ctx.trans_std = trans_std
+
+        assert 0.0 <= ctx.trans_std <= 1.0
+        
+        n = rois.shape[0]
+        output = data.new_empty(n, out_channels, out_size, out_size)
+        output_count = data.new_empty(n, out_channels, out_size, out_size)
+        if not data.is_cuda and dcn_cpu_ready:
+            deform_pool_cpu.deform_psroi_pooling_cpu_forward(
+                data, rois, offset, output, output_count, ctx.no_trans,
+                ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size,
+                ctx.part_size, ctx.sample_per_part, ctx.trans_std)
+        elif data.is_cuda and dcn_cuda_ready:    
+            deform_pool_cuda.deform_psroi_pooling_cuda_forward(
+                data, rois, offset, output, output_count, ctx.no_trans,
+                ctx.spatial_scale, ctx.out_channels, ctx.group_size, ctx.out_size,
+                ctx.part_size, ctx.sample_per_part, ctx.trans_std)
+        else:
+            device_ = input.device.type
+            raise RuntimeError(
+                "Input type is {}, but 'deform_conv_{}.*.so' is not imported successfully.".format(device_, device_),
+                )
+        
+        if data.requires_grad or rois.requires_grad or offset.requires_grad:
+            ctx.save_for_backward(data, rois, offset)
+        ctx.output_count = output_count
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        if not grad_output.is_cuda:
+            raise NotImplementedError("DCN operator for cpu for backward propagation is not implemented.")
+
+        data, rois, offset = ctx.saved_tensors
+        output_count = ctx.output_count
+        grad_input = torch.zeros_like(data)
+        grad_rois = None
+        grad_offset = torch.zeros_like(offset)
+
+        deform_pool_cuda.deform_psroi_pooling_cuda_backward(
+            grad_output, data, rois, offset, output_count, grad_input,
+            grad_offset, ctx.no_trans, ctx.spatial_scale, ctx.out_channels,
+            ctx.group_size, ctx.out_size, ctx.part_size, ctx.sample_per_part,
+            ctx.trans_std)
+        return (grad_input, grad_rois, grad_offset, None, None, None, None,
+                None, None, None, None)
+
+
+# Functional entry point: the autograd Function above is invoked through its
+# ``.apply`` method.
+deform_roi_pooling = DeformRoIPoolingFunction.apply
--- a/easyocr/DBNet/assets/ops/dcn/modules/init.py
+++ b/easyocr/DBNet/assets/ops/dcn/modules/init.py
--- a/easyocr/DBNet/assets/ops/dcn/modules/deform_conv.py
+++ b/easyocr/DBNet/assets/ops/dcn/modules/deform_conv.py
@@ -0,0 +1,157 @@
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn.modules.utils import _pair
+
+from ..functions.deform_conv import deform_conv, modulated_deform_conv
+
+
+class DeformConv(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 deformable_groups=1,
+                 bias=False):
+        super(DeformConv, self).__init__()
+
+        assert not bias
+        assert in_channels % groups == 0, \
+            'in_channels {} cannot be divisible by groups {}'.format(
+                in_channels, groups)
+        assert out_channels % groups == 0, \
+            'out_channels {} cannot be divisible by groups {}'.format(
+                out_channels, groups)
+
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = _pair(stride)
+        self.padding = _pair(padding)
+        self.dilation = _pair(dilation)
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // self.groups,
+                         *self.kernel_size))
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        n = self.in_channels
+        for k in self.kernel_size:
+            n *= k
+        stdv = 1. / math.sqrt(n)
+        self.weight.data.uniform_(-stdv, stdv)
+
+    def forward(self, x, offset):
+        return deform_conv(x, offset, self.weight, self.stride, self.padding,
+                           self.dilation, self.groups, self.deformable_groups)
+
+
+class DeformConvPack(DeformConv):
+
+    def __init__(self, *args, **kwargs):
+        super(DeformConvPack, self).__init__(*args, **kwargs)
+
+        self.conv_offset = nn.Conv2d(
+            self.in_channels,
+            self.deformable_groups * 2 * self.kernel_size[0] *
+            self.kernel_size[1],
+            kernel_size=self.kernel_size,
+            stride=_pair(self.stride),
+            padding=_pair(self.padding),
+            bias=True)
+        self.init_offset()
+
+    def init_offset(self):
+        self.conv_offset.weight.data.zero_()
+        self.conv_offset.bias.data.zero_()
+
+    def forward(self, x):
+        offset = self.conv_offset(x)
+        return deform_conv(x, offset, self.weight, self.stride, self.padding,
+                           self.dilation, self.groups, self.deformable_groups)
+
+
+class ModulatedDeformConv(nn.Module):
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 groups=1,
+                 deformable_groups=1,
+                 bias=True):
+        super(ModulatedDeformConv, self).__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.kernel_size = _pair(kernel_size)
+        self.stride = stride
+        self.padding = padding
+        self.dilation = dilation
+        self.groups = groups
+        self.deformable_groups = deformable_groups
+        self.with_bias = bias
+
+        self.weight = nn.Parameter(
+            torch.Tensor(out_channels, in_channels // groups,
+                         *self.kernel_size))
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(out_channels))
+        else:
+            self.register_parameter('bias', None)
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        n = self.in_channels
+        for k in self.kernel_size:
+            n *= k
+        stdv = 1. / math.sqrt(n)
+        self.weight.data.uniform_(-stdv, stdv)
+        if self.bias is not None:
+            self.bias.data.zero_()
+
+    def forward(self, x, offset, mask):
+        return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
+                                     self.stride, self.padding, self.dilation,
+                                     self.groups, self.deformable_groups)
+
+
+class ModulatedDeformConvPack(ModulatedDeformConv):
+
+    def __init__(self, *args, **kwargs):
+        super(ModulatedDeformConvPack, self).__init__(*args, **kwargs)
+
+        self.conv_offset_mask = nn.Conv2d(
+            self.in_channels,
+            self.deformable_groups * 3 * self.kernel_size[0] *
+            self.kernel_size[1],
+            kernel_size=self.kernel_size,
+            stride=_pair(self.stride),
+            padding=_pair(self.padding),
+            bias=True)
+        self.init_offset()
+
+    def init_offset(self):
+        self.conv_offset_mask.weight.data.zero_()
+        self.conv_offset_mask.bias.data.zero_()
+
+    def forward(self, x):
+        out = self.conv_offset_mask(x)
+        o1, o2, mask = torch.chunk(out, 3, dim=1)
+        offset = torch.cat((o1, o2), dim=1)
+        mask = torch.sigmoid(mask)
+        return modulated_deform_conv(x, offset, mask, self.weight, self.bias,
+                                     self.stride, self.padding, self.dilation,
+                                     self.groups, self.deformable_groups)
--- a/easyocr/DBNet/assets/ops/dcn/modules/deform_pool.py
+++ b/easyocr/DBNet/assets/ops/dcn/modules/deform_pool.py
@@ -0,0 +1,172 @@
+from torch import nn
+
+from ..functions.deform_pool import deform_roi_pooling
+
+
+class DeformRoIPooling(nn.Module):
+
+    def __init__(self,
+                 spatial_scale,
+                 out_size,
+                 out_channels,
+                 no_trans,
+                 group_size=1,
+                 part_size=None,
+                 sample_per_part=4,
+                 trans_std=.0):
+        super(DeformRoIPooling, self).__init__()
+        self.spatial_scale = spatial_scale
+        self.out_size = out_size
+        self.out_channels = out_channels
+        self.no_trans = no_trans
+        self.group_size = group_size
+        self.part_size = out_size if part_size is None else part_size
+        self.sample_per_part = sample_per_part
+        self.trans_std = trans_std
+
+    def forward(self, data, rois, offset):
+        if self.no_trans:
+            offset = data.new_empty(0)
+        return deform_roi_pooling(
+            data, rois, offset, self.spatial_scale, self.out_size,
+            self.out_channels, self.no_trans, self.group_size, self.part_size,
+            self.sample_per_part, self.trans_std)
+
+
+class DeformRoIPoolingPack(DeformRoIPooling):
+
+    def __init__(self,
+                 spatial_scale,
+                 out_size,
+                 out_channels,
+                 no_trans,
+                 group_size=1,
+                 part_size=None,
+                 sample_per_part=4,
+                 trans_std=.0,
+                 num_offset_fcs=3,
+                 deform_fc_channels=1024):
+        super(DeformRoIPoolingPack,
+              self).__init__(spatial_scale, out_size, out_channels, no_trans,
+                             group_size, part_size, sample_per_part, trans_std)
+
+        self.num_offset_fcs = num_offset_fcs
+        self.deform_fc_channels = deform_fc_channels
+
+        if not no_trans:
+            seq = []
+            ic = self.out_size * self.out_size * self.out_channels
+            for i in range(self.num_offset_fcs):
+                if i < self.num_offset_fcs - 1:
+                    oc = self.deform_fc_channels
+                else:
+                    oc = self.out_size * self.out_size * 2
+                seq.append(nn.Linear(ic, oc))
+                ic = oc
+                if i < self.num_offset_fcs - 1:
+                    seq.append(nn.ReLU(inplace=True))
+            self.offset_fc = nn.Sequential(*seq)
+            self.offset_fc[-1].weight.data.zero_()
+            self.offset_fc[-1].bias.data.zero_()
+
+    def forward(self, data, rois):
+        assert data.size(1) == self.out_channels
+        if self.no_trans:
+            offset = data.new_empty(0)
+            return deform_roi_pooling(
+                data, rois, offset, self.spatial_scale, self.out_size,
+                self.out_channels, self.no_trans, self.group_size,
+                self.part_size, self.sample_per_part, self.trans_std)
+        else:
+            n = rois.shape[0]
+            offset = data.new_empty(0)
+            x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
+                                   self.out_size, self.out_channels, True,
+                                   self.group_size, self.part_size,
+                                   self.sample_per_part, self.trans_std)
+            offset = self.offset_fc(x.view(n, -1))
+            offset = offset.view(n, 2, self.out_size, self.out_size)
+            return deform_roi_pooling(
+                data, rois, offset, self.spatial_scale, self.out_size,
+                self.out_channels, self.no_trans, self.group_size,
+                self.part_size, self.sample_per_part, self.trans_std)
+
+
+class ModulatedDeformRoIPoolingPack(DeformRoIPooling):
+
+    def __init__(self,
+                 spatial_scale,
+                 out_size,
+                 out_channels,
+                 no_trans,
+                 group_size=1,
+                 part_size=None,
+                 sample_per_part=4,
+                 trans_std=.0,
+                 num_offset_fcs=3,
+                 num_mask_fcs=2,
+                 deform_fc_channels=1024):
+        super(ModulatedDeformRoIPoolingPack, self).__init__(
+            spatial_scale, out_size, out_channels, no_trans, group_size,
+            part_size, sample_per_part, trans_std)
+
+        self.num_offset_fcs = num_offset_fcs
+        self.num_mask_fcs = num_mask_fcs
+        self.deform_fc_channels = deform_fc_channels
+
+        if not no_trans:
+            offset_fc_seq = []
+            ic = self.out_size * self.out_size * self.out_channels
+            for i in range(self.num_offset_fcs):
+                if i < self.num_offset_fcs - 1:
+                    oc = self.deform_fc_channels
+                else:
+                    oc = self.out_size * self.out_size * 2
+                offset_fc_seq.append(nn.Linear(ic, oc))
+                ic = oc
+                if i < self.num_offset_fcs - 1:
+                    offset_fc_seq.append(nn.ReLU(inplace=True))
+            self.offset_fc = nn.Sequential(*offset_fc_seq)
+            self.offset_fc[-1].weight.data.zero_()
+            self.offset_fc[-1].bias.data.zero_()
+
+            mask_fc_seq = []
+            ic = self.out_size * self.out_size * self.out_channels
+            for i in range(self.num_mask_fcs):
+                if i < self.num_mask_fcs - 1:
+                    oc = self.deform_fc_channels
+                else:
+                    oc = self.out_size * self.out_size
+                mask_fc_seq.append(nn.Linear(ic, oc))
+                ic = oc
+                if i < self.num_mask_fcs - 1:
+                    mask_fc_seq.append(nn.ReLU(inplace=True))
+                else:
+                    mask_fc_seq.append(nn.Sigmoid())
+            self.mask_fc = nn.Sequential(*mask_fc_seq)
+            self.mask_fc[-2].weight.data.zero_()
+            self.mask_fc[-2].bias.data.zero_()
+
+    def forward(self, data, rois):
+        assert data.size(1) == self.out_channels
+        if self.no_trans:
+            offset = data.new_empty(0)
+            return deform_roi_pooling(
+                data, rois, offset, self.spatial_scale, self.out_size,
+                self.out_channels, self.no_trans, self.group_size,
+                self.part_size, self.sample_per_part, self.trans_std)
+        else:
+            n = rois.shape[0]
+            offset = data.new_empty(0)
+            x = deform_roi_pooling(data, rois, offset, self.spatial_scale,
+                                   self.out_size, self.out_channels, True,
+                                   self.group_size, self.part_size,
+                                   self.sample_per_part, self.trans_std)
+            offset = self.offset_fc(x.view(n, -1))
+            offset = offset.view(n, 2, self.out_size, self.out_size)
+            mask = self.mask_fc(x.view(n, -1))
+            mask = mask.view(n, 1, self.out_size, self.out_size)
+            return deform_roi_pooling(
+                data, rois, offset, self.spatial_scale, self.out_size,
+                self.out_channels, self.no_trans, self.group_size,
+                self.part_size, self.sample_per_part, self.trans_std) * mask
--- a/easyocr/DBNet/assets/ops/dcn/setup.py
+++ b/easyocr/DBNet/assets/ops/dcn/setup.py
@@ -0,0 +1,33 @@
+import torch
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension
+from torch.utils.cpp_extension import CppExtension
+from torch.utils.cpp_extension import CUDAExtension
+
+modules = [
+        CppExtension('deform_conv_cpu', [
+            'src/deform_conv_cpu.cpp',
+            'src/deform_conv_cpu_kernel.cpp',
+        ]),
+        CppExtension('deform_pool_cpu', [
+            'src/deform_pool_cpu.cpp', 
+            'src/deform_pool_cpu_kernel.cpp'
+        ])
+]
+
+if torch.cuda.is_available():
+    modules.extend([
+        CUDAExtension('deform_conv_cuda', [
+            'src/deform_conv_cuda.cpp',
+            'src/deform_conv_cuda_kernel.cu',
+        ]),
+        CUDAExtension('deform_pool_cuda', [
+            'src/deform_pool_cuda.cpp', 
+            'src/deform_pool_cuda_kernel.cu'
+        ])
+    ])
+
+setup(
+    name='deform_conv',
+    ext_modules=modules,
+    cmdclass={'build_ext': BuildExtension})
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cpu.cpp
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cpu.cpp
@@ -0,0 +1,759 @@
+/*
+Created by Jaided AI
+Released Date: 31/08/2022
+Description:
+Deformable convolution operator for CPU. 
+This code is adapted from:
+https://github.com/MhLiao/DB/blob/master/assets/ops/dcn/src/deform_conv_cuda.cpp
+https://github.com/CharlesShang/DCNv2
+https://github.com/lbin/DCNv2
+*/
+
+#include <torch/extension.h>
+#include "deform_conv_cpu_kernel.h"
+#include <cmath>
+#include <vector>
+
+// Validates the tensor shapes and convolution parameters shared by the
+// deformable-convolution entry points, raising through TORCH_CHECK on the
+// first violation.
+//
+//   input, offset  3D (unbatched) or 4D (batched) tensors; offset must have the
+//                  computed output height/width and
+//                  deformable_group * 2 * kH * kW channels.
+//   gradOutput     optional (may be NULL); when given, its plane count and
+//                  spatial size must match the computed output.
+//   weight         contiguous 4D tensor whose last two sizes equal kH and kW.
+//
+// TORCH_CHECK concatenates its message arguments; it does not interpret
+// printf-style placeholders, so values are streamed in explicitly.
+void shape_check(
+    at::Tensor input, at::Tensor offset,
+    at::Tensor *gradOutput, at::Tensor weight,
+    int kH, int kW, int dH, int dW,
+    int padH, int padW, int dilationH, int dilationW,
+    int group, int deformable_group) {
+
+  TORCH_CHECK(weight.ndimension() == 4,
+              "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+              "but got: ", weight.ndimension());
+
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  TORCH_CHECK(kW > 0 && kH > 0,
+              "kernel size should be greater than zero, but got kH: ", kH,
+              " kW: ", kW);
+
+  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
+              "kernel size should be consistent with weight, but got kH: ", kH,
+              " kW: ", kW, " weight.size(2): ", weight.size(2),
+              ", weight.size(3): ", weight.size(3));
+
+  TORCH_CHECK(dW > 0 && dH > 0,
+              "stride should be greater than zero, but got dH: ", dH,
+              " dW: ", dW);
+
+  TORCH_CHECK(dilationW > 0 && dilationH > 0,
+              "dilation should be greater than 0, but got dilationH: ",
+              dilationH, " dilationW: ", dilationW);
+
+  // Guard the modulo below against a division by zero.
+  TORCH_CHECK(deformable_group > 0,
+              "deformable group should be greater than zero, but got: ",
+              deformable_group);
+
+  int ndim = input.ndimension();
+  TORCH_CHECK(ndim == 3 || ndim == 4,
+              "3D or 4D input tensor expected but got: ", ndim);
+
+  // Feature/height/width axes: shifted by one when a batch axis is present.
+  const int dimf = (ndim == 4) ? 1 : 0;
+  const int dimh = dimf + 1;
+  const int dimw = dimf + 2;
+
+  long nInputPlane = weight.size(1) * group;
+  long inputHeight = input.size(dimh);
+  long inputWidth = input.size(dimw);
+  long nOutputPlane = weight.size(0);
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  TORCH_CHECK(nInputPlane % deformable_group == 0,
+              "input channels must be divisible by deformable group size");
+
+  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
+              "Given input size: (", nInputPlane, " x ", inputHeight, " x ",
+              inputWidth, "). Calculated output size: (", nOutputPlane, " x ",
+              outputHeight, " x ", outputWidth,
+              "). Output size is too small");
+
+  // Use the feature axis rather than a fixed index so unbatched (3D) input
+  // is compared against its channel count, not its height.
+  TORCH_CHECK(input.size(dimf) == nInputPlane,
+              "invalid number of input planes, expected: ", nInputPlane,
+              ", but got: ", input.size(dimf));
+
+  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
+              "input image is smaller than kernel");
+
+  TORCH_CHECK((offset.size(dimh) == outputHeight &&
+               offset.size(dimw) == outputWidth),
+              "invalid spatial size of offset, expected height: ",
+              outputHeight, " width: ", outputWidth, ", but got height: ",
+              offset.size(dimh), " width: ", offset.size(dimw));
+
+  TORCH_CHECK((offset.size(dimf) == deformable_group * 2 * kH * kW),
+              "invalid number of channels of offset");
+
+  if (gradOutput != NULL) {
+    TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
+                "invalid number of gradOutput planes, expected: ",
+                nOutputPlane, ", but got: ", gradOutput->size(dimf));
+
+    TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
+                 gradOutput->size(dimw) == outputWidth),
+                "invalid size of gradOutput, expected height: ", outputHeight,
+                " width: ", outputWidth, " , but got height: ",
+                gradOutput->size(dimh), " width: ", gradOutput->size(dimw));
+  }
+}
+
+// Forward pass of the (non-modulated) deformable convolution on CPU.
+//
+// The batch is processed in chunks of im2col_step images: each chunk is
+// unfolded into `columns` by deformable_im2col and multiplied group-wise with
+// the weight; the result is copied into `output`, which must already hold
+// batchSize * nOutputPlane * outputHeight * outputWidth elements.
+//
+// `columns` is re-allocated locally and `ones` is not used; both parameters
+// are kept so the exported signature stays unchanged. The kernels are called
+// through float pointers, so the tensors must be float32. Always returns 1.
+int deform_conv_forward_cpu(
+    at::Tensor input, at::Tensor weight,
+    at::Tensor offset, at::Tensor output,
+    at::Tensor columns, at::Tensor ones,
+    int kW, int kH, int dW, int dH,
+    int padW, int padH, int dilationW, int dilationH,
+    int group, int deformable_group, int im2col_step) {
+  (void)ones;  // unused; retained for interface compatibility
+
+  shape_check(
+      input, offset,
+      NULL, weight,
+      kH, kW, dH, dW,
+      padH, padW, dilationH, dilationW,
+      group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  weight = weight.contiguous();
+
+  if (input.ndimension() == 3) {
+    // Force a batch axis. The out-of-place unsqueeze leaves the caller's
+    // tensors untouched (the in-place variant would reshape them as well).
+    input = input.unsqueeze(0);
+    offset = offset.unsqueeze(0);
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
+              "batch size (", batchSize,
+              ") must be divisible by im2col_step (", im2col_step, ")");
+
+  const long numSteps = batchSize / im2col_step;
+
+  output = output.view({numSteps, im2col_step, nOutputPlane,
+                        outputHeight, outputWidth});
+  // Scratch buffer for one chunk; reused by every loop iteration.
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  input = input.view({numSteps, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  offset =
+      offset.view({numSteps, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  at::Tensor output_buffer =
+      at::zeros({numSteps, nOutputPlane,
+                 im2col_step * outputHeight, outputWidth},
+                output.options());
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0), group, output_buffer.size(1) / group,
+       output_buffer.size(2), output_buffer.size(3)});
+  using scalar_t = float;
+  for (long elt = 0; elt < numSteps; elt++) {
+    auto input_n = input.select(0, elt);
+    auto offset_n = offset.select(0, elt);
+
+    // Write into the start of the scratch buffer. Selecting row `elt` of
+    // `columns` would shift the pointer and overrun the buffer for elt > 0.
+    deformable_im2col(
+        input_n.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        nInputPlane,
+        inputHeight, inputWidth, kH, kW,
+        padH, padW, dH, dW,
+        dilationH, dilationW,
+        im2col_step, deformable_group,
+        columns.data_ptr<scalar_t>());
+
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      output_buffer[elt][g] = output_buffer[elt][g]
+                                  .flatten(1)
+                                  .addmm_(weight[g].flatten(1), columns[g])
+                                  .view_as(output_buffer[elt][g]);
+    }
+
+    // Undo the group split so the next iteration starts from the same
+    // layouts; otherwise the views above fail on the second chunk.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+  }
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
+       output_buffer.size(3), output_buffer.size(4)});
+
+  // (chunk, plane, step, h, w) -> (chunk, step, plane, h, w), matching the
+  // layout `output` was viewed as above.
+  output_buffer = output_buffer.view({numSteps, nOutputPlane,
+                                      im2col_step, outputHeight, outputWidth});
+  output_buffer.transpose_(1, 2);
+  output.copy_(output_buffer);
+
+  return 1;
+}
+
+// Backward pass of the deformable convolution w.r.t. the input and the
+// offsets, on CPU.
+//
+// For every chunk of im2col_step images, the column gradient is computed
+// group-wise as weight^T * gradOutput and then handed to
+// deformable_col2im_coord (writing through gradOffset's data pointer) and
+// deformable_col2im (writing through gradInput's data pointer).
+// gradInput and gradOffset must therefore be pre-allocated by the caller and
+// viewable to the batched shapes used below.
+//
+// `columns` is re-allocated locally; the parameter is kept for interface
+// compatibility. The kernels are called through float pointers, so the
+// tensors must be float32. Always returns 1.
+int deform_conv_backward_input_cpu(
+    at::Tensor input, at::Tensor offset,
+    at::Tensor gradOutput, at::Tensor gradInput,
+    at::Tensor gradOffset, at::Tensor weight,
+    at::Tensor columns,
+    int kW, int kH, int dW, int dH,
+    int padW, int padH, int dilationW, int dilationH,
+    int group,  int deformable_group, int im2col_step) {
+
+  shape_check(
+    input, offset,
+    &gradOutput, weight,
+    kH, kW, dH, dW,
+    padH, padW, dilationH, dilationW,
+    group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+  weight = weight.contiguous();
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
+              "batch size (", batchSize,
+              ") must be divisible by im2col_step (", im2col_step, ")");
+
+  const long numSteps = batchSize / im2col_step;
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  // Scratch buffer for one chunk; reused by every loop iteration.
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // change order of grad output: (chunk, step, plane, h, w) ->
+  // (chunk, plane, step, h, w). This only affects the local view.
+  gradOutput = gradOutput.view({numSteps, im2col_step,
+                                nOutputPlane, outputHeight, outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  gradInput = gradInput.view({numSteps, im2col_step, nInputPlane,
+                              inputHeight, inputWidth});
+  input = input.view({numSteps, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  gradOffset = gradOffset.view({numSteps, im2col_step,
+                                deformable_group * 2 * kH * kW, outputHeight,
+                                outputWidth});
+  offset =
+      offset.view({numSteps, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+  using scalar_t = float;
+
+  for (long elt = 0; elt < numSteps; elt++) {
+    auto input_n = input.select(0, elt);
+    auto offset_n = offset.select(0, elt);
+    auto gradInput_n = gradInput.select(0, elt);
+    auto gradOffset_n = gradOffset.select(0, elt);
+
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0), group, gradOutput.size(1) / group,
+         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 0: overwrite the previous chunk's column gradient.
+      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
+    }
+
+    // Undo the group split (weight included) so the next iteration starts
+    // from the same layouts.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
+         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
+
+    // Read from the start of the scratch buffer. Selecting row `elt` of
+    // `columns` would shift the pointer and overrun the buffer for elt > 0.
+    deformable_col2im_coord(
+        columns.data_ptr<scalar_t>(),
+        input_n.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        nInputPlane,
+        inputHeight, inputWidth, kH, kW,
+        padH, padW, dH, dW,
+        dilationH, dilationW,
+        im2col_step, deformable_group,
+        gradOffset_n.data_ptr<scalar_t>());
+
+    deformable_col2im(
+        columns.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        nInputPlane,
+        inputHeight, inputWidth, kH, kW,
+        padH, padW, dH, dW,
+        dilationH, dilationW,
+        im2col_step, deformable_group,
+        gradInput_n.data_ptr<scalar_t>());
+  }
+
+  // The tensor arguments are local handles, so there is nothing to reshape
+  // back for the caller; results were written through the data pointers.
+  return 1;
+}
+
+// Backward pass of the deformable convolution w.r.t. the weight, on CPU.
+//
+// For every chunk of im2col_step images the input is unfolded with
+// deformable_im2col and `scale * gradOutput * columns^T` is accumulated
+// group-wise into gradWeight (in place, on top of its current contents).
+//
+// `columns` is re-allocated locally and `ones` is not used; both parameters
+// are kept for interface compatibility. The kernels are called through float
+// pointers, so the tensors must be float32. Always returns 1.
+int deform_conv_backward_parameters_cpu(
+    at::Tensor input, at::Tensor offset,
+    at::Tensor gradOutput, at::Tensor gradWeight,  // at::Tensor gradBias,
+    at::Tensor columns, at::Tensor ones,
+    int kW, int kH, int dW, int dH,
+    int padW, int padH, int dilationW, int dilationH,
+    int group, int deformable_group,
+    float scale, int im2col_step) {
+  (void)ones;  // unused; retained for interface compatibility
+
+  shape_check(
+      input, offset,
+      &gradOutput, gradWeight,
+      kH, kW, dH, dW,
+      padH, padW, dilationH, dilationW,
+      group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+
+  if (input.ndimension() == 3) {
+    // Force batch. offset is batched too, so the batch-size check and the
+    // chunked view further down see a 4D tensor.
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = gradWeight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
+              "batch size (", batchSize,
+              ") must be divisible by im2col_step (", im2col_step, ")");
+
+  const long numSteps = batchSize / im2col_step;
+
+  // Scratch buffer for one chunk; reused by every loop iteration.
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // (chunk, step, plane, h, w) -> (chunk, plane, step, h, w); local view only.
+  gradOutput = gradOutput.view({numSteps, im2col_step,
+                                nOutputPlane, outputHeight, outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  // Allocate the buffer with explicit sizes so it is contiguous. A
+  // zeros_like of the transposed tensor may inherit its permuted strides,
+  // in which case the merging view below is not possible.
+  at::Tensor gradOutputBuffer =
+      at::zeros({numSteps, nOutputPlane, im2col_step,
+                 outputHeight, outputWidth},
+                gradOutput.options());
+  gradOutputBuffer.copy_(gradOutput);
+  gradOutputBuffer =
+      gradOutputBuffer.view({numSteps, nOutputPlane,
+                             im2col_step * outputHeight, outputWidth});
+
+  input = input.view({numSteps, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  offset =
+      offset.view({numSteps, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  using scalar_t = float;
+  for (long elt = 0; elt < numSteps; elt++) {
+    auto input_n = input.select(0, elt);
+    auto offset_n = offset.select(0, elt);
+
+    // Write into the start of the scratch buffer. Selecting row `elt` of
+    // `columns` would shift the pointer and overrun the buffer for elt > 0.
+    deformable_im2col(
+        input_n.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        nInputPlane,
+        inputHeight, inputWidth,
+        kH, kW, padH, padW, dH, dW,
+        dilationH, dilationW,
+        im2col_step, deformable_group,
+        columns.data_ptr<scalar_t>());
+
+    // divide into group
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
+         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    gradWeight =
+        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
+                         gradWeight.size(2), gradWeight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 1: accumulate on top of the gradient gathered so far.
+      gradWeight[g] = gradWeight[g]
+                          .flatten(1)
+                          .addmm_(gradOutputBuffer[elt][g].flatten(1),
+                                  columns[g].transpose(1, 0), 1.0, scale)
+                          .view_as(gradWeight[g]);
+    }
+
+    // Undo the group split so the next iteration starts from the same
+    // layouts.
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0),
+         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+                                  gradWeight.size(2), gradWeight.size(3),
+                                  gradWeight.size(4)});
+  }
+
+  // The tensor arguments are local handles, so there is nothing to reshape
+  // back for the caller; gradWeight was updated in place.
+  return 1;
+}
+
+// Forward pass of the modulated deformable convolution (offsets + mask) on
+// CPU.
+//
+// Each image of the batch is unfolded into `columns` by
+// modulated_deformable_im2col_cpu and multiplied group-wise with the weight.
+// The result is accumulated into `output`, which is zeroed first and must be
+// viewable as (batch, channels_out, height_out, width_out). When `with_bias`
+// is set, `bias` is added per output channel.
+//
+// `columns` is re-allocated locally and `ones` is not used; both parameters
+// are kept for interface compatibility. The kernel is called through float
+// pointers, so the tensors must be float32.
+void modulated_deform_conv_cpu_forward(
+    at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+    at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
+    int kernel_h, int kernel_w, const int stride_h, const int stride_w,
+    const int pad_h, const int pad_w, const int dilation_h, const int dilation_w,
+    const int group, const int deformable_group,
+    const bool with_bias) {
+  (void)ones;  // unused; retained for interface compatibility
+
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  // offset and mask are read through raw pointers below, so make sure their
+  // memory is densely laid out (a no-op when they already are).
+  offset = offset.contiguous();
+  mask = mask.contiguous();
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_out = weight.size(0);
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+
+  // TORCH_CHECK concatenates its message arguments (no printf formatting).
+  // Report the requested kernel size against the weight's kernel size.
+  TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
+              "Input shape and kernel shape wont match: (", kernel_h, " x ",
+              kernel_w, " vs ", kernel_h_, " x ", kernel_w_, ").");
+  TORCH_CHECK(channels == channels_kernel * group,
+              "Input shape and kernel channels wont match: (", channels,
+              " vs ", channels_kernel * group, ").");
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  // resize output and clear it: addmm_ below accumulates into it
+  output = output.view({batch, channels_out, height_out, width_out}).zero_();
+  // resize temporary columns
+  columns =
+      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
+                input.options());
+
+  output = output.view({output.size(0), group, output.size(1) / group,
+                        output.size(2), output.size(3)});
+  using scalar_t = float;
+
+  for (int b = 0; b < batch; b++) {
+    auto input_n = input.select(0, b);
+    auto offset_n = offset.select(0, b);
+    auto mask_n = mask.select(0, b);
+
+    modulated_deformable_im2col_cpu(
+        input_n.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        mask_n.data_ptr<scalar_t>(),
+        1, channels, height, width,
+        height_out, width_out, kernel_h, kernel_w,
+        pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group,
+        columns.data_ptr<scalar_t>());
+
+    // divide into group
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      output[b][g] = output[b][g]
+                         .flatten(1)
+                         .addmm_(weight[g].flatten(1), columns[g])
+                         .view_as(output[b][g]);
+    }
+
+    // Undo the group split so the next image starts from the same layouts.
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+  }
+
+  output = output.view({output.size(0), output.size(1) * output.size(2),
+                        output.size(3), output.size(4)});
+
+  if (with_bias) {
+    // In-place add on a view: the caller's output storage is updated.
+    output += bias.view({1, bias.size(0), 1, 1});
+  }
+}
+
+// Backward pass of the modulated deformable convolution on CPU.
+//
+// For every sample b in the batch this function
+//   1. multiplies the transposed (grouped) weights with grad_output[b] to fill
+//      `columns`,
+//   2. calls modulated_deformable_col2im_coord_cpu with the grad_offset /
+//      grad_mask slices of that sample,
+//   3. calls modulated_deformable_col2im_cpu with the grad_input slice,
+//   4. calls modulated_deformable_im2col_cpu to rebuild the input columns and
+//      accumulates grad_weight (and grad_bias when with_bias is true) via
+//      addmm_.
+//
+// All raw buffers are accessed through data_ptr<float>(), so the tensors are
+// expected to be float32. Tensor arguments are handles passed by value: the
+// re-allocations of `ones` and `columns` rebind only the local handle, while
+// the views taken of the grad_* tensors alias the caller's storage.
+// `bias` is not read by this function.
+//
+// Raises (through TORCH_CHECK) when `input` or `weight` is not contiguous,
+// when weight's spatial size differs from kernel_h x kernel_w, or when
+// input.size(1) != weight.size(1) * group.
+void modulated_deform_conv_cpu_backward(
+    at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+    at::Tensor offset, at::Tensor mask, at::Tensor columns,
+    at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
+    at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
+    int kernel_h, int kernel_w, int stride_h, int stride_w,
+    int pad_h, int pad_w, int dilation_h, int dilation_w,
+    int group, int deformable_group,
+    const bool with_bias) {
+
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+
+  // TORCH_CHECK takes the condition first and then stream-style message
+  // parts. Passing the message as the first argument (as was done before)
+  // makes the check a no-op because a string literal is always truthy.
+  TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
+              "Input shape and kernel shape wont match: (",
+              kernel_h, " x ", kernel_w, " vs ",
+              kernel_h_, " x ", kernel_w_, ").");
+  TORCH_CHECK(channels == channels_kernel * group,
+              "Input shape and kernel channels wont match: (",
+              channels, " vs ", channels_kernel * group, ").");
+
+  // Spatial size of the convolution output.
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  grad_input = grad_input.view({batch, channels, height, width});
+  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
+                      input.options());
+
+  // Expose the group dimension: (N, group, C_out / group, H_out, W_out).
+  grad_output =
+      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
+                        grad_output.size(2), grad_output.size(3)});
+
+  using scalar_t = float;
+  for (int b = 0; b < batch; b++) {
+    // Per-sample slices.
+    auto input_n = input.select(0, b);
+    auto offset_n = offset.select(0, b);
+    auto mask_n = mask.select(0, b);
+    auto grad_output_n = grad_output.select(0, b);
+    auto grad_input_n = grad_input.select(0, b);
+    auto grad_offset_n = grad_offset.select(0, b);
+    auto grad_mask_n = grad_mask.select(0, b);
+
+    // Split columns and weight by group for the per-group matrix products.
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 0, alpha = 1: columns[g] is overwritten with W_g^T * dY_g.
+      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
+    }
+
+    // Merge the group dimension back before handing raw pointers to the kernels.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+
+    // gradient w.r.t. input coordinate data
+    modulated_deformable_col2im_coord_cpu(
+        columns.data_ptr<scalar_t>(),
+        input_n.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        mask_n.data_ptr<scalar_t>(),
+        1, channels, height, width,
+        height_out, width_out, kernel_h, kernel_w,
+        pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group,
+        grad_offset_n.data_ptr<scalar_t>(),
+        grad_mask_n.data_ptr<scalar_t>()
+    );
+    // gradient w.r.t. input data
+    modulated_deformable_col2im_cpu(
+        columns.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        mask_n.data_ptr<scalar_t>(),
+        1, channels, height, width,
+        height_out, width_out, kernel_h, kernel_w,
+        pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group,
+        grad_input_n.data_ptr<scalar_t>()
+    );
+    // gradient w.r.t. weight, dWeight should accumulate across the batch and group
+    // (columns is reused here as the im2col buffer of the input sample)
+    modulated_deformable_im2col_cpu(
+        input_n.data_ptr<scalar_t>(),
+        offset_n.data_ptr<scalar_t>(),
+        mask_n.data_ptr<scalar_t>(),
+        1, channels, height, width,
+        height_out, width_out, kernel_h, kernel_w,
+        pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group,
+        columns.data_ptr<scalar_t>()
+    );
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
+                                    grad_weight.size(1), grad_weight.size(2),
+                                    grad_weight.size(3)});
+    if (with_bias)
+      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+    for (int g = 0; g < group; g++) {
+      // grad_weight[g] += dY_g * columns_g^T
+      grad_weight[g] =
+          grad_weight[g]
+              .flatten(1)
+              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+              .view_as(grad_weight[g]);
+      if (with_bias) {
+        // grad_bias[g] += dY_g * ones, i.e. a sum over the output positions.
+        grad_bias[g] =
+            grad_bias[g]
+                .view({-1, 1})
+                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                .view(-1);
+      }
+    }
+
+    // Restore the ungrouped shapes for the next iteration.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                    grad_weight.size(2), grad_weight.size(3),
+                                    grad_weight.size(4)});
+    if (with_bias)
+      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+  }
+  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                  grad_output.size(2), grad_output.size(3),
+                                  grad_output.size(4)});
+}
+
+// Python bindings for the CPU deformable-convolution operators.
+// Each C++ entry point is registered on the extension module `m` under the
+// same name as the C++ function, together with a short docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // Deformable convolution: forward pass.
+  m.def("deform_conv_forward_cpu", 
+        &deform_conv_forward_cpu,
+        "deform forward (CPU)");
+  // Deformable convolution: backward pass w.r.t. the input.
+  m.def("deform_conv_backward_input_cpu", 
+        &deform_conv_backward_input_cpu,
+        "deform_conv_backward_input (CPU)");
+  // Deformable convolution: backward pass w.r.t. the parameters.
+  m.def("deform_conv_backward_parameters_cpu",
+        &deform_conv_backward_parameters_cpu,
+        "deform_conv_backward_parameters (CPU)");
+  // Modulated deformable convolution: forward pass.
+  m.def("modulated_deform_conv_cpu_forward",
+        &modulated_deform_conv_cpu_forward,
+        "modulated deform conv forward (CPU)");
+  // Modulated deformable convolution: backward pass.
+  m.def("modulated_deform_conv_cpu_backward",
+        &modulated_deform_conv_cpu_backward,
+        "modulated deform conv backward (CPU)");
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cpu_kernel.cpp
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cpu_kernel.cpp
@@ -0,0 +1,766 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+/*
+Modified by Jaided AI
+Released Date: 31/08/2022
+Description:
+Deformable convolution kernel for CPU. 
+This code is adapted from:
+https://github.com/MhLiao/DB/blob/master/assets/ops/dcn/src/deform_conv_cuda_kernel.cu
+https://github.com/CharlesShang/DCNv2
+https://github.com/lbin/DCNv2
+*/
+
+#include "deform_conv_cpu_kernel.h"
+#include <ATen/ATen.h>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+
+using namespace at;
+
+// Bilinearly interpolates `bottom_data` (row stride `data_width`) at the
+// fractional position (h, w). Corners that fall outside
+// [0, height - 1] x [0, width - 1] are treated as zero.
+float deformable_im2col_bilinear(
+        const float *bottom_data, const int data_width,
+        const int height, const int width,
+        float h, float w) {
+
+  // Integer corners of the cell that contains (h, w).
+  const int row_lo = floor(h);
+  const int col_lo = floor(w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // Fractional offsets inside the cell and their complements.
+  const float frac_h = h - row_lo;
+  const float frac_w = w - col_lo;
+  const float rest_h = 1 - frac_h;
+  const float rest_w = 1 - frac_w;
+
+  // Which corners may be read.
+  const bool row_lo_ok = row_lo >= 0;
+  const bool col_lo_ok = col_lo >= 0;
+  const bool row_hi_ok = row_hi <= height - 1;
+  const bool col_hi_ok = col_hi <= width - 1;
+
+  const float top_left =
+      (row_lo_ok && col_lo_ok) ? bottom_data[row_lo * data_width + col_lo] : 0.0f;
+  const float top_right =
+      (row_lo_ok && col_hi_ok) ? bottom_data[row_lo * data_width + col_hi] : 0.0f;
+  const float bottom_left =
+      (row_hi_ok && col_lo_ok) ? bottom_data[row_hi * data_width + col_lo] : 0.0f;
+  const float bottom_right =
+      (row_hi_ok && col_hi_ok) ? bottom_data[row_hi * data_width + col_hi] : 0.0f;
+
+  // Interpolation weight of each corner.
+  const float k_tl = rest_h * rest_w;
+  const float k_tr = rest_h * frac_w;
+  const float k_bl = frac_h * rest_w;
+  const float k_br = frac_h * frac_w;
+
+  return (k_tl * top_left + k_tr * top_right + k_bl * bottom_left + k_br * bottom_right);
+}
+
+// Returns the bilinear weight with which the integer pixel (h, w) takes part
+// in a sample drawn at the fractional position (argmax_h, argmax_w).
+// The result is 0 when the sample lies outside (-1, height) x (-1, width) or
+// when (h, w) is not one of the four corners surrounding the sample.
+float get_gradient_weight(
+        float argmax_h, float argmax_w,
+        const int h, const int w,
+        const int height, const int width) {
+
+  const bool outside = argmax_h <= -1 || argmax_h >= height ||
+                       argmax_w <= -1 || argmax_w >= width;
+  if (outside)
+    return 0;
+
+  const int row_lo = floor(argmax_h);
+  const int col_lo = floor(argmax_w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // (h, w) must coincide with a corner row and a corner column.
+  const bool on_row_lo = (h == row_lo);
+  const bool on_row_hi = (h == row_hi);
+  const bool on_col_lo = (w == col_lo);
+  const bool on_col_hi = (w == col_hi);
+  if (!(on_row_lo || on_row_hi) || !(on_col_lo || on_col_hi))
+    return 0;
+
+  // Separable weight: one factor per axis, chosen by the side the corner is on.
+  const float row_factor = on_row_lo ? (h + 1 - argmax_h) : (argmax_h + 1 - h);
+  const float col_factor = on_col_lo ? (w + 1 - argmax_w) : (argmax_w + 1 - w);
+  return row_factor * col_factor;
+}
+
+// Returns the derivative of the bilinear sample of `im_data` (row stride
+// `data_width`) taken at (argmax_h, argmax_w) with respect to one coordinate:
+// bp_dir == 0 differentiates along h, bp_dir == 1 along w. Any other bp_dir,
+// or a sample outside (-1, height) x (-1, width), yields 0. Corners outside
+// the image are skipped.
+float get_coordinate_weight(
+        float argmax_h, float argmax_w,
+        const int height, const int width,
+        const float *im_data, const int data_width,
+        const int bp_dir) {
+
+  const bool outside = argmax_h <= -1 || argmax_h >= height ||
+                       argmax_w <= -1 || argmax_w >= width;
+  if (outside)
+    return 0;
+
+  const int row_lo = floor(argmax_h);
+  const int col_lo = floor(argmax_w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  const bool row_lo_ok = row_lo >= 0;
+  const bool col_lo_ok = col_lo >= 0;
+  const bool row_hi_ok = row_hi <= height - 1;
+  const bool col_hi_ok = col_hi <= width - 1;
+
+  // Only invoked for corners that passed the bounds tests above.
+  auto pixel = [im_data, data_width](int row, int col) {
+    return im_data[row * data_width + col];
+  };
+
+  float weight = 0;
+  switch (bp_dir) {
+    case 0:
+      // d/dh: bottom row minus top row, blended horizontally.
+      if (row_lo_ok && col_lo_ok)
+        weight += -1 * (col_lo + 1 - argmax_w) * pixel(row_lo, col_lo);
+      if (row_lo_ok && col_hi_ok)
+        weight += -1 * (argmax_w - col_lo) * pixel(row_lo, col_hi);
+      if (row_hi_ok && col_lo_ok)
+        weight += (col_lo + 1 - argmax_w) * pixel(row_hi, col_lo);
+      if (row_hi_ok && col_hi_ok)
+        weight += (argmax_w - col_lo) * pixel(row_hi, col_hi);
+      break;
+    case 1:
+      // d/dw: right column minus left column, blended vertically.
+      if (row_lo_ok && col_lo_ok)
+        weight += -1 * (row_lo + 1 - argmax_h) * pixel(row_lo, col_lo);
+      if (row_lo_ok && col_hi_ok)
+        weight += (row_lo + 1 - argmax_h) * pixel(row_lo, col_hi);
+      if (row_hi_ok && col_lo_ok)
+        weight += -1 * (argmax_h - row_lo) * pixel(row_hi, col_lo);
+      if (row_hi_ok && col_hi_ok)
+        weight += (argmax_h - row_lo) * pixel(row_hi, col_hi);
+      break;
+    default:
+      break;
+  }
+
+  return weight;
+}
+
+// Deformable im2col: for each of the `n` flat indices (one per image, input
+// channel and output pixel) samples the input at every kernel tap, shifted by
+// the learned offsets, and stores the kernel_h * kernel_w samples in data_col.
+// Taps whose sampling position is outside (-1, height) x (-1, width) give 0.
+void deformable_im2col_cpu_kernel(
+        const int n, const float *data_im, const float *data_offset,
+        const int height, const int width, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+        const int batch_size, const int num_channels, const int deformable_group,
+        const int height_col, const int width_col,
+        float *data_col) {
+
+  // Distance in data_col between two consecutive kernel taps.
+  const int col_plane = height_col * width_col;
+
+  for (int index = 0; index < n; ++index) {
+    // Peel (w_col, h_col, c_im, b_col) off the flat index, fastest axis first.
+    int rest = index;
+    const int w_col = rest % width_col;
+    rest /= width_col;
+    const int h_col = rest % height_col;
+    rest /= height_col;
+    const int c_im = rest % num_channels;
+    rest /= num_channels;
+    const int b_col = rest % batch_size;
+
+    const int c_col = c_im * kernel_h * kernel_w;
+    const int group_idx = c_im / channel_per_deformable_group;
+
+    // Top-left input coordinate of the receptive field (before offsets).
+    const int h_start = h_col * stride_h - pad_h;
+    const int w_start = w_col * stride_w - pad_w;
+
+    float *col_out = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
+    const float *image = data_im + (b_col * num_channels + c_im) * height * width;
+    const float *offsets = data_offset + (b_col * deformable_group + group_idx) * 2 * kernel_h * kernel_w * height_col * width_col;
+
+    int tap = 0;  // always equals i * kernel_w + j
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j, ++tap) {
+        // Offsets are stored as interleaved (h, w) channel pairs per tap.
+        const float offset_h = offsets[((2 * tap) * height_col + h_col) * width_col + w_col];
+        const float offset_w = offsets[((2 * tap + 1) * height_col + h_col) * width_col + w_col];
+
+        const float h_im = h_start + i * dilation_h + offset_h;
+        const float w_im = w_start + j * dilation_w + offset_w;
+
+        const bool inside = h_im > -1 && w_im > -1 && h_im < height && w_im < width;
+        col_out[tap * col_plane] =
+            inside ? deformable_im2col_bilinear(image, width, height, width, h_im, w_im)
+                   : static_cast<float>(0);
+      }
+    }
+  }
+}
+
+// Host-side entry point for deformable im2col: derives the output size and
+// the number of work items, then runs deformable_im2col_cpu_kernel.
+void deformable_im2col(
+        const float* data_im, const float* data_offset, const int channels,
+        const int height, const int width, const int ksize_h, const int ksize_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int parallel_imgs,
+        const int deformable_group, float* data_col) {
+
+  // todo: check parallel_imgs is correctly passed in
+
+  // Extent covered by the dilated kernel along each axis.
+  const int span_h = dilation_h * (ksize_h - 1) + 1;
+  const int span_w = dilation_w * (ksize_w - 1) + 1;
+
+  // Spatial size of the column buffer.
+  const int out_h = (height + 2 * pad_h - span_h) / stride_h + 1;
+  const int out_w = (width + 2 * pad_w - span_w) / stride_w + 1;
+
+  // One work item per (image, channel, output pixel).
+  const int work_items = channels * out_h * out_w * parallel_imgs;
+  const int channels_per_group = channels / deformable_group;
+
+  deformable_im2col_cpu_kernel(
+      work_items, data_im, data_offset,
+      height, width, ksize_h, ksize_w,
+      pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w, channels_per_group,
+      parallel_imgs, channels, deformable_group,
+      out_h, out_w, data_col);
+}
+
+// Deformable col2im: scatters every column gradient data_col[index] back onto
+// the input-gradient image grad_im. Each of the `n` flat indices identifies a
+// (channel, kernel tap, image, output pixel) combination; its gradient is
+// added to the input pixels within distance < 1 of the offset sampling
+// position, weighted by get_gradient_weight.
+void deformable_col2im_cpu_kernel(
+        const int n, const float *data_col, const float *data_offset, const int channels,
+        const int height, const int width, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+        const int batch_size, const int deformable_group,
+        const int height_col, const int width_col, float *grad_im) {
+
+  for (int index = 0; index < n; ++index) {
+    // Peel (w_out, h_out, b, j, i, c) off the flat index, fastest axis first.
+    int rest = index;
+    const int w_out = rest % width_col;
+    rest /= width_col;
+    const int h_out = rest % height_col;
+    rest /= height_col;
+    const int b = rest % batch_size;
+    rest /= batch_size;
+    const int j = rest % kernel_w;
+    rest /= kernel_w;
+    const int i = rest % kernel_h;
+    const int c = rest / kernel_h;
+
+    const int group_idx = c / channel_per_deformable_group;
+    const int w_in = w_out * stride_w - pad_w;
+    const int h_in = h_out * stride_h - pad_h;
+
+    // Offsets are stored as interleaved (h, w) channel pairs per kernel tap.
+    const int tap = i * kernel_w + j;
+    const float *offsets = data_offset + (b * deformable_group + group_idx) *
+                                             2 * kernel_h * kernel_w * height_col * width_col;
+    const float offset_h = offsets[((2 * tap) * height_col + h_out) * width_col + w_out];
+    const float offset_w = offsets[((2 * tap + 1) * height_col + h_out) * width_col + w_out];
+
+    // Fractional input position this column element was sampled from.
+    const float sample_h = h_in + i * dilation_h + offset_h;
+    const float sample_w = w_in + j * dilation_w + offset_w;
+
+    const float top_grad = data_col[index];
+    const int base_h = (int)sample_h;
+    const int base_w = (int)sample_w;
+
+    // Visit the 5x5 neighbourhood of the truncated position; keep only
+    // in-image pixels closer than 1 to the sample on both axes.
+    for (int y = base_h - 2; y <= base_h + 2; ++y) {
+      if (y < 0 || y >= height || !(abs(sample_h - y) < 1))
+        continue;
+      for (int x = base_w - 2; x <= base_w + 2; ++x) {
+        if (x < 0 || x >= width || !(abs(sample_w - x) < 1))
+          continue;
+        const int grad_pos = ((b * channels + c) * height + y) * width + x;
+        const float weight = get_gradient_weight(sample_h, sample_w, y, x, height, width);
+        grad_im[grad_pos] += weight * top_grad;
+      }
+    }
+  }
+}
+
+// Host-side entry point for deformable col2im: derives the output size and
+// the number of work items, then runs deformable_col2im_cpu_kernel, which
+// accumulates into grad_im.
+void deformable_col2im(
+        const float* data_col, const float* data_offset, const int channels,
+        const int height, const int width, const int ksize_h, const int ksize_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w,
+        const int parallel_imgs, const int deformable_group, float* grad_im) {
+
+  // todo: make sure parallel_imgs is passed in correctly
+
+  // Extent covered by the dilated kernel along each axis.
+  const int span_h = dilation_h * (ksize_h - 1) + 1;
+  const int span_w = dilation_w * (ksize_w - 1) + 1;
+
+  // Spatial size of the column buffer.
+  const int out_h = (height + 2 * pad_h - span_h) / stride_h + 1;
+  const int out_w = (width + 2 * pad_w - span_w) / stride_w + 1;
+
+  // One work item per element of the column buffer.
+  const int work_items = channels * ksize_h * ksize_w * out_h * out_w * parallel_imgs;
+  const int channels_per_group = channels / deformable_group;
+
+  deformable_col2im_cpu_kernel(
+      work_items, data_col, data_offset, channels,
+      height, width, ksize_h, ksize_w,
+      pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w, channels_per_group,
+      parallel_imgs, deformable_group, out_h, out_w, grad_im);
+}
+
+// Computes the gradient with respect to the sampling offsets.
+//
+// Each of the `n` flat indices addresses one element of grad_offset and is
+// decomposed into (b, c, h, w) with c in [0, offset_channels). For that
+// element the kernel sums, over the column channels of the matching
+// deformable group that share its kernel tap, the column gradient multiplied
+// by the coordinate weight from get_coordinate_weight, and writes (not
+// accumulates) the sum to grad_offset[index].
+//
+// `channel_per_deformable_group` counts column channels per group here
+// (deformable_col2im_coord passes channels * ksize_h * ksize_w /
+// deformable_group). The `channels` parameter is not read by this kernel.
+void deformable_col2im_coord_cpu_kernel(
+        const int n, const float *data_col, const float *data_im, 
+        const float *data_offset, const int channels, 
+        const int height, const int width, const int kernel_h, const int kernel_w, 
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+        const int batch_size, const int offset_channels, const int deformable_group,
+        const int height_col, const int width_col, float *grad_offset) {
+
+  for(int index = 0; index < n; index++)
+  {
+    float val = 0;
+    // Decompose the flat index: w fastest, then h, then offset channel c, then image b.
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+
+    // Every deformable group owns 2 * kernel_h * kernel_w offset channels.
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    // Stride between column channels that belong to the same kernel tap.
+    const int col_step = kernel_h * kernel_w;
+    // Counts loop iterations; used below to advance data_im_ptr by one image plane each time.
+    int cnt = 0;
+    // Start of this group's block of column channels.
+    const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
+                                                  batch_size * width_col * height_col;
+    // Start of this group's image planes in image b; dividing by kernel_h and
+    // kernel_w turns the column-channel count into an image-channel count.
+    const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
+                                                channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    // Start of this group's offset channels in image b.
+    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                                                        kernel_h * kernel_w * height_col * width_col;
+
+    // Offset channel index relative to the start of the group.
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    // offset_c / 2 is the kernel tap; stepping by col_step keeps the same tap
+    // while moving through the group's column channels.
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
+    {
+      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      // Even offset channels hold the h offset (bp_dir 0), odd ones the w offset (bp_dir 1).
+      const int bp_dir = offset_c % 2;
+
+      // Recover the kernel tap (i, j) and output pixel from the column position.
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+      const float offset_h = data_offset_ptr[data_offset_h_ptr];
+      const float offset_w = data_offset_ptr[data_offset_w_ptr];
+      // Fractional input position sampled by this tap.
+      float inv_h = h_in + i * dilation_h + offset_h;
+      float inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+      {
+        // Out-of-range samples are mapped to -2, for which
+        // get_coordinate_weight returns 0.
+        inv_h = inv_w = -2;
+      }
+    
+      const float weight = get_coordinate_weight(
+          inv_h, inv_w,
+          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+
+    grad_offset[index] = val;
+  }
+}
+
+// Host-side entry point for the offset gradient: derives the output size,
+// the number of grad_offset elements and the per-group column-channel count,
+// then runs deformable_col2im_coord_cpu_kernel.
+void deformable_col2im_coord(
+        const float* data_col, const float* data_im,
+        const float* data_offset, const int channels,
+        const int height, const int width, const int ksize_h, const int ksize_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int parallel_imgs,
+        const int deformable_group, float* grad_offset) {
+
+  // Extent covered by the dilated kernel along each axis.
+  const int span_h = dilation_h * (ksize_h - 1) + 1;
+  const int span_w = dilation_w * (ksize_w - 1) + 1;
+
+  // Spatial size of the column buffer.
+  const int out_h = (height + 2 * pad_h - span_h) / stride_h + 1;
+  const int out_w = (width + 2 * pad_w - span_w) / stride_w + 1;
+
+  // One work item per element of grad_offset.
+  const int work_items = out_h * out_w * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
+  // Column channels (image channels times kernel taps) handled by one group.
+  const int col_channels_per_group = channels * ksize_h * ksize_w / deformable_group;
+  // Total number of offset channels across all groups.
+  const int offset_channels = 2 * ksize_h * ksize_w * deformable_group;
+
+  deformable_col2im_coord_cpu_kernel(
+      work_items, data_col, data_im,
+      data_offset, channels,
+      height, width, ksize_h, ksize_w,
+      pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w, col_channels_per_group,
+      parallel_imgs, offset_channels, deformable_group,
+      out_h, out_w, grad_offset);
+}
+
+// Bilinear interpolation of `bottom_data` (row stride `data_width`) at the
+// fractional position (h, w), used by the modulated deformable kernels.
+// Corners outside [0, height - 1] x [0, width - 1] count as zero.
+float dmcn_im2col_bilinear_cpu(
+        const float *bottom_data, const int data_width,
+        const int height, const int width,
+        float h, float w) {
+
+  const int y0 = floor(h);
+  const int x0 = floor(w);
+  const int y1 = y0 + 1;
+  const int x1 = x0 + 1;
+
+  // Distance from the low corner and from the high corner on each axis.
+  const float dy = h - y0;
+  const float dx = w - x0;
+  const float dy_inv = 1 - dy;
+  const float dx_inv = 1 - dx;
+
+  // Corner values; each stays 0 unless the corner is inside the image.
+  float corner00 = 0, corner01 = 0, corner10 = 0, corner11 = 0;
+  if (y0 >= 0) {
+    const float *row = bottom_data + y0 * data_width;
+    if (x0 >= 0)
+      corner00 = row[x0];
+    if (x1 <= width - 1)
+      corner01 = row[x1];
+  }
+  if (y1 <= height - 1) {
+    const float *row = bottom_data + y1 * data_width;
+    if (x0 >= 0)
+      corner10 = row[x0];
+    if (x1 <= width - 1)
+      corner11 = row[x1];
+  }
+
+  const float k00 = dy_inv * dx_inv;
+  const float k01 = dy_inv * dx;
+  const float k10 = dy * dx_inv;
+  const float k11 = dy * dx;
+
+  return (k00 * corner00 + k01 * corner01 + k10 * corner10 + k11 * corner11);
+}
+
+// Bilinear weight linking the integer pixel (h, w) to a sample taken at the
+// fractional position (argmax_h, argmax_w); used by the modulated deformable
+// kernels. Returns 0 for samples outside (-1, height) x (-1, width) and for
+// pixels that are not one of the four corners around the sample.
+float dmcn_get_gradient_weight_cpu(
+        float argmax_h, float argmax_w,
+        const int h, const int w,
+        const int height, const int width) {
+
+  if (argmax_h <= -1 || argmax_h >= height)
+    return 0;
+  if (argmax_w <= -1 || argmax_w >= width)
+    return 0;
+
+  const int y0 = floor(argmax_h);
+  const int x0 = floor(argmax_w);
+  const int y1 = y0 + 1;
+  const int x1 = x0 + 1;
+
+  // The pixel must sit on a corner row and on a corner column.
+  const bool row_is_low = (h == y0);
+  const bool col_is_low = (w == x0);
+  const bool row_matches = row_is_low || (h == y1);
+  const bool col_matches = col_is_low || (w == x1);
+  if (!row_matches || !col_matches)
+    return 0;
+
+  // Per-axis factor depends on which side of the sample the corner lies.
+  float along_h;
+  if (row_is_low)
+    along_h = h + 1 - argmax_h;
+  else
+    along_h = argmax_h + 1 - h;
+
+  float along_w;
+  if (col_is_low)
+    along_w = w + 1 - argmax_w;
+  else
+    along_w = argmax_w + 1 - w;
+
+  return along_h * along_w;
+}
+
+// Derivative of the bilinear sample of `im_data` (row stride `data_width`) at
+// (argmax_h, argmax_w) with respect to one coordinate; used by the modulated
+// deformable kernels. bp_dir == 0 differentiates along h, bp_dir == 1 along
+// w; any other value, or a sample outside (-1, height) x (-1, width), gives
+// 0. Corners outside the image are skipped.
+float dmcn_get_coordinate_weight_cpu(
+        float argmax_h, float argmax_w,
+        const int height, const int width,
+        const float *im_data, const int data_width,
+        const int bp_dir) {
+
+  if (argmax_h <= -1 || argmax_h >= height)
+    return 0;
+  if (argmax_w <= -1 || argmax_w >= width)
+    return 0;
+
+  const int y0 = floor(argmax_h);
+  const int x0 = floor(argmax_w);
+  const int y1 = y0 + 1;
+  const int x1 = x0 + 1;
+
+  // Availability of each corner, in the order the terms are accumulated.
+  const bool has00 = y0 >= 0 && x0 >= 0;
+  const bool has01 = y0 >= 0 && x1 <= width - 1;
+  const bool has10 = y1 <= height - 1 && x0 >= 0;
+  const bool has11 = y1 <= height - 1 && x1 <= width - 1;
+
+  float weight = 0;
+
+  if (bp_dir == 0) {
+    // d/dh: bottom row minus top row, blended horizontally.
+    if (has00)
+      weight += -1 * (x0 + 1 - argmax_w) * im_data[y0 * data_width + x0];
+    if (has01)
+      weight += -1 * (argmax_w - x0) * im_data[y0 * data_width + x1];
+    if (has10)
+      weight += (x0 + 1 - argmax_w) * im_data[y1 * data_width + x0];
+    if (has11)
+      weight += (argmax_w - x0) * im_data[y1 * data_width + x1];
+    return weight;
+  }
+
+  if (bp_dir == 1) {
+    // d/dw: right column minus left column, blended vertically.
+    if (has00)
+      weight += -1 * (y0 + 1 - argmax_h) * im_data[y0 * data_width + x0];
+    if (has01)
+      weight += (y0 + 1 - argmax_h) * im_data[y0 * data_width + x1];
+    if (has10)
+      weight += -1 * (argmax_h - y0) * im_data[y1 * data_width + x0];
+    if (has11)
+      weight += (argmax_h - y0) * im_data[y1 * data_width + x1];
+  }
+
+  return weight;
+}
+
+// Modulated deformable im2col: for each of the `n` flat indices (one per
+// image, input channel and output pixel) samples the input at every kernel
+// tap, shifted by the learned offsets, multiplies the sample by the tap's
+// mask value and stores it in data_col. Taps whose sampling position is
+// outside (-1, height) x (-1, width) contribute 0.
+void modulated_deformable_im2col_cpu_kernel(
+        const int n, const float *data_im, const float *data_offset, const float *data_mask,
+        const int height, const int width, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+        const int batch_size, const int num_channels, const int deformable_group,
+        const int height_col, const int width_col, float *data_col) {
+
+  // NOTE(CharlesShang): different from Dai Jifeng's MXNet implementation, col_buffer is of shape (c*kw*kh, N, oh, ow)
+  // here columns is of shape (N, c*kw*kh, oh * ow), need to adapt axis
+
+  // Distance in data_col between two consecutive kernel taps.
+  const int col_plane = height_col * width_col;
+
+  for (int index = 0; index < n; ++index) {
+    // Peel (w_col, h_col, c_im, b_col) off the flat index, fastest axis first.
+    int rest = index;
+    const int w_col = rest % width_col;
+    rest /= width_col;
+    const int h_col = rest % height_col;
+    rest /= height_col;
+    const int c_im = rest % num_channels;
+    rest /= num_channels;
+    const int b_col = rest % batch_size;
+
+    const int c_col = c_im * kernel_h * kernel_w;
+    const int group_idx = c_im / channel_per_deformable_group;
+
+    // Top-left input coordinate of the receptive field (before offsets).
+    const int h_start = h_col * stride_h - pad_h;
+    const int w_start = w_col * stride_w - pad_w;
+
+    float *col_out = data_col + ((b_col * num_channels * kernel_w * kernel_h + c_col) * height_col + h_col) * width_col + w_col;
+    const float *image = data_im + (b_col * num_channels + c_im) * height * width;
+    const float *offsets = data_offset + (b_col * deformable_group + group_idx) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const float *masks = data_mask + (b_col * deformable_group + group_idx) * kernel_h * kernel_w * height_col * width_col;
+
+    int tap = 0;  // always equals i * kernel_w + j
+    for (int i = 0; i < kernel_h; ++i) {
+      for (int j = 0; j < kernel_w; ++j, ++tap) {
+        // Offsets: interleaved (h, w) channel pairs per tap; masks: one channel per tap.
+        const float offset_h = offsets[((2 * tap) * height_col + h_col) * width_col + w_col];
+        const float offset_w = offsets[((2 * tap + 1) * height_col + h_col) * width_col + w_col];
+        const float mask = masks[(tap * height_col + h_col) * width_col + w_col];
+
+        const float h_im = h_start + i * dilation_h + offset_h;
+        const float w_im = w_start + j * dilation_w + offset_w;
+
+        const bool inside = h_im > -1 && w_im > -1 && h_im < height && w_im < width;
+        const float val =
+            inside ? dmcn_im2col_bilinear_cpu(image, width, height, width, h_im, w_im)
+                   : static_cast<float>(0);
+        col_out[tap * col_plane] = val * mask;
+      }
+    }
+  }
+}
+
+// CPU col2im step of modulated deformable convolution.
+//
+// For each of the n column-buffer entries, the value in data_col is scaled by
+// its modulation mask and scattered onto grad_im around the (fractional)
+// sampling position, weighted by dmcn_get_gradient_weight_cpu. grad_im is
+// accumulated into, never cleared here.
+void modulated_deformable_col2im_cpu_kernel(
+        const int n, const float *data_col, const float *data_offset,
+        const float *data_mask, const int channels,
+        const int height, const int width, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+        const int batch_size, const int deformable_group,
+        const int height_col, const int width_col, float *grad_im) {
+
+  for (int idx = 0; idx < n; ++idx)
+  {
+    // Unravel the flat index as (c, i, j, b, h_out, w_out), w_out fastest.
+    int rest = idx;
+    const int w_out = rest % width_col;
+    rest /= width_col;
+    const int h_out = rest % height_col;
+    rest /= height_col;
+    const int b = rest % batch_size;
+    rest /= batch_size;
+    const int j = rest % kernel_w;
+    rest /= kernel_w;
+    const int i = rest % kernel_h;
+    const int c = rest / kernel_h;
+
+    const int group_idx = c / channel_per_deformable_group;
+
+    // Top-left corner of the receptive field in input coordinates.
+    const int w_start = w_out * stride_w - pad_w;
+    const int h_start = h_out * stride_h - pad_h;
+
+    // Offsets are stored as (h, w) channel pairs per kernel tap; masks use one
+    // channel per tap.
+    const int tap = i * kernel_w + j;
+    const float *offset_base = data_offset + (b * deformable_group + group_idx) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const float *mask_base = data_mask + (b * deformable_group + group_idx) * kernel_h * kernel_w * height_col * width_col;
+    const float offset_h = offset_base[((2 * tap) * height_col + h_out) * width_col + w_out];
+    const float offset_w = offset_base[((2 * tap + 1) * height_col + h_out) * width_col + w_out];
+    const float mask = mask_base[(tap * height_col + h_out) * width_col + w_out];
+
+    // Fractional sampling position and the masked incoming gradient.
+    const float sample_h = h_start + i * dilation_h + offset_h;
+    const float sample_w = w_start + j * dilation_w + offset_w;
+    const float top_grad = data_col[idx] * mask;
+    const int base_h = (int)sample_h;
+    const int base_w = (int)sample_w;
+
+    // Visit the 5x5 integer neighbourhood and keep only in-bounds pixels that
+    // lie strictly within one pixel of the sampling position on both axes.
+    for (int dy = -2; dy <= 2; dy++)
+    {
+      const int y = base_h + dy;
+      if (y < 0 || y >= height || !(abs(sample_h - y) < 1))
+        continue;
+
+      for (int dx = -2; dx <= 2; dx++)
+      {
+        const int x = base_w + dx;
+        if (x < 0 || x >= width || !(abs(sample_w - x) < 1))
+          continue;
+
+        const int grad_pos = ((b * channels + c) * height + y) * width + x;
+        const float weight = dmcn_get_gradient_weight_cpu(sample_h, sample_w, y, x, height, width);
+        grad_im[grad_pos] += weight * top_grad;
+      }
+    }
+  }
+}
+
+// CPU backward pass of modulated deformable convolution with respect to the
+// sampling offsets (grad_offset) and the modulation mask (grad_mask).
+//
+// Each of the n work items produces one element of grad_offset. The flat
+// index is decoded as (b, c, h, w) with c ranging over offset_channels.
+// Both outputs are assigned, not accumulated into. The `channels` parameter
+// is not read by this function.
+void modulated_deformable_col2im_coord_cpu_kernel(
+        const int n, const float *data_col, const float *data_im,
+        const float *data_offset, const float *data_mask, const int channels, 
+        const int height, const int width, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+        const int batch_size, const int offset_channels, const int deformable_group,
+        const int height_col, const int width_col, float *grad_offset, float *grad_mask) {
+
+  for(int index = 0; index < n; index++)
+  {
+    // val accumulates the offset gradient, mval the mask gradient.
+    float val = 0, mval = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    // Each deformable group owns 2 * kernel_h * kernel_w offset channels.
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    const int col_step = kernel_h * kernel_w;
+    // cnt counts loop iterations; it selects successive height*width planes of data_im_ptr.
+    int cnt = 0;
+    const float *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group * batch_size * width_col * height_col;
+    const float *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const float *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const float *data_mask_ptr = data_mask + (b * deformable_group + deformable_group_index) * kernel_h * kernel_w * height_col * width_col;
+
+    // Offset channel index relative to the start of this deformable group.
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    // Start at column channel offset_c / 2 and step by kernel_h * kernel_w, so
+    // every visited column channel maps to the same kernel tap (i, j).
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
+    {
+      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      // 0 for even offset channels (the h component), 1 for odd ones (the w
+      // component); forwarded to dmcn_get_coordinate_weight_cpu.
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+      const int data_mask_hw_ptr = (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
+      const float offset_h = data_offset_ptr[data_offset_h_ptr];
+      const float offset_w = data_offset_ptr[data_offset_w_ptr];
+      const float mask = data_mask_ptr[data_mask_hw_ptr];
+      // Fractional sampling position in input coordinates.
+      float inv_h = h_in + i * dilation_h + offset_h;
+      float inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+      {
+        // Out-of-range samples are replaced by the sentinel position (-2, -2)
+        // and contribute nothing to the mask gradient.
+        inv_h = inv_w = -2;
+      }
+      else
+      {
+        mval += data_col_ptr[col_pos] * dmcn_im2col_bilinear_cpu(data_im_ptr + cnt * height * width, width, height, width, inv_h, inv_w);
+      }
+      // The coordinate weight is evaluated for every sample, including the
+      // sentinel position set above.
+      const float weight = dmcn_get_coordinate_weight_cpu(
+          inv_h, inv_w,
+          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+      val += weight * data_col_ptr[col_pos] * mask;
+      cnt += 1;
+    }
+    grad_offset[index] = val;
+    // The h and w offset channels of a tap share one mask channel, so only the
+    // even (h) channel writes the mask gradient.
+    if (offset_c % 2 == 0)
+      grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * height_col + h) * width_col + w] = mval;
+  }
+}
+
+// Entry point for the CPU modulated deformable im2col.
+//
+// Derives the work-item count and the number of channels per deformable group,
+// then forwards everything to modulated_deformable_im2col_cpu_kernel, which
+// fills data_col.
+void modulated_deformable_im2col_cpu(
+        const float* data_im, const float* data_offset, const float* data_mask,
+        const int batch_size, const int channels, const int height_im, const int width_im,
+        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int deformable_group,
+        float* data_col) {
+
+  // One work item per (channel, batch element, output row, output column).
+  const int total_items = channels * batch_size * height_col * width_col;
+  const int group_channels = channels / deformable_group;
+
+  modulated_deformable_im2col_cpu_kernel(
+      total_items,
+      data_im, data_offset, data_mask,
+      height_im, width_im,
+      kernel_h, kernel_w,
+      pad_h, pad_w,
+      stride_h, stride_w,
+      dilation_h, dilation_w,
+      group_channels,
+      batch_size, channels, deformable_group,
+      height_col, width_col,
+      data_col);
+}
+
+// Entry point for the CPU modulated deformable col2im (gradient with respect
+// to the input image).
+//
+// Derives the work-item count and the number of channels per deformable group,
+// then forwards everything to modulated_deformable_col2im_cpu_kernel, which
+// accumulates into grad_im.
+void modulated_deformable_col2im_cpu(
+        const float* data_col, const float* data_offset, const float* data_mask,
+        const int batch_size, const int channels, const int height_im, const int width_im,
+        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int deformable_group,
+        float* grad_im) {
+
+  const int channel_per_deformable_group = channels / deformable_group;
+  // One work item per column-buffer entry:
+  // (channel, kernel row, kernel column, batch element, output row, output column).
+  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+  modulated_deformable_col2im_cpu_kernel(
+      num_kernels, data_col, data_offset, data_mask, channels,
+      height_im, width_im, kernel_h, kernel_w,
+      // The horizontal padding must be pad_w. Passing pad_h twice produced
+      // wrong gradients whenever pad_h != pad_w.
+      pad_h, pad_w, stride_h, stride_w,
+      dilation_h, dilation_w, channel_per_deformable_group,
+      batch_size, deformable_group,
+      height_col, width_col, grad_im);
+}
+
+// Entry point for the CPU modulated deformable col2im_coord (gradients with
+// respect to the sampling offsets and the modulation mask).
+//
+// Derives the work-item count, the number of offset channels and the number
+// of column-buffer channels per deformable group, then forwards everything to
+// modulated_deformable_col2im_coord_cpu_kernel.
+void modulated_deformable_col2im_coord_cpu(
+        const float* data_col, const float* data_im, const float* data_offset, const float* data_mask,
+        const int batch_size, const int channels, const int height_im, const int width_im,
+        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int deformable_group,
+        float* grad_offset, float* grad_mask) {
+
+  // One work item per element of grad_offset.
+  const int total_items = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
+  // Column-buffer channels (input channels times kernel taps) per group.
+  const int group_col_channels = channels * kernel_h * kernel_w / deformable_group;
+  // Two offset channels (h and w) per kernel tap and deformable group.
+  const int offset_channel_count = 2 * kernel_h * kernel_w * deformable_group;
+
+  modulated_deformable_col2im_coord_cpu_kernel(
+        total_items,
+        data_col, data_im, data_offset, data_mask,
+        channels, height_im, width_im,
+        kernel_h, kernel_w,
+        pad_h, pad_w,
+        stride_h, stride_w,
+        dilation_h, dilation_w,
+        group_col_channels,
+        batch_size, offset_channel_count, deformable_group,
+        height_col, width_col,
+        grad_offset, grad_mask);
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cpu_kernel.h
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cpu_kernel.h
@@ -0,0 +1,64 @@
+/*
+Created by Jaided AI
+Released Date: 31/08/2022
+Description:
+Deformable convolution kernel for CPU. 
+This code is adapted from:
+https://github.com/MhLiao/DB/blob/master/assets/ops/dcn/src/deform_conv_cuda.cpp
+https://github.com/CharlesShang/DCNv2
+https://github.com/lbin/DCNv2
+*/
+
+#pragma once
+#ifndef DEFORM_CONV_CPU_KERNEL
+#define DEFORM_CONV_CPU_KERNEL
+
+// Declarations of the CPU deformable-convolution routines. All buffers are
+// raw float arrays supplied by the caller.
+
+// Deformable im2col: writes the column buffer data_col from data_im and
+// data_offset.
+// NOTE(review): parallel_imgs is presumably the number of images processed
+// per call - confirm against the definition.
+void deformable_im2col(
+        const float *data_im, const float *data_offset, const int channels,
+        const int height, const int width, const int ksize_h, const int ksize_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int parallel_imgs,
+        const int deformable_group, float *data_col);
+
+// Deformable col2im: writes the input-image gradient grad_im from the column
+// buffer data_col and data_offset.
+void deformable_col2im(
+        const float *data_col, const float *data_offset, const int channels,
+        const int height, const int width, const int ksize_h, const int ksize_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int parallel_imgs,
+        const int deformable_group, float *grad_im);
+
+// Deformable col2im_coord: writes the offset gradient grad_offset from the
+// column buffer data_col, the input data_im and data_offset.
+void deformable_col2im_coord(
+        const float *data_col, const float *data_im,
+        const float *data_offset, const int channels,
+        const int height, const int width, const int ksize_h, const int ksize_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int parallel_imgs,
+        const int deformable_group, float *grad_offset);
+
+// Modulated deformable im2col: like deformable_im2col, with an additional
+// per-sample modulation mask data_mask; writes data_col.
+void modulated_deformable_im2col_cpu(
+        const float *data_im, const float *data_offset, const float *data_mask,
+        const int batch_size, const int channels, const int height_im, const int width_im,
+        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int deformable_group,
+        float *data_col);
+
+// Modulated deformable col2im: accumulates the input-image gradient into
+// grad_im.
+void modulated_deformable_col2im_cpu(
+        const float *data_col, const float *data_offset, const float *data_mask,
+        const int batch_size, const int channels, const int height_im, const int width_im,
+        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int deformable_group,
+        float *grad_im);
+
+// Modulated deformable col2im_coord: writes the offset gradient grad_offset
+// and the mask gradient grad_mask.
+void modulated_deformable_col2im_coord_cpu(
+        const float *data_col, const float *data_im, const float *data_offset, const float *data_mask,
+        const int batch_size, const int channels, const int height_im, const int width_im,
+        const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+        const int dilation_h, const int dilation_w, const int deformable_group,
+        float *grad_offset, float *grad_mask);
+
+#endif
+
+
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cuda.cpp
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cuda.cpp
@@ -0,0 +1,695 @@
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda.c
+
+#include <torch/extension.h>
+
+#include <cmath>
+#include <vector>
+
+// Forward declarations of the tensor-based kernel launchers used by the
+// functions in this file; they are not defined in this translation unit.
+
+// Deformable im2col: fills data_col from data_im and data_offset.
+void deformable_im2col(const at::Tensor data_im, const at::Tensor data_offset,
+                       const int channels, const int height, const int width,
+                       const int ksize_h, const int ksize_w, const int pad_h,
+                       const int pad_w, const int stride_h, const int stride_w,
+                       const int dilation_h, const int dilation_w,
+                       const int parallel_imgs, const int deformable_group,
+                       at::Tensor data_col);
+
+// Deformable col2im: produces the input gradient grad_im from data_col.
+void deformable_col2im(const at::Tensor data_col, const at::Tensor data_offset,
+                       const int channels, const int height, const int width,
+                       const int ksize_h, const int ksize_w, const int pad_h,
+                       const int pad_w, const int stride_h, const int stride_w,
+                       const int dilation_h, const int dilation_w,
+                       const int parallel_imgs, const int deformable_group,
+                       at::Tensor grad_im);
+
+// Deformable col2im_coord: produces the offset gradient grad_offset.
+void deformable_col2im_coord(
+    const at::Tensor data_col, const at::Tensor data_im,
+    const at::Tensor data_offset, const int channels, const int height,
+    const int width, const int ksize_h, const int ksize_w, const int pad_h,
+    const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, const int parallel_imgs,
+    const int deformable_group, at::Tensor grad_offset);
+
+// Modulated deformable im2col: as above, with a modulation mask data_mask.
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im, const at::Tensor data_offset,
+    const at::Tensor data_mask, const int batch_size, const int channels,
+    const int height_im, const int width_im, const int height_col,
+    const int width_col, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, const int deformable_group,
+    at::Tensor data_col);
+
+// Modulated deformable col2im: produces the input gradient grad_im.
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col, const at::Tensor data_offset,
+    const at::Tensor data_mask, const int batch_size, const int channels,
+    const int height_im, const int width_im, const int height_col,
+    const int width_col, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, const int deformable_group,
+    at::Tensor grad_im);
+
+// Modulated deformable col2im_coord: produces the offset gradient
+// grad_offset and the mask gradient grad_mask.
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col, const at::Tensor data_im,
+    const at::Tensor data_offset, const at::Tensor data_mask,
+    const int batch_size, const int channels, const int height_im,
+    const int width_im, const int height_col, const int width_col,
+    const int kernel_h, const int kernel_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w, const int dilation_h,
+    const int dilation_w, const int deformable_group, at::Tensor grad_offset,
+    at::Tensor grad_mask);
+
+// Validates the tensor shapes and hyper-parameters of a deformable
+// convolution call and throws (via TORCH_CHECK) on the first violation.
+//
+// input      - 3D (C, H, W) or 4D (N, C, H, W) input tensor
+// offset     - offset tensor with the same number of dimensions as input
+// gradOutput - optional output gradient; skipped when NULL
+// weight     - contiguous 4D weight (nOutputPlane, nInputPlane / group, kH, kW)
+// kH, kW     - kernel size; dH, dW - stride; padH, padW - padding
+// dilationH, dilationW - dilation; group - convolution groups
+// deformable_group     - number of deformable groups
+//
+// TORCH_CHECK concatenates its message arguments and does not interpret
+// printf-style placeholders, so values are passed as separate arguments.
+void shape_check(at::Tensor input, at::Tensor offset, at::Tensor *gradOutput,
+                 at::Tensor weight, int kH, int kW, int dH, int dW, int padH,
+                 int padW, int dilationH, int dilationW, int group,
+                 int deformable_group) {
+  TORCH_CHECK(weight.ndimension() == 4,
+              "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
+              "but got: ", weight.ndimension());
+
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  TORCH_CHECK(kW > 0 && kH > 0,
+              "kernel size should be greater than zero, but got kH: ", kH,
+              " kW: ", kW);
+
+  TORCH_CHECK((weight.size(2) == kH && weight.size(3) == kW),
+              "kernel size should be consistent with weight, ",
+              "but got kH: ", kH, " kW: ", kW,
+              " weight.size(2): ", weight.size(2),
+              ", weight.size(3): ", weight.size(3));
+
+  TORCH_CHECK(dW > 0 && dH > 0,
+              "stride should be greater than zero, but got dH: ", dH,
+              " dW: ", dW);
+
+  TORCH_CHECK(dilationW > 0 && dilationH > 0,
+              "dilation should be greater than 0, but got dilationH: ",
+              dilationH, " dilationW: ", dilationW);
+
+  const int ndim = input.ndimension();
+  TORCH_CHECK(ndim == 3 || ndim == 4,
+              "3D or 4D input tensor expected but got: ", ndim);
+
+  // Channel / height / width axes; shifted by one when a batch axis exists.
+  const int dimf = (ndim == 4) ? 1 : 0;
+  const int dimh = dimf + 1;
+  const int dimw = dimf + 2;
+
+  TORCH_CHECK(offset.ndimension() == ndim,
+              "offset must have the same number of dimensions as input, "
+              "expected: ", ndim, ", but got: ", offset.ndimension());
+
+  long nInputPlane = weight.size(1) * group;
+  long inputHeight = input.size(dimh);
+  long inputWidth = input.size(dimw);
+  long nOutputPlane = weight.size(0);
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+
+  TORCH_CHECK(nInputPlane % deformable_group == 0,
+              "input channels must be divisible by deformable group size");
+
+  TORCH_CHECK(outputWidth >= 1 && outputHeight >= 1,
+              "Given input size: (", nInputPlane, " x ", inputHeight, " x ",
+              inputWidth, "). Calculated output size: (", nOutputPlane, " x ",
+              outputHeight, " x ", outputWidth,
+              "). Output size is too small");
+
+  // Use the channel axis (dimf), not a hard-coded 1, so unbatched 3D input is
+  // checked against its channel count rather than its height.
+  TORCH_CHECK(input.size(dimf) == nInputPlane,
+              "invalid number of input planes, expected: ", nInputPlane,
+              ", but got: ", input.size(dimf));
+
+  TORCH_CHECK((inputHeight >= kH && inputWidth >= kW),
+              "input image is smaller than kernel");
+
+  TORCH_CHECK((offset.size(dimh) == outputHeight &&
+               offset.size(dimw) == outputWidth),
+              "invalid spatial size of offset, expected height: ",
+              outputHeight, " width: ", outputWidth,
+              ", but got height: ", offset.size(dimh),
+              " width: ", offset.size(dimw));
+
+  TORCH_CHECK((offset.size(dimf) == deformable_group * 2 * kH * kW),
+              "invalid number of channels of offset");
+
+  if (gradOutput != NULL) {
+    TORCH_CHECK(gradOutput->size(dimf) == nOutputPlane,
+                "invalid number of gradOutput planes, expected: ",
+                nOutputPlane, ", but got: ", gradOutput->size(dimf));
+
+    TORCH_CHECK((gradOutput->size(dimh) == outputHeight &&
+                 gradOutput->size(dimw) == outputWidth),
+                "invalid size of gradOutput, expected height: ", outputHeight,
+                " width: ", outputWidth,
+                ", but got height: ", gradOutput->size(dimh),
+                " width: ", gradOutput->size(dimw));
+  }
+}
+
+// Forward pass of deformable convolution.
+//
+// The batch is processed in chunks of im2col_step images: each chunk is
+// expanded into a column buffer by deformable_im2col and multiplied with the
+// (grouped) weight; the result is copied into `output`.
+//
+// `columns` and `ones` are reallocated locally when needed; the tensor handles
+// are passed by value, so reassigning them does not affect the caller.
+// Always returns 1; shape violations throw through TORCH_CHECK.
+int deform_conv_forward_cuda(at::Tensor input, at::Tensor weight,
+                             at::Tensor offset, at::Tensor output,
+                             at::Tensor columns, at::Tensor ones, int kW,
+                             int kH, int dW, int dH, int padW, int padH,
+                             int dilationW, int dilationH, int group,
+                             int deformable_group, int im2col_step) {
+  // todo: add new output buffer and transpose it to output (or directly
+  // transpose output) todo: possibly change data indexing because of
+  // parallel_imgs
+
+  shape_check(input, offset, NULL, weight, kH, kW, dH, dW, padH, padW,
+              dilationH, dilationW, group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input.unsqueeze_(0);
+    offset.unsqueeze_(0);
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  // The chunked views below require whole chunks; this also guards the
+  // divisions by im2col_step.
+  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
+              "batch size (", batchSize,
+              ") must be divisible by im2col_step (", im2col_step, ")");
+
+  output = output.view({batchSize / im2col_step, im2col_step, nOutputPlane,
+                        outputHeight, outputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < outputHeight * outputWidth) {
+    ones = at::ones({outputHeight, outputWidth}, input.options());
+  }
+
+  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  offset =
+      offset.view({batchSize / im2col_step, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  at::Tensor output_buffer =
+      at::zeros({batchSize / im2col_step, nOutputPlane,
+                 im2col_step * outputHeight, outputWidth},
+                output.options());
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0), group, output_buffer.size(1) / group,
+       output_buffer.size(2), output_buffer.size(3)});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+                      inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                      dilationW, im2col_step, deformable_group, columns);
+
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      output_buffer[elt][g] = output_buffer[elt][g]
+                                  .flatten(1)
+                                  .addmm_(weight[g].flatten(1), columns[g])
+                                  .view_as(output_buffer[elt][g]);
+    }
+
+    // Undo the grouped views so the next iteration starts from the ungrouped
+    // shapes again; without this the views at the top of the loop fail on the
+    // second chunk.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+  }
+
+  output_buffer = output_buffer.view(
+      {output_buffer.size(0), output_buffer.size(1) * output_buffer.size(2),
+       output_buffer.size(3), output_buffer.size(4)});
+
+  output_buffer = output_buffer.view({batchSize / im2col_step, nOutputPlane,
+                                      im2col_step, outputHeight, outputWidth});
+  output_buffer.transpose_(1, 2);
+  output.copy_(output_buffer);
+  output = output.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    output = output.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+  }
+
+  return 1;
+}
+
+// Backward pass of deformable convolution with respect to the input and the
+// offsets.
+//
+// For each chunk of im2col_step images the column gradient is computed as
+// weight^T * gradOutput (per group), then deformable_col2im_coord fills the
+// matching slice of gradOffset and deformable_col2im fills the matching slice
+// of gradInput.
+// Always returns 1; shape violations throw through TORCH_CHECK.
+int deform_conv_backward_input_cuda(at::Tensor input, at::Tensor offset,
+                                    at::Tensor gradOutput, at::Tensor gradInput,
+                                    at::Tensor gradOffset, at::Tensor weight,
+                                    at::Tensor columns, int kW, int kH, int dW,
+                                    int dH, int padW, int padH, int dilationW,
+                                    int dilationH, int group,
+                                    int deformable_group, int im2col_step) {
+  shape_check(input, offset, &gradOutput, weight, kH, kW, dH, dW, padH, padW,
+              dilationH, dilationW, group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+  weight = weight.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = weight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  // The stray leading `3` in the message was a leftover argument index and
+  // was printed as part of the error text.
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  // The chunked views below require whole chunks; this also guards the
+  // divisions by im2col_step.
+  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
+              "batch size (", batchSize,
+              ") must be divisible by im2col_step (", im2col_step, ")");
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  // change order of grad output
+  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+                                nOutputPlane, outputHeight, outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  gradInput = gradInput.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                              inputHeight, inputWidth});
+  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  gradOffset = gradOffset.view({batchSize / im2col_step, im2col_step,
+                                deformable_group * 2 * kH * kW, outputHeight,
+                                outputWidth});
+  offset =
+      offset.view({batchSize / im2col_step, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    // divide into groups
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0), group, gradOutput.size(1) / group,
+         gradOutput.size(2), gradOutput.size(3), gradOutput.size(4)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 0 overwrites columns[g] with weight^T * gradOutput.
+      columns[g] = columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                                     gradOutput[elt][g].flatten(1), 0.0f, 1.0f);
+    }
+
+    // Undo the grouped views so the next iteration starts from the ungrouped
+    // shapes again. weight was previously left grouped, which made the view
+    // at the top of the loop fail on the second chunk.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+    gradOutput = gradOutput.view(
+        {gradOutput.size(0), gradOutput.size(1) * gradOutput.size(2),
+         gradOutput.size(3), gradOutput.size(4), gradOutput.size(5)});
+
+    deformable_col2im_coord(columns, input[elt], offset[elt], nInputPlane,
+                            inputHeight, inputWidth, kH, kW, padH, padW, dH, dW,
+                            dilationH, dilationW, im2col_step, deformable_group,
+                            gradOffset[elt]);
+
+    deformable_col2im(columns, offset[elt], nInputPlane, inputHeight,
+                      inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                      dilationW, im2col_step, deformable_group, gradInput[elt]);
+  }
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  gradInput = gradInput.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  gradOffset = gradOffset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+    gradInput = gradInput.view({nInputPlane, inputHeight, inputWidth});
+    offset = offset.view({offset.size(1), offset.size(2), offset.size(3)});
+    // Use gradOffset's own sizes: offset has just been collapsed to 3D, so
+    // offset.size(3) is no longer a valid axis.
+    gradOffset = gradOffset.view(
+        {gradOffset.size(1), gradOffset.size(2), gradOffset.size(3)});
+  }
+
+  return 1;
+}
+
+// Backward pass of deformable convolution with respect to the weight.
+//
+// For each chunk of im2col_step images the input is expanded into a column
+// buffer by deformable_im2col and gradWeight is updated in place (per group)
+// with scale * gradOutput * columns^T.
+//
+// `ones` is not read by this function. `columns` is reallocated locally.
+// Always returns 1; shape violations throw through TORCH_CHECK.
+int deform_conv_backward_parameters_cuda(
+    at::Tensor input, at::Tensor offset, at::Tensor gradOutput,
+    at::Tensor gradWeight,  // at::Tensor gradBias,
+    at::Tensor columns, at::Tensor ones, int kW, int kH, int dW, int dH,
+    int padW, int padH, int dilationW, int dilationH, int group,
+    int deformable_group, float scale, int im2col_step) {
+  shape_check(input, offset, &gradOutput, gradWeight, kH, kW, dH, dW, padH,
+              padW, dilationH, dilationW, group, deformable_group);
+
+  input = input.contiguous();
+  offset = offset.contiguous();
+  gradOutput = gradOutput.contiguous();
+
+  int batch = 1;
+
+  if (input.ndimension() == 3) {
+    // Force batch
+    batch = 0;
+    input = input.view({1, input.size(0), input.size(1), input.size(2)});
+    // offset must be promoted together with input; otherwise the batch-size
+    // check below compares its channel count against a batch size of 1.
+    offset = offset.view({1, offset.size(0), offset.size(1), offset.size(2)});
+    gradOutput = gradOutput.view(
+        {1, gradOutput.size(0), gradOutput.size(1), gradOutput.size(2)});
+  }
+
+  long batchSize = input.size(0);
+  long nInputPlane = input.size(1);
+  long inputHeight = input.size(2);
+  long inputWidth = input.size(3);
+
+  long nOutputPlane = gradWeight.size(0);
+
+  long outputWidth =
+      (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
+  long outputHeight =
+      (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
+
+  TORCH_CHECK((offset.size(0) == batchSize), "invalid batch size of offset");
+
+  // The chunked views below require whole chunks; this also guards the
+  // divisions by im2col_step.
+  TORCH_CHECK(im2col_step > 0 && batchSize % im2col_step == 0,
+              "batch size (", batchSize,
+              ") must be divisible by im2col_step (", im2col_step, ")");
+
+  columns = at::zeros(
+      {nInputPlane * kW * kH, im2col_step * outputHeight * outputWidth},
+      input.options());
+
+  gradOutput = gradOutput.view({batchSize / im2col_step, im2col_step,
+                                nOutputPlane, outputHeight, outputWidth});
+  gradOutput.transpose_(1, 2);
+
+  // Contiguous copy of the transposed gradient, laid out as
+  // (chunk, nOutputPlane, im2col_step * outputHeight, outputWidth).
+  at::Tensor gradOutputBuffer = at::zeros_like(gradOutput);
+  gradOutputBuffer =
+      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane, im2col_step,
+                             outputHeight, outputWidth});
+  gradOutputBuffer.copy_(gradOutput);
+  gradOutputBuffer =
+      gradOutputBuffer.view({batchSize / im2col_step, nOutputPlane,
+                             im2col_step * outputHeight, outputWidth});
+
+  gradOutput.transpose_(1, 2);
+  gradOutput =
+      gradOutput.view({batchSize, nOutputPlane, outputHeight, outputWidth});
+
+  input = input.view({batchSize / im2col_step, im2col_step, nInputPlane,
+                      inputHeight, inputWidth});
+  offset =
+      offset.view({batchSize / im2col_step, im2col_step,
+                   deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  for (int elt = 0; elt < batchSize / im2col_step; elt++) {
+    deformable_im2col(input[elt], offset[elt], nInputPlane, inputHeight,
+                      inputWidth, kH, kW, padH, padW, dH, dW, dilationH,
+                      dilationW, im2col_step, deformable_group, columns);
+
+    // divide into group
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0), group, gradOutputBuffer.size(1) / group,
+         gradOutputBuffer.size(2), gradOutputBuffer.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    gradWeight =
+        gradWeight.view({group, gradWeight.size(0) / group, gradWeight.size(1),
+                         gradWeight.size(2), gradWeight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 1 keeps the existing gradWeight; alpha = scale weights the
+      // new contribution.
+      gradWeight[g] = gradWeight[g]
+                          .flatten(1)
+                          .addmm_(gradOutputBuffer[elt][g].flatten(1),
+                                  columns[g].transpose(1, 0), 1.0, scale)
+                          .view_as(gradWeight[g]);
+    }
+
+    // Undo the grouped views before the next chunk.
+    gradOutputBuffer = gradOutputBuffer.view(
+        {gradOutputBuffer.size(0),
+         gradOutputBuffer.size(1) * gradOutputBuffer.size(2),
+         gradOutputBuffer.size(3), gradOutputBuffer.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    gradWeight = gradWeight.view({gradWeight.size(0) * gradWeight.size(1),
+                                  gradWeight.size(2), gradWeight.size(3),
+                                  gradWeight.size(4)});
+  }
+
+  input = input.view({batchSize, nInputPlane, inputHeight, inputWidth});
+  offset = offset.view(
+      {batchSize, deformable_group * 2 * kH * kW, outputHeight, outputWidth});
+
+  if (batch == 0) {
+    gradOutput = gradOutput.view({nOutputPlane, outputHeight, outputWidth});
+    input = input.view({nInputPlane, inputHeight, inputWidth});
+  }
+
+  return 1;
+}
+
+// Forward pass of modulated deformable convolution (CUDA).
+//
+// For every sample b, modulated_deformable_im2col_cuda fills `columns` from
+// input[b], offset[b] and mask[b]; the columns are then multiplied group-wise
+// with the flattened weights and accumulated into output[b]. When with_bias is
+// set the bias is broadcast-added to the whole output at the end.
+//
+// input   : read as (batch, channels, height, width); must be contiguous
+// weight  : read as (channels_out, channels / group, kernel_h, kernel_w); must
+//           be contiguous
+// output  : viewed as (batch, channels_out, height_out, width_out) and zeroed
+//           in place before accumulation
+// ones, columns : the by-value handles are only re-bound to freshly allocated
+//           tensors inside this function
+//
+// Raises a c10::Error (via TORCH_CHECK) when a tensor is not contiguous, when
+// the kernel_h / kernel_w arguments disagree with the weight tensor, or when
+// the input channel count differs from weight.size(1) * group.
+void modulated_deform_conv_cuda_forward(
+    at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+    at::Tensor offset, at::Tensor mask, at::Tensor output, at::Tensor columns,
+    int kernel_h, int kernel_w, const int stride_h, const int stride_w,
+    const int pad_h, const int pad_w, const int dilation_h,
+    const int dilation_w, const int group, const int deformable_group,
+    const bool with_bias) {
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_out = weight.size(0);
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+
+  // TORCH_CHECK concatenates its message arguments; printf-style placeholders
+  // are not interpolated, so the values are streamed in explicitly.
+  TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
+              "Input shape and kernel shape wont match: (", kernel_h, " x ",
+              kernel_w, " vs ", kernel_h_, " x ", kernel_w_, ").");
+  TORCH_CHECK(channels == channels_kernel * group,
+              "Input shape and kernel channels wont match: (", channels,
+              " vs ", channels_kernel * group, ").");
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  // resize output
+  output = output.view({batch, channels_out, height_out, width_out}).zero_();
+  // resize temporary columns
+  columns =
+      at::zeros({channels * kernel_h * kernel_w, 1 * height_out * width_out},
+                input.options());
+
+  // Split the output channels into groups for the per-group matmul below.
+  output = output.view({output.size(0), group, output.size(1) / group,
+                        output.size(2), output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    modulated_deformable_im2col_cuda(
+        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group, columns);
+
+    // divide into group
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+
+    for (int g = 0; g < group; g++) {
+      // output[b][g] += weight[g] (flattened) * columns[g]
+      output[b][g] = output[b][g]
+                         .flatten(1)
+                         .addmm_(weight[g].flatten(1), columns[g])
+                         .view_as(output[b][g]);
+    }
+
+    // Restore the ungrouped shapes expected by the next iteration.
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+  }
+
+  output = output.view({output.size(0), output.size(1) * output.size(2),
+                        output.size(3), output.size(4)});
+
+  if (with_bias) {
+    output += bias.view({1, bias.size(0), 1, 1});
+  }
+}
+
+// Backward pass of modulated deformable convolution (CUDA).
+//
+// For every sample b the loop performs, in order:
+//   1. columns[g] = weight[g]^T * grad_output[b][g] for each group (beta is 0,
+//      so the previous contents of `columns` are overwritten);
+//   2. modulated_deformable_col2im_coord_cuda, which receives grad_offset[b]
+//      and grad_mask[b] as outputs;
+//   3. modulated_deformable_col2im_cuda, which receives grad_input[b];
+//   4. modulated_deformable_im2col_cuda to rebuild `columns` from the input,
+//      followed by grad_weight[g] += grad_output[b][g] * columns[g]^T and,
+//      when with_bias is set, grad_bias[g] += grad_output[b][g] * ones.
+//
+// grad_weight and grad_bias therefore accumulate across the batch; this
+// function does not zero them.
+//
+// Raises a c10::Error (via TORCH_CHECK) when input or weight is not
+// contiguous, when the kernel_h / kernel_w arguments disagree with the weight
+// tensor, or when the input channel count differs from weight.size(1) * group.
+void modulated_deform_conv_cuda_backward(
+    at::Tensor input, at::Tensor weight, at::Tensor bias, at::Tensor ones,
+    at::Tensor offset, at::Tensor mask, at::Tensor columns,
+    at::Tensor grad_input, at::Tensor grad_weight, at::Tensor grad_bias,
+    at::Tensor grad_offset, at::Tensor grad_mask, at::Tensor grad_output,
+    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+    int pad_w, int dilation_h, int dilation_w, int group, int deformable_group,
+    const bool with_bias) {
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+  TORCH_CHECK(weight.is_contiguous(), "weight tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+
+  const int channels_kernel = weight.size(1);
+  const int kernel_h_ = weight.size(2);
+  const int kernel_w_ = weight.size(3);
+  // The first argument of TORCH_CHECK is the condition that must hold; the
+  // remaining arguments are concatenated into the error message.
+  TORCH_CHECK(kernel_h_ == kernel_h && kernel_w_ == kernel_w,
+              "Input shape and kernel shape wont match: (", kernel_h, " x ",
+              kernel_w, " vs ", kernel_h_, " x ", kernel_w_, ").");
+  TORCH_CHECK(channels == channels_kernel * group,
+              "Input shape and kernel channels wont match: (", channels,
+              " vs ", channels_kernel * group, ").");
+
+  const int height_out =
+      (height + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
+  const int width_out =
+      (width + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
+
+  if (ones.ndimension() != 2 ||
+      ones.size(0) * ones.size(1) < height_out * width_out) {
+    // Resize plane and fill with ones...
+    ones = at::ones({height_out, width_out}, input.options());
+  }
+
+  grad_input = grad_input.view({batch, channels, height, width});
+  columns = at::zeros({channels * kernel_h * kernel_w, height_out * width_out},
+                      input.options());
+
+  // Split the output channels of grad_output into groups.
+  grad_output =
+      grad_output.view({grad_output.size(0), group, grad_output.size(1) / group,
+                        grad_output.size(2), grad_output.size(3)});
+
+  for (int b = 0; b < batch; b++) {
+    // divide into group
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    weight = weight.view({group, weight.size(0) / group, weight.size(1),
+                          weight.size(2), weight.size(3)});
+
+    for (int g = 0; g < group; g++) {
+      // beta = 0, alpha = 1: columns[g] is overwritten, not accumulated.
+      columns[g].addmm_(weight[g].flatten(1).transpose(0, 1),
+                        grad_output[b][g].flatten(1), 0.0f, 1.0f);
+    }
+
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    weight = weight.view({weight.size(0) * weight.size(1), weight.size(2),
+                          weight.size(3), weight.size(4)});
+
+    // gradient w.r.t. input coordinate data
+    modulated_deformable_col2im_coord_cuda(
+        columns, input[b], offset[b], mask[b], 1, channels, height, width,
+        height_out, width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h,
+        stride_w, dilation_h, dilation_w, deformable_group, grad_offset[b],
+        grad_mask[b]);
+    // gradient w.r.t. input data
+    modulated_deformable_col2im_cuda(
+        columns, offset[b], mask[b], 1, channels, height, width, height_out,
+        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group, grad_input[b]);
+
+    // gradient w.r.t. weight, dWeight should accumulate across the batch and
+    // group
+    modulated_deformable_im2col_cuda(
+        input[b], offset[b], mask[b], 1, channels, height, width, height_out,
+        width_out, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+        dilation_h, dilation_w, deformable_group, columns);
+
+    columns = columns.view({group, columns.size(0) / group, columns.size(1)});
+    grad_weight = grad_weight.view({group, grad_weight.size(0) / group,
+                                    grad_weight.size(1), grad_weight.size(2),
+                                    grad_weight.size(3)});
+    if (with_bias)
+      grad_bias = grad_bias.view({group, grad_bias.size(0) / group});
+
+    for (int g = 0; g < group; g++) {
+      grad_weight[g] =
+          grad_weight[g]
+              .flatten(1)
+              .addmm_(grad_output[b][g].flatten(1), columns[g].transpose(0, 1))
+              .view_as(grad_weight[g]);
+      if (with_bias) {
+        // Multiplying by a column of ones sums grad_output over all output
+        // positions.
+        grad_bias[g] =
+            grad_bias[g]
+                .view({-1, 1})
+                .addmm_(grad_output[b][g].flatten(1), ones.view({-1, 1}))
+                .view(-1);
+      }
+    }
+
+    // Restore the ungrouped shapes expected by the next iteration.
+    columns =
+        columns.view({columns.size(0) * columns.size(1), columns.size(2)});
+    grad_weight = grad_weight.view({grad_weight.size(0) * grad_weight.size(1),
+                                    grad_weight.size(2), grad_weight.size(3),
+                                    grad_weight.size(4)});
+    if (with_bias)
+      grad_bias = grad_bias.view({grad_bias.size(0) * grad_bias.size(1)});
+  }
+  grad_output = grad_output.view({grad_output.size(0) * grad_output.size(1),
+                                  grad_output.size(2), grad_output.size(3),
+                                  grad_output.size(4)});
+}
+
+// Python bindings: registers the five CUDA entry points of this extension on
+// the module named by TORCH_EXTENSION_NAME, each with a short docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // Plain deformable convolution, followed by the modulated variant.
+  m.def("deform_conv_forward_cuda", &deform_conv_forward_cuda,
+        "deform forward (CUDA)")
+      .def("deform_conv_backward_input_cuda",
+           &deform_conv_backward_input_cuda,
+           "deform_conv_backward_input (CUDA)")
+      .def("deform_conv_backward_parameters_cuda",
+           &deform_conv_backward_parameters_cuda,
+           "deform_conv_backward_parameters (CUDA)")
+      .def("modulated_deform_conv_cuda_forward",
+           &modulated_deform_conv_cuda_forward,
+           "modulated deform conv forward (CUDA)")
+      .def("modulated_deform_conv_cuda_backward",
+           &modulated_deform_conv_cuda_backward,
+           "modulated deform conv backward (CUDA)");
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cuda_kernel.cu
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_conv_cuda_kernel.cu
@@ -0,0 +1,866 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer ****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer ********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+#include <stdio.h>
+#include <math.h>
+#include <float.h>
+
+using namespace at;
+
+// Grid-stride loop header: `i` starts at this thread's global index
+// (blockIdx.x * blockDim.x + threadIdx.x) and advances by the total number of
+// launched threads (blockDim.x * gridDim.x) until it reaches n, so any n is
+// covered regardless of how many blocks were launched.
+#define CUDA_KERNEL_LOOP(i, n)                                 \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+// Launch configuration used by the kernel launches in this file.
+const int CUDA_NUM_THREADS = 1024; // threads per block
+const int kMaxGridNum = 65535;     // cap on the number of blocks per launch
+
+// Number of blocks for N work items: N divided by CUDA_NUM_THREADS, rounded up
+// (for non-negative N), and never more than kMaxGridNum.
+inline int GET_BLOCKS(const int N)
+{
+  const int blocks_needed = (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
+  if (blocks_needed < kMaxGridNum)
+  {
+    return blocks_needed;
+  }
+  return kMaxGridNum;
+}
+
+// Bilinearly interpolates bottom_data at the fractional position (h, w).
+//
+// bottom_data is indexed as row * data_width + column. Each of the four
+// integer neighbours contributes only if it passes its bounds test: the low
+// row/column is tested against 0 and the high row/column against height - 1 /
+// width - 1; a neighbour that fails contributes zero.
+template <typename scalar_t>
+__device__ scalar_t deformable_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
+                                               const int height, const int width, scalar_t h, scalar_t w)
+{
+  // Integer rows/columns surrounding the sample point.
+  const int row_lo = floor(h);
+  const int col_lo = floor(w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // Fractional distance from the low neighbour, and its complement.
+  const scalar_t frac_h = h - row_lo;
+  const scalar_t frac_w = w - col_lo;
+  const scalar_t rem_h = 1 - frac_h;
+  const scalar_t rem_w = 1 - frac_w;
+
+  const bool row_lo_ok = row_lo >= 0;
+  const bool col_lo_ok = col_lo >= 0;
+  const bool row_hi_ok = row_hi <= height - 1;
+  const bool col_hi_ok = col_hi <= width - 1;
+
+  scalar_t val_ll = 0;
+  scalar_t val_lh = 0;
+  scalar_t val_hl = 0;
+  scalar_t val_hh = 0;
+  if (row_lo_ok && col_lo_ok)
+  {
+    val_ll = bottom_data[row_lo * data_width + col_lo];
+  }
+  if (row_lo_ok && col_hi_ok)
+  {
+    val_lh = bottom_data[row_lo * data_width + col_hi];
+  }
+  if (row_hi_ok && col_lo_ok)
+  {
+    val_hl = bottom_data[row_hi * data_width + col_lo];
+  }
+  if (row_hi_ok && col_hi_ok)
+  {
+    val_hh = bottom_data[row_hi * data_width + col_hi];
+  }
+
+  // Interpolation weights of the four neighbours.
+  const scalar_t wt_ll = rem_h * rem_w;
+  const scalar_t wt_lh = rem_h * frac_w;
+  const scalar_t wt_hl = frac_h * rem_w;
+  const scalar_t wt_hh = frac_h * frac_w;
+
+  return (wt_ll * val_ll + wt_lh * val_lh + wt_hl * val_hl + wt_hh * val_hh);
+}
+
+// Bilinear weight with which the integer pixel (h, w) contributes to a sample
+// taken at the fractional position (argmax_h, argmax_w).
+//
+// Returns 0 when the sample position is at or beyond -1 / height / width, or
+// when (h, w) is not one of the four integer neighbours of the sample.
+template <typename scalar_t>
+__device__ scalar_t get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
+                                        const int h, const int w, const int height, const int width)
+{
+
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+  {
+    // sample lies outside the image: no contribution
+    return 0;
+  }
+
+  const int lo_h = floor(argmax_h);
+  const int lo_w = floor(argmax_w);
+  const int hi_h = lo_h + 1;
+  const int hi_w = lo_w + 1;
+
+  // Factor along the row axis; zero contribution if h is neither neighbour.
+  scalar_t factor_h;
+  if (h == lo_h)
+  {
+    factor_h = h + 1 - argmax_h;
+  }
+  else if (h == hi_h)
+  {
+    factor_h = argmax_h + 1 - h;
+  }
+  else
+  {
+    return 0;
+  }
+
+  // Factor along the column axis; zero contribution if w is neither neighbour.
+  scalar_t factor_w;
+  if (w == lo_w)
+  {
+    factor_w = w + 1 - argmax_w;
+  }
+  else if (w == hi_w)
+  {
+    factor_w = argmax_w + 1 - w;
+  }
+  else
+  {
+    return 0;
+  }
+
+  const scalar_t weight = factor_h * factor_w;
+  return weight;
+}
+
+// Derivative of the bilinear sample of im_data at (argmax_h, argmax_w) with
+// respect to one sampling coordinate: the row coordinate when bp_dir == 0, the
+// column coordinate when bp_dir == 1. Any other bp_dir yields 0.
+//
+// im_data is indexed as row * data_width + column. Neighbours failing their
+// bounds test are skipped, and a sample position at or beyond -1 / height /
+// width returns 0 immediately.
+template <typename scalar_t>
+__device__ scalar_t get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
+                                          const int height, const int width, const scalar_t *im_data,
+                                          const int data_width, const int bp_dir)
+{
+
+  if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width)
+  {
+    // sample lies outside the image: no gradient
+    return 0;
+  }
+
+  const int lo_h = floor(argmax_h);
+  const int lo_w = floor(argmax_w);
+  const int hi_h = lo_h + 1;
+  const int hi_w = lo_w + 1;
+
+  // Which of the four neighbours may be read.
+  const bool use_ll = lo_h >= 0 && lo_w >= 0;
+  const bool use_lh = lo_h >= 0 && hi_w <= width - 1;
+  const bool use_hl = hi_h <= height - 1 && lo_w >= 0;
+  const bool use_hh = hi_h <= height - 1 && hi_w <= width - 1;
+
+  // Flat positions of the neighbours inside im_data.
+  const int pos_ll = lo_h * data_width + lo_w;
+  const int pos_lh = lo_h * data_width + hi_w;
+  const int pos_hl = hi_h * data_width + lo_w;
+  const int pos_hh = hi_h * data_width + hi_w;
+
+  scalar_t weight = 0;
+
+  switch (bp_dir)
+  {
+  case 0:
+    // Low-row neighbours enter negatively, high-row neighbours positively.
+    if (use_ll)
+      weight += -1 * (lo_w + 1 - argmax_w) * im_data[pos_ll];
+    if (use_lh)
+      weight += -1 * (argmax_w - lo_w) * im_data[pos_lh];
+    if (use_hl)
+      weight += (lo_w + 1 - argmax_w) * im_data[pos_hl];
+    if (use_hh)
+      weight += (argmax_w - lo_w) * im_data[pos_hh];
+    break;
+  case 1:
+    // Low-column neighbours enter negatively, high-column ones positively.
+    if (use_ll)
+      weight += -1 * (lo_h + 1 - argmax_h) * im_data[pos_ll];
+    if (use_lh)
+      weight += (lo_h + 1 - argmax_h) * im_data[pos_lh];
+    if (use_hl)
+      weight += -1 * (argmax_h - lo_h) * im_data[pos_hl];
+    if (use_hh)
+      weight += (argmax_h - lo_h) * im_data[pos_hh];
+    break;
+  default:
+    break;
+  }
+
+  return weight;
+}
+
+// Deformable im2col kernel. Each loop index handles one (input channel, image
+// within the step, output row, output column) tuple and writes the
+// kernel_h * kernel_w bilinearly sampled values for that location to data_col.
+//
+// Memory layouts implied by the index arithmetic below:
+//   data_im     : (batch_size, num_channels, height, width)
+//   data_offset : (batch_size, deformable_group, 2 * kernel_h * kernel_w,
+//                  height_col, width_col); for kernel tap k = i * kernel_w + j
+//                  channel 2k holds the h offset and 2k + 1 the w offset
+//   data_col    : (num_channels * kernel_h * kernel_w, batch_size,
+//                  height_col, width_col)
+// n is the number of indices to process.
+template <typename scalar_t>
+__global__ void deformable_im2col_gpu_kernel(const int n, const scalar_t *data_im, const scalar_t *data_offset,
+                                             const int height, const int width, const int kernel_h, const int kernel_w,
+                                             const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+                                             const int dilation_h, const int dilation_w, const int channel_per_deformable_group,
+                                             const int batch_size, const int num_channels, const int deformable_group,
+                                             const int height_col, const int width_col,
+                                             scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Decompose the flat index as (c_im, b_col, h_col, w_col), w_col fastest.
+    const int w_col = index % width_col;
+    const int h_col = (index / width_col) % height_col;
+    const int b_col = (index / width_col / height_col) % batch_size;
+    const int c_im = (index / width_col / height_col) / batch_size;
+    // First row of data_col belonging to input channel c_im.
+    const int c_col = c_im * kernel_h * kernel_w;
+
+    // compute deformable group index
+    const int deformable_group_index = c_im / channel_per_deformable_group;
+
+    // Top-left corner of the (undeformed) receptive field in the input.
+    const int h_in = h_col * stride_h - pad_h;
+    const int w_in = w_col * stride_w - pad_w;
+    scalar_t *data_col_ptr = data_col + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+    //const scalar_t* data_im_ptr = data_im + ((b_col * num_channels + c_im) * height + h_in) * width + w_in;
+    // Start of the (b_col, c_im) image plane and of this image's offsets for
+    // the selected deformable group.
+    const scalar_t *data_im_ptr = data_im + (b_col * num_channels + c_im) * height * width;
+    const scalar_t *data_offset_ptr = data_offset + (b_col * deformable_group + deformable_group_index) * 2 * kernel_h * kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i)
+    {
+      for (int j = 0; j < kernel_w; ++j)
+      {
+        const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+        const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col;
+        const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+        const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+        scalar_t val = static_cast<scalar_t>(0);
+        // Deformed sampling position for kernel tap (i, j).
+        const scalar_t h_im = h_in + i * dilation_h + offset_h;
+        const scalar_t w_im = w_in + j * dilation_w + offset_w;
+        // Positions at or beyond -1 / height / width keep val == 0.
+        if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+        {
+          //const scalar_t map_h = i * dilation_h + offset_h;
+          //const scalar_t map_w = j * dilation_w + offset_w;
+          //const int cur_height = height - h_in;
+          //const int cur_width = width - w_in;
+          //val = deformable_im2col_bilinear(data_im_ptr, width, cur_height, cur_width, map_h, map_w);
+          val = deformable_im2col_bilinear(data_im_ptr, width, height, width, h_im, w_im);
+        }
+        *data_col_ptr = val;
+        // Advance to the data_col row of the next kernel tap.
+        data_col_ptr += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+// Host launcher for deformable_im2col_gpu_kernel.
+//
+// Computes the output size (height_col, width_col) from the convolution
+// parameters and launches one work item per
+// (channel, image in the step, output row, output column), i.e.
+// channels * height_col * width_col * parallel_imgs items in total.
+//
+// data_im / data_offset are read and data_col is written through their raw
+// device pointers; the element type is taken from data_im. A launch error is
+// reported on stdout only, no exception is raised.
+void deformable_im2col(
+    const at::Tensor data_im, const at::Tensor data_offset, const int channels,
+    const int height, const int width, const int ksize_h, const int ksize_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w, const int parallel_imgs,
+    const int deformable_group, at::Tensor data_col)
+{
+  // num_axes should be smaller than block size
+  // todo: check parallel_imgs is correctly passed in
+  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = channels * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  // scalar_type() / data_ptr<T>() replace the deprecated type() / data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.scalar_type(), "deformable_im2col_gpu", ([&] {
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+
+        deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_im_, data_offset_, height, width, ksize_h, ksize_w,
+            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w,
+            channel_per_deformable_group, parallel_imgs, channels, deformable_group,
+            height_col, width_col, data_col_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in deformable_im2col: %s\n", cudaGetErrorString(err));
+  }
+}
+
+// Deformable col2im kernel: scatters every column-buffer gradient
+// data_col[index] onto the grad_im pixels around its deformed sampling
+// position, weighted by get_gradient_weight.
+//
+// The flat index decomposes as (c, i, j, b, h_out, w_out) with w_out fastest,
+// where c is the image channel and (i, j) the kernel tap. grad_im is indexed
+// as (batch, channels, height, width) and data_offset as in
+// deformable_im2col_gpu_kernel. Accumulation uses atomicAdd.
+template <typename scalar_t>
+__global__ void deformable_col2im_gpu_kernel(
+    const int n, const scalar_t *data_col, const scalar_t *data_offset,
+    const int channels, const int height, const int width,
+    const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int channel_per_deformable_group,
+    const int batch_size, const int deformable_group,
+    const int height_col, const int width_col,
+    scalar_t *grad_im)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Kernel tap (i, j) and image channel c encoded in the upper index digits.
+    const int j = (index / width_col / height_col / batch_size) % kernel_w;
+    const int i = (index / width_col / height_col / batch_size / kernel_w) % kernel_h;
+    const int c = index / width_col / height_col / batch_size / kernel_w / kernel_h;
+    // compute the start and end of the output
+
+    const int deformable_group_index = c / channel_per_deformable_group;
+
+    // Output position and image index encoded in the lower index digits.
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int b = (index / width_col / height_col) % batch_size;
+    int w_in = w_out * stride_w - pad_w;
+    int h_in = h_out * stride_h - pad_h;
+
+    const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) *
+                                                        2 * kernel_h * kernel_w * height_col * width_col;
+    const int data_offset_h_ptr = ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out;
+    const int data_offset_w_ptr = ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
+    const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+    const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+    // Deformed sampling position this column entry was read from.
+    const scalar_t cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const scalar_t cur_inv_w_data = w_in + j * dilation_w + offset_w;
+
+    const scalar_t cur_top_grad = data_col[index];
+    // The cast truncates toward zero rather than flooring; presumably that is
+    // why a 5x5 window is scanned and filtered below instead of a 2x2 one.
+    const int cur_h = (int)cur_inv_h_data;
+    const int cur_w = (int)cur_inv_w_data;
+    for (int dy = -2; dy <= 2; dy++)
+    {
+      for (int dx = -2; dx <= 2; dx++)
+      {
+        // Keep in-bounds pixels closer than 1 to the sampling position on
+        // both axes.
+        if (cur_h + dy >= 0 && cur_h + dy < height &&
+            cur_w + dx >= 0 && cur_w + dx < width &&
+            abs(cur_inv_h_data - (cur_h + dy)) < 1 &&
+            abs(cur_inv_w_data - (cur_w + dx)) < 1)
+        {
+          int cur_bottom_grad_pos = ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
+          scalar_t weight = get_gradient_weight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, cur_w + dx, height, width);
+          atomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad);
+        }
+      }
+    }
+  }
+}
+
+// Host launcher for deformable_col2im_gpu_kernel.
+//
+// Computes the output size (height_col, width_col) from the convolution
+// parameters and launches one work item per column-buffer element, i.e.
+// channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs items.
+//
+// data_col / data_offset are read and grad_im is accumulated into through
+// their raw device pointers; the element type is taken from data_col. A launch
+// error is reported on stdout only, no exception is raised.
+void deformable_col2im(
+    const at::Tensor data_col, const at::Tensor data_offset, const int channels,
+    const int height, const int width, const int ksize_h,
+    const int ksize_w, const int pad_h, const int pad_w,
+    const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int parallel_imgs, const int deformable_group,
+    at::Tensor grad_im)
+{
+
+  // todo: make sure parallel_imgs is passed in correctly
+  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = channels * ksize_h * ksize_w * height_col * width_col * parallel_imgs;
+  int channel_per_deformable_group = channels / deformable_group;
+
+  // scalar_type() / data_ptr<T>() replace the deprecated type() / data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_gpu", ([&] {
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *grad_im_ = grad_im.data_ptr<scalar_t>();
+
+        deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_col_, data_offset_, channels, height, width, ksize_h,
+            ksize_w, pad_h, pad_w, stride_h, stride_w,
+            dilation_h, dilation_w, channel_per_deformable_group,
+            parallel_imgs, deformable_group, height_col, width_col, grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in deformable_col2im: %s\n", cudaGetErrorString(err));
+  }
+}
+
+// Gradient of the column buffer with respect to the sampling offsets.
+//
+// Each loop index is one element of grad_offset and decomposes as
+// (b, c, h, w) with w fastest, where c runs over offset_channels. For that
+// offset element the kernel sums, over the column rows of the same deformable
+// group that use the same kernel tap, get_coordinate_weight(...) times the
+// matching data_col entry, and stores the sum in grad_offset[index].
+//
+// NOTE(review): the data_im_ptr arithmetic divides channel_per_deformable_group
+// by kernel_h * kernel_w, so this kernel appears to expect that parameter to
+// count column rows per group (channels * kernel_h * kernel_w /
+// deformable_group) rather than image channels - confirm against the launcher.
+template <typename scalar_t>
+__global__ void deformable_col2im_coord_gpu_kernel(const int n, const scalar_t *data_col,
+                                                   const scalar_t *data_im, const scalar_t *data_offset,
+                                                   const int channels, const int height, const int width,
+                                                   const int kernel_h, const int kernel_w,
+                                                   const int pad_h, const int pad_w,
+                                                   const int stride_h, const int stride_w,
+                                                   const int dilation_h, const int dilation_w,
+                                                   const int channel_per_deformable_group,
+                                                   const int batch_size, const int offset_channels, const int deformable_group,
+                                                   const int height_col, const int width_col, scalar_t *grad_offset)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    scalar_t val = 0;
+    int w = index % width_col;
+    int h = (index / width_col) % height_col;
+    int c = (index / width_col / height_col) % offset_channels;
+    int b = (index / width_col / height_col) / offset_channels;
+    // compute the start and end of the output
+
+    // Each deformable group owns 2 * kernel_h * kernel_w offset channels.
+    const int deformable_group_index = c / (2 * kernel_h * kernel_w);
+    // Column rows of consecutive image channels for one tap are this far apart.
+    const int col_step = kernel_h * kernel_w;
+    // Counts visited image channels inside the group (selects the image plane).
+    int cnt = 0;
+    const scalar_t *data_col_ptr = data_col + deformable_group_index * channel_per_deformable_group *
+                                                  batch_size * width_col * height_col;
+    const scalar_t *data_im_ptr = data_im + (b * deformable_group + deformable_group_index) *
+                                                channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t *data_offset_ptr = data_offset + (b * deformable_group + deformable_group_index) * 2 *
+                                                        kernel_h * kernel_w * height_col * width_col;
+
+    // Offset channel relative to the start of its deformable group.
+    const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w;
+
+    // offset_c / 2 is the kernel tap; step through that tap's row for every
+    // image channel of the group.
+    for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; col_c += col_step)
+    {
+      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+      // Even offset channels hold h offsets (bp_dir 0), odd ones w offsets
+      // (bp_dir 1).
+      const int bp_dir = offset_c % 2;
+
+      int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      int w_out = col_pos % width_col;
+      int h_out = (col_pos / width_col) % height_col;
+      int w_in = w_out * stride_w - pad_w;
+      int h_in = h_out * stride_h - pad_h;
+      const int data_offset_h_ptr = (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out);
+      const int data_offset_w_ptr = (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out);
+      const scalar_t offset_h = data_offset_ptr[data_offset_h_ptr];
+      const scalar_t offset_w = data_offset_ptr[data_offset_w_ptr];
+      scalar_t inv_h = h_in + i * dilation_h + offset_h;
+      scalar_t inv_w = w_in + j * dilation_w + offset_w;
+      if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width)
+      {
+        // -2 fails get_coordinate_weight's range test, so the weight is 0.
+        inv_h = inv_w = -2;
+      }
+      const scalar_t weight = get_coordinate_weight(
+          inv_h, inv_w,
+          height, width, data_im_ptr + cnt * height * width, width, bp_dir);
+      val += weight * data_col_ptr[col_pos];
+      cnt += 1;
+    }
+
+    grad_offset[index] = val;
+  }
+}
+
+// Host launcher for deformable_col2im_coord_gpu_kernel.
+//
+// Computes the output size (height_col, width_col) from the convolution
+// parameters and launches one work item per grad_offset element, i.e.
+// height_col * width_col * 2 * ksize_h * ksize_w * deformable_group *
+// parallel_imgs items. The kernel's channel_per_deformable_group argument is
+// the number of column rows per group
+// (channels * ksize_h * ksize_w / deformable_group).
+//
+// data_col / data_im / data_offset are read and grad_offset is written through
+// their raw device pointers; the element type is taken from data_col. A launch
+// error is reported on stdout only, matching the other launchers in this file.
+void deformable_col2im_coord(
+    const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset,
+    const int channels, const int height, const int width, const int ksize_h,
+    const int ksize_w, const int pad_h, const int pad_w, const int stride_h,
+    const int stride_w, const int dilation_h, const int dilation_w,
+    const int parallel_imgs, const int deformable_group, at::Tensor grad_offset)
+{
+
+  int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
+  int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
+  int num_kernels = height_col * width_col * 2 * ksize_h * ksize_w * deformable_group * parallel_imgs;
+  int channel_per_deformable_group = channels * ksize_h * ksize_w / deformable_group;
+
+  // scalar_type() / data_ptr<T>() replace the deprecated type() / data<T>()
+  // accessors.
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.scalar_type(), "deformable_col2im_coord_gpu", ([&] {
+        const scalar_t *data_col_ = data_col.data_ptr<scalar_t>();
+        const scalar_t *data_im_ = data_im.data_ptr<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data_ptr<scalar_t>();
+        scalar_t *grad_offset_ = grad_offset.data_ptr<scalar_t>();
+
+        deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_col_, data_im_, data_offset_, channels, height, width,
+            ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w,
+            dilation_h, dilation_w, channel_per_deformable_group,
+            parallel_imgs, 2 * ksize_h * ksize_w * deformable_group, deformable_group,
+            height_col, width_col, grad_offset_);
+      }));
+
+  // Surface launch failures the same way deformable_im2col and
+  // deformable_col2im do.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in deformable_col2im_coord: %s\n", cudaGetErrorString(err));
+  }
+}
+
+/*
+ * Bilinearly samples a single image plane at the fractional position (h, w).
+ *
+ * bottom_data : first element of the plane; rows are data_width elements apart
+ * height/width: extent used for the bounds tests on the four neighbours
+ * h, w        : sampling position (row, column)
+ *
+ * Any of the four surrounding pixels that lies outside the plane contributes
+ * zero. Returns the weighted sum of the four neighbours.
+ */
+template <typename scalar_t>
+__device__ scalar_t dmcn_im2col_bilinear(const scalar_t *bottom_data, const int data_width,
+                                         const int height, const int width, scalar_t h, scalar_t w)
+{
+  // Integer corners enclosing the sampling position.
+  const int top = floor(h);
+  const int left = floor(w);
+  const int bottom = top + 1;
+  const int right = left + 1;
+
+  // Fractional distance from the top-left corner, and the complements.
+  const scalar_t frac_h = h - top;
+  const scalar_t frac_w = w - left;
+  const scalar_t rest_h = 1 - frac_h, rest_w = 1 - frac_w;
+
+  const bool top_ok = top >= 0;
+  const bool left_ok = left >= 0;
+  const bool bottom_ok = bottom <= height - 1;
+  const bool right_ok = right <= width - 1;
+
+  // Neighbour values; out-of-plane neighbours read as zero.
+  const scalar_t zero = 0;
+  const scalar_t tl = (top_ok && left_ok) ? bottom_data[top * data_width + left] : zero;
+  const scalar_t tr = (top_ok && right_ok) ? bottom_data[top * data_width + right] : zero;
+  const scalar_t bl = (bottom_ok && left_ok) ? bottom_data[bottom * data_width + left] : zero;
+  const scalar_t br = (bottom_ok && right_ok) ? bottom_data[bottom * data_width + right] : zero;
+
+  // Interpolation weights of the four corners.
+  const scalar_t w_tl = rest_h * rest_w;
+  const scalar_t w_tr = rest_h * frac_w;
+  const scalar_t w_bl = frac_h * rest_w;
+  const scalar_t w_br = frac_h * frac_w;
+
+  return (w_tl * tl + w_tr * tr + w_bl * bl + w_br * br);
+}
+
+/*
+ * Bilinear weight that the integer pixel (h, w) receives from a sample taken
+ * at the fractional position (argmax_h, argmax_w).
+ *
+ * Returns 0 when the sampling position is outside (-1, height) x (-1, width)
+ * or when (h, w) is not one of the four pixels surrounding it.
+ */
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_gradient_weight(scalar_t argmax_h, scalar_t argmax_w,
+                                             const int h, const int w, const int height, const int width)
+{
+  const bool outside = argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width;
+  if (outside)
+    return 0;
+
+  const int row_lo = floor(argmax_h);
+  const int col_lo = floor(argmax_w);
+  const int row_hi = row_lo + 1;
+  const int col_hi = col_lo + 1;
+
+  // (h, w) must coincide with one of the four corners around the sample.
+  const bool row_hit = (h == row_lo) || (h == row_hi);
+  const bool col_hit = (w == col_lo) || (w == col_hi);
+  if (!row_hit || !col_hit)
+    return 0;
+
+  // The weight is separable: one factor per axis, picked by the matching corner.
+  const scalar_t row_factor = (h == row_lo) ? (h + 1 - argmax_h) : (argmax_h + 1 - h);
+  const scalar_t col_factor = (w == col_lo) ? (w + 1 - argmax_w) : (argmax_w + 1 - w);
+  return row_factor * col_factor;
+}
+
+/*
+ * Derivative of the bilinear sample of im_data at (argmax_h, argmax_w) with
+ * respect to one coordinate of the sampling position.
+ *
+ * bp_dir == 0 : derivative along the row (h) coordinate
+ * bp_dir == 1 : derivative along the column (w) coordinate
+ * any other   : 0
+ *
+ * Corners that fall outside the plane are left out of the sum. Returns 0 when
+ * the position is outside (-1, height) x (-1, width).
+ */
+template <typename scalar_t>
+__device__ scalar_t dmcn_get_coordinate_weight(scalar_t argmax_h, scalar_t argmax_w,
+                                               const int height, const int width, const scalar_t *im_data,
+                                               const int data_width, const int bp_dir)
+{
+  const bool outside = argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width;
+  if (outside)
+    return 0;
+
+  const int top = floor(argmax_h);
+  const int left = floor(argmax_w);
+  const int bottom = top + 1;
+  const int right = left + 1;
+
+  // Which of the four surrounding pixels actually exist in the plane.
+  const bool has_tl = top >= 0 && left >= 0;
+  const bool has_tr = top >= 0 && right <= width - 1;
+  const bool has_bl = bottom <= height - 1 && left >= 0;
+  const bool has_br = bottom <= height - 1 && right <= width - 1;
+
+  // Flat indices of the corners; only dereferenced when the matching flag is set.
+  const int idx_tl = top * data_width + left;
+  const int idx_tr = top * data_width + right;
+  const int idx_bl = bottom * data_width + left;
+  const int idx_br = bottom * data_width + right;
+
+  scalar_t acc = 0;
+  switch (bp_dir)
+  {
+  case 0:
+    // d/dh: the top row enters negatively, the bottom row positively.
+    if (has_tl)
+      acc += -1 * (left + 1 - argmax_w) * im_data[idx_tl];
+    if (has_tr)
+      acc += -1 * (argmax_w - left) * im_data[idx_tr];
+    if (has_bl)
+      acc += (left + 1 - argmax_w) * im_data[idx_bl];
+    if (has_br)
+      acc += (argmax_w - left) * im_data[idx_br];
+    break;
+  case 1:
+    // d/dw: the left column enters negatively, the right column positively.
+    if (has_tl)
+      acc += -1 * (top + 1 - argmax_h) * im_data[idx_tl];
+    if (has_tr)
+      acc += (top + 1 - argmax_h) * im_data[idx_tr];
+    if (has_bl)
+      acc += -1 * (argmax_h - top) * im_data[idx_bl];
+    if (has_br)
+      acc += (argmax_h - top) * im_data[idx_br];
+    break;
+  default:
+    break;
+  }
+
+  return acc;
+}
+
+/*
+ * im2col step of modulated deformable convolution.
+ *
+ * Each work item decodes `index` into (input channel, image, output row,
+ * output column) and fills the kernel_h * kernel_w entries of data_col that
+ * belong to that tuple. Every entry is the input plane bilinearly sampled at
+ * the kernel tap position displaced by the offset read from data_offset, and
+ * then multiplied by the matching value from data_mask. Sampling positions
+ * outside (-1, height) x (-1, width) produce 0.
+ */
+template <typename scalar_t>
+__global__ void modulated_deformable_im2col_gpu_kernel(const int n,
+                                                       const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask,
+                                                       const int height, const int width, const int kernel_h, const int kernel_w,
+                                                       const int pad_h, const int pad_w,
+                                                       const int stride_h, const int stride_w,
+                                                       const int dilation_h, const int dilation_w,
+                                                       const int channel_per_deformable_group,
+                                                       const int batch_size, const int num_channels, const int deformable_group,
+                                                       const int height_col, const int width_col,
+                                                       scalar_t *data_col)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Unpack index as (channel, image, out_y, out_x), out_x varying fastest.
+    const int plane = index / width_col / height_col;
+    const int out_x = index % width_col;
+    const int out_y = (index / width_col) % height_col;
+    const int img = plane % batch_size;
+    const int chan = plane / batch_size;
+    const int first_col_chan = chan * kernel_h * kernel_w;
+
+    // Deformable group this input channel belongs to; it selects the offset/mask slab.
+    const int group = chan / channel_per_deformable_group;
+
+    // Input coordinate of the first kernel tap before any offset is applied.
+    const int origin_y = out_y * stride_h - pad_h;
+    const int origin_x = out_x * stride_w - pad_w;
+
+    scalar_t *col_out = data_col + ((first_col_chan * batch_size + img) * height_col + out_y) * width_col + out_x;
+    const scalar_t *im_plane = data_im + (img * num_channels + chan) * height * width;
+    const scalar_t *offsets = data_offset + (img * deformable_group + group) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const scalar_t *masks = data_mask + (img * deformable_group + group) * kernel_h * kernel_w * height_col * width_col;
+
+    for (int i = 0; i < kernel_h; ++i)
+    {
+      for (int j = 0; j < kernel_w; ++j)
+      {
+        const int tap = i * kernel_w + j;
+
+        // Each tap owns two consecutive offset channels (row shift, then column shift)
+        // and one mask channel.
+        const scalar_t shift_y = offsets[((2 * tap) * height_col + out_y) * width_col + out_x];
+        const scalar_t shift_x = offsets[((2 * tap + 1) * height_col + out_y) * width_col + out_x];
+        const scalar_t modulation = masks[(tap * height_col + out_y) * width_col + out_x];
+
+        const scalar_t sample_y = origin_y + i * dilation_h + shift_y;
+        const scalar_t sample_x = origin_x + j * dilation_w + shift_x;
+
+        scalar_t sampled = static_cast<scalar_t>(0);
+        if (sample_y > -1 && sample_x > -1 && sample_y < height && sample_x < width)
+        {
+          sampled = dmcn_im2col_bilinear(im_plane, width, height, width, sample_y, sample_x);
+        }
+
+        *col_out = sampled * modulation;
+        // Advance to the slot of the next kernel tap for the same output location.
+        col_out += batch_size * height_col * width_col;
+      }
+    }
+  }
+}
+
+/*
+ * col2im step of modulated deformable convolution: scatters column-buffer
+ * gradients back onto the input image gradient.
+ *
+ * Each work item decodes `index` into (channel, kernel row i, kernel column j,
+ * image, output row, output column), recomputes the deformed sampling position
+ * for that tap, and atomically adds mask * data_col[index], weighted by
+ * dmcn_get_gradient_weight, to every in-bounds pixel of grad_im that lies
+ * strictly within one pixel of the sampling position on both axes.
+ */
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_gpu_kernel(const int n,
+                                                       const scalar_t *data_col, const scalar_t *data_offset, const scalar_t *data_mask,
+                                                       const int channels, const int height, const int width,
+                                                       const int kernel_h, const int kernel_w,
+                                                       const int pad_h, const int pad_w,
+                                                       const int stride_h, const int stride_w,
+                                                       const int dilation_h, const int dilation_w,
+                                                       const int channel_per_deformable_group,
+                                                       const int batch_size, const int deformable_group,
+                                                       const int height_col, const int width_col,
+                                                       scalar_t *grad_im)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    // Unpack index as (c, i, j, b, out_y, out_x), out_x varying fastest.
+    const int out_x = index % width_col;
+    const int out_y = (index / width_col) % height_col;
+    const int b = (index / width_col / height_col) % batch_size;
+    const int chan_and_tap = index / width_col / height_col / batch_size;
+    const int j = chan_and_tap % kernel_w;
+    const int i = (chan_and_tap / kernel_w) % kernel_h;
+    const int c = chan_and_tap / kernel_w / kernel_h;
+
+    const int group = c / channel_per_deformable_group;
+    const int tap = i * kernel_w + j;
+
+    const int origin_x = out_x * stride_w - pad_w;
+    const int origin_y = out_y * stride_h - pad_h;
+
+    const scalar_t *offsets = data_offset + (b * deformable_group + group) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const scalar_t *masks = data_mask + (b * deformable_group + group) * kernel_h * kernel_w * height_col * width_col;
+
+    const scalar_t shift_y = offsets[((2 * tap) * height_col + out_y) * width_col + out_x];
+    const scalar_t shift_x = offsets[((2 * tap + 1) * height_col + out_y) * width_col + out_x];
+    const scalar_t modulation = masks[(tap * height_col + out_y) * width_col + out_x];
+
+    // Deformed sampling position used in the forward pass for this tap.
+    const scalar_t sample_y = origin_y + i * dilation_h + shift_y;
+    const scalar_t sample_x = origin_x + j * dilation_w + shift_x;
+
+    const scalar_t top_grad = data_col[index] * modulation;
+    const int base_y = (int)sample_y;
+    const int base_x = (int)sample_x;
+
+    // Scan a 5x5 window around the truncated position and keep only the pixels
+    // that take part in the bilinear interpolation.
+    for (int dy = -2; dy <= 2; dy++)
+    {
+      const int y = base_y + dy;
+      const bool row_ok = y >= 0 && y < height;
+      for (int dx = -2; dx <= 2; dx++)
+      {
+        const int x = base_x + dx;
+        if (row_ok &&
+            x >= 0 && x < width &&
+            abs(sample_y - y) < 1 &&
+            abs(sample_x - x) < 1)
+        {
+          const int grad_pos = ((b * channels + c) * height + y) * width + x;
+          const scalar_t weight = dmcn_get_gradient_weight(sample_y, sample_x, y, x, height, width);
+          // Several work items can hit the same pixel, hence the atomic update.
+          atomicAdd(grad_im + grad_pos, weight * top_grad);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Gradient of modulated deformable convolution with respect to the sampling
+ * offsets and the modulation masks.
+ *
+ * Each work item decodes `index` into (image, offset channel, output row,
+ * output column) and accumulates, over the column-buffer channels of its
+ * deformable group that use this kernel tap:
+ *   - into grad_offset[index]: coordinate weight * data_col * mask
+ *   - into a mask gradient: data_col * bilinear sample of the input
+ * The mask gradient is stored only by work items with an even offset channel,
+ * so each mask element is written once per tap rather than twice.
+ */
+template <typename scalar_t>
+__global__ void modulated_deformable_col2im_coord_gpu_kernel(const int n,
+                                                             const scalar_t *data_col, const scalar_t *data_im,
+                                                             const scalar_t *data_offset, const scalar_t *data_mask,
+                                                             const int channels, const int height, const int width,
+                                                             const int kernel_h, const int kernel_w,
+                                                             const int pad_h, const int pad_w,
+                                                             const int stride_h, const int stride_w,
+                                                             const int dilation_h, const int dilation_w,
+                                                             const int channel_per_deformable_group,
+                                                             const int batch_size, const int offset_channels, const int deformable_group,
+                                                             const int height_col, const int width_col,
+                                                             scalar_t *grad_offset, scalar_t *grad_mask)
+{
+  CUDA_KERNEL_LOOP(index, n)
+  {
+    scalar_t offset_grad = 0, mask_grad = 0;
+
+    // Unpack index as (b, c, h, w), w varying fastest.
+    const int w = index % width_col;
+    const int h = (index / width_col) % height_col;
+    const int plane = index / width_col / height_col;
+    const int c = plane % offset_channels;
+    const int b = plane / offset_channels;
+
+    const int group = c / (2 * kernel_h * kernel_w);
+    const int taps = kernel_h * kernel_w;
+
+    const scalar_t *col_base = data_col + group * channel_per_deformable_group * batch_size * width_col * height_col;
+    const scalar_t *im_base = data_im + (b * deformable_group + group) * channel_per_deformable_group / kernel_h / kernel_w * height * width;
+    const scalar_t *offsets = data_offset + (b * deformable_group + group) * 2 * kernel_h * kernel_w * height_col * width_col;
+    const scalar_t *masks = data_mask + (b * deformable_group + group) * kernel_h * kernel_w * height_col * width_col;
+
+    // Offset channel relative to the start of its group; the parity picks the axis.
+    const int local_c = c - group * 2 * kernel_h * kernel_w;
+    const int bp_dir = local_c % 2;
+
+    // Walk the column-buffer channels that share this kernel tap, one per input
+    // channel of the group; im_chan tracks the matching input plane.
+    int im_chan = 0;
+    for (int col_c = (local_c / 2); col_c < channel_per_deformable_group; col_c += taps)
+    {
+      const int col_pos = (((col_c * batch_size + b) * height_col) + h) * width_col + w;
+
+      // Recover kernel tap and output location from the column-buffer position.
+      const int j = (col_pos / width_col / height_col / batch_size) % kernel_w;
+      const int i = (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h;
+      const int w_out = col_pos % width_col;
+      const int h_out = (col_pos / width_col) % height_col;
+      const int w_in = w_out * stride_w - pad_w;
+      const int h_in = h_out * stride_h - pad_h;
+
+      const int tap = i * kernel_w + j;
+      const scalar_t shift_y = offsets[(((2 * tap) * height_col + h_out) * width_col + w_out)];
+      const scalar_t shift_x = offsets[(((2 * tap + 1) * height_col + h_out) * width_col + w_out)];
+      const scalar_t modulation = masks[((tap * height_col + h_out) * width_col + w_out)];
+
+      scalar_t sample_y = h_in + i * dilation_h + shift_y;
+      scalar_t sample_x = w_in + j * dilation_w + shift_x;
+
+      const scalar_t *im_plane = im_base + im_chan * height * width;
+      const scalar_t col_val = col_base[col_pos];
+
+      if (sample_y <= -1 || sample_x <= -1 || sample_y >= height || sample_x >= width)
+      {
+        // Mark as out of range so the coordinate weight below evaluates to 0.
+        sample_y = sample_x = -2;
+      }
+      else
+      {
+        mask_grad += col_val * dmcn_im2col_bilinear(im_plane, width, height, width, sample_y, sample_x);
+      }
+
+      const scalar_t weight = dmcn_get_coordinate_weight(
+          sample_y, sample_x,
+          height, width, im_plane, width, bp_dir);
+      offset_grad += weight * col_val * modulation;
+      im_chan += 1;
+    }
+
+    grad_offset[index] = offset_grad;
+    if (local_c % 2 == 0)
+      grad_mask[(((b * deformable_group + group) * kernel_h * kernel_w + local_c / 2) * height_col + h) * width_col + w] = mask_grad;
+  }
+}
+
+/*
+ * Host-side launcher for modulated_deformable_im2col_gpu_kernel.
+ *
+ * Launches one work item per (channel, image, output row, output column) and
+ * writes the sampled, mask-scaled column buffer into data_col. data_im,
+ * data_offset and data_mask are only read. The scalar type is dispatched on
+ * data_im; the other tensors are presumably expected to share it -- TODO
+ * confirm against the callers.
+ *
+ * Note: the parameter spelled `kenerl_w` is the kernel width; the spelling is
+ * kept so the definition stays identical to its existing declaration.
+ *
+ * A failed launch is reported with printf, matching the other modulated
+ * launchers in this file (previously the error was fetched and then dropped).
+ */
+void modulated_deformable_im2col_cuda(
+    const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
+    const int batch_size, const int channels, const int height_im, const int width_im,
+    const int height_col, const int width_col, const int kernel_h, const int kenerl_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int deformable_group, at::Tensor data_col)
+{
+  // Input channels handled by one deformable group.
+  const int channel_per_deformable_group = channels / deformable_group;
+  // Total number of work items for the launch.
+  const int num_kernels = channels * batch_size * height_col * width_col;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_im.type(), "modulated_deformable_im2col_gpu", ([&] {
+        const scalar_t *data_im_ = data_im.data<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
+        const scalar_t *data_mask_ = data_mask.data<scalar_t>();
+        scalar_t *data_col_ = data_col.data<scalar_t>();
+
+        modulated_deformable_im2col_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_im_, data_offset_, data_mask_, height_im, width_im, kernel_h, kenerl_w,
+            pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, channel_per_deformable_group,
+            batch_size, channels, deformable_group, height_col, width_col, data_col_);
+      }));
+
+  // cudaGetLastError() also clears the error, so it must be reported here or it is lost.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in modulated_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
+  }
+}
+
+/*
+ * Host-side launcher for modulated_deformable_col2im_gpu_kernel.
+ *
+ * Launches one work item per (channel, kernel tap, image, output row, output
+ * column); the kernel accumulates into grad_im with atomicAdd. Nothing in this
+ * function clears grad_im first, so the caller presumably passes it
+ * zero-initialised -- TODO confirm against the callers.
+ *
+ * The kernel receives (pad_h, pad_w). An earlier version passed pad_h twice,
+ * which shifted the horizontal sampling positions whenever pad_w != pad_h.
+ *
+ * A failed launch is reported with printf.
+ */
+void modulated_deformable_col2im_cuda(
+    const at::Tensor data_col, const at::Tensor data_offset, const at::Tensor data_mask,
+    const int batch_size, const int channels, const int height_im, const int width_im,
+    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int deformable_group, at::Tensor grad_im)
+{
+
+  // Input channels handled by one deformable group.
+  const int channel_per_deformable_group = channels / deformable_group;
+  // Total number of work items: one per column-buffer element.
+  const int num_kernels = channels * kernel_h * kernel_w * batch_size * height_col * width_col;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.type(), "modulated_deformable_col2im_gpu", ([&] {
+        const scalar_t *data_col_ = data_col.data<scalar_t>();
+        const scalar_t *data_offset_ = data_offset.data<scalar_t>();
+        const scalar_t *data_mask_ = data_mask.data<scalar_t>();
+        scalar_t *grad_im_ = grad_im.data<scalar_t>();
+
+        modulated_deformable_col2im_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, data_col_, data_offset_, data_mask_, channels, height_im, width_im,
+            kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+            dilation_h, dilation_w, channel_per_deformable_group,
+            batch_size, deformable_group, height_col, width_col, grad_im_);
+      }));
+
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in modulated_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
+  }
+}
+
+/*
+ * Host-side launcher for modulated_deformable_col2im_coord_gpu_kernel.
+ *
+ * Launches one work item per element of grad_offset (two offset channels per
+ * kernel tap and deformable group, for every image and output location). The
+ * kernel writes grad_offset and grad_mask. A failed launch is reported with
+ * printf.
+ */
+void modulated_deformable_col2im_coord_cuda(
+    const at::Tensor data_col, const at::Tensor data_im, const at::Tensor data_offset, const at::Tensor data_mask,
+    const int batch_size, const int channels, const int height_im, const int width_im,
+    const int height_col, const int width_col, const int kernel_h, const int kernel_w,
+    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
+    const int dilation_h, const int dilation_w,
+    const int deformable_group,
+    at::Tensor grad_offset, at::Tensor grad_mask)
+{
+  // Total number of work items for the launch.
+  const int num_kernels = batch_size * height_col * width_col * 2 * kernel_h * kernel_w * deformable_group;
+  // Column-buffer channels (input channel x kernel tap) per deformable group.
+  const int channel_per_deformable_group = channels * kernel_h * kernel_w / deformable_group;
+  // Number of offset channels per image.
+  const int offset_channels = 2 * kernel_h * kernel_w * deformable_group;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data_col.type(), "modulated_deformable_col2im_coord_gpu", ([&] {
+        const scalar_t *col_ptr = data_col.data<scalar_t>();
+        const scalar_t *im_ptr = data_im.data<scalar_t>();
+        const scalar_t *offset_ptr = data_offset.data<scalar_t>();
+        const scalar_t *mask_ptr = data_mask.data<scalar_t>();
+        scalar_t *grad_offset_ptr = grad_offset.data<scalar_t>();
+        scalar_t *grad_mask_ptr = grad_mask.data<scalar_t>();
+
+        modulated_deformable_col2im_coord_gpu_kernel<<<GET_BLOCKS(num_kernels), CUDA_NUM_THREADS>>>(
+            num_kernels, col_ptr, im_ptr, offset_ptr, mask_ptr,
+            channels, height_im, width_im,
+            kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w,
+            dilation_h, dilation_w, channel_per_deformable_group,
+            batch_size, offset_channels, deformable_group, height_col, width_col,
+            grad_offset_ptr, grad_mask_ptr);
+      }));
+
+  const cudaError_t launch_status = cudaGetLastError();
+  if (launch_status != cudaSuccess)
+  {
+    printf("error in modulated_deformable_col2im_coord_cuda: %s\n", cudaGetErrorString(launch_status));
+  }
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cpu.cpp
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cpu.cpp
@@ -0,0 +1,86 @@
+/*
+Created by Jaided AI
+Released Date: 31/08/2022
+Description:
+Deformable PSROI pooling operator for CPU.
+This code is adapted from:
+https://github.com/MhLiao/DB/blob/master/assets/ops/dcn/src/deform_pool_cuda.cpp
+https://github.com/CharlesShang/DCNv2
+https://github.com/lbin/DCNv2
+*/
+
+#include "deform_pool_cpu_kernel.h"
+#include <torch/extension.h>
+#include <cmath>
+#include <vector>
+
+/*
+ * Forward pass of deformable PSROI pooling on the CPU.
+ *
+ * Validates the arguments, reads the input dimensions (dims 0..3 of `input`
+ * are used as batch, channels, height, width) and forwards everything to
+ * DeformablePSROIPoolForward, which is expected to fill `out` and `top_count`.
+ *
+ * When no_trans is non-zero, `trans` is not inspected here and the number of
+ * transform channels is taken to be 2.
+ *
+ * Raises a c10::Error (via TORCH_CHECK) when `input` is not contiguous or when
+ * the first dimension of `out` differs from the number of boxes in `bbox`.
+ */
+void deform_psroi_pooling_cpu_forward(
+    at::Tensor input, at::Tensor bbox, at::Tensor trans, 
+    at::Tensor out, at::Tensor top_count, const int no_trans, 
+    const float spatial_scale, const int output_dim, const int group_size,
+    const int pooled_size, const int part_size, const int sample_per_part,
+    const float trans_std) {
+  
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+  const int channels_trans = no_trans ? 2 : trans.size(1);
+
+  const int num_bbox = bbox.size(0);
+  
+  // TORCH_CHECK concatenates its message arguments; printf-style "%d"
+  // placeholders are not substituted, so the values are passed as separate pieces.
+  TORCH_CHECK(num_bbox == out.size(0),
+              "Output shape and bbox number wont match: (",
+              out.size(0), " vs ", num_bbox, ").");
+
+  DeformablePSROIPoolForward(
+      input, bbox, trans, 
+      out, top_count, 
+      batch, channels, height, width,
+      num_bbox, channels_trans, no_trans, 
+      spatial_scale, output_dim, group_size,
+      pooled_size, part_size, sample_per_part, 
+      trans_std);
+}
+
+/*
+ * Backward pass of deformable PSROI pooling on the CPU.
+ *
+ * Validates the arguments, reads the input dimensions (dims 0..3 of `input`
+ * are used as batch, channels, height, width) and forwards everything to
+ * DeformablePSROIPoolBackwardAcc, which is expected to accumulate into
+ * `input_grad` and `trans_grad`.
+ *
+ * When no_trans is non-zero, `trans` is not inspected here and the number of
+ * transform channels is taken to be 2.
+ *
+ * Raises a c10::Error (via TORCH_CHECK) when `out_grad` or `input` is not
+ * contiguous, or when the first dimension of `out_grad` differs from the
+ * number of boxes in `bbox`.
+ */
+void deform_psroi_pooling_cpu_backward(
+    at::Tensor out_grad, at::Tensor input, at::Tensor bbox, 
+    at::Tensor trans, at::Tensor top_count, at::Tensor input_grad, 
+    at::Tensor trans_grad, const int no_trans, 
+    const float spatial_scale, const int output_dim, const int group_size, 
+    const int pooled_size, const int part_size, const int sample_per_part, 
+    const float trans_std) {
+  TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+  const int channels_trans = no_trans ? 2 : trans.size(1);
+
+  const int num_bbox = bbox.size(0);
+  // TORCH_CHECK concatenates its message arguments; printf-style "%d"
+  // placeholders are not substituted, so the values are passed as separate pieces.
+  TORCH_CHECK(num_bbox == out_grad.size(0),
+              "Output shape and bbox number wont match: (",
+              out_grad.size(0), " vs ", num_bbox, ").");
+
+  DeformablePSROIPoolBackwardAcc(
+      out_grad, input, bbox, 
+      trans, top_count, input_grad, trans_grad, 
+      batch, channels, height, width, 
+      num_bbox, channels_trans, no_trans,
+      spatial_scale, output_dim, group_size, 
+      pooled_size, part_size, sample_per_part, 
+      trans_std);
+}
+
+// Python bindings: exposes the CPU forward and backward entry points of
+// deformable PSROI pooling under their C++ names.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, mod) {
+  // Forward pass.
+  mod.def(
+      "deform_psroi_pooling_cpu_forward",
+      &deform_psroi_pooling_cpu_forward,
+      "deform psroi pooling forward(CPU)");
+
+  // Backward pass.
+  mod.def(
+      "deform_psroi_pooling_cpu_backward",
+      &deform_psroi_pooling_cpu_backward,
+      "deform psroi pooling backward(CPU)");
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cpu_kernel.cpp
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cpu_kernel.cpp
@@ -0,0 +1,314 @@
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file deformable_psroi_pooling.cu
+ * \brief
+ * \author Yi Li, Guodong Zhang, Jifeng Dai
+*/
+/***************** Adapted by Charles Shang *********************/
+// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
+
+/*
+Modified by Jaided AI
+Released Date: 31/08/2022
+Description:
+Deformable PSROI pooling kernel for CPU.
+This code is adapted from:
+https://github.com/MhLiao/DB/blob/master/assets/ops/dcn/src/deform_pool_cuda_kernel.cu
+https://github.com/CharlesShang/DCNv2
+https://github.com/lbin/DCNv2
+*/
+
+#include <torch/extension.h>
+#include "deform_pool_cpu_kernel.h"
+#include <cstdio>
+#include <algorithm>
+#include <cstring>
+#include <ATen/ATen.h>
+
+/*
+ * Bilinear interpolation of a row-major plane at the fractional point (x, y).
+ *
+ * data   : first element of the plane; rows are `width` elements apart
+ * x, y   : column and row coordinate of the sample
+ * width  : row stride of the plane
+ * height : not used by the computation
+ *
+ * No bounds checking is done here: floor/ceil of x and y must already address
+ * valid pixels, so the caller has to clamp the coordinates beforehand.
+ */
+template <typename T>
+T bilinear_interp_cpu(
+    const T *data, const T x, const T y,
+    const int width, const int height) {
+
+  // Integer columns/rows enclosing the sample (equal when x or y is integral).
+  const int left = floor(x);
+  const int right = ceil(x);
+  const int top = floor(y);
+  const int bottom = ceil(y);
+
+  // Fractional distance from the top-left corner.
+  const T frac_x = static_cast<T>(x - left);
+  const T frac_y = static_cast<T>(y - top);
+
+  const T *top_row = data + top * width;
+  const T *bottom_row = data + bottom * width;
+  const T top_left = top_row[left];
+  const T bottom_left = bottom_row[left];
+  const T top_right = top_row[right];
+  const T bottom_right = bottom_row[right];
+
+  return (1 - frac_x) * (1 - frac_y) * top_left +
+         (1 - frac_x) * frac_y * bottom_left +
+         frac_x * (1 - frac_y) * top_right +
+         frac_x * frac_y * bottom_right;
+}
+
+/*
+ * CPU forward kernel of deformable position-sensitive ROI pooling.
+ *
+ * Iterates over `count` output elements laid out as (n, ctop, ph, pw). For
+ * each one it:
+ *   1. reads the ROI at bottom_rois + n * 5 (batch index followed by the
+ *      start-w, start-h, end-w, end-h coordinates) and scales it by
+ *      spatial_scale;
+ *   2. shifts the pooling bin by the learned translation from bottom_trans
+ *      (scaled by trans_std and the ROI size), unless no_trans is set;
+ *   3. averages sample_per_part x sample_per_part bilinear samples taken from
+ *      the position-sensitive input channel selected by (ctop, gh, gw).
+ * Samples more than half a pixel outside the image are skipped. The average is
+ * stored in top_data[index] and the number of samples used in top_count[index].
+ */
+template <typename T>
+void DeformablePSROIPoolForwardKernelCpu(
+    const int count, const T *bottom_data, const T spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const T *bottom_rois, const T *bottom_trans, 
+    const int no_trans, const T trans_std, const int sample_per_part, 
+    const int output_dim, const int group_size, const int part_size, 
+    const int num_classes, const int channels_each_class, 
+    T *top_data, T *top_count) {
+
+  for (int index = 0; index < count; ++index)
+  {
+    // Unpack the flat output index; the output is ordered (n, ctop, ph, pw).
+    const int pw = index % pooled_width;
+    const int ph = (index / pooled_width) % pooled_height;
+    const int ctop = (index / pooled_width / pooled_height) % output_dim;
+    const int n = index / pooled_width / pooled_height / output_dim;
+
+    // ROI as a [start, end) interval in feature-map coordinates.
+    const T *roi = bottom_rois + n * 5;
+    const int roi_batch_ind = roi[0];
+    const T roi_start_w = static_cast<T>(round(roi[1])) * spatial_scale - 0.5;
+    const T roi_start_h = static_cast<T>(round(roi[2])) * spatial_scale - 0.5;
+    const T roi_end_w = static_cast<T>(round(roi[3]) + 1.) * spatial_scale - 0.5;
+    const T roi_end_h = static_cast<T>(round(roi[4]) + 1.) * spatial_scale - 0.5;
+
+    // Keep degenerate ROIs from collapsing to zero size.
+    const T roi_width = std::max(roi_end_w - roi_start_w, T(0.1));
+    const T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
+
+    // Size of one pooling bin and of one sample cell inside it.
+    const T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    const T bin_size_w = roi_width / static_cast<T>(pooled_width);
+    const T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
+    const T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
+
+    // Learned translation of this bin (zero when translations are disabled).
+    const int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
+    const int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
+    const int class_id = ctop / channels_each_class;
+    T trans_x = static_cast<T>(0);
+    T trans_y = static_cast<T>(0);
+    if (!no_trans)
+    {
+      trans_x = bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
+      trans_y = bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
+    }
+
+    // Top-left corner of the (shifted) bin.
+    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+
+    // Position-sensitive group cell of this bin, clamped to the valid range.
+    int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
+    int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
+    gw = std::min(std::max(gw, 0), group_size - 1);
+    gh = std::min(std::max(gh, 0), group_size - 1);
+
+    // Input channel holding the score map for (ctop, gh, gw).
+    const int c = (ctop * group_size + gh) * group_size + gw;
+    const T *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
+
+    T sum = 0;
+    int num_samples = 0;
+    for (int ih = 0; ih < sample_per_part; ih++)
+    {
+      for (int iw = 0; iw < sample_per_part; iw++)
+      {
+        T w = wstart + iw * sub_bin_size_w;
+        T h = hstart + ih * sub_bin_size_h;
+
+        const bool out_of_image = w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5;
+        if (!out_of_image)
+        {
+          // Clamp so the bilinear lookup only touches valid pixels.
+          w = std::min(std::max(w, T(0.)), width - T(1.));
+          h = std::min(std::max(h, T(0.)), height - T(1.));
+          const T val = bilinear_interp_cpu(offset_bottom_data + c * height * width, w, h, width, height);
+          sum += val;
+          num_samples++;
+        }
+      }
+    }
+    top_data[index] = num_samples == 0 ? static_cast<T>(0) : sum / num_samples;
+    top_count[index] = num_samples;
+  }
+}
+
+// CPU backward pass of deformable position-sensitive ROI pooling.
+//
+// Walks serially over all `count` pooled outputs, laid out as
+// (n, ctop, ph, pw), re-derives the sampling positions of each output bin and
+// scatters `top_diff[index] / top_count[index]` back through the bilinear
+// interpolation weights:
+//   * into `bottom_data_diff` (gradient w.r.t. the input feature map), and
+//   * unless `no_trans` is set, into `bottom_trans_diff` (gradient w.r.t. the
+//     offsets read from `bottom_trans`).
+// Both gradient buffers are accumulated with `+=`; nothing in this function
+// zeroes them first.
+//
+// `bottom_rois` is read as 5 values per ROI:
+// [batch_index, start_w, start_h, end_w, end_h].
+// `num_rois` is part of the signature but is not read in this body.
+template <typename T>
+void DeformablePSROIPoolBackwardAccKernelCpu(
+    const int count, const T *top_diff, const T *top_count,
+    const int num_rois, const T spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, const int output_dim, 
+    T *bottom_data_diff, T *bottom_trans_diff,
+    const T *bottom_data, const T *bottom_rois, const T *bottom_trans,
+    const int no_trans, const T trans_std, const int sample_per_part,
+    const int group_size, const int part_size, const int num_classes,
+    const int channels_each_class) {
+
+  for(int index = 0; index < count; index++)
+  {
+    // The output is in order (n, ctop, ph, pw)
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int ctop = (index / pooled_width / pooled_height) % output_dim;
+    int n = index / pooled_width / pooled_height / output_dim;
+
+    // [start, end) interval for spatial sampling
+    // ROI corners are rounded, scaled by spatial_scale and shifted by -0.5.
+    const T *offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    T roi_start_w = static_cast<T>(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    T roi_start_h = static_cast<T>(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+    T roi_end_w = static_cast<T>(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+    T roi_end_h = static_cast<T>(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+    
+    // Clamp the ROI extent to at least 0.1 so the bin sizes below are never zero
+    T roi_width = std::max(roi_end_w - roi_start_w, T(0.1)); //avoid 0
+    T roi_height = std::max(roi_end_h - roi_start_h, T(0.1));
+
+    // Compute w and h at bottom
+    T bin_size_h = roi_height / static_cast<T>(pooled_height);
+    T bin_size_w = roi_width / static_cast<T>(pooled_width);
+
+    // Each bin is sampled on a sample_per_part x sample_per_part grid.
+    T sub_bin_size_h = bin_size_h / static_cast<T>(sample_per_part);
+    T sub_bin_size_w = bin_size_w / static_cast<T>(sample_per_part);
+
+    // part_h/part_w pick the cell of the part_size x part_size offset grid
+    // used by this bin; trans_x/trans_y are that cell's offsets scaled by
+    // trans_std (zero when no_trans is set).
+    int part_h = floor(static_cast<T>(ph) / pooled_height * part_size);
+    int part_w = floor(static_cast<T>(pw) / pooled_width * part_size);
+    int class_id = ctop / channels_each_class;
+    T trans_x = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * trans_std;
+    T trans_y = no_trans ? static_cast<T>(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * trans_std;
+
+    // Start of the bin, shifted by the offset expressed as a fraction of the
+    // ROI width/height.
+    T wstart = static_cast<T>(pw) * bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    T hstart = static_cast<T>(ph) * bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+
+    // No sample contributed to this output: nothing to propagate (also avoids
+    // dividing by zero below).
+    if (top_count[index] <= 0)
+    {
+      continue;
+    }
+    // Every contributing sample receives an equal share of the gradient.
+    T diff_val = top_diff[index] / top_count[index];
+    const T *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
+    T *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
+    // Position-sensitive group indices, clamped into [0, group_size - 1].
+    int gw = floor(static_cast<T>(pw) * group_size / pooled_width);
+    int gh = floor(static_cast<T>(ph) * group_size / pooled_height);
+    gw = std::min(std::max(gw, 0), group_size - 1);
+    gh = std::min(std::max(gh, 0), group_size - 1);
+
+    for (int ih = 0; ih < sample_per_part; ih++)
+    {
+      for (int iw = 0; iw < sample_per_part; iw++)
+      {
+        T w = wstart + iw * sub_bin_size_w;
+        T h = hstart + ih * sub_bin_size_h;
+        // bilinear interpolation
+        // Skip samples lying more than half a pixel outside the feature map.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+        {
+          continue;
+        }
+        // Clamp the remaining samples onto valid pixel coordinates.
+        w = std::min(std::max(w, T(0.)), width - T(1.));
+        h = std::min(std::max(h, T(0.)), height - T(1.));
+        // Input channel selected by (ctop, gh, gw).
+        int c = (ctop * group_size + gh) * group_size + gw;
+        // backward on feature
+        int x0 = floor(w);
+        int x1 = ceil(w);
+        int y0 = floor(h);
+        int y1 = ceil(h);
+        // Bilinear weights; qXY belongs to the neighbour at (x = xX, y = yY).
+        T dist_x = w - x0, dist_y = h - y0;
+        T q00 = (1 - dist_x) * (1 - dist_y);
+        T q01 = (1 - dist_x) * dist_y;
+        T q10 = dist_x * (1 - dist_y);
+        T q11 = dist_x * dist_y;
+        int bottom_index_base = c * height * width;
+        // The enclosing loop is serial, so plain += is used for accumulation.
+       *(offset_bottom_data_diff + bottom_index_base + y0 * width + x0) += q00 * diff_val;
+       *(offset_bottom_data_diff + bottom_index_base + y1 * width + x0) += q01 * diff_val;
+       *(offset_bottom_data_diff + bottom_index_base + y0 * width + x1) += q10 * diff_val;
+       *(offset_bottom_data_diff + bottom_index_base + y1 * width + x1) += q11 * diff_val;
+
+
+        // Without offsets there is no offset gradient to compute.
+        if (no_trans)
+        {
+          continue;
+        }
+        T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
+        T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
+        T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
+        T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
+        // diff_x: difference of the neighbours along x, interpolated along y;
+        // diff_y: difference along y, interpolated along x. Both are scaled
+        // by trans_std and the ROI extent, mirroring how the offsets were
+        // applied to wstart/hstart above.
+        T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
+        diff_x *= roi_width;
+        T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
+        diff_y *= roi_height;
+
+        // Same indexing as the bottom_trans reads above (x plane, then y plane).
+        *(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w) += diff_x;
+        *(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w) += diff_y;
+      }
+    }
+  }
+}
+
+// CPU entry point for the deformable PSROI pooling forward pass.
+//
+// Derives the pooling geometry from the scalar arguments, dispatches on the
+// floating-point dtype of `input`, and runs DeformablePSROIPoolForwardKernelCpu.
+// Results are written through the raw pointers of `out` and `top_count`.
+// `batch` is part of the signature but is not read here.
+void DeformablePSROIPoolForward(
+    const at::Tensor input, const at::Tensor bbox,
+    const at::Tensor trans, at::Tensor out, at::Tensor top_count,
+    const int batch, const int channels, const int height, const int width,
+    const int num_bbox, const int channels_trans, const int no_trans,
+    const float spatial_scale, const int output_dim,
+    const int group_size, const int pooled_size, const int part_size,
+    const int sample_per_part, const float trans_std) {
+  // The pooled output is square: pooled_size x pooled_size.
+  const int pooled_height = pooled_size;
+  const int pooled_width = pooled_size;
+
+  // One element per (roi, output channel, ph, pw).
+  long out_size = num_bbox * output_dim * pooled_height * pooled_width;
+
+  // Offsets come as two channels per class; without offsets there is a
+  // single class owning every output channel.
+  int num_classes = 1;
+  int channels_each_class = output_dim;
+  if (!no_trans) {
+    num_classes = channels_trans / 2;
+    channels_each_class = output_dim / num_classes;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "DeformablePSROIPoolForward", [&] {
+    // Read-only operands are made contiguous; the named tensors keep any
+    // temporary copy alive for the duration of the kernel call.
+    const at::Tensor input_c = input.contiguous();
+    const at::Tensor bbox_c = bbox.contiguous();
+    const at::Tensor trans_c = trans.contiguous();
+
+    DeformablePSROIPoolForwardKernelCpu<scalar_t>(
+        out_size, input_c.data_ptr<scalar_t>(), spatial_scale,
+        channels, height, width,
+        pooled_height, pooled_width,
+        bbox_c.data_ptr<scalar_t>(),
+        trans_c.data_ptr<scalar_t>(),
+        no_trans, trans_std, sample_per_part,
+        output_dim, group_size, part_size,
+        num_classes, channels_each_class,
+        out.data_ptr<scalar_t>(),
+        top_count.data_ptr<scalar_t>());
+  });
+}
+
+// CPU entry point for the deformable PSROI pooling backward pass.
+//
+// Dispatches on the floating-point dtype of `out_grad` and runs
+// DeformablePSROIPoolBackwardAccKernelCpu, which accumulates (+=) the input
+// gradient into `in_grad` and, unless `no_trans` is set, the offset gradient
+// into `trans_grad`. `batch` is part of the signature but is not read here.
+//
+// Gradient tensors that are not contiguous are handled by accumulating into a
+// contiguous copy and writing the result back afterwards, so the caller's
+// tensors always receive the gradients.
+void DeformablePSROIPoolBackwardAcc(
+    const at::Tensor out_grad, const at::Tensor input, const at::Tensor bbox,
+    const at::Tensor trans, const at::Tensor top_count,
+    at::Tensor in_grad, at::Tensor trans_grad,
+    const int batch, const int channels, const int height, const int width,
+    const int num_bbox, const int channels_trans, const int no_trans,
+    const float spatial_scale, const int output_dim,
+    const int group_size, const int pooled_size, const int part_size,
+    const int sample_per_part, const float trans_std) {
+  const int num_rois = num_bbox;
+  // The pooled output is square: pooled_size x pooled_size.
+  const int pooled_height = pooled_size;
+  const int pooled_width = pooled_size;
+  // One element per (roi, output channel, ph, pw).
+  long out_size = num_bbox * output_dim * pooled_height * pooled_width;
+  // Offsets come as two channels per class.
+  const int num_classes = no_trans ? 1 : channels_trans / 2;
+  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+  // contiguous() returns a *new* tensor when the source is not contiguous.
+  // Taking data_ptr() of such a temporary would make the kernel accumulate
+  // into memory that is thrown away, so keep the (possibly copied) tensors
+  // alive and copy them back below. The copies start from the current
+  // contents, which preserves the kernel's += accumulation semantics.
+  at::Tensor in_grad_c = in_grad.contiguous();
+  at::Tensor trans_grad_c = trans_grad.contiguous();
+
+  AT_DISPATCH_FLOATING_TYPES(out_grad.scalar_type(), "DeformablePSROIPoolBackwardAcc", [&] {
+    DeformablePSROIPoolBackwardAccKernelCpu<scalar_t>(
+        out_size,
+        out_grad.contiguous().data_ptr<scalar_t>(),
+        top_count.contiguous().data_ptr<scalar_t>(),
+        num_rois, spatial_scale,
+        channels, height, width,
+        pooled_height, pooled_width, output_dim,
+        in_grad_c.data_ptr<scalar_t>(),
+        trans_grad_c.data_ptr<scalar_t>(),
+        input.contiguous().data_ptr<scalar_t>(),
+        bbox.contiguous().data_ptr<scalar_t>(),
+        trans.contiguous().data_ptr<scalar_t>(),
+        no_trans, trans_std, sample_per_part,
+        group_size, part_size, num_classes,
+        channels_each_class);
+  });
+
+  // Write results back only when a separate contiguous copy was made.
+  if (!in_grad_c.is_same(in_grad)) {
+    in_grad.copy_(in_grad_c);
+  }
+  if (!trans_grad_c.is_same(trans_grad)) {
+    trans_grad.copy_(trans_grad_c);
+  }
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cpu_kernel.h
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cpu_kernel.h
@@ -0,0 +1,35 @@
+/*
+Created by Jaided AI
+Released Date: 31/08/2022
+Description:
+Deformable PSROI pooling kernel for CPU.
+This code is adapted from:
+https://github.com/MhLiao/DB/blob/master/assets/ops/dcn/src/deform_pool_cuda_kernel.cu
+https://github.com/CharlesShang/DCNv2
+https://github.com/lbin/DCNv2
+*/
+
+#include <torch/extension.h>
+#pragma once
+#ifndef DEFORM_POOL_CPU_KERNEL
+#define DEFORM_POOL_CPU_KERNEL
+
+// Forward pass of deformable position-sensitive ROI pooling (declaration).
+// `out` and `top_count` are the tensors passed by non-const value and are
+// presumably the outputs; all remaining arguments are inputs or scalar
+// configuration. `pooled_size` is a single value (square pooled output).
+void DeformablePSROIPoolForward(
+    const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
+    at::Tensor out, at::Tensor top_count, 
+    const int batch, const int channels, const int height, const int width, 
+    const int num_bbox, const int channels_trans, const int no_trans, 
+    const float spatial_scale, const int output_dim, const int group_size, 
+    const int pooled_size, const int part_size, const int sample_per_part, 
+    const float trans_std);
+
+// Backward pass of deformable position-sensitive ROI pooling (declaration).
+// `in_grad` and `trans_grad` are the tensors passed by non-const value and
+// are presumably the gradient outputs; `out_grad` is the incoming gradient
+// and `top_count` the per-output sample count (presumably produced by the
+// forward pass -- confirm against the caller).
+void DeformablePSROIPoolBackwardAcc(
+    const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
+    const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad, at::Tensor trans_grad, 
+    const int batch, const int channels, const int height, const int width, 
+    const int num_bbox, const int channels_trans, const int no_trans, 
+    const float spatial_scale, const int output_dim, const int group_size, 
+    const int pooled_size, const int part_size, const int sample_per_part, 
+    const float trans_std);
+
+#endif
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cuda.cpp
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cuda.cpp
@@ -0,0 +1,87 @@
+// modify from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/modulated_dcn_cuda.c
+
+// based on
+// author: Charles Shang
+// https://github.com/torch/cunn/blob/master/lib/THCUNN/generic/SpatialConvolutionMM.cu
+
+#include <torch/extension.h>
+
+#include <cmath>
+#include <vector>
+
+// Forward declaration of the pooling forward launcher; the definition is
+// provided by a separate kernel source file linked into this extension.
+void DeformablePSROIPoolForward(
+    const at::Tensor data, const at::Tensor bbox, const at::Tensor trans,
+    at::Tensor out, at::Tensor top_count, const int batch, const int channels,
+    const int height, const int width, const int num_bbox,
+    const int channels_trans, const int no_trans, const float spatial_scale,
+    const int output_dim, const int group_size, const int pooled_size,
+    const int part_size, const int sample_per_part, const float trans_std);
+
+// Forward declaration of the pooling backward launcher; the definition is
+// provided by a separate kernel source file linked into this extension.
+void DeformablePSROIPoolBackwardAcc(
+    const at::Tensor out_grad, const at::Tensor data, const at::Tensor bbox,
+    const at::Tensor trans, const at::Tensor top_count, at::Tensor in_grad,
+    at::Tensor trans_grad, const int batch, const int channels,
+    const int height, const int width, const int num_bbox,
+    const int channels_trans, const int no_trans, const float spatial_scale,
+    const int output_dim, const int group_size, const int pooled_size,
+    const int part_size, const int sample_per_part, const float trans_std);
+
+// Python-facing wrapper for the deformable PSROI pooling forward pass.
+//
+// Reads the (batch, channels, height, width) sizes from dimensions 0..3 of
+// `input`, validates that `out` has one entry per ROI in `bbox`, and forwards
+// everything to DeformablePSROIPoolForward.
+//
+// Raises c10::Error when `input` is not contiguous or when
+// `out.size(0) != bbox.size(0)`.
+void deform_psroi_pooling_cuda_forward(
+    at::Tensor input, at::Tensor bbox, at::Tensor trans, at::Tensor out,
+    at::Tensor top_count, const int no_trans, const float spatial_scale,
+    const int output_dim, const int group_size, const int pooled_size,
+    const int part_size, const int sample_per_part, const float trans_std) {
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+  // Without offsets `trans` is not inspected; 2 channels yields one class.
+  const int channels_trans = no_trans ? 2 : trans.size(1);
+
+  const int num_bbox = bbox.size(0);
+  // TORCH_CHECK concatenates its message arguments; it does not interpret
+  // printf-style placeholders, so the values are passed as separate pieces.
+  TORCH_CHECK(num_bbox == out.size(0),
+              "Output shape and bbox number wont match: (", out.size(0),
+              " vs ", num_bbox, ").");
+
+  DeformablePSROIPoolForward(
+      input, bbox, trans, out, top_count, batch, channels, height, width,
+      num_bbox, channels_trans, no_trans, spatial_scale, output_dim, group_size,
+      pooled_size, part_size, sample_per_part, trans_std);
+}
+
+// Python-facing wrapper for the deformable PSROI pooling backward pass.
+//
+// Reads the (batch, channels, height, width) sizes from dimensions 0..3 of
+// `input`, validates that `out_grad` has one entry per ROI in `bbox`, and
+// forwards everything to DeformablePSROIPoolBackwardAcc, which fills
+// `input_grad` and `trans_grad`.
+//
+// Raises c10::Error when `out_grad` or `input` is not contiguous or when
+// `out_grad.size(0) != bbox.size(0)`.
+void deform_psroi_pooling_cuda_backward(
+    at::Tensor out_grad, at::Tensor input, at::Tensor bbox, at::Tensor trans,
+    at::Tensor top_count, at::Tensor input_grad, at::Tensor trans_grad,
+    const int no_trans, const float spatial_scale, const int output_dim,
+    const int group_size, const int pooled_size, const int part_size,
+    const int sample_per_part, const float trans_std) {
+  TORCH_CHECK(out_grad.is_contiguous(), "out_grad tensor has to be contiguous");
+  TORCH_CHECK(input.is_contiguous(), "input tensor has to be contiguous");
+
+  const int batch = input.size(0);
+  const int channels = input.size(1);
+  const int height = input.size(2);
+  const int width = input.size(3);
+  // Without offsets `trans` is not inspected; 2 channels yields one class.
+  const int channels_trans = no_trans ? 2 : trans.size(1);
+
+  const int num_bbox = bbox.size(0);
+  // TORCH_CHECK concatenates its message arguments; it does not interpret
+  // printf-style placeholders, so the values are passed as separate pieces.
+  TORCH_CHECK(num_bbox == out_grad.size(0),
+              "Output shape and bbox number wont match: (", out_grad.size(0),
+              " vs ", num_bbox, ").");
+
+  DeformablePSROIPoolBackwardAcc(
+      out_grad, input, bbox, trans, top_count, input_grad, trans_grad, batch,
+      channels, height, width, num_bbox, channels_trans, no_trans,
+      spatial_scale, output_dim, group_size, pooled_size, part_size,
+      sample_per_part, trans_std);
+}
+
+// Python bindings: exposes the two wrapper functions defined above under the
+// same names, each with a short docstring.
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("deform_psroi_pooling_cuda_forward", &deform_psroi_pooling_cuda_forward,
+        "deform psroi pooling forward(CUDA)");
+  m.def("deform_psroi_pooling_cuda_backward",
+        &deform_psroi_pooling_cuda_backward,
+        "deform psroi pooling backward(CUDA)");
+}
--- a/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cuda_kernel.cu
+++ b/easyocr/DBNet/assets/ops/dcn/src/deform_pool_cuda_kernel.cu
@@ -0,0 +1,364 @@
+/*!
+ * Copyright (c) 2017 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file deformable_psroi_pooling.cu
+ * \brief
+ * \author Yi Li, Guodong Zhang, Jifeng Dai
+*/
+/***************** Adapted by Charles Shang *********************/
+// modify from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/cuda/deform_psroi_pooling_cuda.cu
+
+#include <ATen/ATen.h>
+#include <THC/THCAtomics.cuh>
+#include <stdio.h>
+#include <math.h>
+#include <algorithm>
+
+using namespace at;
+
+#define CUDA_KERNEL_LOOP(i, n)                        \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n);                                       \
+       i += blockDim.x * gridDim.x)
+
+// Threads per block used by every kernel launch in this file.
+const int CUDA_NUM_THREADS = 1024;
+
+// Number of blocks of CUDA_NUM_THREADS threads needed to cover N work items
+// (N divided by the block size, rounded up).
+inline int GET_BLOCKS(const int N)
+{
+  const int padded = N + CUDA_NUM_THREADS - 1;
+  return padded / CUDA_NUM_THREADS;
+}
+
+// Bilinearly interpolates `data`, indexed as data[y * width + x], at the
+// fractional position (x, y) and returns the interpolated value.
+//
+// No bounds checking is done here: the four neighbours (floor/ceil of x and
+// y) are read directly, so callers must pass coordinates that are already
+// inside the plane. `height` is part of the signature but is not read.
+template <typename scalar_t>
+__device__ scalar_t bilinear_interp(
+    const scalar_t *data,
+    const scalar_t x,
+    const scalar_t y,
+    const int width,
+    const int height)
+{
+  // Integer neighbours; for an integral coordinate floor == ceil and the
+  // corresponding distance below is 0.
+  int x1 = floor(x);
+  int x2 = ceil(x);
+  int y1 = floor(y);
+  int y2 = ceil(y);
+  scalar_t dist_x = (scalar_t)(x - x1);
+  scalar_t dist_y = (scalar_t)(y - y1);
+  // valueXY is the sample at (x = xX, y = yY).
+  scalar_t value11 = data[y1 * width + x1];
+  scalar_t value12 = data[y2 * width + x1];
+  scalar_t value21 = data[y1 * width + x2];
+  scalar_t value22 = data[y2 * width + x2];
+  scalar_t value = (1 - dist_x) * (1 - dist_y) * value11 + (1 - dist_x) * dist_y * value12 + dist_x * (1 - dist_y) * value21 + dist_x * dist_y * value22;
+  return value;
+}
+
+// CUDA forward kernel of deformable position-sensitive ROI pooling.
+//
+// Each iteration of the grid-stride loop (CUDA_KERNEL_LOOP) handles one
+// pooled output element, laid out as (n, ctop, ph, pw). The bin is sampled on
+// a sample_per_part x sample_per_part grid, optionally shifted by the offsets
+// in `bottom_trans`; the mean of the valid bilinear samples is written to
+// `top_data[index]` and the number of valid samples to `top_count[index]`.
+//
+// `bottom_rois` is read as 5 values per ROI:
+// [batch_index, start_w, start_h, end_w, end_h].
+template <typename scalar_t>
+__global__ void DeformablePSROIPoolForwardKernel(
+    const int count,
+    const scalar_t *bottom_data,
+    const scalar_t spatial_scale,
+    const int channels,
+    const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const scalar_t *bottom_rois, const scalar_t *bottom_trans,
+    const int no_trans,
+    const scalar_t trans_std,
+    const int sample_per_part,
+    const int output_dim,
+    const int group_size,
+    const int part_size,
+    const int num_classes,
+    const int channels_each_class,
+    scalar_t *top_data,
+    scalar_t *top_count)
+{
+  CUDA_KERNEL_LOOP(index, count)
+  {
+    // The output is in order (n, ctop, ph, pw)
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int ctop = (index / pooled_width / pooled_height) % output_dim;
+    int n = index / pooled_width / pooled_height / output_dim;
+
+    // [start, end) interval for spatial sampling
+    // ROI corners are rounded, scaled by spatial_scale and shifted by -0.5.
+    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+    scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+    scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+
+    // Clamp the ROI extent to at least 0.1 so the bin sizes below are never zero
+    scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
+    scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+    // Compute w and h at bottom
+    scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
+    scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
+
+    // Each bin is sampled on a sample_per_part x sample_per_part grid.
+    scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
+    scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
+
+    // part_h/part_w pick the cell of the part_size x part_size offset grid
+    // used by this bin; trans_x/trans_y are that cell's offsets scaled by
+    // trans_std (zero when no_trans is set, in which case bottom_trans is
+    // not dereferenced).
+    int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
+    int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
+    int class_id = ctop / channels_each_class;
+    scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+    scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+
+    // Start of the bin, shifted by the offset expressed as a fraction of the
+    // ROI width/height.
+    scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+
+    scalar_t sum = 0;
+    // NOTE: this local shadows the kernel parameter `count` inside the loop
+    // body; it counts the valid samples of the current bin.
+    int count = 0;
+    // Position-sensitive group indices, clamped into [0, group_size - 1].
+    int gw = floor((scalar_t)(pw)*group_size / pooled_width);
+    int gh = floor((scalar_t)(ph)*group_size / pooled_height);
+    gw = min(max(gw, 0), group_size - 1);
+    gh = min(max(gh, 0), group_size - 1);
+
+    const scalar_t *offset_bottom_data = bottom_data + (roi_batch_ind * channels) * height * width;
+    for (int ih = 0; ih < sample_per_part; ih++)
+    {
+      for (int iw = 0; iw < sample_per_part; iw++)
+      {
+        scalar_t w = wstart + iw * sub_bin_size_w;
+        scalar_t h = hstart + ih * sub_bin_size_h;
+        // bilinear interpolation
+        // Skip samples lying more than half a pixel outside the feature map.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+        {
+          continue;
+        }
+        // Clamp the remaining samples onto valid pixel coordinates.
+        w = min(max(w, 0.), width - 1.);
+        h = min(max(h, 0.), height - 1.);
+        // Input channel selected by (ctop, gh, gw).
+        int c = (ctop * group_size + gh) * group_size + gw;
+        scalar_t val = bilinear_interp(offset_bottom_data + c * height * width, w, h, width, height);
+        sum += val;
+        count++;
+      }
+    }
+    // Mean over valid samples; 0 when every sample was out of range.
+    top_data[index] = count == 0 ? (scalar_t)(0) : sum / count;
+    top_count[index] = count;
+  }
+}
+
+// CUDA backward kernel of deformable position-sensitive ROI pooling.
+//
+// Each iteration of the grid-stride loop (CUDA_KERNEL_LOOP) handles one
+// pooled output element, laid out as (n, ctop, ph, pw). It re-derives the
+// sampling positions of the bin and scatters
+// `top_diff[index] / top_count[index]` through the bilinear weights:
+//   * into `bottom_data_diff` (gradient w.r.t. the input feature map), and
+//   * unless `no_trans` is set, into `bottom_trans_diff` (gradient w.r.t. the
+//     offsets read from `bottom_trans`).
+// All accumulation uses atomicAdd; nothing in this kernel zeroes the gradient
+// buffers first. `num_rois` is part of the signature but is not read.
+template <typename scalar_t>
+__global__ void DeformablePSROIPoolBackwardAccKernel(
+    const int count,
+    const scalar_t *top_diff,
+    const scalar_t *top_count,
+    const int num_rois,
+    const scalar_t spatial_scale,
+    const int channels,
+    const int height, const int width,
+    const int pooled_height, const int pooled_width,
+    const int output_dim,
+    scalar_t *bottom_data_diff, scalar_t *bottom_trans_diff,
+    const scalar_t *bottom_data,
+    const scalar_t *bottom_rois,
+    const scalar_t *bottom_trans,
+    const int no_trans,
+    const scalar_t trans_std,
+    const int sample_per_part,
+    const int group_size,
+    const int part_size,
+    const int num_classes,
+    const int channels_each_class)
+{
+  CUDA_KERNEL_LOOP(index, count)
+  {
+    // The output is in order (n, ctop, ph, pw)
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int ctop = (index / pooled_width / pooled_height) % output_dim;
+    int n = index / pooled_width / pooled_height / output_dim;
+
+    // [start, end) interval for spatial sampling
+    // ROI corners are rounded, scaled by spatial_scale and shifted by -0.5.
+    const scalar_t *offset_bottom_rois = bottom_rois + n * 5;
+    int roi_batch_ind = offset_bottom_rois[0];
+    scalar_t roi_start_w = (scalar_t)(round(offset_bottom_rois[1])) * spatial_scale - 0.5;
+    scalar_t roi_start_h = (scalar_t)(round(offset_bottom_rois[2])) * spatial_scale - 0.5;
+    scalar_t roi_end_w = (scalar_t)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5;
+    scalar_t roi_end_h = (scalar_t)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5;
+
+    // Clamp the ROI extent to at least 0.1 so the bin sizes below are never zero
+    scalar_t roi_width = max(roi_end_w - roi_start_w, 0.1); //avoid 0
+    scalar_t roi_height = max(roi_end_h - roi_start_h, 0.1);
+
+    // Compute w and h at bottom
+    scalar_t bin_size_h = roi_height / (scalar_t)(pooled_height);
+    scalar_t bin_size_w = roi_width / (scalar_t)(pooled_width);
+
+    // Each bin is sampled on a sample_per_part x sample_per_part grid.
+    scalar_t sub_bin_size_h = bin_size_h / (scalar_t)(sample_per_part);
+    scalar_t sub_bin_size_w = bin_size_w / (scalar_t)(sample_per_part);
+
+    // part_h/part_w pick the cell of the part_size x part_size offset grid
+    // used by this bin; trans_x/trans_y are that cell's offsets scaled by
+    // trans_std (zero when no_trans is set, in which case bottom_trans is
+    // not dereferenced).
+    int part_h = floor((scalar_t)(ph) / pooled_height * part_size);
+    int part_w = floor((scalar_t)(pw) / pooled_width * part_size);
+    int class_id = ctop / channels_each_class;
+    scalar_t trans_x = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+    scalar_t trans_y = no_trans ? (scalar_t)(0) : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w] * (scalar_t)trans_std;
+
+    // Start of the bin, shifted by the offset expressed as a fraction of the
+    // ROI width/height.
+    scalar_t wstart = (scalar_t)(pw)*bin_size_w + roi_start_w;
+    wstart += trans_x * roi_width;
+    scalar_t hstart = (scalar_t)(ph)*bin_size_h + roi_start_h;
+    hstart += trans_y * roi_height;
+
+    // No sample contributed to this output: nothing to propagate (also avoids
+    // dividing by zero below).
+    if (top_count[index] <= 0)
+    {
+      continue;
+    }
+    // Every contributing sample receives an equal share of the gradient.
+    scalar_t diff_val = top_diff[index] / top_count[index];
+    const scalar_t *offset_bottom_data = bottom_data + roi_batch_ind * channels * height * width;
+    scalar_t *offset_bottom_data_diff = bottom_data_diff + roi_batch_ind * channels * height * width;
+    // Position-sensitive group indices, clamped into [0, group_size - 1].
+    int gw = floor((scalar_t)(pw)*group_size / pooled_width);
+    int gh = floor((scalar_t)(ph)*group_size / pooled_height);
+    gw = min(max(gw, 0), group_size - 1);
+    gh = min(max(gh, 0), group_size - 1);
+
+    for (int ih = 0; ih < sample_per_part; ih++)
+    {
+      for (int iw = 0; iw < sample_per_part; iw++)
+      {
+        scalar_t w = wstart + iw * sub_bin_size_w;
+        scalar_t h = hstart + ih * sub_bin_size_h;
+        // bilinear interpolation
+        // Skip samples lying more than half a pixel outside the feature map.
+        if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5)
+        {
+          continue;
+        }
+        // Clamp the remaining samples onto valid pixel coordinates.
+        w = min(max(w, 0.), width - 1.);
+        h = min(max(h, 0.), height - 1.);
+        // Input channel selected by (ctop, gh, gw).
+        int c = (ctop * group_size + gh) * group_size + gw;
+        // backward on feature
+        int x0 = floor(w);
+        int x1 = ceil(w);
+        int y0 = floor(h);
+        int y1 = ceil(h);
+        // Bilinear weights; qXY belongs to the neighbour at (x = xX, y = yY).
+        scalar_t dist_x = w - x0, dist_y = h - y0;
+        scalar_t q00 = (1 - dist_x) * (1 - dist_y);
+        scalar_t q01 = (1 - dist_x) * dist_y;
+        scalar_t q10 = dist_x * (1 - dist_y);
+        scalar_t q11 = dist_x * dist_y;
+        int bottom_index_base = c * height * width;
+        // atomicAdd: different loop iterations (threads) can target the same
+        // input element.
+        atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x0, q00 * diff_val);
+        atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x0, q01 * diff_val);
+        atomicAdd(offset_bottom_data_diff + bottom_index_base + y0 * width + x1, q10 * diff_val);
+        atomicAdd(offset_bottom_data_diff + bottom_index_base + y1 * width + x1, q11 * diff_val);
+
+        // Without offsets there is no offset gradient to compute.
+        if (no_trans)
+        {
+          continue;
+        }
+        scalar_t U00 = offset_bottom_data[bottom_index_base + y0 * width + x0];
+        scalar_t U01 = offset_bottom_data[bottom_index_base + y1 * width + x0];
+        scalar_t U10 = offset_bottom_data[bottom_index_base + y0 * width + x1];
+        scalar_t U11 = offset_bottom_data[bottom_index_base + y1 * width + x1];
+        // diff_x: difference of the neighbours along x, interpolated along y;
+        // diff_y: difference along y, interpolated along x. Both are scaled
+        // by trans_std and the ROI extent, mirroring how the offsets were
+        // applied to wstart/hstart above.
+        scalar_t diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - U00 * (1 - dist_y)) * trans_std * diff_val;
+        diff_x *= roi_width;
+        scalar_t diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - U00 * (1 - dist_x)) * trans_std * diff_val;
+        diff_y *= roi_height;
+
+        // Same indexing as the bottom_trans reads above (x plane, then y plane).
+        atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2) * part_size + part_h) * part_size + part_w, diff_x);
+        atomicAdd(bottom_trans_diff + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * part_size + part_w, diff_y);
+      }
+    }
+  }
+}
+
+// Host-side launcher for DeformablePSROIPoolForwardKernel.
+//
+// Derives the pooling geometry from the scalar arguments, dispatches on the
+// dtype of `data` (floating types and half), and launches one thread per
+// pooled output element in blocks of CUDA_NUM_THREADS. When offsets are
+// disabled (`no_trans`), the kernel receives a NULL offset pointer.
+// A launch error is printed to stdout, not raised. `batch` is not read here.
+void DeformablePSROIPoolForward(const at::Tensor data,
+                                const at::Tensor bbox,
+                                const at::Tensor trans,
+                                at::Tensor out,
+                                at::Tensor top_count,
+                                const int batch,
+                                const int channels,
+                                const int height,
+                                const int width,
+                                const int num_bbox,
+                                const int channels_trans,
+                                const int no_trans,
+                                const float spatial_scale,
+                                const int output_dim,
+                                const int group_size,
+                                const int pooled_size,
+                                const int part_size,
+                                const int sample_per_part,
+                                const float trans_std)
+{
+  // The pooled output is square: pooled_size x pooled_size.
+  const int pooled_height = pooled_size;
+  const int pooled_width = pooled_size;
+  // One element per (roi, output channel, ph, pw).
+  const int count = num_bbox * output_dim * pooled_height * pooled_width;
+
+  // Offsets come as two channels per class; without offsets there is a
+  // single class owning every output channel.
+  int num_classes = 1;
+  int channels_each_class = output_dim;
+  if (!no_trans)
+  {
+    num_classes = channels_trans / 2;
+    channels_each_class = output_dim / num_classes;
+  }
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      data.type(), "deformable_psroi_pool_forward", ([&] {
+        const scalar_t *feat_ptr = data.data<scalar_t>();
+        const scalar_t *rois_ptr = bbox.data<scalar_t>();
+        const scalar_t *offsets_ptr = no_trans ? NULL : trans.data<scalar_t>();
+        scalar_t *pooled_ptr = out.data<scalar_t>();
+        scalar_t *sample_count_ptr = top_count.data<scalar_t>();
+
+        DeformablePSROIPoolForwardKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
+            count, feat_ptr, (scalar_t)spatial_scale, channels, height, width,
+            pooled_height, pooled_width, rois_ptr, offsets_ptr, no_trans,
+            (scalar_t)trans_std, sample_per_part, output_dim, group_size,
+            part_size, num_classes, channels_each_class, pooled_ptr,
+            sample_count_ptr);
+      }));
+
+  // Report (but do not raise) any error left behind by the launch.
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess)
+  {
+    return;
+  }
+  printf("error in DeformablePSROIPoolForward: %s\n", cudaGetErrorString(status));
+}
+
+// Host-side launcher for DeformablePSROIPoolBackwardAccKernel.
+//
+// Derives the pooling geometry from the scalar arguments, dispatches on the
+// dtype of `out_grad` (floating types and half), and launches one thread per
+// pooled output element in blocks of CUDA_NUM_THREADS. The kernel accumulates
+// into `in_grad` and, unless `no_trans` is set, into `trans_grad`; with
+// `no_trans` the offset pointers handed to the kernel are NULL.
+// A launch error is printed to stdout, not raised. `batch` is not read here.
+void DeformablePSROIPoolBackwardAcc(const at::Tensor out_grad,
+                                    const at::Tensor data,
+                                    const at::Tensor bbox,
+                                    const at::Tensor trans,
+                                    const at::Tensor top_count,
+                                    at::Tensor in_grad,
+                                    at::Tensor trans_grad,
+                                    const int batch,
+                                    const int channels,
+                                    const int height,
+                                    const int width,
+                                    const int num_bbox,
+                                    const int channels_trans,
+                                    const int no_trans,
+                                    const float spatial_scale,
+                                    const int output_dim,
+                                    const int group_size,
+                                    const int pooled_size,
+                                    const int part_size,
+                                    const int sample_per_part,
+                                    const float trans_std)
+{
+  // LOG(INFO) << "DeformablePSROIPoolBackward";
+  const int num_rois = num_bbox;
+  // The pooled output is square: pooled_size x pooled_size.
+  const int pooled_height = pooled_size;
+  const int pooled_width = pooled_size;
+  // One element per (roi, output channel, ph, pw).
+  const int count = num_bbox * output_dim * pooled_height * pooled_width;
+  // Offsets come as two channels per class.
+  const int num_classes = no_trans ? 1 : channels_trans / 2;
+  const int channels_each_class = no_trans ? output_dim : output_dim / num_classes;
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+      out_grad.type(), "deformable_psroi_pool_backward_acc", ([&] {
+        const scalar_t *top_diff = out_grad.data<scalar_t>();
+        const scalar_t *bottom_data = data.data<scalar_t>();
+        const scalar_t *bottom_rois = bbox.data<scalar_t>();
+        const scalar_t *bottom_trans = no_trans ? NULL : trans.data<scalar_t>();
+        scalar_t *bottom_data_diff = in_grad.data<scalar_t>();
+        scalar_t *bottom_trans_diff = no_trans ? NULL : trans_grad.data<scalar_t>();
+        const scalar_t *top_count_data = top_count.data<scalar_t>();
+
+        DeformablePSROIPoolBackwardAccKernel<<<GET_BLOCKS(count), CUDA_NUM_THREADS>>>(
+            count, top_diff, top_count_data, num_rois, (scalar_t)spatial_scale, channels, height, width,
+            pooled_height, pooled_width, output_dim, bottom_data_diff, bottom_trans_diff,
+            bottom_data, bottom_rois, bottom_trans, no_trans, (scalar_t)trans_std, sample_per_part,
+            group_size, part_size, num_classes, channels_each_class);
+      }));
+
+  // Report (but do not raise) any error left behind by the launch. The
+  // message names this function so backward failures are not mistaken for
+  // forward ones.
+  cudaError_t err = cudaGetLastError();
+  if (err != cudaSuccess)
+  {
+    printf("error in DeformablePSROIPoolBackwardAcc: %s\n", cudaGetErrorString(err));
+  }
+}
--- a/easyocr/DBNet/backbones/init.py
+++ b/easyocr/DBNet/backbones/init.py
@@ -0,0 +1,2 @@
+from .resnet import resnet18, resnet34, resnet50, resnet101, deformable_resnet50, deformable_resnet18
+from .mobilenetv3 import mobilenet_v3_large, mobilenet_v3_small
--- a/easyocr/DBNet/backbones/mobilenetv3.py
+++ b/easyocr/DBNet/backbones/mobilenetv3.py
@@ -0,0 +1,252 @@
+# https://github.com/kuan-wang/pytorch-mobilenet-v3
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+__all__ = ['MobileNetV3', 'mobilenetv3']
+
+
+def conv_bn(inp, oup, stride, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d, nlin_layer=nn.ReLU):
+    return nn.Sequential(
+        conv_layer(inp, oup, 3, stride, 1, bias=False),
+        norm_layer(oup),
+        nlin_layer(inplace=True)
+    )
+
+
+def conv_1x1_bn(inp, oup, conv_layer=nn.Conv2d, norm_layer=nn.BatchNorm2d, nlin_layer=nn.ReLU):
+    return nn.Sequential(
+        conv_layer(inp, oup, 1, 1, 0, bias=False),
+        norm_layer(oup),
+        nlin_layer(inplace=True)
+    )
+
+
+class Hswish(nn.Module):
+    def __init__(self, inplace=True):
+        super(Hswish, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return x * F.relu6(x + 3., inplace=self.inplace) / 6.
+
+
+class Hsigmoid(nn.Module):
+    def __init__(self, inplace=True):
+        super(Hsigmoid, self).__init__()
+        self.inplace = inplace
+
+    def forward(self, x):
+        return F.relu6(x + 3., inplace=self.inplace) / 6.
+
+
+class SEModule(nn.Module):
+    def __init__(self, channel, reduction=4):
+        super(SEModule, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel, bias=False),
+            Hsigmoid()
+            # nn.Sigmoid()
+        )
+
+    def forward(self, x):
+        b, c, _, _ = x.size()
+        y = self.avg_pool(x).view(b, c)
+        y = self.fc(y).view(b, c, 1, 1)
+        return x * y.expand_as(x)
+
+
+class Identity(nn.Module):
+    def __init__(self, channel):
+        super(Identity, self).__init__()
+
+    def forward(self, x):
+        return x
+
+
+def make_divisible(x, divisible_by=8):
+    import numpy as np
+    return int(np.ceil(x * 1. / divisible_by) * divisible_by)
+
+
+class MobileBottleneck(nn.Module):
+    def __init__(self, inp, oup, kernel, stride, exp, se=False, nl='RE'):
+        super(MobileBottleneck, self).__init__()
+        assert stride in [1, 2]
+        assert kernel in [3, 5]
+        padding = (kernel - 1) // 2
+        self.use_res_connect = stride == 1 and inp == oup
+
+        conv_layer = nn.Conv2d
+        norm_layer = nn.BatchNorm2d
+        if nl == 'RE':
+            nlin_layer = nn.ReLU # or ReLU6
+        elif nl == 'HS':
+            nlin_layer = Hswish
+        else:
+            raise NotImplementedError
+        if se:
+            SELayer = SEModule
+        else:
+            SELayer = Identity
+
+        self.conv = nn.Sequential(
+            # pw
+            conv_layer(inp, exp, 1, 1, 0, bias=False),
+            norm_layer(exp),
+            nlin_layer(inplace=True),
+            # dw
+            conv_layer(exp, exp, kernel, stride, padding, groups=exp, bias=False),
+            norm_layer(exp),
+            SELayer(exp),
+            nlin_layer(inplace=True),
+            # pw-linear
+            conv_layer(exp, oup, 1, 1, 0, bias=False),
+            norm_layer(oup),
+        )
+
+    def forward(self, x):
+        if self.use_res_connect:
+            return x + self.conv(x)
+        else:
+            return self.conv(x)
+
+
+class MobileNetV3(nn.Module):
+    def __init__(self, n_class=1000, input_size=224, dropout=0.8, mode='small', width_mult=1.0):
+        super(MobileNetV3, self).__init__()
+        input_channel = 16
+        last_channel = 1280
+        if mode == 'large':
+            # refer to Table 1 in paper
+            mobile_setting = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16,  16,  False, 'RE', 1],
+                [3, 64,  24,  False, 'RE', 2],
+                [3, 72,  24,  False, 'RE', 1],  # 3
+                [5, 72,  40,  True,  'RE', 2],
+                [5, 120, 40,  True,  'RE', 1],
+                [5, 120, 40,  True,  'RE', 1],  # 6
+                [3, 240, 80,  False, 'HS', 2],
+                [3, 200, 80,  False, 'HS', 1],
+                [3, 184, 80,  False, 'HS', 1],
+                [3, 184, 80,  False, 'HS', 1],
+                [3, 480, 112, True,  'HS', 1],
+                [3, 672, 112, True,  'HS', 1],  # 12
+                [5, 672, 160, True,  'HS', 2],
+                [5, 960, 160, True,  'HS', 1],
+                [5, 960, 160, True,  'HS', 1],
+            ]
+        elif mode == 'small':
+            # refer to Table 2 in paper
+            mobile_setting = [
+                # k, exp, c,  se,     nl,  s,
+                [3, 16,  16,  True,  'RE', 2],
+                [3, 72,  24,  False, 'RE', 2],
+                [3, 88,  24,  False, 'RE', 1],
+                [5, 96,  40,  True,  'HS', 2],
+                [5, 240, 40,  True,  'HS', 1],
+                [5, 240, 40,  True,  'HS', 1],
+                [5, 120, 48,  True,  'HS', 1],
+                [5, 144, 48,  True,  'HS', 1],
+                [5, 288, 96,  True,  'HS', 2],
+                [5, 576, 96,  True,  'HS', 1],
+                [5, 576, 96,  True,  'HS', 1],
+            ]
+        else:
+            raise NotImplementedError
+
+        # building first layer
+        assert input_size % 32 == 0
+        last_channel = make_divisible(last_channel * width_mult) if width_mult > 1.0 else last_channel
+        self.features = nn.ModuleList([conv_bn(3, input_channel, 2, nlin_layer=Hswish)])   # start_idx = 0: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same
+        self.classifier = []
+
+        # building mobile blocks
+        for k, exp, c, se, nl, s in mobile_setting:
+            output_channel = make_divisible(c * width_mult)
+            exp_channel = make_divisible(exp * width_mult)
+            self.features.append(MobileBottleneck(input_channel, output_channel, k, s, exp_channel, se, nl))
+            input_channel = output_channel
+
+        # building last several layers
+        if mode == 'large':
+            last_conv = make_divisible(960 * width_mult)
+            self.features.append(conv_1x1_bn(input_channel, last_conv, nlin_layer=Hswish))  # 16
+            self.features.append(nn.AdaptiveAvgPool2d(1))
+            self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
+            self.features.append(Hswish(inplace=True))
+        elif mode == 'small':
+            last_conv = make_divisible(576 * width_mult)
+            self.features.append(conv_1x1_bn(input_channel, last_conv, nlin_layer=Hswish))
+            # self.features.append(SEModule(last_conv))  # refer to paper Table2, but I think this is a mistake
+            self.features.append(nn.AdaptiveAvgPool2d(1))
+            self.features.append(nn.Conv2d(last_conv, last_channel, 1, 1, 0))
+            self.features.append(Hswish(inplace=True))
+        else:
+            raise NotImplementedError
+
+        # make it nn.Sequential
+        #self.features = nn.Sequential(*self.features)  del for dbnet
+
+        # building classifier
+        self.classifier = nn.Sequential(
+            nn.Dropout(p=dropout),    # refer to paper section 6
+            nn.Linear(last_channel, n_class),
+        )
+
+        self._initialize_weights()
+
+    def forward(self, x):
+        '''x = self.features(x)
+        x = x.mean(3).mean(2)
+        x = self.classifier(x)
+        return x'''
+        x2, x3, x4, x5 = None, None, None, None
+        for stage in range(17): # https://github.com/PaddlePaddle/PaddleOCR/blob/release/2.1/ppocr/modeling/backbones/det_mobilenet_v3.py
+            x = self.features[stage](x)
+            if stage == 3:  # if s == 2 and start_idx > 3
+                x2 = x
+            elif stage == 6:
+                x3 = x
+            elif stage == 12:
+                x4 = x
+            elif stage == 16:
+                x5 = x
+        return x2, x3, x4, x5
+
+    def _initialize_weights(self):
+        # weight initialization
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out')
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+
+def mobilenet_v3_large(pretrained=False, **kwargs):
+    model = MobileNetV3(mode='large', **kwargs)
+    if pretrained:
+        state_dict = torch.load('mobilenetv3_large.pth.tar')
+        model.load_state_dict(state_dict, strict=True)
+        # raise NotImplementedError
+    return model
+
+def mobilenet_v3_small(pretrained=False, **kwargs):
+    model = MobileNetV3(mode='small', **kwargs)
+    if pretrained:
+        state_dict = torch.load('mobilenetv3_small_67.4.pth.tar')
+        model.load_state_dict(state_dict, strict=True)
+        # raise NotImplementedError
+    return model
--- a/easyocr/DBNet/backbones/resnet.py
+++ b/easyocr/DBNet/backbones/resnet.py
@@ -0,0 +1,340 @@
+import torch.nn as nn
+import math
+import torch.utils.model_zoo as model_zoo
+BatchNorm2d = nn.BatchNorm2d
+
+__all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
+           'resnet152']
+
+
+# Checkpoint download URLs keyed by architecture name; the constructor
+# functions in this module pass them to model_zoo.load_url.
+model_urls = {
+    'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
+    'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
+    'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
+    'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
+    'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
+}
+
+
+def constant_init(module, constant, bias=0):
+    nn.init.constant_(module.weight, constant)
+    if hasattr(module, 'bias'):
+        nn.init.constant_(module.bias, bias)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """3x3 convolution with padding"""
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
+                     padding=1, bias=False)
+
+
+class BasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
+        super(BasicBlock, self).__init__()
+        self.with_dcn = dcn is not None
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = dcn.get('fallback_on_stride', False)
+            self.with_modulated_dcn = dcn.get('modulated', False)
+        # self.conv2 = conv3x3(planes, planes)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+                                   padding=1, bias=False)
+        else:
+            deformable_groups = dcn.get('deformable_groups', 1)
+            if not self.with_modulated_dcn:
+                #from assets.ops.dcn import DeformConv
+                from ..assets.ops.dcn import DeformConv
+                conv_op = DeformConv
+                offset_channels = 18
+            else:
+                #from assets.ops.dcn import ModulatedDeformConv
+                from ..assets.ops.dcn import ModulatedDeformConv
+                conv_op = ModulatedDeformConv
+                offset_channels = 27
+            self.conv2_offset = nn.Conv2d(
+                planes,
+                deformable_groups * offset_channels,
+                kernel_size=3,
+                padding=1)
+            self.conv2 = conv_op(
+                planes,
+                planes,
+                kernel_size=3,
+                padding=1,
+                deformable_groups=deformable_groups,
+                bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        # out = self.conv2(out)
+        if not self.with_dcn:
+            out = self.conv2(out)
+        elif self.with_modulated_dcn:
+            offset_mask = self.conv2_offset(out)
+            offset = offset_mask[:, :18, :, :]
+            mask = offset_mask[:, -9:, :, :].sigmoid()
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
+        super(Bottleneck, self).__init__()
+        self.with_dcn = dcn is not None
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = BatchNorm2d(planes)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = dcn.get('fallback_on_stride', False)
+            self.with_modulated_dcn = dcn.get('modulated', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
+                                   stride=stride, padding=1, bias=False)
+        else:
+            deformable_groups = dcn.get('deformable_groups', 1)
+            if not self.with_modulated_dcn:
+                #from assets.ops.dcn import DeformConv
+                from ..assets.ops.dcn import DeformConv
+                conv_op = DeformConv
+                offset_channels = 18
+            else:
+                #from assets.ops.dcn import ModulatedDeformConv
+                from ..assets.ops.dcn import ModulatedDeformConv
+                conv_op = ModulatedDeformConv
+                offset_channels = 27
+            self.conv2_offset = nn.Conv2d(
+                planes, deformable_groups * offset_channels,
+                kernel_size=3,
+                padding=1)
+            self.conv2 = conv_op(
+                planes, planes, kernel_size=3, padding=1, stride=stride,
+                deformable_groups=deformable_groups, bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dcn = dcn
+        self.with_dcn = dcn is not None
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        # out = self.conv2(out)
+        if not self.with_dcn:
+            out = self.conv2(out)
+        elif self.with_modulated_dcn:
+            offset_mask = self.conv2_offset(out)
+            offset = offset_mask[:, :18, :, :]
+            mask = offset_mask[:, -9:, :, :].sigmoid()
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+    def __init__(self, block, layers, num_classes=1000, 
+                 dcn=None, stage_with_dcn=(False, False, False, False)):
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        self.inplanes = 64
+        super(ResNet, self).__init__()
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
+                               bias=False)
+        self.bn1 = BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(
+            block, 128, layers[1], stride=2, dcn=dcn)
+        self.layer3 = self._make_layer(
+            block, 256, layers[2], stride=2, dcn=dcn)
+        self.layer4 = self._make_layer(
+            block, 512, layers[3], stride=2, dcn=dcn)
+        self.avgpool = nn.AvgPool2d(7, stride=1)
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+    
+        self.smooth = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=1)    
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+        if self.dcn is not None:
+            for m in self.modules():
+                if isinstance(m, Bottleneck) or isinstance(m, BasicBlock):
+                    if hasattr(m, 'conv2_offset'):
+                        constant_init(m.conv2_offset, 0)
+
+    def _make_layer(self, block, planes, blocks, stride=1, dcn=None):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes,
+                            stride, downsample, dcn=dcn))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes, dcn=dcn))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x2 = self.layer1(x)
+        x3 = self.layer2(x2)
+        x4 = self.layer3(x3)
+        x5 = self.layer4(x4)
+
+        return x2, x3, x4, x5
+
+
+def resnet18(pretrained=True, **kwargs):
+    """Constructs a ResNet-18 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet18']), strict=False)
+    return model
+
+def deformable_resnet18(pretrained=True, **kwargs):
+    """Constructs a ResNet-18 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(BasicBlock, [2, 2, 2, 2],
+                    dcn=dict(modulated=True,
+                            deformable_groups=1,
+                            fallback_on_stride=False),
+                    stage_with_dcn=[False, True, True, True], **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet18']), strict=False)
+    return model
+
+
+def resnet34(pretrained=True, **kwargs):
+    """Constructs a ResNet-34 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet34']), strict=False)
+    return model
+
+
+def resnet50(pretrained=True, **kwargs):
+    """Constructs a ResNet-50 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet50']), strict=False)
+    return model
+
+
+def deformable_resnet50(pretrained=True, **kwargs):
+    """Constructs a ResNet-50 model with deformable conv.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(Bottleneck, [3, 4, 6, 3],
+                   dcn=dict(modulated=True,
+                            deformable_groups=1,
+                            fallback_on_stride=False),
+                   stage_with_dcn=[False, True, True, True],
+                   **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet50']), strict=False)
+    return model
+
+
+def resnet101(pretrained=True, **kwargs):
+    """Constructs a ResNet-101 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet101']), strict=False)
+    return model
+
+
+def resnet152(pretrained=True, **kwargs):
+    """Constructs a ResNet-152 model.
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
+    if pretrained:
+        model.load_state_dict(model_zoo.load_url(
+            model_urls['resnet152']), strict=False)
+    return model
--- a/easyocr/DBNet/configs/DBNet_inference.yaml
+++ b/easyocr/DBNet/configs/DBNet_inference.yaml
@@ -0,0 +1,50 @@
+resnet18:
+  model:
+    class: model.detector.Detector
+    structure: 
+      class: model.detector.Model
+      builder: 
+        class: model.detector.Builder
+        model: SegDetectorModel
+        model_args: 
+          backbone: deformable_resnet18
+          decoder: SegDetector
+          decoder_args: 
+            adaptive: True
+            in_channels: 
+              - 64
+              - 128
+              - 256
+              - 512
+            k: 50
+          loss_class: L1BalanceCELoss
+  weight:
+    pretrained: pretrained_ic15_res18.pt
+resnet50:
+  model:
+    class: model.detector.Detector
+    structure: 
+      class: model.detector.Model
+      builder: 
+        class: model.detector.Builder
+        model: SegDetectorModel
+        model_args: 
+          backbone: deformable_resnet50
+          decoder: SegDetector
+          decoder_args: 
+            adaptive: True
+            in_channels: 
+              - 256
+              - 512
+              - 1024
+              - 2048
+            k: 50
+          loss_class: L1BalanceCELoss
+  weight:
+    pretrained: pretrained_ic15_res50.pt
+BGR_MEAN:
+  - 122.67891434
+  - 116.66876762
+  - 104.00698793
+min_detection_size: 640
+max_detection_size: 2560
--- a/easyocr/DBNet/decoders/init.py
+++ b/easyocr/DBNet/decoders/init.py
@@ -0,0 +1,6 @@
+from .seg_detector import SegDetector
+from .seg_detector_asf import SegSpatialScaleDetector
+from .dice_loss import DiceLoss
+from .pss_loss import PSS_Loss
+from .l1_loss import MaskL1Loss
+from .balance_cross_entropy_loss import BalanceCrossEntropyLoss
--- a/easyocr/DBNet/decoders/balance_cross_entropy_loss.py
+++ b/easyocr/DBNet/decoders/balance_cross_entropy_loss.py
@@ -0,0 +1,56 @@
+import torch
+import torch.nn as nn
+
+
+class BalanceCrossEntropyLoss(nn.Module):
+    '''
+    Balanced cross entropy loss.
+    Shape:
+        - Input: :math:`(N, 1, H, W)`
+        - GT: :math:`(N, 1, H, W)`, same shape as the input
+        - Mask: :math:`(N, H, W)`, same spatial shape as the input
+        - Output: scalar.
+
+    Examples::
+
+        >>> m = nn.Sigmoid()
+        >>> loss = nn.BCELoss()
+        >>> input = torch.randn(3, requires_grad=True)
+        >>> target = torch.empty(3).random_(2)
+        >>> output = loss(m(input), target)
+        >>> output.backward()
+    '''
+
+    def __init__(self, negative_ratio=3.0, eps=1e-6):
+        super(BalanceCrossEntropyLoss, self).__init__()
+        self.negative_ratio = negative_ratio
+        self.eps = eps
+
+    def forward(self,
+                pred: torch.Tensor,
+                gt: torch.Tensor,
+                mask: torch.Tensor,
+                return_origin=False):
+        '''
+        Args:
+            pred: shape :math:`(N, 1, H, W)`, the prediction of network
+            gt: shape :math:`(N, 1, H, W)`, the target
+            mask: shape :math:`(N, H, W)`, the mask indicates positive regions
+        '''
+        positive = (gt[:,0,:,:] * mask).byte()
+        negative = ((1 - gt[:,0,:,:]) * mask).byte()
+        positive_count = int(positive.float().sum())
+        negative_count = min(int(negative.float().sum()),
+                            int(positive_count * self.negative_ratio))
+        loss = nn.functional.binary_cross_entropy(
+            pred, gt, reduction='none')[:, 0, :, :]
+        positive_loss = loss * positive.float()
+        negative_loss = loss * negative.float()
+        negative_loss, _ = torch.topk(negative_loss.view(-1), negative_count)
+
+        balance_loss = (positive_loss.sum() + negative_loss.sum()) /\
+            (positive_count + negative_count + self.eps)
+
+        if return_origin:
+            return balance_loss, loss
+        return balance_loss
--- a/easyocr/DBNet/decoders/dice_loss.py
+++ b/easyocr/DBNet/decoders/dice_loss.py
@@ -0,0 +1,186 @@
+import torch
+import torch.nn as nn
+import numpy as np
+import cv2
+from scipy import ndimage
+
+
+class DiceLoss(nn.Module):
+    '''
+    Loss function from https://arxiv.org/abs/1707.03237,
+    where iou computation is introduced heatmap manner to measure the
+    diversity bwtween tow heatmaps.
+    '''
+    def __init__(self, eps=1e-6):
+        super(DiceLoss, self).__init__()
+        self.eps = eps
+
+    def forward(self, pred: torch.Tensor, gt, mask, weights=None):
+        '''
+        pred: one or two heatmaps of shape (N, 1, H, W),
+            the losses of tow heatmaps are added together.
+        gt: (N, 1, H, W)
+        mask: (N, H, W)
+        '''
+        assert pred.dim() == 4, pred.dim()
+        return self._compute(pred, gt, mask, weights)
+
+    def _compute(self, pred, gt, mask, weights):
+        if pred.dim() == 4:
+            pred = pred[:, 0, :, :]
+            gt = gt[:, 0, :, :]
+        assert pred.shape == gt.shape
+        assert pred.shape == mask.shape
+        if weights is not None:
+            assert weights.shape == mask.shape
+            mask = weights * mask
+
+        intersection = (pred * gt * mask).sum()
+        union = (pred * mask).sum() + (gt * mask).sum() + self.eps
+        loss = 1 - 2.0 * intersection / union
+        assert loss <= 1
+        return loss
+
+
+class LeakyDiceLoss(nn.Module):
+    '''
+    Variation from DiceLoss.
+    The coverage and union are computed separately.
+    '''
+    def __init__(self, eps=1e-6, coverage_scale=5.0):
+        super(LeakyDiceLoss, self).__init__()
+        self.eps = eps
+        self.coverage_scale = coverage_scale
+
+    def forward(self, pred, gt, mask):
+        if pred.dim() == 4:
+            pred = pred[:, 0, :, :]
+            gt = gt[:, 0, :, :]
+        assert pred.shape == gt.shape
+        assert pred.shape == mask.shape
+
+        coverage = (pred * mask * gt).sum() / ((gt * mask).sum() + self.eps)
+        assert coverage <= 1
+        coverage = 1 - coverage
+        excede = (pred * mask * gt).sum() / ((pred * mask).sum() + self.eps)
+        assert excede <= 1
+        excede = 1 - excede
+        loss = coverage * self.coverage_scale + excede
+        return loss, dict(coverage=coverage, excede=excede)
+
+
+class InstanceDiceLoss(DiceLoss):
+    '''
+    DiceLoss normalized on each instance.
+    Input:
+        pred: (N, 1, H, W)
+        gt: (N, 1, H, W)
+        mask: (N, H, W)
+    Note: This class assume that input tensors are on gpu,
+        while cput computation is required to find union areas.
+    '''
+    REDUCTION = ['mean', 'sum', 'none']
+
+    def __init__(self, threshold=0.3, iou_thresh=0.2, reduction=None,
+                 max_regions=100, eps=1e-6):
+        nn.Module.__init__(self)
+        self.threshold = threshold
+        self.iou_thresh = iou_thresh
+        self.reduction = reduction
+        if self.reduction is None:
+            self.reduction = 'mean'
+        assert self.reduction in self.REDUCTION
+        self.max_regions = max_regions
+        self.eps = eps
+
+    def label(self, tensor_on_gpu, blur=None):
+        '''
+        Args:
+            tensor_on_gpu: (N, 1, H, W)
+            blur: Lambda. If exists, each instance will be blured using `blur`.
+        '''
+        tensor = tensor_on_gpu.cpu().detach().numpy()
+
+        instance_maps = []
+        instance_counts = []
+        for batch_index in range(tensor_on_gpu.shape[0]):
+            instance = tensor[batch_index]
+            if blur is not None:
+                instance = blur(instance)
+            lable_map, instance_count = ndimage.label(instance[0])
+            instance_count = min(self.max_regions, instance_count)
+            instance_map = []
+            for index in range(1, instance_count):
+                instance = torch.from_numpy(
+                        lable_map == index).to(tensor_on_gpu.device).type(torch.float32)
+                instance_map.append(instance)
+            instance_maps.append(instance_map)
+        return instance_maps, instance_counts
+
+    def iou(self, pred, gt):
+        overlap = (pred * gt).sum()
+        return max(overlap / pred.sum(), overlap / gt.sum())
+
+    def replace_or_add(self, dest, value):
+        if dest is None:
+            return value
+        if value is None:
+            return dest
+        return dest + value
+
+    def forward(self, pred, gt, mask):
+        # pred_label_maps: N, P, H, W, where P is the number of regions.
+        torch.cuda.synchronize()
+        pred_label_maps, _ = self.label(pred > self.threshold)
+        gt_label_maps, _ = self.label(gt)
+
+        losses = []
+        for batch_index, gt_instance_maps in enumerate(gt_label_maps):
+            pred_instance_maps = pred_label_maps[batch_index]
+            if gt_instance_maps is None or pred_instance_maps is None:
+                continue
+
+            single_loss = None  # loss on a single image in a batch
+            mask_not_matched = set(range(len(pred_instance_maps)))
+            for gt_instance_map in gt_instance_maps:
+                instance_loss = None  # loss on a specific gt region
+                for instance_index, pred_instance_map in enumerate(pred_instance_maps):
+                    if self.iou(pred_instance_map, gt_instance_map) > self.iou_thresh:
+                        match_loss = self._compute(
+                                pred[batch_index][0], gt[batch_index][0],
+                                mask[batch_index] * (pred_instance_map + gt_instance_map > 0).type(torch.float32))
+                        instance_loss = self.replace_or_add(instance_loss, match_loss)
+                        if instance_index in mask_not_matched:
+                            mask_not_matched.remove(instance_index)
+                if instance_loss is None:
+                    instance_loss = self._compute(
+                            pred[batch_index][0], gt[batch_index][0],
+                            mask[batch_index] * gt_instance_map)
+                single_loss = self.replace_or_add(single_loss, instance_loss)
+
+            '''Whether to compute single loss on instances which contrain no positive sample.
+            if single_loss is None:
+                single_loss = self._compute(
+                        pred[batch_index][0], gt[batch_index][0],
+                        mask[batch_index])
+            '''
+
+            for instance_index in mask_not_matched:
+                single_loss = self.replace_or_add(
+                        single_loss,
+                        self._compute(
+                            pred[batch_index][0], gt[batch_index][0],
+                            mask[batch_index] * pred_instance_maps[instance_index]))
+
+            if single_loss is not None:
+                losses.append(single_loss)
+
+        if self.reduction == 'none':
+            loss = losses
+        else:
+            assert self.reduction in ['sum', 'mean']
+            count = len(losses)
+            loss = sum(losses)
+            if self.reduction == 'mean':
+                loss = loss / count
+        return loss
--- a/easyocr/DBNet/decoders/feature_attention.py
+++ b/easyocr/DBNet/decoders/feature_attention.py
@@ -0,0 +1,145 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ScaleChannelAttention(nn.Module):
+    '''
+    Scale attention computed from globally pooled channel statistics.
+
+    The input is average-pooled to 1x1, passed through
+    conv1x1 -> BatchNorm -> ReLU -> conv1x1 and normalised with a softmax
+    over dimension 1, so the `num_features` scores of every sample sum to 1.
+
+    Args:
+        in_planes: Channels of the input feature map.
+        out_planes: Channels of the hidden bottleneck.
+        num_features: Number of scores produced per sample.
+        init_weight: If True, run `_initialize_weights` after construction.
+    '''
+    def __init__(self, in_planes, out_planes, num_features, init_weight=True):
+        super(ScaleChannelAttention, self).__init__()
+        # A leftover debugging `print(self.avgpool)` was removed from here: it
+        # wrote to stdout every time the module was constructed.
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc1 = nn.Conv2d(in_planes, out_planes, 1, bias=False)
+        self.bn = nn.BatchNorm2d(out_planes)
+        self.fc2 = nn.Conv2d(out_planes, num_features, 1, bias=False)
+        if init_weight:
+            self._initialize_weights()
+
+    def _initialize_weights(self):
+        '''Kaiming-initialise conv weights and reset batch-norm to weight 1 / bias 0.'''
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            if isinstance(m ,nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+    def forward(self, x):
+        '''
+        Return the softmax-normalised scale scores.
+
+        For a batched (N, in_planes, H, W) input the result has
+        `num_features` channels and a 1x1 spatial size.
+        '''
+        global_x = self.avgpool(x)
+        global_x = self.fc1(global_x)
+        global_x = F.relu(self.bn(global_x))
+        global_x = self.fc2(global_x)
+        # Softmax over dim 1: the scores compete across the output channels.
+        global_x = F.softmax(global_x, 1)
+        return global_x
+
+class ScaleChannelSpatialAttention(nn.Module):
+    '''
+    Attention that combines a channel gate with a spatial gate.
+
+    A squeeze branch (global average pool -> conv1x1 -> ReLU -> conv1x1,
+    followed by a sigmoid) is added to the input; the channel mean of that
+    sum drives a small single-channel conv stack whose sigmoid output is
+    added back, and a final conv1x1 + sigmoid maps the result to
+    `num_features` score channels.
+
+    Args:
+        in_planes: Channels of the input feature map.
+        out_planes: Hidden channels of the squeeze branch.
+        num_features: Number of score channels produced.
+        init_weight: If True, run `_initialize_weights` after construction.
+    '''
+    def __init__(self, in_planes, out_planes, num_features, init_weight=True):
+        super().__init__()
+        # Squeeze branch; its output has in_planes channels and 1x1 spatial size.
+        squeeze_layers = [
+            nn.AdaptiveAvgPool2d(1),
+            nn.Conv2d(in_planes, out_planes, 1, bias=False),
+            nn.ReLU(),
+            nn.Conv2d(out_planes, in_planes, 1, bias=False),
+        ]
+        self.channel_wise = nn.Sequential(*squeeze_layers)
+        # Spatial branch; it works on a single-channel map.
+        spatial_layers = [
+            nn.Conv2d(1, 1, 3, bias=False, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(1, 1, 1, bias=False),
+            nn.Sigmoid(),
+        ]
+        self.spatial_wise = nn.Sequential(*spatial_layers)
+        # Projection to one score channel per feature scale.
+        self.attention_wise = nn.Sequential(
+            nn.Conv2d(in_planes, num_features, 1, bias=False),
+            nn.Sigmoid(),
+        )
+        if init_weight:
+            self._initialize_weights()
+
+    def _initialize_weights(self):
+        '''Kaiming-initialise conv weights and reset batch-norm to weight 1 / bias 0.'''
+        for layer in self.modules():
+            if isinstance(layer, nn.BatchNorm2d):
+                nn.init.constant_(layer.weight, 1)
+                nn.init.constant_(layer.bias, 0)
+                continue
+            if not isinstance(layer, nn.Conv2d):
+                continue
+            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
+            if layer.bias is not None:
+                nn.init.constant_(layer.bias, 0)
+
+    def forward(self, x):
+        '''Return the score map produced by the channel + spatial attention.'''
+        # Channel gate (1x1 spatially) broadcast-added onto the input.
+        channel_gate = torch.sigmoid(self.channel_wise(x))
+        enhanced = channel_gate + x
+        # Collapse channels to a single map that feeds the spatial branch.
+        pooled = torch.mean(enhanced, dim=1, keepdim=True)
+        enhanced = self.spatial_wise(pooled) + enhanced
+        return self.attention_wise(enhanced)
+
+class ScaleSpatialAttention(nn.Module):
+    '''
+    Spatial attention that yields one score map per feature scale.
+
+    The channel mean of the input goes through a single-channel
+    conv3x3 -> ReLU -> conv1x1 -> sigmoid stack, is added back onto the
+    input, and a conv1x1 + sigmoid maps the sum to `num_features` channels.
+
+    Args:
+        in_planes: Channels of the input feature map.
+        out_planes: Accepted for signature parity with the sibling attention
+            modules; this class does not use it.
+        num_features: Number of score channels produced.
+        init_weight: If True, run `_initialize_weights` after construction.
+    '''
+    def __init__(self, in_planes, out_planes, num_features, init_weight=True):
+        super().__init__()
+        # Operates on the single-channel mean map.
+        spatial_layers = [
+            nn.Conv2d(1, 1, 3, bias=False, padding=1),
+            nn.ReLU(),
+            nn.Conv2d(1, 1, 1, bias=False),
+            nn.Sigmoid(),
+        ]
+        self.spatial_wise = nn.Sequential(*spatial_layers)
+        self.attention_wise = nn.Sequential(
+            nn.Conv2d(in_planes, num_features, 1, bias=False),
+            nn.Sigmoid(),
+        )
+        if init_weight:
+            self._initialize_weights()
+
+    def _initialize_weights(self):
+        '''Kaiming-initialise conv weights and reset batch-norm to weight 1 / bias 0.'''
+        for layer in self.modules():
+            if isinstance(layer, nn.BatchNorm2d):
+                nn.init.constant_(layer.weight, 1)
+                nn.init.constant_(layer.bias, 0)
+                continue
+            if not isinstance(layer, nn.Conv2d):
+                continue
+            nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
+            if layer.bias is not None:
+                nn.init.constant_(layer.bias, 0)
+
+    def forward(self, x):
+        '''Return the per-scale score map for input `x`.'''
+        channel_mean = torch.mean(x, dim=1, keepdim=True)
+        # The single-channel gate is broadcast over all input channels.
+        gated = self.spatial_wise(channel_mean) + x
+        return self.attention_wise(gated)
+
+class ScaleFeatureSelection(nn.Module):
+    '''
+    Re-weight a list of per-scale feature maps with a learned attention score.
+
+    The concatenated features go through a 3x3 conv and one of the attention
+    modules of this file; channel `i` of the resulting score multiplies
+    `features_list[i]`, and the weighted maps are concatenated along dim 1.
+
+    Args:
+        in_channels: Channels of the concatenated input.
+        inter_channels: Channels after the 3x3 conv, fed to the attention module.
+        out_features_num: Number of feature maps (and score channels).
+        attention_type: One of 'scale_spatial', 'scale_channel_spatial'
+            or 'scale_channel'.
+
+    Raises:
+        ValueError: If `attention_type` is not one of the supported names.
+    '''
+    def __init__(self, in_channels, inter_channels , out_features_num=4, attention_type='scale_spatial'):
+        super(ScaleFeatureSelection, self).__init__()
+        self.in_channels=in_channels
+        self.inter_channels = inter_channels
+        self.out_features_num = out_features_num
+        self.conv = nn.Conv2d(in_channels, inter_channels, 3, padding=1)
+        self.type = attention_type
+        if self.type == 'scale_spatial':
+            self.enhanced_attention = ScaleSpatialAttention(inter_channels, inter_channels//4, out_features_num)
+        elif self.type == 'scale_channel_spatial':
+            self.enhanced_attention = ScaleChannelSpatialAttention(inter_channels, inter_channels // 4, out_features_num)
+        elif self.type == 'scale_channel':
+            self.enhanced_attention = ScaleChannelAttention(inter_channels, inter_channels//2, out_features_num)
+        else:
+            # Previously an unknown name was accepted silently and only failed
+            # later in forward() with an AttributeError on `enhanced_attention`.
+            raise ValueError(
+                'Unsupported attention_type %r; expected one of '
+                "'scale_spatial', 'scale_channel_spatial', 'scale_channel'" % (attention_type,))
+
+    def _initialize_weights(self, m):
+        '''
+        Per-module initialiser in the style used with `Module.apply`.
+
+        Nothing in this class calls it.
+        '''
+        classname = m.__class__.__name__
+        if classname.find('Conv') != -1:
+            nn.init.kaiming_normal_(m.weight.data)
+        elif classname.find('BatchNorm') != -1:
+            m.weight.data.fill_(1.)
+            m.bias.data.fill_(1e-4)
+
+    def forward(self, concat_x, features_list):
+        '''
+        Args:
+            concat_x: Concatenated feature map with `in_channels` channels.
+            features_list: Sequence of exactly `out_features_num` feature maps.
+
+        Returns:
+            The attention-weighted feature maps concatenated along dim 1.
+        '''
+        concat_x = self.conv(concat_x)
+        score = self.enhanced_attention(concat_x)
+        assert len(features_list) == self.out_features_num
+        if self.type not in ['scale_channel_spatial', 'scale_spatial']:
+            # The channel-only attention yields a pooled score; resize it to
+            # the feature resolution. align_corners=False is what bilinear
+            # interpolation already defaulted to, stated explicitly to avoid
+            # the PyTorch warning.
+            shape = features_list[0].shape[2:]
+            score = F.interpolate(score, size=shape, mode='bilinear', align_corners=False)
+        x = []
+        for i in range(self.out_features_num):
+            # score[:, i:i+1] keeps the channel dim so it broadcasts over the
+            # channels of features_list[i].
+            x.append(score[:, i:i+1] * features_list[i])
+        return torch.cat(x, dim=1)
--- a/easyocr/DBNet/decoders/l1_loss.py
+++ b/easyocr/DBNet/decoders/l1_loss.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+
+class MaskL1Loss(nn.Module):
+    '''
+    Mask-weighted mean absolute error between channel 0 of `pred` and `gt`.
+
+    `forward` returns `(loss, metrics)` where `metrics` holds the same value
+    under the key `l1_loss`.
+    '''
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, pred: torch.Tensor, gt, mask):
+        total_weight = mask.sum()
+        # Empty mask: hand back the (zero) mask sum itself instead of dividing by it.
+        if total_weight.item() == 0:
+            return total_weight, {'l1_loss': total_weight}
+        abs_error = torch.abs(pred[:, 0] - gt)
+        value = (abs_error * mask).sum() / total_weight
+        return value, {'l1_loss': value}
+
+
+class BalanceL1Loss(nn.Module):
+    '''
+    L1 loss that averages masked (positive) pixels and the hardest unmasked
+    (negative) pixels separately and adds the two means.
+
+    Args:
+        negative_ratio: Maximum number of negative pixels kept per positive pixel.
+    '''
+    def __init__(self, negative_ratio=3.):
+        super(BalanceL1Loss, self).__init__()
+        self.negative_ratio = negative_ratio
+
+    def forward(self, pred: torch.Tensor, gt, mask):
+        '''
+        Args:
+            pred: (N, 1, H, W).
+            gt: (N, H, W).
+            mask: (N, H, W).
+
+        Returns:
+            `(loss, metrics)`; `metrics` holds the positive part under
+            `l1_loss` and the negative part under `nge_l1_loss`.
+        '''
+        loss = torch.abs(pred[:, 0] - gt)
+        positive = loss * mask
+        negative = loss * (1 - mask)
+        positive_count = int(mask.sum())
+        negative_count = min(
+                int((1 - mask).sum()),
+                int(positive_count * self.negative_ratio))
+        # Keep only the `negative_count` largest negative errors.
+        negative_loss, _ = torch.topk(negative.view(-1), negative_count)
+        # Only normalise when there is something to average: dividing by a
+        # zero count used to turn the whole loss into NaN.
+        negative_loss = negative_loss.sum()
+        if negative_count > 0:
+            negative_loss = negative_loss / negative_count
+        positive_loss = positive.sum()
+        if positive_count > 0:
+            positive_loss = positive_loss / positive_count
+        return positive_loss + negative_loss,\
+            dict(l1_loss=positive_loss, nge_l1_loss=negative_loss)
--- a/easyocr/DBNet/decoders/pss_loss.py
+++ b/easyocr/DBNet/decoders/pss_loss.py
@@ -0,0 +1,115 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PSS_Loss(nn.Module):
+    '''
+    Collection of masked segmentation criteria selectable by name.
+
+    Args:
+        cls_loss: Prefix of the criterion method to use; `<cls_loss>_loss`
+            must be a method of this class ('dice', 'dice_ohnm', 'focal',
+            'wbce_orig', 'wbce', 'bce', 'dice_bce', 'dice_ohnm_bce').
+
+    Every criterion takes `(pred, gt, m)` where `m` is the mask.
+    '''
+
+    def __init__(self, cls_loss):
+        super(PSS_Loss, self).__init__()
+        self.eps = 1e-6
+        # Plain attribute lookup instead of eval(): same result for valid
+        # names (AttributeError for unknown ones) without executing text.
+        self.criterion = getattr(self, cls_loss + '_loss')
+
+    def dice_loss(self, pred, gt, m):
+        '''Masked dice loss: 1 - 2 * |pred * gt| / (|pred| + |gt| + eps).'''
+        intersection = torch.sum(pred*gt*m)
+        union = torch.sum(pred*m) + torch.sum(gt*m) + self.eps
+        loss = 1 - 2.0*intersection/union
+        if loss > 1:
+            # Diagnostic output kept from the original implementation.
+            print(intersection, union)
+        return loss
+
+    def dice_ohnm_loss(self, pred, gt, m):
+        '''
+        Dice loss on all positives plus the highest-scoring negatives
+        (3 negatives per positive). Falls back to `dice_loss` when there are
+        no positives or fewer than 3x as many negatives.
+        '''
+        pos_index = (gt == 1) * (m == 1)
+        neg_index = (gt == 0) * (m == 1)
+        pos_num = pos_index.float().sum().item()
+        neg_num = neg_index.float().sum().item()
+        if pos_num == 0 or neg_num < pos_num*3.0:
+            return self.dice_loss(pred, gt, m)
+        else:
+            neg_num = int(pos_num*3)
+            pos_pred = pred[pos_index]
+            neg_pred = pred[neg_index]
+            neg_sort, _ = torch.sort(neg_pred, descending=True)
+            sampled_neg_pred = neg_sort[:neg_num]
+            pos_gt = pos_pred.clone()
+            pos_gt.data.fill_(1.0)
+            pos_gt = pos_gt.detach()
+            neg_gt = sampled_neg_pred.clone()
+            neg_gt.data.fill_(0)
+            neg_gt = neg_gt.detach()
+            tpred = torch.cat((pos_pred, sampled_neg_pred))
+            tgt = torch.cat((pos_gt, neg_gt))
+            intersection = torch.sum(tpred * tgt)
+            # NOTE(review): the union uses the full `gt`, not the sampled
+            # `tgt` - kept as in the original, confirm this is intended.
+            union = torch.sum(tpred) + torch.sum(gt) + self.eps
+            loss = 1 - 2.0 * intersection / union
+        return loss
+
+    def focal_loss(self, pred, gt, m, alpha=0.25, gamma=0.6):
+        '''Masked, focal-weighted binary cross entropy, scaled by 10.'''
+        pos_mask = (gt == 1).float()
+        neg_mask = (gt == 0).float()
+        # The weights are built from pred.data, so no gradient flows through them.
+        mask = alpha*pos_mask * \
+            torch.pow(1-pred.data, gamma)+(1-alpha) * \
+            neg_mask*torch.pow(pred.data, gamma)
+        per_pixel = F.binary_cross_entropy(pred, gt, weight=mask, reduction='none')
+        loss = torch.sum(per_pixel*m)/(self.eps+m.sum())
+        loss *= 10
+        return loss
+
+    def wbce_orig_loss(self, pred, gt, m):
+        '''
+        Class-balanced BCE over the pixels selected by `m`: positives are
+        weighted by the negative count and vice versa, both divided by the
+        number of selected pixels.
+        '''
+        # Unpacking enforces a 3-D `pred`; the names themselves are unused.
+        n, h, w = pred.size()
+        assert (torch.max(gt) == 1)
+        # Same selection as the original byte mask, but indexed with a bool
+        # tensor because uint8 indexing is deprecated.
+        keep = m.byte().bool()
+        pos_neg_p = pred[keep]
+        pos_neg_t = gt[keep]
+        # Work in float: `1 - <bool tensor>` raises on current PyTorch.
+        pos_mask = (pos_neg_t == 1).squeeze().float()
+        neg_mask = 1 - pos_mask
+        w = pos_mask * neg_mask.sum().item() + \
+            neg_mask * pos_mask.sum().item()
+        w = w / (pos_mask.size(0))
+        loss = F.binary_cross_entropy(pos_neg_p, pos_neg_t, w, reduction='sum')
+        return loss
+
+    def wbce_loss(self, pred, gt, m):
+        '''Masked BCE where positives are up-weighted by the neg/pos ratio.'''
+        pos_mask = (gt == 1).float()*m
+        neg_mask = (gt == 0).float()*m
+        # mask=(pos_mask*neg_mask.sum()+neg_mask*pos_mask.sum())/m.sum()
+        # loss=torch.sum(l)
+        mask = pos_mask * neg_mask.sum() / pos_mask.sum() + neg_mask
+        per_pixel = F.binary_cross_entropy(pred, gt, weight=mask, reduction='none')
+        loss = torch.sum(per_pixel)/(m.sum()+self.eps)
+        return loss
+
+    def bce_loss(self, pred, gt, m):
+        '''Masked BCE averaged over the mask sum.'''
+        total = F.binary_cross_entropy(pred, gt, weight=m, reduction='sum')
+        loss = total/(m.sum()+self.eps)
+        return loss
+
+    def dice_bce_loss(self, pred, gt, m):
+        '''Mean of `dice_loss` and `bce_loss`.'''
+        return (self.dice_loss(pred, gt, m) + self.bce_loss(pred, gt, m)) / 2.0
+
+    def dice_ohnm_bce_loss(self, pred, gt, m):
+        '''Mean of `dice_ohnm_loss` and `bce_loss`.'''
+        return (self.dice_ohnm_loss(pred, gt, m) + self.bce_loss(pred, gt, m)) / 2.0
+
+    def forward(self, pred, gt, mask, gt_type='shrink'):
+        '''
+        Args:
+            pred: Prediction indexed as (N, C, H, W).
+            gt: Ground truth indexed as (N, C', H, W).
+            mask: Mask handed to the criterion.
+            gt_type: 'shrink' (criterion summed over all channels), 'pss'
+                (first four gt channels plus the channel-max of `pred`
+                against gt channel 4) or 'both' (returns
+                `(pss_loss, shrink_loss)`).
+
+        Raises:
+            NotImplementedError: For any other `gt_type`.
+        '''
+        if gt_type == 'shrink':
+            loss = self.get_loss(pred, gt, mask)
+            return loss
+        elif gt_type == 'pss':
+            loss = self.get_loss(pred, gt[:, :4, :, :], mask)
+            g_g = gt[:, 4, :, :]
+            g_p, _ = torch.max(pred, 1)
+            loss = loss + self.criterion(g_p, g_g, mask)
+            return loss
+        elif gt_type == 'both':
+            pss_loss = self.get_loss(pred[:, :4, :, :], gt[:, :4, :, :], mask)
+            g_g = gt[:, 4, :, :]
+            g_p, _ = torch.max(pred, 1)
+            pss_loss = pss_loss + self.criterion(g_p, g_g, mask)
+            shrink_loss = self.criterion(
+                pred[:, 4, :, :], gt[:, 5, :, :], mask)
+            return pss_loss, shrink_loss
+        else:
+            # The original *returned* the exception object (with an
+            # unformatted message), so callers silently got a non-tensor.
+            raise NotImplementedError('gt_type [%s] is not implemented' % gt_type)
+
+    def get_loss(self, pred, gt, mask):
+        '''Sum the criterion over every channel of `pred` (dim 1).'''
+        # Start from a zero on pred's device/dtype; a bare torch.tensor(0.)
+        # lives on the CPU and breaks accumulation for CUDA inputs.
+        loss = pred.new_zeros(())
+        for ind in range(pred.size(1)):
+            loss = loss + self.criterion(pred[:, ind, :, :], gt[:, ind, :, :], mask)
+        return loss
--- a/easyocr/DBNet/decoders/seg_detector.py
+++ b/easyocr/DBNet/decoders/seg_detector.py
@@ -0,0 +1,152 @@
+from collections import OrderedDict
+
+import torch
+import torch.nn as nn
+BatchNorm2d = nn.BatchNorm2d
+
+class SegDetector(nn.Module):
+    '''
+    FPN-style segmentation head: fuses four backbone feature maps and
+    predicts a probability map, optionally with an adaptive threshold map.
+    '''
+    def __init__(self,
+                 in_channels=[64, 128, 256, 512],
+                 inner_channels=256, k=10,
+                 bias=False, adaptive=False, smooth=False, serial=False,
+                 *args, **kwargs):
+        '''
+        in_channels: Channels of the four input feature maps (c2..c5).
+        inner_channels: Channels of the lateral (top-down) features.
+        k: Steepness used by `step_function`.
+        bias: Whether conv layers have bias or not.
+        adaptive: Whether to use adaptive threshold training or not.
+        smooth: If true, the threshold branch upsamples with nearest-neighbour
+            `nn.Upsample` followed by convs instead of a transposed conv.
+        serial: If true, thresh prediction will combine segmentation result as input.
+        '''
+        super(SegDetector, self).__init__()
+        self.k = k
+        self.serial = serial
+        self.up5 = nn.Upsample(scale_factor=2, mode='nearest')
+        self.up4 = nn.Upsample(scale_factor=2, mode='nearest')
+        self.up3 = nn.Upsample(scale_factor=2, mode='nearest')
+
+        # 1x1 lateral convs bringing every level to `inner_channels`.
+        self.in5 = nn.Conv2d(in_channels[-1], inner_channels, 1, bias=bias)
+        self.in4 = nn.Conv2d(in_channels[-2], inner_channels, 1, bias=bias)
+        self.in3 = nn.Conv2d(in_channels[-3], inner_channels, 1, bias=bias)
+        self.in2 = nn.Conv2d(in_channels[-4], inner_channels, 1, bias=bias)
+
+        # Output convs reduce to inner_channels // 4 and upsample every level
+        # to the resolution of the finest one (out2).
+        self.out5 = nn.Sequential(
+            nn.Conv2d(inner_channels, inner_channels //
+                      4, 3, padding=1, bias=bias),
+            nn.Upsample(scale_factor=8, mode='nearest'))
+        self.out4 = nn.Sequential(
+            nn.Conv2d(inner_channels, inner_channels //
+                      4, 3, padding=1, bias=bias),
+            nn.Upsample(scale_factor=4, mode='nearest'))
+        self.out3 = nn.Sequential(
+            nn.Conv2d(inner_channels, inner_channels //
+                      4, 3, padding=1, bias=bias),
+            nn.Upsample(scale_factor=2, mode='nearest'))
+        self.out2 = nn.Conv2d(
+            inner_channels, inner_channels//4, 3, padding=1, bias=bias)
+
+        self.binarize = nn.Sequential(
+            nn.Conv2d(inner_channels, inner_channels //
+                      4, 3, padding=1, bias=bias),
+            BatchNorm2d(inner_channels//4),
+            nn.ReLU(inplace=True),
+            nn.ConvTranspose2d(inner_channels//4, inner_channels//4, 2, 2),
+            BatchNorm2d(inner_channels//4),
+            nn.ReLU(inplace=True),
+            nn.ConvTranspose2d(inner_channels//4, 1, 2, 2),
+            nn.Sigmoid())
+        self.binarize.apply(self.weights_init)
+
+        self.adaptive = adaptive
+        if adaptive:
+            self.thresh = self._init_thresh(
+                    inner_channels, serial=serial, smooth=smooth, bias=bias)
+            self.thresh.apply(self.weights_init)
+
+        self.in5.apply(self.weights_init)
+        self.in4.apply(self.weights_init)
+        self.in3.apply(self.weights_init)
+        self.in2.apply(self.weights_init)
+        self.out5.apply(self.weights_init)
+        self.out4.apply(self.weights_init)
+        self.out3.apply(self.weights_init)
+        self.out2.apply(self.weights_init)
+
+    def weights_init(self, m):
+        '''
+        Initialiser for `Module.apply`: Kaiming-normal weights for classes
+        whose name contains 'Conv', weight 1 / bias 1e-4 for 'BatchNorm'.
+        '''
+        classname = m.__class__.__name__
+        if classname.find('Conv') != -1:
+            nn.init.kaiming_normal_(m.weight.data)
+        elif classname.find('BatchNorm') != -1:
+            m.weight.data.fill_(1.)
+            m.bias.data.fill_(1e-4)
+
+    def _init_thresh(self, inner_channels,
+                     serial=False, smooth=False, bias=False):
+        '''Build (and store in `self.thresh`) the threshold-prediction branch.'''
+        in_channels = inner_channels
+        if serial:
+            # One extra channel for the segmentation map fed back in forward().
+            in_channels += 1
+        self.thresh = nn.Sequential(
+            nn.Conv2d(in_channels, inner_channels //
+                      4, 3, padding=1, bias=bias),
+            BatchNorm2d(inner_channels//4),
+            nn.ReLU(inplace=True),
+            self._init_upsample(inner_channels // 4, inner_channels//4, smooth=smooth, bias=bias),
+            BatchNorm2d(inner_channels//4),
+            nn.ReLU(inplace=True),
+            self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
+            nn.Sigmoid())
+        return self.thresh
+
+    def _init_upsample(self,
+                       in_channels, out_channels,
+                       smooth=False, bias=False):
+        '''
+        Return a 2x upsampling module: a transposed conv by default, or
+        nearest-neighbour upsampling followed by conv(s) when `smooth` is set.
+        '''
+        if smooth:
+            inter_out_channels = out_channels
+            if out_channels == 1:
+                inter_out_channels = in_channels
+            module_list = [
+                    nn.Upsample(scale_factor=2, mode='nearest'),
+                    nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)]
+            if out_channels == 1:
+                # padding must be 0 for a 1x1 kernel: padding=1 grew the map
+                # by 2 pixels per dimension, so the threshold map could not
+                # match `binary` in step_function().
+                module_list.append(
+                    nn.Conv2d(in_channels, out_channels,
+                              kernel_size=1, stride=1, padding=0, bias=True))
+
+            # nn.Sequential takes modules as positional arguments; passing
+            # the list itself raised a TypeError.
+            return nn.Sequential(*module_list)
+        else:
+            return nn.ConvTranspose2d(in_channels, out_channels, 2, 2)
+
+    def forward(self, features, gt=None, masks=None, training=False):
+        '''
+        Args:
+            features: The four feature maps (c2, c3, c4, c5), finest first.
+            gt, masks, training: Accepted but not used by this method; the
+                module's own `self.training` flag selects the output.
+
+        Returns:
+            In eval mode the `binary` probability map. In training mode an
+            OrderedDict with `binary` and, when `adaptive` is set, also
+            `thresh` and `thresh_binary`.
+        '''
+        c2, c3, c4, c5 = features
+        in5 = self.in5(c5)
+        in4 = self.in4(c4)
+        in3 = self.in3(c3)
+        in2 = self.in2(c2)
+
+        out4 = self.up5(in5) + in4  # 1/16
+        out3 = self.up4(out4) + in3  # 1/8
+        out2 = self.up3(out3) + in2  # 1/4
+
+        p5 = self.out5(in5)
+        p4 = self.out4(out4)
+        p3 = self.out3(out3)
+        p2 = self.out2(out2)
+
+        fuse = torch.cat((p5, p4, p3, p2), 1)
+        # this is the pred module, not binarization module;
+        # We do not correct the name due to the trained model.
+        binary = self.binarize(fuse)
+        if self.training:
+            result = OrderedDict(binary=binary)
+        else:
+            return binary
+        if self.adaptive and self.training:
+            if self.serial:
+                # Append the prediction, resized to the fused resolution.
+                fuse = torch.cat(
+                        (fuse, nn.functional.interpolate(
+                            binary, fuse.shape[2:])), 1)
+            thresh = self.thresh(fuse)
+            thresh_binary = self.step_function(binary, thresh)
+            result.update(thresh=thresh, thresh_binary=thresh_binary)
+        return result
+
+    def step_function(self, x, y):
+        '''Sigmoid of `k * (x - y)`: 1 / (1 + exp(-k * (x - y))).'''
+        return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
--- a/easyocr/DBNet/decoders/seg_detector_asf.py
+++ b/easyocr/DBNet/decoders/seg_detector_asf.py
@@ -0,0 +1,163 @@
+from collections import OrderedDict
+import pdb
+import torch
+import torch.nn as nn
+from .feature_attention import ScaleFeatureSelection
+BatchNorm2d = nn.BatchNorm2d
+
+
+class SegSpatialScaleDetector(nn.Module):
+    '''
+    Segmentation head like `SegDetector`, with a `ScaleFeatureSelection`
+    attention step applied to the fused multi-scale features.
+    '''
+    def __init__(self,
+                 in_channels=[64, 128, 256, 512],
+                 inner_channels=256, k=10,
+                 bias=False, adaptive=False, smooth=False, serial=False,fpn=True, attention_type='scale_spatial',
+                 *args, **kwargs):
+        '''
+        in_channels: Channels of the four input feature maps (c2..c5).
+        inner_channels: Channels of the lateral (top-down) features.
+        k: Steepness used by `step_function`.
+        bias: Whether conv layers have bias or not.
+        adaptive: Whether to use adaptive threshold training or not.
+        smooth: If true, the threshold branch upsamples with nearest-neighbour
+            `nn.Upsample` followed by convs instead of a transposed conv.
+        serial: If true, thresh prediction will combine segmentation result as input.
+        fpn: If true, build the out2..out5 output convs.
+            NOTE(review): forward() uses out2..out5 unconditionally, so
+            fpn=False looks unusable as written - confirm before relying on it.
+        attention_type: Forwarded to `ScaleFeatureSelection` (fpn branch only).
+        '''
+        super(SegSpatialScaleDetector, self).__init__()
+        self.k = k
+        self.serial = serial
+        self.fpn = fpn
+        self.up5 = nn.Upsample(scale_factor=2, mode='nearest')
+        self.up4 = nn.Upsample(scale_factor=2, mode='nearest')
+        self.up3 = nn.Upsample(scale_factor=2, mode='nearest')
+
+        # 1x1 lateral convs bringing every level to `inner_channels`.
+        self.in5 = nn.Conv2d(in_channels[-1], inner_channels, 1, bias=bias)
+        self.in4 = nn.Conv2d(in_channels[-2], inner_channels, 1, bias=bias)
+        self.in3 = nn.Conv2d(in_channels[-3], inner_channels, 1, bias=bias)
+        self.in2 = nn.Conv2d(in_channels[-4], inner_channels, 1, bias=bias)
+
+        if self.fpn:
+            self.out5 = nn.Sequential(
+                nn.Conv2d(inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+                nn.Upsample(scale_factor=8, mode='nearest'))
+            self.out4 = nn.Sequential(
+                nn.Conv2d(inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+                nn.Upsample(scale_factor=4, mode='nearest'))
+            self.out3 = nn.Sequential(
+                nn.Conv2d(inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+                nn.Upsample(scale_factor=2, mode='nearest'))
+            self.out2 = nn.Conv2d(inner_channels, inner_channels//4, 3, padding=1, bias=bias)
+            self.out5.apply(self.weights_init)
+            self.out4.apply(self.weights_init)
+            self.out3.apply(self.weights_init)
+            self.out2.apply(self.weights_init)
+
+            self.concat_attention = ScaleFeatureSelection(inner_channels, inner_channels//4, attention_type=attention_type)
+            self.binarize = nn.Sequential(
+                nn.Conv2d(inner_channels, inner_channels // 4, 3, bias=bias, padding=1),
+                BatchNorm2d(inner_channels//4),
+                nn.ReLU(inplace=True),
+                nn.ConvTranspose2d(inner_channels//4, inner_channels//4, 2, 2),
+                BatchNorm2d(inner_channels//4),
+                nn.ReLU(inplace=True),
+                nn.ConvTranspose2d(inner_channels//4, 1, 2, 2),
+                nn.Sigmoid())
+        else:
+            self.concat_attention = ScaleFeatureSelection(inner_channels, inner_channels//4, )
+            self.binarize = nn.Sequential(
+                nn.Conv2d(inner_channels, inner_channels // 4, 3, bias=bias, padding=1),
+                BatchNorm2d(inner_channels//4),
+                nn.ReLU(inplace=True),
+                nn.ConvTranspose2d(inner_channels//4, inner_channels//4, 2, 2),
+                BatchNorm2d(inner_channels//4),
+                nn.ReLU(inplace=True),
+                nn.ConvTranspose2d(inner_channels//4, 1, 2, 2),
+                nn.Sigmoid())
+
+        self.binarize.apply(self.weights_init)
+        self.adaptive = adaptive
+        if adaptive:
+            self.thresh = self._init_thresh(
+                    inner_channels, serial=serial, smooth=smooth, bias=bias)
+            self.thresh.apply(self.weights_init)
+
+        self.in5.apply(self.weights_init)
+        self.in4.apply(self.weights_init)
+        self.in3.apply(self.weights_init)
+        self.in2.apply(self.weights_init)
+
+    def weights_init(self, m):
+        '''
+        Initialiser for `Module.apply`: Kaiming-normal weights for classes
+        whose name contains 'Conv', weight 1 / bias 1e-4 for 'BatchNorm'.
+        '''
+        classname = m.__class__.__name__
+        if classname.find('Conv') != -1:
+            nn.init.kaiming_normal_(m.weight.data)
+        elif classname.find('BatchNorm') != -1:
+            m.weight.data.fill_(1.)
+            m.bias.data.fill_(1e-4)
+
+    def _init_thresh(self, inner_channels,
+                     serial=False, smooth=False, bias=False):
+        '''Build (and store in `self.thresh`) the threshold-prediction branch.'''
+        in_channels = inner_channels
+        if serial:
+            # One extra channel for the segmentation map fed back in forward().
+            in_channels += 1
+        self.thresh = nn.Sequential(
+            nn.Conv2d(in_channels, inner_channels //
+                      4, 3, padding=1, bias=bias),
+            BatchNorm2d(inner_channels//4),
+            nn.ReLU(inplace=True),
+            self._init_upsample(inner_channels // 4, inner_channels//4, smooth=smooth, bias=bias),
+            BatchNorm2d(inner_channels//4),
+            nn.ReLU(inplace=True),
+            self._init_upsample(inner_channels // 4, 1, smooth=smooth, bias=bias),
+            nn.Sigmoid())
+        return self.thresh
+
+    def _init_upsample(self,
+                       in_channels, out_channels,
+                       smooth=False, bias=False):
+        '''
+        Return a 2x upsampling module: a transposed conv by default, or
+        nearest-neighbour upsampling followed by conv(s) when `smooth` is set.
+        '''
+        if smooth:
+            inter_out_channels = out_channels
+            if out_channels == 1:
+                inter_out_channels = in_channels
+            module_list = [
+                    nn.Upsample(scale_factor=2, mode='nearest'),
+                    nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)]
+            if out_channels == 1:
+                # padding must be 0 for a 1x1 kernel: padding=1 grew the map
+                # by 2 pixels per dimension, so the threshold map could not
+                # match `binary` in step_function().
+                module_list.append(
+                    nn.Conv2d(in_channels, out_channels,
+                              kernel_size=1, stride=1, padding=0, bias=True))
+
+            # nn.Sequential takes modules as positional arguments; passing
+            # the list itself raised a TypeError.
+            return nn.Sequential(*module_list)
+        else:
+            return nn.ConvTranspose2d(in_channels, out_channels, 2, 2)
+
+    def forward(self, features, gt=None, masks=None, training=False):
+        '''
+        Args:
+            features: The four feature maps (c2, c3, c4, c5), finest first.
+            gt, masks, training: Accepted but not used by this method; the
+                module's own `self.training` flag selects the output.
+
+        Returns:
+            In eval mode the `binary` probability map. In training mode an
+            OrderedDict with `binary` and, when `adaptive` is set, also
+            `thresh` and `thresh_binary`.
+        '''
+        c2, c3, c4, c5 = features
+        in5 = self.in5(c5)
+        in4 = self.in4(c4)
+        in3 = self.in3(c3)
+        in2 = self.in2(c2)
+
+        out4 = self.up5(in5) + in4  # 1/16
+        out3 = self.up4(out4) + in3  # 1/8
+        out2 = self.up3(out3) + in2  # 1/4
+        p5 = self.out5(in5)
+        p4 = self.out4(out4)
+        p3 = self.out3(out3)
+        p2 = self.out2(out2)
+
+        fuse = torch.cat((p5, p4, p3, p2), 1)
+        # Re-weight every scale with the learned attention scores.
+        fuse = self.concat_attention(fuse, [p5, p4, p3, p2])
+        # this is the pred module, not binarization module;
+        # We do not correct the name due to the trained model.
+        binary = self.binarize(fuse)
+        if self.training:
+            result = OrderedDict(binary=binary)
+        else:
+            return binary
+        if self.adaptive and self.training:
+            if self.serial:
+                # Append the prediction, resized to the fused resolution.
+                fuse = torch.cat(
+                        (fuse, nn.functional.interpolate(
+                            binary, fuse.shape[2:])), 1)
+            thresh = self.thresh(fuse)
+            thresh_binary = self.step_function(binary, thresh)
+            result.update(thresh=thresh, thresh_binary=thresh_binary)
+        return result
+
+    def step_function(self, x, y):
+        '''Sigmoid of `k * (x - y)`: 1 / (1 + exp(-k * (x - y))).'''
+        return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
--- a/easyocr/DBNet/decoders/seg_detector_loss.py
+++ b/easyocr/DBNet/decoders/seg_detector_loss.py
@@ -0,0 +1,264 @@
+import sys
+
+import torch
+import torch.nn as nn
+
+
+class SegDetectorLossBuilder():
+    '''
+    Factory that instantiates one of the loss classes of this module by name.
+
+    The built losses are called as `loss(pred, batch)`:
+        pred: A dict of predictions.
+            binary: The text segmentation prediction.
+            thresh: The threshold prediction.
+            thresh_binary: Value produced by `step_function(binary - thresh)`.
+        batch: A dict of targets.
+            gt: Text regions bitmap gt.
+            mask: Ignore mask (the original note says pixels with value 1 do
+                not contribute to the loss - NOTE(review): confirm the
+                convention against the loss implementations).
+            thresh_mask: Mask of the regions supervised for thresh.
+            thresh_map: Threshold gt.
+    and return `(loss, metrics)`:
+        loss: A scalar loss value.
+        metrics: A dict containing partial loss values.
+    '''
+
+    def __init__(self, loss_class, *args, **kwargs):
+        # Remember the class name and the constructor arguments for build().
+        self.loss_class, self.loss_args, self.loss_kwargs = loss_class, args, kwargs
+
+    def build(self):
+        '''Look up `loss_class` in this module and instantiate it.'''
+        this_module = sys.modules[__name__]
+        loss_factory = getattr(this_module, self.loss_class)
+        return loss_factory(*self.loss_args, **self.loss_kwargs)
+
+
+class DiceLoss(nn.Module):
+    '''
+    Dice loss on the `binary` prediction only.
+    Intended for SegDetector without the adaptive module.
+    '''
+
+    def __init__(self, eps=1e-6):
+        super().__init__()
+        # Imported lazily and aliased: this wrapper shares the name DiceLoss.
+        from .dice_loss import DiceLoss as Loss
+        self.loss = Loss(eps)
+
+    def forward(self, pred, batch):
+        '''Return `(loss, metrics)` with the loss repeated under `dice_loss`.'''
+        value = self.loss(pred['binary'], batch['gt'], batch['mask'])
+        return value, {'dice_loss': value}
+
+
+class BalanceBCELoss(nn.Module):
+    '''
+    BalanceCrossEntropyLoss on the `binary` prediction only.
+    Intended for SegDetector without the adaptive module.
+
+    `eps` is accepted but not used by this wrapper.
+    '''
+
+    def __init__(self, eps=1e-6):
+        super().__init__()
+        from .balance_cross_entropy_loss import BalanceCrossEntropyLoss
+        self.loss = BalanceCrossEntropyLoss()
+
+    def forward(self, pred, batch):
+        '''
+        Return `(loss, metrics)`. The metric is reported under the key
+        `dice_loss` even though it is a cross-entropy value.
+        '''
+        value = self.loss(pred['binary'], batch['gt'], batch['mask'])
+        return value, {'dice_loss': value}
+
+
+class AdaptiveDiceLoss(nn.Module):
+    '''
+    Sum of two dice losses: one on the `binary` prediction and one on the
+    `thresh_binary` prediction, both against the same gt and mask.
+    '''
+
+    def __init__(self, eps=1e-6):
+        super().__init__()
+        from .dice_loss import DiceLoss
+        self.main_loss = DiceLoss(eps)
+        self.thresh_loss = DiceLoss(eps)
+
+    def forward(self, pred, batch):
+        '''Return `(loss, metrics)`; metrics holds `main_loss` and `thresh_loss`.'''
+        assert isinstance(pred, dict)
+        for required in ('binary', 'thresh_binary'):
+            assert required in pred
+
+        binary_map = pred['binary']
+        thresh_binary_map = pred['thresh_binary']
+        target = batch['gt']
+        ignore = batch['mask']
+        main_value = self.main_loss(binary_map, target, ignore)
+        thresh_value = self.thresh_loss(thresh_binary_map, target, ignore)
+        return main_value + thresh_value, {
+            'main_loss': main_value,
+            'thresh_loss': thresh_value,
+        }
+
+
+class AdaptiveInstanceDiceLoss(nn.Module):
+    '''
+    DiceLoss and InstanceDiceLoss on both binary and thresh_binary,
+    combined with four learnable weights.
+    '''
+
+    def __init__(self, iou_thresh=0.2, thresh=0.3):
+        # NOTE(review): `iou_thresh` and `thresh` are accepted but never
+        # passed to the InstanceDiceLoss instances below - confirm intent.
+        super(AdaptiveInstanceDiceLoss, self).__init__()
+        from .dice_loss import InstanceDiceLoss, DiceLoss
+        self.main_loss = DiceLoss()
+        self.main_instance_loss = InstanceDiceLoss()
+        self.thresh_loss = DiceLoss()
+        self.thresh_instance_loss = InstanceDiceLoss()
+        # One learnable weight per partial loss, each initialised to 1.
+        self.weights = nn.ParameterDict(dict(
+            main=nn.Parameter(torch.ones(1)),
+            thresh=nn.Parameter(torch.ones(1)),
+            main_instance=nn.Parameter(torch.ones(1)),
+            thresh_instance=nn.Parameter(torch.ones(1))))
+
+    def partial_loss(self, weight, loss):
+        # Scale the loss by 1 / weight and add log(sqrt(weight)) as a penalty term.
+        return loss / weight + torch.log(torch.sqrt(weight))
+
+    def forward(self, pred, batch):
+        # Returns (loss, metrics): the weighted sum of the four partial
+        # losses, and a dict holding each raw partial loss plus the weights.
+        main_loss = self.main_loss(pred['binary'], batch['gt'], batch['mask'])
+        thresh_loss = self.thresh_loss(pred['thresh_binary'], batch['gt'], batch['mask'])
+        main_instance_loss = self.main_instance_loss(
+            pred['binary'], batch['gt'], batch['mask'])
+        thresh_instance_loss = self.thresh_instance_loss(
+            pred['thresh_binary'], batch['gt'], batch['mask'])
+        loss = self.partial_loss(self.weights['main'], main_loss) \
+               + self.partial_loss(self.weights['thresh'], thresh_loss) \
+               + self.partial_loss(self.weights['main_instance'], main_instance_loss) \
+               + self.partial_loss(self.weights['thresh_instance'], thresh_instance_loss)
+        metrics = dict(
+            main_loss=main_loss,
+            thresh_loss=thresh_loss,
+            main_instance_loss=main_instance_loss,
+            thresh_instance_loss=thresh_instance_loss)
+        # Also expose the current weights (keys: main, thresh, main_instance,
+        # thresh_instance) in the metrics.
+        metrics.update(self.weights)
+        return loss, metrics
+
+
+class L1DiceLoss(nn.Module):
+    '''
+    AdaptiveDiceLoss on `binary` / `thresh_binary` plus a scaled
+    MaskL1Loss on `thresh`.
+    '''
+
+    def __init__(self, eps=1e-6, l1_scale=10):
+        super().__init__()
+        self.dice_loss = AdaptiveDiceLoss(eps=eps)
+        from .l1_loss import MaskL1Loss
+        self.l1_loss = MaskL1Loss()
+        self.l1_scale = l1_scale
+
+    def forward(self, pred, batch):
+        '''Return `(loss, metrics)`; the L1 metrics are merged into the dice metrics.'''
+        dice_value, metrics = self.dice_loss(pred, batch)
+        l1_value, l1_metric = self.l1_loss(
+            pred['thresh'], batch['thresh_map'], batch['thresh_mask'])
+        metrics.update(**l1_metric)
+        total = dice_value + self.l1_scale * l1_value
+        return total, metrics
+
+
+class FullL1DiceLoss(L1DiceLoss):
+    '''
+    L1 loss on thresh, where pixels with top-k losses in non-text regions are
+    also counted (BalanceL1Loss instead of MaskL1Loss).
+    DiceLoss on thresh_binary and binary.
+    The forward pass is inherited from L1DiceLoss.
+    '''
+
+    def __init__(self, eps=1e-6, l1_scale=10):
+        # Calls nn.Module.__init__ directly, skipping L1DiceLoss.__init__,
+        # and sets the same three attributes with a different l1_loss.
+        nn.Module.__init__(self)
+        self.dice_loss = AdaptiveDiceLoss(eps=eps)
+        from .l1_loss import BalanceL1Loss
+        self.l1_loss = BalanceL1Loss()
+        self.l1_scale = l1_scale
+
+
+class L1BalanceCELoss(nn.Module):
+    '''
+    Balanced CrossEntropy Loss on `binary`,
+    MaskL1Loss on `thresh`,
+    DiceLoss on `thresh_binary`.
+    The two latter terms are only used when `pred` contains `thresh`.
+    Note: The meaning of inputs can be figured out in `SegDetectorLossBuilder`.
+    '''
+
+    def __init__(self, eps=1e-6, l1_scale=10, bce_scale=5):
+        super().__init__()
+        from .dice_loss import DiceLoss
+        from .l1_loss import MaskL1Loss
+        from .balance_cross_entropy_loss import BalanceCrossEntropyLoss
+        self.dice_loss = DiceLoss(eps=eps)
+        self.l1_loss = MaskL1Loss()
+        self.bce_loss = BalanceCrossEntropyLoss()
+
+        self.l1_scale, self.bce_scale = l1_scale, bce_scale
+
+    def forward(self, pred, batch):
+        '''Return `(loss, metrics)`.'''
+        bce_value = self.bce_loss(pred['binary'], batch['gt'], batch['mask'])
+        metrics = {'bce_loss': bce_value}
+        if 'thresh' not in pred:
+            # Without a threshold prediction the raw (unscaled) BCE is the loss.
+            return bce_value, metrics
+        l1_value, l1_metric = self.l1_loss(pred['thresh'], batch['thresh_map'], batch['thresh_mask'])
+        dice_value = self.dice_loss(pred['thresh_binary'], batch['gt'], batch['mask'])
+        metrics['thresh_loss'] = dice_value
+        total = dice_value + self.l1_scale * l1_value + bce_value * self.bce_scale
+        metrics.update(**l1_metric)
+        return total, metrics
+
+
+class L1BCEMiningLoss(nn.Module):
+    '''
+    Basically the same as L1BalanceCELoss, except that the min-max
+    normalised bce loss map (plus 1) is used as attention weights for the
+    DiceLoss on `thresh_binary`.
+    '''
+
+    def __init__(self, eps=1e-6, l1_scale=10, bce_scale=5):
+        super().__init__()
+        from .dice_loss import DiceLoss
+        from .l1_loss import MaskL1Loss
+        from .balance_cross_entropy_loss import BalanceCrossEntropyLoss
+        self.dice_loss = DiceLoss(eps=eps)
+        self.l1_loss = MaskL1Loss()
+        self.bce_loss = BalanceCrossEntropyLoss()
+
+        self.l1_scale, self.bce_scale = l1_scale, bce_scale
+
+    def forward(self, pred, batch):
+        '''Return `(loss, metrics)`.'''
+        bce_value, bce_map = self.bce_loss(
+            pred['binary'], batch['gt'], batch['mask'], return_origin=True)
+        l1_value, l1_metric = self.l1_loss(pred['thresh'], batch['thresh_map'], batch['thresh_mask'])
+        # Min-max normalise the per-pixel bce map to [0, 1].
+        shifted = bce_map - bce_map.min()
+        value_range = bce_map.max() - bce_map.min()
+        bce_map = shifted / value_range
+        dice_value = self.dice_loss(
+            pred['thresh_binary'], batch['gt'],
+            batch['mask'], weights=bce_map + 1)
+        metrics = {'bce_loss': bce_value, 'thresh_loss': dice_value}
+        total = dice_value + self.l1_scale * l1_value + bce_value * self.bce_scale
+        metrics.update(**l1_metric)
+        return total, metrics
+
+
+class L1LeakyDiceLoss(nn.Module):
+    '''
+    LeakyDiceLoss on `binary`,
+    MaskL1Loss on `thresh` (scaled by `l1_scale`),
+    DiceLoss on `thresh_binary`.
+    '''
+
+    def __init__(self, eps=1e-6, coverage_scale=5, l1_scale=10):
+        super().__init__()
+        from .dice_loss import DiceLoss, LeakyDiceLoss
+        from .l1_loss import MaskL1Loss
+        self.main_loss = LeakyDiceLoss(coverage_scale=coverage_scale)
+        self.l1_loss = MaskL1Loss()
+        self.thresh_loss = DiceLoss(eps=eps)
+
+        self.l1_scale = l1_scale
+
+    def forward(self, pred, batch):
+        '''Return `(loss, metrics)`; metrics starts from the main loss' own metrics.'''
+        target, ignore = batch['gt'], batch['mask']
+        main_value, metrics = self.main_loss(pred['binary'], target, ignore)
+        thresh_value = self.thresh_loss(pred['thresh_binary'], target, ignore)
+        l1_value, l1_metric = self.l1_loss(
+            pred['thresh'], batch['thresh_map'], batch['thresh_mask'])
+        metrics.update(**l1_metric, thresh_loss=thresh_value)
+        total = main_value + thresh_value + l1_value * self.l1_scale
+        return total, metrics
--- a/easyocr/DBNet/decoders/simple_detection.py
+++ b/easyocr/DBNet/decoders/simple_detection.py
@@ -0,0 +1,191 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+from backbones.upsample_head import SimpleUpsampleHead
+
+
+class SimpleDetectionDecoder(nn.Module):
+    def __init__(self, feature_channel=256):
+        nn.Module.__init__(self)
+
+        self.feature_channel = feature_channel
+        self.head_layer = self.create_head_layer()
+
+        self.pred_layers = nn.ModuleDict(self.create_pred_layers())
+
+    def create_head_layer(self):
+        return SimpleUpsampleHead(
+            self.feature_channel,
+            [self.feature_channel, self.feature_channel // 2, self.feature_channel // 4]
+        )
+
+    def create_pred_layer(self, channels):
+        return nn.Sequential(
+            nn.Conv2d(self.feature_channel // 4, channels, kernel_size=1, stride=1, padding=0, bias=False),
+        )
+
+    def create_pred_layers(self):
+        return {}
+
+    def postprocess_pred(self, pred):
+        return pred
+
+    def calculate_losses(self, preds, label):
+        raise NotImplementedError()
+
+    def forward(self, input, label, meta, train):
+        feature = self.head_layer(input)
+
+        pred = {}
+        for name, pred_layer in self.pred_layers.items():
+            pred[name] = pred_layer(feature)
+
+        if train:
+            losses = self.calculate_losses(pred, label)
+            pred = self.postprocess_pred(pred)
+            loss = sum(losses.values())
+            return loss, pred, losses
+        else:
+            pred = self.postprocess_pred(pred)
+            return pred
+
+
+class SimpleSegDecoder(SimpleDetectionDecoder):
+    def create_pred_layers(self):
+        return {
+            'heatmap': self.create_pred_layer(1)
+        }
+
+    def postprocess_pred(self, pred):
+        pred['heatmap'] = F.sigmoid(pred['heatmap'])
+        return pred
+
+    def calculate_losses(self, pred, label):
+        heatmap = label['heatmap']
+        heatmap_weight = label['heatmap_weight']
+
+        heatmap_pred = pred['heatmap']
+
+        heatmap_loss = F.binary_cross_entropy_with_logits(heatmap_pred, heatmap, reduction='none')
+        heatmap_loss = (heatmap_loss * heatmap_weight).mean(dim=(1, 2, 3))
+
+        return {
+            'heatmap_loss': heatmap_loss,
+        }
+
+
+class SimpleEASTDecoder(SimpleDetectionDecoder):
+    def __init__(self, feature_channels=256, densebox_ratio=1000.0, densebox_rescale_factor=512):
+        SimpleDetectionDecoder.__init__(self, feature_channels)
+
+        self.densebox_ratio = densebox_ratio
+        self.densebox_rescale_factor = densebox_rescale_factor
+
+    def create_pred_layers(self):
+        return {
+            'heatmap': self.create_pred_layer(1),
+            'densebox': self.create_pred_layer(8),
+        }
+
+    def postprocess_pred(self, pred):
+        pred['heatmap'] = F.sigmoid(pred['heatmap'])
+        pred['densebox'] = pred['densebox'] * self.densebox_rescale_factor
+        return pred
+
+    def calculate_losses(self, pred, label):
+        heatmap = label['heatmap']
+        heatmap_weight = label['heatmap_weight']
+        densebox = label['densebox'] / self.densebox_rescale_factor
+        densebox_weight = label['densebox_weight']
+
+        heatmap_pred = pred['heatmap']
+        densebox_pred = pred['densebox']
+
+        heatmap_loss = F.binary_cross_entropy_with_logits(heatmap_pred, heatmap, reduction='none')
+        heatmap_loss = (heatmap_loss * heatmap_weight).mean(dim=(1, 2, 3))
+
+        densebox_loss = F.mse_loss(densebox_pred, densebox, reduction='none')
+        densebox_loss = (densebox_loss * densebox_weight).mean(dim=(1, 2, 3)) * self.densebox_ratio
+
+        return {
+            'heatmap_loss': heatmap_loss,
+            'densebox_loss': densebox_loss,
+        }
+
+
+class SimpleTextsnakeDecoder(SimpleDetectionDecoder):
+    def __init__(self, feature_channels=256, radius_ratio=10.0):
+        SimpleDetectionDecoder.__init__(self, feature_channels)
+
+        self.radius_ratio = radius_ratio
+
+    def create_pred_layers(self):
+        return {
+            'heatmap': self.create_pred_layer(1),
+            'radius': self.create_pred_layer(1),
+        }
+
+    def postprocess_pred(self, pred):
+        pred['heatmap'] = F.sigmoid(pred['heatmap'])
+        pred['radius'] = torch.exp(pred['radius'])
+        return pred
+
+    def calculate_losses(self, pred, label):
+        heatmap = label['heatmap']
+        heatmap_weight = label['heatmap_weight']
+        radius = torch.log(label['radius'] + 1)
+        radius_weight = label['radius_weight']
+
+        heatmap_pred = pred['heatmap']
+        radius_pred = pred['radius']
+
+        heatmap_loss = F.binary_cross_entropy_with_logits(heatmap_pred, heatmap, reduction='none')
+        heatmap_loss = (heatmap_loss * heatmap_weight).mean(dim=(1, 2, 3))
+
+        radius_loss = F.smooth_l1_loss(radius_pred, radius, reduction='none')
+        radius_loss = (radius_loss * radius_weight).mean(dim=(1, 2, 3)) * self.radius_ratio
+
+        return {
+            'heatmap_loss': heatmap_loss,
+            'radius_loss': radius_loss,
+        }
+
+
+class SimpleMSRDecoder(SimpleDetectionDecoder):
+    def __init__(self, feature_channels=256, offset_ratio=1000.0, offset_rescale_factor=512):
+        SimpleDetectionDecoder.__init__(self, feature_channels)
+
+        self.offset_ratio = offset_ratio
+        self.offset_rescale_factor = offset_rescale_factor
+
+    def create_pred_layers(self):
+        return {
+            'heatmap': self.create_pred_layer(1),
+            'offset': self.create_pred_layer(2),
+        }
+
+    def postprocess_pred(self, pred):
+        pred['heatmap'] = F.sigmoid(pred['heatmap'])
+        pred['offset'] = pred['offset'] * self.offset_rescale_factor
+        return pred
+
+    def calculate_losses(self, pred, label):
+        heatmap = label['heatmap']
+        heatmap_weight = label['heatmap_weight']
+        offset = label['offset'] / self.offset_rescale_factor
+        offset_weight = label['offset_weight']
+
+        heatmap_pred = pred['heatmap']
+        offset_pred = pred['offset']
+
+        heatmap_loss = F.binary_cross_entropy_with_logits(heatmap_pred, heatmap, reduction='none')
+        heatmap_loss = (heatmap_loss * heatmap_weight).mean(dim=(1, 2, 3))
+        offset_loss = F.mse_loss(offset_pred, offset, reduction='none')
+        offset_loss = (offset_loss * offset_weight).mean(dim=(1, 2, 3)) * self.offset_ratio
+
+        return {
+            'heatmap_loss': heatmap_loss,
+            'offset_loss': offset_loss,
+        }
--- a/easyocr/DBNet/model/constructor.py
+++ b/easyocr/DBNet/model/constructor.py
@@ -0,0 +1,103 @@
+import importlib
+from collections import OrderedDict
+
+class State:
+    def __init__(self, autoload=True, default=None):
+        self.autoload = autoload
+        self.default = default
+
+
+class StateMeta(type):
+    def __new__(mcs, name, bases, attrs):
+        current_states = []
+        for key, value in attrs.items():
+            if isinstance(value, State):
+                current_states.append((key, value))
+
+        current_states.sort(key=lambda x: x[0])
+        attrs['states'] = OrderedDict(current_states)
+        new_class = super(StateMeta, mcs).__new__(mcs, name, bases, attrs)
+
+        # Walk through the MRO
+        states = OrderedDict()
+        for base in reversed(new_class.__mro__):
+            if hasattr(base, 'states'):
+                states.update(base.states)
+        new_class.states = states
+
+        for key, value in states.items():
+            setattr(new_class, key, value.default)
+
+        return new_class
+
+
+class Configurable(metaclass=StateMeta):
+    def __init__(self, *args, cmd={}, **kwargs):
+        self.load_all(cmd=cmd, **kwargs)
+
+    @staticmethod
+    def construct_class_from_config(args):
+        cls = Configurable.extract_class_from_args(args)
+        return cls(**args)
+
+    @staticmethod
+    def extract_class_from_args(args):
+        cls = args.copy().pop('class')
+        package, cls = cls.rsplit('.', 1)
+        module = importlib.import_module(package)
+        cls = getattr(module, cls)
+        return cls
+
+    def load_all(self, **kwargs):
+        for name, state in self.states.items():
+            if state.autoload:
+                self.load(name, **kwargs)
+
+    def load(self, state_name, **kwargs):
+        # FIXME: kwargs should be filtered
+        # Args passed from command line
+        cmd = kwargs.pop('cmd', dict())
+        if state_name in kwargs:
+            setattr(self, state_name, self.create_member_from_config(
+                (kwargs[state_name], cmd)))
+        else:
+            setattr(self, state_name, self.states[state_name].default)
+
+    def create_member_from_config(self, conf):
+        args, cmd = conf
+        if args is None or isinstance(args, (int, float, str)):
+            return args
+        elif isinstance(args, (list, tuple)):
+            return [self.create_member_from_config((subargs, cmd)) for subargs in args]
+        elif isinstance(args, dict):
+            if 'class' in args:
+                cls = self.extract_class_from_args(args)
+                return cls(**args, cmd=cmd)
+            return {key: self.create_member_from_config((subargs, cmd)) for key, subargs in args.items()}
+        else:
+            return args
+
+    def dump(self):
+        state = {}
+        state['class'] = self.__class__.__module__ + \
+            '.' + self.__class__.__name__
+        for name, value in self.states.items():
+            obj = getattr(self, name)
+            state[name] = self.dump_obj(obj)
+        return state
+
+    def dump_obj(self, obj):
+        if obj is None:
+            return None
+        elif hasattr(obj, 'dump'):
+            return obj.dump()
+        elif isinstance(obj, (int, float, str)):
+            return obj
+        elif isinstance(obj, (list, tuple)):
+            return [self.dump_obj(value) for value in obj]
+        elif isinstance(obj, dict):
+            return {key: self.dump_obj(value) for key, value in obj.items()}
+        else:
+            return str(obj)
+
+
--- a/easyocr/DBNet/model/detector.py
+++ b/easyocr/DBNet/model/detector.py
@@ -0,0 +1,53 @@
+from . import model as structure_model
+from .constructor import Configurable, State
+
+class Model(Configurable):
+    builder = State()
+    #representer = State()
+    
+    def __init__(self, **kwargs):
+        self.load_all(**kwargs)
+
+    @property
+    def model_name(self):
+        return self.builder.model_name
+
+
+class Builder(Configurable):
+    model = State()
+    model_args = State()
+
+    def __init__(self, cmd={}, **kwargs):
+        self.load_all(**kwargs)
+        if 'backbone' in cmd:
+            self.model_args['backbone'] = cmd['backbone']
+
+    @property
+    def model_name(self):
+        return self.model + '-' + getattr(structure_model, self.model).model_name(self.model_args)
+
+    def build(self, device, distributed=False, local_rank: int = 0):
+        Model = getattr(structure_model, self.model)
+        model = Model(self.model_args, device,
+                      distributed=distributed, local_rank=local_rank)
+        return model
+
+class Detector(Configurable):
+    structure = State(autoload=False)
+
+    def __init__(self, **kwargs):
+        self.load('structure', **kwargs)
+
+        cmd = kwargs.get('cmd', {})
+        if 'name' not in cmd:
+            cmd['name'] = self.structure.model_name
+
+        self.load_all(**kwargs)
+        self.distributed = cmd.get('distributed', False)
+        self.local_rank = cmd.get('local_rank', 0)
+
+        if cmd.get('validate', False):
+            self.load('validation', **kwargs)
+        else:
+            self.validation = None
+
--- a/easyocr/DBNet/model/model.py
+++ b/easyocr/DBNet/model/model.py
@@ -0,0 +1,71 @@
+import os
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+#import backbones
+#import decoders
+from .. import backbones
+from .. import decoders
+
+
+class BasicModel(nn.Module):
+    def __init__(self, args):
+        nn.Module.__init__(self)
+
+        self.backbone = getattr(backbones, args['backbone'])(**args.get('backbone_args', {}))
+        self.decoder = getattr(decoders, args['decoder'])(**args.get('decoder_args', {}))
+
+    def forward(self, data, *args, **kwargs):
+        return self.decoder(self.backbone(data), *args, **kwargs)
+
+
+def parallelize(model, distributed, local_rank):
+    if distributed:
+        return nn.parallel.DistributedDataParallel(
+            model,
+            device_ids=[local_rank],
+            output_device=[local_rank],
+            find_unused_parameters=True)
+    else:
+        return nn.DataParallel(model)
+
+class SegDetectorModel(nn.Module):
+    def __init__(self, args, device, distributed: bool = False, local_rank: int = 0):
+        super(SegDetectorModel, self).__init__()
+        #from decoders.seg_detector_loss import SegDetectorLossBuilder
+        from ..decoders.seg_detector_loss import SegDetectorLossBuilder
+
+        self.model = BasicModel(args)
+        # for loading models
+        self.model = parallelize(self.model, distributed, local_rank)
+        self.criterion = SegDetectorLossBuilder(
+            args['loss_class'], *args.get('loss_args', []), **args.get('loss_kwargs', {})).build()
+        self.criterion = parallelize(self.criterion, distributed, local_rank)
+        self.device = device
+        self.to(self.device)
+
+    @staticmethod
+    def model_name(args):
+        return os.path.join('seg_detector', args['backbone'], args['loss_class'])
+
+    def forward(self, batch, training=True):
+        if isinstance(batch, dict):
+            data = batch['image'].to(self.device)
+        else:
+            data = batch.to(self.device)
+        data = data.float()
+        #pred = self.model(data, training=self.training)
+        pred = self.model(data, training=training)
+
+        #if self.training:
+        if training:
+            for key, value in batch.items():
+                if value is not None:
+                    if hasattr(value, 'to'):
+                        batch[key] = value.to(self.device)
+            loss_with_metrics = self.criterion(pred, batch)
+            loss, metrics = loss_with_metrics
+            return loss, pred, metrics
+        return pred
--- a/easyocr/init.py
+++ b/easyocr/init.py
@@ -0,0 +1,3 @@
+from .easyocr import Reader
+
+__version__ = '1.7.2'
--- a/easyocr/character/ab_char.txt
+++ b/easyocr/character/ab_char.txt
@@ -0,0 +1,84 @@
+А
+Б
+В
+Г
+Ӷ
+Ҕ
+Д
+Е
+Ж
+З
+Ӡ
+И
+К
+Қ
+Ҟ
+Л
+М
+Н
+О
+П
+Ԥ
+Ҧ
+Р
+С
+Т
+Ҭ
+У
+Ф
+Х
+Ҳ
+Ц
+Ҵ
+Ч
+Ҷ
+Ҽ
+Ҿ
+Ш
+Ы
+Ҩ
+Џ
+Ь
+Ә
+а
+б
+в
+г
+ӷ
+ҕ
+д
+е
+ж
+з
+ӡ
+и
+к
+қ
+ҟ
+л
+м
+н
+о
+п
+ԥ
+ҧ
+р
+с
+т
+ҭ
+у
+ф
+х
+ҳ
+ц
+ҵ
+ч
+ҷ
+ҽ
+ҿ
+ш
+ы
+ҩ
+џ
+ь
+ә
--- a/easyocr/character/abq_char.txt
+++ b/easyocr/character/abq_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/ady_char.txt
+++ b/easyocr/character/ady_char.txt
@@ -0,0 +1,68 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+Ӏ
+ӏ
--- a/easyocr/character/af_char.txt
+++ b/easyocr/character/af_char.txt
@@ -0,0 +1,78 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+É
+Î
+ß
+à
+á
+â
+ä
+å
+æ
+ç
+è
+é
+ê
+ë
+í
+ï
+ñ
+ò
+ó
+ô
+ö
+ø
+ú
+û
+ü
+ł
--- a/easyocr/character/am_char.txt
+++ b/easyocr/character/am_char.txt
@@ -0,0 +1,267 @@
+ሀ
+ሁ
+ሂ
+ሃ
+ሄ
+ህ
+ሆ
+ለ
+ሉ
+ሊ
+ላ
+ሌ
+ል
+ሎ
+ሏ
+ሐ
+ሑ
+ሒ
+ሓ
+ሔ
+ሕ
+ሖ
+መ
+ሙ
+ሚ
+ማ
+ሜ
+ም
+ሞ
+ሟ
+ሠ
+ሡ
+ሢ
+ሣ
+ሤ
+ሥ
+ሦ
+ሧ
+ረ
+ሩ
+ሪ
+ራ
+ሬ
+ር
+ሮ
+ሯ
+ሰ
+ሱ
+ሲ
+ሳ
+ሴ
+ስ
+ሶ
+ሷ
+ሸ
+ሹ
+ሺ
+ሻ
+ሼ
+ሽ
+ሾ
+ሿ
+ቀ
+ቁ
+ቂ
+ቃ
+ቄ
+ቅ
+ቆ
+ቋ
+በ
+ቡ
+ቢ
+ባ
+ቤ
+ብ
+ቦ
+ቧ
+ቨ
+ቩ
+ቪ
+ቫ
+ቬ
+ቭ
+ቮ
+ቯ
+ተ
+ቱ
+ቲ
+ታ
+ቴ
+ት
+ቶ
+ቷ
+ቸ
+ቹ
+ቺ
+ቻ
+ቼ
+ች
+ቾ
+ቿ
+ኀ
+ኁ
+ኂ
+ኃ
+ኄ
+ኅ
+ኆ
+ኋ
+ነ
+ኑ
+ኒ
+ና
+ኔ
+ን
+ኖ
+ኗ
+ኘ
+ኙ
+ኚ
+ኛ
+ኝ
+ኞ
+ኟ
+አ
+ኡ
+ኢ
+ኣ
+ኤ
+እ
+ኦ
+ኧ
+ከ
+ኩ
+ኪ
+ካ
+ኬ
+ክ
+ኮ
+ኳ
+ኸ
+ኹ
+ኺ
+ኻ
+ኼ
+ኽ
+ኾ
+ዃ
+ወ
+ዉ
+ዊ
+ዋ
+ዌ
+ው
+ዎ
+ዏ
+ዐ
+ዑ
+ዒ
+ዓ
+ዔ
+ዕ
+ዖ
+ዘ
+ዙ
+ዚ
+ዛ
+ዜ
+ዝ
+ዞ
+ዟ
+ዠ
+ዣ
+ዥ
+ዧ
+የ
+ዩ
+ያ
+ዬ
+ይ
+ዮ
+ደ
+ዱ
+ዲ
+ዳ
+ዴ
+ድ
+ዶ
+ዷ
+ጀ
+ጁ
+ጂ
+ጃ
+ጄ
+ጅ
+ጆ
+ጇ
+ገ
+ጉ
+ጊ
+ጋ
+ጌ
+ግ
+ጎ
+ጓ
+ጠ
+ጡ
+ጢ
+ጣ
+ጤ
+ጥ
+ጦ
+ጧ
+ጨ
+ጩ
+ጪ
+ጫ
+ጬ
+ጭ
+ጮ
+ጯ
+ጰ
+ጱ
+ጲ
+ጳ
+ጴ
+ጵ
+ጶ
+ጷ
+ጸ
+ጹ
+ጻ
+ጼ
+ጽ
+ጾ
+ጿ
+ፀ
+ፁ
+ፂ
+ፃ
+ፄ
+ፅ
+ፆ
+ፈ
+ፉ
+ፊ
+ፋ
+ፌ
+ፍ
+ፎ
+ፏ
+ፐ
+ፑ
+ፒ
+ፓ
+ፔ
+ፕ
+ፖ
+ፗ
+፡
+።
+፣
+፤
+፥
+፦
+፧
--- a/easyocr/character/amh_text.txt
+++ b/easyocr/character/amh_text.txt
@@ -0,0 +1,260 @@
+ሀ
+ሁ
+ሂ
+ሃ
+ሄ
+ህ
+ሆ
+ለ
+ሉ
+ሊ
+ላ
+ሌ
+ል
+ሎ
+ሏ
+ሐ
+ሑ
+ሒ
+ሓ
+ሔ
+ሕ
+ሖ
+መ
+ሙ
+ሚ
+ማ
+ሜ
+ም
+ሞ
+ሟ
+ሠ
+ሡ
+ሢ
+ሣ
+ሤ
+ሥ
+ሦ
+ሧ
+ረ
+ሩ
+ሪ
+ራ
+ሬ
+ር
+ሮ
+ሯ
+ሰ
+ሱ
+ሲ
+ሳ
+ሴ
+ስ
+ሶ
+ሷ
+ሸ
+ሹ
+ሺ
+ሻ
+ሼ
+ሽ
+ሾ
+ሿ
+ቀ
+ቁ
+ቂ
+ቃ
+ቄ
+ቅ
+ቆ
+ቋ
+በ
+ቡ
+ቢ
+ባ
+ቤ
+ብ
+ቦ
+ቧ
+ቨ
+ቩ
+ቪ
+ቫ
+ቬ
+ቭ
+ቮ
+ቯ
+ተ
+ቱ
+ቲ
+ታ
+ቴ
+ት
+ቶ
+ቷ
+ቸ
+ቹ
+ቺ
+ቻ
+ቼ
+ች
+ቾ
+ቿ
+ኀ
+ኁ
+ኂ
+ኃ
+ኄ
+ኅ
+ኆ
+ኋ
+ነ
+ኑ
+ኒ
+ና
+ኔ
+ን
+ኖ
+ኗ
+ኘ
+ኙ
+ኚ
+ኛ
+ኝ
+ኞ
+ኟ
+አ
+ኡ
+ኢ
+ኣ
+ኤ
+እ
+ኦ
+ኧ
+ከ
+ኩ
+ኪ
+ካ
+ኬ
+ክ
+ኮ
+ኳ
+ኸ
+ኹ
+ኺ
+ኻ
+ኼ
+ኽ
+ኾ
+ዃ
+ወ
+ዉ
+ዊ
+ዋ
+ዌ
+ው
+ዎ
+ዏ
+ዐ
+ዑ
+ዒ
+ዓ
+ዔ
+ዕ
+ዖ
+ዘ
+ዙ
+ዚ
+ዛ
+ዜ
+ዝ
+ዞ
+ዟ
+ዠ
+ዣ
+ዥ
+ዧ
+የ
+ዩ
+ያ
+ዬ
+ይ
+ዮ
+ደ
+ዱ
+ዲ
+ዳ
+ዴ
+ድ
+ዶ
+ዷ
+ጀ
+ጁ
+ጂ
+ጃ
+ጄ
+ጅ
+ጆ
+ጇ
+ገ
+ጉ
+ጊ
+ጋ
+ጌ
+ግ
+ጎ
+ጓ
+ጠ
+ጡ
+ጢ
+ጣ
+ጤ
+ጥ
+ጦ
+ጧ
+ጨ
+ጩ
+ጪ
+ጫ
+ጬ
+ጭ
+ጮ
+ጯ
+ጰ
+ጱ
+ጲ
+ጳ
+ጴ
+ጵ
+ጶ
+ጷ
+ጸ
+ጹ
+ጻ
+ጼ
+ጽ
+ጾ
+ጿ
+ፀ
+ፁ
+ፂ
+ፃ
+ፄ
+ፅ
+ፆ
+ፈ
+ፉ
+ፊ
+ፋ
+ፌ
+ፍ
+ፎ
+ፏ
+ፐ
+ፑ
+ፒ
+ፓ
+ፔ
+ፕ
+ፖ
+ፗ
--- a/easyocr/character/ang_char.txt
+++ b/easyocr/character/ang_char.txt
@@ -0,0 +1,84 @@
+अ
+आ
+इ
+ई
+उ
+ऊ
+ऋ
+ए
+ऐ
+ऑ
+ओ
+औ
+अं
+अः
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+ळ
+१
+२
+३
+४
+५
+६
+७
+८
+९
+०
+ै
+ा
+ं
+े
+ि
+ो
+्
+ु
+ी
+़
+ू
+ँ
+ृ
+ौ
+ॉ
+ज़
+ड़
+क़
+ढ़
+फ़
+ग़
+ः
+ख़
+.
+॰
+ॅ
--- a/easyocr/character/ar_char.txt
+++ b/easyocr/character/ar_char.txt
@@ -0,0 +1,36 @@
+ا
+أ
+إ
+آ
+ب
+ت
+ث
+ج
+ح
+خ
+د
+ذ
+ر
+ز
+س
+ش
+ص
+ض
+ط
+ظ
+ع
+غ
+ف
+ق
+ك
+ل
+م
+ن
+ه
+و
+ى
+ي
+ء
+ئ
+ؤ
+ة
--- a/easyocr/character/as_char.txt
+++ b/easyocr/character/as_char.txt
@@ -0,0 +1,74 @@
+হ
+থ
+শ
+৫
+ক
+ও
+য
+০
+গ
+দ
+ড়
+খ
+য়
+ঋ
+ন
+অ
+৪
+এ
+ব
+ঠ
+ঢ
+৭
+৯
+ধ
+ঙ
+ট
+ঝ
+ৎ
+ণ
+ত
+র
+২
+চ
+ঌ
+ড
+৬
+ঔ
+প
+ভ
+ম
+ঢ়
+ঈ
+৮
+ঘ
+১
+ষ
+৩
+ফ
+ছ
+ল
+জ
+আ
+।
+ঊ
+ই
+স
+ঐ
+উ
+ঞ
+া
+্
+ু
+ী
+ে
+ং
+ি
+়
+ঁ
+ৃ
+ো
+ূ
+ৈ
+ৌ
+ঃ
--- a/easyocr/character/ava_char.txt
+++ b/easyocr/character/ava_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/az_char.txt
+++ b/easyocr/character/az_char.txt
@@ -0,0 +1,66 @@
+A
+B
+C
+Ç
+D
+E
+Ə
+F
+G
+H
+I
+İ
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+Ş
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+ə
+f
+g
+ğ
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+ş
+t
+u
+v
+w
+x
+y
+z
+Ç
+Ö
+Ü
+ç
+ö
+ü
+ı
--- a/easyocr/character/be_char.txt
+++ b/easyocr/character/be_char.txt
@@ -0,0 +1,64 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+І
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ў
+Ф
+Х
+Ц
+Ч
+Ш
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+і
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ў
+ф
+х
+ц
+ч
+ш
+ы
+ь
+э
+ю
+я
--- a/easyocr/character/bg_char.txt
+++ b/easyocr/character/bg_char.txt
@@ -0,0 +1,60 @@
+А
+Б
+В
+Г
+Д
+Е
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ь
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ь
+ю
+я
--- a/easyocr/character/bh_char.txt
+++ b/easyocr/character/bh_char.txt
@@ -0,0 +1,84 @@
+अ
+आ
+इ
+ई
+उ
+ऊ
+ऋ
+ए
+ऐ
+ऑ
+ओ
+औ
+अं
+अः
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+ळ
+१
+२
+३
+४
+५
+६
+७
+८
+९
+०
+ै
+ा
+ं
+े
+ि
+ो
+्
+ु
+ी
+़
+ू
+ँ
+ृ
+ौ
+ॉ
+ज़
+ड़
+क़
+ढ़
+फ़
+ग़
+ः
+ख़
+.
+॰
+ॅ
--- a/easyocr/character/bho_char.txt
+++ b/easyocr/character/bho_char.txt
@@ -0,0 +1,84 @@
+अ
+आ
+इ
+ई
+उ
+ऊ
+ऋ
+ए
+ऐ
+ऑ
+ओ
+औ
+अं
+अः
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+ळ
+१
+२
+३
+४
+५
+६
+७
+८
+९
+०
+ै
+ा
+ं
+े
+ि
+ो
+्
+ु
+ी
+़
+ू
+ँ
+ृ
+ौ
+ॉ
+ज़
+ड़
+क़
+ढ़
+फ़
+ग़
+ः
+ख़
+.
+॰
+ॅ
--- a/easyocr/character/bn_char.txt
+++ b/easyocr/character/bn_char.txt
@@ -0,0 +1,74 @@
+হ
+থ
+শ
+৫
+ক
+ও
+য
+০
+গ
+দ
+ড়
+খ
+য়
+ঋ
+ন
+অ
+৪
+এ
+ব
+ঠ
+ঢ
+৭
+৯
+ধ
+ঙ
+ট
+ঝ
+ৎ
+ণ
+ত
+র
+২
+চ
+ঌ
+ড
+৬
+ঔ
+প
+ভ
+ম
+ঢ়
+ঈ
+৮
+ঘ
+১
+ষ
+৩
+ফ
+ছ
+ল
+জ
+আ
+।
+ঊ
+ই
+স
+ঐ
+উ
+ঞ
+া
+্
+ু
+ী
+ে
+ং
+ি
+়
+ঁ
+ৃ
+ো
+ূ
+ৈ
+ৌ
+ঃ
--- a/easyocr/character/braille_char.txt
+++ b/easyocr/character/braille_char.txt
@@ -0,0 +1,64 @@
+⠀
+⠁
+⠂
+⠃
+⠄
+⠅
+⠆
+⠇
+⠈
+⠉
+⠊
+⠋
+⠌
+⠍
+⠎
+⠏
+⠐
+⠑
+⠒
+⠓
+⠔
+⠕
+⠖
+⠗
+⠘
+⠙
+⠚
+⠛
+⠜
+⠝
+⠞
+⠟
+⠠
+⠡
+⠢
+⠣
+⠤
+⠥
+⠦
+⠧
+⠨
+⠩
+⠪
+⠫
+⠬
+⠭
+⠮
+⠯
+⠰
+⠱
+⠲
+⠳
+⠴
+⠵
+⠶
+⠷
+⠸
+⠹
+⠺
+⠻
+⠼
+⠽
+⠾
+⠿
--- a/easyocr/character/bs_char.txt
+++ b/easyocr/character/bs_char.txt
@@ -0,0 +1,68 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+á
+â
+ä
+ç
+é
+ë
+í
+ñ
+ó
+ö
+ú
+ü
+Š
+š
+Ž
+ž
--- a/easyocr/character/ch_pin_char.txt
+++ b/easyocr/character/ch_pin_char.txt
@@ -0,0 +1,50 @@
+a
+ā
+á
+ǎ
+à
+o
+ō
+ó
+ǒ
+ò
+e
+ē
+é
+ě
+è
+i
+ī
+í
+ǐ
+ì
+u
+ū
+ú
+ǔ
+ù
+ǖ
+ǖ
+ǘ
+ǚ
+ǜ
+b
+p
+m
+f
+d
+t
+n
+l 
+ɡ 
+k 
+h 
+j 
+q 
+x 
+z 
+c 
+s 
+r
+y
+w 
--- a/easyocr/character/ch_sim_char.txt
+++ b/easyocr/character/ch_sim_char.txt
--- a/easyocr/character/ch_tra_char.txt
+++ b/easyocr/character/ch_tra_char.txt
--- a/easyocr/character/che_char.txt
+++ b/easyocr/character/che_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/cs_char.txt
+++ b/easyocr/character/cs_char.txt
@@ -0,0 +1,94 @@
+A
+Á
+B
+C
+Č
+D
+Ď
+E
+É
+Ě
+F
+G
+H
+I
+Í
+J
+K
+L
+M
+N
+Ň
+O
+Ó
+P
+Q
+R
+Ř
+S
+Š
+T
+Ť
+U
+Ú
+Ů
+V
+W
+X
+Y
+Ý
+Z
+Ž
+a
+á
+b
+c
+č
+d
+ď
+e
+é
+ě
+f
+g
+h
+i
+í
+j
+k
+l
+m
+n
+ň
+o
+ó
+p
+q
+r
+ř
+s
+š
+t
+ť
+u
+ú
+ů
+v
+w
+x
+y
+ý
+z
+ž
+Ä
+Ĺ
+Ô
+Ö
+Ŕ
+Ü
+ä
+ĺ
+ô
+ö
+ŕ
+ü
--- a/easyocr/character/cy_char.txt
+++ b/easyocr/character/cy_char.txt
@@ -0,0 +1,74 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+É
+à
+á
+â
+ä
+ç
+è
+é
+ê
+ë
+í
+î
+ï
+ó
+ô
+õ
+ö
+ø
+ù
+ú
+ü
+ı
--- a/easyocr/character/da_char.txt
+++ b/easyocr/character/da_char.txt
@@ -0,0 +1,81 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Å
+Æ
+É
+Ö
+Ø
+ß
+à
+á
+â
+ã
+ä
+å
+æ
+ç
+è
+é
+ë
+í
+ð
+ñ
+ó
+ô
+ö
+ø
+ú
+ü
+ı
+ł
+š
--- a/easyocr/character/dar_char.txt
+++ b/easyocr/character/dar_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/de_char.txt
+++ b/easyocr/character/de_char.txt
@@ -0,0 +1,59 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Ä
+Ö
+Ü
+ß
+ä
+ö
+ü
--- a/easyocr/character/en_char.txt
+++ b/easyocr/character/en_char.txt
@@ -0,0 +1,52 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
--- a/easyocr/character/es_char.txt
+++ b/easyocr/character/es_char.txt
@@ -0,0 +1,79 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Á
+É
+Ñ
+Ó
+Ú
+à
+á
+â
+ã
+ä
+ç
+è
+é
+ê
+ì
+í
+ñ
+ò
+ó
+ô
+ö
+ø
+ù
+ú
+ü
+š
+ž
--- a/easyocr/character/et_char.txt
+++ b/easyocr/character/et_char.txt
@@ -0,0 +1,81 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Ä
+É
+Õ
+Ö
+Ü
+ß
+á
+ä
+å
+ç
+è
+é
+ê
+ë
+í
+ñ
+ó
+ô
+õ
+ö
+ø
+ü
+ı
+ł
+œ
+Š
+š
+Ž
+ž
--- a/easyocr/character/fa_char.txt
+++ b/easyocr/character/fa_char.txt
@@ -0,0 +1,55 @@
+آ
+ا
+ب
+پ
+ت
+ث
+ج
+چ
+ح
+خ
+د
+ذ
+ر
+ز
+ژ
+س
+ش
+ص
+ض
+ط
+ظ
+ع
+غ
+ف
+ق
+ک
+گ
+ل
+م
+ن
+و
+ه
+ی
+ء
+ئ
+ِ
+ك
+ي
+ً
+ّ
+َ
+ة
+أ
+ٔ
+ؤ
+ُ
+ى
+إ
+ۀ
+ٍ
+ْ
+ٌ
+ٰ
+ٓ
+ٱ
--- a/easyocr/character/fr_char.txt
+++ b/easyocr/character/fr_char.txt
@@ -0,0 +1,84 @@
+A
+a
+B
+b
+C
+c
+D
+d
+E
+e
+F
+f
+G
+g
+H
+h
+I
+i
+J
+j
+K
+k
+L
+l
+M
+m
+N
+n
+O
+o
+P
+p
+Q
+q
+R
+r
+S
+s
+T
+t
+U
+u
+V
+v
+W
+w
+X
+x
+Y
+y
+Z
+z
+À
+à
+Â
+â
+Æ
+æ
+Ç
+ç
+É
+é
+È
+è
+Ê
+ê
+Ë
+ë
+Î
+î
+Ï
+ï
+Ô
+ô
+Œ
+œ
+Ù
+ù
+Û
+û
+Ü
+ü
+Ÿ
+ÿ
--- a/easyocr/character/ga_char.txt
+++ b/easyocr/character/ga_char.txt
@@ -0,0 +1,80 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Á
+É
+Í
+Ó
+Ú
+à
+á
+â
+ã
+ä
+æ
+ç
+è
+é
+ê
+ì
+í
+ð
+ñ
+ò
+ó
+ô
+ö
+ú
+ü
+ý
+þ
+ł
--- a/easyocr/character/ge_char.txt
+++ b/easyocr/character/ge_char.txt
@@ -0,0 +1,33 @@
+ა
+ბ
+გ
+დ
+ე
+ვ
+ზ
+თ
+ი
+კ
+ლ
+მ
+ნ
+ო
+პ
+ჟ
+რ
+ს
+ტ
+უ
+ფ
+ქ
+ღ
+ყ
+შ
+ჩ
+ც
+ძ
+წ
+ჟ
+ხ
+ჯ
+ჰ
--- a/easyocr/character/gom_char.txt
+++ b/easyocr/character/gom_char.txt
@@ -0,0 +1,84 @@
+अ
+आ
+इ
+ई
+उ
+ऊ
+ऋ
+ए
+ऐ
+ऑ
+ओ
+औ
+अं
+अः
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+ळ
+१
+२
+३
+४
+५
+६
+७
+८
+९
+०
+ै
+ा
+ं
+े
+ि
+ो
+्
+ु
+ी
+़
+ू
+ँ
+ृ
+ौ
+ॉ
+ज़
+ड़
+क़
+ढ़
+फ़
+ग़
+ः
+ख़
+.
+॰
+ॅ
--- a/easyocr/character/gre_char.txt
+++ b/easyocr/character/gre_char.txt
@@ -0,0 +1,69 @@
+Α
+α
+Β
+β
+Γ
+γ
+Δ
+δ
+Ε
+ε
+Ζ
+ζ
+Η
+η
+Θ
+θ
+Ι
+ι
+Κ
+κ
+Λ
+λ
+Μ
+μ
+Ν
+ν
+Ξ
+ξ
+Ο
+ο
+Π
+π
+Ρ
+ρ
+Σ
+σ
+ς
+Τ
+τ
+Υ
+υ
+Φ
+φ
+Χ
+χ
+Ψ
+ψ
+Ω
+ω
+Ά
+ά
+Έ
+έ
+Ή
+ή
+Ί
+ί
+Ϊ
+ΐ
+ϊ
+Ό
+ό
+Ύ
+Ϋ
+ύ
+ϋ
+ΰ
+Ώ
+ώ
--- a/easyocr/character/he_char.txt
+++ b/easyocr/character/he_char.txt
@@ -0,0 +1,27 @@
+א
+ב
+ג
+ד
+ה
+ו
+ז
+ח
+ט
+י
+כ
+ך
+ל
+מ
+ם
+נ
+ן
+ס
+ע
+פ
+ף
+צ
+ץ
+ק
+ר
+ש
+ת
--- a/easyocr/character/hi_char.txt
+++ b/easyocr/character/hi_char.txt
@@ -0,0 +1,84 @@
+अ
+आ
+इ
+ई
+उ
+ऊ
+ऋ
+ए
+ऐ
+ऑ
+ओ
+औ
+अं
+अः
+क
+ख
+ग
+घ
+ङ
+च
+छ
+ज
+झ
+ञ
+ट
+ठ
+ड
+ढ
+ण
+त
+थ
+द
+ध
+न
+प
+फ
+ब
+भ
+म
+य
+र
+ल
+व
+श
+ष
+स
+ह
+ळ
+१
+२
+३
+४
+५
+६
+७
+८
+९
+०
+ै
+ा
+ं
+े
+ि
+ो
+्
+ु
+ी
+़
+ू
+ँ
+ृ
+ौ
+ॉ
+ज़
+ड़
+क़
+ढ़
+फ़
+ग़
+ः
+ख़
+.
+॰
+ॅ
--- a/easyocr/character/hr_char.txt
+++ b/easyocr/character/hr_char.txt
@@ -0,0 +1,70 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+à
+á
+â
+ã
+ä
+ç
+è
+é
+í
+ñ
+ó
+ö
+ú
+ü
+Š
+š
+Ž
+ž
--- a/easyocr/character/hu_char.txt
+++ b/easyocr/character/hu_char.txt
@@ -0,0 +1,70 @@
+A
+Á
+B
+C
+D
+E
+É
+F
+G
+H
+I
+Í
+J
+K
+L
+M
+N
+O
+Ó
+Ö
+Ő
+P
+Q
+R
+S
+T
+U
+Ú
+Ü
+Ű
+V
+W
+X
+Y
+Z
+a
+á
+b
+c
+d
+e
+é
+f
+g
+h
+i
+í
+j
+k
+l
+m
+n
+o
+ó
+ö
+ő
+p
+q
+r
+s
+t
+u
+ú
+ü
+ű
+v
+w
+x
+y
+z
--- a/easyocr/character/id_char.txt
+++ b/easyocr/character/id_char.txt
@@ -0,0 +1,70 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+É
+à
+á
+â
+ä
+ç
+è
+é
+ë
+í
+ñ
+ó
+ô
+ö
+ú
+ü
+ı
+ł
--- a/easyocr/character/inh_char.txt
+++ b/easyocr/character/inh_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/is_char.txt
+++ b/easyocr/character/is_char.txt
@@ -0,0 +1,77 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Á
+Æ
+É
+Í
+Ó
+Ö
+Ú
+Ý
+Þ
+á
+ä
+å
+æ
+é
+í
+ð
+ó
+ö
+ø
+ú
+ü
+ý
+þ
+œ
+š
--- a/easyocr/character/it_char.txt
+++ b/easyocr/character/it_char.txt
@@ -0,0 +1,72 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+À
+Á
+È
+É
+Ì
+Í
+Ò
+Ó
+Ù
+Ú
+à
+á
+è
+é
+ì
+í
+ò
+ó
+ù
+ú
--- a/easyocr/character/ja_char.txt
+++ b/easyocr/character/ja_char.txt
--- a/easyocr/character/kas_char.txt
+++ b/easyocr/character/kas_char.txt
@@ -0,0 +1,76 @@
+ا
+ب
+پ
+پھ
+ت
+تھ
+ٹ
+ٹھ
+ث
+ج
+چ
+چھ
+ح
+خ
+د
+ڈ
+ذ
+ر
+ز
+ژ
+ژھ
+س
+ش
+ص
+ض
+ط
+ظ
+ع
+غ
+ف
+ق
+ک
+کھ
+گ
+ل
+م
+ن
+و
+ہ
+ء
+ی
+ے
+آ
+یی
+ُ 
+وٚ
+ۄ
+ٔ 
+ٲ
+ِ 
+ایی
+ۍ
+یہٛ
+یہ
+ٖ
+ٛ
+َ 
+ھ
+ٔ 
+ٕ
+ؠ
+أ
+ؤ
+إ
+ئ
+ي
+ً
+ٍ
+ٓ
+ٗ
+ٙ
+ٚ
+ٟ
+ٳ
+ۓ
+ۯ
--- a/easyocr/character/kbd_char.txt
+++ b/easyocr/character/kbd_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/kn.txt
+++ b/easyocr/character/kn.txt
--- a/easyocr/character/kn_char.txt
+++ b/easyocr/character/kn_char.txt
@@ -0,0 +1,72 @@
+೦
+೫
+ಃ
+೩
+ತ
+ಔ
+ಠ
+ಖ
+ಪ
+ಯ
+ವ
+ಸ
+೭
+ೋ
+ಛ
+ಇ
+ಙ
+ದ
+ಗ
+ಉ
+ಳ
+೪
+ಢ
+ಝ
+ಆ
+ಜ
+ಋ
+ಟ
+ೆ
+ೈ
+ಾ
+ಓ
+೧
+ರ
+ು
+ಂ
+ೀ
+ಊ
+ಅ
+ಒ
+ಬ
+ಹ
+೮
+ೃ
+ೊ
+ೂ
+ಕ
+ಫ
+್
+ಶ
+೨
+ಎ
+೯
+ಧ
+ಡ
+ಞ
+ಐ
+ಷ
+ೌ
+ಚ
+ಣ
+೬
+ಥ
+ೇ
+ನ
+ಿ
+ಲ
+ಏ
+ಘ
+ಭ
+ಮ
+ಈ
--- a/easyocr/character/ko_char.txt
+++ b/easyocr/character/ko_char.txt
--- a/easyocr/character/ku_char.txt
+++ b/easyocr/character/ku_char.txt
@@ -0,0 +1,76 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+Ç
+Ê
+Î
+Ö
+Û
+Ü
+á
+â
+ã
+ä
+ç
+è
+é
+ê
+í
+î
+ó
+ö
+ú
+û
+ü
+ı
+ł
+š
--- a/easyocr/character/la_char.txt
+++ b/easyocr/character/la_char.txt
@@ -0,0 +1,80 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+É
+ß
+à
+á
+â
+ã
+ä
+å
+ç
+è
+é
+ê
+ë
+í
+ñ
+ò
+ó
+ô
+ö
+ø
+ú
+ü
+ý
+ı
+ł
+Š
+š
+ž
--- a/easyocr/character/lbe_char.txt
+++ b/easyocr/character/lbe_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/lez_char.txt
+++ b/easyocr/character/lez_char.txt
@@ -0,0 +1,67 @@
+А
+Б
+В
+Г
+Д
+Е
+Ё
+Ж
+З
+И
+Й
+К
+Л
+М
+Н
+О
+П
+Р
+С
+Т
+У
+Ф
+Х
+Ц
+Ч
+Ш
+Щ
+Ъ
+Ы
+Ь
+Э
+Ю
+Я
+а
+б
+в
+г
+д
+е
+ё
+ж
+з
+и
+й
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+щ
+ъ
+ы
+ь
+э
+ю
+я
+I
--- a/easyocr/character/lt_char.txt
+++ b/easyocr/character/lt_char.txt
@@ -0,0 +1,64 @@
+A
+Ą
+B
+C
+Č
+D
+E
+Ę
+Ė
+F
+G
+H
+I
+Į
+Y
+J
+K
+L
+M
+N
+O
+P
+R
+S
+Š
+T
+U
+Ų
+Ū
+V
+Z
+Ž
+a
+ą
+b
+c
+č
+d
+e
+ę
+ė
+f
+g
+h
+i
+į
+y
+j
+k
+l
+m
+n
+o
+p
+r
+s
+š
+t
+u
+ų
+ū
+v
+z
+ž
--- a/easyocr/character/lv_char.txt
+++ b/easyocr/character/lv_char.txt
@@ -0,0 +1,64 @@
+A
+Ā
+B
+C
+Č
+D
+E
+Ē
+F
+G
+H
+I
+Ī
+J
+K
+Ķ
+L
+Ļ
+M
+N
+Ņ
+O
+P
+R
+S
+Š
+T
+U
+Ū
+V
+Z
+Ž
+a
+ā
+b
+c
+č
+d
+e
+ē
+f
+g
+h
+i
+ī
+j
+k
+ķ
+l
+ļ
+m
+n
+ņ
+o
+p
+r
+s
+š
+t
+u
+ū
+v
+z
+ž
--- a/Show More
+++ b/Show More