diff --git a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
index fb36a74..d63d5e5 100644
--- a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
+++ b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
@@ -17,7 +17,7 @@ PicoDet:
   backbone: LCNet
   neck: CSPPAN
   head: PicoHead
-  nms_cpu: True
+  #nms_cpu: True
 
 LCNet:
   scale: 1.0
diff --git a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml
index 251a3dd..ebe7201 100644
--- a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml
+++ b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml
@@ -17,7 +17,7 @@ PicoDet:
   backbone: LCNet
   neck: CSPPAN
   head: PicoHead
-  nms_cpu: True
+  #nms_cpu: True
 
 LCNet:
   scale: 1.0
diff --git a/pdfdet/models/Paddle/paddle_cdla.py b/pdfdet/models/Paddle/paddle_cdla.py
index ef65277..16efe91 100644
--- a/pdfdet/models/Paddle/paddle_cdla.py
+++ b/pdfdet/models/Paddle/paddle_cdla.py
@@ -13,8 +13,34 @@
 
 from pdfdet.models.baseModel import base_module
 from ppdet.core.workspace import load_config
-from ppdet.engine import Trainer
+from ppdet.engine import Trainer as Trainer1
+from ppdet.core.workspace import create
+
+
+class Trainer(Trainer1):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def predict(self, images):
+        # Register the input images and build a test reader over them.
+        self.dataset.set_images(images)
+        loader = create('TestReader')(self.dataset, 0)
+
+        # Run inference batch by batch.
+        self.model.eval()
+        results = []
+        for step_id, data in enumerate(loader):
+            # forward
+            outs = self.model(data)
+
+            # Convert paddle tensors to numpy before collecting.
+            for key, value in outs.items():
+                if hasattr(value, 'numpy'):
+                    outs[key] = value.numpy()
+            results.append(outs)
+
+        return results
 
 
 class paddle_cdla_model(base_module):
     def __init__(self, *args, **kwargs) -> None:
diff --git a/pdfdet/models/Paddle/paddle_pub.py b/pdfdet/models/Paddle/paddle_pub.py
index 93da3cf..3bc7fb9 100644
--- a/pdfdet/models/Paddle/paddle_pub.py
+++ b/pdfdet/models/Paddle/paddle_pub.py
@@ -7,9 +7,9 @@
 parent_path = os.path.abspath(os.path.join(__file__, *([".."] * 1)))
 sys.path.insert(0, parent_path)
 
-from .paddle_cdla import paddle_cdla_model
+from .paddle_cdla import paddle_cdla_model, Trainer
 from ppdet.core.workspace import load_config
-from ppdet.engine import Trainer
+
 
 
 class paddle_pub_model(paddle_cdla_model):
diff --git a/pdfdet/models/Paddle/ppdet/__init__.py b/pdfdet/models/Paddle/ppdet/__init__.py
deleted file mode 100644
index ac53c97..0000000
--- a/pdfdet/models/Paddle/ppdet/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import (core, data, engine, modeling, model_zoo, optimizer, metrics,
-               utils, slim)
-
-
diff --git a/pdfdet/models/Paddle/ppdet/core/__init__.py b/pdfdet/models/Paddle/ppdet/core/__init__.py
deleted file mode 100644
index d042771..0000000
--- a/pdfdet/models/Paddle/ppdet/core/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import config
diff --git a/pdfdet/models/Paddle/ppdet/core/config/__init__.py b/pdfdet/models/Paddle/ppdet/core/config/__init__.py
deleted file mode 100644
index d0c32e2..0000000
--- a/pdfdet/models/Paddle/ppdet/core/config/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/pdfdet/models/Paddle/ppdet/core/config/schema.py b/pdfdet/models/Paddle/ppdet/core/config/schema.py
deleted file mode 100644
index 2e41b5c..0000000
--- a/pdfdet/models/Paddle/ppdet/core/config/schema.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import inspect -import importlib -import re - -try: - from docstring_parser import parse as doc_parse -except Exception: - - def doc_parse(*args): - pass - - -try: - from typeguard import check_type -except Exception: - - def check_type(*args): - pass - - -__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] - - -class SchemaValue(object): - def __init__(self, name, doc='', type=None): - super(SchemaValue, self).__init__() - self.name = name - self.doc = doc - self.type = type - - def set_default(self, value): - self.default = value - - def has_default(self): - return hasattr(self, 'default') - - -class SchemaDict(dict): - def __init__(self, **kwargs): - super(SchemaDict, self).__init__() - self.schema = {} - self.strict = False - self.doc = "" - self.update(kwargs) - - def __setitem__(self, key, value): - # XXX also update regular dict to SchemaDict?? - if isinstance(value, dict) and key in self and isinstance(self[key], - SchemaDict): - self[key].update(value) - else: - super(SchemaDict, self).__setitem__(key, value) - - def __missing__(self, key): - if self.has_default(key): - return self.schema[key].default - elif key in self.schema: - return self.schema[key] - else: - raise KeyError(key) - - def copy(self): - newone = SchemaDict() - newone.__dict__.update(self.__dict__) - newone.update(self) - return newone - - def set_schema(self, key, value): - assert isinstance(value, SchemaValue) - self.schema[key] = value - - def set_strict(self, strict): - self.strict = strict - - def has_default(self, key): - return key in self.schema and self.schema[key].has_default() - - def is_default(self, key): - if not self.has_default(key): - return False - if hasattr(self[key], '__dict__'): - return True - else: - return key not in self or self[key] == self.schema[key].default - - def find_default_keys(self): - return [ - k for k in list(self.keys()) + list(self.schema.keys()) - if self.is_default(k) - ] - - def mandatory(self): - return any([k for k in self.schema.keys() if not self.has_default(k)]) - - def find_missing_keys(self): - missing = [ - k for k in self.schema.keys() - if k not in self and not self.has_default(k) - ] - placeholders = [k for k in self if self[k] in ('', '')] - return missing + placeholders - - def find_extra_keys(self): - return list(set(self.keys()) - set(self.schema.keys())) - - def find_mismatch_keys(self): - mismatch_keys = [] - for arg in self.schema.values(): - if arg.type is not None: - try: - check_type("{}.{}".format(self.name, arg.name), - self[arg.name], arg.type) - except Exception: - mismatch_keys.append(arg.name) - return mismatch_keys - - def validate(self): - missing_keys = self.find_missing_keys() - if missing_keys: - raise ValueError("Missing param for class<{}>: {}".format( - self.name, ", ".join(missing_keys))) - extra_keys = self.find_extra_keys() - if extra_keys and self.strict: - raise ValueError("Extraneous param for class<{}>: {}".format( - self.name, ", ".join(extra_keys))) - mismatch_keys = self.find_mismatch_keys() - if mismatch_keys: - raise TypeError("Wrong param type for class<{}>: {}".format( - self.name, ", ".join(mismatch_keys))) - - -class SharedConfig(object): - """ - Representation class for `__shared__` annotations, which work as follows: - - - if `key` is set for the module in config file, its value will take - precedence - - if `key` is not set for the module but present in the config file, its - value 
will be used - - otherwise, use the provided `default_value` as fallback - - Args: - key: config[key] will be injected - default_value: fallback value - """ - - def __init__(self, key, default_value=None): - super(SharedConfig, self).__init__() - self.key = key - self.default_value = default_value - - -def extract_schema(cls): - """ - Extract schema from a given class - - Args: - cls (type): Class from which to extract. - - Returns: - schema (SchemaDict): Extracted schema. - """ - ctor = cls.__init__ - # python 2 compatibility - if hasattr(inspect, 'getfullargspec'): - argspec = inspect.getfullargspec(ctor) - annotations = argspec.annotations - has_kwargs = argspec.varkw is not None - else: - argspec = inspect.getfullargspec(ctor) - # python 2 type hinting workaround, see pep-3107 - # however, since `typeguard` does not support python 2, type checking - # is still python 3 only for now - annotations = getattr(ctor, '__annotations__', {}) - has_kwargs = argspec.varkw is not None - - names = [arg for arg in argspec.args if arg != 'self'] - defaults = argspec.defaults - num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 - num_required = len(names) - num_defaults - - docs = cls.__doc__ - if docs is None and getattr(cls, '__category__', None) == 'op': - docs = cls.__call__.__doc__ - try: - docstring = doc_parse(docs) - except Exception: - docstring = None - - if docstring is None: - comments = {} - else: - comments = {} - for p in docstring.params: - match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) - if match_obj is not None: - comments[match_obj.group(1)] = p.description - - schema = SchemaDict() - schema.name = cls.__name__ - schema.doc = "" - if docs is not None: - start_pos = docs[0] == '\n' and 1 or 0 - schema.doc = docs[start_pos:].split("\n")[0].strip() - # XXX handle paddle's weird doc convention - if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: - schema.doc = schema.doc[2:-2].strip() - schema.category = hasattr(cls, '__category__') and getattr( - cls, '__category__') or 'module' - schema.strict = not has_kwargs - schema.pymodule = importlib.import_module(cls.__module__) - schema.inject = getattr(cls, '__inject__', []) - schema.shared = getattr(cls, '__shared__', []) - for idx, name in enumerate(names): - comment = name in comments and comments[name] or name - if name in schema.inject: - type_ = None - else: - type_ = name in annotations and annotations[name] or None - value_schema = SchemaValue(name, comment, type_) - if name in schema.shared: - assert idx >= num_required, "shared config must have default value" - default = defaults[idx - num_required] - value_schema.set_default(SharedConfig(name, default)) - elif idx >= num_required: - default = defaults[idx - num_required] - value_schema.set_default(default) - schema.set_schema(name, value_schema) - - return schema diff --git a/pdfdet/models/Paddle/ppdet/core/config/yaml_helpers.py b/pdfdet/models/Paddle/ppdet/core/config/yaml_helpers.py deleted file mode 100644 index 181cfe6..0000000 --- a/pdfdet/models/Paddle/ppdet/core/config/yaml_helpers.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib -import inspect - -import yaml -from .schema import SharedConfig - -__all__ = ['serializable', 'Callable'] - - -def represent_dictionary_order(self, dict_data): - return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) - - -def setup_orderdict(): - from collections import OrderedDict - yaml.add_representer(OrderedDict, represent_dictionary_order) - - -def _make_python_constructor(cls): - def python_constructor(loader, node): - if isinstance(node, yaml.SequenceNode): - args = loader.construct_sequence(node, deep=True) - return cls(*args) - else: - kwargs = loader.construct_mapping(node, deep=True) - try: - return cls(**kwargs) - except Exception as ex: - print("Error when construct {} instance from yaml config". - format(cls.__name__)) - raise ex - - return python_constructor - - -def _make_python_representer(cls): - # python 2 compatibility - if hasattr(inspect, 'getfullargspec'): - argspec = inspect.getfullargspec(cls) - else: - argspec = inspect.getfullargspec(cls.__init__) - argnames = [arg for arg in argspec.args if arg != 'self'] - - def python_representer(dumper, obj): - if argnames: - data = {name: getattr(obj, name) for name in argnames} - else: - data = obj.__dict__ - if '_id' in data: - del data['_id'] - return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) - - return python_representer - - -def serializable(cls): - """ - Add loader and dumper for given class, which must be - "trivially serializable" - - Args: - cls: class to be serialized - - Returns: cls - """ - yaml.add_constructor(u'!{}'.format(cls.__name__), - _make_python_constructor(cls)) - yaml.add_representer(cls, _make_python_representer(cls)) - return cls - - -yaml.add_representer(SharedConfig, - lambda d, o: d.represent_data(o.default_value)) - - -@serializable -class Callable(object): - """ - Helper to be used in Yaml for creating arbitrary class objects - - Args: - full_type (str): the full module path to target function - """ - - def __init__(self, full_type, args=[], kwargs={}): - super(Callable, self).__init__() - self.full_type = full_type - self.args = args - self.kwargs = kwargs - - def __call__(self): - if '.' in self.full_type: - idx = self.full_type.rfind('.') - module = importlib.import_module(self.full_type[:idx]) - func_name = self.full_type[idx + 1:] - else: - try: - module = importlib.import_module('builtins') - except Exception: - module = importlib.import_module('__builtin__') - func_name = self.full_type - - func = getattr(module, func_name) - return func(*self.args, **self.kwargs) diff --git a/pdfdet/models/Paddle/ppdet/core/workspace.py b/pdfdet/models/Paddle/ppdet/core/workspace.py deleted file mode 100644 index 6735bcf..0000000 --- a/pdfdet/models/Paddle/ppdet/core/workspace.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import importlib -import os -import sys - -import yaml -import collections - -try: - collectionsAbc = collections.abc -except AttributeError: - collectionsAbc = collections - -from .config.schema import SchemaDict, SharedConfig, extract_schema -from .config.yaml_helpers import serializable - -__all__ = [ - 'global_config', - 'load_config', - 'merge_config', - 'get_registered_modules', - 'create', - 'register', - 'serializable', - 'dump_value', -] - - -def dump_value(value): - # XXX this is hackish, but collections.abc is not available in python 2 - if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): - value = yaml.dump(value, default_flow_style=True) - value = value.replace('\n', '') - value = value.replace('...', '') - return "'{}'".format(value) - else: - # primitive types - return str(value) - - -class AttrDict(dict): - """Single level attribute dict, NOT recursive""" - - def __init__(self, **kwargs): - super(AttrDict, self).__init__() - super(AttrDict, self).update(kwargs) - - def __getattr__(self, key): - if key in self: - return self[key] - raise AttributeError("object has no attribute '{}'".format(key)) - - def __setattr__(self, key, value): - self[key] = value - - def copy(self): - new_dict = AttrDict() - for k, v in self.items(): - new_dict.update({k: v}) - return new_dict - - -global_config = AttrDict() - -BASE_KEY = '_BASE_' - - -# parse and load _BASE_ recursively -def _load_config_with_base(file_path): - with open(file_path) as f: - file_cfg = yaml.load(f, Loader=yaml.Loader) - - # NOTE: cfgs outside have higher priority than cfgs in _BASE_ - if BASE_KEY in file_cfg: - all_base_cfg = AttrDict() - base_ymls = list(file_cfg[BASE_KEY]) - for base_yml in base_ymls: - if base_yml.startswith("~"): - base_yml = os.path.expanduser(base_yml) - if not base_yml.startswith('/'): - base_yml = os.path.join(os.path.dirname(file_path), base_yml) - - with open(base_yml) as f: - base_cfg = _load_config_with_base(base_yml) - all_base_cfg = merge_config(base_cfg, all_base_cfg) - - del file_cfg[BASE_KEY] - return merge_config(file_cfg, all_base_cfg) - - return file_cfg - - -def load_config(file_path): - """ - Load config from file. - - Args: - file_path (str): Path of the config file to be loaded. - - Returns: global config - """ - _, ext = os.path.splitext(file_path) - assert ext in ['.yml', '.yaml'], "only support yaml files for now" - - # load config from file and merge into global config - cfg = _load_config_with_base(file_path) - cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] - merge_config(cfg) - - return global_config - - -def dict_merge(dct, merge_dct): - """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of - updating only top-level keys, dict_merge recurses down into dicts nested - to an arbitrary depth, updating keys. The ``merge_dct`` is merged into - ``dct``. 
- - Args: - dct: dict onto which the merge is executed - merge_dct: dct merged into dct - - Returns: dct - """ - for k, v in merge_dct.items(): - if (k in dct and isinstance(dct[k], dict) and - isinstance(merge_dct[k], collectionsAbc.Mapping)): - dict_merge(dct[k], merge_dct[k]) - else: - dct[k] = merge_dct[k] - return dct - - -def merge_config(config, another_cfg=None): - """ - Merge config into global config or another_cfg. - - Args: - config (dict): Config to be merged. - - Returns: global config - """ - global global_config - dct = another_cfg or global_config - return dict_merge(dct, config) - - -def get_registered_modules(): - return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)} - - -def make_partial(cls): - op_module = importlib.import_module(cls.__op__.__module__) - op = getattr(op_module, cls.__op__.__name__) - cls.__category__ = getattr(cls, '__category__', None) or 'op' - - def partial_apply(self, *args, **kwargs): - kwargs_ = self.__dict__.copy() - kwargs_.update(kwargs) - return op(*args, **kwargs_) - - if getattr(cls, '__append_doc__', True): # XXX should default to True? - if sys.version_info[0] > 2: - cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__) - cls.__init__.__doc__ = op.__doc__ - cls.__call__ = partial_apply - cls.__call__.__doc__ = op.__doc__ - else: - # XXX work around for python 2 - partial_apply.__doc__ = op.__doc__ - cls.__call__ = partial_apply - return cls - - -def register(cls): - """ - Register a given module class. - - Args: - cls (type): Module class to be registered. - - Returns: cls - """ - if cls.__name__ in global_config: - raise ValueError("Module class already registered: {}".format( - cls.__name__)) - if hasattr(cls, '__op__'): - cls = make_partial(cls) - global_config[cls.__name__] = extract_schema(cls) - return cls - - -def create(cls_or_name, **kwargs): - """ - Create an instance of given module class. - - Args: - cls_or_name (type or str): Class of which to create instance. 
- - Returns: instance of type `cls_or_name` - """ - assert type(cls_or_name) in [type, str - ], "should be a class or name of a class" - name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ - if name in global_config: - if isinstance(global_config[name], SchemaDict): - pass - elif hasattr(global_config[name], "__dict__"): - # support instance return directly - return global_config[name] - else: - raise ValueError("The module {} is not registered".format(name)) - else: - raise ValueError("The module {} is not registered".format(name)) - - config = global_config[name] - cls = getattr(config.pymodule, name) - cls_kwargs = {} - cls_kwargs.update(global_config[name]) - - # parse `shared` annoation of registered modules - if getattr(config, 'shared', None): - for k in config.shared: - target_key = config[k] - shared_conf = config.schema[k].default - assert isinstance(shared_conf, SharedConfig) - if target_key is not None and not isinstance(target_key, - SharedConfig): - continue # value is given for the module - elif shared_conf.key in global_config: - # `key` is present in config - cls_kwargs[k] = global_config[shared_conf.key] - else: - cls_kwargs[k] = shared_conf.default_value - - # parse `inject` annoation of registered modules - if getattr(cls, 'from_config', None): - cls_kwargs.update(cls.from_config(config, **kwargs)) - - if getattr(config, 'inject', None): - for k in config.inject: - target_key = config[k] - # optional dependency - if target_key is None: - continue - - if isinstance(target_key, dict) or hasattr(target_key, '__dict__'): - if 'name' not in target_key.keys(): - continue - inject_name = str(target_key['name']) - if inject_name not in global_config: - raise ValueError( - "Missing injection name {} and check it's name in cfg file". - format(k)) - target = global_config[inject_name] - for i, v in target_key.items(): - if i == 'name': - continue - target[i] = v - if isinstance(target, SchemaDict): - cls_kwargs[k] = create(inject_name) - elif isinstance(target_key, str): - if target_key not in global_config: - raise ValueError("Missing injection config:", target_key) - target = global_config[target_key] - if isinstance(target, SchemaDict): - cls_kwargs[k] = create(target_key) - elif hasattr(target, '__dict__'): # serialized object - cls_kwargs[k] = target - else: - raise ValueError("Unsupported injection type:", target_key) - # prevent modification of global config values of reference types - # (e.g., list, dict) from within the created module instances - #kwargs = copy.deepcopy(kwargs) - return cls(**cls_kwargs) diff --git a/pdfdet/models/Paddle/ppdet/data/__init__.py b/pdfdet/models/Paddle/ppdet/data/__init__.py deleted file mode 100644 index a12aa32..0000000 --- a/pdfdet/models/Paddle/ppdet/data/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import source -from . import transform -from . 
import reader - -from .source import * -from .transform import * -from .reader import * diff --git a/pdfdet/models/Paddle/ppdet/data/crop_utils/__init__.py b/pdfdet/models/Paddle/ppdet/data/crop_utils/__init__.py deleted file mode 100644 index 61d5aa2..0000000 --- a/pdfdet/models/Paddle/ppdet/data/crop_utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/data/crop_utils/annotation_cropper.py b/pdfdet/models/Paddle/ppdet/data/crop_utils/annotation_cropper.py deleted file mode 100644 index e288fab..0000000 --- a/pdfdet/models/Paddle/ppdet/data/crop_utils/annotation_cropper.py +++ /dev/null @@ -1,580 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import math -import random -import numpy as np -from copy import deepcopy -from typing import List, Tuple -from collections import defaultdict - -from .chip_box_utils import nms, transform_chip_boxes2image_boxes -from .chip_box_utils import find_chips_to_cover_overlaped_boxes -from .chip_box_utils import transform_chip_box -from .chip_box_utils import intersection_over_box - - -class AnnoCropper(object): - def __init__(self, - image_target_sizes: List[int], - valid_box_ratio_ranges: List[List[float]], - chip_target_size: int, - chip_target_stride: int, - use_neg_chip: bool=False, - max_neg_num_per_im: int=8, - max_per_img: int=-1, - nms_thresh: int=0.5): - """ - Generate chips by chip_target_size and chip_target_stride. - These two parameters just like kernel_size and stride in cnn. - - Each image has its raw size. After resizing, then get its target size. - The resizing scale = target_size / raw_size. - So are chips of the image. - box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size - The 'size' above mentioned is the size of long-side of image, box or chip. 
- - :param image_target_sizes: [2000, 1000] - :param valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]] - :param chip_target_size: 500 - :param chip_target_stride: 200 - """ - self.target_sizes = image_target_sizes - self.valid_box_ratio_ranges = valid_box_ratio_ranges - assert len(self.target_sizes) == len(self.valid_box_ratio_ranges) - self.scale_num = len(self.target_sizes) - self.chip_target_size = chip_target_size # is target size - self.chip_target_stride = chip_target_stride # is target stride - self.use_neg_chip = use_neg_chip - self.max_neg_num_per_im = max_neg_num_per_im - self.max_per_img = max_per_img - self.nms_thresh = nms_thresh - - def crop_anno_records(self, records: List[dict]): - """ - The main logic: - # foreach record(image): - # foreach scale: - # 1 generate chips by chip size and stride for each scale - # 2 get pos chips - # - validate boxes: current scale; h,w >= 1 - # - find pos chips greedily by valid gt boxes in each scale - # - for every valid gt box, find its corresponding pos chips in each scale - # 3 get neg chips - # - If given proposals, find neg boxes in them which are not in pos chips - # - If got neg boxes in last step, we find neg chips and assign neg boxes to neg chips such as 2. - # 4 sample neg chips if too much each image - # transform this image-scale annotations to chips(pos chips&neg chips) annotations - - :param records, standard coco_record but with extra key `proposals`(Px4), which are predicted by stage1 - model and maybe have neg boxes in them. - :return: new_records, list of dict like - { - 'im_file': 'fake_image1.jpg', - 'im_id': np.array([1]), # new _global_chip_id as im_id - 'h': h, # chip height - 'w': w, # chip width - 'is_crowd': is_crowd, # Nx1 -> Mx1 - 'gt_class': gt_class, # Nx1 -> Mx1 - 'gt_bbox': gt_bbox, # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2] - 'gt_poly': gt_poly, # [None]xN -> [None]xM - 'chip': [x1, y1, x2, y2] # added - } - - Attention: - ------------------------------>x - | - | (x1,y1)------ - | | | - | | | - | | | - | | | - | | | - | ---------- - | (x2,y2) - | - ↓ - y - - If we use [x1, y1, x2, y2] to represent boxes or chips, - (x1,y1) is the left-top point which is in the box, - but (x2,y2) is the right-bottom point which is not in the box. - So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h]. - And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area. - """ - - self.chip_records = [] - self._global_chip_id = 1 - for r in records: - self._cur_im_pos_chips = [ - ] # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int] - self._cur_im_neg_chips = [] # element: (chip, neg_box_num) - for scale_i in range(self.scale_num): - self._get_current_scale_parameters(scale_i, r) - - # Cx4 - chips = self._create_chips(r['h'], r['w'], self._cur_scale) - - # # dict: chipid->[box_id, ...] 
- pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips( - r['gt_bbox'], chips) - - # dict: chipid->neg_box_num - neg_chip2box_num = self._get_neg_boxes_and_chips( - chips, - list(pos_chip2boxes_idx.keys()), r.get('proposals', None)) - - self._add_to_cur_im_chips(chips, pos_chip2boxes_idx, - neg_chip2box_num) - - cur_image_records = self._trans_all_chips2annotations(r) - self.chip_records.extend(cur_image_records) - return self.chip_records - - def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num): - for pos_chipid, boxes_idx in pos_chip2boxes_idx.items(): - chip = np.array(chips[pos_chipid]) # copy chips slice - self._cur_im_pos_chips.append((chip, boxes_idx)) - - if neg_chip2box_num is None: - return - - for neg_chipid, neg_box_num in neg_chip2box_num.items(): - chip = np.array(chips[neg_chipid]) - self._cur_im_neg_chips.append((chip, neg_box_num)) - - def _trans_all_chips2annotations(self, r): - gt_bbox = r['gt_bbox'] - im_file = r['im_file'] - is_crowd = r['is_crowd'] - gt_class = r['gt_class'] - # gt_poly = r['gt_poly'] # [None]xN - # remaining keys: im_id, h, w - chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox, - is_crowd, gt_class) - - if not self.use_neg_chip: - return chip_records - - sampled_neg_chips = self._sample_neg_chips() - neg_chip_records = self._trans_neg_chips2annotations(im_file, - sampled_neg_chips) - chip_records.extend(neg_chip_records) - return chip_records - - def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd, - gt_class): - chip_records = [] - for chip, boxes_idx in self._cur_im_pos_chips: - chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx, - chip) - x1, y1, x2, y2 = chip - chip_h = y2 - y1 - chip_w = x2 - x1 - rec = { - 'im_file': im_file, - 'im_id': np.array([self._global_chip_id]), - 'h': chip_h, - 'w': chip_w, - 'gt_bbox': chip_bbox, - 'is_crowd': is_crowd[final_boxes_idx].copy(), - 'gt_class': gt_class[final_boxes_idx].copy(), - # 'gt_poly': [None] * len(final_boxes_idx), - 'chip': chip - } - self._global_chip_id += 1 - chip_records.append(rec) - return chip_records - - def _sample_neg_chips(self): - pos_num = len(self._cur_im_pos_chips) - neg_num = len(self._cur_im_neg_chips) - sample_num = min(pos_num + 2, self.max_neg_num_per_im) - assert sample_num >= 1 - if neg_num <= sample_num: - return self._cur_im_neg_chips - - candidate_num = int(sample_num * 1.5) - candidate_neg_chips = sorted( - self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num] - random.shuffle(candidate_neg_chips) - sampled_neg_chips = candidate_neg_chips[:sample_num] - return sampled_neg_chips - - def _trans_neg_chips2annotations(self, - im_file: str, - sampled_neg_chips: List[Tuple]): - chip_records = [] - for chip, neg_box_num in sampled_neg_chips: - x1, y1, x2, y2 = chip - chip_h = y2 - y1 - chip_w = x2 - x1 - rec = { - 'im_file': im_file, - 'im_id': np.array([self._global_chip_id]), - 'h': chip_h, - 'w': chip_w, - 'gt_bbox': np.zeros( - (0, 4), dtype=np.float32), - 'is_crowd': np.zeros( - (0, 1), dtype=np.int32), - 'gt_class': np.zeros( - (0, 1), dtype=np.int32), - # 'gt_poly': [], - 'chip': chip - } - self._global_chip_id += 1 - chip_records.append(rec) - return chip_records - - def _get_current_scale_parameters(self, scale_i, r): - im_size = max(r['h'], r['w']) - im_target_size = self.target_sizes[scale_i] - self._cur_im_size, self._cur_im_target_size = im_size, im_target_size - self._cur_scale = self._get_current_scale(im_target_size, im_size) - self._cur_valid_ratio_range = 
self.valid_box_ratio_ranges[scale_i] - - def _get_current_scale(self, im_target_size, im_size): - return im_target_size / im_size - - def _create_chips(self, h: int, w: int, scale: float): - """ - Generate chips by chip_target_size and chip_target_stride. - These two parameters just like kernel_size and stride in cnn. - :return: chips, Cx4, xy in raw size dimension - """ - chip_size = self.chip_target_size # omit target for simplicity - stride = self.chip_target_stride - width = int(scale * w) - height = int(scale * h) - min_chip_location_diff = 20 # in target size - - assert chip_size >= stride - chip_overlap = chip_size - stride - if (width - chip_overlap - ) % stride > min_chip_location_diff: # 不能被stride整除的部分比较大,则保留 - w_steps = max(1, int(math.ceil((width - chip_overlap) / stride))) - else: # 不能被stride整除的部分比较小,则丢弃 - w_steps = max(1, int(math.floor((width - chip_overlap) / stride))) - if (height - chip_overlap) % stride > min_chip_location_diff: - h_steps = max(1, int(math.ceil((height - chip_overlap) / stride))) - else: - h_steps = max(1, int(math.floor((height - chip_overlap) / stride))) - - chips = list() - for j in range(h_steps): - for i in range(w_steps): - x1 = i * stride - y1 = j * stride - x2 = min(x1 + chip_size, width) - y2 = min(y1 + chip_size, height) - chips.append([x1, y1, x2, y2]) - - # check chip size - for item in chips: - if item[2] - item[0] > chip_size * 1.1 or item[3] - item[ - 1] > chip_size * 1.1: - raise ValueError(item) - chips = np.array(chips, dtype=np.float32) - - raw_size_chips = chips / scale - return raw_size_chips - - def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips): - valid_ratio_range = self._cur_valid_ratio_range - im_size = self._cur_im_size - scale = self._cur_scale - # Nx4 N - valid_boxes, valid_boxes_idx = self._validate_boxes( - valid_ratio_range, im_size, gt_bbox, scale) - # dict: chipid->[box_id, ...] - pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes, - valid_boxes_idx) - return pos_chip2boxes_idx - - def _validate_boxes(self, - valid_ratio_range: List[float], - im_size: int, - gt_boxes: 'np.array of Nx4', - scale: float): - """ - :return: valid_boxes: Nx4, valid_boxes_idx: N - """ - ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32) - hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32) - maxs = np.maximum(ws, hs) - box_ratio = maxs / im_size - mins = np.minimum(ws, hs) - target_mins = mins * scale - - low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0 - high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo( - np.float32).max - - valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & ( - target_mins >= 2))[0] - valid_boxes = gt_boxes[valid_boxes_idx] - return valid_boxes, valid_boxes_idx - - def _find_pos_chips(self, - chips: 'Cx4', - valid_boxes: 'Bx4', - valid_boxes_idx: 'B'): - """ - :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...] - """ - iob = intersection_over_box(chips, valid_boxes) # overlap, CxB - - iob_threshold_to_find_chips = 1. 
- pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes( - iob, iob_threshold_to_find_chips) - pos_chip_ids = set(pos_chip_ids) - - iob_threshold_to_assign_box = 0.5 - pos_chip2boxes_idx = self._assign_boxes_to_pos_chips( - iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx) - return pos_chip2boxes_idx - - def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold): - return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold) - - def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids, - valid_boxes_idx): - chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) - pos_chip2boxes_idx = defaultdict(list) - for chip_id, box_id in zip(chip_ids, box_ids): - if chip_id not in pos_chip_ids: - continue - raw_gt_box_idx = valid_boxes_idx[box_id] - pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx) - return pos_chip2boxes_idx - - def _get_neg_boxes_and_chips(self, - chips: 'Cx4', - pos_chip_ids: 'D', - proposals: 'Px4'): - """ - :param chips: - :param pos_chip_ids: - :param proposals: - :return: neg_chip2box_num, None or dict: chipid->neg_box_num - """ - if not self.use_neg_chip: - return None - - # train proposals maybe None - if proposals is None or len(proposals) < 1: - return None - - valid_ratio_range = self._cur_valid_ratio_range - im_size = self._cur_im_size - scale = self._cur_scale - - valid_props, _ = self._validate_boxes(valid_ratio_range, im_size, - proposals, scale) - neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props) - neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes) - return neg_chip2box_num - - def _find_neg_boxes(self, - chips: 'Cx4', - pos_chip_ids: 'D', - valid_props: 'Px4'): - """ - :return: neg_boxes: Nx4 - """ - if len(pos_chip_ids) == 0: - return valid_props - - pos_chips = chips[pos_chip_ids] - iob = intersection_over_box(pos_chips, valid_props) - overlap_per_prop = np.max(iob, axis=0) - non_overlap_props_idx = overlap_per_prop < 0.5 - neg_boxes = valid_props[non_overlap_props_idx] - return neg_boxes - - def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D', - neg_boxes: 'Nx4'): - """ - :return: neg_chip2box_num, dict: chipid->neg_box_num - """ - neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids) - neg_chips = chips[neg_chip_ids] - - iob = intersection_over_box(neg_chips, neg_boxes) - iob_threshold_to_find_chips = 0.7 - chosen_neg_chip_ids, chip_id2overlap_box_num = \ - self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips) - - neg_chipid2box_num = {} - for cid in chosen_neg_chip_ids: - box_num = chip_id2overlap_box_num[cid] - raw_chip_id = neg_chip_ids[cid] - neg_chipid2box_num[raw_chip_id] = box_num - return neg_chipid2box_num - - def crop_infer_anno_records(self, records: List[dict]): - """ - transform image record to chips record - :param records: - :return: new_records, list of dict like - { - 'im_file': 'fake_image1.jpg', - 'im_id': np.array([1]), # new _global_chip_id as im_id - 'h': h, # chip height - 'w': w, # chip width - 'chip': [x1, y1, x2, y2] # added - 'ori_im_h': ori_im_h # added, origin image height - 'ori_im_w': ori_im_w # added, origin image width - 'scale_i': 0 # added, - } - """ - self.chip_records = [] - self._global_chip_id = 1 # im_id start from 1 - self._global_chip_id2img_id = {} - - for r in records: - for scale_i in range(self.scale_num): - self._get_current_scale_parameters(scale_i, r) - # Cx4 - chips = self._create_chips(r['h'], r['w'], self._cur_scale) - cur_img_chip_record = self._get_chips_records(r, chips, 
scale_i) - self.chip_records.extend(cur_img_chip_record) - - return self.chip_records - - def _get_chips_records(self, rec, chips, scale_i): - cur_img_chip_records = [] - ori_im_h = rec["h"] - ori_im_w = rec["w"] - im_file = rec["im_file"] - ori_im_id = rec["im_id"] - for id, chip in enumerate(chips): - chip_rec = {} - x1, y1, x2, y2 = chip - chip_h = y2 - y1 - chip_w = x2 - x1 - chip_rec["im_file"] = im_file - chip_rec["im_id"] = self._global_chip_id - chip_rec["h"] = chip_h - chip_rec["w"] = chip_w - chip_rec["chip"] = chip - chip_rec["ori_im_h"] = ori_im_h - chip_rec["ori_im_w"] = ori_im_w - chip_rec["scale_i"] = scale_i - - self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id) - self._global_chip_id += 1 - cur_img_chip_records.append(chip_rec) - - return cur_img_chip_records - - def aggregate_chips_detections(self, results, records=None): - """ - # 1. transform chip dets to image dets - # 2. nms boxes per image; - # 3. format output results - :param results: - :param roidb: - :return: - """ - results = deepcopy(results) - records = records if records else self.chip_records - img_id2bbox = self._transform_chip2image_bboxes(results, records) - nms_img_id2bbox = self._nms_dets(img_id2bbox) - aggregate_results = self._reformat_results(nms_img_id2bbox) - return aggregate_results - - def _transform_chip2image_bboxes(self, results, records): - # 1. Transform chip dets to image dets; - # 2. Filter valid range; - # 3. Reformat and Aggregate chip dets to Get scale_cls_dets - img_id2bbox = defaultdict(list) - for result in results: - bbox_locs = result['bbox'] - bbox_nums = result['bbox_num'] - if len(bbox_locs) == 1 and bbox_locs[0][ - 0] == -1: # current batch has no detections - # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]] - # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1. - continue - im_ids = result['im_id'] # replace with range(len(bbox_nums)) - - last_bbox_num = 0 - for idx, im_id in enumerate(im_ids): - - cur_bbox_len = bbox_nums[idx] - bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len] - last_bbox_num += cur_bbox_len - # box: [num_id, score, xmin, ymin, xmax, ymax] - if len(bboxes) == 0: # current image has no detections - continue - - chip_rec = records[int(im_id) - - 1] # im_id starts from 1, type is np.int64 - image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"]) - - bboxes = transform_chip_boxes2image_boxes( - bboxes, chip_rec["chip"], chip_rec["ori_im_h"], - chip_rec["ori_im_w"]) - - scale_i = chip_rec["scale_i"] - cur_scale = self._get_current_scale(self.target_sizes[scale_i], - image_size) - _, valid_boxes_idx = self._validate_boxes( - self.valid_box_ratio_ranges[scale_i], image_size, - bboxes[:, 2:], cur_scale) - ori_img_id = self._global_chip_id2img_id[int(im_id)] - - img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx]) - - return img_id2bbox - - def _nms_dets(self, img_id2bbox): - # 1. NMS on each image-class - # 2. 
Limit number of detections to MAX_PER_IMAGE if requested - max_per_img = self.max_per_img - nms_thresh = self.nms_thresh - - for img_id in img_id2bbox: - box = img_id2bbox[ - img_id] # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] - box = np.concatenate(box, axis=0) - nms_dets = nms(box, nms_thresh) - if max_per_img > 0: - if len(nms_dets) > max_per_img: - keep = np.argsort(-nms_dets[:, 1])[:max_per_img] - nms_dets = nms_dets[keep] - - img_id2bbox[img_id] = nms_dets - - return img_id2bbox - - def _reformat_results(self, img_id2bbox): - """reformat results""" - im_ids = img_id2bbox.keys() - results = [] - for img_id in im_ids: # output by original im_id order - if len(img_id2bbox[img_id]) == 0: - bbox = np.array( - [[-1., 0., 0., 0., 0., 0.]]) # edge case: no detections - bbox_num = np.array([0]) - else: - # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] - bbox = img_id2bbox[img_id] - bbox_num = np.array([len(bbox)]) - res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num) - results.append(res) - return results diff --git a/pdfdet/models/Paddle/ppdet/data/crop_utils/chip_box_utils.py b/pdfdet/models/Paddle/ppdet/data/crop_utils/chip_box_utils.py deleted file mode 100644 index cfa1e39..0000000 --- a/pdfdet/models/Paddle/ppdet/data/crop_utils/chip_box_utils.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -def bbox_area(boxes): - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -def intersection_over_box(chips, boxes): - """ - intersection area over box area - :param chips: C - :param boxes: B - :return: iob, CxB - """ - M = chips.shape[0] - N = boxes.shape[0] - if M * N == 0: - return np.zeros([M, N], dtype='float32') - - box_area = bbox_area(boxes) # B - - inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:], - boxes[:, 2:]) # CxBX2 - inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2], - boxes[:, :2]) # CxBx2 - inter_wh = inter_x2y2 - inter_x1y1 - inter_wh = np.clip(inter_wh, a_min=0, a_max=None) - inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1] # CxB - - iob = inter_area / np.expand_dims(box_area, 0) - return iob - - -def clip_boxes(boxes, im_shape): - """ - Clip boxes to image boundaries. 
- :param boxes: [N, 4] - :param im_shape: tuple of 2, [h, w] - :return: [N, 4] - """ - # x1 >= 0 - boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1) - # y1 >= 0 - boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1) - # x2 < im_shape[1] - boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1]) - # y2 < im_shape[0] - boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0]) - return boxes - - -def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'): - boxes_idx = np.array(boxes_idx) - cur_gt_bbox = gt_bbox[boxes_idx].copy() # Bx4 - x1, y1, x2, y2 = chip - cur_gt_bbox[:, 0] -= x1 - cur_gt_bbox[:, 1] -= y1 - cur_gt_bbox[:, 2] -= x1 - cur_gt_bbox[:, 3] -= y1 - h = y2 - y1 - w = x2 - x1 - cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w)) - ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32) - hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32) - valid_idx = (ws >= 2) & (hs >= 2) - return cur_gt_bbox[valid_idx], boxes_idx[valid_idx] - - -def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold): - chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) - chip_id2overlap_box_num = np.bincount(chip_ids) # 1d array - chip_id2overlap_box_num = np.pad( - chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)), - constant_values=0) - - chosen_chip_ids = [] - while len(box_ids) > 0: - value_counts = np.bincount(chip_ids) # 1d array - max_count_chip_id = np.argmax(value_counts) - assert max_count_chip_id not in chosen_chip_ids - chosen_chip_ids.append(max_count_chip_id) - - box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id] - ids_not_in_cur_boxes_mask = np.logical_not( - np.isin(box_ids, box_ids_in_cur_chip)) - chip_ids = chip_ids[ids_not_in_cur_boxes_mask] - box_ids = box_ids[ids_not_in_cur_boxes_mask] - return chosen_chip_ids, chip_id2overlap_box_num - - -def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w): - chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1])) - xmin, ymin, _, _ = chip - # Transform to origin image loc - chip_boxes[:, 2] += xmin - chip_boxes[:, 4] += xmin - chip_boxes[:, 3] += ymin - chip_boxes[:, 5] += ymin - chip_boxes = clip_boxes(chip_boxes, (img_h, img_w)) - return chip_boxes - - -def nms(dets, thresh): - """Apply classic DPM-style greedy NMS.""" - if dets.shape[0] == 0: - return dets[[], :] - scores = dets[:, 1] - x1 = dets[:, 2] - y1 = dets[:, 3] - x2 = dets[:, 4] - y2 = dets[:, 5] - - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - - # nominal indices - # _i, _j - # sorted indices - # i, j - # temp variables for box i's (the box currently under consideration) - # ix1, iy1, ix2, iy2, iarea - - # variables for computing overlap with box j (lower scoring box) - # xx1, yy1, xx2, yy2 - # w, h - # inter, ovr - - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - ovr = inter / (iarea + areas[j] - inter) - if ovr >= thresh: - suppressed[j] = 1 - keep = np.where(suppressed == 0)[0] - dets = dets[keep, :] - return dets diff --git a/pdfdet/models/Paddle/ppdet/data/culane_utils.py b/pdfdet/models/Paddle/ppdet/data/culane_utils.py deleted file mode 
100644 index ea8c948..0000000 --- a/pdfdet/models/Paddle/ppdet/data/culane_utils.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -import numpy as np -from imgaug.augmentables.lines import LineString -from scipy.interpolate import InterpolatedUnivariateSpline - - -def lane_to_linestrings(lanes): - lines = [] - for lane in lanes: - lines.append(LineString(lane)) - - return lines - - -def linestrings_to_lanes(lines): - lanes = [] - for line in lines: - lanes.append(line.coords) - - return lanes - - -def sample_lane(points, sample_ys, img_w): - # this function expects the points to be sorted - points = np.array(points) - if not np.all(points[1:, 1] < points[:-1, 1]): - raise Exception('Annotaion points have to be sorted') - x, y = points[:, 0], points[:, 1] - - # interpolate points inside domain - assert len(points) > 1 - interp = InterpolatedUnivariateSpline( - y[::-1], x[::-1], k=min(3, len(points) - 1)) - domain_min_y = y.min() - domain_max_y = y.max() - sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & ( - sample_ys <= domain_max_y)] - assert len(sample_ys_inside_domain) > 0 - interp_xs = interp(sample_ys_inside_domain) - - # extrapolate lane to the bottom of the image with a straight line using the 2 points closest to the bottom - two_closest_points = points[:2] - extrap = np.polyfit( - two_closest_points[:, 1], two_closest_points[:, 0], deg=1) - extrap_ys = sample_ys[sample_ys > domain_max_y] - extrap_xs = np.polyval(extrap, extrap_ys) - all_xs = np.hstack((extrap_xs, interp_xs)) - - # separate between inside and outside points - inside_mask = (all_xs >= 0) & (all_xs < img_w) - xs_inside_image = all_xs[inside_mask] - xs_outside_image = all_xs[~inside_mask] - - return xs_outside_image, xs_inside_image - - -def filter_lane(lane): - assert lane[-1][1] <= lane[0][1] - filtered_lane = [] - used = set() - for p in lane: - if p[1] not in used: - filtered_lane.append(p) - used.add(p[1]) - - return filtered_lane - - -def transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys, - n_strips, strip_size, anno): - old_lanes = anno['lanes'] - - # removing lanes with less than 2 points - old_lanes = filter(lambda x: len(x) > 1, old_lanes) - # sort lane points by Y (bottom to top of the image) - old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes] - # remove points with same Y (keep first occurrence) - old_lanes = [filter_lane(lane) for lane in old_lanes] - # normalize the annotation coordinates - old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)] - for x, y in lane] for lane in old_lanes] - # create tranformed annotations - lanes = np.ones( - (max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32 - ) * -1e5 # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates - lanes_endpoints = np.ones((max_lanes, 2)) - # lanes are invalid by default - lanes[:, 0] = 1 - lanes[:, 1] = 0 - for lane_idx, lane in enumerate(old_lanes): - if lane_idx >= max_lanes: - break - - try: - xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys, - img_w) - except AssertionError: - continue - if len(xs_inside_image) <= 1: - continue - all_xs = np.hstack((xs_outside_image, xs_inside_image)) - lanes[lane_idx, 0] = 0 - lanes[lane_idx, 1] = 1 - lanes[lane_idx, 2] = len(xs_outside_image) / n_strips - lanes[lane_idx, 3] = xs_inside_image[0] - - thetas = [] - for i in range(1, len(xs_inside_image)): - theta = math.atan( - i * strip_size / - (xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi - theta = theta if theta > 0 else 1 - abs(theta) - 
thetas.append(theta) - - theta_far = sum(thetas) / len(thetas) - - # lanes[lane_idx, - # 4] = (theta_closest + theta_far) / 2 # averaged angle - lanes[lane_idx, 4] = theta_far - lanes[lane_idx, 5] = len(xs_inside_image) - lanes[lane_idx, 6:6 + len(all_xs)] = all_xs - lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips - lanes_endpoints[lane_idx, 1] = xs_inside_image[-1] - - new_anno = { - 'label': lanes, - 'old_anno': anno, - 'lane_endpoints': lanes_endpoints - } - return new_anno diff --git a/pdfdet/models/Paddle/ppdet/data/reader.py b/pdfdet/models/Paddle/ppdet/data/reader.py deleted file mode 100644 index c40f3c3..0000000 --- a/pdfdet/models/Paddle/ppdet/data/reader.py +++ /dev/null @@ -1,615 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -import traceback -import six -import sys -if sys.version_info >= (3, 0): - pass -else: - pass -import numpy as np -import paddle -import paddle.nn.functional as F - -from copy import deepcopy - -from paddle.io import DataLoader, DistributedBatchSampler -from .utils import default_collate_fn - -from ppdet.core.workspace import register -from . import transform -from .shm_utils import _get_shared_memory_size_in_M - -from ppdet.utils.logger import setup_logger -logger = setup_logger('reader') - -MAIN_PID = os.getpid() - - -class Compose(object): - def __init__(self, transforms, num_classes=80): - self.transforms = transforms - self.transforms_cls = [] - for t in self.transforms: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - - self.transforms_cls.append(f) - - def __call__(self, data): - for f in self.transforms_cls: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map sample transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - return data - - -class BatchCompose(Compose): - def __init__(self, transforms, num_classes=80, collate_batch=True): - super(BatchCompose, self).__init__(transforms, num_classes) - self.collate_batch = collate_batch - - def __call__(self, data): - for f in self.transforms_cls: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map batch transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - # remove keys which is not needed by model - extra_key = ['h', 'w', 'flipped'] - for k in extra_key: - for sample in data: - if k in sample: - sample.pop(k) - - # batch data, if user-define batch function needed - # use user-defined here - if self.collate_batch: - batch_data = default_collate_fn(data) - else: - batch_data = {} - for k in data[0].keys(): - tmp_data = [] - for i in range(len(data)): - tmp_data.append(data[i][k]) - if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: - tmp_data = np.stack(tmp_data, axis=0) - 
batch_data[k] = tmp_data - return batch_data - - -class BaseDataLoader(object): - """ - Base DataLoader implementation for detection models - - Args: - sample_transforms (list): a list of transforms to perform - on each sample - batch_transforms (list): a list of transforms to perform - on batch - batch_size (int): batch size for batch collating, default 1. - shuffle (bool): whether to shuffle samples - drop_last (bool): whether to drop the last incomplete, - default False - num_classes (int): class number of dataset, default 80 - collate_batch (bool): whether to collate batch in dataloader. - If set to True, the samples will collate into batch according - to the batch size. Otherwise, the ground-truth will not collate, - which is used when the number of ground-truch is different in - samples. - use_shared_memory (bool): whether to use shared memory to - accelerate data loading, enable this only if you - are sure that the shared memory size of your OS - is larger than memory cost of input datas of model. - Note that shared memory will be automatically - disabled if the shared memory of OS is less than - 1G, which is not enough for detection models. - Default False. - """ - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=80, - collate_batch=True, - use_shared_memory=False, - **kwargs): - # sample transform - self._sample_transforms = Compose( - sample_transforms, num_classes=num_classes) - - # batch transfrom - self._batch_transforms = BatchCompose(batch_transforms, num_classes, - collate_batch) - self.batch_size = batch_size - self.shuffle = shuffle - self.drop_last = drop_last - self.use_shared_memory = use_shared_memory - self.kwargs = kwargs - - def __call__(self, - dataset, - worker_num, - batch_sampler=None, - return_list=False): - self.dataset = dataset - self.dataset.check_or_download_dataset() - self.dataset.parse_dataset() - # get data - self.dataset.set_transform(self._sample_transforms) - # set kwargs - self.dataset.set_kwargs(**self.kwargs) - # batch sampler - if batch_sampler is None: - self._batch_sampler = DistributedBatchSampler( - self.dataset, - batch_size=self.batch_size, - shuffle=self.shuffle, - drop_last=self.drop_last) - else: - self._batch_sampler = batch_sampler - - # DataLoader do not start sub-process in Windows and Mac - # system, do not need to use shared memory - use_shared_memory = self.use_shared_memory and \ - sys.platform not in ['win32', 'darwin'] - # check whether shared memory size is bigger than 1G(1024M) - if use_shared_memory: - shm_size = _get_shared_memory_size_in_M() - if shm_size is not None and shm_size < 1024.: - logger.warning("Shared memory size is less than 1G, " - "disable shared_memory in DataLoader") - use_shared_memory = False - - self.dataloader = DataLoader( - dataset=self.dataset, - batch_sampler=self._batch_sampler, - collate_fn=self._batch_transforms, - num_workers=worker_num, - return_list=return_list, - use_shared_memory=use_shared_memory) - self.loader = iter(self.dataloader) - - return self - - def __len__(self): - return len(self._batch_sampler) - - def __iter__(self): - return self - - def __next__(self): - try: - return next(self.loader) - except StopIteration: - self.loader = iter(self.dataloader) - six.reraise(*sys.exc_info()) - - def next(self): - # python2 compatibility - return self.__next__() - - -@register -class TrainReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - 
batch_transforms=[], - batch_size=1, - shuffle=True, - drop_last=True, - num_classes=80, - collate_batch=True, - **kwargs): - super(TrainReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, collate_batch, **kwargs) - - -@register -class EvalReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=80, - **kwargs): - super(EvalReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -@register -class TestReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=80, - **kwargs): - super(TestReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -@register -class EvalMOTReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=1, - **kwargs): - super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -@register -class TestMOTReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=1, - **kwargs): - super(TestMOTReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -# For Semi-Supervised Object Detection (SSOD) -class Compose_SSOD(object): - def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80): - self.base_transforms = base_transforms - self.base_transforms_cls = [] - for t in self.base_transforms: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - self.base_transforms_cls.append(f) - - self.weak_augs = weak_aug - self.weak_augs_cls = [] - for t in self.weak_augs: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - self.weak_augs_cls.append(f) - - self.strong_augs = strong_aug - self.strong_augs_cls = [] - for t in self.strong_augs: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - self.strong_augs_cls.append(f) - - def __call__(self, data): - for f in self.base_transforms_cls: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map sample transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - weak_data = deepcopy(data) - strong_data = deepcopy(data) - for f in self.weak_augs_cls: - try: - weak_data = f(weak_data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map weak aug [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - for f in self.strong_augs_cls: - try: - strong_data = f(strong_data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map strong aug [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e 
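The three compose stages above (base_transforms, weak_augs, strong_augs) implement the usual teacher-student split: one shared base pipeline, then weak and strong branches applied to deep copies of the same sample, with the strong view attached to the weak one just below. A toy sketch of that control flow, using stand-in transforms rather than ppdet ops:

    from copy import deepcopy

    def base(sample):    # shared decode/resize transforms
        sample["decoded"] = True
        return sample

    def weak(sample):    # e.g. random flip only
        sample["aug"] = "weak"
        return sample

    def strong(sample):  # e.g. color jitter + cutout
        sample["aug"] = "strong"
        return sample

    def compose_ssod(sample):
        sample = base(sample)
        weak_data, strong_data = deepcopy(sample), deepcopy(sample)
        weak_data, strong_data = weak(weak_data), strong(strong_data)
        weak_data["strong_aug"] = strong_data  # strong view rides along
        return weak_data

    out = compose_ssod({"image": "..."})
    assert out["aug"] == "weak" and out["strong_aug"]["aug"] == "strong"

BatchCompose_SSOD later pops 'strong_aug' back out so the two views can be collated into separate batches.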
- - weak_data['strong_aug'] = strong_data - return weak_data - - -class BatchCompose_SSOD(Compose): - def __init__(self, transforms, num_classes=80, collate_batch=True): - super(BatchCompose_SSOD, self).__init__(transforms, num_classes) - self.collate_batch = collate_batch - - def __call__(self, data): - # split strong_data from data(weak_data) - strong_data = [] - for sample in data: - strong_data.append(sample['strong_aug']) - sample.pop('strong_aug') - - for f in self.transforms_cls: - try: - data = f(data) - if 'BatchRandomResizeForSSOD' in f._id: - strong_data = f(strong_data, data[1])[0] - data = data[0] - else: - strong_data = f(strong_data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map batch transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - # remove keys which is not needed by model - extra_key = ['h', 'w', 'flipped'] - for k in extra_key: - for sample in data: - if k in sample: - sample.pop(k) - for sample in strong_data: - if k in sample: - sample.pop(k) - - # batch data, if user-define batch function needed - # use user-defined here - if self.collate_batch: - batch_data = default_collate_fn(data) - strong_batch_data = default_collate_fn(strong_data) - return batch_data, strong_batch_data - else: - batch_data = {} - for k in data[0].keys(): - tmp_data = [] - for i in range(len(data)): - tmp_data.append(data[i][k]) - if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: - tmp_data = np.stack(tmp_data, axis=0) - batch_data[k] = tmp_data - - strong_batch_data = {} - for k in strong_data[0].keys(): - tmp_data = [] - for i in range(len(strong_data)): - tmp_data.append(strong_data[i][k]) - if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: - tmp_data = np.stack(tmp_data, axis=0) - strong_batch_data[k] = tmp_data - - return batch_data, strong_batch_data - - -class CombineSSODLoader(object): - def __init__(self, label_loader, unlabel_loader): - self.label_loader = label_loader - self.unlabel_loader = unlabel_loader - - def __iter__(self): - while True: - try: - label_samples = next(self.label_loader_iter) - except: - self.label_loader_iter = iter(self.label_loader) - label_samples = next(self.label_loader_iter) - - try: - unlabel_samples = next(self.unlabel_loader_iter) - except: - self.unlabel_loader_iter = iter(self.unlabel_loader) - unlabel_samples = next(self.unlabel_loader_iter) - - yield ( - label_samples[0], # sup weak - label_samples[1], # sup strong - unlabel_samples[0], # unsup weak - unlabel_samples[1] # unsup strong - ) - - def __call__(self): - return self.__iter__() - - -class BaseSemiDataLoader(object): - def __init__(self, - sample_transforms=[], - weak_aug=[], - strong_aug=[], - sup_batch_transforms=[], - unsup_batch_transforms=[], - sup_batch_size=1, - unsup_batch_size=1, - shuffle=True, - drop_last=True, - num_classes=80, - collate_batch=True, - use_shared_memory=False, - **kwargs): - # sup transforms - self._sample_transforms_label = Compose_SSOD( - sample_transforms, weak_aug, strong_aug, num_classes=num_classes) - self._batch_transforms_label = BatchCompose_SSOD( - sup_batch_transforms, num_classes, collate_batch) - self.batch_size_label = sup_batch_size - - # unsup transforms - self._sample_transforms_unlabel = Compose_SSOD( - sample_transforms, weak_aug, strong_aug, num_classes=num_classes) - self._batch_transforms_unlabel = BatchCompose_SSOD( - unsup_batch_transforms, num_classes, collate_batch) - self.batch_size_unlabel = 
unsup_batch_size - - # common - self.shuffle = shuffle - self.drop_last = drop_last - self.use_shared_memory = use_shared_memory - self.kwargs = kwargs - - def __call__(self, - dataset_label, - dataset_unlabel, - worker_num, - batch_sampler_label=None, - batch_sampler_unlabel=None, - return_list=False): - # sup dataset - self.dataset_label = dataset_label - self.dataset_label.check_or_download_dataset() - self.dataset_label.parse_dataset() - self.dataset_label.set_transform(self._sample_transforms_label) - self.dataset_label.set_kwargs(**self.kwargs) - if batch_sampler_label is None: - self._batch_sampler_label = DistributedBatchSampler( - self.dataset_label, - batch_size=self.batch_size_label, - shuffle=self.shuffle, - drop_last=self.drop_last) - else: - self._batch_sampler_label = batch_sampler_label - - # unsup dataset - self.dataset_unlabel = dataset_unlabel - self.dataset_unlabel.length = self.dataset_label.__len__() - self.dataset_unlabel.check_or_download_dataset() - self.dataset_unlabel.parse_dataset() - self.dataset_unlabel.set_transform(self._sample_transforms_unlabel) - self.dataset_unlabel.set_kwargs(**self.kwargs) - if batch_sampler_unlabel is None: - self._batch_sampler_unlabel = DistributedBatchSampler( - self.dataset_unlabel, - batch_size=self.batch_size_unlabel, - shuffle=self.shuffle, - drop_last=self.drop_last) - else: - self._batch_sampler_unlabel = batch_sampler_unlabel - - # DataLoader do not start sub-process in Windows and Mac - # system, do not need to use shared memory - use_shared_memory = self.use_shared_memory and \ - sys.platform not in ['win32', 'darwin'] - # check whether shared memory size is bigger than 1G(1024M) - if use_shared_memory: - shm_size = _get_shared_memory_size_in_M() - if shm_size is not None and shm_size < 1024.: - logger.warning("Shared memory size is less than 1G, " - "disable shared_memory in DataLoader") - use_shared_memory = False - - self.dataloader_label = DataLoader( - dataset=self.dataset_label, - batch_sampler=self._batch_sampler_label, - collate_fn=self._batch_transforms_label, - num_workers=worker_num, - return_list=return_list, - use_shared_memory=use_shared_memory) - - self.dataloader_unlabel = DataLoader( - dataset=self.dataset_unlabel, - batch_sampler=self._batch_sampler_unlabel, - collate_fn=self._batch_transforms_unlabel, - num_workers=worker_num, - return_list=return_list, - use_shared_memory=use_shared_memory) - - self.dataloader = CombineSSODLoader(self.dataloader_label, - self.dataloader_unlabel) - self.loader = iter(self.dataloader) - return self - - def __len__(self): - return len(self._batch_sampler_label) - - def __iter__(self): - return self - - def __next__(self): - return next(self.loader) - - def next(self): - # python2 compatibility - return self.__next__() - - -@register -class SemiTrainReader(BaseSemiDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - weak_aug=[], - strong_aug=[], - sup_batch_transforms=[], - unsup_batch_transforms=[], - sup_batch_size=1, - unsup_batch_size=1, - shuffle=True, - drop_last=True, - num_classes=80, - collate_batch=True, - **kwargs): - super(SemiTrainReader, self).__init__( - sample_transforms, weak_aug, strong_aug, sup_batch_transforms, - unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle, - drop_last, num_classes, collate_batch, **kwargs) diff --git a/pdfdet/models/Paddle/ppdet/data/shm_utils.py b/pdfdet/models/Paddle/ppdet/data/shm_utils.py deleted file mode 100644 index a929a80..0000000 --- 
a/pdfdet/models/Paddle/ppdet/data/shm_utils.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -SIZE_UNIT = ['K', 'M', 'G', 'T'] -SHM_QUERY_CMD = 'df -h' -SHM_KEY = 'shm' -SHM_DEFAULT_MOUNT = '/dev/shm' - -# [ shared memory size check ] -# In detection models, image/target data occupies a lot of memory, and -# will occupy lots of shared memory in multi-process DataLoader, we use -# following code to get shared memory size and perform a size check to -# disable shared memory use if shared memory size is not enough. -# Shared memory getting process as follows: -# 1. use `df -h` get all mount info -# 2. pick up spaces whose mount info contains 'shm' -# 3. if 'shm' space number is only 1, return its size -# 4. if there are multiple 'shm' space, try to find the default mount -# directory '/dev/shm' is Linux-like system, otherwise return the -# biggest space size. - - -def _parse_size_in_M(size_str): - if size_str[-1] == 'B': - num, unit = size_str[:-2], size_str[-2] - else: - num, unit = size_str[:-1], size_str[-1] - assert unit in SIZE_UNIT, \ - "unknown shm size unit {}".format(unit) - return float(num) * \ - (1024 ** (SIZE_UNIT.index(unit) - 1)) - - -def _get_shared_memory_size_in_M(): - try: - df_infos = os.popen(SHM_QUERY_CMD).readlines() - except: - return None - else: - shm_infos = [] - for df_info in df_infos: - info = df_info.strip() - if info.find(SHM_KEY) >= 0: - shm_infos.append(info.split()) - - if len(shm_infos) == 0: - return None - elif len(shm_infos) == 1: - return _parse_size_in_M(shm_infos[0][3]) - else: - default_mount_infos = [ - si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT - ] - if default_mount_infos: - return _parse_size_in_M(default_mount_infos[0][3]) - else: - return max([_parse_size_in_M(si[3]) for si in shm_infos]) diff --git a/pdfdet/models/Paddle/ppdet/data/source/__init__.py b/pdfdet/models/Paddle/ppdet/data/source/__init__.py deleted file mode 100644 index 2821ff5..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import coco -# from . import voc -# from . import widerface -# from . import category -# from . import keypoint_coco -# from . import mot -# from . import sniper_coco -# from . 
import culane - -from .coco import * -# from .voc import * -# from .widerface import * -# from .category import * -# from .keypoint_coco import * -# from .mot import * -# from .sniper_coco import SniperCOCODataSet -# from .dataset import ImageFolder -# from .pose3d_cmb import * -# from .culane import * diff --git a/pdfdet/models/Paddle/ppdet/data/source/category.py b/pdfdet/models/Paddle/ppdet/data/source/category.py deleted file mode 100644 index 8ed1f9e..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/category.py +++ /dev/null @@ -1,942 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -from ppdet.data.source.voc import pascalvoc_label -from ppdet.data.source.widerface import widerface_label -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['get_categories'] - - -def get_categories(metric_type, anno_file=None, arch=None): - """ - Get class id to category id map and category id - to category name map from annotation file. - - Args: - metric_type (str): metric type, currently support 'coco', 'voc', 'oid' - and 'widerface'. - anno_file (str): annotation file path - """ - if arch == 'keypoint_arch': - return (None, {'id': 'keypoint'}) - - if anno_file == None or (not os.path.isfile(anno_file)): - logger.warning( - "anno_file '{}' is None or not set or not exist, " - "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, " - "otherwise the default categories will be used by metric_type.". - format(anno_file)) - - if metric_type.lower() == 'coco' or metric_type.lower( - ) == 'rbox' or metric_type.lower() == 'snipercoco': - if anno_file and os.path.isfile(anno_file): - if anno_file.endswith('json'): - # lazy import pycocotools here - from pycocotools.coco import COCO - coco = COCO(anno_file) - cats = coco.loadCats(coco.getCatIds()) - - clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} - catid2name = {cat['id']: cat['name'] for cat in cats} - - elif anno_file.endswith('txt'): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - if cats[0] == 'background': cats = cats[1:] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - else: - raise ValueError("anno_file {} should be json or txt.".format( - anno_file)) - return clsid2catid, catid2name - - # anno file not exist, load default categories of COCO17 - else: - if metric_type.lower() == 'rbox': - logger.warning( - "metric_type: {}, load default categories of DOTA.".format( - metric_type)) - return _dota_category() - logger.warning("metric_type: {}, load default categories of COCO.". 
- format(metric_type)) - return _coco17_category() - - elif metric_type.lower() == 'voc': - if anno_file and os.path.isfile(anno_file): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - - if cats[0] == 'background': - cats = cats[1:] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - # anno file not exist, load default categories of - # VOC all 20 categories - else: - logger.warning("metric_type: {}, load default categories of VOC.". - format(metric_type)) - return _vocall_category() - - elif metric_type.lower() == 'oid': - if anno_file and os.path.isfile(anno_file): - logger.warning("only default categories support for OID19") - return _oid19_category() - - elif metric_type.lower() == 'widerface': - return _widerface_category() - - elif metric_type.lower() in [ - 'keypointtopdowncocoeval', 'keypointtopdownmpiieval', - 'keypointtopdowncocowholebadyhandeval' - ]: - return (None, {'id': 'keypoint'}) - - elif metric_type.lower() == 'pose3deval': - return (None, {'id': 'pose3d'}) - - elif metric_type.lower() in ['mot', 'motdet', 'reid']: - if anno_file and os.path.isfile(anno_file): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - if cats[0] == 'background': - cats = cats[1:] - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - return clsid2catid, catid2name - # anno file not exist, load default category 'pedestrian'. - else: - logger.warning( - "metric_type: {}, load default categories of pedestrian MOT.". - format(metric_type)) - return _mot_category(category='pedestrian') - - elif metric_type.lower() in ['kitti', 'bdd100kmot']: - return _mot_category(category='vehicle') - - elif metric_type.lower() in ['mcmot']: - if anno_file and os.path.isfile(anno_file): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - if cats[0] == 'background': - cats = cats[1:] - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - return clsid2catid, catid2name - # anno file not exist, load default categories of visdrone all 10 categories - else: - logger.warning( - "metric_type: {}, load default categories of VisDrone.".format( - metric_type)) - return _visdrone_category() - - else: - raise ValueError("unknown metric type {}".format(metric_type)) - - -def _mot_category(category='pedestrian'): - """ - Get class id to category id map and category id - to category name map of mot dataset - """ - label_map = {category: 0} - label_map = sorted(label_map.items(), key=lambda x: x[1]) - cats = [l[0] for l in label_map] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - -def _coco17_category(): - """ - Get class id to category id map and category id - to category name map of COCO2017 dataset - - """ - clsid2catid = { - 1: 1, - 2: 2, - 3: 3, - 4: 4, - 5: 5, - 6: 6, - 7: 7, - 8: 8, - 9: 9, - 10: 10, - 11: 11, - 12: 13, - 13: 14, - 14: 15, - 15: 16, - 16: 17, - 17: 18, - 18: 19, - 19: 20, - 20: 21, - 21: 22, - 22: 23, - 23: 24, - 24: 25, - 25: 27, - 26: 28, - 27: 31, - 28: 32, - 29: 33, - 30: 34, - 31: 35, - 32: 36, - 33: 37, - 34: 38, - 35: 39, - 36: 40, - 37: 41, - 38: 42, - 39: 43, - 40: 44, - 41: 46, - 42: 47, - 43: 48, - 44: 49, - 45: 50, - 46: 51, - 47: 52, - 48: 53, - 49: 54, - 50: 
55, - 51: 56, - 52: 57, - 53: 58, - 54: 59, - 55: 60, - 56: 61, - 57: 62, - 58: 63, - 59: 64, - 60: 65, - 61: 67, - 62: 70, - 63: 72, - 64: 73, - 65: 74, - 66: 75, - 67: 76, - 68: 77, - 69: 78, - 70: 79, - 71: 80, - 72: 81, - 73: 82, - 74: 84, - 75: 85, - 76: 86, - 77: 87, - 78: 88, - 79: 89, - 80: 90 - } - - catid2name = { - 0: 'background', - 1: 'person', - 2: 'bicycle', - 3: 'car', - 4: 'motorcycle', - 5: 'airplane', - 6: 'bus', - 7: 'train', - 8: 'truck', - 9: 'boat', - 10: 'traffic light', - 11: 'fire hydrant', - 13: 'stop sign', - 14: 'parking meter', - 15: 'bench', - 16: 'bird', - 17: 'cat', - 18: 'dog', - 19: 'horse', - 20: 'sheep', - 21: 'cow', - 22: 'elephant', - 23: 'bear', - 24: 'zebra', - 25: 'giraffe', - 27: 'backpack', - 28: 'umbrella', - 31: 'handbag', - 32: 'tie', - 33: 'suitcase', - 34: 'frisbee', - 35: 'skis', - 36: 'snowboard', - 37: 'sports ball', - 38: 'kite', - 39: 'baseball bat', - 40: 'baseball glove', - 41: 'skateboard', - 42: 'surfboard', - 43: 'tennis racket', - 44: 'bottle', - 46: 'wine glass', - 47: 'cup', - 48: 'fork', - 49: 'knife', - 50: 'spoon', - 51: 'bowl', - 52: 'banana', - 53: 'apple', - 54: 'sandwich', - 55: 'orange', - 56: 'broccoli', - 57: 'carrot', - 58: 'hot dog', - 59: 'pizza', - 60: 'donut', - 61: 'cake', - 62: 'chair', - 63: 'couch', - 64: 'potted plant', - 65: 'bed', - 67: 'dining table', - 70: 'toilet', - 72: 'tv', - 73: 'laptop', - 74: 'mouse', - 75: 'remote', - 76: 'keyboard', - 77: 'cell phone', - 78: 'microwave', - 79: 'oven', - 80: 'toaster', - 81: 'sink', - 82: 'refrigerator', - 84: 'book', - 85: 'clock', - 86: 'vase', - 87: 'scissors', - 88: 'teddy bear', - 89: 'hair drier', - 90: 'toothbrush' - } - - clsid2catid = {k - 1: v for k, v in clsid2catid.items()} - catid2name.pop(0) - - return clsid2catid, catid2name - - -def _dota_category(): - """ - Get class id to category id map and category id - to category name map of dota dataset - """ - catid2name = { - 0: 'background', - 1: 'plane', - 2: 'baseball-diamond', - 3: 'bridge', - 4: 'ground-track-field', - 5: 'small-vehicle', - 6: 'large-vehicle', - 7: 'ship', - 8: 'tennis-court', - 9: 'basketball-court', - 10: 'storage-tank', - 11: 'soccer-ball-field', - 12: 'roundabout', - 13: 'harbor', - 14: 'swimming-pool', - 15: 'helicopter' - } - catid2name.pop(0) - clsid2catid = {i: i + 1 for i in range(len(catid2name))} - return clsid2catid, catid2name - - -def _vocall_category(): - """ - Get class id to category id map and category id - to category name map of mixup voc dataset - - """ - label_map = pascalvoc_label() - label_map = sorted(label_map.items(), key=lambda x: x[1]) - cats = [l[0] for l in label_map] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - -def _widerface_category(): - label_map = widerface_label() - label_map = sorted(label_map.items(), key=lambda x: x[1]) - cats = [l[0] for l in label_map] - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - -def _oid19_category(): - clsid2catid = {k: k + 1 for k in range(500)} - - catid2name = { - 0: "background", - 1: "Infant bed", - 2: "Rose", - 3: "Flag", - 4: "Flashlight", - 5: "Sea turtle", - 6: "Camera", - 7: "Animal", - 8: "Glove", - 9: "Crocodile", - 10: "Cattle", - 11: "House", - 12: "Guacamole", - 13: "Penguin", - 14: "Vehicle registration plate", - 15: "Bench", - 16: "Ladybug", - 17: "Human nose", - 18: "Watermelon", - 19: "Flute", - 20: 
"Butterfly", - 21: "Washing machine", - 22: "Raccoon", - 23: "Segway", - 24: "Taco", - 25: "Jellyfish", - 26: "Cake", - 27: "Pen", - 28: "Cannon", - 29: "Bread", - 30: "Tree", - 31: "Shellfish", - 32: "Bed", - 33: "Hamster", - 34: "Hat", - 35: "Toaster", - 36: "Sombrero", - 37: "Tiara", - 38: "Bowl", - 39: "Dragonfly", - 40: "Moths and butterflies", - 41: "Antelope", - 42: "Vegetable", - 43: "Torch", - 44: "Building", - 45: "Power plugs and sockets", - 46: "Blender", - 47: "Billiard table", - 48: "Cutting board", - 49: "Bronze sculpture", - 50: "Turtle", - 51: "Broccoli", - 52: "Tiger", - 53: "Mirror", - 54: "Bear", - 55: "Zucchini", - 56: "Dress", - 57: "Volleyball", - 58: "Guitar", - 59: "Reptile", - 60: "Golf cart", - 61: "Tart", - 62: "Fedora", - 63: "Carnivore", - 64: "Car", - 65: "Lighthouse", - 66: "Coffeemaker", - 67: "Food processor", - 68: "Truck", - 69: "Bookcase", - 70: "Surfboard", - 71: "Footwear", - 72: "Bench", - 73: "Necklace", - 74: "Flower", - 75: "Radish", - 76: "Marine mammal", - 77: "Frying pan", - 78: "Tap", - 79: "Peach", - 80: "Knife", - 81: "Handbag", - 82: "Laptop", - 83: "Tent", - 84: "Ambulance", - 85: "Christmas tree", - 86: "Eagle", - 87: "Limousine", - 88: "Kitchen & dining room table", - 89: "Polar bear", - 90: "Tower", - 91: "Football", - 92: "Willow", - 93: "Human head", - 94: "Stop sign", - 95: "Banana", - 96: "Mixer", - 97: "Binoculars", - 98: "Dessert", - 99: "Bee", - 100: "Chair", - 101: "Wood-burning stove", - 102: "Flowerpot", - 103: "Beaker", - 104: "Oyster", - 105: "Woodpecker", - 106: "Harp", - 107: "Bathtub", - 108: "Wall clock", - 109: "Sports uniform", - 110: "Rhinoceros", - 111: "Beehive", - 112: "Cupboard", - 113: "Chicken", - 114: "Man", - 115: "Blue jay", - 116: "Cucumber", - 117: "Balloon", - 118: "Kite", - 119: "Fireplace", - 120: "Lantern", - 121: "Missile", - 122: "Book", - 123: "Spoon", - 124: "Grapefruit", - 125: "Squirrel", - 126: "Orange", - 127: "Coat", - 128: "Punching bag", - 129: "Zebra", - 130: "Billboard", - 131: "Bicycle", - 132: "Door handle", - 133: "Mechanical fan", - 134: "Ring binder", - 135: "Table", - 136: "Parrot", - 137: "Sock", - 138: "Vase", - 139: "Weapon", - 140: "Shotgun", - 141: "Glasses", - 142: "Seahorse", - 143: "Belt", - 144: "Watercraft", - 145: "Window", - 146: "Giraffe", - 147: "Lion", - 148: "Tire", - 149: "Vehicle", - 150: "Canoe", - 151: "Tie", - 152: "Shelf", - 153: "Picture frame", - 154: "Printer", - 155: "Human leg", - 156: "Boat", - 157: "Slow cooker", - 158: "Croissant", - 159: "Candle", - 160: "Pancake", - 161: "Pillow", - 162: "Coin", - 163: "Stretcher", - 164: "Sandal", - 165: "Woman", - 166: "Stairs", - 167: "Harpsichord", - 168: "Stool", - 169: "Bus", - 170: "Suitcase", - 171: "Human mouth", - 172: "Juice", - 173: "Skull", - 174: "Door", - 175: "Violin", - 176: "Chopsticks", - 177: "Digital clock", - 178: "Sunflower", - 179: "Leopard", - 180: "Bell pepper", - 181: "Harbor seal", - 182: "Snake", - 183: "Sewing machine", - 184: "Goose", - 185: "Helicopter", - 186: "Seat belt", - 187: "Coffee cup", - 188: "Microwave oven", - 189: "Hot dog", - 190: "Countertop", - 191: "Serving tray", - 192: "Dog bed", - 193: "Beer", - 194: "Sunglasses", - 195: "Golf ball", - 196: "Waffle", - 197: "Palm tree", - 198: "Trumpet", - 199: "Ruler", - 200: "Helmet", - 201: "Ladder", - 202: "Office building", - 203: "Tablet computer", - 204: "Toilet paper", - 205: "Pomegranate", - 206: "Skirt", - 207: "Gas stove", - 208: "Cookie", - 209: "Cart", - 210: "Raven", - 211: "Egg", - 212: "Burrito", - 213: "Goat", - 214: 
"Kitchen knife", - 215: "Skateboard", - 216: "Salt and pepper shakers", - 217: "Lynx", - 218: "Boot", - 219: "Platter", - 220: "Ski", - 221: "Swimwear", - 222: "Swimming pool", - 223: "Drinking straw", - 224: "Wrench", - 225: "Drum", - 226: "Ant", - 227: "Human ear", - 228: "Headphones", - 229: "Fountain", - 230: "Bird", - 231: "Jeans", - 232: "Television", - 233: "Crab", - 234: "Microphone", - 235: "Home appliance", - 236: "Snowplow", - 237: "Beetle", - 238: "Artichoke", - 239: "Jet ski", - 240: "Stationary bicycle", - 241: "Human hair", - 242: "Brown bear", - 243: "Starfish", - 244: "Fork", - 245: "Lobster", - 246: "Corded phone", - 247: "Drink", - 248: "Saucer", - 249: "Carrot", - 250: "Insect", - 251: "Clock", - 252: "Castle", - 253: "Tennis racket", - 254: "Ceiling fan", - 255: "Asparagus", - 256: "Jaguar", - 257: "Musical instrument", - 258: "Train", - 259: "Cat", - 260: "Rifle", - 261: "Dumbbell", - 262: "Mobile phone", - 263: "Taxi", - 264: "Shower", - 265: "Pitcher", - 266: "Lemon", - 267: "Invertebrate", - 268: "Turkey", - 269: "High heels", - 270: "Bust", - 271: "Elephant", - 272: "Scarf", - 273: "Barrel", - 274: "Trombone", - 275: "Pumpkin", - 276: "Box", - 277: "Tomato", - 278: "Frog", - 279: "Bidet", - 280: "Human face", - 281: "Houseplant", - 282: "Van", - 283: "Shark", - 284: "Ice cream", - 285: "Swim cap", - 286: "Falcon", - 287: "Ostrich", - 288: "Handgun", - 289: "Whiteboard", - 290: "Lizard", - 291: "Pasta", - 292: "Snowmobile", - 293: "Light bulb", - 294: "Window blind", - 295: "Muffin", - 296: "Pretzel", - 297: "Computer monitor", - 298: "Horn", - 299: "Furniture", - 300: "Sandwich", - 301: "Fox", - 302: "Convenience store", - 303: "Fish", - 304: "Fruit", - 305: "Earrings", - 306: "Curtain", - 307: "Grape", - 308: "Sofa bed", - 309: "Horse", - 310: "Luggage and bags", - 311: "Desk", - 312: "Crutch", - 313: "Bicycle helmet", - 314: "Tick", - 315: "Airplane", - 316: "Canary", - 317: "Spatula", - 318: "Watch", - 319: "Lily", - 320: "Kitchen appliance", - 321: "Filing cabinet", - 322: "Aircraft", - 323: "Cake stand", - 324: "Candy", - 325: "Sink", - 326: "Mouse", - 327: "Wine", - 328: "Wheelchair", - 329: "Goldfish", - 330: "Refrigerator", - 331: "French fries", - 332: "Drawer", - 333: "Treadmill", - 334: "Picnic basket", - 335: "Dice", - 336: "Cabbage", - 337: "Football helmet", - 338: "Pig", - 339: "Person", - 340: "Shorts", - 341: "Gondola", - 342: "Honeycomb", - 343: "Doughnut", - 344: "Chest of drawers", - 345: "Land vehicle", - 346: "Bat", - 347: "Monkey", - 348: "Dagger", - 349: "Tableware", - 350: "Human foot", - 351: "Mug", - 352: "Alarm clock", - 353: "Pressure cooker", - 354: "Human hand", - 355: "Tortoise", - 356: "Baseball glove", - 357: "Sword", - 358: "Pear", - 359: "Miniskirt", - 360: "Traffic sign", - 361: "Girl", - 362: "Roller skates", - 363: "Dinosaur", - 364: "Porch", - 365: "Human beard", - 366: "Submarine sandwich", - 367: "Screwdriver", - 368: "Strawberry", - 369: "Wine glass", - 370: "Seafood", - 371: "Racket", - 372: "Wheel", - 373: "Sea lion", - 374: "Toy", - 375: "Tea", - 376: "Tennis ball", - 377: "Waste container", - 378: "Mule", - 379: "Cricket ball", - 380: "Pineapple", - 381: "Coconut", - 382: "Doll", - 383: "Coffee table", - 384: "Snowman", - 385: "Lavender", - 386: "Shrimp", - 387: "Maple", - 388: "Cowboy hat", - 389: "Goggles", - 390: "Rugby ball", - 391: "Caterpillar", - 392: "Poster", - 393: "Rocket", - 394: "Organ", - 395: "Saxophone", - 396: "Traffic light", - 397: "Cocktail", - 398: "Plastic bag", - 399: "Squash", - 400: 
"Mushroom", - 401: "Hamburger", - 402: "Light switch", - 403: "Parachute", - 404: "Teddy bear", - 405: "Winter melon", - 406: "Deer", - 407: "Musical keyboard", - 408: "Plumbing fixture", - 409: "Scoreboard", - 410: "Baseball bat", - 411: "Envelope", - 412: "Adhesive tape", - 413: "Briefcase", - 414: "Paddle", - 415: "Bow and arrow", - 416: "Telephone", - 417: "Sheep", - 418: "Jacket", - 419: "Boy", - 420: "Pizza", - 421: "Otter", - 422: "Office supplies", - 423: "Couch", - 424: "Cello", - 425: "Bull", - 426: "Camel", - 427: "Ball", - 428: "Duck", - 429: "Whale", - 430: "Shirt", - 431: "Tank", - 432: "Motorcycle", - 433: "Accordion", - 434: "Owl", - 435: "Porcupine", - 436: "Sun hat", - 437: "Nail", - 438: "Scissors", - 439: "Swan", - 440: "Lamp", - 441: "Crown", - 442: "Piano", - 443: "Sculpture", - 444: "Cheetah", - 445: "Oboe", - 446: "Tin can", - 447: "Mango", - 448: "Tripod", - 449: "Oven", - 450: "Mouse", - 451: "Barge", - 452: "Coffee", - 453: "Snowboard", - 454: "Common fig", - 455: "Salad", - 456: "Marine invertebrates", - 457: "Umbrella", - 458: "Kangaroo", - 459: "Human arm", - 460: "Measuring cup", - 461: "Snail", - 462: "Loveseat", - 463: "Suit", - 464: "Teapot", - 465: "Bottle", - 466: "Alpaca", - 467: "Kettle", - 468: "Trousers", - 469: "Popcorn", - 470: "Centipede", - 471: "Spider", - 472: "Sparrow", - 473: "Plate", - 474: "Bagel", - 475: "Personal care", - 476: "Apple", - 477: "Brassiere", - 478: "Bathroom cabinet", - 479: "studio couch", - 480: "Computer keyboard", - 481: "Table tennis racket", - 482: "Sushi", - 483: "Cabinetry", - 484: "Street light", - 485: "Towel", - 486: "Nightstand", - 487: "Rabbit", - 488: "Dolphin", - 489: "Dog", - 490: "Jug", - 491: "Wok", - 492: "Fire hydrant", - 493: "Human eye", - 494: "Skyscraper", - 495: "Backpack", - 496: "Potato", - 497: "Paper towel", - 498: "Lifejacket", - 499: "Bicycle wheel", - 500: "Toilet", - } - - return clsid2catid, catid2name - - -def _visdrone_category(): - clsid2catid = {i: i for i in range(10)} - - catid2name = { - 0: 'pedestrian', - 1: 'people', - 2: 'bicycle', - 3: 'car', - 4: 'van', - 5: 'truck', - 6: 'tricycle', - 7: 'awning-tricycle', - 8: 'bus', - 9: 'motor' - } - return clsid2catid, catid2name diff --git a/pdfdet/models/Paddle/ppdet/data/source/coco.py b/pdfdet/models/Paddle/ppdet/data/source/coco.py deleted file mode 100644 index 4120327..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/coco.py +++ /dev/null @@ -1,596 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import copy -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -import numpy as np -from ppdet.core.workspace import register, serializable -from .dataset import DetDataset - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset' -] - - -@register -@serializable -class COCODataSet(DetDataset): - """ - Load dataset with COCO format. - - Args: - dataset_dir (str): root directory for dataset. - image_dir (str): directory for images. - anno_path (str): coco annotation file path. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - load_crowd (bool): whether to load crowded ground-truth. - False as default - allow_empty (bool): whether to load empty entry. False as default - empty_ratio (float): the ratio of empty record number to total - record's, if empty_ratio is out of [0. ,1.), do not sample the - records and use all the empty entries. 1. as default - repeat (int): repeat times for dataset, use in benchmark. - """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=False, - empty_ratio=1., - repeat=1): - super(COCODataSet, self).__init__( - dataset_dir, - image_dir, - anno_path, - data_fields, - sample_num, - repeat=repeat) - self.load_image_only = False - self.load_semantic = False - self.load_crowd = load_crowd - self.allow_empty = allow_empty - self.empty_ratio = empty_ratio - - def _sample_empty(self, records, num): - # if empty_ratio is out of [0. ,1.), do not sample the records - if self.empty_ratio < 0. 
or self.empty_ratio >= 1.: - return records - import random - sample_num = min( - int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) - records = random.sample(records, sample_num) - return records - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - assert anno_path.endswith('.json'), \ - 'invalid coco annotation file: ' + anno_path - from pycocotools.coco import COCO - coco = COCO(anno_path) - img_ids = coco.getImgIds() - img_ids.sort() - cat_ids = coco.getCatIds() - records = [] - empty_records = [] - ct = 0 - - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - self.cname2cid = dict({ - coco.loadCats(catid)[0]['name']: clsid - for catid, clsid in self.catid2clsid.items() - }) - - if 'annotations' not in coco.dataset: - self.load_image_only = True - logger.warning('Annotation file: {} does not contains ground truth ' - 'and load image information only.'.format(anno_path)) - - for img_id in img_ids: - img_anno = coco.loadImgs([img_id])[0] - im_fname = img_anno['file_name'] - im_w = float(img_anno['width']) - im_h = float(img_anno['height']) - - im_path = os.path.join(image_dir, - im_fname) if image_dir else im_fname - is_empty = False - if not os.path.exists(im_path): - logger.warning('Illegal image file: {}, and it will be ' - 'ignored'.format(im_path)) - continue - - if im_w < 0 or im_h < 0: - logger.warning('Illegal width: {} or height: {} in annotation, ' - 'and im_id: {} will be ignored'.format( - im_w, im_h, img_id)) - continue - - coco_rec = { - 'im_file': im_path, - 'im_id': np.array([img_id]), - 'h': im_h, - 'w': im_w, - } if 'image' in self.data_fields else {} - - if not self.load_image_only: - ins_anno_ids = coco.getAnnIds( - imgIds=[img_id], iscrowd=None if self.load_crowd else False) - instances = coco.loadAnns(ins_anno_ids) - - bboxes = [] - is_rbox_anno = False - for inst in instances: - # check gt bbox - if inst.get('ignore', False): - continue - if 'bbox' not in inst.keys(): - continue - else: - if not any(np.array(inst['bbox'])): - continue - - x1, y1, box_w, box_h = inst['bbox'] - x2 = x1 + box_w - y2 = y1 + box_h - eps = 1e-5 - if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: - inst['clean_bbox'] = [ - round(float(x), 3) for x in [x1, y1, x2, y2] - ] - bboxes.append(inst) - else: - logger.warning( - 'Found an invalid bbox in annotations: im_id: {}, ' - 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( - img_id, float(inst['area']), x1, y1, x2, y2)) - - num_bbox = len(bboxes) - if num_bbox <= 0 and not self.allow_empty: - continue - elif num_bbox <= 0: - is_empty = True - - gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - gt_class = np.zeros((num_bbox, 1), dtype=np.int32) - is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) - gt_poly = [None] * num_bbox - gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) - - has_segmentation = False - has_track_id = False - for i, box in enumerate(bboxes): - catid = box['category_id'] - gt_class[i][0] = self.catid2clsid[catid] - gt_bbox[i, :] = box['clean_bbox'] - is_crowd[i][0] = box['iscrowd'] - # check RLE format - if 'segmentation' in box and box['iscrowd'] == 1: - gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] - elif 'segmentation' in box and box['segmentation']: - if not np.array( - box['segmentation'], - dtype=object).size > 0 and not self.allow_empty: - bboxes.pop(i) - gt_poly.pop(i) - np.delete(is_crowd, i) - np.delete(gt_class, i) - np.delete(gt_bbox, i) - else: 
- gt_poly[i] = box['segmentation'] - has_segmentation = True - - if 'track_id' in box: - gt_track_id[i][0] = box['track_id'] - has_track_id = True - - if has_segmentation and not any( - gt_poly) and not self.allow_empty: - continue - - gt_rec = { - 'is_crowd': is_crowd, - 'gt_class': gt_class, - 'gt_bbox': gt_bbox, - 'gt_poly': gt_poly, - } - if has_track_id: - gt_rec.update({'gt_track_id': gt_track_id}) - - for k, v in gt_rec.items(): - if k in self.data_fields: - coco_rec[k] = v - - # TODO: remove load_semantic - if self.load_semantic and 'semantic' in self.data_fields: - seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', - 'train2017', im_fname[:-3] + 'png') - coco_rec.update({'semantic': seg_path}) - - logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( - im_path, img_id, im_h, im_w)) - if is_empty: - empty_records.append(coco_rec) - else: - records.append(coco_rec) - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any coco record in %s' % (anno_path) - logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. - format(ct, len(img_ids) - ct, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs = records - - -@register -@serializable -class SlicedCOCODataSet(COCODataSet): - """Sliced COCODataSet""" - - def __init__( - self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=False, - empty_ratio=1., - repeat=1, - sliced_size=[640, 640], - overlap_ratio=[0.25, 0.25], ): - super(SlicedCOCODataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - load_crowd=load_crowd, - allow_empty=allow_empty, - empty_ratio=empty_ratio, - repeat=repeat, ) - self.sliced_size = sliced_size - self.overlap_ratio = overlap_ratio - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - assert anno_path.endswith('.json'), \ - 'invalid coco annotation file: ' + anno_path - from pycocotools.coco import COCO - coco = COCO(anno_path) - img_ids = coco.getImgIds() - img_ids.sort() - cat_ids = coco.getCatIds() - records = [] - empty_records = [] - ct = 0 - ct_sub = 0 - - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - self.cname2cid = dict({ - coco.loadCats(catid)[0]['name']: clsid - for catid, clsid in self.catid2clsid.items() - }) - - if 'annotations' not in coco.dataset: - self.load_image_only = True - logger.warning('Annotation file: {} does not contains ground truth ' - 'and load image information only.'.format(anno_path)) - try: - import sahi - from sahi.slicing import slice_image - except Exception as e: - logger.error( - 'sahi not found, plaese install sahi. ' - 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
- ) - raise e - - sub_img_ids = 0 - for img_id in img_ids: - img_anno = coco.loadImgs([img_id])[0] - im_fname = img_anno['file_name'] - im_w = float(img_anno['width']) - im_h = float(img_anno['height']) - - im_path = os.path.join(image_dir, - im_fname) if image_dir else im_fname - is_empty = False - if not os.path.exists(im_path): - logger.warning('Illegal image file: {}, and it will be ' - 'ignored'.format(im_path)) - continue - - if im_w < 0 or im_h < 0: - logger.warning('Illegal width: {} or height: {} in annotation, ' - 'and im_id: {} will be ignored'.format( - im_w, im_h, img_id)) - continue - - slice_image_result = sahi.slicing.slice_image( - image=im_path, - slice_height=self.sliced_size[0], - slice_width=self.sliced_size[1], - overlap_height_ratio=self.overlap_ratio[0], - overlap_width_ratio=self.overlap_ratio[1]) - - sub_img_num = len(slice_image_result) - for _ind in range(sub_img_num): - im = slice_image_result.images[_ind] - coco_rec = { - 'image': im, - 'im_id': np.array([sub_img_ids + _ind]), - 'h': im.shape[0], - 'w': im.shape[1], - 'ori_im_id': np.array([img_id]), - 'st_pix': np.array( - slice_image_result.starting_pixels[_ind], - dtype=np.float32), - 'is_last': 1 if _ind == sub_img_num - 1 else 0, - } if 'image' in self.data_fields else {} - records.append(coco_rec) - ct_sub += sub_img_num - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any coco record in %s' % (anno_path) - logger.info('{} samples and slice to {} sub_samples in file {}'.format( - ct, ct_sub, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs = records - - -@register -@serializable -class SemiCOCODataSet(COCODataSet): - """Semi-COCODataSet used for supervised and unsupervised dataSet""" - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=False, - empty_ratio=1., - repeat=1, - supervised=True): - super(SemiCOCODataSet, self).__init__( - dataset_dir, image_dir, anno_path, data_fields, sample_num, - load_crowd, allow_empty, empty_ratio, repeat) - self.supervised = supervised - self.length = -1 # defalut -1 means all - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - assert anno_path.endswith('.json'), \ - 'invalid coco annotation file: ' + anno_path - from pycocotools.coco import COCO - coco = COCO(anno_path) - img_ids = coco.getImgIds() - img_ids.sort() - cat_ids = coco.getCatIds() - records = [] - empty_records = [] - ct = 0 - - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - self.cname2cid = dict({ - coco.loadCats(catid)[0]['name']: clsid - for catid, clsid in self.catid2clsid.items() - }) - - if 'annotations' not in coco.dataset or self.supervised == False: - self.load_image_only = True - logger.warning('Annotation file: {} does not contains ground truth ' - 'and load image information only.'.format(anno_path)) - - for img_id in img_ids: - img_anno = coco.loadImgs([img_id])[0] - im_fname = img_anno['file_name'] - im_w = float(img_anno['width']) - im_h = float(img_anno['height']) - - im_path = os.path.join(image_dir, - im_fname) if image_dir else im_fname - is_empty = False - if not os.path.exists(im_path): - logger.warning('Illegal image file: {}, and it will be ' - 'ignored'.format(im_path)) - continue - - if 
im_w < 0 or im_h < 0: - logger.warning('Illegal width: {} or height: {} in annotation, ' - 'and im_id: {} will be ignored'.format( - im_w, im_h, img_id)) - continue - - coco_rec = { - 'im_file': im_path, - 'im_id': np.array([img_id]), - 'h': im_h, - 'w': im_w, - } if 'image' in self.data_fields else {} - - if not self.load_image_only: - ins_anno_ids = coco.getAnnIds( - imgIds=[img_id], iscrowd=None if self.load_crowd else False) - instances = coco.loadAnns(ins_anno_ids) - - bboxes = [] - is_rbox_anno = False - for inst in instances: - # check gt bbox - if inst.get('ignore', False): - continue - if 'bbox' not in inst.keys(): - continue - else: - if not any(np.array(inst['bbox'])): - continue - - x1, y1, box_w, box_h = inst['bbox'] - x2 = x1 + box_w - y2 = y1 + box_h - eps = 1e-5 - if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: - inst['clean_bbox'] = [ - round(float(x), 3) for x in [x1, y1, x2, y2] - ] - bboxes.append(inst) - else: - logger.warning( - 'Found an invalid bbox in annotations: im_id: {}, ' - 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( - img_id, float(inst['area']), x1, y1, x2, y2)) - - num_bbox = len(bboxes) - if num_bbox <= 0 and not self.allow_empty: - continue - elif num_bbox <= 0: - is_empty = True - - gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - gt_class = np.zeros((num_bbox, 1), dtype=np.int32) - is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) - gt_poly = [None] * num_bbox - - has_segmentation = False - for i, box in enumerate(bboxes): - catid = box['category_id'] - gt_class[i][0] = self.catid2clsid[catid] - gt_bbox[i, :] = box['clean_bbox'] - is_crowd[i][0] = box['iscrowd'] - # check RLE format - if 'segmentation' in box and box['iscrowd'] == 1: - gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] - elif 'segmentation' in box and box['segmentation']: - if not np.array(box['segmentation'] - ).size > 0 and not self.allow_empty: - bboxes.pop(i) - gt_poly.pop(i) - np.delete(is_crowd, i) - np.delete(gt_class, i) - np.delete(gt_bbox, i) - else: - gt_poly[i] = box['segmentation'] - has_segmentation = True - - if has_segmentation and not any( - gt_poly) and not self.allow_empty: - continue - - gt_rec = { - 'is_crowd': is_crowd, - 'gt_class': gt_class, - 'gt_bbox': gt_bbox, - 'gt_poly': gt_poly, - } - - for k, v in gt_rec.items(): - if k in self.data_fields: - coco_rec[k] = v - - # TODO: remove load_semantic - if self.load_semantic and 'semantic' in self.data_fields: - seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', - 'train2017', im_fname[:-3] + 'png') - coco_rec.update({'semantic': seg_path}) - - logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( - im_path, img_id, im_h, im_w)) - if is_empty: - empty_records.append(coco_rec) - else: - records.append(coco_rec) - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any coco record in %s' % (anno_path) - logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
- format(ct, len(img_ids) - ct, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs = records - - if self.supervised: - logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED') - else: - if self.length > 0: # unsup length will be decide by sup length - all_roidbs = self.roidbs.copy() - selected_idxs = [ - np.random.choice(len(all_roidbs)) - for _ in range(self.length) - ] - self.roidbs = [all_roidbs[i] for i in selected_idxs] - logger.info( - f'Use {len(self.roidbs)} unsup_samples data as UNLABELED') - - def __getitem__(self, idx): - n = len(self.roidbs) - if self.repeat > 1: - idx %= n - # data batch - roidb = copy.deepcopy(self.roidbs[idx]) - if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: - roidb = [roidb, ] + [ - copy.deepcopy(self.roidbs[np.random.randint(n)]) - for _ in range(4) - ] - if isinstance(roidb, Sequence): - for r in roidb: - r['curr_iter'] = self._curr_iter - else: - roidb['curr_iter'] = self._curr_iter - self._curr_iter += 1 - - return self.transform(roidb) - - -# for PaddleX -@register -@serializable -class COCODetDataset(COCODataSet): - pass diff --git a/pdfdet/models/Paddle/ppdet/data/source/culane.py b/pdfdet/models/Paddle/ppdet/data/source/culane.py deleted file mode 100644 index 977d608..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/culane.py +++ /dev/null @@ -1,206 +0,0 @@ -from ppdet.core.workspace import register, serializable -import cv2 -import os -import tarfile -import numpy as np -import os.path as osp -from ppdet.data.source.dataset import DetDataset -from imgaug.augmentables.lines import LineStringsOnImage -from imgaug.augmentables.segmaps import SegmentationMapsOnImage -from ppdet.data.culane_utils import lane_to_linestrings -import pickle as pkl -from ppdet.utils.logger import setup_logger -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from .dataset import DetDataset, _make_dataset, _is_valid_file -from ppdet.utils.download import download_dataset - -logger = setup_logger(__name__) - - -@register -@serializable -class CULaneDataSet(DetDataset): - def __init__( - self, - dataset_dir, - cut_height, - list_path, - split='train', - data_fields=['image'], - video_file=None, - frame_rate=-1, ): - super(CULaneDataSet, self).__init__( - dataset_dir=dataset_dir, - cut_height=cut_height, - split=split, - data_fields=data_fields) - self.dataset_dir = dataset_dir - self.list_path = osp.join(dataset_dir, list_path) - self.cut_height = cut_height - self.data_fields = data_fields - self.split = split - self.training = 'train' in split - self.data_infos = [] - self.video_file = video_file - self.frame_rate = frame_rate - self._imid2path = {} - self.predict_dir = None - - def __len__(self): - return len(self.data_infos) - - def check_or_download_dataset(self): - if not osp.exists(self.dataset_dir): - download_dataset("dataset", dataset="culane") - # extract .tar files in self.dataset_dir - for fname in os.listdir(self.dataset_dir): - logger.info("Decompressing {}...".format(fname)) - # ignore .* files - if fname.startswith('.'): - continue - if fname.find('.tar.gz') >= 0: - with 
tarfile.open(osp.join(self.dataset_dir, fname)) as tf: - tf.extractall(path=self.dataset_dir) - logger.info("Dataset files are ready.") - - def parse_dataset(self): - logger.info('Loading CULane annotations...') - if self.predict_dir is not None: - logger.info('switch to predict mode') - return - # Waiting for the dataset to load is tedious, let's cache it - os.makedirs('cache', exist_ok=True) - cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split) - if os.path.exists(cache_path): - with open(cache_path, 'rb') as cache_file: - self.data_infos = pkl.load(cache_file) - self.max_lanes = max( - len(anno['lanes']) for anno in self.data_infos) - return - - with open(self.list_path) as list_file: - for line in list_file: - infos = self.load_annotation(line.split()) - self.data_infos.append(infos) - - # cache data infos to file - with open(cache_path, 'wb') as cache_file: - pkl.dump(self.data_infos, cache_file) - - def load_annotation(self, line): - infos = {} - img_line = line[0] - img_line = img_line[1 if img_line[0] == '/' else 0::] - img_path = os.path.join(self.dataset_dir, img_line) - infos['img_name'] = img_line - infos['img_path'] = img_path - if len(line) > 1: - mask_line = line[1] - mask_line = mask_line[1 if mask_line[0] == '/' else 0::] - mask_path = os.path.join(self.dataset_dir, mask_line) - infos['mask_path'] = mask_path - - if len(line) > 2: - exist_list = [int(l) for l in line[2:]] - infos['lane_exist'] = np.array(exist_list) - - anno_path = img_path[: - -3] + 'lines.txt' # remove sufix jpg and add lines.txt - with open(anno_path, 'r') as anno_file: - data = [ - list(map(float, line.split())) for line in anno_file.readlines() - ] - lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2) - if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data] - lanes = [list(set(lane)) for lane in lanes] # remove duplicated points - lanes = [lane for lane in lanes - if len(lane) > 2] # remove lanes with less than 2 points - - lanes = [sorted( - lane, key=lambda x: x[1]) for lane in lanes] # sort by y - infos['lanes'] = lanes - - return infos - - def set_images(self, images): - self.predict_dir = images - self.data_infos = self._load_images() - - def _find_images(self): - predict_dir = self.predict_dir - if not isinstance(predict_dir, Sequence): - predict_dir = [predict_dir] - images = [] - for im_dir in predict_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.predict_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._find_images() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = { - 'im_id': np.array([ct]), - "img_path": os.path.abspath(image), - "img_name": os.path.basename(image), - "lanes": [] - } - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def __getitem__(self, idx): - data_info = self.data_infos[idx] - img = cv2.imread(data_info['img_path']) - img = img[self.cut_height:, :, :] - sample = data_info.copy() - sample.update({'image': img}) - img_org = sample['image'] - - if self.training: - label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED) - if len(label.shape) > 2: - label = label[:, :, 0] - label = label.squeeze() - label 
= label[self.cut_height:, :] - sample.update({'mask': label}) - if self.cut_height != 0: - new_lanes = [] - for i in sample['lanes']: - lanes = [] - for p in i: - lanes.append((p[0], p[1] - self.cut_height)) - new_lanes.append(lanes) - sample.update({'lanes': new_lanes}) - - sample['mask'] = SegmentationMapsOnImage( - sample['mask'], shape=img_org.shape) - - sample['full_img_path'] = data_info['img_path'] - sample['img_name'] = data_info['img_name'] - sample['im_id'] = np.array([idx]) - - sample['image'] = sample['image'].copy().astype(np.uint8) - sample['lanes'] = lane_to_linestrings(sample['lanes']) - sample['lanes'] = LineStringsOnImage( - sample['lanes'], shape=img_org.shape) - sample['seg'] = np.zeros(img_org.shape) - - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/source/dataset.py b/pdfdet/models/Paddle/ppdet/data/source/dataset.py deleted file mode 100644 index 4f22b22..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/dataset.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import copy -import numpy as np -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from paddle.io import Dataset -from ppdet.core.workspace import register, serializable -from ppdet.utils.download import get_dataset_path -from ppdet.data import source - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@serializable -class DetDataset(Dataset): - """ - Load detection dataset. - - Args: - dataset_dir (str): root directory for dataset. - image_dir (str): directory for images. - anno_path (str): annotation file path. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - use_default_label (bool): whether to load default label list. - repeat (int): repeat times for dataset, use in benchmark. 
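[Editor's note] The CULane `parse_dataset` above caches the slow annotation parse with pickle. A minimal standalone sketch of that cache-then-parse pattern; the function and path names here are illustrative, not from the diff:

```python
import os
import pickle as pkl

def load_or_parse(cache_path, parse_fn):
    """Return parsed annotations, reusing a pickle cache when present."""
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pkl.load(f)                    # fast path: reuse cache
    data = parse_fn()                             # slow path: full parse
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    with open(cache_path, 'wb') as f:
        pkl.dump(data, f)                         # cache for the next run
    return data

infos = load_or_parse('cache/culane_train.pkl', lambda: [{'lanes': []}])
```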
- """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - use_default_label=None, - repeat=1, - **kwargs): - super(DetDataset, self).__init__() - self.dataset_dir = dataset_dir if dataset_dir is not None else '' - self.anno_path = anno_path - self.image_dir = image_dir if image_dir is not None else '' - self.data_fields = data_fields - self.sample_num = sample_num - self.use_default_label = use_default_label - self.repeat = repeat - self._epoch = 0 - self._curr_iter = 0 - - def __len__(self, ): - return len(self.roidbs) * self.repeat - - def __call__(self, *args, **kwargs): - return self - - def __getitem__(self, idx): - n = len(self.roidbs) - if self.repeat > 1: - idx %= n - # data batch - roidb = copy.deepcopy(self.roidbs[idx]) - if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: - roidb = [roidb, ] + [ - copy.deepcopy(self.roidbs[np.random.randint(n)]) - for _ in range(4) - ] - elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch: - # Add previous image as input, only used in CenterTrack - idx_pre_img = idx - 1 - if idx_pre_img < 0: - idx_pre_img = idx + 1 - roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])] - if isinstance(roidb, Sequence): - for r in roidb: - r['curr_iter'] = self._curr_iter - else: - roidb['curr_iter'] = self._curr_iter - self._curr_iter += 1 - - return self.transform(roidb) - - def check_or_download_dataset(self): - self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path, - self.image_dir) - - def set_kwargs(self, **kwargs): - self.mixup_epoch = kwargs.get('mixup_epoch', -1) - self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) - self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) - self.pre_img_epoch = kwargs.get('pre_img_epoch', -1) - - def set_transform(self, transform): - self.transform = transform - - def set_epoch(self, epoch_id): - self._epoch = epoch_id - - def parse_dataset(self, ): - raise NotImplementedError( - "Need to implement parse_dataset method of Dataset") - - def get_anno(self): - if self.anno_path is None: - return - return os.path.join(self.dataset_dir, self.anno_path) - - -def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')): - return f.lower().endswith(extensions) - - -def _make_dataset(dir): - dir = os.path.expanduser(dir) - if not os.path.isdir(dir): - raise ('{} should be a dir'.format(dir)) - images = [] - for root, _, fnames in sorted(os.walk(dir, followlinks=True)): - for fname in sorted(fnames): - path = os.path.join(root, fname) - if _is_valid_file(path): - images.append(path) - return images - - -@register -@serializable -class ImageFolder(DetDataset): - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - sample_num=-1, - use_default_label=None, - **kwargs): - super(ImageFolder, self).__init__( - dataset_dir, - image_dir, - anno_path, - sample_num=sample_num, - use_default_label=use_default_label) - self._imid2path = {} - self.roidbs = None - self.sample_num = sample_num - - def check_or_download_dataset(self): - return - - def get_anno(self): - if self.anno_path is None: - return - if self.dataset_dir: - return os.path.join(self.dataset_dir, self.anno_path) - else: - return self.anno_path - - def 
parse_dataset(self, ): - if not self.roidbs: - self.roidbs = self._load_images() - - def _parse(self): - image_dir = self.image_dir - if not isinstance(image_dir, Sequence): - image_dir = [image_dir] - images = [] - for im_dir in image_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.dataset_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._parse() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = {'im_id': np.array([ct]), 'im_file': image} - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def set_images(self, images): - self.image_dir = images - self.roidbs = self._load_images() - - def set_slice_images(self, - images, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25]): - self.image_dir = images - ori_records = self._load_images() - try: - import sahi - from sahi.slicing import slice_image - except Exception as e: - logger.error( - 'sahi not found, plaese install sahi. ' - 'for example: `pip install sahi`, see https://github.com/obss/sahi.' - ) - raise e - - sub_img_ids = 0 - ct = 0 - ct_sub = 0 - records = [] - for i, ori_rec in enumerate(ori_records): - im_path = ori_rec['im_file'] - slice_image_result = sahi.slicing.slice_image( - image=im_path, - slice_height=slice_size[0], - slice_width=slice_size[1], - overlap_height_ratio=overlap_ratio[0], - overlap_width_ratio=overlap_ratio[1]) - - sub_img_num = len(slice_image_result) - for _ind in range(sub_img_num): - im = slice_image_result.images[_ind] - rec = { - 'image': im, - 'im_id': np.array([sub_img_ids + _ind]), - 'h': im.shape[0], - 'w': im.shape[1], - 'ori_im_id': np.array([ori_rec['im_id'][0]]), - 'st_pix': np.array( - slice_image_result.starting_pixels[_ind], - dtype=np.float32), - 'is_last': 1 if _ind == sub_img_num - 1 else 0, - } if 'image' in self.data_fields else {} - records.append(rec) - ct_sub += sub_img_num - ct += 1 - logger.info('{} samples and slice to {} sub_samples.'.format(ct, - ct_sub)) - self.roidbs = records - - def get_label_list(self): - # Only VOC dataset needs label list in ImageFold - return self.anno_path - - -@register -class CommonDataset(object): - def __init__(self, **dataset_args): - super(CommonDataset, self).__init__() - dataset_args = copy.deepcopy(dataset_args) - type = dataset_args.pop("name") - self.dataset = getattr(source, type)(**dataset_args) - - def __call__(self): - return self.dataset - - -@register -class TrainDataset(CommonDataset): - pass - - -@register -class EvalMOTDataset(CommonDataset): - pass - - -@register -class TestMOTDataset(CommonDataset): - pass - - -@register -class EvalDataset(CommonDataset): - pass - - -@register -class TestDataset(CommonDataset): - pass diff --git a/pdfdet/models/Paddle/ppdet/data/source/keypoint_coco.py b/pdfdet/models/Paddle/ppdet/data/source/keypoint_coco.py deleted file mode 100644 index 86d8343..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/keypoint_coco.py +++ /dev/null @@ -1,845 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
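[Editor's note] `set_slice_images` above relies on SAHI for tiled (sliced) inference. A hedged usage sketch that mirrors only the arguments appearing in the deleted code; the input path is illustrative (see https://github.com/obss/sahi):

```python
from sahi.slicing import slice_image

result = slice_image(
    image='page.jpg',             # illustrative input
    slice_height=640,
    slice_width=640,
    overlap_height_ratio=0.25,
    overlap_width_ratio=0.25)

# Each sub-image comes with the top-left pixel of its slice, which is
# what the deleted code stores as 'st_pix' to map detections back.
for patch, origin in zip(result.images, result.starting_pixels):
    print(patch.shape, origin)
```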
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is base on https://github.com/open-mmlab/mmpose -""" -import os -import cv2 -import numpy as np -import json -import copy -import pycocotools -from pycocotools.coco import COCO -from .dataset import DetDataset -from ppdet.core.workspace import register, serializable - - -@serializable -class KeypointBottomUpBaseDataset(DetDataset): - """Base class for bottom-up datasets. - - All datasets should subclass it. - All subclasses should overwrite: - Methods:`_get_imganno` - - Args: - dataset_dir (str): Root path to the dataset. - anno_path (str): Relative path to the annotation file. - image_dir (str): Path to a directory where images are held. - Default: None. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - shard (list): [rank, worldsize], the distributed env params - test_mode (bool): Store True when building test or - validation dataset. Default: False. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[], - shard=[0, 1], - test_mode=False): - super().__init__(dataset_dir, image_dir, anno_path) - self.image_info = {} - self.ann_info = {} - - self.img_prefix = os.path.join(dataset_dir, image_dir) - self.transform = transform - self.test_mode = test_mode - - self.ann_info['num_joints'] = num_joints - self.img_ids = [] - - def parse_dataset(self): - pass - - def __len__(self): - """Get dataset length.""" - return len(self.img_ids) - - def _get_imganno(self, idx): - """Get anno for a single image.""" - raise NotImplementedError - - def __getitem__(self, idx): - """Prepare image for training given the index.""" - records = copy.deepcopy(self._get_imganno(idx)) - records['image'] = cv2.imread(records['image_file']) - records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) - if 'mask' in records: - records['mask'] = (records['mask'] + 0).astype('uint8') - records = self.transform(records) - return records - - def parse_dataset(self): - return - - -@register -@serializable -class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset): - """COCO dataset for bottom-up pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - COCO keypoint indexes:: - - 0: 'nose', - 1: 'left_eye', - 2: 'right_eye', - 3: 'left_ear', - 4: 'right_ear', - 5: 'left_shoulder', - 6: 'right_shoulder', - 7: 'left_elbow', - 8: 'right_elbow', - 9: 'left_wrist', - 10: 'right_wrist', - 11: 'left_hip', - 12: 'right_hip', - 13: 'left_knee', - 14: 'right_knee', - 15: 'left_ankle', - 16: 'right_ankle' - - Args: - dataset_dir (str): Root path to the dataset. - anno_path (str): Relative path to the annotation file. - image_dir (str): Path to a directory where images are held. - Default: None. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - shard (list): [rank, worldsize], the distributed env params - test_mode (bool): Store True when building test or - validation dataset. Default: False. 
- """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[], - shard=[0, 1], - test_mode=False, - return_mask=True, - return_bbox=True, - return_area=True, - return_class=True): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform, shard, test_mode) - - self.ann_file = os.path.join(dataset_dir, anno_path) - self.shard = shard - self.test_mode = test_mode - self.return_mask = return_mask - self.return_bbox = return_bbox - self.return_area = return_area - self.return_class = return_class - - def parse_dataset(self): - self.coco = COCO(self.ann_file) - - self.img_ids = self.coco.getImgIds() - if not self.test_mode: - self.img_ids_tmp = [] - for img_id in self.img_ids: - ann_ids = self.coco.getAnnIds(imgIds=img_id) - anno = self.coco.loadAnns(ann_ids) - anno = [obj for obj in anno if obj['iscrowd'] == 0] - if len(anno) == 0: - continue - self.img_ids_tmp.append(img_id) - self.img_ids = self.img_ids_tmp - - blocknum = int(len(self.img_ids) / self.shard[1]) - self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( - self.shard[0] + 1))] - self.num_images = len(self.img_ids) - self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) - self.dataset_name = 'coco' - - cat_ids = self.coco.getCatIds() - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - print('=> num_images: {}'.format(self.num_images)) - - @staticmethod - def _get_mapping_id_name(imgs): - """ - Args: - imgs (dict): dict of image info. - - Returns: - tuple: Image name & id mapping dicts. - - - id2name (dict): Mapping image id to name. - - name2id (dict): Mapping image name to id. - """ - id2name = {} - name2id = {} - for image_id, image in imgs.items(): - file_name = image['file_name'] - id2name[image_id] = file_name - name2id[file_name] = image_id - - return id2name, name2id - - def _get_imganno(self, idx): - """Get anno for a single image. 
- - Args: - idx (int): image idx - - Returns: - dict: info for model training - """ - coco = self.coco - img_id = self.img_ids[idx] - ann_ids = coco.getAnnIds(imgIds=img_id) - anno = coco.loadAnns(ann_ids) - - anno = [ - obj for obj in anno - if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0 - ] - - db_rec = {} - joints, orgsize = self._get_joints(anno, idx) - db_rec['gt_joints'] = joints - db_rec['im_shape'] = orgsize - - if self.return_bbox: - db_rec['gt_bbox'] = self._get_bboxs(anno, idx) - - if self.return_class: - db_rec['gt_class'] = self._get_labels(anno, idx) - - if self.return_area: - db_rec['gt_areas'] = self._get_areas(anno, idx) - - if self.return_mask: - db_rec['mask'] = self._get_mask(anno, idx) - - db_rec['im_id'] = img_id - db_rec['image_file'] = os.path.join(self.img_prefix, - self.id2name[img_id]) - - return db_rec - - def _get_joints(self, anno, idx): - """Get joints for all people in an image.""" - num_people = len(anno) - - joints = np.zeros( - (num_people, self.ann_info['num_joints'], 3), dtype=np.float32) - - for i, obj in enumerate(anno): - joints[i, :self.ann_info['num_joints'], :3] = \ - np.array(obj['keypoints']).reshape([-1, 3]) - - img_info = self.coco.loadImgs(self.img_ids[idx])[0] - orgsize = np.array([img_info['height'], img_info['width'], 1]) - - return joints, orgsize - - def _get_bboxs(self, anno, idx): - num_people = len(anno) - gt_bboxes = np.zeros((num_people, 4), dtype=np.float32) - - for idx, obj in enumerate(anno): - if 'bbox' in obj: - gt_bboxes[idx, :] = obj['bbox'] - - gt_bboxes[:, 2] += gt_bboxes[:, 0] - gt_bboxes[:, 3] += gt_bboxes[:, 1] - return gt_bboxes - - def _get_labels(self, anno, idx): - num_people = len(anno) - gt_labels = np.zeros((num_people, 1), dtype=np.float32) - - for idx, obj in enumerate(anno): - if 'category_id' in obj: - catid = obj['category_id'] - gt_labels[idx, 0] = self.catid2clsid[catid] - return gt_labels - - def _get_areas(self, anno, idx): - num_people = len(anno) - gt_areas = np.zeros((num_people, ), dtype=np.float32) - - for idx, obj in enumerate(anno): - if 'area' in obj: - gt_areas[idx, ] = obj['area'] - return gt_areas - - def _get_mask(self, anno, idx): - """Get ignore masks to mask out losses.""" - coco = self.coco - img_info = coco.loadImgs(self.img_ids[idx])[0] - - m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32) - - for obj in anno: - if 'segmentation' in obj: - if obj['iscrowd']: - rle = pycocotools.mask.frPyObjects(obj['segmentation'], - img_info['height'], - img_info['width']) - m += pycocotools.mask.decode(rle) - elif obj['num_keypoints'] == 0: - rles = pycocotools.mask.frPyObjects(obj['segmentation'], - img_info['height'], - img_info['width']) - for rle in rles: - m += pycocotools.mask.decode(rle) - - return m < 0.5 - - -@register -@serializable -class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset): - """CrowdPose dataset for bottom-up pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - CrowdPose keypoint indexes:: - - 0: 'left_shoulder', - 1: 'right_shoulder', - 2: 'left_elbow', - 3: 'right_elbow', - 4: 'left_wrist', - 5: 'right_wrist', - 6: 'left_hip', - 7: 'right_hip', - 8: 'left_knee', - 9: 'right_knee', - 10: 'left_ankle', - 11: 'right_ankle', - 12: 'top_head', - 13: 'neck' - - Args: - dataset_dir (str): Root path to the dataset. - anno_path (str): Relative path to the annotation file. - image_dir (str): Path to a directory where images are held. 
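[Editor's note] `_get_bboxs` above converts COCO's `[x, y, w, h]` boxes to `[x1, y1, x2, y2]` in place. The same two lines on a toy array:

```python
import numpy as np

boxes = np.array([[10., 20., 30., 40.]])  # one COCO-style [x, y, w, h] box
boxes[:, 2] += boxes[:, 0]                # x2 = x + w
boxes[:, 3] += boxes[:, 1]                # y2 = y + h
print(boxes)                              # [[10. 20. 40. 60.]]
```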
- Default: None. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - shard (list): [rank, worldsize], the distributed env params - test_mode (bool): Store True when building test or - validation dataset. Default: False. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[], - shard=[0, 1], - test_mode=False): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform, shard, test_mode) - - self.ann_file = os.path.join(dataset_dir, anno_path) - self.shard = shard - self.test_mode = test_mode - - def parse_dataset(self): - self.coco = COCO(self.ann_file) - - self.img_ids = self.coco.getImgIds() - if not self.test_mode: - self.img_ids = [ - img_id for img_id in self.img_ids - if len(self.coco.getAnnIds( - imgIds=img_id, iscrowd=None)) > 0 - ] - blocknum = int(len(self.img_ids) / self.shard[1]) - self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( - self.shard[0] + 1))] - self.num_images = len(self.img_ids) - self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) - - self.dataset_name = 'crowdpose' - print('=> num_images: {}'.format(self.num_images)) - - -@serializable -class KeypointTopDownBaseDataset(DetDataset): - """Base class for top_down datasets. - - All datasets should subclass it. - All subclasses should overwrite: - Methods:`_get_db` - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[]): - super().__init__(dataset_dir, image_dir, anno_path) - self.image_info = {} - self.ann_info = {} - - self.img_prefix = os.path.join(dataset_dir, image_dir) - self.transform = transform - - self.ann_info['num_joints'] = num_joints - self.db = [] - - def __len__(self): - """Get dataset length.""" - return len(self.db) - - def _get_db(self): - """Get a sample""" - raise NotImplementedError - - def __getitem__(self, idx): - """Prepare sample for training given the index.""" - records = copy.deepcopy(self.db[idx]) - records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR | - cv2.IMREAD_IGNORE_ORIENTATION) - records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) - records['score'] = records['score'] if 'score' in records else 1 - records = self.transform(records) - # print('records', records) - return records - - -@register -@serializable -class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): - """COCO dataset for top-down pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - COCO keypoint indexes: - - 0: 'nose', - 1: 'left_eye', - 2: 'right_eye', - 3: 'left_ear', - 4: 'right_ear', - 5: 'left_shoulder', - 6: 'right_shoulder', - 7: 'left_elbow', - 8: 'right_elbow', - 9: 'left_wrist', - 10: 'right_wrist', - 11: 'left_hip', - 12: 'right_hip', - 13: 'left_knee', - 14: 'right_knee', - 15: 'left_ankle', - 16: 'right_ankle' - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. 
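[Editor's note] The `__getitem__` above reads images with `cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION`; the second flag stops OpenCV from auto-rotating JPEGs that carry an EXIF orientation tag, so pixel coordinates stay aligned with the stored annotations. In isolation (path illustrative):

```python
import cv2

flags = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
img = cv2.imread('demo.jpg', flags)          # BGR, never EXIF-rotated
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # models expect RGB
```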
- num_joints (int): Keypoint numbers - trainsize (list):[w, h] Image target size - transform (composed(operators)): A sequence of data transforms. - bbox_file (str): Path to a detection bbox file - Default: None. - use_gt_bbox (bool): Whether to use ground truth bbox - Default: True. - pixel_std (int): The pixel std of the scale - Default: 200. - image_thre (float): The threshold to filter the detection box - Default: 0.0. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - trainsize, - transform=[], - bbox_file=None, - use_gt_bbox=True, - pixel_std=200, - image_thre=0.0, - center_scale=None): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform) - - self.bbox_file = bbox_file - self.use_gt_bbox = use_gt_bbox - self.trainsize = trainsize - self.pixel_std = pixel_std - self.image_thre = image_thre - self.center_scale = center_scale - self.dataset_name = 'coco' - - def parse_dataset(self): - if self.use_gt_bbox: - self.db = self._load_coco_keypoint_annotations() - else: - self.db = self._load_coco_person_detection_results() - - def _load_coco_keypoint_annotations(self): - coco = COCO(self.get_anno()) - img_ids = coco.getImgIds() - gt_db = [] - for index in img_ids: - im_ann = coco.loadImgs(index)[0] - width = im_ann['width'] - height = im_ann['height'] - file_name = im_ann['file_name'] - im_id = int(im_ann["id"]) - - annIds = coco.getAnnIds(imgIds=index, iscrowd=False) - objs = coco.loadAnns(annIds) - - valid_objs = [] - for obj in objs: - x, y, w, h = obj['bbox'] - x1 = np.max((0, x)) - y1 = np.max((0, y)) - x2 = np.min((width - 1, x1 + np.max((0, w - 1)))) - y2 = np.min((height - 1, y1 + np.max((0, h - 1)))) - if obj['area'] > 0 and x2 >= x1 and y2 >= y1: - obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] - valid_objs.append(obj) - objs = valid_objs - - rec = [] - for obj in objs: - if max(obj['keypoints']) == 0: - continue - - joints = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - joints_vis = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - for ipt in range(self.ann_info['num_joints']): - joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0] - joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1] - joints[ipt, 2] = 0 - t_vis = obj['keypoints'][ipt * 3 + 2] - if t_vis > 1: - t_vis = 1 - joints_vis[ipt, 0] = t_vis - joints_vis[ipt, 1] = t_vis - joints_vis[ipt, 2] = 0 - - center, scale = self._box2cs(obj['clean_bbox'][:4]) - rec.append({ - 'image_file': os.path.join(self.img_prefix, file_name), - 'center': center, - 'scale': scale, - 'gt_joints': joints, - 'joints_vis': joints_vis, - 'im_id': im_id, - }) - gt_db.extend(rec) - - return gt_db - - def _box2cs(self, box): - x, y, w, h = box[:4] - center = np.zeros((2), dtype=np.float32) - center[0] = x + w * 0.5 - center[1] = y + h * 0.5 - aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] - - if self.center_scale is not None and np.random.rand() < 0.3: - center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h] - - if w > aspect_ratio * h: - h = w * 1.0 / aspect_ratio - elif w < aspect_ratio * h: - w = h * aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - if center[0] != -1: - scale = scale * 1.25 - - return center, scale - - def _load_coco_person_detection_results(self): - all_boxes = None - bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file) - with open(bbox_file_path, 'r') as f: - all_boxes = json.load(f) - - if not all_boxes: - print('=> Load %s fail!' 
% bbox_file_path) - return None - - kpt_db = [] - for n_img in range(0, len(all_boxes)): - det_res = all_boxes[n_img] - if det_res['category_id'] != 1: - continue - file_name = det_res[ - 'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[ - 'image_id'] - img_name = os.path.join(self.img_prefix, file_name) - box = det_res['bbox'] - score = det_res['score'] - im_id = int(det_res['image_id']) - - if score < self.image_thre: - continue - - center, scale = self._box2cs(box) - joints = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - joints_vis = np.ones( - (self.ann_info['num_joints'], 3), dtype=np.float32) - kpt_db.append({ - 'image_file': img_name, - 'im_id': im_id, - 'center': center, - 'scale': scale, - 'score': score, - 'gt_joints': joints, - 'joints_vis': joints_vis, - }) - - return kpt_db - - -@register -@serializable -class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset): - """CocoWholeBody dataset for top-down hand pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - COCO-WholeBody Hand keypoint indexes: - - 0: 'wrist', - 1: 'thumb1', - 2: 'thumb2', - 3: 'thumb3', - 4: 'thumb4', - 5: 'forefinger1', - 6: 'forefinger2', - 7: 'forefinger3', - 8: 'forefinger4', - 9: 'middle_finger1', - 10: 'middle_finger2', - 11: 'middle_finger3', - 12: 'middle_finger4', - 13: 'ring_finger1', - 14: 'ring_finger2', - 15: 'ring_finger3', - 16: 'ring_finger4', - 17: 'pinky_finger1', - 18: 'pinky_finger2', - 19: 'pinky_finger3', - 20: 'pinky_finger4' - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. - num_joints (int): Keypoint numbers - trainsize (list):[w, h] Image target size - transform (composed(operators)): A sequence of data transforms. - pixel_std (int): The pixel std of the scale - Default: 200. 
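[Editor's note] `_box2cs` above turns a box into the center/scale pair used by top-down pose pipelines: fit the box to the training aspect ratio, express its size in `pixel_std` units, then enlarge by 25%. A condensed version; the `trainsize` default is illustrative and the random `center_scale` jitter is omitted:

```python
import numpy as np

def box2cs(box, trainsize=(288, 384), pixel_std=200):
    x, y, w, h = box[:4]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    aspect = trainsize[0] / trainsize[1]
    if w > aspect * h:
        h = w / aspect                    # box too wide: grow height
    elif w < aspect * h:
        w = h * aspect                    # box too tall: grow width
    scale = np.array([w / pixel_std, h / pixel_std], dtype=np.float32)
    if center[0] != -1:
        scale = scale * 1.25              # enlarge the crop by 25%
    return center, scale

print(box2cs([10, 20, 100, 50]))  # center [60. 45.], scale ~[0.625 0.833]
```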
- """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - trainsize, - transform=[], - pixel_std=200): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform) - - self.trainsize = trainsize - self.pixel_std = pixel_std - self.dataset_name = 'coco_wholebady_hand' - - def _box2cs(self, box): - x, y, w, h = box[:4] - center = np.zeros((2), dtype=np.float32) - center[0] = x + w * 0.5 - center[1] = y + h * 0.5 - aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] - - if w > aspect_ratio * h: - h = w * 1.0 / aspect_ratio - elif w < aspect_ratio * h: - w = h * aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - if center[0] != -1: - scale = scale * 1.25 - - return center, scale - - def parse_dataset(self): - gt_db = [] - num_joints = self.ann_info['num_joints'] - coco = COCO(self.get_anno()) - img_ids = list(coco.imgs.keys()) - for img_id in img_ids: - im_ann = coco.loadImgs(img_id)[0] - image_file = os.path.join(self.img_prefix, im_ann['file_name']) - im_id = int(im_ann["id"]) - - ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) - objs = coco.loadAnns(ann_ids) - - for obj in objs: - for type in ['left', 'right']: - if (obj[f'{type}hand_valid'] and - max(obj[f'{type}hand_kpts']) > 0): - - joints = np.zeros((num_joints, 3), dtype=np.float32) - joints_vis = np.zeros((num_joints, 3), dtype=np.float32) - - keypoints = np.array(obj[f'{type}hand_kpts']) - keypoints = keypoints.reshape(-1, 3) - joints[:, :2] = keypoints[:, :2] - joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) - - center, scale = self._box2cs(obj[f'{type}hand_box'][:4]) - gt_db.append({ - 'image_file': image_file, - 'center': center, - 'scale': scale, - 'gt_joints': joints, - 'joints_vis': joints_vis, - 'im_id': im_id, - }) - - self.db = gt_db - - -@register -@serializable -class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset): - """MPII dataset for topdown pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - MPII keypoint indexes:: - - 0: 'right_ankle', - 1: 'right_knee', - 2: 'right_hip', - 3: 'left_hip', - 4: 'left_knee', - 5: 'left_ankle', - 6: 'pelvis', - 7: 'thorax', - 8: 'upper_neck', - 9: 'head_top', - 10: 'right_wrist', - 11: 'right_elbow', - 12: 'right_shoulder', - 13: 'left_shoulder', - 14: 'left_elbow', - 15: 'left_wrist', - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. - num_joints (int): Keypoint numbers - trainsize (list):[w, h] Image target size - transform (composed(operators)): A sequence of data transforms. 
- """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[]): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform) - - self.dataset_name = 'mpii' - - def parse_dataset(self): - with open(self.get_anno()) as anno_file: - anno = json.load(anno_file) - - gt_db = [] - for a in anno: - image_name = a['image'] - im_id = a['image_id'] if 'image_id' in a else int( - os.path.splitext(image_name)[0]) - - c = np.array(a['center'], dtype=np.float32) - s = np.array([a['scale'], a['scale']], dtype=np.float32) - - # Adjust center/scale slightly to avoid cropping limbs - if c[0] != -1: - c[1] = c[1] + 15 * s[1] - s = s * 1.25 - c = c - 1 - - joints = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - joints_vis = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - if 'gt_joints' in a: - joints_ = np.array(a['gt_joints']) - joints_[:, 0:2] = joints_[:, 0:2] - 1 - joints_vis_ = np.array(a['joints_vis']) - assert len(joints_) == self.ann_info[ - 'num_joints'], 'joint num diff: {} vs {}'.format( - len(joints_), self.ann_info['num_joints']) - - joints[:, 0:2] = joints_[:, 0:2] - joints_vis[:, 0] = joints_vis_[:] - joints_vis[:, 1] = joints_vis_[:] - - gt_db.append({ - 'image_file': os.path.join(self.img_prefix, image_name), - 'im_id': im_id, - 'center': c, - 'scale': s, - 'gt_joints': joints, - 'joints_vis': joints_vis - }) - print("number length: {}".format(len(gt_db))) - self.db = gt_db diff --git a/pdfdet/models/Paddle/ppdet/data/source/mot.py b/pdfdet/models/Paddle/ppdet/data/source/mot.py deleted file mode 100644 index 90a8a1f..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/mot.py +++ /dev/null @@ -1,638 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import cv2 -import glob -import numpy as np -from collections import OrderedDict, defaultdict -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from .dataset import DetDataset, _make_dataset, _is_valid_file -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class MOTDataSet(DetDataset): - """ - Load dataset with MOT format, only support single class MOT. - - Args: - dataset_dir (str): root directory for dataset. - image_lists (str|list): mot data image lists, muiti-source mot dataset. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - repeat (int): repeat times for dataset, use in benchmark. - - Notes: - MOT datasets root directory following this: - dataset/mot - |——————image_lists - | |——————caltech.train - | |——————caltech.val - | |——————mot16.train - | |——————mot17.train - | ...... - |——————Caltech - |——————MOT17 - |——————...... 
- - All the MOT datasets have the following structure: - Caltech - |——————images - | └——————00001.jpg - | |—————— ... - | └——————0000N.jpg - └——————labels_with_ids - └——————00001.txt - |—————— ... - └——————0000N.txt - or - - MOT17 - |——————images - | └——————train - | └——————test - └——————labels_with_ids - └——————train - """ - - def __init__(self, - dataset_dir=None, - image_lists=[], - data_fields=['image'], - sample_num=-1, - repeat=1): - super(MOTDataSet, self).__init__( - dataset_dir=dataset_dir, - data_fields=data_fields, - sample_num=sample_num, - repeat=repeat) - self.dataset_dir = dataset_dir - self.image_lists = image_lists - if isinstance(self.image_lists, str): - self.image_lists = [self.image_lists] - self.roidbs = None - self.cname2cid = None - - def get_anno(self): - if self.image_lists == []: - return - # only used to get categories and metric - # only check first data, but the label_list of all data should be same. - first_mot_data = self.image_lists[0].split('.')[0] - anno_file = os.path.join(self.dataset_dir, first_mot_data, - 'label_list.txt') - return anno_file - - def parse_dataset(self): - self.img_files = OrderedDict() - self.img_start_index = OrderedDict() - self.label_files = OrderedDict() - self.tid_num = OrderedDict() - self.tid_start_index = OrderedDict() - - img_index = 0 - for data_name in self.image_lists: - # check every data image list - image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') - assert os.path.isdir(image_lists_dir), \ - "The {} is not a directory.".format(image_lists_dir) - - list_path = os.path.join(image_lists_dir, data_name) - assert os.path.exists(list_path), \ - "The list path {} does not exist.".format(list_path) - - # record img_files, filter out empty ones - with open(list_path, 'r') as file: - self.img_files[data_name] = file.readlines() - self.img_files[data_name] = [ - os.path.join(self.dataset_dir, x.strip()) - for x in self.img_files[data_name] - ] - self.img_files[data_name] = list( - filter(lambda x: len(x) > 0, self.img_files[data_name])) - - self.img_start_index[data_name] = img_index - img_index += len(self.img_files[data_name]) - - # record label_files - self.label_files[data_name] = [ - x.replace('images', 'labels_with_ids').replace( - '.png', '.txt').replace('.jpg', '.txt') - for x in self.img_files[data_name] - ] - - for data_name, label_paths in self.label_files.items(): - max_index = -1 - for lp in label_paths: - lb = np.loadtxt(lp) - if len(lb) < 1: - continue - if len(lb.shape) < 2: - img_max = lb[1] - else: - img_max = np.max(lb[:, 1]) - if img_max > max_index: - max_index = img_max - self.tid_num[data_name] = int(max_index + 1) - - last_index = 0 - for i, (k, v) in enumerate(self.tid_num.items()): - self.tid_start_index[k] = last_index - last_index += v - - self.num_identities_dict = defaultdict(int) - self.num_identities_dict[0] = int(last_index + 1) # single class - self.num_imgs_each_data = [len(x) for x in self.img_files.values()] - self.total_imgs = sum(self.num_imgs_each_data) - - logger.info('MOT dataset summary: ') - logger.info(self.tid_num) - logger.info('Total images: {}'.format(self.total_imgs)) - logger.info('Image start index: {}'.format(self.img_start_index)) - logger.info('Total identities: {}'.format(self.num_identities_dict[0])) - logger.info('Identity start index: {}'.format(self.tid_start_index)) - - records = [] - cname2cid = mot_label() - - for img_index in range(self.total_imgs): - for i, (k, v) in enumerate(self.img_start_index.items()): - if img_index >= v: - data_name = 
list(self.label_files.keys())[i] - start_index = v - img_file = self.img_files[data_name][img_index - start_index] - lbl_file = self.label_files[data_name][img_index - start_index] - - if not os.path.exists(img_file): - logger.warning('Illegal image file: {}, and it will be ignored'. - format(img_file)) - continue - if not os.path.isfile(lbl_file): - logger.warning('Illegal label file: {}, and it will be ignored'. - format(lbl_file)) - continue - - labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) - # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] - - cx, cy = labels[:, 2], labels[:, 3] - w, h = labels[:, 4], labels[:, 5] - gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') - gt_class = labels[:, 0:1].astype('int32') - gt_score = np.ones((len(labels), 1)).astype('float32') - gt_ide = labels[:, 1:2].astype('int32') - for i, _ in enumerate(gt_ide): - if gt_ide[i] > -1: - gt_ide[i] += self.tid_start_index[data_name] - - mot_rec = { - 'im_file': img_file, - 'im_id': img_index, - } if 'image' in self.data_fields else {} - - gt_rec = { - 'gt_class': gt_class, - 'gt_score': gt_score, - 'gt_bbox': gt_bbox, - 'gt_ide': gt_ide, - } - - for k, v in gt_rec.items(): - if k in self.data_fields: - mot_rec[k] = v - - records.append(mot_rec) - if self.sample_num > 0 and img_index >= self.sample_num: - break - assert len(records) > 0, 'not found any mot record in %s' % ( - self.image_lists) - self.roidbs, self.cname2cid = records, cname2cid - - -@register -@serializable -class MCMOTDataSet(DetDataset): - """ - Load dataset with MOT format, support multi-class MOT. - - Args: - dataset_dir (str): root directory for dataset. - image_lists (list(str)): mcmot data image lists, muiti-source mcmot dataset. - data_fields (list): key name of data dictionary, at least have 'image'. - label_list (str): if use_default_label is False, will load - mapping between category and class index. - sample_num (int): number of samples to load, -1 means all. - - Notes: - MCMOT datasets root directory following this: - dataset/mot - |——————image_lists - | |——————visdrone_mcmot.train - | |——————visdrone_mcmot.val - visdrone_mcmot - |——————images - | └——————train - | └——————val - └——————labels_with_ids - └——————train - """ - - def __init__(self, - dataset_dir=None, - image_lists=[], - data_fields=['image'], - label_list=None, - sample_num=-1): - super(MCMOTDataSet, self).__init__( - dataset_dir=dataset_dir, - data_fields=data_fields, - sample_num=sample_num) - self.dataset_dir = dataset_dir - self.image_lists = image_lists - if isinstance(self.image_lists, str): - self.image_lists = [self.image_lists] - self.label_list = label_list - self.roidbs = None - self.cname2cid = None - - def get_anno(self): - if self.image_lists == []: - return - # only used to get categories and metric - # only check first data, but the label_list of all data should be same. 
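[Editor's note] Two conventions from the MOT loader above, shown standalone: label files mirror image paths under `labels_with_ids`, and each label row is `[gt_class, gt_identity, cx, cy, w, h]`:

```python
import numpy as np

def label_path_for(img_path):
    return (img_path.replace('images', 'labels_with_ids')
                    .replace('.png', '.txt').replace('.jpg', '.txt'))

print(label_path_for('MOT17/images/train/seq/000001.jpg'))
# -> MOT17/labels_with_ids/train/seq/000001.txt

row = np.array([[0, 7, 0.5, 0.5, 0.1, 0.2]], dtype=np.float32)
gt_class = row[:, 0:1].astype('int32')   # class index (0 = person)
gt_ide = row[:, 1:2].astype('int32')     # track id, later offset per dataset
gt_bbox = row[:, 2:6]                    # center-x, center-y, width, height
```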
- first_mot_data = self.image_lists[0].split('.')[0] - anno_file = os.path.join(self.dataset_dir, first_mot_data, - 'label_list.txt') - return anno_file - - def parse_dataset(self): - self.img_files = OrderedDict() - self.img_start_index = OrderedDict() - self.label_files = OrderedDict() - self.tid_num = OrderedDict() - self.tid_start_idx_of_cls_ids = defaultdict(dict) # for MCMOT - - img_index = 0 - for data_name in self.image_lists: - # check every data image list - image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') - assert os.path.isdir(image_lists_dir), \ - "The {} is not a directory.".format(image_lists_dir) - - list_path = os.path.join(image_lists_dir, data_name) - assert os.path.exists(list_path), \ - "The list path {} does not exist.".format(list_path) - - # record img_files, filter out empty ones - with open(list_path, 'r') as file: - self.img_files[data_name] = file.readlines() - self.img_files[data_name] = [ - os.path.join(self.dataset_dir, x.strip()) - for x in self.img_files[data_name] - ] - self.img_files[data_name] = list( - filter(lambda x: len(x) > 0, self.img_files[data_name])) - - self.img_start_index[data_name] = img_index - img_index += len(self.img_files[data_name]) - - # record label_files - self.label_files[data_name] = [ - x.replace('images', 'labels_with_ids').replace( - '.png', '.txt').replace('.jpg', '.txt') - for x in self.img_files[data_name] - ] - - for data_name, label_paths in self.label_files.items(): - # using max_ids_dict rather than max_index - max_ids_dict = defaultdict(int) - for lp in label_paths: - lb = np.loadtxt(lp) - if len(lb) < 1: - continue - lb = lb.reshape(-1, 6) - for item in lb: - if item[1] > max_ids_dict[int(item[0])]: - # item[0]: cls_id - # item[1]: track id - max_ids_dict[int(item[0])] = int(item[1]) - # track id number - self.tid_num[data_name] = max_ids_dict - - last_idx_dict = defaultdict(int) - for i, (k, v) in enumerate(self.tid_num.items()): # each sub dataset - for cls_id, id_num in v.items(): # v is a max_ids_dict - self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id] - last_idx_dict[cls_id] += id_num - - self.num_identities_dict = defaultdict(int) - for k, v in last_idx_dict.items(): - self.num_identities_dict[k] = int(v) # total ids of each category - - self.num_imgs_each_data = [len(x) for x in self.img_files.values()] - self.total_imgs = sum(self.num_imgs_each_data) - - # cname2cid and cid2cname - cname2cid = {} - if self.label_list is not None: - # if use label_list for multi source mix dataset, - # please make sure label_list in the first sub_dataset at least. - sub_dataset = self.image_lists[0].split('.')[0] - label_path = os.path.join(self.dataset_dir, sub_dataset, - self.label_list) - if not os.path.exists(label_path): - logger.info( - "Note: label_list {} does not exists, use VisDrone 10 classes labels as default.". 
- format(label_path)) - cname2cid = visdrone_mcmot_label() - else: - with open(label_path, 'r') as fr: - label_id = 0 - for line in fr.readlines(): - cname2cid[line.strip()] = label_id - label_id += 1 - else: - cname2cid = visdrone_mcmot_label() - - cid2cname = dict([(v, k) for (k, v) in cname2cid.items()]) - - logger.info('MCMOT dataset summary: ') - logger.info(self.tid_num) - logger.info('Total images: {}'.format(self.total_imgs)) - logger.info('Image start index: {}'.format(self.img_start_index)) - - logger.info('Total identities of each category: ') - num_identities_dict = sorted( - self.num_identities_dict.items(), key=lambda x: x[0]) - total_IDs_all_cats = 0 - for (k, v) in num_identities_dict: - logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k], - v)) - total_IDs_all_cats += v - logger.info('Total identities of all categories: {}'.format( - total_IDs_all_cats)) - - logger.info('Identity start index of each category: ') - for k, v in self.tid_start_idx_of_cls_ids.items(): - sorted_v = sorted(v.items(), key=lambda x: x[0]) - for (cls_id, start_idx) in sorted_v: - logger.info('Start index of dataset {} category {:d} is {:d}' - .format(k, cls_id, start_idx)) - - records = [] - for img_index in range(self.total_imgs): - for i, (k, v) in enumerate(self.img_start_index.items()): - if img_index >= v: - data_name = list(self.label_files.keys())[i] - start_index = v - img_file = self.img_files[data_name][img_index - start_index] - lbl_file = self.label_files[data_name][img_index - start_index] - - if not os.path.exists(img_file): - logger.warning('Illegal image file: {}, and it will be ignored'. - format(img_file)) - continue - if not os.path.isfile(lbl_file): - logger.warning('Illegal label file: {}, and it will be ignored'. - format(lbl_file)) - continue - - labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) - # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] - - cx, cy = labels[:, 2], labels[:, 3] - w, h = labels[:, 4], labels[:, 5] - gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') - gt_class = labels[:, 0:1].astype('int32') - gt_score = np.ones((len(labels), 1)).astype('float32') - gt_ide = labels[:, 1:2].astype('int32') - for i, _ in enumerate(gt_ide): - if gt_ide[i] > -1: - cls_id = int(gt_class[i]) - start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id] - gt_ide[i] += start_idx - - mot_rec = { - 'im_file': img_file, - 'im_id': img_index, - } if 'image' in self.data_fields else {} - - gt_rec = { - 'gt_class': gt_class, - 'gt_score': gt_score, - 'gt_bbox': gt_bbox, - 'gt_ide': gt_ide, - } - - for k, v in gt_rec.items(): - if k in self.data_fields: - mot_rec[k] = v - - records.append(mot_rec) - if self.sample_num > 0 and img_index >= self.sample_num: - break - assert len(records) > 0, 'not found any mot record in %s' % ( - self.image_lists) - self.roidbs, self.cname2cid = records, cname2cid - - -@register -@serializable -class MOTImageFolder(DetDataset): - """ - Load MOT dataset with MOT format from image folder or video . - Args: - video_file (str): path of the video file, default ''. - frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set. - dataset_dir (str): root directory for dataset. - keep_ori_im (bool): whether to keep original image, default False. - Set True when used during MOT model inference while saving - images or video, or used in DeepSORT. 
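[Editor's note] The MCMOT bookkeeping above makes track ids globally unique per class: each sub-dataset reports its max id per class, and start indices accumulate across sub-datasets. A minimal sketch with illustrative counts:

```python
from collections import defaultdict

tid_num = {'setA': {0: 5, 1: 3}, 'setB': {0: 2}}   # max ids per class
start_idx = defaultdict(dict)
last = defaultdict(int)
for name, per_cls in tid_num.items():
    for cls_id, id_num in per_cls.items():
        start_idx[name][cls_id] = last[cls_id]     # offset for this subset
        last[cls_id] += id_num
print(dict(start_idx))   # {'setA': {0: 0, 1: 0}, 'setB': {0: 5}}
```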
- """ - - def __init__(self, - video_file=None, - frame_rate=-1, - dataset_dir=None, - data_root=None, - image_dir=None, - sample_num=-1, - keep_ori_im=False, - anno_path=None, - **kwargs): - super(MOTImageFolder, self).__init__( - dataset_dir, image_dir, sample_num=sample_num) - self.video_file = video_file - self.data_root = data_root - self.keep_ori_im = keep_ori_im - self._imid2path = {} - self.roidbs = None - self.frame_rate = frame_rate - self.anno_path = anno_path - - def check_or_download_dataset(self): - return - - def parse_dataset(self, ): - if not self.roidbs: - if self.video_file is None: - self.frame_rate = 30 # set as default if infer image folder - self.roidbs = self._load_images() - else: - self.roidbs = self._load_video_images() - - def _load_video_images(self): - if self.frame_rate == -1: - # if frame_rate is not set for video, use cv2.VideoCapture - cap = cv2.VideoCapture(self.video_file) - self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) - - extension = self.video_file.split('.')[-1] - output_path = self.video_file.replace('.{}'.format(extension), '') - frames_path = video2frames(self.video_file, output_path, - self.frame_rate) - self.video_frames = sorted( - glob.glob(os.path.join(frames_path, '*.png'))) - - self.video_length = len(self.video_frames) - logger.info('Length of the video: {:d} frames.'.format( - self.video_length)) - ct = 0 - records = [] - for image in self.video_frames: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = {'im_id': np.array([ct]), 'im_file': image} - if self.keep_ori_im: - rec.update({'keep_ori_im': 1}) - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def _find_images(self): - image_dir = self.image_dir - if not isinstance(image_dir, Sequence): - image_dir = [image_dir] - images = [] - for im_dir in image_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.dataset_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._find_images() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = {'im_id': np.array([ct]), 'im_file': image} - if self.keep_ori_im: - rec.update({'keep_ori_im': 1}) - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def set_images(self, images): - self.image_dir = images - self.roidbs = self._load_images() - - def set_video(self, video_file, frame_rate): - # update video_file and frame_rate by command line of tools/infer_mot.py - self.video_file = video_file - self.frame_rate = frame_rate - assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \ - "wrong or unsupported file format: {}".format(self.video_file) - self.roidbs = self._load_video_images() - - def get_anno(self): - return self.anno_path - - -def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')): - return f.lower().endswith(extensions) - - -def video2frames(video_path, outpath, frame_rate, **kargs): - def _dict2str(kargs): - cmd_str = '' - for k, v in kargs.items(): - cmd_str += (' ' + str(k) + ' ' + str(v)) 
- return cmd_str - - ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error '] - vid_name = os.path.basename(video_path).split('.')[0] - out_full_path = os.path.join(outpath, vid_name) - - if not os.path.exists(out_full_path): - os.makedirs(out_full_path) - - # video file name - outformat = os.path.join(out_full_path, '%08d.png') - - cmd = ffmpeg - cmd = ffmpeg + [ - ' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat - ] - cmd = ''.join(cmd) + _dict2str(kargs) - - if os.system(cmd) != 0: - raise RuntimeError('ffmpeg process video: {} error'.format(video_path)) - sys.exit(-1) - - sys.stdout.flush() - return out_full_path - - -def mot_label(): - labels_map = {'person': 0} - return labels_map - - -def visdrone_mcmot_label(): - labels_map = { - 'pedestrian': 0, - 'people': 1, - 'bicycle': 2, - 'car': 3, - 'van': 4, - 'truck': 5, - 'tricycle': 6, - 'awning-tricycle': 7, - 'bus': 8, - 'motor': 9, - } - return labels_map diff --git a/pdfdet/models/Paddle/ppdet/data/source/pose3d_cmb.py b/pdfdet/models/Paddle/ppdet/data/source/pose3d_cmb.py deleted file mode 100644 index 06dbdd9..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/pose3d_cmb.py +++ /dev/null @@ -1,380 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import cv2 -import numpy as np -import json -import copy -import pycocotools -from pycocotools.coco import COCO -from .dataset import DetDataset -from ppdet.core.workspace import register, serializable -from paddle.io import Dataset - - -@serializable -class Pose3DDataset(DetDataset): - """Pose3D Dataset class. - - Args: - dataset_dir (str): Root path to the dataset. - anno_list (list of str): each of the element is a relative path to the annotation file. - image_dirs (list of str): each of path is a relative path where images are held. - transform (composed(operators)): A sequence of data transforms. - test_mode (bool): Store True when building test or - validation dataset. Default: False. 
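[Editor's note] `video2frames` above shells out to ffmpeg via `os.system`. A rough equivalent using `subprocess` for clarity; paths and rate are illustrative, and the output directory must exist first, as the original ensures with `os.makedirs`:

```python
import subprocess

subprocess.run(
    ['ffmpeg', '-y', '-loglevel', 'error',
     '-i', 'input.mp4',            # source video
     '-r', '25',                   # output frame rate
     '-f', 'image2', 'out/%08d.png'],
    check=True)                    # raises CalledProcessError on failure
```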
- 24 joints order: - 0-2: 'R_Ankle', 'R_Knee', 'R_Hip', - 3-5:'L_Hip', 'L_Knee', 'L_Ankle', - 6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder', - 9-11:'L_Shoulder','L_Elbow','L_Wrist', - 12-14:'Neck','Top_of_Head','Pelvis', - 15-18:'Thorax','Spine','Jaw','Head', - 19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear' - """ - - def __init__(self, - dataset_dir, - image_dirs, - anno_list, - transform=[], - num_joints=24, - test_mode=False): - super().__init__(dataset_dir, image_dirs, anno_list) - self.image_info = {} - self.ann_info = {} - self.num_joints = num_joints - - self.transform = transform - self.test_mode = test_mode - - self.img_ids = [] - self.dataset_dir = dataset_dir - self.image_dirs = image_dirs - self.anno_list = anno_list - - def get_mask(self, mvm_percent=0.3): - num_joints = self.num_joints - mjm_mask = np.ones((num_joints, 1)).astype(np.float32) - if self.test_mode == False: - pb = np.random.random_sample() - masked_num = int( - pb * mvm_percent * - num_joints) # at most x% of the joints could be masked - indices = np.random.choice( - np.arange(num_joints), replace=False, size=masked_num) - mjm_mask[indices, :] = 0.0 - # return mjm_mask - - num_joints = 10 - mvm_mask = np.ones((num_joints, 1)).astype(np.float) - if self.test_mode == False: - num_vertices = num_joints - pb = np.random.random_sample() - masked_num = int( - pb * mvm_percent * - num_vertices) # at most x% of the vertices could be masked - indices = np.random.choice( - np.arange(num_vertices), replace=False, size=masked_num) - mvm_mask[indices, :] = 0.0 - - mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0) - return mjm_mask - - def filterjoints(self, x): - if self.num_joints == 24: - return x - elif self.num_joints == 14: - return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :] - elif self.num_joints == 17: - return x[ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :] - else: - raise ValueError( - "unsupported joint numbers, only [24 or 17 or 14] is supported!") - - def parse_dataset(self): - print("Loading annotations..., please wait") - self.annos = [] - im_id = 0 - self.human36m_num = 0 - for idx, annof in enumerate(self.anno_list): - img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx]) - dataf = os.path.join(self.dataset_dir, annof) - with open(dataf, 'r') as rf: - anno_data = json.load(rf) - annos = anno_data['data'] - new_annos = [] - print("{} has annos numbers: {}".format(dataf, len(annos))) - for anno in annos: - new_anno = {} - new_anno['im_id'] = im_id - im_id += 1 - imagename = anno['imageName'] - if imagename.startswith("COCO_train2014_"): - imagename = imagename[len("COCO_train2014_"):] - elif imagename.startswith("COCO_val2014_"): - imagename = imagename[len("COCO_val2014_"):] - imagename = os.path.join(img_prefix, imagename) - if not os.path.exists(imagename): - if "train2017" in imagename: - imagename = imagename.replace("train2017", - "val2017") - if not os.path.exists(imagename): - print("cannot find imagepath:{}".format( - imagename)) - continue - else: - print("cannot find imagepath:{}".format(imagename)) - continue - new_anno['imageName'] = imagename - if 'human3.6m' in imagename: - self.human36m_num += 1 - new_anno['bbox_center'] = anno['bbox_center'] - new_anno['bbox_scale'] = anno['bbox_scale'] - new_anno['joints_2d'] = np.array(anno[ - 'gt_keypoint_2d']).astype(np.float32) - if new_anno['joints_2d'].shape[0] == 49: - #if the joints_2d is in SPIN format(which generated by eft), choose the last 24 public joints - #for detail please refer: 
https://github.com/nkolot/SPIN/blob/master/constants.py - new_anno['joints_2d'] = new_anno['joints_2d'][25:] - new_anno['joints_3d'] = np.array(anno[ - 'pose3d'])[:, :3].astype(np.float32) - new_anno['mjm_mask'] = self.get_mask() - if not 'has_3d_joints' in anno: - new_anno['has_3d_joints'] = int(1) - new_anno['has_2d_joints'] = int(1) - else: - new_anno['has_3d_joints'] = int(anno['has_3d_joints']) - new_anno['has_2d_joints'] = int(anno['has_2d_joints']) - new_anno['joints_2d'] = self.filterjoints(new_anno[ - 'joints_2d']) - self.annos.append(new_anno) - del annos - - def get_temp_num(self): - """get temporal data number, like human3.6m""" - return self.human36m_num - - def __len__(self): - """Get dataset length.""" - return len(self.annos) - - def _get_imganno(self, idx): - """Get anno for a single image.""" - return self.annos[idx] - - def __getitem__(self, idx): - """Prepare image for training given the index.""" - records = copy.deepcopy(self._get_imganno(idx)) - imgpath = records['imageName'] - assert os.path.exists(imgpath), "cannot find image {}".format(imgpath) - records['image'] = cv2.imread(imgpath) - records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) - records = self.transform(records) - return records - - def check_or_download_dataset(self): - alldatafind = True - for image_dir in self.image_dirs: - image_dir = os.path.join(self.dataset_dir, image_dir) - if not os.path.isdir(image_dir): - print("dataset [{}] is not found".format(image_dir)) - alldatafind = False - if not alldatafind: - raise ValueError( - "Some dataset is not valid and cannot download automatically now, please prepare the dataset first" - ) - - -@register -@serializable -class Keypoint3DMultiFramesDataset(Dataset): - """24 keypoints 3D dataset for pose estimation. - - each item is a list of images - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - """ - - def __init__( - self, - dataset_dir, # 数据集根目录 - image_dir, # 图像文件夹 - p3d_dir, # 3D关键点文件夹 - json_path, - img_size, #图像resize大小 - num_frames, # 帧序列长度 - anno_path=None, ): - - self.dataset_dir = dataset_dir - self.image_dir = image_dir - self.p3d_dir = p3d_dir - self.json_path = json_path - self.img_size = img_size - self.num_frames = num_frames - self.anno_path = anno_path - - self.data_labels, self.mf_inds = self._generate_multi_frames_list() - - def _generate_multi_frames_list(self): - act_list = os.listdir(self.dataset_dir) # 动作列表 - count = 0 - mf_list = [] - annos_dict = {'images': [], 'annotations': [], 'act_inds': []} - for act in act_list: #对每个动作,生成帧序列 - if '.' 
in act: - continue - - json_path = os.path.join(self.dataset_dir, act, self.json_path) - with open(json_path, 'r') as j: - annos = json.load(j) - length = len(annos['images']) - for k, v in annos.items(): - if k in annos_dict: - annos_dict[k].extend(v) - annos_dict['act_inds'].extend([act] * length) - - mf = [[i + j + count for j in range(self.num_frames)] - for i in range(0, length - self.num_frames + 1)] - mf_list.extend(mf) - count += length - - print("total data number:", len(mf_list)) - return annos_dict, mf_list - - def __call__(self, *args, **kwargs): - return self - - def __getitem__(self, index): # 拿一个连续的序列 - inds = self.mf_inds[ - index] # 如[568, 569, 570, 571, 572, 573],长度为num_frames - - images = self.data_labels['images'] # all images - annots = self.data_labels['annotations'] # all annots - - act = self.data_labels['act_inds'][inds[0]] # 动作名(文件夹名) - - kps3d_list = [] - kps3d_vis_list = [] - names = [] - - h, w = 0, 0 - for ind in inds: # one image - height = float(images[ind]['height']) - width = float(images[ind]['width']) - name = images[ind]['file_name'] # 图像名称,带有后缀 - - kps3d_name = name.split('.')[0] + '.obj' - kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir, - kps3d_name) - - joints, joints_vis = self.kps3d_process(kps3d_path) - joints_vis = np.array(joints_vis, dtype=np.float32) - - kps3d_list.append(joints) - kps3d_vis_list.append(joints_vis) - names.append(name) - - kps3d = np.array(kps3d_list) # (6, 24, 3),(num_frames, joints_num, 3) - kps3d_vis = np.array(kps3d_vis_list) - - # read image - imgs = [] - for name in names: - img_path = os.path.join(self.dataset_dir, act, self.image_dir, name) - - image = cv2.imread(img_path, cv2.IMREAD_COLOR | - cv2.IMREAD_IGNORE_ORIENTATION) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - - imgs.append(np.expand_dims(image, axis=0)) - - imgs = np.concatenate(imgs, axis=0) - imgs = imgs.astype( - np.float32) # (6, 1080, 1920, 3),(num_frames, h, w, c) - - # attention: 此时图像和标注是镜像的 - records = { - 'kps3d': kps3d, - 'kps3d_vis': kps3d_vis, - "image": imgs, - 'act': act, - 'names': names, - 'im_id': index - } - - return self.transform(records) - - def kps3d_process(self, kps3d_path): - count = 0 - kps = [] - kps_vis = [] - - with open(kps3d_path, 'r') as f: - lines = f.readlines() - for line in lines: - if line[0] == 'v': - kps.append([]) - line = line.strip('\n').split(' ')[1:] - for kp in line: - kps[-1].append(float(kp)) - count += 1 - - kps_vis.append([1, 1, 1]) - - kps = np.array(kps) # 52,3 - kps_vis = np.array(kps_vis) - - kps *= 10 # scale points - kps -= kps[[0], :] # set root point to zero - - kps = np.concatenate((kps[0:23], kps[[37]]), axis=0) # 24,3 - - kps *= 10 - - kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0) # 24,3 - - return kps, kps_vis - - def __len__(self): - return len(self.mf_inds) - - def get_anno(self): - if self.anno_path is None: - return - return os.path.join(self.dataset_dir, self.anno_path) - - def check_or_download_dataset(self): - return - - def parse_dataset(self, ): - return - - def set_transform(self, transform): - self.transform = transform - - def set_epoch(self, epoch_id): - self._epoch = epoch_id - - def set_kwargs(self, **kwargs): - self.mixup_epoch = kwargs.get('mixup_epoch', -1) - self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) - self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) diff --git a/pdfdet/models/Paddle/ppdet/data/source/sniper_coco.py b/pdfdet/models/Paddle/ppdet/data/source/sniper_coco.py deleted file mode 100644 index 1b07e7a..0000000 --- 
a/pdfdet/models/Paddle/ppdet/data/source/sniper_coco.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import cv2 -import json -import copy -import numpy as np - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -from ppdet.core.workspace import register, serializable -from ppdet.data.crop_utils.annotation_cropper import AnnoCropper -from .coco import COCODataSet -from .dataset import _make_dataset, _is_valid_file -from ppdet.utils.logger import setup_logger - -logger = setup_logger('sniper_coco_dataset') - - -@register -@serializable -class SniperCOCODataSet(COCODataSet): - """SniperCOCODataSet""" - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - proposals_file=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=True, - empty_ratio=1., - is_trainset=True, - image_target_sizes=[2000, 1000], - valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]], - chip_target_size=500, - chip_target_stride=200, - use_neg_chip=False, - max_neg_num_per_im=8, - max_per_img=-1, - nms_thresh=0.5): - super(SniperCOCODataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - load_crowd=load_crowd, - allow_empty=allow_empty, - empty_ratio=empty_ratio - ) - self.proposals_file = proposals_file - self.proposals = None - self.anno_cropper = None - self.is_trainset = is_trainset - self.image_target_sizes = image_target_sizes - self.valid_box_ratio_ranges = valid_box_ratio_ranges - self.chip_target_size = chip_target_size - self.chip_target_stride = chip_target_stride - self.use_neg_chip = use_neg_chip - self.max_neg_num_per_im = max_neg_num_per_im - self.max_per_img = max_per_img - self.nms_thresh = nms_thresh - - - def parse_dataset(self): - if not hasattr(self, "roidbs"): - super(SniperCOCODataSet, self).parse_dataset() - if self.is_trainset: - self._parse_proposals() - self._merge_anno_proposals() - self.ori_roidbs = copy.deepcopy(self.roidbs) - self.init_anno_cropper() - self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset) - - def set_proposals_file(self, file_path): - self.proposals_file = file_path - - def init_anno_cropper(self): - logger.info("Init AnnoCropper...") - self.anno_cropper = AnnoCropper( - image_target_sizes=self.image_target_sizes, - valid_box_ratio_ranges=self.valid_box_ratio_ranges, - chip_target_size=self.chip_target_size, - chip_target_stride=self.chip_target_stride, - use_neg_chip=self.use_neg_chip, - max_neg_num_per_im=self.max_neg_num_per_im, - max_per_img=self.max_per_img, - nms_thresh=self.nms_thresh - ) - - def generate_chips_roidbs(self, roidbs, is_trainset): - if is_trainset: - roidbs = self.anno_cropper.crop_anno_records(roidbs) - else: - roidbs = self.anno_cropper.crop_infer_anno_records(roidbs) - return roidbs - - def _parse_proposals(self): - if 
self.proposals_file: - self.proposals = {} - logger.info("Parse proposals file:{}".format(self.proposals_file)) - with open(self.proposals_file, 'r') as f: - proposals = json.load(f) - for prop in proposals: - image_id = prop["image_id"] - if image_id not in self.proposals: - self.proposals[image_id] = [] - x, y, w, h = prop["bbox"] - self.proposals[image_id].append([x, y, x + w, y + h]) - - def _merge_anno_proposals(self): - assert self.roidbs - if self.proposals and len(self.proposals.keys()) > 0: - logger.info("merge proposals to annos") - for id, record in enumerate(self.roidbs): - image_id = int(record["im_id"]) - if image_id not in self.proposals.keys(): - logger.info("image id :{} no proposals".format(image_id)) - record["proposals"] = np.array(self.proposals.get(image_id, []), dtype=np.float32) - self.roidbs[id] = record - - def get_ori_roidbs(self): - if not hasattr(self, "ori_roidbs"): - return None - return self.ori_roidbs - - def get_roidbs(self): - if not hasattr(self, "roidbs"): - self.parse_dataset() - return self.roidbs - - def set_roidbs(self, roidbs): - self.roidbs = roidbs - - def check_or_download_dataset(self): - return - - def _parse(self): - image_dir = self.image_dir - if not isinstance(image_dir, Sequence): - image_dir = [image_dir] - images = [] - for im_dir in image_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.dataset_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._parse() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - im = cv2.imread(image) - h, w, c = im.shape - rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w} - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def set_images(self, images): - self._imid2path = {} - self.image_dir = images - self.roidbs = self._load_images() - diff --git a/pdfdet/models/Paddle/ppdet/data/source/voc.py b/pdfdet/models/Paddle/ppdet/data/source/voc.py deleted file mode 100644 index 2f10358..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/voc.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np - -import xml.etree.ElementTree as ET - -from ppdet.core.workspace import register, serializable - -from .dataset import DetDataset - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class VOCDataSet(DetDataset): - """ - Load dataset with PascalVOC format. - - Notes: - `anno_path` must contains xml file and image file path for annotations. - - Args: - dataset_dir (str): root directory for dataset. 
- image_dir (str): directory for images. - anno_path (str): voc annotation file path. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - label_list (str): if use_default_label is False, will load - mapping between category and class index. - allow_empty (bool): whether to load empty entry. False as default - empty_ratio (float): the ratio of empty record number to total - record's, if empty_ratio is out of [0. ,1.), do not sample the - records and use all the empty entries. 1. as default - repeat (int): repeat times for dataset, use in benchmark. - """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - label_list=None, - allow_empty=False, - empty_ratio=1., - repeat=1): - super(VOCDataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - repeat=repeat) - self.label_list = label_list - self.allow_empty = allow_empty - self.empty_ratio = empty_ratio - - def _sample_empty(self, records, num): - # if empty_ratio is out of [0. ,1.), do not sample the records - if self.empty_ratio < 0. or self.empty_ratio >= 1.: - return records - import random - sample_num = min( - int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) - records = random.sample(records, sample_num) - return records - - def parse_dataset(self, ): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - # mapping category name to class id - # first_class:0, second_class:1, ... - records = [] - empty_records = [] - ct = 0 - cname2cid = {} - if self.label_list: - label_path = os.path.join(self.dataset_dir, self.label_list) - if not os.path.exists(label_path): - raise ValueError("label_list {} does not exists".format( - label_path)) - with open(label_path, 'r') as fr: - label_id = 0 - for line in fr.readlines(): - cname2cid[line.strip()] = label_id - label_id += 1 - else: - cname2cid = pascalvoc_label() - - with open(anno_path, 'r') as fr: - while True: - line = fr.readline() - if not line: - break - img_file, xml_file = [os.path.join(image_dir, x) \ - for x in line.strip().split()[:2]] - if not os.path.exists(img_file): - logger.warning( - 'Illegal image file: {}, and it will be ignored'.format( - img_file)) - continue - if not os.path.isfile(xml_file): - logger.warning( - 'Illegal xml file: {}, and it will be ignored'.format( - xml_file)) - continue - tree = ET.parse(xml_file) - if tree.find('id') is None: - im_id = np.array([ct]) - else: - im_id = np.array([int(tree.find('id').text)]) - - objs = tree.findall('object') - im_w = float(tree.find('size').find('width').text) - im_h = float(tree.find('size').find('height').text) - if im_w < 0 or im_h < 0: - logger.warning( - 'Illegal width: {} or height: {} in annotation, ' - 'and {} will be ignored'.format(im_w, im_h, xml_file)) - continue - - num_bbox, i = len(objs), 0 - gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - gt_class = np.zeros((num_bbox, 1), dtype=np.int32) - gt_score = np.zeros((num_bbox, 1), dtype=np.float32) - difficult = np.zeros((num_bbox, 1), dtype=np.int32) - for obj in objs: - cname = obj.find('name').text - - # user dataset may not contain difficult field - _difficult = obj.find('difficult') - _difficult = int( - _difficult.text) if _difficult is not None else 0 - - x1 = float(obj.find('bndbox').find('xmin').text) - y1 = 
float(obj.find('bndbox').find('ymin').text) - x2 = float(obj.find('bndbox').find('xmax').text) - y2 = float(obj.find('bndbox').find('ymax').text) - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(im_w - 1, x2) - y2 = min(im_h - 1, y2) - if x2 > x1 and y2 > y1: - gt_bbox[i, :] = [x1, y1, x2, y2] - gt_class[i, 0] = cname2cid[cname] - gt_score[i, 0] = 1. - difficult[i, 0] = _difficult - i += 1 - else: - logger.warning( - 'Found an invalid bbox in annotations: xml_file: {}' - ', x1: {}, y1: {}, x2: {}, y2: {}.'.format( - xml_file, x1, y1, x2, y2)) - gt_bbox = gt_bbox[:i, :] - gt_class = gt_class[:i, :] - gt_score = gt_score[:i, :] - difficult = difficult[:i, :] - - voc_rec = { - 'im_file': img_file, - 'im_id': im_id, - 'h': im_h, - 'w': im_w - } if 'image' in self.data_fields else {} - - gt_rec = { - 'gt_class': gt_class, - 'gt_score': gt_score, - 'gt_bbox': gt_bbox, - 'difficult': difficult - } - for k, v in gt_rec.items(): - if k in self.data_fields: - voc_rec[k] = v - - if len(objs) == 0: - empty_records.append(voc_rec) - else: - records.append(voc_rec) - - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any voc record in %s' % (self.anno_path) - logger.debug('{} samples in file {}'.format(ct, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs, self.cname2cid = records, cname2cid - - def get_label_list(self): - return os.path.join(self.dataset_dir, self.label_list) - - -def pascalvoc_label(): - labels_map = { - 'aeroplane': 0, - 'bicycle': 1, - 'bird': 2, - 'boat': 3, - 'bottle': 4, - 'bus': 5, - 'car': 6, - 'cat': 7, - 'chair': 8, - 'cow': 9, - 'diningtable': 10, - 'dog': 11, - 'horse': 12, - 'motorbike': 13, - 'person': 14, - 'pottedplant': 15, - 'sheep': 16, - 'sofa': 17, - 'train': 18, - 'tvmonitor': 19 - } - return labels_map diff --git a/pdfdet/models/Paddle/ppdet/data/source/widerface.py b/pdfdet/models/Paddle/ppdet/data/source/widerface.py deleted file mode 100644 index a17c2aa..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/widerface.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np - -from ppdet.core.workspace import register, serializable -from .dataset import DetDataset - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class WIDERFaceDataSet(DetDataset): - """ - Load WiderFace records with 'anno_path' - - Args: - dataset_dir (str): root directory for dataset. - image_dir (str): directory for images. - anno_path (str): WiderFace annotation data. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - with_lmk (bool): whether to load face landmark keypoint labels. 
- """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - with_lmk=False): - super(WIDERFaceDataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - with_lmk=with_lmk) - self.anno_path = anno_path - self.sample_num = sample_num - self.roidbs = None - self.cname2cid = None - self.with_lmk = with_lmk - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - txt_file = anno_path - - records = [] - ct = 0 - file_lists = self._load_file_list(txt_file) - cname2cid = widerface_label() - - for item in file_lists: - im_fname = item[0] - im_id = np.array([ct]) - gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32) - gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32) - gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32) - lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32) - for index_box in range(len(item)): - if index_box < 1: - continue - gt_bbox[index_box - 1] = item[index_box][0] - if self.with_lmk: - gt_lmk_labels[index_box - 1] = item[index_box][1] - lmk_ignore_flag[index_box - 1] = item[index_box][2] - im_fname = os.path.join(image_dir, - im_fname) if image_dir else im_fname - widerface_rec = { - 'im_file': im_fname, - 'im_id': im_id, - } if 'image' in self.data_fields else {} - gt_rec = { - 'gt_bbox': gt_bbox, - 'gt_class': gt_class, - } - for k, v in gt_rec.items(): - if k in self.data_fields: - widerface_rec[k] = v - if self.with_lmk: - widerface_rec['gt_keypoint'] = gt_lmk_labels - widerface_rec['keypoint_ignore'] = lmk_ignore_flag - - if len(item) != 0: - records.append(widerface_rec) - - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert len(records) > 0, 'not found any widerface in %s' % (anno_path) - logger.debug('{} samples in file {}'.format(ct, anno_path)) - self.roidbs, self.cname2cid = records, cname2cid - - def _load_file_list(self, input_txt): - with open(input_txt, 'r') as f_dir: - lines_input_txt = f_dir.readlines() - - file_dict = {} - num_class = 0 - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for i in range(len(lines_input_txt)): - line_txt = lines_input_txt[i].strip('\n\t\r') - split_str = line_txt.split(' ') - if len(split_str) == 1: - img_file_name = os.path.split(split_str[0])[1] - split_txt = img_file_name.split('.') - if len(split_txt) < 2: - continue - elif split_txt[-1] in exts: - if i != 0: - num_class += 1 - file_dict[num_class] = [line_txt] - else: - if len(line_txt) <= 6: - continue - result_boxs = [] - xmin = float(split_str[0]) - ymin = float(split_str[1]) - w = float(split_str[2]) - h = float(split_str[3]) - # Filter out wrong labels - if w < 0 or h < 0: - logger.warning('Illegal box with w: {}, h: {} in ' - 'img: {}, and it will be ignored'.format( - w, h, file_dict[num_class][0])) - continue - xmin = max(0, xmin) - ymin = max(0, ymin) - xmax = xmin + w - ymax = ymin + h - gt_bbox = [xmin, ymin, xmax, ymax] - result_boxs.append(gt_bbox) - if self.with_lmk: - assert len(split_str) > 18, 'When `with_lmk=True`, the number' \ - 'of characters per line in the annotation file should' \ - 'exceed 18.' 
- lmk0_x = float(split_str[5]) - lmk0_y = float(split_str[6]) - lmk1_x = float(split_str[8]) - lmk1_y = float(split_str[9]) - lmk2_x = float(split_str[11]) - lmk2_y = float(split_str[12]) - lmk3_x = float(split_str[14]) - lmk3_y = float(split_str[15]) - lmk4_x = float(split_str[17]) - lmk4_y = float(split_str[18]) - lmk_ignore_flag = 0 if lmk0_x == -1 else 1 - gt_lmk_label = [ - lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y, lmk3_x, - lmk3_y, lmk4_x, lmk4_y - ] - result_boxs.append(gt_lmk_label) - result_boxs.append(lmk_ignore_flag) - file_dict[num_class].append(result_boxs) - - return list(file_dict.values()) - - -def widerface_label(): - labels_map = {'face': 0} - return labels_map diff --git a/pdfdet/models/Paddle/ppdet/data/transform/__init__.py b/pdfdet/models/Paddle/ppdet/data/transform/__init__.py deleted file mode 100644 index d45cf47..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# from . import operators -# from . import batch_operators -# from . import keypoint_operators -# from . import mot_operators -# from . import rotated_operators -# from . import keypoints_3d_operators -# from . import culane_operators - -from .operators import Decode,Resize,NormalizeImage,Permute -from .batch_operators import PadBatch -# from .keypoint_operators import * -# from .mot_operators import * -# from .rotated_operators import * -# from .keypoints_3d_operators import * -# from .culane_operators import * - -# __all__ = [] -# __all__ += registered_ops -# __all__ += keypoint_operators.__all__ -# __all__ += mot_operators.__all__ -# __all__ += culane_operators.__all__ diff --git a/pdfdet/models/Paddle/ppdet/data/transform/atss_assigner.py b/pdfdet/models/Paddle/ppdet/data/transform/atss_assigner.py deleted file mode 100644 index 686b140..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/atss_assigner.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
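The next removed file, atss_assigner.py, builds ATSS label assignment on top of a NumPy bbox_overlaps helper. For orientation, here is a minimal pairwise-IoU sketch over corner-format [x1, y1, x2, y2] boxes; this is an independent illustration, not the deleted implementation, which additionally supports batched inputs and the iof/giou/diou modes:

import numpy as np

def iou_matrix(boxes_a, boxes_b, eps=1e-6):
    """Pairwise IoU between (m, 4) and (n, 4) corner-format box arrays."""
    lt = np.maximum(boxes_a[:, None, :2], boxes_b[None, :, :2])  # (m, n, 2) top-left of intersection
    rb = np.minimum(boxes_a[:, None, 2:], boxes_b[None, :, 2:])  # (m, n, 2) bottom-right of intersection
    wh = np.clip(rb - lt, 0, None)                               # clamp non-overlapping pairs to zero
    inter = wh[..., 0] * wh[..., 1]
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / np.maximum(union, eps)

# e.g. iou_matrix(np.array([[0., 0., 10., 10.]]), np.array([[5., 5., 15., 15.]]))
# -> [[25 / 175]] ≈ [[0.1429]]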
- -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): - """Calculate overlap between two set of bboxes. - If ``is_aligned `` is ``False``, then calculate the overlaps between each - bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned - pair of bboxes1 and bboxes2. - Args: - bboxes1 (Tensor): shape (B, m, 4) in format or empty. - bboxes2 (Tensor): shape (B, n, 4) in format or empty. - B indicates the batch dim, in shape (B1, B2, ..., Bn). - If ``is_aligned `` is ``True``, then m and n must be equal. - mode (str): "iou" (intersection over union) or "iof" (intersection over - foreground). - is_aligned (bool, optional): If True, then m and n must be equal. - Default False. - eps (float, optional): A value added to the denominator for numerical - stability. Default 1e-6. - Returns: - Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) - """ - assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format( - mode) - # Either the boxes are empty or the length of boxes's last dimenstion is 4 - assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0) - assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0) - - # Batch dim must be the same - # Batch dim: (B1, B2, ... Bn) - assert bboxes1.shape[:-2] == bboxes2.shape[:-2] - batch_shape = bboxes1.shape[:-2] - - rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 - cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 - if is_aligned: - assert rows == cols - - if rows * cols == 0: - if is_aligned: - return np.random.random(batch_shape + (rows, )) - else: - return np.random.random(batch_shape + (rows, cols)) - - area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( - bboxes1[..., 3] - bboxes1[..., 1]) - area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( - bboxes2[..., 3] - bboxes2[..., 1]) - - if is_aligned: - lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] - rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] - - wh = (rb - lt).clip(min=0) # [B, rows, 2] - overlap = wh[..., 0] * wh[..., 1] - - if mode in ['iou', 'giou']: - union = area1 + area2 - overlap - else: - union = area1 - if mode == 'giou': - enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) - enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) - if mode == 'diou': - enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) - enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) - b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1] - b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3] - b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1] - b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3] - else: - lt = np.maximum(bboxes1[..., :, None, :2], - bboxes2[..., None, :, :2]) # [B, rows, cols, 2] - rb = np.minimum(bboxes1[..., :, None, 2:], - bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] - - wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] - overlap = wh[..., 0] * wh[..., 1] - - if mode in ['iou', 'giou']: - union = area1[..., None] + area2[..., None, :] - overlap - else: - union = area1[..., None] - if mode == 'giou': - enclosed_lt = np.minimum(bboxes1[..., :, None, :2], - bboxes2[..., None, :, :2]) - enclosed_rb = np.maximum(bboxes1[..., :, 
None, 2:], - bboxes2[..., None, :, 2:]) - if mode == 'diou': - enclosed_lt = np.minimum(bboxes1[..., :, None, :2], - bboxes2[..., None, :, :2]) - enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], - bboxes2[..., None, :, 2:]) - b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1] - b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3] - b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1] - b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3] - - eps = np.array([eps]) - union = np.maximum(union, eps) - ious = overlap / union - if mode in ['iou', 'iof']: - return ious - # calculate gious - if mode in ['giou']: - enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) - enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] - enclose_area = np.maximum(enclose_area, eps) - gious = ious - (enclose_area - union) / enclose_area - return gious - if mode in ['diou']: - left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 - right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 - rho2 = left + right - enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) - enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2 - enclose_c = np.maximum(enclose_c, eps) - dious = ious - rho2 / enclose_c - return dious - - -def topk_(input, k, axis=1, largest=True): - x = -input if largest else input - if axis == 0: - row_index = np.arange(input.shape[1 - axis]) - if k == x.shape[0]: # argpartition requires index < len(input) - topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :] - else: - topk_index = np.argpartition(x, k, axis=axis)[0:k, :] - - topk_data = x[topk_index, row_index] - - topk_index_sort = np.argsort(topk_data, axis=axis) - topk_data_sort = topk_data[topk_index_sort, row_index] - topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index] - else: - column_index = np.arange(x.shape[1 - axis])[:, None] - topk_index = np.argpartition(x, k, axis=axis)[:, 0:k] - topk_data = x[column_index, topk_index] - topk_data = -topk_data if largest else topk_data - topk_index_sort = np.argsort(topk_data, axis=axis) - topk_data_sort = topk_data[column_index, topk_index_sort] - topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort] - - return topk_data_sort, topk_index_sort - - -class ATSSAssigner(object): - """Assign a corresponding gt bbox or background to each bbox. - - Each proposals will be assigned with `0` or a positive integer - indicating the ground truth index. - - - 0: negative sample, no assigned gt - - positive integer: positive sample, index (1-based) of assigned gt - - Args: - topk (float): number of bbox selected in each level - """ - - def __init__(self, topk=9): - self.topk = topk - - def __call__(self, - bboxes, - num_level_bboxes, - gt_bboxes, - gt_bboxes_ignore=None, - gt_labels=None): - """Assign gt to bboxes. - The assignment is done in following steps - 1. compute iou between all bbox (bbox of all pyramid levels) and gt - 2. compute center distance between all bbox and gt - 3. on each pyramid level, for each gt, select k bbox whose center - are closest to the gt center, so we total select k*l bbox as - candidates for each gt - 4. get corresponding iou for the these candidates, and compute the - mean and std, set mean + std as the iou threshold - 5. select these candidates whose iou are greater than or equal to - the threshold as postive - 6. limit the positive sample's center in gt - Args: - bboxes (np.array): Bounding boxes to be assigned, shape(n, 4). 
- num_level_bboxes (List): num of bboxes in each level - gt_bboxes (np.array): Groundtruth boxes, shape (k, 4). - gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are - labelled as `ignored`, e.g., crowd boxes in COCO. - gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ). - """ - bboxes = bboxes[:, :4] - num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] - - # assign 0 by default - assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) - - if num_gt == 0 or num_bboxes == 0: - # No ground truth or boxes, return empty assignment - max_overlaps = np.zeros((num_bboxes, )) - if num_gt == 0: - # No truth, assign everything to background - assigned_gt_inds[:] = 0 - if not np.any(gt_labels): - assigned_labels = None - else: - assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) - return assigned_gt_inds, max_overlaps - - # compute iou between all bbox and gt - overlaps = bbox_overlaps(bboxes, gt_bboxes) - # compute center distance between all bbox and gt - gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 - gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 - gt_points = np.stack((gt_cx, gt_cy), axis=1) - - bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 - bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 - bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) - - distances = np.sqrt( - np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) - .sum(-1)) - - # Selecting candidates based on the center distance - candidate_idxs = [] - start_idx = 0 - for bboxes_per_level in num_level_bboxes: - # on each pyramid level, for each gt, - # select k bbox whose center are closest to the gt center - end_idx = start_idx + bboxes_per_level - distances_per_level = distances[start_idx:end_idx, :] - selectable_k = min(self.topk, bboxes_per_level) - _, topk_idxs_per_level = topk_( - distances_per_level, selectable_k, axis=0, largest=False) - candidate_idxs.append(topk_idxs_per_level + start_idx) - start_idx = end_idx - candidate_idxs = np.concatenate(candidate_idxs, axis=0) - - # get corresponding iou for the these candidates, and compute the - # mean and std, set mean + std as the iou threshold - candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)] - overlaps_mean_per_gt = candidate_overlaps.mean(0) - overlaps_std_per_gt = candidate_overlaps.std(0) - overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt - - is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] - - # limit the positive sample's center in gt - for gt_idx in range(num_gt): - candidate_idxs[:, gt_idx] += gt_idx * num_bboxes - ep_bboxes_cx = np.broadcast_to( - bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) - ep_bboxes_cy = np.broadcast_to( - bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) - candidate_idxs = candidate_idxs.reshape(-1) - - # calculate the left, top, right, bottom distance between positive - # bbox center and gt side - l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0] - t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1] - r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01 - is_pos = is_pos & is_in_gts - - # if an anchor box is assigned to multiple gts, - # the one with the highest IoU will be selected. 
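# review note: the block below implements that rule by scattering the
# candidates' IoUs into a flat (num_gt * num_bboxes,) buffer initialised to
# -inf, then reshaping back to (num_bboxes, num_gt): argmax over the gt axis
# picks, for each anchor, the positive gt with the highest IoU, while anchors
# whose row is all -inf remain background (assigned_gt_inds stays 0).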
- overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) - index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] - overlaps_inf[index] = overlaps.T.reshape(-1)[index] - overlaps_inf = overlaps_inf.reshape(num_gt, -1).T - - max_overlaps = overlaps_inf.max(axis=1) - argmax_overlaps = overlaps_inf.argmax(axis=1) - assigned_gt_inds[max_overlaps != - -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 - - return assigned_gt_inds, max_overlaps - - def get_vlr_region(self, - bboxes, - num_level_bboxes, - gt_bboxes, - gt_bboxes_ignore=None, - gt_labels=None): - """get vlr region for ld distillation. - Args: - bboxes (np.array): Bounding boxes to be assigned, shape(n, 4). - num_level_bboxes (List): num of bboxes in each level - gt_bboxes (np.array): Groundtruth boxes, shape (k, 4). - gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are - labelled as `ignored`, e.g., crowd boxes in COCO. - gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ). - """ - bboxes = bboxes[:, :4] - - num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] - - # compute iou between all bbox and gt - overlaps = bbox_overlaps(bboxes, gt_bboxes) - - # compute diou between all bbox and gt - diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou') - - # assign 0 by default - assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) - - vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32) - - if num_gt == 0 or num_bboxes == 0: - # No ground truth or boxes, return empty assignment - max_overlaps = np.zeros((num_bboxes, )) - if num_gt == 0: - # No truth, assign everything to background - assigned_gt_inds[:] = 0 - if not np.any(gt_labels): - assigned_labels = None - else: - assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) - return assigned_gt_inds, max_overlaps - - # compute center distance between all bbox and gt - gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 - gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 - gt_points = np.stack((gt_cx, gt_cy), axis=1) - - bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 - bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 - bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) - - distances = np.sqrt( - np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) - .sum(-1)) - - # Selecting candidates based on the center distance - candidate_idxs = [] - candidate_idxs_t = [] - start_idx = 0 - for bboxes_per_level in num_level_bboxes: - # on each pyramid level, for each gt, - # select k bbox whose center are closest to the gt center - end_idx = start_idx + bboxes_per_level - distances_per_level = distances[start_idx:end_idx, :] - selectable_t = min(self.topk, bboxes_per_level) - selectable_k = bboxes_per_level #k for all - _, topt_idxs_per_level = topk_( - distances_per_level, selectable_t, axis=0, largest=False) - _, topk_idxs_per_level = topk_( - distances_per_level, selectable_k, axis=0, largest=False) - candidate_idxs_t.append(topt_idxs_per_level + start_idx) - candidate_idxs.append(topk_idxs_per_level + start_idx) - start_idx = end_idx - - candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0) - candidate_idxs = np.concatenate(candidate_idxs, axis=0) - - # get corresponding iou for the these candidates, and compute the - # mean and std, set mean + std as the iou threshold - candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)] - - # compute tdiou - t_diou = diou[candidate_idxs, np.arange(num_gt)] - - overlaps_mean_per_gt = candidate_overlaps_t.mean(0) - overlaps_std_per_gt = candidate_overlaps_t.std( - 0, 
ddof=1) # NOTE: use Bessel correction - overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt - - # compute region - is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & ( - t_diou >= 0.25 * overlaps_thr_per_gt[None, :]) - - # limit the positive sample's center in gt - for gt_idx in range(num_gt): - candidate_idxs[:, gt_idx] += gt_idx * num_bboxes - - candidate_idxs = candidate_idxs.reshape(-1) - - # if an anchor box is assigned to multiple gts, - # the one with the highest IoU will be selected. - overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) - index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] - - overlaps_inf[index] = overlaps.T.reshape(-1)[index] - overlaps_inf = overlaps_inf.reshape(num_gt, -1).T - - max_overlaps = overlaps_inf.max(axis=1) - argmax_overlaps = overlaps_inf.argmax(axis=1) - - overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) - overlaps_inf = overlaps_inf.reshape(num_gt, -1).T - - assigned_gt_inds[max_overlaps != - -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 - - vlr_region_iou[max_overlaps != - -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0 - - return vlr_region_iou diff --git a/pdfdet/models/Paddle/ppdet/data/transform/autoaugment_utils.py b/pdfdet/models/Paddle/ppdet/data/transform/autoaugment_utils.py deleted file mode 100644 index cfa89d3..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/autoaugment_utils.py +++ /dev/null @@ -1,1586 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Reference: -# https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py -"""AutoAugment util file.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import inspect -import math -from PIL import Image, ImageEnhance -import numpy as np -import cv2 -from copy import deepcopy - -# This signifies the max integer that the controller RNN could predict for the -# augmentation scheme. -_MAX_LEVEL = 10. - -# Represents an invalid bounding box that is used for checking for padding -# lists of bounding box coordinates for a few augmentation operations -_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] - - -def policy_v0(): - """Autoaugment policy that was used in AutoAugment Detection Paper.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. 
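# review note: concretely, a tuple such as ('TranslateX_BBox', 0.6, 4) means
# "apply TranslateX (with boxes adjusted accordingly) with probability 0.6 at
# magnitude 4 on the 0-10 scale (_MAX_LEVEL)"; the tuples of one sub-policy
# are applied in sequence to the same image.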
- policy = [ - [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], - [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], - [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], - [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], - ] - return policy - - -def policy_v1(): - """Autoaugment policy that was used in AutoAugment Detection Paper.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. - policy = [ - [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], - [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], - [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], - [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], - [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)], - [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)], - [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)], - [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)], - [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)], # , - [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)], - [('Color', 1.0, 6), ('Equalize', 1.0, 2)], - [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)], - [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)], - [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)], - [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)], - [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)], - [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)], - [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)], - [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)], - ] - return policy - - -def policy_vtest(): - """Autoaugment test policy for debugging.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. - policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ] - return policy - - -def policy_v2(): - """Additional policy that performs well on object detection.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. 
- policy = [ - [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], - [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2), - ('Rotate_BBox', 0.8, 10)], - [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)], - [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8), - ('Brightness', 0.0, 10)], - [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10), - ('AutoContrast', 0.6, 0)], - [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)], - [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8), - ('Solarize', 0.0, 10)], - [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8), - ('Rotate_BBox', 0.8, 8)], - [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)], - [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6), - ('Rotate_BBox', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)], - [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6), - ('ShearY_BBox', 0.6, 8)], - [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2), - ('Brightness', 0.2, 2)], - [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6), - ('SolarizeAdd', 0.2, 10)], - [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)], - ] - return policy - - -def policy_v3(): - """"Additional policy that performs well on object detection.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. - policy = [ - [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)], - [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)], - [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)], - [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)], - [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)], - [('Sharpness', 0.0, 2), ('Color', 0.4, 8)], - [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)], - [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)], - [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)], - [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)], - [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)], - [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)], - [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)], - [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)], - [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)], - ] - return policy - - -def _equal(val1, val2, eps=1e-8): - return abs(val1 - val2) <= eps - - -def blend(image1, image2, factor): - """Blend image1 and image2 using 'factor'. - - Factor can be above 0.0. A value of 0.0 means only image1 is used. - A value of 1.0 means only image2 is used. A value between 0.0 and - 1.0 means we linearly interpolate the pixel values between the two - images. A value greater than 1.0 "extrapolates" the difference - between the two pixel values, and we clip the results to values - between 0 and 255. - - Args: - image1: An image Tensor of type uint8. - image2: An image Tensor of type uint8. - factor: A floating point value above 0.0. - - Returns: - A blended image Tensor of type uint8. - """ - if factor == 0.0: - return image1 - if factor == 1.0: - return image2 - - image1 = image1.astype(np.float32) - image2 = image2.astype(np.float32) - - difference = image2 - image1 - scaled = factor * difference - - # Do addition in float. - temp = image1 + scaled - - # Interpolate - if factor > 0.0 and factor < 1.0: - # Interpolation means we always stay within 0 and 255. - return temp.astype(np.uint8) - - # Extrapolate: - # - # We need to clip and then cast. 
- return np.clip(temp, a_min=0, a_max=255).astype(np.uint8) - - -def cutout(image, pad_size, replace=0): - """Apply cutout (https://arxiv.org/abs/1708.04552) to image. - - This operation applies a (2*pad_size x 2*pad_size) mask of zeros to - a random location within `img`. The pixel values filled in will be of the - value `replace`. The located where the mask will be applied is randomly - chosen uniformly over the whole image. - - Args: - image: An image Tensor of type uint8. - pad_size: Specifies how big the zero mask that will be generated is that - is applied to the image. The mask will be of size - (2*pad_size x 2*pad_size). - replace: What pixel value to fill in the image in the area that has - the cutout mask applied to it. - - Returns: - An image Tensor that is of type uint8. - Example: - img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB ) - new_img = cutout(img, pad_size=50, replace=0) - """ - image_height, image_width = image.shape[0], image.shape[1] - - cutout_center_height = np.random.randint(low=0, high=image_height) - cutout_center_width = np.random.randint(low=0, high=image_width) - - lower_pad = np.maximum(0, cutout_center_height - pad_size) - upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) - left_pad = np.maximum(0, cutout_center_width - pad_size) - right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) - - cutout_shape = [ - image_height - (lower_pad + upper_pad), - image_width - (left_pad + right_pad) - ] - padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] - mask = np.pad(np.zeros( - cutout_shape, dtype=image.dtype), - padding_dims, - 'constant', - constant_values=1) - mask = np.expand_dims(mask, -1) - mask = np.tile(mask, [1, 1, 3]) - image = np.where( - np.equal(mask, 0), - np.ones_like( - image, dtype=image.dtype) * replace, - image) - return image.astype(np.uint8) - - -def solarize(image, threshold=128): - # For each pixel in the image, select the pixel - # if the value is less than the threshold. - # Otherwise, subtract 255 from the pixel. - return np.where(image < threshold, image, 255 - image) - - -def solarize_add(image, addition=0, threshold=128): - # For each pixel in the image less than threshold - # we add 'addition' amount to it and then clip the - # pixel value to be between 0 and 255. The value - # of 'addition' is between -128 and 128. - added_image = image.astype(np.int64) + addition - added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8) - return np.where(image < threshold, added_image, image) - - -def color(image, factor): - """use cv2 to deal""" - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) - return blend(degenerate, image, factor) - - -# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197 -def contrast(img, factor): - img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor) - return np.array(img) - - -def brightness(image, factor): - """Equivalent of PIL Brightness.""" - degenerate = np.zeros_like(image) - return blend(degenerate, image, factor) - - -def posterize(image, bits): - """Equivalent of PIL Posterize.""" - shift = 8 - bits - return np.left_shift(np.right_shift(image, shift), shift) - - -def rotate(image, degrees, replace): - """Rotates the image by degrees either clockwise or counterclockwise. - - Args: - image: An image Tensor of type uint8. 
- degrees: Float, a scalar angle in degrees to rotate all images by. If - degrees is positive the image will be rotated clockwise otherwise it will - be rotated counterclockwise. - replace: A one or three value 1D tensor to fill empty pixels caused by - the rotate operation. - - Returns: - The rotated version of image. - """ - image = wrap(image) - image = Image.fromarray(image) - image = image.rotate(degrees) - image = np.array(image, dtype=np.uint8) - return unwrap(image, replace) - - -def random_shift_bbox(image, - bbox, - pixel_scaling, - replace, - new_min_bbox_coords=None): - """Move the bbox and the image content to a slightly new random location. - - Args: - image: 3D uint8 Tensor. - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - The potential values for the new min corner of the bbox will be between - [old_min - pixel_scaling * bbox_height/2, - old_min - pixel_scaling * bbox_height/2]. - pixel_scaling: A float between 0 and 1 that specifies the pixel range - that the new bbox location will be sampled from. - replace: A one or three value 1D tensor to fill empty pixels. - new_min_bbox_coords: If not None, then this is a tuple that specifies the - (min_y, min_x) coordinates of the new bbox. Normally this is randomly - specified, but this allows it to be manually set. The coordinates are - the absolute coordinates between 0 and image height/width and are int32. - - Returns: - The new image that will have the shifted bbox location in it along with - the new bbox that contains the new coordinates. - """ - # Obtains image height and width and create helper clip functions. - image_height, image_width = image.shape[0], image.shape[1] - image_height = float(image_height) - image_width = float(image_width) - - def clip_y(val): - return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32) - - def clip_x(val): - return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32) - - # Convert bbox to pixel coordinates. - min_y = int(image_height * bbox[0]) - min_x = int(image_width * bbox[1]) - max_y = clip_y(image_height * bbox[2]) - max_x = clip_x(image_width * bbox[3]) - - bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1) - image_height = int(image_height) - image_width = int(image_width) - - # Select the new min/max bbox ranges that are used for sampling the - # new min x/y coordinates of the shifted bbox. - minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) / - 2.0)) - maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) / - 2.0)) - minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0)) - maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0)) - - # Sample and calculate the new unclipped min/max coordinates of the new bbox. - if new_min_bbox_coords is None: - unclipped_new_min_y = np.random.randint( - low=minval_y, high=maxval_y, dtype=np.int32) - unclipped_new_min_x = np.random.randint( - low=minval_x, high=maxval_x, dtype=np.int32) - else: - unclipped_new_min_y, unclipped_new_min_x = ( - clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1])) - unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1 - unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1 - - # Determine if any of the new bbox was shifted outside the current image. - # This is used for determining if any of the original bbox content should be - # discarded. 
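# review note: the arithmetic below maps the clipped destination box back
# onto the source bbox: if clipping trimmed d pixels off one edge of the new
# location, the same d pixels are trimmed from the matching edge of the source
# region (shifted_min_*/shifted_max_*), so only content that survives the
# shift gets copied.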
- new_min_y, new_min_x, new_max_y, new_max_x = ( - clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x), - clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x)) - shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y - shifted_max_y = max_y - (unclipped_new_max_y - new_max_y) - shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x - shifted_max_x = max_x - (unclipped_new_max_x - new_max_x) - - # Create the new bbox tensor by converting pixel integer values to floats. - new_bbox = np.stack([ - float(new_min_y) / float(image_height), float(new_min_x) / - float(image_width), float(new_max_y) / float(image_height), - float(new_max_x) / float(image_width) - ]) - - # Copy the contents in the bbox and fill the old bbox location - # with gray (128). - bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x: - shifted_max_x + 1, :] - - def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor, - image_): - """Applies mask to bbox region in image then adds content_tensor to it.""" - mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_], - [min_x_, (image_width - 1) - max_x_], [0, 0]], - 'constant', - constant_values=1) - - content_tensor = np.pad(content_tensor, - [[min_y_, (image_height - 1) - max_y_], - [min_x_, (image_width - 1) - max_x_], [0, 0]], - 'constant', - constant_values=0) - return image_ * mask + content_tensor - - # Zero out original bbox location. - mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :] - grey_tensor = np.zeros_like(mask) + replace[0] - image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor, - image) - - # Fill in bbox content to new bbox location. - mask = np.zeros_like(bbox_content) - image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask, - bbox_content, image) - - return image.astype(np.uint8), new_bbox - - -def _clip_bbox(min_y, min_x, max_y, max_x): - """Clip bounding box coordinates between 0 and 1. - - Args: - min_y: Normalized bbox coordinate of type float between 0 and 1. - min_x: Normalized bbox coordinate of type float between 0 and 1. - max_y: Normalized bbox coordinate of type float between 0 and 1. - max_x: Normalized bbox coordinate of type float between 0 and 1. - - Returns: - Clipped coordinate values between 0 and 1. - """ - min_y = np.clip(min_y, a_min=0, a_max=1.0) - min_x = np.clip(min_x, a_min=0, a_max=1.0) - max_y = np.clip(max_y, a_min=0, a_max=1.0) - max_x = np.clip(max_x, a_min=0, a_max=1.0) - return min_y, min_x, max_y, max_x - - -def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): - """Adjusts bbox coordinates to make sure the area is > 0. - - Args: - min_y: Normalized bbox coordinate of type float between 0 and 1. - min_x: Normalized bbox coordinate of type float between 0 and 1. - max_y: Normalized bbox coordinate of type float between 0 and 1. - max_x: Normalized bbox coordinate of type float between 0 and 1. - delta: Float, this is used to create a gap of size 2 * delta between - bbox min/max coordinates that are the same on the boundary. - This prevents the bbox from having an area of zero. - - Returns: - Tuple of new bbox coordinates between 0 and 1 that will now have a - guaranteed area > 0. - """ - height = max_y - min_y - width = max_x - min_x - - def _adjust_bbox_boundaries(min_coord, max_coord): - # Make sure max is never 0 and min is never 1. 
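# review note: this only separates coordinates that collapsed on the 0/1
# boundary (min == max == 0 pushes max up to delta; min == max == 1 pushes
# min down to 1 - delta); a zero-area box in the interior, e.g.
# min == max == 0.5, passes through unchanged.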
-    max_coord = np.maximum(max_coord, 0.0 + delta)
-    min_coord = np.minimum(min_coord, 1.0 - delta)
-    return min_coord, max_coord
-
-  if _equal(height, 0):
-    min_y, max_y = _adjust_bbox_boundaries(min_y, max_y)
-
-  if _equal(width, 0):
-    min_x, max_x = _adjust_bbox_boundaries(min_x, max_x)
-
-  return min_y, min_x, max_y, max_x
-
-
-def _scale_bbox_only_op_probability(prob):
-  """Reduce the probability of the bbox-only operation.
-
-  Probability is reduced so that we do not distort the content of too many
-  bounding boxes that are close to each other. The value of 3.0 was a
-  hyperparameter chosen when designing the autoaugment algorithm that we
-  found empirically to work well.
-
-  Args:
-    prob: Float that is the probability of applying the bbox-only operation.
-
-  Returns:
-    Reduced probability.
-  """
-  return prob / 3.0
-
-
-def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
-  """Applies augmentation_func to the subsection of image indicated by bbox.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    augmentation_func: Augmentation function that will be applied to the
-      subsection of image.
-    *args: Additional parameters that will be passed into augmentation_func
-      when it is called.
-
-  Returns:
-    A modified version of image, where the bbox location in the image will
-    have `augmentation_func` applied to it.
-  """
-  image_height = image.shape[0]
-  image_width = image.shape[1]
-
-  min_y = int(image_height * bbox[0])
-  min_x = int(image_width * bbox[1])
-  max_y = int(image_height * bbox[2])
-  max_x = int(image_width * bbox[3])
-
-  # Clip to be sure the max values do not fall out of range.
-  max_y = np.minimum(max_y, image_height - 1)
-  max_x = np.minimum(max_x, image_width - 1)
-
-  # Get the sub-tensor that is the image within the bounding box region.
-  bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]
-
-  # Apply the augmentation function to the bbox portion of the image.
-  augmented_bbox_content = augmentation_func(bbox_content, *args)
-
-  # Pad the augmented_bbox_content and the mask to match the shape of original
-  # image.
-  augmented_bbox_content = np.pad(
-      augmented_bbox_content, [[min_y, (image_height - 1) - max_y],
-                               [min_x, (image_width - 1) - max_x], [0, 0]],
-      'constant',
-      constant_values=1)
-
-  # Create a mask that will be used to zero out a part of the original image.
-  mask_tensor = np.zeros_like(bbox_content)
-
-  mask_tensor = np.pad(mask_tensor,
-                       [[min_y, (image_height - 1) - max_y],
-                        [min_x, (image_width - 1) - max_x], [0, 0]],
-                       'constant',
-                       constant_values=1)
-  # Replace the old bbox content with the new augmented content.
-  image = image * mask_tensor + augmented_bbox_content
-  return image.astype(np.uint8)
-
-
-def _concat_bbox(bbox, bboxes):
-  """Helper function that concatenates bbox to bboxes along the first dimension."""
-
-  # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
-  # we discard bboxes and start the bboxes Tensor with the current bbox.
-  bboxes_sum_check = np.sum(bboxes)
-  bbox = np.expand_dims(bbox, 0)
-  # This check will be true when it is an _INVALID_BOX
-  if _equal(bboxes_sum_check, -4):
-    bboxes = bbox
-  else:
-    bboxes = np.concatenate([bboxes, bbox], 0)
-  return bboxes
-
-
-def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
-                                     augmentation_func, func_changes_bbox,
-                                     *args):
-  """Applies _apply_bbox_augmentation with probability prob.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
-      have been altered by aug_func. These will only be changed when
-      func_changes_bbox is set to true. Each bbox has 4 elements
-      (min_y, min_x, max_y, max_x) of type float that are the normalized
-      bbox coordinates between 0 and 1.
-    prob: Float that is the probability of applying _apply_bbox_augmentation.
-    augmentation_func: Augmentation function that will be applied to the
-      subsection of image.
-    func_changes_bbox: Boolean. Whether augmentation_func returns the bbox in
-      addition to the image.
-    *args: Additional parameters that will be passed into augmentation_func
-      when it is called.
-
-  Returns:
-    A tuple. First element is a modified version of image, where the bbox
-    location in the image will have augmentation_func applied to it if it is
-    chosen to be called with probability `prob`. The second element is a
-    Tensor of Tensors of length 4 that will contain the altered bbox after
-    applying augmentation_func.
-  """
-  should_apply_op = (np.random.rand() + prob >= 1)
-  if func_changes_bbox:
-    if should_apply_op:
-      augmented_image, bbox = augmentation_func(image, bbox, *args)
-    else:
-      augmented_image, bbox = (image, bbox)
-  else:
-    if should_apply_op:
-      augmented_image = _apply_bbox_augmentation(image, bbox,
-                                                 augmentation_func, *args)
-    else:
-      augmented_image = image
-  new_bboxes = _concat_bbox(bbox, new_bboxes)
-  return augmented_image.astype(np.uint8), new_bboxes
-
-
-def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
-                                   func_changes_bbox, *args):
-  """Applies aug_func to the image for each bbox in bboxes.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float.
-    prob: Float that is the probability of applying aug_func to a specific
-      bounding box within the image.
-    aug_func: Augmentation function that will be applied to the
-      subsections of image indicated by the bbox values in bboxes.
-    func_changes_bbox: Boolean. Whether aug_func returns the bbox in addition
-      to the image.
-    *args: Additional parameters that will be passed into augmentation_func
-      when it is called.
-
-  Returns:
-    A modified version of image, where each bbox location in the image will
-    have augmentation_func applied to it if it is chosen to be called with
-    probability prob independently across all bboxes. Also the final
-    bboxes are returned that will be unchanged if func_changes_bbox is set to
-    false and if true, the new altered ones will be returned.
-  """
-  # Will keep track of the new altered bboxes after aug_func is repeatedly
-  # applied. The -1 values are a dummy value and this first Tensor will be
-  # removed upon appending the first real bbox.
-  new_bboxes = np.array(_INVALID_BOX)
-
-  # If the bboxes are empty, then just give it _INVALID_BOX. The result
-  # will be thrown away.
-  bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes
-
-  assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!"
-
-  # pylint:disable=g-long-lambda
-  # pylint:disable=line-too-long
-  wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args)
-  # pylint:enable=g-long-lambda
-  # pylint:enable=line-too-long
-
-  # Setup the while_loop.
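-  # Added note: the original TF implementation drove this step with
-  # tf.while_loop; the numpy port below emulates it with a plain Python loop
-  # over bbox indices, threading the (image, new_bboxes) pair through each
-  # iteration.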
- num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes. - idx = 0 # Counter for the while loop. - - # Conditional function when to end the loop once we go over all bboxes - # images_and_bboxes contain (_image, _new_bboxes) - def cond(_idx, _images_and_bboxes): - return _idx < num_bboxes - - # Shuffle the bboxes so that the augmentation order is not deterministic if - # we are not changing the bboxes with aug_func. - # if not func_changes_bbox: - # print(bboxes) - # loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0) - # print(loop_bboxes) - # else: - # loop_bboxes = bboxes - # we can not shuffle the bbox because it does not contain class information here - loop_bboxes = deepcopy(bboxes) - - # Main function of while_loop where we repeatedly apply augmentation on the - # bboxes in the image. - # pylint:disable=g-long-lambda - body = lambda _idx, _images_and_bboxes: [ - _idx + 1, wrapped_aug_func(_images_and_bboxes[0], - loop_bboxes[_idx], - _images_and_bboxes[1])] - while (cond(idx, (image, new_bboxes))): - idx, (image, new_bboxes) = body(idx, (image, new_bboxes)) - - # Either return the altered bboxes or the original ones depending on if - # we altered them in anyway. - if func_changes_bbox: - final_bboxes = new_bboxes - else: - final_bboxes = bboxes - return image, final_bboxes - - -def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, - func_changes_bbox, *args): - """Checks to be sure num bboxes > 0 before calling inner function.""" - num_bboxes = len(bboxes) - new_image = deepcopy(image) - new_bboxes = deepcopy(bboxes) - if num_bboxes != 0: - new_image, new_bboxes = _apply_multi_bbox_augmentation( - new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args) - return new_image, new_bboxes - - -def rotate_only_bboxes(image, bboxes, prob, degrees, replace): - """Apply rotate to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, rotate, func_changes_bbox, degrees, replace) - - -def shear_x_only_bboxes(image, bboxes, prob, level, replace): - """Apply shear_x to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, shear_x, func_changes_bbox, level, replace) - - -def shear_y_only_bboxes(image, bboxes, prob, level, replace): - """Apply shear_y to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, shear_y, func_changes_bbox, level, replace) - - -def translate_x_only_bboxes(image, bboxes, prob, pixels, replace): - """Apply translate_x to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace) - - -def translate_y_only_bboxes(image, bboxes, prob, pixels, replace): - """Apply translate_y to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) - - -def flip_only_bboxes(image, bboxes, prob): - """Apply flip_lr to each bbox in the image 
with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,
-                                                np.fliplr, func_changes_bbox)
-
-
-def solarize_only_bboxes(image, bboxes, prob, threshold):
-  """Apply solarize to each bbox in the image with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize,
-                                                func_changes_bbox, threshold)
-
-
-def equalize_only_bboxes(image, bboxes, prob):
-  """Apply equalize to each bbox in the image with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize,
-                                                func_changes_bbox)
-
-
-def cutout_only_bboxes(image, bboxes, prob, pad_size, replace):
-  """Apply cutout to each bbox in the image with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(
-      image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace)
-
-
-def _rotate_bbox(bbox, image_height, image_width, degrees):
-  """Rotates the bbox coordinates by degrees.
-
-  Args:
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    image_height: Int, height of the image.
-    image_width: Int, width of the image.
-    degrees: Float, a scalar angle in degrees to rotate all images by. If
-      degrees is positive the image will be rotated clockwise otherwise it will
-      be rotated counterclockwise.
-
-  Returns:
-    A tensor of the same shape as bbox, but now with the rotated coordinates.
-  """
-  image_height, image_width = (float(image_height), float(image_width))
-
-  # Convert from degrees to radians.
-  degrees_to_radians = math.pi / 180.0
-  radians = degrees * degrees_to_radians
-
-  # Translate the bbox to the center of the image and turn the normalized 0-1
-  # coordinates to absolute pixel locations.
-  # Y coordinates are made negative as the y axis of images goes down with
-  # increasing pixel values, so we negate to make sure x axis and y axis points
-  # are in the traditionally positive direction.
-  min_y = -int(image_height * (bbox[0] - 0.5))
-  min_x = int(image_width * (bbox[1] - 0.5))
-  max_y = -int(image_height * (bbox[2] - 0.5))
-  max_x = int(image_width * (bbox[3] - 0.5))
-  coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],
-                          [max_y, max_x]]).astype(np.float32)
-  # Rotate the coordinates according to the rotation matrix clockwise if
-  # radians is positive, else negative
-  rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)],
-                              [-math.sin(radians), math.cos(radians)]])
-  new_coords = np.matmul(rotation_matrix,
-                         np.transpose(coordinates)).astype(np.int32)
-
-  # Find min/max values and convert them back to normalized 0-1 floats.
-  min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5)
-  min_x = float(np.min(new_coords[1, :])) / image_width + 0.5
-  max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5)
-  max_x = float(np.max(new_coords[1, :])) / image_width + 0.5
-
-  # Clip the bboxes to be sure they fall between [0, 1].
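-  # Added note, a quick sanity check of the corner math above: rotating the
-  # box (0.25, 0.25, 0.75, 0.75) in a square image by a multiple of 90 degrees
-  # maps its corners onto each other, so the returned box is unchanged; for
-  # other angles the box grows to the axis-aligned hull of the four rotated
-  # corners.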
-  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
-  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
-  return np.stack([min_y, min_x, max_y, max_x])
-
-
-def rotate_with_bboxes(image, bboxes, degrees, replace):
-  """Rotates the image by degrees and rotates the bboxes to match."""
-  # Rotate the image.
-  image = rotate(image, degrees, replace)
-
-  # Convert bbox coordinates to pixel values.
-  image_height, image_width = image.shape[:2]
-  # pylint:disable=g-long-lambda
-  wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees)
-  # pylint:enable=g-long-lambda
-  new_bboxes = np.zeros_like(bboxes)
-  for idx in range(len(bboxes)):
-    new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx])
-  return image, new_bboxes
-
-
-def translate_x(image, pixels, replace):
-  """Equivalent of PIL Translate in X dimension."""
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0))
-  return unwrap(np.array(image), replace)
-
-
-def translate_y(image, pixels, replace):
-  """Equivalent of PIL Translate in Y dimension."""
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels))
-  return unwrap(np.array(image), replace)
-
-
-def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
-  """Shifts the bbox coordinates by pixels.
-
-  Args:
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    image_height: Int, height of the image.
-    image_width: Int, width of the image.
-    pixels: An int. How many pixels to shift the bbox.
-    shift_horizontal: Boolean. If true then shift in X dimension else shift in
-      Y dimension.
-
-  Returns:
-    A tensor of the same shape as bbox, but now with the shifted coordinates.
-  """
-  pixels = int(pixels)
-  # Convert bbox to integer pixel locations.
-  min_y = int(float(image_height) * bbox[0])
-  min_x = int(float(image_width) * bbox[1])
-  max_y = int(float(image_height) * bbox[2])
-  max_x = int(float(image_width) * bbox[3])
-
-  if shift_horizontal:
-    min_x = np.maximum(0, min_x - pixels)
-    max_x = np.minimum(image_width, max_x - pixels)
-  else:
-    min_y = np.maximum(0, min_y - pixels)
-    max_y = np.minimum(image_height, max_y - pixels)
-
-  # Convert bbox back to floats.
-  min_y = float(min_y) / float(image_height)
-  min_x = float(min_x) / float(image_width)
-  max_y = float(max_y) / float(image_height)
-  max_x = float(max_x) / float(image_width)
-
-  # Clip the bboxes to be sure they fall between [0, 1].
-  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
-  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
-  return np.stack([min_y, min_x, max_y, max_x])
-
-
-def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
-  """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
-      between [0, 1].
-    pixels: An int. How many pixels to shift the image and bboxes
-    replace: A one or three value 1D tensor to fill empty pixels.
-    shift_horizontal: Boolean. If true then shift in X dimension else shift in
-      Y dimension.
-
-  Returns:
-    A tuple containing a 3D uint8 Tensor that will be the result of translating
-    image by pixels. The second element of the tuple is bboxes, where now
-    the coordinates will be shifted to reflect the shifted image.
-  """
-  if shift_horizontal:
-    image = translate_x(image, pixels, replace)
-  else:
-    image = translate_y(image, pixels, replace)
-
-  # Convert bbox coordinates to pixel values.
-  image_height, image_width = image.shape[0], image.shape[1]
-  # pylint:disable=g-long-lambda
-  wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal)
-  # pylint:enable=g-long-lambda
-  new_bboxes = deepcopy(bboxes)
-  num_bboxes = len(bboxes)
-  for idx in range(num_bboxes):
-    new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx])
-  return image.astype(np.uint8), new_bboxes
-
-
-def shear_x(image, level, replace):
-  """Equivalent of PIL Shearing in X dimension."""
-  # Shear parallel to x axis is a projective transform
-  # with a matrix form of:
-  # [1  level
-  #  0  1].
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0))
-  return unwrap(np.array(image), replace)
-
-
-def shear_y(image, level, replace):
-  """Equivalent of PIL Shearing in Y dimension."""
-  # Shear parallel to y axis is a projective transform
-  # with a matrix form of:
-  # [1  0
-  #  level  1].
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0))
-  return unwrap(np.array(image), replace)
-
-
-def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
-  """Shifts the bbox according to how the image was sheared.
-
-  Args:
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    image_height: Int, height of the image.
-    image_width: Int, width of the image.
-    level: Float. How much to shear the image.
-    shear_horizontal: If true then shear in X dimension else shear in
-      the Y dimension.
-
-  Returns:
-    A tensor of the same shape as bbox, but now with the shifted coordinates.
-  """
-  image_height, image_width = (float(image_height), float(image_width))
-
-  # Change bbox coordinates to be pixels.
-  min_y = int(image_height * bbox[0])
-  min_x = int(image_width * bbox[1])
-  max_y = int(image_height * bbox[2])
-  max_x = int(image_width * bbox[3])
-  coordinates = np.stack(
-      [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
-  coordinates = coordinates.astype(np.float32)
-
-  # Shear the coordinates according to the translation matrix.
-  if shear_horizontal:
-    translation_matrix = np.stack([[1, 0], [-level, 1]])
-  else:
-    translation_matrix = np.stack([[1, -level], [0, 1]])
-  translation_matrix = translation_matrix.astype(np.float32)
-  new_coords = np.matmul(translation_matrix,
-                         np.transpose(coordinates)).astype(np.int32)
-
-  # Find min/max values and convert them back to floats.
-  min_y = float(np.min(new_coords[0, :])) / image_height
-  min_x = float(np.min(new_coords[1, :])) / image_width
-  max_y = float(np.max(new_coords[0, :])) / image_height
-  max_x = float(np.max(new_coords[1, :])) / image_width
-
-  # Clip the bboxes to be sure they fall between [0, 1].
-  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
-  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
-  return np.stack([min_y, min_x, max_y, max_x])
-
-
-def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
-  """Applies Shear Transformation to the image and shifts the bboxes.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
-      between [0, 1].
-    level: Float. How much to shear the image. This value will be between
-      -0.3 and 0.3.
-    replace: A one or three value 1D tensor to fill empty pixels.
-    shear_horizontal: Boolean. If true then shear in X dimension else shear in
-      the Y dimension.
-
-  Returns:
-    A tuple containing a 3D uint8 Tensor that will be the result of shearing
-    image by level. The second element of the tuple is bboxes, where now
-    the coordinates will be shifted to reflect the sheared image.
-  """
-  if shear_horizontal:
-    image = shear_x(image, level, replace)
-  else:
-    image = shear_y(image, level, replace)
-
-  # Convert bbox coordinates to pixel values.
-  image_height, image_width = image.shape[:2]
-  # pylint:disable=g-long-lambda
-  wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal)
-  # pylint:enable=g-long-lambda
-  new_bboxes = deepcopy(bboxes)
-  num_bboxes = len(bboxes)
-  for idx in range(num_bboxes):
-    new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx])
-  return image.astype(np.uint8), new_bboxes
-
-
-def autocontrast(image):
-  """Implements Autocontrast function from PIL.
-
-  Args:
-    image: A 3D uint8 tensor.
-
-  Returns:
-    The image after it has had autocontrast applied to it and will be of type
-    uint8.
-  """
-
-  def scale_channel(image):
-    """Scale the 2D image using the autocontrast rule."""
-    # A possibly cheaper version can be done using cumsum/unique_with_counts
-    # over the histogram values, rather than iterating over the entire image
-    # to compute mins and maxes.
-    lo = float(np.min(image))
-    hi = float(np.max(image))
-
-    # Scale the image, making the lowest value 0 and the highest value 255.
-    def scale_values(im):
-      scale = 255.0 / (hi - lo)
-      offset = -lo * scale
-      im = im.astype(np.float32) * scale + offset
-      # Clip to [0, 255] before casting back to uint8.
-      im = np.clip(im, a_min=0, a_max=255.0)
-      return im.astype(np.uint8)
-
-    result = scale_values(image) if hi > lo else image
-    return result
-
-  # Assumes RGB for now. Scales each channel independently
-  # and then stacks the result.
-  s1 = scale_channel(image[:, :, 0])
-  s2 = scale_channel(image[:, :, 1])
-  s3 = scale_channel(image[:, :, 2])
-  image = np.stack([s1, s2, s3], 2)
-  return image
-
-
-def sharpness(image, factor):
-  """Implements Sharpness function from PIL."""
-  orig_image = image
-  image = image.astype(np.float32)
-  # Make image 4D for conv operation.
-  # SMOOTH PIL Kernel.
-  kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13.
-  result = cv2.filter2D(image, -1, kernel).astype(np.uint8)
-
-  # Blend the final result.
-  return blend(result, orig_image, factor)
-
-
-def equalize(image):
-  """Implements the Equalize function from PIL."""
-
-  def scale_channel(im, c):
-    """Scale the data in the channel to implement equalize."""
-    im = im[:, :, c].astype(np.int32)
-    # Compute the histogram of the image channel.
-    histo, _ = np.histogram(im, range=[0, 255], bins=256)
-
-    # For the purposes of computing the step, filter out the zeros.
-    nonzero = np.where(np.not_equal(histo, 0))
-    nonzero_histo = np.reshape(np.take(histo, nonzero), [-1])
-    step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
-
-    def build_lut(histo, step):
-      # Compute the cumulative sum, shifting by step // 2
-      # and then normalizing by step.
-      lut = (np.cumsum(histo) + (step // 2)) // step
-      # Shift lut, prepending with 0.
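-      # Added note: this mirrors PIL's ImageOps.equalize. After the shift
-      # below, a pixel value v is remapped to
-      # (cumsum(histo)[v - 1] + step // 2) // step, so the darkest occupied
-      # bin maps to 0 and the mapping is monotonically non-decreasing.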
-      lut = np.concatenate([[0], lut[:-1]], 0)
-      # Clip the counts to be in range. This is done
-      # in the C code for image.point.
-      return np.clip(lut, a_min=0, a_max=255).astype(np.uint8)
-
-    # If step is zero, return the original image. Otherwise, build
-    # lut from the full histogram and step and then index from it.
-    if step == 0:
-      result = im
-    else:
-      result = np.take(build_lut(histo, step), im)
-
-    return result.astype(np.uint8)
-
-  # Assumes RGB for now. Scales each channel independently
-  # and then stacks the result.
-  s1 = scale_channel(image, 0)
-  s2 = scale_channel(image, 1)
-  s3 = scale_channel(image, 2)
-  image = np.stack([s1, s2, s3], 2)
-  return image
-
-
-def wrap(image):
-  """Returns 'image' with an extra channel set to all 255s."""
-  shape = image.shape
-  extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype)
-  extended = np.concatenate([image, extended_channel], 2).astype(image.dtype)
-  return extended
-
-
-def unwrap(image, replace):
-  """Unwraps an image produced by wrap.
-
-  Where there is a 0 in the last channel for every spatial position,
-  the rest of the three channels in that spatial dimension are grayed
-  (set to 128). Operations like translate and shear on a wrapped
-  Tensor will leave 0s in empty locations. Some transformations look
-  at the intensity of values to do preprocessing, and we want these
-  empty pixels to assume the 'average' value, rather than pure black.
-
-
-  Args:
-    image: A 3D Image Tensor with 4 channels.
-    replace: A one or three value 1D tensor to fill empty pixels.
-
-  Returns:
-    image: A 3D image Tensor with 3 channels.
-  """
-  image_shape = image.shape
-  # Flatten the spatial dimensions.
-  flattened_image = np.reshape(image, [-1, image_shape[2]])
-
-  # Find all pixels where the last channel is zero.
-  alpha_channel = flattened_image[:, 3]
-
-  replace = np.concatenate([replace, np.ones([1], image.dtype)], 0)
-
-  # Where they are zero, fill them in with 'replace'.
-  alpha_channel = np.reshape(alpha_channel, (-1, 1))
-  alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1]))
-
-  flattened_image = np.where(
-      np.equal(alpha_channel, 0),
-      np.ones_like(
-          flattened_image, dtype=image.dtype) * replace,
-      flattened_image)
-
-  image = np.reshape(flattened_image, image_shape)
-  image = image[:, :, :3]
-  return image.astype(np.uint8)
-
-
-def _cutout_inside_bbox(image, bbox, pad_fraction):
-  """Generates cutout mask and the mean pixel value of the bbox.
-
-  First a location is randomly chosen within the image as the center where the
-  cutout mask will be applied. Note this can be towards the boundaries of the
-  image, so the full cutout mask may not be applied.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    pad_fraction: Float that specifies how large the cutout mask should be in
-      reference to the size of the original bbox. If pad_fraction is 0.25,
-      then the cutout mask will be of shape
-      (0.25 * bbox height, 0.25 * bbox width).
-
-  Returns:
-    A tuple. First element is a tensor of the same shape as image where each
-    element is either a 1 or 0 that is used to determine where the image
-    will have cutout applied. The second element is the mean of the pixels
-    in the image where the bbox is located.
-    mask value: [0,1]
-  """
-  image_height, image_width = image.shape[0], image.shape[1]
-  # Transform from shape [1, 4] to [4].
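-  # Added note: np.squeeze below is a no-op for a 1D box, so this helper
-  # accepts either a [4] bbox or a [1, 4] bbox sliced from a batch.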
-  bbox = np.squeeze(bbox)
-
-  min_y = int(float(image_height) * bbox[0])
-  min_x = int(float(image_width) * bbox[1])
-  max_y = int(float(image_height) * bbox[2])
-  max_x = int(float(image_width) * bbox[3])
-
-  # Calculate the mean pixel values in the bounding box, which will be used
-  # to fill the cutout region.
-  mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1))
-  # Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the
-  # region lies entirely within the bbox.
-  box_height = max_y - min_y + 1
-  box_width = max_x - min_x + 1
-  pad_size_height = int(pad_fraction * (box_height / 2))
-  pad_size_width = int(pad_fraction * (box_width / 2))
-
-  # Sample the center location in the image where the zero mask will be applied.
-  cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32)
-  cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32)
-
-  lower_pad = np.maximum(0, cutout_center_height - pad_size_height)
-  upper_pad = np.maximum(
-      0, image_height - cutout_center_height - pad_size_height)
-  left_pad = np.maximum(0, cutout_center_width - pad_size_width)
-  right_pad = np.maximum(0,
-                         image_width - cutout_center_width - pad_size_width)
-
-  cutout_shape = [
-      image_height - (lower_pad + upper_pad),
-      image_width - (left_pad + right_pad)
-  ]
-  padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
-
-  mask = np.pad(np.zeros(
-      cutout_shape, dtype=image.dtype),
-                padding_dims,
-                'constant',
-                constant_values=1)
-
-  mask = np.expand_dims(mask, 2)
-  mask = np.tile(mask, [1, 1, 3])
-  return mask, mean
-
-
-def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean):
-  """Applies cutout to the image according to bbox information.
-
-  This is a cutout variant that uses bbox information to make more informed
-  decisions on where to place the cutout mask.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
-      between [0, 1].
-    pad_fraction: Float that specifies how large the cutout mask should be in
-      reference to the size of the original bbox. If pad_fraction is 0.25,
-      then the cutout mask will be of shape
-      (0.25 * bbox height, 0.25 * bbox width).
-    replace_with_mean: Boolean that specifies what value should be filled in
-      where the cutout mask is applied. Since the incoming image will be of
-      uint8 and will not have had any mean normalization applied, by default
-      we set the value to be 128. If replace_with_mean is True then we find
-      the mean pixel values across the channel dimension and use those to fill
-      in where the cutout mask is applied.
-
-  Returns:
-    A tuple. First element is a tensor of the same shape as image that has
-    cutout applied to it. Second element is the bboxes that were passed in
-    that will be unchanged.
-  """
-
-  def apply_bbox_cutout(image, bboxes, pad_fraction):
-    """Applies cutout to a single bounding box within image."""
-    # Choose a single bounding box to apply cutout to.
-    random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32)
-    # Select the corresponding bbox and apply cutout.
-    chosen_bbox = np.take(bboxes, random_index, axis=0)
-    mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction)
-
-    # When applying cutout we either set the pixel value to 128 or to the mean
-    # value inside the bbox.
-    replace = mean if replace_with_mean else [128] * 3
-
-    # Apply the cutout mask to the image. Where the mask is 0 we fill it with
-    # `replace`.
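-    # Added note: mask comes back from _cutout_inside_bbox as 1 outside the
-    # cutout window and 0 inside it, and `replace` broadcasts across the
-    # three channels in the np.where below.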
- image = np.where( - np.equal(mask, 0), - np.ones_like( - image, dtype=image.dtype) * replace, - image).astype(image.dtype) - return image - - # Check to see if there are boxes, if so then apply boxcutout. - if len(bboxes) != 0: - image = apply_bbox_cutout(image, bboxes, pad_fraction) - - return image, bboxes - - -NAME_TO_FUNC = { - 'AutoContrast': autocontrast, - 'Equalize': equalize, - 'Posterize': posterize, - 'Solarize': solarize, - 'SolarizeAdd': solarize_add, - 'Color': color, - 'Contrast': contrast, - 'Brightness': brightness, - 'Sharpness': sharpness, - 'Cutout': cutout, - 'BBox_Cutout': bbox_cutout, - 'Rotate_BBox': rotate_with_bboxes, - # pylint:disable=g-long-lambda - 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( - image, bboxes, pixels, replace, shift_horizontal=True), - 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( - image, bboxes, pixels, replace, shift_horizontal=False), - 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( - image, bboxes, level, replace, shear_horizontal=True), - 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( - image, bboxes, level, replace, shear_horizontal=False), - # pylint:enable=g-long-lambda - 'Rotate_Only_BBoxes': rotate_only_bboxes, - 'ShearX_Only_BBoxes': shear_x_only_bboxes, - 'ShearY_Only_BBoxes': shear_y_only_bboxes, - 'TranslateX_Only_BBoxes': translate_x_only_bboxes, - 'TranslateY_Only_BBoxes': translate_y_only_bboxes, - 'Flip_Only_BBoxes': flip_only_bboxes, - 'Solarize_Only_BBoxes': solarize_only_bboxes, - 'Equalize_Only_BBoxes': equalize_only_bboxes, - 'Cutout_Only_BBoxes': cutout_only_bboxes, -} - - -def _randomly_negate_tensor(tensor): - """With 50% prob turn the tensor negative.""" - should_flip = np.floor(np.random.rand() + 0.5) >= 1 - final_tensor = tensor if should_flip else -tensor - return final_tensor - - -def _rotate_level_to_arg(level): - level = (level / _MAX_LEVEL) * 30. - level = _randomly_negate_tensor(level) - return (level, ) - - -def _shrink_level_to_arg(level): - """Converts level to ratio by which we shrink the image content.""" - if level == 0: - return (1.0, ) # if level is zero, do not shrink the image - # Maximum shrinking ratio is 2.9. - level = 2. / (_MAX_LEVEL / level) + 0.9 - return (level, ) - - -def _enhance_level_to_arg(level): - return ((level / _MAX_LEVEL) * 1.8 + 0.1, ) - - -def _shear_level_to_arg(level): - level = (level / _MAX_LEVEL) * 0.3 - # Flip level to negative with 50% chance. - level = _randomly_negate_tensor(level) - return (level, ) - - -def _translate_level_to_arg(level, translate_const): - level = (level / _MAX_LEVEL) * float(translate_const) - # Flip level to negative with 50% chance. 
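-  # Worked example (added note): with _MAX_LEVEL = 10., level = 5 and
-  # translate_const = 250, the magnitude above is 125 pixels before the
-  # random sign flip.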
- level = _randomly_negate_tensor(level) - return (level, ) - - -def _bbox_cutout_level_to_arg(level, hparams): - cutout_pad_fraction = (level / - _MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction - return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean - - -def level_to_arg(hparams): - return { - 'AutoContrast': lambda level: (), - 'Equalize': lambda level: (), - 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ), - 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ), - 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ), - 'Color': _enhance_level_to_arg, - 'Contrast': _enhance_level_to_arg, - 'Brightness': _enhance_level_to_arg, - 'Sharpness': _enhance_level_to_arg, - 'Cutout': - lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100 - # pylint:disable=g-long-lambda - 'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams), - 'TranslateX_BBox': - lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250 - 'TranslateY_BBox': - lambda level: _translate_level_to_arg(level, 250), # hparams.translate_cons - # pylint:enable=g-long-lambda - 'ShearX_BBox': _shear_level_to_arg, - 'ShearY_BBox': _shear_level_to_arg, - 'Rotate_BBox': _rotate_level_to_arg, - 'Rotate_Only_BBoxes': _rotate_level_to_arg, - 'ShearX_Only_BBoxes': _shear_level_to_arg, - 'ShearY_Only_BBoxes': _shear_level_to_arg, - # pylint:disable=g-long-lambda - 'TranslateX_Only_BBoxes': - lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const - 'TranslateY_Only_BBoxes': - lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const - # pylint:enable=g-long-lambda - 'Flip_Only_BBoxes': lambda level: (), - 'Solarize_Only_BBoxes': - lambda level: (int((level / _MAX_LEVEL) * 256), ), - 'Equalize_Only_BBoxes': lambda level: (), - # pylint:disable=g-long-lambda - 'Cutout_Only_BBoxes': - lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const - # pylint:enable=g-long-lambda - } - - -def bbox_wrapper(func): - """Adds a bboxes function argument to func and returns unchanged bboxes.""" - - def wrapper(images, bboxes, *args, **kwargs): - return (func(images, *args, **kwargs), bboxes) - - return wrapper - - -def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams): - """Return the function that corresponds to `name` and update `level` param.""" - func = NAME_TO_FUNC[name] - args = level_to_arg(augmentation_hparams)[name](level) - - # Check to see if prob is passed into function. This is used for operations - # where we alter bboxes independently. - # pytype:disable=wrong-arg-types - if 'prob' in inspect.getfullargspec(func)[0]: - args = tuple([prob] + list(args)) - # pytype:enable=wrong-arg-types - - # Add in replace arg if it is required for the function that is being called. - if 'replace' in inspect.getfullargspec(func)[0]: - # Make sure replace is the final argument - assert 'replace' == inspect.getfullargspec(func)[0][-1] - args = tuple(list(args) + [replace_value]) - - # Add bboxes as the second positional argument for the function if it does - # not already exist. 
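-  # Added note: color ops such as Equalize only take an image, so bbox_wrapper
-  # adapts them to the common (image, bboxes, *args) calling convention by
-  # returning the bboxes untouched.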
-  if 'bboxes' not in inspect.getfullargspec(func)[0]:
-    func = bbox_wrapper(func)
-  return (func, prob, args)
-
-
-def _apply_func_with_prob(func, image, args, prob, bboxes):
-  """Apply `func` to image w/ `args` as input with probability `prob`."""
-  assert isinstance(args, tuple)
-  assert 'bboxes' == inspect.getfullargspec(func)[0][1]
-
-  # If prob is a function argument, then this randomness is being handled
-  # inside the function, so make sure it is always called.
-  if 'prob' in inspect.getfullargspec(func)[0]:
-    prob = 1.0
-
-  # Apply the function with probability `prob`.
-  should_apply_op = np.floor(np.random.rand() + prob) >= 1
-  if should_apply_op:
-    augmented_image, augmented_bboxes = func(image, bboxes, *args)
-  else:
-    augmented_image, augmented_bboxes = (image, bboxes)
-  return augmented_image, augmented_bboxes
-
-
-def select_and_apply_random_policy(policies, image, bboxes):
-  """Select a random policy from `policies` and apply it to `image`."""
-  policy_to_select = np.random.randint(0, len(policies), dtype=np.int32)
-  # policy_to_select = 6 # for test
-  for (i, policy) in enumerate(policies):
-    if i == policy_to_select:
-      image, bboxes = policy(image, bboxes)
-  return (image, bboxes)
-
-
-def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams):
-  """Build a policy from the given policies passed in and apply to image.
-
-  Args:
-    policies: list of lists of tuples in the form `(func, prob, level)`, `func`
-      is a string name of the augmentation function, `prob` is the probability
-      of applying the `func` operation, `level` is the input argument for
-      `func`.
-    image: numpy array that the resulting policy will be applied to.
-    bboxes: 2D numpy array of the normalized bounding boxes in the image, one
-      (min_y, min_x, max_y, max_x) row per box.
-    augmentation_hparams: Hparams associated with the NAS learned policy.
-
-  Returns:
-    A version of image that now has data augmentation applied to it based on
-    the `policies` passed into the function. Additionally, returns bboxes if
-    a value for them is passed in that is not None
-  """
-  replace_value = [128, 128, 128]
-
-  # func is the string name of the augmentation function, prob is the
-  # probability of applying the operation and level is the parameter
-  # associated with the operation.
-
-  # tf_policies are functions that take in an image and return an augmented
-  # image.
-  tf_policies = []
-  for policy in policies:
-    tf_policy = []
-    # Link string name to the correct python function and make sure the correct
-    # argument is passed into that function.
-    for policy_info in policy:
-      policy_info = list(
-          policy_info) + [replace_value, augmentation_hparams]
-
-      tf_policy.append(_parse_policy_info(*policy_info))
-    # Now build the tf policy that will apply the augmentation procedure
-    # on image.
-    def make_final_policy(tf_policy_):
-      def final_policy(image_, bboxes_):
-        for func, prob, args in tf_policy_:
-          image_, bboxes_ = _apply_func_with_prob(func, image_, args,
-                                                  prob, bboxes_)
-        return image_, bboxes_
-
-      return final_policy
-
-    tf_policies.append(make_final_policy(tf_policy))
-
-  augmented_images, augmented_bboxes = select_and_apply_random_policy(
-      tf_policies, image, bboxes)
-  # If no bounding boxes were specified, then just return the images.
-  return (augmented_images, augmented_bboxes)
-
-
-# TODO(barretzoph): Add in ArXiv link once paper is out.
-def distort_image_with_autoaugment(image, bboxes, augmentation_name):
-  """Applies the AutoAugment policy to `image` and `bboxes`.
-
-  Args:
-    image: `Tensor` of shape [height, width, 3] representing an image.
- bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are - normalized between [0, 1]. - augmentation_name: The name of the AutoAugment policy to use. The available - options are `v0`, `v1`, `v2`, `v3` and `test`. `v0` is the policy used for - all of the results in the paper and was found to achieve the best results - on the COCO dataset. `v1`, `v2` and `v3` are additional good policies - found on the COCO dataset that have slight variation in what operations - were used during the search procedure along with how many operations are - applied in parallel to a single image (2 vs 3). - - Returns: - A tuple containing the augmented versions of `image` and `bboxes`. - """ - available_policies = { - 'v0': policy_v0, - 'v1': policy_v1, - 'v2': policy_v2, - 'v3': policy_v3, - 'test': policy_vtest - } - if augmentation_name not in available_policies: - raise ValueError('Invalid augmentation_name: {}'.format( - augmentation_name)) - - policy = available_policies[augmentation_name]() - augmentation_hparams = {} - return build_and_apply_nas_policy(policy, image, bboxes, - augmentation_hparams) diff --git a/pdfdet/models/Paddle/ppdet/data/transform/batch_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/batch_operators.py deleted file mode 100644 index f1ea702..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/batch_operators.py +++ /dev/null @@ -1,1532 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import typing - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -import cv2 -import copy -import math -import numpy as np -from .operators import register_op, BaseOperator, Resize -from .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian -from .atss_assigner import ATSSAssigner -from scipy import ndimage - -from ppdet.modeling import bbox_utils -from ppdet.utils.logger import setup_logger -from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform -logger = setup_logger(__name__) - -__all__ = [ - 'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget', - 'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseTarget', 'PadMaskBatch', - 'Gt2GFLTarget', 'Gt2CenterNetTarget', 'Gt2CenterTrackTarget', 'PadGT', - 'PadRGT', 'BatchRandomResizeForSSOD' -] - - -@register_op -class PadBatch(BaseOperator): - """ - Pad a batch of samples so they can be divisible by a stride. - The layout of each image should be 'CHW'. - Args: - pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure - height and width is divisible by `pad_to_stride`. - """ - - def __init__(self, pad_to_stride=0): - super(PadBatch, self).__init__() - self.pad_to_stride = pad_to_stride - - def __call__(self, samples, context=None): - """ - Args: - samples (list): a batch of sample, each is dict. 
- """ - coarsest_stride = self.pad_to_stride - - # multi scale input is nested list - if isinstance(samples, - typing.Sequence) and len(samples) > 0 and isinstance( - samples[0], typing.Sequence): - inner_samples = samples[0] - else: - inner_samples = samples - - max_shape = np.array( - [data['image'].shape for data in inner_samples]).max(axis=0) - if coarsest_stride > 0: - max_shape[1] = int( - np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) - max_shape[2] = int( - np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) - - for data in inner_samples: - im = data['image'] - im_c, im_h, im_w = im.shape[:] - padding_im = np.zeros( - (im_c, max_shape[1], max_shape[2]), dtype=np.float32) - padding_im[:, :im_h, :im_w] = im - data['image'] = padding_im - if 'semantic' in data and data['semantic'] is not None: - semantic = data['semantic'] - padding_sem = np.zeros( - (1, max_shape[1], max_shape[2]), dtype=np.float32) - padding_sem[:, :im_h, :im_w] = semantic - data['semantic'] = padding_sem - if 'gt_segm' in data and data['gt_segm'] is not None: - gt_segm = data['gt_segm'] - padding_segm = np.zeros( - (gt_segm.shape[0], max_shape[1], max_shape[2]), - dtype=np.uint8) - padding_segm[:, :im_h, :im_w] = gt_segm - data['gt_segm'] = padding_segm - - return samples - - -@register_op -class BatchRandomResize(BaseOperator): - """ - Resize image to target size randomly. random target_size and interpolation method - Args: - target_size (int, list, tuple): image target size, if random size is True, must be list or tuple - keep_ratio (bool): whether keep_raio or not, default true - interp (int): the interpolation method - random_size (bool): whether random select target size of image - random_interp (bool): whether random select interpolation method - """ - - def __init__(self, - target_size, - keep_ratio, - interp=cv2.INTER_NEAREST, - random_size=True, - random_interp=False): - super(BatchRandomResize, self).__init__() - self.keep_ratio = keep_ratio - self.interps = [ - cv2.INTER_NEAREST, - cv2.INTER_LINEAR, - cv2.INTER_AREA, - cv2.INTER_CUBIC, - cv2.INTER_LANCZOS4, - ] - self.interp = interp - assert isinstance(target_size, ( - int, Sequence)), "target_size must be int, list or tuple" - if random_size and not isinstance(target_size, list): - raise TypeError( - "Type of target_size is invalid when random_size is True. Must be List, now is {}". 
- format(type(target_size))) - self.target_size = target_size - self.random_size = random_size - self.random_interp = random_interp - - def __call__(self, samples, context=None): - if self.random_size: - index = np.random.choice(len(self.target_size)) - target_size = self.target_size[index] - else: - target_size = self.target_size - - if self.random_interp: - interp = np.random.choice(self.interps) - else: - interp = self.interp - - resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp) - return resizer(samples, context=context) - - -@register_op -class Gt2YoloTarget(BaseOperator): - __shared__ = ['num_classes'] - """ - Generate YOLOv3 targets by groud truth data, this operator is only used in - fine grained YOLOv3 loss mode - """ - - def __init__(self, - anchors, - anchor_masks, - downsample_ratios, - num_classes=80, - iou_thresh=1.): - super(Gt2YoloTarget, self).__init__() - self.anchors = anchors - self.anchor_masks = anchor_masks - self.downsample_ratios = downsample_ratios - self.num_classes = num_classes - self.iou_thresh = iou_thresh - - def __call__(self, samples, context=None): - assert len(self.anchor_masks) == len(self.downsample_ratios), \ - "anchor_masks', and 'downsample_ratios' should have same length." - - h, w = samples[0]['image'].shape[1:3] - an_hw = np.array(self.anchors) / np.array([[w, h]]) - for sample in samples: - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - if 'gt_score' not in sample: - sample['gt_score'] = np.ones( - (gt_bbox.shape[0], 1), dtype=np.float32) - gt_score = sample['gt_score'] - for i, ( - mask, downsample_ratio - ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): - grid_h = int(h / downsample_ratio) - grid_w = int(w / downsample_ratio) - target = np.zeros( - (len(mask), 6 + self.num_classes, grid_h, grid_w), - dtype=np.float32) - for b in range(gt_bbox.shape[0]): - gx, gy, gw, gh = gt_bbox[b, :] - cls = gt_class[b] - score = gt_score[b] - if gw <= 0. or gh <= 0. or score <= 0.: - continue - - # find best match anchor index - best_iou = 0. - best_idx = -1 - for an_idx in range(an_hw.shape[0]): - iou = jaccard_overlap( - [0., 0., gw, gh], - [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) - if iou > best_iou: - best_iou = iou - best_idx = an_idx - - gi = int(gx * grid_w) - gj = int(gy * grid_h) - - # gtbox should be regresed in this layes if best match - # anchor index in anchor mask of this layer - if best_idx in mask: - best_n = mask.index(best_idx) - - # x, y, w, h, scale - target[best_n, 0, gj, gi] = gx * grid_w - gi - target[best_n, 1, gj, gi] = gy * grid_h - gj - target[best_n, 2, gj, gi] = np.log( - gw * w / self.anchors[best_idx][0]) - target[best_n, 3, gj, gi] = np.log( - gh * h / self.anchors[best_idx][1]) - target[best_n, 4, gj, gi] = 2.0 - gw * gh - - # objectness record gt_score - target[best_n, 5, gj, gi] = score - - # classification - target[best_n, 6 + cls, gj, gi] = 1. 
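-                        # Added note: along axis 1 each anchor's target is
-                        # laid out as [tx, ty, tw, th, scale, objectness,
-                        # one-hot class scores], which is what the
-                        # fine-grained YOLOv3 loss consumes.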
- - # For non-matched anchors, calculate the target if the iou - # between anchor and gt is larger than iou_thresh - if self.iou_thresh < 1: - for idx, mask_i in enumerate(mask): - if mask_i == best_idx: continue - iou = jaccard_overlap( - [0., 0., gw, gh], - [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) - if iou > self.iou_thresh and target[idx, 5, gj, - gi] == 0.: - # x, y, w, h, scale - target[idx, 0, gj, gi] = gx * grid_w - gi - target[idx, 1, gj, gi] = gy * grid_h - gj - target[idx, 2, gj, gi] = np.log( - gw * w / self.anchors[mask_i][0]) - target[idx, 3, gj, gi] = np.log( - gh * h / self.anchors[mask_i][1]) - target[idx, 4, gj, gi] = 2.0 - gw * gh - - # objectness record gt_score - target[idx, 5, gj, gi] = score - - # classification - target[idx, 6 + cls, gj, gi] = 1. - sample['target{}'.format(i)] = target - - # remove useless gt_class and gt_score after target calculated - sample.pop('gt_class') - sample.pop('gt_score') - - return samples - - -@register_op -class Gt2FCOSTarget(BaseOperator): - """ - Generate FCOS targets by groud truth data - """ - - def __init__(self, - object_sizes_boundary, - center_sampling_radius, - downsample_ratios, - num_shift=0.5, - multiply_strides_reg_targets=False, - norm_reg_targets=True): - super(Gt2FCOSTarget, self).__init__() - self.center_sampling_radius = center_sampling_radius - self.downsample_ratios = downsample_ratios - self.INF = np.inf - self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF] - object_sizes_of_interest = [] - for i in range(len(self.object_sizes_boundary) - 1): - object_sizes_of_interest.append([ - self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1] - ]) - self.object_sizes_of_interest = object_sizes_of_interest - self.num_shift = num_shift - self.multiply_strides_reg_targets = multiply_strides_reg_targets - self.norm_reg_targets = norm_reg_targets - - def _compute_points(self, w, h): - """ - compute the corresponding points in each feature map - :param h: image height - :param w: image width - :return: points from all feature map - """ - locations = [] - for stride in self.downsample_ratios: - shift_x = np.arange(0, w, stride).astype(np.float32) - shift_y = np.arange(0, h, stride).astype(np.float32) - shift_x, shift_y = np.meshgrid(shift_x, shift_y) - shift_x = shift_x.flatten() - shift_y = shift_y.flatten() - location = np.stack( - [shift_x, shift_y], axis=1) + stride * self.num_shift - locations.append(location) - num_points_each_level = [len(location) for location in locations] - locations = np.concatenate(locations, axis=0) - return locations, num_points_each_level - - def _convert_xywh2xyxy(self, gt_bbox, w, h): - """ - convert the bounding box from style xywh to xyxy - :param gt_bbox: bounding boxes normalized into [0, 1] - :param w: image width - :param h: image height - :return: bounding boxes in xyxy style - """ - bboxes = gt_bbox.copy() - bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w - bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h - bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] - bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] - return bboxes - - def _check_inside_boxes_limited(self, gt_bbox, xs, ys, - num_points_each_level): - """ - check if points is within the clipped boxes - :param gt_bbox: bounding boxes - :param xs: horizontal coordinate of points - :param ys: vertical coordinate of points - :return: the mask of points is within gt_box or not - """ - bboxes = np.reshape( - gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]]) - bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1]) - ct_x = 
(bboxes[:, :, 0] + bboxes[:, :, 2]) / 2 - ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2 - beg = 0 - clipped_box = bboxes.copy() - for lvl, stride in enumerate(self.downsample_ratios): - end = beg + num_points_each_level[lvl] - stride_exp = self.center_sampling_radius * stride - clipped_box[beg:end, :, 0] = np.maximum( - bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp) - clipped_box[beg:end, :, 1] = np.maximum( - bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp) - clipped_box[beg:end, :, 2] = np.minimum( - bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp) - clipped_box[beg:end, :, 3] = np.minimum( - bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp) - beg = end - l_res = xs - clipped_box[:, :, 0] - r_res = clipped_box[:, :, 2] - xs - t_res = ys - clipped_box[:, :, 1] - b_res = clipped_box[:, :, 3] - ys - clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) - inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0 - return inside_gt_box - - def __call__(self, samples, context=None): - assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \ - "object_sizes_of_interest', and 'downsample_ratios' should have same length." - - for sample in samples: - im = sample['image'] - bboxes = sample['gt_bbox'] - gt_class = sample['gt_class'] - # calculate the locations - h, w = im.shape[1:3] - points, num_points_each_level = self._compute_points(w, h) - object_scale_exp = [] - for i, num_pts in enumerate(num_points_each_level): - object_scale_exp.append( - np.tile( - np.array([self.object_sizes_of_interest[i]]), - reps=[num_pts, 1])) - object_scale_exp = np.concatenate(object_scale_exp, axis=0) - - gt_area = (bboxes[:, 2] - bboxes[:, 0]) * ( - bboxes[:, 3] - bboxes[:, 1]) - xs, ys = points[:, 0], points[:, 1] - xs = np.reshape(xs, newshape=[xs.shape[0], 1]) - xs = np.tile(xs, reps=[1, bboxes.shape[0]]) - ys = np.reshape(ys, newshape=[ys.shape[0], 1]) - ys = np.tile(ys, reps=[1, bboxes.shape[0]]) - - l_res = xs - bboxes[:, 0] - r_res = bboxes[:, 2] - xs - t_res = ys - bboxes[:, 1] - b_res = bboxes[:, 3] - ys - reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) - if self.center_sampling_radius > 0: - is_inside_box = self._check_inside_boxes_limited( - bboxes, xs, ys, num_points_each_level) - else: - is_inside_box = np.min(reg_targets, axis=2) > 0 - # check if the targets is inside the corresponding level - max_reg_targets = np.max(reg_targets, axis=2) - lower_bound = np.tile( - np.expand_dims( - object_scale_exp[:, 0], axis=1), - reps=[1, max_reg_targets.shape[1]]) - high_bound = np.tile( - np.expand_dims( - object_scale_exp[:, 1], axis=1), - reps=[1, max_reg_targets.shape[1]]) - is_match_current_level = \ - (max_reg_targets > lower_bound) & \ - (max_reg_targets < high_bound) - points2gtarea = np.tile( - np.expand_dims( - gt_area, axis=0), reps=[xs.shape[0], 1]) - points2gtarea[is_inside_box == 0] = self.INF - points2gtarea[is_match_current_level == 0] = self.INF - points2min_area = points2gtarea.min(axis=1) - points2min_area_ind = points2gtarea.argmin(axis=1) - labels = gt_class[points2min_area_ind] + 1 - labels[points2min_area == self.INF] = 0 - reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind] - ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \ - reg_targets[:, [0, 2]].max(axis=1)) * \ - (reg_targets[:, [1, 3]].min(axis=1) / \ - reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32) - ctn_targets = np.reshape( - ctn_targets, newshape=[ctn_targets.shape[0], 1]) - ctn_targets[labels <= 0] = 0 - pos_ind = 
np.nonzero(labels != 0)
-            reg_targets_pos = reg_targets[pos_ind[0], :]
-            split_sections = []
-            beg = 0
-            for lvl in range(len(num_points_each_level)):
-                end = beg + num_points_each_level[lvl]
-                split_sections.append(end)
-                beg = end
-            labels_by_level = np.split(labels, split_sections, axis=0)
-            reg_targets_by_level = np.split(reg_targets, split_sections, axis=0)
-            ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0)
-            for lvl in range(len(self.downsample_ratios)):
-                grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
-                grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
-                if self.norm_reg_targets:
-                    if self.multiply_strides_reg_targets:
-                        sample['reg_target{}'.format(lvl)] = np.reshape(
-                            reg_targets_by_level[lvl],
-                            newshape=[grid_h, grid_w, 4])
-                    else:
-                        sample['reg_target{}'.format(lvl)] = \
-                            np.reshape(
-                                reg_targets_by_level[lvl] / \
-                                self.downsample_ratios[lvl],
-                                newshape=[grid_h, grid_w, 4])
-                else:
-                    sample['reg_target{}'.format(lvl)] = np.reshape(
-                        reg_targets_by_level[lvl],
-                        newshape=[grid_h, grid_w, 4])
-                sample['labels{}'.format(lvl)] = np.reshape(
-                    labels_by_level[lvl], newshape=[grid_h, grid_w, 1])
-                sample['centerness{}'.format(lvl)] = np.reshape(
-                    ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1])
-
-            sample.pop('is_crowd', None)
-            sample.pop('difficult', None)
-            sample.pop('gt_class', None)
-            sample.pop('gt_bbox', None)
-        return samples
-
-
-@register_op
-class Gt2GFLTarget(BaseOperator):
-    __shared__ = ['num_classes']
-    """
-    Generate GFocal loss targets by ground truth data
-    """
-
-    def __init__(self,
-                 num_classes=80,
-                 downsample_ratios=[8, 16, 32, 64, 128],
-                 grid_cell_scale=4,
-                 cell_offset=0,
-                 compute_vlr_region=False):
-        super(Gt2GFLTarget, self).__init__()
-        self.num_classes = num_classes
-        self.downsample_ratios = downsample_ratios
-        self.grid_cell_scale = grid_cell_scale
-        self.cell_offset = cell_offset
-        self.compute_vlr_region = compute_vlr_region
-
-        self.assigner = ATSSAssigner()
-
-    def get_grid_cells(self, featmap_size, scale, stride, offset=0):
-        """
-        Generate grid cells of a feature map for target assignment.
-        Args:
-            featmap_size: Size of a single level feature map.
-            scale: Grid cell scale.
-            stride: Down sample stride of the feature map.
-            offset: Offset of grid cells.
-        Returns:
-            Grid cells xyxy position. Size should be [feat_w * feat_h, 4]
-        """
-        cell_size = stride * scale
-        h, w = featmap_size
-        x_range = (np.arange(w, dtype=np.float32) + offset) * stride
-        y_range = (np.arange(h, dtype=np.float32) + offset) * stride
-        x, y = np.meshgrid(x_range, y_range)
-        y = y.flatten()
-        x = x.flatten()
-        grid_cells = np.stack(
-            [
-                x - 0.5 * cell_size, y - 0.5 * cell_size, x + 0.5 * cell_size,
-                y + 0.5 * cell_size
-            ],
-            axis=-1)
-        return grid_cells
-
-    def get_sample(self, assign_gt_inds, gt_bboxes):
-        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
-        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
-        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1
-
-        if gt_bboxes.size == 0:
-            # hack for index error case
-            assert pos_assigned_gt_inds.size == 0
-            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
-        else:
-            if len(gt_bboxes.shape) < 2:
-                # reshape, not resize: ndarray.resize works in place and
-                # returns None, which would clobber gt_bboxes here
-                gt_bboxes = gt_bboxes.reshape(-1, 4)
-            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
-        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
-
-    def __call__(self, samples, context=None):
-        assert len(samples) > 0
-        batch_size = len(samples)
-        # get grid cells of image
-        h, w = samples[0]['image'].shape[1:3]
-        multi_level_grid_cells = []
-        for stride in self.downsample_ratios:
-            featmap_size = (int(math.ceil(h / stride)),
-                            int(math.ceil(w / stride)))
-            multi_level_grid_cells.append(
-                self.get_grid_cells(featmap_size, self.grid_cell_scale, stride,
-                                    self.cell_offset))
-        mlvl_grid_cells_list = [
-            multi_level_grid_cells for i in range(batch_size)
-        ]
-        # pixel cell number of multi-level feature maps
-        num_level_cells = [
-            grid_cells.shape[0] for grid_cells in mlvl_grid_cells_list[0]
-        ]
-        num_level_cells_list = [num_level_cells] * batch_size
-        # concatenate all level cells into a single array
-        for i in range(batch_size):
-            mlvl_grid_cells_list[i] = np.concatenate(mlvl_grid_cells_list[i])
-        # target assign on all images
-        for sample, grid_cells, num_level_cells in zip(
-                samples, mlvl_grid_cells_list, num_level_cells_list):
-            gt_bboxes = sample['gt_bbox']
-            gt_labels = sample['gt_class'].squeeze()
-            if gt_labels.size == 1:
-                gt_labels = np.array([gt_labels]).astype(np.int32)
-            gt_bboxes_ignore = None
-            assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
                                              gt_bboxes, gt_bboxes_ignore,
-                                              gt_labels)
-
-            if self.compute_vlr_region:
-                vlr_region = self.assigner.get_vlr_region(
-                    grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore,
-                    gt_labels)
-                sample['vlr_regions'] = vlr_region
-
-            pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
-                assign_gt_inds, gt_bboxes)
-
-            num_cells = grid_cells.shape[0]
-            bbox_targets = np.zeros_like(grid_cells)
-            bbox_weights = np.zeros_like(grid_cells)
-            labels = np.ones([num_cells], dtype=np.int64) * self.num_classes
-            label_weights = np.zeros([num_cells], dtype=np.float32)
-
-            if len(pos_inds) > 0:
-                pos_bbox_targets = pos_gt_bboxes
-                bbox_targets[pos_inds, :] = pos_bbox_targets
-                bbox_weights[pos_inds, :] = 1.0
-                if not np.any(gt_labels):
-                    labels[pos_inds] = 0
-                else:
-                    labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
-
-                label_weights[pos_inds] = 1.0
-            if len(neg_inds) > 0:
-                label_weights[neg_inds] = 1.0
-            sample['grid_cells'] = grid_cells
-            sample['labels'] = labels
-            sample['label_weights'] = label_weights
-            sample['bbox_targets'] = bbox_targets
-            sample['pos_num'] = max(pos_inds.size, 1)
-            sample.pop('is_crowd', None)
-            sample.pop('difficult', None)
-            sample.pop('gt_class', None)
-            sample.pop('gt_bbox', None)
-            sample.pop('gt_score', None)
-        return 
samples - - -@register_op -class Gt2TTFTarget(BaseOperator): - __shared__ = ['num_classes'] - """ - Gt2TTFTarget - Generate TTFNet targets by ground truth data - - Args: - num_classes(int): the number of classes. - down_ratio(int): the down ratio from images to heatmap, 4 by default. - alpha(float): the alpha parameter to generate gaussian target. - 0.54 by default. - """ - - def __init__(self, num_classes=80, down_ratio=4, alpha=0.54): - super(Gt2TTFTarget, self).__init__() - self.down_ratio = down_ratio - self.num_classes = num_classes - self.alpha = alpha - - def __call__(self, samples, context=None): - output_size = samples[0]['image'].shape[1] - feat_size = output_size // self.down_ratio - for sample in samples: - heatmap = np.zeros( - (self.num_classes, feat_size, feat_size), dtype='float32') - box_target = np.ones( - (4, feat_size, feat_size), dtype='float32') * -1 - reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32') - - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - - bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1 - bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1 - area = bbox_w * bbox_h - boxes_areas_log = np.log(area) - boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1] - boxes_area_topk_log = boxes_areas_log[boxes_ind] - gt_bbox = gt_bbox[boxes_ind] - gt_class = gt_class[boxes_ind] - - feat_gt_bbox = gt_bbox / self.down_ratio - feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1) - feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1], - feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0]) - - ct_inds = np.stack( - [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2, - (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2], - axis=1) / self.down_ratio - - h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32') - w_radiuses_alpha = (feat_ws / 2. * self.alpha).astype('int32') - - for k in range(len(gt_bbox)): - cls_id = gt_class[k] - fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32') - self.draw_truncate_gaussian(fake_heatmap, ct_inds[k], - h_radiuses_alpha[k], - w_radiuses_alpha[k]) - - heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap) - box_target_inds = fake_heatmap > 0 - box_target[:, box_target_inds] = gt_bbox[k][:, None] - - local_heatmap = fake_heatmap[box_target_inds] - ct_div = np.sum(local_heatmap) - local_heatmap *= boxes_area_topk_log[k] - reg_weight[0, box_target_inds] = local_heatmap / ct_div - sample['ttf_heatmap'] = heatmap - sample['ttf_box_target'] = box_target - sample['ttf_reg_weight'] = reg_weight - sample.pop('is_crowd', None) - sample.pop('difficult', None) - sample.pop('gt_class', None) - sample.pop('gt_bbox', None) - sample.pop('gt_score', None) - return samples - - def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius): - h, w = 2 * h_radius + 1, 2 * w_radius + 1 - sigma_x = w / 6 - sigma_y = h / 6 - gaussian = gaussian2D((h, w), sigma_x, sigma_y) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, w_radius), min(width - x, w_radius + 1) - top, bottom = min(y, h_radius), min(height - y, h_radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius - - left:w_radius + right] - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - heatmap[y - top:y + bottom, x - left:x + right] = np.maximum( - masked_heatmap, masked_gaussian) - return heatmap - - -@register_op -class Gt2Solov2Target(BaseOperator): - """Assign mask target and labels in SOLOv2 network. 
- The code of this function is based on: - https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271 - Args: - num_grids (list): The list of feature map grids size. - scale_ranges (list): The list of mask boundary range. - coord_sigma (float): The coefficient of coordinate area length. - sampling_ratio (float): The ratio of down sampling. - """ - - def __init__(self, - num_grids=[40, 36, 24, 16, 12], - scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768], - [384, 2048]], - coord_sigma=0.2, - sampling_ratio=4.0): - super(Gt2Solov2Target, self).__init__() - self.num_grids = num_grids - self.scale_ranges = scale_ranges - self.coord_sigma = coord_sigma - self.sampling_ratio = sampling_ratio - - def _scale_size(self, im, scale): - h, w = im.shape[:2] - new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)) - resized_img = cv2.resize( - im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) - return resized_img - - def __call__(self, samples, context=None): - sample_id = 0 - max_ins_num = [0] * len(self.num_grids) - for sample in samples: - gt_bboxes_raw = sample['gt_bbox'] - gt_labels_raw = sample['gt_class'] + 1 - im_c, im_h, im_w = sample['image'].shape[:] - gt_masks_raw = sample['gt_segm'].astype(np.uint8) - mask_feat_size = [ - int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio) - ] - gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * - (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) - ins_ind_label_list = [] - idx = 0 - for (lower_bound, upper_bound), num_grid \ - in zip(self.scale_ranges, self.num_grids): - - hit_indices = ((gt_areas >= lower_bound) & - (gt_areas <= upper_bound)).nonzero()[0] - num_ins = len(hit_indices) - - ins_label = [] - grid_order = [] - cate_label = np.zeros([num_grid, num_grid], dtype=np.int64) - ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_) - - if num_ins == 0: - ins_label = np.zeros( - [1, mask_feat_size[0], mask_feat_size[1]], - dtype=np.uint8) - ins_ind_label_list.append(ins_ind_label) - sample['cate_label{}'.format(idx)] = cate_label.flatten() - sample['ins_label{}'.format(idx)] = ins_label - sample['grid_order{}'.format(idx)] = np.asarray( - [sample_id * num_grid * num_grid + 0], dtype=np.int32) - idx += 1 - continue - gt_bboxes = gt_bboxes_raw[hit_indices] - gt_labels = gt_labels_raw[hit_indices] - gt_masks = gt_masks_raw[hit_indices, ...] - - half_ws = 0.5 * ( - gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma - half_hs = 0.5 * ( - gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma - - for seg_mask, gt_label, half_h, half_w in zip( - gt_masks, gt_labels, half_hs, half_ws): - if seg_mask.sum() == 0: - continue - # mass center - upsampled_size = (mask_feat_size[0] * 4, - mask_feat_size[1] * 4) - center_h, center_w = ndimage.measurements.center_of_mass( - seg_mask) - coord_w = int( - (center_w / upsampled_size[1]) // (1. / num_grid)) - coord_h = int( - (center_h / upsampled_size[0]) // (1. / num_grid)) - - # left, top, right, down - top_box = max(0, - int(((center_h - half_h) / upsampled_size[0]) - // (1. / num_grid))) - down_box = min(num_grid - 1, - int(((center_h + half_h) / upsampled_size[0]) - // (1. / num_grid))) - left_box = max(0, - int(((center_w - half_w) / upsampled_size[1]) - // (1. / num_grid))) - right_box = min(num_grid - 1, - int(((center_w + half_w) / - upsampled_size[1]) // (1. 
/ num_grid))) - - top = max(top_box, coord_h - 1) - down = min(down_box, coord_h + 1) - left = max(coord_w - 1, left_box) - right = min(right_box, coord_w + 1) - - cate_label[top:(down + 1), left:(right + 1)] = gt_label - seg_mask = self._scale_size( - seg_mask, scale=1. / self.sampling_ratio) - for i in range(top, down + 1): - for j in range(left, right + 1): - label = int(i * num_grid + j) - cur_ins_label = np.zeros( - [mask_feat_size[0], mask_feat_size[1]], - dtype=np.uint8) - cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[ - 1]] = seg_mask - ins_label.append(cur_ins_label) - ins_ind_label[label] = True - grid_order.append(sample_id * num_grid * num_grid + - label) - if ins_label == []: - ins_label = np.zeros( - [1, mask_feat_size[0], mask_feat_size[1]], - dtype=np.uint8) - ins_ind_label_list.append(ins_ind_label) - sample['cate_label{}'.format(idx)] = cate_label.flatten() - sample['ins_label{}'.format(idx)] = ins_label - sample['grid_order{}'.format(idx)] = np.asarray( - [sample_id * num_grid * num_grid + 0], dtype=np.int32) - else: - ins_label = np.stack(ins_label, axis=0) - ins_ind_label_list.append(ins_ind_label) - sample['cate_label{}'.format(idx)] = cate_label.flatten() - sample['ins_label{}'.format(idx)] = ins_label - sample['grid_order{}'.format(idx)] = np.asarray( - grid_order, dtype=np.int32) - assert len(grid_order) > 0 - max_ins_num[idx] = max( - max_ins_num[idx], - sample['ins_label{}'.format(idx)].shape[0]) - idx += 1 - ins_ind_labels = np.concatenate([ - ins_ind_labels_level_img - for ins_ind_labels_level_img in ins_ind_label_list - ]) - fg_num = np.sum(ins_ind_labels) - sample['fg_num'] = fg_num - sample_id += 1 - - sample.pop('is_crowd') - sample.pop('gt_class') - sample.pop('gt_bbox') - sample.pop('gt_poly') - sample.pop('gt_segm') - - # padding batch - for data in samples: - for idx in range(len(self.num_grids)): - gt_ins_data = np.zeros( - [ - max_ins_num[idx], - data['ins_label{}'.format(idx)].shape[1], - data['ins_label{}'.format(idx)].shape[2] - ], - dtype=np.uint8) - gt_ins_data[0:data['ins_label{}'.format(idx)].shape[ - 0], :, :] = data['ins_label{}'.format(idx)] - gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32) - gt_grid_order[0:data['grid_order{}'.format(idx)].shape[ - 0]] = data['grid_order{}'.format(idx)] - data['ins_label{}'.format(idx)] = gt_ins_data - data['grid_order{}'.format(idx)] = gt_grid_order - - return samples - - -@register_op -class Gt2SparseTarget(BaseOperator): - def __init__(self, use_padding_shape=False): - super(Gt2SparseTarget, self).__init__() - self.use_padding_shape = use_padding_shape - - def __call__(self, samples, context=None): - for sample in samples: - ori_h, ori_w = sample['h'], sample['w'] - if self.use_padding_shape: - h, w = sample["image"].shape[1:3] - if "scale_factor" in sample: - sf_w, sf_h = sample["scale_factor"][1], sample[ - "scale_factor"][0] - sample["scale_factor_whwh"] = np.array( - [sf_w, sf_h, sf_w, sf_h], dtype=np.float32) - else: - sample["scale_factor_whwh"] = np.array( - [1.0, 1.0, 1.0, 1.0], dtype=np.float32) - else: - h, w = round(sample['im_shape'][0]), round(sample['im_shape'][ - 1]) - sample["scale_factor_whwh"] = np.array( - [w / ori_w, h / ori_h, w / ori_w, h / ori_h], - dtype=np.float32) - - sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32) - sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32) - - return samples - - -@register_op -class PadMaskBatch(BaseOperator): - """ - Pad a batch of samples so that they can be divisible by a stride. 
-    The layout of each image should be 'CHW'.
-    Args:
-        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
-            height and width are divisible by `pad_to_stride`.
-        return_pad_mask (bool): If `return_pad_mask = True`, return
-            `pad_mask` for transformer.
-    """
-
-    def __init__(self, pad_to_stride=0, return_pad_mask=True):
-        super(PadMaskBatch, self).__init__()
-        self.pad_to_stride = pad_to_stride
-        self.return_pad_mask = return_pad_mask
-
-    def __call__(self, samples, context=None):
-        """
-        Args:
-            samples (list): a batch of samples, each is a dict.
-        """
-        coarsest_stride = self.pad_to_stride
-
-        max_shape = np.array([data['image'].shape for data in samples]).max(
-            axis=0)
-        if coarsest_stride > 0:
-            max_shape[1] = int(
-                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
-            max_shape[2] = int(
-                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
-
-        for data in samples:
-            im = data['image']
-            im_c, im_h, im_w = im.shape[:]
-            padding_im = np.zeros(
-                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
-            padding_im[:, :im_h, :im_w] = im.astype(np.float32)
-            data['image'] = padding_im
-            if 'semantic' in data and data['semantic'] is not None:
-                semantic = data['semantic']
-                padding_sem = np.zeros(
-                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
-                padding_sem[:, :im_h, :im_w] = semantic
-                data['semantic'] = padding_sem
-            if 'gt_segm' in data and data['gt_segm'] is not None:
-                gt_segm = data['gt_segm']
-                padding_segm = np.zeros(
-                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
-                    dtype=np.uint8)
-                padding_segm[:, :im_h, :im_w] = gt_segm
-                data['gt_segm'] = padding_segm
-            if self.return_pad_mask:
-                padding_mask = np.zeros(
-                    (max_shape[1], max_shape[2]), dtype=np.float32)
-                padding_mask[:im_h, :im_w] = 1.
-                data['pad_mask'] = padding_mask
-
-        return samples
-
-
-@register_op
-class Gt2CenterNetTarget(BaseOperator):
-    __shared__ = ['num_classes']
-    """Gt2CenterNetTarget
-    Generate CenterNet targets by ground-truth
-    Args:
-        down_ratio (int): The down sample ratio between output feature and
-            input image.
-        num_classes (int): The number of classes, 80 by default.
-        max_objs (int): The maximum objects detected, 128 by default.
- """ - - def __init__(self, num_classes=80, down_ratio=4, max_objs=128): - super(Gt2CenterNetTarget, self).__init__() - self.nc = num_classes - self.down_ratio = down_ratio - self.max_objs = max_objs - - def __call__(self, sample, context=None): - input_h, input_w = sample['image'].shape[1:] - output_h = input_h // self.down_ratio - output_w = input_w // self.down_ratio - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - - hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) - wh = np.zeros((self.max_objs, 2), dtype=np.float32) - reg = np.zeros((self.max_objs, 2), dtype=np.float32) - ind = np.zeros((self.max_objs), dtype=np.int64) - reg_mask = np.zeros((self.max_objs), dtype=np.int32) - cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32) - cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32) - - trans_output = get_affine_transform( - center=sample['center'], - input_size=[sample['scale'], sample['scale']], - rot=0, - output_size=[output_w, output_h]) - - gt_det = [] - for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): - cls = int(cls) - bbox[:2] = affine_transform(bbox[:2], trans_output) - bbox[2:] = affine_transform(bbox[2:], trans_output) - bbox_amodal = copy.deepcopy(bbox) - bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) - bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) - h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] - if h > 0 and w > 0: - radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) - radius = max(0, int(radius)) - ct = np.array( - [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], - dtype=np.float32) - ct_int = ct.astype(np.int32) - - # get hm,wh,reg,ind,ind_mask - draw_umich_gaussian(hm[cls], ct_int, radius) - wh[i] = 1. * w, 1. * h - reg[i] = ct - ct_int - ind[i] = ct_int[1] * output_w + ct_int[0] - reg_mask[i] = 1 - cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i] - cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1 - gt_det.append([ - ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2, - 1, cls - ]) - - sample.pop('gt_bbox', None) - sample.pop('gt_class', None) - sample.pop('center', None) - sample.pop('scale', None) - sample.pop('is_crowd', None) - sample.pop('difficult', None) - - sample['index'] = ind - sample['index_mask'] = reg_mask - sample['heatmap'] = hm - sample['size'] = wh - sample['offset'] = reg - return sample - - -@register_op -class PadGT(BaseOperator): - """ - Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... - The num_max_boxes is the largest for batch. - Args: - return_gt_mask (bool): If true, return `pad_gt_mask`, - 1 means bbox, 0 means no bbox. - """ - - def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0): - super(PadGT, self).__init__() - self.return_gt_mask = return_gt_mask - self.pad_img = pad_img - self.minimum_gtnum = minimum_gtnum - - def _impad(self, - img: np.ndarray, - *, - shape=None, - padding=None, - pad_val=0, - padding_mode='constant') -> np.ndarray: - """Pad the given image to a certain shape or pad on all sides with - specified padding mode and padding value. - - Args: - img (ndarray): Image to be padded. - shape (tuple[int]): Expected padding shape (h, w). Default: None. - padding (int or tuple[int]): Padding on each border. If a single int is - provided this is used to pad all borders. If tuple of length 2 is - provided this is the padding on left/right and top/bottom - respectively. If a tuple of length 4 is provided this is the - padding for the left, top, right and bottom borders respectively. - Default: None. 
Note that `shape` and `padding` can not be both - set. - pad_val (Number | Sequence[Number]): Values to be filled in padding - areas when padding_mode is 'constant'. Default: 0. - padding_mode (str): Type of padding. Should be: constant, edge, - reflect or symmetric. Default: constant. - - constant: pads with a constant value, this value is specified - with pad_val. - - edge: pads with the last value at the edge of the image. - - reflect: pads with reflection of image without repeating the last - value on the edge. For example, padding [1, 2, 3, 4] with 2 - elements on both sides in reflect mode will result in - [3, 2, 1, 2, 3, 4, 3, 2]. - - symmetric: pads with reflection of image repeating the last value - on the edge. For example, padding [1, 2, 3, 4] with 2 elements on - both sides in symmetric mode will result in - [2, 1, 1, 2, 3, 4, 4, 3] - - Returns: - ndarray: The padded image. - """ - - assert (shape is not None) ^ (padding is not None) - if shape is not None: - width = max(shape[1] - img.shape[1], 0) - height = max(shape[0] - img.shape[0], 0) - padding = (0, 0, int(width), int(height)) - - # check pad_val - import numbers - if isinstance(pad_val, tuple): - assert len(pad_val) == img.shape[-1] - elif not isinstance(pad_val, numbers.Number): - raise TypeError('pad_val must be a int or a tuple. ' - f'But received {type(pad_val)}') - - # check padding - if isinstance(padding, tuple) and len(padding) in [2, 4]: - if len(padding) == 2: - padding = (padding[0], padding[1], padding[0], padding[1]) - elif isinstance(padding, numbers.Number): - padding = (padding, padding, padding, padding) - else: - raise ValueError('Padding must be a int or a 2, or 4 element tuple.' - f'But received {padding}') - - # check padding mode - assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] - - border_type = { - 'constant': cv2.BORDER_CONSTANT, - 'edge': cv2.BORDER_REPLICATE, - 'reflect': cv2.BORDER_REFLECT_101, - 'symmetric': cv2.BORDER_REFLECT - } - img = cv2.copyMakeBorder( - img, - padding[1], - padding[3], - padding[0], - padding[2], - border_type[padding_mode], - value=pad_val) - - return img - - def checkmaxshape(self, samples): - maxh, maxw = 0, 0 - for sample in samples: - h, w = sample['im_shape'] - if h > maxh: - maxh = h - if w > maxw: - maxw = w - return (maxh, maxw) - - def __call__(self, samples, context=None): - num_max_boxes = max([len(s['gt_bbox']) for s in samples]) - num_max_boxes = max(self.minimum_gtnum, num_max_boxes) - if self.pad_img: - maxshape = self.checkmaxshape(samples) - for sample in samples: - if self.pad_img: - img = sample['image'] - padimg = self._impad(img, shape=maxshape) - sample['image'] = padimg - if self.return_gt_mask: - sample['pad_gt_mask'] = np.zeros( - (num_max_boxes, 1), dtype=np.float32) - if num_max_boxes == 0: - continue - - num_gt = len(sample['gt_bbox']) - pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) - pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) - if num_gt > 0: - pad_gt_class[:num_gt] = sample['gt_class'] - pad_gt_bbox[:num_gt] = sample['gt_bbox'] - sample['gt_class'] = pad_gt_class - sample['gt_bbox'] = pad_gt_bbox - # pad_gt_mask - if 'pad_gt_mask' in sample: - sample['pad_gt_mask'][:num_gt] = 1 - # gt_score - if 'gt_score' in sample: - pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32) - if num_gt > 0: - pad_gt_score[:num_gt] = sample['gt_score'] - sample['gt_score'] = pad_gt_score - if 'is_crowd' in sample: - pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32) - if num_gt > 0: - 
pad_is_crowd[:num_gt] = sample['is_crowd']
-                sample['is_crowd'] = pad_is_crowd
-            if 'difficult' in sample:
-                pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32)
-                if num_gt > 0:
-                    pad_diff[:num_gt] = sample['difficult']
-                sample['difficult'] = pad_diff
-            if 'gt_joints' in sample:
-                num_joints = sample['gt_joints'].shape[1]
-                pad_gt_joints = np.zeros(
-                    (num_max_boxes, num_joints, 3), dtype=np.float32)
-                if num_gt > 0:
-                    pad_gt_joints[:num_gt] = sample['gt_joints']
-                sample['gt_joints'] = pad_gt_joints
-            if 'gt_areas' in sample:
-                pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
-                if num_gt > 0:
-                    pad_gt_areas[:num_gt, 0] = sample['gt_areas']
-                sample['gt_areas'] = pad_gt_areas
-        return samples
-
-
-@register_op
-class PadRGT(BaseOperator):
-    """
-    Pad 0 to `gt_class`, `gt_bbox`, `gt_score`...
-    num_max_boxes is the largest number of boxes in the batch.
-    Args:
-        return_gt_mask (bool): If true, return `pad_gt_mask`,
-                                1 means bbox, 0 means no bbox.
-    """
-
-    def __init__(self, return_gt_mask=True):
-        super(PadRGT, self).__init__()
-        self.return_gt_mask = return_gt_mask
-
-    def pad_field(self, sample, field, num_gt):
-        name, shape, dtype = field
-        if name in sample:
-            pad_v = np.zeros(shape, dtype=dtype)
-            if num_gt > 0:
-                pad_v[:num_gt] = sample[name]
-            sample[name] = pad_v
-
-    def __call__(self, samples, context=None):
-        num_max_boxes = max([len(s['gt_bbox']) for s in samples])
-        for sample in samples:
-            if self.return_gt_mask:
-                sample['pad_gt_mask'] = np.zeros(
-                    (num_max_boxes, 1), dtype=np.float32)
-            if num_max_boxes == 0:
-                continue
-
-            num_gt = len(sample['gt_bbox'])
-            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
-            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
-            if num_gt > 0:
-                pad_gt_class[:num_gt] = sample['gt_class']
-                pad_gt_bbox[:num_gt] = sample['gt_bbox']
-            sample['gt_class'] = pad_gt_class
-            sample['gt_bbox'] = pad_gt_bbox
-            # pad_gt_mask
-            if 'pad_gt_mask' in sample:
-                sample['pad_gt_mask'][:num_gt] = 1
-            # gt_score
-            names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']
-            dims = [1, 1, 1, 8, 5]
-            dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]
-
-            for name, dim, dtype in zip(names, dims, dtypes):
-                self.pad_field(sample, [name, (num_max_boxes, dim), dtype],
-                               num_gt)
-
-        return samples
-
-
-@register_op
-class Gt2CenterTrackTarget(BaseOperator):
-    __shared__ = ['num_classes']
-    """Gt2CenterTrackTarget
-    Generate CenterTrack targets by ground-truth
-    Args:
-        num_classes (int): The number of classes, 1 by default.
-        down_ratio (int): The down sample ratio between output feature and
-            input image.
-        max_objs (int): The maximum objects detected, 256 by default.
-    """
-
-    def __init__(self,
-                 num_classes=1,
-                 down_ratio=4,
-                 max_objs=256,
-                 hm_disturb=0.05,
-                 lost_disturb=0.4,
-                 fp_disturb=0.1,
-                 pre_hm=True,
-                 add_tracking=True,
-                 add_ltrb_amodal=True):
-        super(Gt2CenterTrackTarget, self).__init__()
-        self.nc = num_classes
-        self.down_ratio = down_ratio
-        self.max_objs = max_objs
-
-        self.hm_disturb = hm_disturb
-        self.lost_disturb = lost_disturb
-        self.fp_disturb = fp_disturb
-        self.pre_hm = pre_hm
-        self.add_tracking = add_tracking
-        self.add_ltrb_amodal = add_ltrb_amodal
-
-    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
-                      gt_class_pre, gt_track_id_pre):
-        hm_h, hm_w = input_h, input_w
-        return_hm = self.pre_hm
-        pre_hm = np.zeros(
-            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None
-        pre_cts, track_ids = [], []
-
-        for i, (
-                bbox, cls, track_id
-        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
-            cls = int(cls)
-            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
-            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
-            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
-            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
-            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
-            max_rad = 1
-            if (h > 0 and w > 0):
-                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
-                radius = max(0, int(radius))
-                max_rad = max(max_rad, radius)
-                ct = np.array(
-                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
-                    dtype=np.float32)
-                ct0 = ct.copy()
-                conf = 1
-
-                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
-                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
-                conf = 1 if np.random.rand() > self.lost_disturb else 0
-
-                ct_int = ct.astype(np.int32)
-                if conf == 0:
-                    pre_cts.append(ct / self.down_ratio)
-                else:
-                    pre_cts.append(ct0 / self.down_ratio)
-
-                track_ids.append(track_id)
-                if return_hm:
-                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
-
-                if np.random.rand() < self.fp_disturb and return_hm:
-                    ct2 = ct0.copy()
-                    # Hard-coded heatmap disturb ratio; other values have not been tried.
- ct2[0] = ct2[0] + np.random.randn() * 0.05 * w - ct2[1] = ct2[1] + np.random.randn() * 0.05 * h - ct2_int = ct2.astype(np.int32) - draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf) - return pre_hm, pre_cts, track_ids - - def __call__(self, sample, context=None): - input_h, input_w = sample['image'].shape[1:] - output_h = input_h // self.down_ratio - output_w = input_w // self.down_ratio - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - - # init - hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) - wh = np.zeros((self.max_objs, 2), dtype=np.float32) - reg = np.zeros((self.max_objs, 2), dtype=np.float32) - ind = np.zeros((self.max_objs), dtype=np.int64) - reg_mask = np.zeros((self.max_objs), dtype=np.int32) - if self.add_tracking: - tr = np.zeros((self.max_objs, 2), dtype=np.float32) - if self.add_ltrb_amodal: - ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32) - - trans_output = get_affine_transform( - center=sample['center'], - input_size=[sample['scale'], sample['scale']], - rot=0, - output_size=[output_w, output_h]) - - pre_hm, pre_cts, track_ids = self._get_pre_dets( - input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'], - sample['pre_gt_class'], sample['pre_gt_track_id']) - - for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): - cls = int(cls) - rect = np.array( - [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]], - [bbox[2], bbox[1]]], - dtype=np.float32) - for t in range(4): - rect[t] = affine_transform(rect[t], trans_output) - bbox[:2] = rect[:, 0].min(), rect[:, 1].min() - bbox[2:] = rect[:, 0].max(), rect[:, 1].max() - - bbox_amodal = copy.deepcopy(bbox) - bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) - bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) - - h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] - if h > 0 and w > 0: - radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) - radius = max(0, int(radius)) - ct = np.array( - [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], - dtype=np.float32) - ct_int = ct.astype(np.int32) - - # get hm,wh,reg,ind,ind_mask - draw_umich_gaussian(hm[cls], ct_int, radius) - wh[i] = 1. * w, 1. * h - reg[i] = ct - ct_int - ind[i] = ct_int[1] * output_w + ct_int[0] - reg_mask[i] = 1 - if self.add_tracking: - if sample['gt_track_id'][i] in track_ids: - pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][ - i])] - tr[i] = pre_ct - ct_int - - if self.add_ltrb_amodal: - ltrb_amodal[i] = \ - bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \ - bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1] - - new_sample = {'image': sample['image']} - new_sample['index'] = ind - new_sample['index_mask'] = reg_mask - new_sample['heatmap'] = hm - new_sample['size'] = wh - new_sample['offset'] = reg - if self.add_tracking: - new_sample['tracking'] = tr - if self.add_ltrb_amodal: - new_sample['ltrb_amodal'] = ltrb_amodal - - new_sample['pre_image'] = sample['pre_image'] - new_sample['pre_hm'] = pre_hm - - del sample - return new_sample - - -@register_op -class BatchRandomResizeForSSOD(BaseOperator): - """ - Resize image to target size randomly. 
The target size and interpolation method are selected at random.
-    Args:
-        target_size (int, list, tuple): image target size, if random size is True, must be list or tuple
-        keep_ratio (bool): whether to keep the aspect ratio or not, default true
-        interp (int): the interpolation method
-        random_size (bool): whether to randomly select the target size of the image
-        random_interp (bool): whether to randomly select the interpolation method
-    """
-
-    def __init__(self,
-                 target_size,
-                 keep_ratio,
-                 interp=cv2.INTER_NEAREST,
-                 random_size=True,
-                 random_interp=False):
-        super(BatchRandomResizeForSSOD, self).__init__()
-        self.keep_ratio = keep_ratio
-        self.interps = [
-            cv2.INTER_NEAREST,
-            cv2.INTER_LINEAR,
-            cv2.INTER_AREA,
-            cv2.INTER_CUBIC,
-            cv2.INTER_LANCZOS4,
-        ]
-        self.interp = interp
-        assert isinstance(target_size, (
-            int, Sequence)), "target_size must be int, list or tuple"
-        if random_size and not isinstance(target_size, list):
-            raise TypeError(
-                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
-                format(type(target_size)))
-        self.target_size = target_size
-        self.random_size = random_size
-        self.random_interp = random_interp
-
-    def __call__(self, samples, context=None):
-        index = 0  # default, so the returned index is defined when random_size is False
-        if self.random_size:
-            index = np.random.choice(len(self.target_size))
-            target_size = self.target_size[index]
-        else:
-            target_size = self.target_size
-        if context is not None:
-            target_size = self.target_size[context]
-        if self.random_interp:
-            interp = np.random.choice(self.interps)
-        else:
-            interp = self.interp
-
-        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
-        return [resizer(samples, context=context), index]
diff --git a/pdfdet/models/Paddle/ppdet/data/transform/culane_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/culane_operators.py
deleted file mode 100644
index 4790435..0000000
--- a/pdfdet/models/Paddle/ppdet/data/transform/culane_operators.py
+++ /dev/null
@@ -1,366 +0,0 @@
-import numpy as np
-import imgaug.augmenters as iaa
-from .operators import BaseOperator, register_op
-from ppdet.utils.logger import setup_logger
-from ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation
-
-logger = setup_logger(__name__)
-
-__all__ = [
-    "CULaneTrainProcess", "CULaneDataProcess", "HorizontalFlip",
-    "ChannelShuffle", "CULaneAffine", "CULaneResize", "OneOfBlur",
-    "MultiplyAndAddToBrightness", "AddToHueAndSaturation"
-]
-
-
-def trainTransforms(img_h, img_w):
-    transforms = [{
-        'name': 'Resize',
-        'parameters': dict(size=dict(
-            height=img_h, width=img_w)),
-        'p': 1.0
-    }, {
-        'name': 'HorizontalFlip',
-        'parameters': dict(p=1.0),
-        'p': 0.5
-    }, {
-        'name': 'ChannelShuffle',
-        'parameters': dict(p=1.0),
-        'p': 0.1
-    }, {
-        'name': 'MultiplyAndAddToBrightness',
-        'parameters': dict(
-            mul=(0.85, 1.15), add=(-10, 10)),
-        'p': 0.6
-    }, {
-        'name': 'AddToHueAndSaturation',
-        'parameters': dict(value=(-10, 10)),
-        'p': 0.7
-    }, {
-        'name': 'OneOf',
-        'transforms': [
-            dict(
-                name='MotionBlur', parameters=dict(k=(3, 5))), dict(
-                    name='MedianBlur', parameters=dict(k=(3, 5)))
-        ],
-        'p': 0.2
-    }, {
-        'name': 'Affine',
-        'parameters': dict(
-            translate_percent=dict(
-                x=(-0.1, 0.1), y=(-0.1, 0.1)),
-            rotate=(-10, 10),
-            scale=(0.8, 1.2)),
-        'p': 0.7
-    }, {
-        'name': 'Resize',
-        'parameters': dict(size=dict(
-            height=img_h, width=img_w)),
-        'p': 1.0
-    }]
-    return transforms
-
-
-@register_op
-class CULaneTrainProcess(BaseOperator):
-    def __init__(self, img_w, img_h):
-        super(CULaneTrainProcess, self).__init__()
-        self.img_w = img_w
-        self.img_h = img_h
-        self.transforms = 
trainTransforms(self.img_h, self.img_w)
-
-        if self.transforms is not None:
-            img_transforms = []
-            for aug in self.transforms:
-                p = aug['p']
-                if aug['name'] != 'OneOf':
-                    img_transforms.append(
-                        iaa.Sometimes(
-                            p=p,
-                            then_list=getattr(iaa, aug['name'])(**aug[
-                                'parameters'])))
-                else:
-                    img_transforms.append(
-                        iaa.Sometimes(
-                            p=p,
-                            then_list=iaa.OneOf([
-                                getattr(iaa, aug_['name'])(**aug_['parameters'])
-                                for aug_ in aug['transforms']
-                            ])))
-        else:
-            img_transforms = []
-        self.iaa_transform = iaa.Sequential(img_transforms)
-
-    def apply(self, sample, context=None):
-        img, line_strings, seg = self.iaa_transform(
-            image=sample['image'],
-            line_strings=sample['lanes'],
-            segmentation_maps=sample['mask'])
-        sample['image'] = img
-        sample['lanes'] = line_strings
-        sample['mask'] = seg
-        return sample
-
-
-@register_op
-class CULaneDataProcess(BaseOperator):
-    def __init__(self, img_w, img_h, num_points, max_lanes):
-        super(CULaneDataProcess, self).__init__()
-        self.img_w = img_w
-        self.img_h = img_h
-        self.num_points = num_points
-        self.n_offsets = num_points
-        self.n_strips = num_points - 1
-        self.strip_size = self.img_h / self.n_strips
-
-        self.max_lanes = max_lanes
-        self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size)
-
-    def apply(self, sample, context=None):
-        data = {}
-        line_strings = sample['lanes']
-        line_strings.clip_out_of_image_()
-        new_anno = {'lanes': linestrings_to_lanes(line_strings)}
-
-        for i in range(30):
-            try:
-                annos = transform_annotation(
-                    self.img_w, self.img_h, self.max_lanes, self.n_offsets,
-                    self.offsets_ys, self.n_strips, self.strip_size, new_anno)
-                label = annos['label']
-                lane_endpoints = annos['lane_endpoints']
-                break
-            except Exception:  # bare except would also swallow KeyboardInterrupt
-                if (i + 1) == 30:
-                    logger.critical('Transform annotation failed 30 times :(')
-                    exit()
-
-        sample['image'] = sample['image'].astype(np.float32) / 255.
- data['image'] = sample['image'].transpose(2, 0, 1) - data['lane_line'] = label - data['seg'] = sample['seg'] - data['full_img_path'] = sample['full_img_path'] - data['img_name'] = sample['img_name'] - data['im_id'] = sample['im_id'] - - if 'mask' in sample.keys(): - data['seg'] = sample['mask'].get_arr() - - data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32) - data['scale_factor'] = np.array([1., 1.], dtype=np.float32) - - return data - - -@register_op -class CULaneResize(BaseOperator): - def __init__(self, img_h, img_w, prob=0.5): - super(CULaneResize, self).__init__() - self.img_h = img_h - self.img_w = img_w - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes(self.prob, - iaa.Resize({ - "height": self.img_h, - "width": self.img_w - })) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'].copy().astype(np.uint8), - line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class HorizontalFlip(BaseOperator): - def __init__(self, prob=0.5): - super(HorizontalFlip, self).__init__() - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class ChannelShuffle(BaseOperator): - def __init__(self, prob=0.1): - super(ChannelShuffle, self).__init__() - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class MultiplyAndAddToBrightness(BaseOperator): - def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5): - super(MultiplyAndAddToBrightness, self).__init__() - self.mul = tuple(mul) - self.add = tuple(add) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, - iaa.MultiplyAndAddToBrightness( - mul=self.mul, add=self.add)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class AddToHueAndSaturation(BaseOperator): - def __init__(self, value=(-10, 10), prob=0.5): - super(AddToHueAndSaturation, self).__init__() - 
self.value = tuple(value) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, iaa.AddToHueAndSaturation(value=self.value)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class OneOfBlur(BaseOperator): - def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5): - super(OneOfBlur, self).__init__() - self.MotionBlur_k = tuple(MotionBlur_k) - self.MedianBlur_k = tuple(MedianBlur_k) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, - iaa.OneOf([ - iaa.MotionBlur(k=self.MotionBlur_k), - iaa.MedianBlur(k=self.MedianBlur_k) - ])) - - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class CULaneAffine(BaseOperator): - def __init__(self, - translate_percent_x=(-0.1, 0.1), - translate_percent_y=(-0.1, 0.1), - rotate=(3, 5), - scale=(0.8, 1.2), - prob=0.5): - super(CULaneAffine, self).__init__() - self.translate_percent = { - 'x': tuple(translate_percent_x), - 'y': tuple(translate_percent_y) - } - self.rotate = tuple(rotate) - self.scale = tuple(scale) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, - iaa.Affine( - translate_percent=self.translate_percent, - rotate=self.rotate, - scale=self.scale)) - - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/transform/gridmask_utils.py b/pdfdet/models/Paddle/ppdet/data/transform/gridmask_utils.py deleted file mode 100644 index c187015..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/gridmask_utils.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
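The CULane operators above all share one pattern: wrap a single imgaug augmenter in iaa.Sometimes(prob, ...) and apply it jointly to the image, the lane LineStrings, and the optional segmentation mask, so all three stay geometrically aligned. A minimal self-contained sketch of that pattern (the image size and lane coordinates are illustrative assumptions, not values from this repo):

import numpy as np
import imgaug.augmenters as iaa
from imgaug.augmentables.lines import LineString, LineStringsOnImage

# A dummy 320x800 image with one lane annotated as a line string.
image = np.zeros((320, 800, 3), dtype=np.uint8)
lanes = LineStringsOnImage(
    [LineString([(100, 310), (400, 120)])], shape=image.shape)

# Equivalent of CULaneResize(img_h=160, img_w=400, prob=1.0): image and line
# strings are resized together, so the lane coordinates are rescaled as well.
resize = iaa.Sometimes(1.0, iaa.Resize({"height": 160, "width": 400}))
image_aug, lanes_aug = resize(image=image, line_strings=lanes)
assert image_aug.shape[:2] == (160, 400)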
- -# The code is based on: -# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import numpy as np -from PIL import Image - - -class Gridmask(object): - def __init__(self, - use_h=True, - use_w=True, - rotate=1, - offset=False, - ratio=0.5, - mode=1, - prob=0.7, - upper_iter=360000): - super(Gridmask, self).__init__() - self.use_h = use_h - self.use_w = use_w - self.rotate = rotate - self.offset = offset - self.ratio = ratio - self.mode = mode - self.prob = prob - self.st_prob = prob - self.upper_iter = upper_iter - - def __call__(self, x, curr_iter): - self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter) - if np.random.rand() > self.prob: - return x - h, w, _ = x.shape - hh = int(1.5 * h) - ww = int(1.5 * w) - d = np.random.randint(2, h) - self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) - mask = np.ones((hh, ww), np.float32) - st_h = np.random.randint(d) - st_w = np.random.randint(d) - if self.use_h: - for i in range(hh // d): - s = d * i + st_h - t = min(s + self.l, hh) - mask[s:t, :] *= 0 - if self.use_w: - for i in range(ww // d): - s = d * i + st_w - t = min(s + self.l, ww) - mask[:, s:t] *= 0 - - r = np.random.randint(self.rotate) - mask = Image.fromarray(np.uint8(mask)) - mask = mask.rotate(r) - mask = np.asarray(mask) - mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 - + w].astype(np.float32) - - if self.mode == 1: - mask = 1 - mask - mask = np.expand_dims(mask, axis=-1) - if self.offset: - offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32) - x = (x * mask + offset * (1 - mask)).astype(x.dtype) - else: - x = (x * mask).astype(x.dtype) - - return x diff --git a/pdfdet/models/Paddle/ppdet/data/transform/keypoint_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/keypoint_operators.py deleted file mode 100644 index d29aa23..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/keypoint_operators.py +++ /dev/null @@ -1,1742 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
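A minimal usage sketch for the Gridmask class above (the image size and iteration numbers are illustrative assumptions): the effective probability ramps linearly from 0 up to prob over upper_iter iterations, so the grid pattern is dropped onto images more often as training progresses.

import numpy as np

# Assumes the Gridmask class defined above is in scope.
gridmask = Gridmask(ratio=0.5, mode=1, prob=0.7, upper_iter=360000)
image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8)  # HWC input

# Halfway through training the gate probability is 0.7 * 0.5 = 0.35; when it
# fires, a periodic grid pattern of the image is zeroed out.
masked = gridmask(image, curr_iter=180000)
assert masked.shape == image.shape and masked.dtype == image.dtype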
-
-# function:
-#    operators to process sample,
-#    eg: decode/resize/crop image
-
-from __future__ import absolute_import
-
-try:
-    from collections.abc import Sequence
-except Exception:
-    from collections import Sequence
-
-import cv2
-import numpy as np
-import math
-import copy
-
-from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
-from ppdet.core.workspace import serializable
-from ppdet.utils.logger import setup_logger
-logger = setup_logger(__name__)
-
-registered_ops = []
-
-__all__ = [
-    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
-    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
-    'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter', 'TopDownGetRandomScaleRotation',
-    'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',
-    'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
-    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
-    'FlipPose', 'PETR_Resize'
-]
-
-
-def register_keypointop(cls):
-    return serializable(cls)
-
-
-@register_keypointop
-class KeyPointFlip(object):
-    """Flip the image horizontally with probability flip_prob, and flip the
-    coords as well. The left and right coords must be exchanged on flip, since
-    a right keypoint becomes a left keypoint after the image is flipped.
-
-    Args:
-        flip_permutation (list[17]): the left-right exchange order list corresponding to [0,1,2,...,16]
-        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
-        flip_prob (float): the probability of flipping the image
-        records(dict): the dict containing the image, mask and coords
-
-    Returns:
-        records(dict): the dict containing the image, mask and coords after the transform
-
-    """
-
-    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
-        super(KeyPointFlip, self).__init__()
-        assert isinstance(flip_permutation, Sequence)
-        self.flip_permutation = flip_permutation
-        self.flip_prob = flip_prob
-        self.hmsize = hmsize
-
-    def _flipjoints(self, records, sizelst):
-        '''
-        records['gt_joints'] is Sequence in higherhrnet
-        '''
-        if not ('gt_joints' in records and len(records['gt_joints']) > 0):
-            return records
-
-        kpts_lst = records['gt_joints']
-        if isinstance(kpts_lst, Sequence):
-            for idx, hmsize in enumerate(sizelst):
-                if kpts_lst[idx].ndim == 3:
-                    kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
-                else:
-                    kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
-                kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
-        else:
-            hmsize = sizelst[0]
-            if kpts_lst.ndim == 3:
-                kpts_lst = kpts_lst[:, self.flip_permutation]
-            else:
-                kpts_lst = kpts_lst[self.flip_permutation]
-            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]
-
-        records['gt_joints'] = kpts_lst
-        return records
-
-    def _flipmask(self, records, sizelst):
-        if not 'mask' in records:
-            return records
-
-        mask_lst = records['mask']
-        for idx, hmsize in enumerate(sizelst):
-            if len(mask_lst) > idx:
-                mask_lst[idx] = mask_lst[idx][:, ::-1]
-        records['mask'] = mask_lst
-        return records
-
-    def _flipbbox(self, records, sizelst):
-        if not 'gt_bbox' in records:
-            return records
-
-        bboxes = records['gt_bbox']
-        hmsize = sizelst[0]
-        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
-        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
-        records['gt_bbox'] = bboxes
-        return records
-
-    def __call__(self, records):
-        flip = np.random.random() < self.flip_prob
-        if flip:
-            image = records['image']
-            image = image[:, ::-1]
-            records['image'] = image
-            if self.hmsize is None:
-                sizelst = 
[image.shape[1]] - else: - sizelst = self.hmsize - self._flipjoints(records, sizelst) - self._flipmask(records, sizelst) - self._flipbbox(records, sizelst) - - return records - - -@register_keypointop -class RandomAffine(object): - """apply affine transform to image, mask and coords - to achieve the rotate, scale and shift effect for training image - - Args: - max_degree (float): the max abslute rotate degree to apply, transform range is [-max_degree, max_degree] - max_scale (list[2]): the scale range to apply, transform range is [min, max] - max_shift (float): the max abslute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize] - hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet - trainsize (list[2]): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard - scale_type (str): the length of [h,w] to used for trainsize, chosed between 'short' and 'long' - records(dict): the dict contained the image, mask and coords - - Returns: - records(dict): contain the image, mask and coords after tranformed - - """ - - def __init__(self, - max_degree=30, - scale=[0.75, 1.5], - max_shift=0.2, - hmsize=None, - trainsize=[512, 512], - scale_type='short', - boldervalue=[114, 114, 114]): - super(RandomAffine, self).__init__() - self.max_degree = max_degree - self.min_scale = scale[0] - self.max_scale = scale[1] - self.max_shift = max_shift - self.hmsize = hmsize - self.trainsize = trainsize - self.scale_type = scale_type - self.boldervalue = boldervalue - - def _get_affine_matrix_old(self, center, scale, res, rot=0): - """Generate transformation matrix.""" - h = scale - t = np.zeros((3, 3), dtype=np.float32) - t[0, 0] = float(res[1]) / h - t[1, 1] = float(res[0]) / h - t[0, 2] = res[1] * (-float(center[0]) / h + .5) - t[1, 2] = res[0] * (-float(center[1]) / h + .5) - t[2, 2] = 1 - if rot != 0: - rot = -rot # To match direction of rotation from cropping - rot_mat = np.zeros((3, 3), dtype=np.float32) - rot_rad = rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - rot_mat[0, :2] = [cs, -sn] - rot_mat[1, :2] = [sn, cs] - rot_mat[2, 2] = 1 - # Need to rotate around center - t_mat = np.eye(3) - t_mat[0, 2] = -res[1] / 2 - t_mat[1, 2] = -res[0] / 2 - t_inv = t_mat.copy() - t_inv[:2, 2] *= -1 - t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t))) - return t - - def _get_affine_matrix(self, center, scale, res, rot=0): - """Generate transformation matrix.""" - w, h = scale - t = np.zeros((3, 3), dtype=np.float32) - t[0, 0] = float(res[0]) / w - t[1, 1] = float(res[1]) / h - t[0, 2] = res[0] * (-float(center[0]) / w + .5) - t[1, 2] = res[1] * (-float(center[1]) / h + .5) - t[2, 2] = 1 - if rot != 0: - rot = -rot # To match direction of rotation from cropping - rot_mat = np.zeros((3, 3), dtype=np.float32) - rot_rad = rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - rot_mat[0, :2] = [cs, -sn] - rot_mat[1, :2] = [sn, cs] - rot_mat[2, 2] = 1 - # Need to rotate around center - t_mat = np.eye(3) - t_mat[0, 2] = -res[0] / 2 - t_mat[1, 2] = -res[1] / 2 - t_inv = t_mat.copy() - t_inv[:2, 2] *= -1 - t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t))) - return t - - def _affine_joints_mask(self, - degree, - center, - roi_size, - dsize, - keypoints=None, - heatmap_mask=None, - gt_bbox=None): - kpts = None - mask = None - bbox = None - mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize, - degree)[:2] - if heatmap_mask is not None: - mask = cv2.warpAffine(heatmap_mask, 
mask_affine_mat, dsize) - mask = ((mask / 255) > 0.5).astype(np.float32) - if keypoints is not None: - kpts = copy.deepcopy(keypoints) - kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(), - mask_affine_mat) - kpts[(kpts[..., 0]) > dsize[0], :] = 0 - kpts[(kpts[..., 1]) > dsize[1], :] = 0 - kpts[(kpts[..., 0]) < 0, :] = 0 - kpts[(kpts[..., 1]) < 0, :] = 0 - if gt_bbox is not None: - temp_bbox = gt_bbox[:, [0, 3, 2, 1]] - cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1) - gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat) - bbox = np.zeros_like(gt_bbox) - bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0]) - bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0]) - bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1]) - bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1]) - return kpts, mask, bbox - - def __call__(self, records): - image = records['image'] - shape = np.array(image.shape[:2][::-1]) - keypoints = None - heatmap_mask = None - gt_bbox = None - if 'gt_joints' in records: - keypoints = records['gt_joints'] - - if 'mask' in records: - heatmap_mask = records['mask'] - heatmap_mask *= 255 - - if 'gt_bbox' in records: - gt_bbox = records['gt_bbox'] - - degree = (np.random.random() * 2 - 1) * self.max_degree - center = center = np.array((np.array(shape) / 2)) - - aug_scale = np.random.random() * (self.max_scale - self.min_scale - ) + self.min_scale - if self.scale_type == 'long': - scale = np.array([max(shape[0], shape[1]) / 1.0] * 2) - elif self.scale_type == 'short': - scale = np.array([min(shape[0], shape[1]) / 1.0] * 2) - elif self.scale_type == 'wh': - scale = shape - else: - raise ValueError('Unknown scale type: {}'.format(self.scale_type)) - roi_size = aug_scale * scale - dx = int(0) - dy = int(0) - if self.max_shift > 0: - - dx = np.random.randint(-self.max_shift * roi_size[0], - self.max_shift * roi_size[0]) - dy = np.random.randint(-self.max_shift * roi_size[0], - self.max_shift * roi_size[1]) - - center += np.array([dx, dy]) - input_size = 2 * center - if self.trainsize != -1: - dsize = self.trainsize - imgshape = (dsize) - else: - dsize = scale - imgshape = (shape.tolist()) - - image_affine_mat = self._get_affine_matrix(center, roi_size, dsize, - degree)[:2] - image = cv2.warpAffine( - image, - image_affine_mat, - imgshape, - flags=cv2.INTER_LINEAR, - borderValue=self.boldervalue) - - if self.hmsize is None: - kpts, mask, gt_bbox = self._affine_joints_mask( - degree, center, roi_size, dsize, keypoints, heatmap_mask, - gt_bbox) - records['image'] = image - if kpts is not None: records['gt_joints'] = kpts - if mask is not None: records['mask'] = mask - if gt_bbox is not None: records['gt_bbox'] = gt_bbox - return records - - kpts_lst = [] - mask_lst = [] - for hmsize in self.hmsize: - kpts, mask, gt_bbox = self._affine_joints_mask( - degree, center, roi_size, [hmsize, hmsize], keypoints, - heatmap_mask, gt_bbox) - kpts_lst.append(kpts) - mask_lst.append(mask) - records['image'] = image - - if 'gt_joints' in records: - records['gt_joints'] = kpts_lst - if 'mask' in records: - records['mask'] = mask_lst - if 'gt_bbox' in records: - records['gt_bbox'] = gt_bbox - return records - - -@register_keypointop -class EvalAffine(object): - """apply affine transform to image - resize the short of [h,w] to standard size for eval - - Args: - size (int): the standard length used to train, the 'short' of [h,w] will be resize to trainsize for standard - records(dict): the dict contained the image, mask and coords - - Returns: - 
records(dict): the dict containing the image, mask and coords after the transform
-
-    """
-
-    def __init__(self, size, stride=64):
-        super(EvalAffine, self).__init__()
-        self.size = size
-        self.stride = stride
-
-    def __call__(self, records):
-        image = records['image']
-        mask = records['mask'] if 'mask' in records else None
-        s = self.size
-        h, w, _ = image.shape
-        trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False)
-        image_resized = cv2.warpAffine(image, trans, size_resized)
-        if mask is not None:
-            mask = cv2.warpAffine(mask, trans, size_resized)
-            records['mask'] = mask
-        if 'gt_joints' in records:
-            del records['gt_joints']
-        records['image'] = image_resized
-        records['scale_factor'] = self.size / min(h, w)
-        return records
-
-
-@register_keypointop
-class NormalizePermute(object):
-    def __init__(self,
-                 mean=[123.675, 116.28, 103.53],
-                 std=[58.395, 57.120, 57.375],
-                 is_scale=True):
-        super(NormalizePermute, self).__init__()
-        self.mean = mean
-        self.std = std
-        self.is_scale = is_scale
-
-    def __call__(self, records):
-        image = records['image']
-        image = image.astype(np.float32)
-        if self.is_scale:
-            image /= 255.
-        image = image.transpose((2, 0, 1))
-        mean = np.array(self.mean, dtype=np.float32)
-        std = np.array(self.std, dtype=np.float32)
-        invstd = 1. / std
-        for v, m, s in zip(image, mean, invstd):
-            v.__isub__(m).__imul__(s)
-        records['image'] = image
-        return records
-
-
-@register_keypointop
-class TagGenerate(object):
-    """record gt coords for aeloss to sample coords value in tagmaps
-
-    Args:
-        num_joints (int): the keypoint numbers of dataset to train
-        max_people (int): maximum number of people supported when sampling the AE loss
-        records(dict): the dict containing the image, mask and coords
-
-    Returns:
-        records(dict): contain the gt coords used in tagmap
-
-    """
-
-    def __init__(self, num_joints, max_people=30):
-        super(TagGenerate, self).__init__()
-        self.max_people = max_people
-        self.num_joints = num_joints
-
-    def __call__(self, records):
-        kpts_lst = records['gt_joints']
-        kpts = kpts_lst[0]
-        tagmap = np.zeros((self.max_people, self.num_joints, 4), dtype=np.int64)
-        inds = np.where(kpts[..., 2] > 0)
-        p, j = inds[0], inds[1]
-        visible = kpts[inds]
-        # tagmap is [p, j, 4], where the last dim stores (j, y, x, valid flag)
-        tagmap[p, j, 0] = j
-        tagmap[p, j, 1] = visible[..., 1]  # y
-        tagmap[p, j, 2] = visible[..., 0]  # x
-        tagmap[p, j, 3] = 1
-        records['tagmap'] = tagmap
-        del records['gt_joints']
-        return records
-
-
-@register_keypointop
-class ToHeatmaps(object):
-    """generate the Gaussian heatmaps of keypoints for the heatmap loss
-
-    Args:
-        num_joints (int): the keypoint numbers of dataset to train
-        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
-        sigma (float): the std of the generated Gaussian kernel
-        records(dict): the dict containing the image, mask and coords
-
-    Returns:
-        records(dict): contain the heatmaps used by the heatmap loss
-
-    """
-
-    def __init__(self, num_joints, hmsize, sigma=None):
-        super(ToHeatmaps, self).__init__()
-        self.num_joints = num_joints
-        self.hmsize = np.array(hmsize)
-        if sigma is None:
-            sigma = hmsize[0] // 64
-        self.sigma = sigma
-
-        r = 6 * sigma + 3
-        x = np.arange(0, r, 1, np.float32)
-        y = x[:, None]
-        x0, y0 = 3 * sigma + 1, 3 * sigma + 1
-        self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
-
-    def __call__(self, records):
-        kpts_lst = records['gt_joints']
-        mask_lst = records['mask']
-        for idx, hmsize in enumerate(self.hmsize):
-            mask = mask_lst[idx]
-            kpts = kpts_lst[idx]
-            heatmaps = np.zeros((self.num_joints, 
hmsize, hmsize)) - inds = np.where(kpts[..., 2] > 0) - visible = kpts[inds].astype(np.int64)[..., :2] - ul = np.round(visible - 3 * self.sigma - 1) - br = np.round(visible + 3 * self.sigma + 2) - sul = np.maximum(0, -ul) - sbr = np.minimum(hmsize, br) - ul - dul = np.clip(ul, 0, hmsize - 1) - dbr = np.clip(br, 0, hmsize) - for i in range(len(visible)): - if visible[i][0] < 0 or visible[i][1] < 0 or visible[i][ - 0] >= hmsize or visible[i][1] >= hmsize: - continue - dx1, dy1 = dul[i] - dx2, dy2 = dbr[i] - sx1, sy1 = sul[i] - sx2, sy2 = sbr[i] - heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum( - self.gaussian[sy1:sy2, sx1:sx2], - heatmaps[inds[1][i], dy1:dy2, dx1:dx2]) - records['heatmap_gt{}x'.format(idx + 1)] = heatmaps - records['mask_{}x'.format(idx + 1)] = mask - del records['mask'] - return records - - -@register_keypointop -class RandomFlipHalfBodyTransform(object): - """apply data augment to image and coords - to achieve the flip, scale, rotate and half body transform effect for training image - - Args: - trainsize (list):[w, h], Image target size - upper_body_ids (list): The upper body joint ids - flip_pairs (list): The left-right joints exchange order list - pixel_std (int): The pixel std of the scale - scale (float): The scale factor to transform the image - rot (int): The rotate factor to transform the image - num_joints_half_body (int): The joints threshold of the half body transform - prob_half_body (float): The threshold of the half body transform - flip (bool): Whether to flip the image - - Returns: - records(dict): contain the image and coords after tranformed - - """ - - def __init__(self, - trainsize, - upper_body_ids, - flip_pairs, - pixel_std, - scale=0.35, - rot=40, - num_joints_half_body=8, - prob_half_body=0.3, - flip=True, - rot_prob=0.6): - super(RandomFlipHalfBodyTransform, self).__init__() - self.trainsize = trainsize - self.upper_body_ids = upper_body_ids - self.flip_pairs = flip_pairs - self.pixel_std = pixel_std - self.scale = scale - self.rot = rot - self.num_joints_half_body = num_joints_half_body - self.prob_half_body = prob_half_body - self.flip = flip - self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] - self.rot_prob = rot_prob - - def halfbody_transform(self, joints, joints_vis): - upper_joints = [] - lower_joints = [] - for joint_id in range(joints.shape[0]): - if joints_vis[joint_id][0] > 0: - if joint_id in self.upper_body_ids: - upper_joints.append(joints[joint_id]) - else: - lower_joints.append(joints[joint_id]) - if np.random.randn() < 0.5 and len(upper_joints) > 2: - selected_joints = upper_joints - else: - selected_joints = lower_joints if len( - lower_joints) > 2 else upper_joints - if len(selected_joints) < 2: - return None, None - selected_joints = np.array(selected_joints, dtype=np.float32) - center = selected_joints.mean(axis=0)[:2] - left_top = np.amin(selected_joints, axis=0) - right_bottom = np.amax(selected_joints, axis=0) - w = right_bottom[0] - left_top[0] - h = right_bottom[1] - left_top[1] - if w > self.aspect_ratio * h: - h = w * 1.0 / self.aspect_ratio - elif w < self.aspect_ratio * h: - w = h * self.aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - scale = scale * 1.5 - - return center, scale - - def flip_joints(self, joints, joints_vis, width, matched_parts): - joints[:, 0] = width - joints[:, 0] - 1 - for pair in matched_parts: - joints[pair[0], :], joints[pair[1], :] = \ - joints[pair[1], :], joints[pair[0], :].copy() - joints_vis[pair[0], :], 
joints_vis[pair[1], :] = \ - joints_vis[pair[1], :], joints_vis[pair[0], :].copy() - - return joints * joints_vis, joints_vis - - def __call__(self, records): - image = records['image'] - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - c = records['center'] - s = records['scale'] - r = 0 - if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and - np.random.rand() < self.prob_half_body): - c_half_body, s_half_body = self.halfbody_transform(joints, - joints_vis) - if c_half_body is not None and s_half_body is not None: - c, s = c_half_body, s_half_body - sf = self.scale - rf = self.rot - s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - r = np.clip(np.random.randn() * rf, -rf * 2, - rf * 2) if np.random.random() <= self.rot_prob else 0 - - if self.flip and np.random.random() <= 0.5: - image = image[:, ::-1, :] - joints, joints_vis = self.flip_joints( - joints, joints_vis, image.shape[1], self.flip_pairs) - c[0] = image.shape[1] - c[0] - 1 - records['image'] = image - records['gt_joints'] = joints - records['joints_vis'] = joints_vis - records['center'] = c - records['scale'] = s - records['rotate'] = r - - return records - - -@register_keypointop -class AugmentationbyInformantionDropping(object): - """AID: Augmentation by Informantion Dropping. Please refer - to https://arxiv.org/abs/2008.07139 - - Args: - prob_cutout (float): The probability of the Cutout augmentation. - offset_factor (float): Offset factor of cutout center. - num_patch (int): Number of patches to be cutout. - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, - trainsize, - prob_cutout=0.0, - offset_factor=0.2, - num_patch=1): - self.prob_cutout = prob_cutout - self.offset_factor = offset_factor - self.num_patch = num_patch - self.trainsize = trainsize - - def _cutout(self, img, joints, joints_vis): - height, width, _ = img.shape - img = img.reshape((height * width, -1)) - feat_x_int = np.arange(0, width) - feat_y_int = np.arange(0, height) - feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int) - feat_x_int = feat_x_int.reshape((-1, )) - feat_y_int = feat_y_int.reshape((-1, )) - for _ in range(self.num_patch): - vis_idx, _ = np.where(joints_vis > 0) - occlusion_joint_id = np.random.choice(vis_idx) - center = joints[occlusion_joint_id, 0:2] - offset = np.random.randn(2) * self.trainsize[0] * self.offset_factor - center = center + offset - radius = np.random.uniform(0.1, 0.2) * self.trainsize[0] - x_offset = (center[0] - feat_x_int) / radius - y_offset = (center[1] - feat_y_int) / radius - dis = x_offset**2 + y_offset**2 - keep_pos = np.where((dis <= 1) & (dis >= 0))[0] - img[keep_pos, :] = 0 - img = img.reshape((height, width, -1)) - return img - - def __call__(self, records): - img = records['image'] - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - if np.random.rand() < self.prob_cutout: - img = self._cutout(img, joints, joints_vis) - records['image'] = img - return records - - -@register_keypointop -class TopDownRandomFlip(object): - """Data augmentation with random image flip. - - Args: - flip_perm: (list[tuple]): Pairs of keypoints which are mirrored - (for example, left ear and right ear). - flip_prob (float): Probability of flip. 
- """ - - def __init__(self, flip_perm=[], flip_prob=0.5): - self.flip_perm = flip_perm - self.flip_prob = flip_prob - - def flip_joints(self, joints_3d, joints_3d_visible, img_width, flip_pairs): - assert len(joints_3d) == len(joints_3d_visible) - assert img_width > 0 - - joints_3d_flipped = joints_3d.copy() - joints_3d_visible_flipped = joints_3d_visible.copy() - - # Swap left-right parts - for left, right in flip_pairs: - joints_3d_flipped[left, :] = joints_3d[right, :] - joints_3d_flipped[right, :] = joints_3d[left, :] - - joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] - joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] - - # Flip horizontally - joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] - joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0) - - return joints_3d_flipped, joints_3d_visible_flipped - - def __call__(self, results): - """Perform data augmentation with random image flip.""" - if np.random.rand() <= self.flip_prob: - return results - - img = results['image'] - joints_3d = results['gt_joints'] - joints_3d_visible = results['joints_vis'] - center = results['center'] - - # A flag indicating whether the image is flipped, - # which can be used by child class. - if not isinstance(img, list): - img = img[:, ::-1, :] - else: - img = [i[:, ::-1, :] for i in img] - if not isinstance(img, list): - joints_3d, joints_3d_visible = self.flip_joints( - joints_3d, joints_3d_visible, img.shape[1], - self.flip_perm) - center[0] = img.shape[1] - center[0] - 1 - else: - joints_3d, joints_3d_visible = self.flip_joints( - joints_3d, joints_3d_visible, img[0].shape[1], - self.flip_perm) - center[0] = img[0].shape[1] - center[0] - 1 - - results['image'] = img - results['gt_joints'] = joints_3d - results['joints_vis'] = joints_3d_visible - results['center'] = center - - return results - - -@register_keypointop -class TopDownRandomShiftBboxCenter(object): - """Random shift the bbox center. - - Args: - shift_factor (float): The factor to control the shift range, which is - scale*pixel_std*scale_factor. Default: 0.16 - shift_prob (float): Probability of applying random shift. Default: 0.3 - """ - - def __init__(self, shift_factor=0.16, shift_prob=0.3): - self.shift_factor = shift_factor - self.shift_prob = shift_prob - - def __call__(self, results): - center = results['center'] - scale = results['scale'] - if np.random.rand() < self.shift_prob: - center += np.random.uniform( - -1, 1, 2) * self.shift_factor * scale * 200.0 - - results['center'] = center - return results - -@register_keypointop -class TopDownGetRandomScaleRotation(object): - """Data augmentation with random scaling & rotating. - - Args: - rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. - scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. - rot_prob (float): Probability of random rotation. 
- """ - - def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): - self.rot_factor = rot_factor - self.scale_factor = scale_factor - self.rot_prob = rot_prob - - def __call__(self, results): - """Perform data augmentation with random scaling & rotating.""" - s = results['scale'] - - sf = self.scale_factor - rf = self.rot_factor - - s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - s = s * s_factor - - r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) - r = r_factor if np.random.rand() <= self.rot_prob else 0 - - results['scale'] = s - results['rotate'] = r - - return results - - -@register_keypointop -class TopDownAffine(object): - """apply affine transform to image and coords - - Args: - trainsize (list): [w, h], the standard size used to train - use_udp (bool): whether to use Unbiased Data Processing. - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, trainsize, use_udp=False): - self.trainsize = trainsize - self.use_udp = use_udp - - def __call__(self, records): - image = records['image'] - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - rot = records['rotate'] if "rotate" in records else 0 - if self.use_udp: - trans = get_warp_matrix( - rot, records['center'] * 2.0, - [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], - records['scale'] * 200.0) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans) - else: - trans = get_affine_transform(records['center'], records['scale'] * - 200, rot, self.trainsize) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - for i in range(joints.shape[0]): - if joints_vis[i, 0] > 0.0: - joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) - - records['image'] = image - records['gt_joints'] = joints - - return records - - -@register_keypointop -class SinglePoseAffine(object): - """apply affine transform to image and coords - - Args: - trainsize (list): [w, h], the standard size used to train - use_udp (bool): whether to use Unbiased Data Processing. - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, - trainsize, - rotate=[1.0, 30], - scale=[1.0, 0.25], - use_udp=False): - self.trainsize = trainsize - self.use_udp = use_udp - self.rot_prob = rotate[0] - self.rot_range = rotate[1] - self.scale_prob = scale[0] - self.scale_ratio = scale[1] - - def __call__(self, records): - image = records['image'] - if 'joints_2d' in records: - joints = records['joints_2d'] if 'joints_2d' in records else None - joints_vis = records[ - 'joints_vis'] if 'joints_vis' in records else np.ones( - (len(joints), 1)) - rot = 0 - s = 1. 
- if np.random.random() < self.rot_prob: - rot = np.clip(np.random.randn() * self.rot_range, - -self.rot_range * 2, self.rot_range * 2) - if np.random.random() < self.scale_prob: - s = np.clip(np.random.randn() * self.scale_ratio + 1, - 1 - self.scale_ratio, 1 + self.scale_ratio) - - if self.use_udp: - trans = get_warp_matrix( - rot, - np.array(records['bbox_center']) * 2.0, - [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], - records['bbox_scale'] * 200.0 * s) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - if 'joints_2d' in records: - joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), - trans) - else: - trans = get_affine_transform( - np.array(records['bbox_center']), - records['bbox_scale'] * s * 200, rot, self.trainsize) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - if 'joints_2d' in records: - for i in range(len(joints)): - if joints_vis[i, 0] > 0.0: - joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) - - if 'joints_3d' in records: - pose3d = records['joints_3d'] - if not rot == 0: - trans_3djoints = np.eye(3) - rot_rad = -rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - trans_3djoints[0, :2] = [cs, -sn] - trans_3djoints[1, :2] = [sn, cs] - pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints, - pose3d[:, :3]) - records['joints_3d'] = pose3d - - records['image'] = image - if 'joints_2d' in records: - records['joints_2d'] = joints - - return records - - -@register_keypointop -class NoiseJitter(object): - """apply NoiseJitter to image - - Args: - noise_factor (float): the noise factor ratio used to generate the jitter - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, noise_factor=0.4): - self.noise_factor = noise_factor - - def __call__(self, records): - self.pn = np.random.uniform(1 - self.noise_factor, - 1 + self.noise_factor, 3) - rgb_img = records['image'] - rgb_img[:, :, 0] = np.minimum( - 255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0])) - rgb_img[:, :, 1] = np.minimum( - 255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1])) - rgb_img[:, :, 2] = np.minimum( - 255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2])) - records['image'] = rgb_img - return records - - -@register_keypointop -class FlipPose(object): - """random apply flip to image - - Args: - noise_factor (float): the noise factor ratio used to generate the jitter - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, flip_prob=0.5, img_res=224, num_joints=14): - self.flip_pob = flip_prob - self.img_res = img_res - if num_joints == 24: - self.perm = [ - 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, - 18, 19, 21, 20, 23, 22 - ] - elif num_joints == 14: - self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13] - else: - print("error num_joints in flip :{}".format(num_joints)) - - def __call__(self, records): - - if np.random.random() < self.flip_pob: - img = records['image'] - img = np.fliplr(img) - - if 'joints_2d' in records: - joints_2d = records['joints_2d'] - joints_2d = joints_2d[self.perm] - joints_2d[:, 0] = self.img_res - joints_2d[:, 0] - records['joints_2d'] = joints_2d - - if 'joints_3d' in records: - joints_3d = records['joints_3d'] - joints_3d = joints_3d[self.perm] - joints_3d[:, 0] = -joints_3d[:, 0] - records['joints_3d'] = joints_3d - - records['image'] = img - return records - - 
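For reference, the flip logic used by FlipPose and TopDownRandomFlip above reduces to a joint permutation plus an x mirror. A minimal numpy sketch with a hypothetical three-joint layout (the layout and names are ours, not from the deleted file; the width - 1 - x convention follows TopDownRandomFlip):

import numpy as np

# Hypothetical layout: 0 = left eye, 1 = right eye, 2 = nose.
perm = [1, 0, 2]               # left/right pairs swap, midline joints stay put
img_width = 100
joints = np.array([[30.0, 40.0], [70.0, 40.0], [50.0, 60.0]])

flipped = joints[perm].copy()                    # relabel so "left" stays left
flipped[:, 0] = img_width - flipped[:, 0] - 1    # mirror x across the image
print(flipped)  # [[29. 40.] [69. 40.] [49. 60.]]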
-@register_keypointop
-class TopDownEvalAffine(object):
-    """apply affine transform to the image and coords
-
-    Args:
-        trainsize (list): [w, h], the standard size used to train
-        use_udp (bool): whether to use Unbiased Data Processing.
-        records(dict): the dict containing the image and coords
-
-    Returns:
-        records (dict): contain the image and coords after being transformed
-
-    """
-
-    def __init__(self, trainsize, use_udp=False):
-        self.trainsize = trainsize
-        self.use_udp = use_udp
-
-    def __call__(self, records):
-        image = records['image']
-        rot = 0
-        imshape = records['im_shape'][::-1]
-        center = imshape / 2.
-        scale = imshape
-
-        if self.use_udp:
-            trans = get_warp_matrix(
-                rot, center * 2.0,
-                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)
-            image = cv2.warpAffine(
-                image,
-                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
-                flags=cv2.INTER_LINEAR)
-        else:
-            trans = get_affine_transform(center, scale, rot, self.trainsize)
-            image = cv2.warpAffine(
-                image,
-                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
-                flags=cv2.INTER_LINEAR)
-        records['image'] = image
-
-        return records
-
-
-@register_keypointop
-class ToHeatmapsTopDown(object):
-    """generate the gaussian heatmaps of keypoints for the heatmap loss
-
-    Args:
-        hmsize (list): [w, h] output heatmap size
-        sigma (float): the std of the generated gaussian kernel
-        records(dict): the dict containing the image and coords
-
-    Returns:
-        records (dict): contain the heatmaps used by the heatmap loss
-
-    """
-
-    def __init__(self, hmsize, sigma):
-        super(ToHeatmapsTopDown, self).__init__()
-        self.hmsize = np.array(hmsize)
-        self.sigma = sigma
-
-    def __call__(self, records):
-        """refer to
-        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
-        Copyright (c) Microsoft, under the MIT License.
- """ - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - num_joints = joints.shape[0] - image_size = np.array( - [records['image'].shape[1], records['image'].shape[0]]) - target_weight = np.ones((num_joints, 1), dtype=np.float32) - target_weight[:, 0] = joints_vis[:, 0] - target = np.zeros( - (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) - tmp_size = self.sigma * 3 - feat_stride = image_size / self.hmsize - for joint_id in range(num_joints): - mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) - mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) - # Check that any part of the gaussian is in-bounds - ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] - br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] - if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ - 0] < 0 or br[1] < 0: - # If not, just return the image as is - target_weight[joint_id] = 0 - continue - # # Generate gaussian - size = 2 * tmp_size + 1 - x = np.arange(0, size, 1, np.float32) - y = x[:, np.newaxis] - x0 = y0 = size // 2 - # The gaussian is not normalized, we want the center value to equal 1 - g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) - - # Usable gaussian range - g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] - g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] - # Image range - img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) - img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) - - v = target_weight[joint_id] - if v > 0.5: - target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ - 0]:g_y[1], g_x[0]:g_x[1]] - records['target'] = target - records['target_weight'] = target_weight - del records['gt_joints'], records['joints_vis'] - - return records - - -@register_keypointop -class ToHeatmapsTopDown_DARK(object): - """to generate the gaussin heatmaps of keypoint for heatmap loss - - Args: - hmsize (list): [w, h] output heatmap's size - sigma (float): the std of gaussin kernel genereted - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the heatmaps used to heatmaploss - - """ - - def __init__(self, hmsize, sigma): - super(ToHeatmapsTopDown_DARK, self).__init__() - self.hmsize = np.array(hmsize) - self.sigma = sigma - - def __call__(self, records): - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - num_joints = joints.shape[0] - image_size = np.array( - [records['image'].shape[1], records['image'].shape[0]]) - target_weight = np.ones((num_joints, 1), dtype=np.float32) - target_weight[:, 0] = joints_vis[:, 0] - target = np.zeros( - (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) - tmp_size = self.sigma * 3 - feat_stride = image_size / self.hmsize - for joint_id in range(num_joints): - mu_x = joints[joint_id][0] / feat_stride[0] - mu_y = joints[joint_id][1] / feat_stride[1] - # Check that any part of the gaussian is in-bounds - ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] - br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] - if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ - 0] < 0 or br[1] < 0: - # If not, just return the image as is - target_weight[joint_id] = 0 - continue - - x = np.arange(0, self.hmsize[0], 1, np.float32) - y = np.arange(0, self.hmsize[1], 1, np.float32) - y = y[:, np.newaxis] - - v = target_weight[joint_id] - if v > 0.5: - target[joint_id] = np.exp(-( - (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2)) - records['target'] = target - records['target_weight'] = target_weight - del 
records['gt_joints'], records['joints_vis'] - - return records - - -@register_keypointop -class ToHeatmapsTopDown_UDP(object): - """This code is based on: - https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py - - to generate the gaussian heatmaps of keypoint for heatmap loss. - ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing - for Human Pose Estimation (CVPR 2020). - - Args: - hmsize (list): [w, h] output heatmap's size - sigma (float): the std of gaussin kernel genereted - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the heatmaps used to heatmaploss - """ - - def __init__(self, hmsize, sigma): - super(ToHeatmapsTopDown_UDP, self).__init__() - self.hmsize = np.array(hmsize) - self.sigma = sigma - - def __call__(self, records): - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - num_joints = joints.shape[0] - image_size = np.array( - [records['image'].shape[1], records['image'].shape[0]]) - target_weight = np.ones((num_joints, 1), dtype=np.float32) - target_weight[:, 0] = joints_vis[:, 0] - target = np.zeros( - (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) - tmp_size = self.sigma * 3 - size = 2 * tmp_size + 1 - x = np.arange(0, size, 1, np.float32) - y = x[:, None] - feat_stride = (image_size - 1.0) / (self.hmsize - 1.0) - for joint_id in range(num_joints): - mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) - mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) - # Check that any part of the gaussian is in-bounds - ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] - br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] - if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ - 0] < 0 or br[1] < 0: - # If not, just return the image as is - target_weight[joint_id] = 0 - continue - - mu_x_ac = joints[joint_id][0] / feat_stride[0] - mu_y_ac = joints[joint_id][1] / feat_stride[1] - x0 = y0 = size // 2 - x0 += mu_x_ac - mu_x - y0 += mu_y_ac - mu_y - g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) - # Usable gaussian range - g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] - g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] - # Image range - img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) - img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) - - v = target_weight[joint_id] - if v > 0.5: - target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ - 0]:g_y[1], g_x[0]:g_x[1]] - records['target'] = target - records['target_weight'] = target_weight - del records['gt_joints'], records['joints_vis'] - - return records - - -from typing import Optional, Tuple, Union, List -import numbers - - -def _scale_size( - size: Tuple[int, int], - scale: Union[float, int, tuple], ) -> Tuple[int, int]: - """Rescale a size by a ratio. - - Args: - size (tuple[int]): (w, h). - scale (float | tuple(float)): Scaling factor. - - Returns: - tuple[int]: scaled size. - """ - if isinstance(scale, (float, int)): - scale = (scale, scale) - w, h = size - return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) - - -def rescale_size(old_size: tuple, - scale: Union[float, int, tuple], - return_scale: bool=False) -> tuple: - """Calculate the new size to be rescaled to. - - Args: - old_size (tuple[int]): The old size (w, h) of image. - scale (float | tuple[int]): The scaling factor or maximum size. 
- If it is a float number, then the image will be rescaled by this - factor, else if it is a tuple of 2 integers, then the image will - be rescaled as large as possible within the scale. - return_scale (bool): Whether to return the scaling factor besides the - rescaled image size. - - Returns: - tuple[int]: The new rescaled image size. - """ - w, h = old_size - if isinstance(scale, (float, int)): - if scale <= 0: - raise ValueError(f'Invalid scale {scale}, must be positive.') - scale_factor = scale - elif isinstance(scale, list): - max_long_edge = max(scale) - max_short_edge = min(scale) - scale_factor = min(max_long_edge / max(h, w), - max_short_edge / min(h, w)) - else: - raise TypeError( - f'Scale must be a number or tuple of int, but got {type(scale)}') - - new_size = _scale_size((w, h), scale_factor) - - if return_scale: - return new_size, scale_factor - else: - return new_size - - -def imrescale(img: np.ndarray, - scale: Union[float, Tuple[int, int]], - return_scale: bool=False, - interpolation: str='bilinear', - backend: Optional[str]=None) -> Union[np.ndarray, Tuple[ - np.ndarray, float]]: - """Resize image while keeping the aspect ratio. - - Args: - img (ndarray): The input image. - scale (float | tuple[int]): The scaling factor or maximum size. - If it is a float number, then the image will be rescaled by this - factor, else if it is a tuple of 2 integers, then the image will - be rescaled as large as possible within the scale. - return_scale (bool): Whether to return the scaling factor besides the - rescaled image. - interpolation (str): Same as :func:`resize`. - backend (str | None): Same as :func:`resize`. - - Returns: - ndarray: The rescaled image. - """ - h, w = img.shape[:2] - new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) - rescaled_img = imresize( - img, new_size, interpolation=interpolation, backend=backend) - if return_scale: - return rescaled_img, scale_factor - else: - return rescaled_img - - -def imresize( - img: np.ndarray, - size: Tuple[int, int], - return_scale: bool=False, - interpolation: str='bilinear', - out: Optional[np.ndarray]=None, - backend: Optional[str]=None, - interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float], - np.ndarray]: - """Resize image to a given size. - - Args: - img (ndarray): The input image. - size (tuple[int]): Target size (w, h). - return_scale (bool): Whether to return `w_scale` and `h_scale`. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' - backend, "nearest", "bilinear" for 'pillow' backend. - out (ndarray): The output destination. - backend (str | None): The image resize backend type. Options are `cv2`, - `pillow`, `None`. If backend is None, the global imread_backend - specified by ``mmcv.use_backend()`` will be used. Default: None. - - Returns: - tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. - """ - h, w = img.shape[:2] - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported for resize.' 
- f"Supported backends are 'cv2', 'pillow'") - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - pil_image = Image.fromarray(img) - pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) - resized_img = np.array(pil_image) - else: - resized_img = cv2.resize(img, size, dst=out, interpolation=interp) - if not return_scale: - return resized_img - else: - w_scale = size[0] / w - h_scale = size[1] / h - return resized_img, w_scale, h_scale - - -class PETR_Resize: - """Resize images & bbox & mask. - - This transform resizes the input image to some scale. Bboxes and masks are - then resized with the same scale factor. If the input dict contains the key - "scale", then the scale in the input dict is used, otherwise the specified - scale in the init method is used. If the input dict contains the key - "scale_factor" (if MultiScaleFlipAug does not give img_scale but - scale_factor), the actual scale will be computed by image shape and - scale_factor. - - `img_scale` can either be a tuple (single-scale) or a list of tuple - (multi-scale). There are 3 multiscale modes: - - - ``ratio_range is not None``: randomly sample a ratio from the ratio \ - range and multiply it with the image scale. - - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ - sample a scale from the multiscale range. - - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ - sample a scale from multiple scales. - - Args: - img_scale (tuple or list[tuple]): Images scales for resizing. - multiscale_mode (str): Either "range" or "value". - ratio_range (tuple[float]): (min_ratio, max_ratio) - keep_ratio (bool): Whether to keep the aspect ratio when resizing the - image. - bbox_clip_border (bool, optional): Whether to clip the objects outside - the border of the image. In some dataset like MOT17, the gt bboxes - are allowed to cross the border of images. Therefore, we don't - need to clip the gt bboxes in these cases. Defaults to True. - backend (str): Image resize backend, choices are 'cv2' and 'pillow'. - These two backends generates slightly different results. Defaults - to 'cv2'. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' - backend, "nearest", "bilinear" for 'pillow' backend. - override (bool, optional): Whether to override `scale` and - `scale_factor` so as to call resize twice. Default False. If True, - after the first resizing, the existed `scale` and `scale_factor` - will be ignored so the second resizing can be allowed. - This option is a work-around for multiple times of resize in DETR. - Defaults to False. 
- """ - - def __init__(self, - img_scale=None, - multiscale_mode='range', - ratio_range=None, - keep_ratio=True, - bbox_clip_border=True, - backend='cv2', - interpolation='bilinear', - override=False, - keypoint_clip_border=True): - if img_scale is None: - self.img_scale = None - else: - if isinstance(img_scale, list): - self.img_scale = img_scale - else: - self.img_scale = [img_scale] - assert isinstance(self.img_scale, list) - - if ratio_range is not None: - # mode 1: given a scale and a range of image ratio - assert len(self.img_scale) == 1 - else: - # mode 2: given multiple scales or a range of scales - assert multiscale_mode in ['value', 'range'] - - self.backend = backend - self.multiscale_mode = multiscale_mode - self.ratio_range = ratio_range - self.keep_ratio = keep_ratio - # TODO: refactor the override option in Resize - self.interpolation = interpolation - self.override = override - self.bbox_clip_border = bbox_clip_border - self.keypoint_clip_border = keypoint_clip_border - - @staticmethod - def random_select(img_scales): - """Randomly select an img_scale from given candidates. - - Args: - img_scales (list[tuple]): Images scales for selection. - - Returns: - (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ - where ``img_scale`` is the selected image scale and \ - ``scale_idx`` is the selected index in the given candidates. - """ - - assert isinstance(img_scales, list) - scale_idx = np.random.randint(len(img_scales)) - img_scale = img_scales[scale_idx] - return img_scale, scale_idx - - @staticmethod - def random_sample(img_scales): - """Randomly sample an img_scale when ``multiscale_mode=='range'``. - - Args: - img_scales (list[tuple]): Images scale range for sampling. - There must be two tuples in img_scales, which specify the lower - and upper bound of image scales. - - Returns: - (tuple, None): Returns a tuple ``(img_scale, None)``, where \ - ``img_scale`` is sampled scale and None is just a placeholder \ - to be consistent with :func:`random_select`. - """ - - assert isinstance(img_scales, list) and len(img_scales) == 2 - img_scale_long = [max(s) for s in img_scales] - img_scale_short = [min(s) for s in img_scales] - long_edge = np.random.randint( - min(img_scale_long), max(img_scale_long) + 1) - short_edge = np.random.randint( - min(img_scale_short), max(img_scale_short) + 1) - img_scale = (long_edge, short_edge) - return img_scale, None - - @staticmethod - def random_sample_ratio(img_scale, ratio_range): - """Randomly sample an img_scale when ``ratio_range`` is specified. - - A ratio will be randomly sampled from the range specified by - ``ratio_range``. Then it would be multiplied with ``img_scale`` to - generate sampled scale. - - Args: - img_scale (list): Images scale base to multiply with ratio. - ratio_range (tuple[float]): The minimum and maximum ratio to scale - the ``img_scale``. - - Returns: - (tuple, None): Returns a tuple ``(scale, None)``, where \ - ``scale`` is sampled ratio multiplied with ``img_scale`` and \ - None is just a placeholder to be consistent with \ - :func:`random_select`. - """ - - assert isinstance(img_scale, list) and len(img_scale) == 2 - min_ratio, max_ratio = ratio_range - assert min_ratio <= max_ratio - ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio - scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) - return scale, None - - def _random_scale(self, results): - """Randomly sample an img_scale according to ``ratio_range`` and - ``multiscale_mode``. 
- - If ``ratio_range`` is specified, a ratio will be sampled and be - multiplied with ``img_scale``. - If multiple scales are specified by ``img_scale``, a scale will be - sampled according to ``multiscale_mode``. - Otherwise, single scale will be used. - - Args: - results (dict): Result dict from :obj:`dataset`. - - Returns: - dict: Two new keys 'scale` and 'scale_idx` are added into \ - ``results``, which would be used by subsequent pipelines. - """ - - if self.ratio_range is not None: - scale, scale_idx = self.random_sample_ratio(self.img_scale[0], - self.ratio_range) - elif len(self.img_scale) == 1: - scale, scale_idx = self.img_scale[0], 0 - elif self.multiscale_mode == 'range': - scale, scale_idx = self.random_sample(self.img_scale) - elif self.multiscale_mode == 'value': - scale, scale_idx = self.random_select(self.img_scale) - else: - raise NotImplementedError - results['scale'] = scale - results['scale_idx'] = scale_idx - - def _resize_img(self, results): - """Resize images with ``results['scale']``.""" - for key in ['image'] if 'image' in results else []: - if self.keep_ratio: - img, scale_factor = imrescale( - results[key], - results['scale'], - return_scale=True, - interpolation=self.interpolation, - backend=self.backend) - # the w_scale and h_scale has minor difference - # a real fix should be done in the imrescale in the future - new_h, new_w = img.shape[:2] - h, w = results[key].shape[:2] - w_scale = new_w / w - h_scale = new_h / h - else: - img, w_scale, h_scale = imresize( - results[key], - results['scale'], - return_scale=True, - interpolation=self.interpolation, - backend=self.backend) - - scale_factor = np.array( - [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) - results['im_shape'] = np.array(img.shape) - # in case that there is no padding - results['pad_shape'] = img.shape - results['scale_factor'] = scale_factor - results['keep_ratio'] = self.keep_ratio - # img_pad = self.impad(img, shape=results['scale']) - results[key] = img - - def _resize_bboxes(self, results): - """Resize bounding boxes with ``results['scale_factor']``.""" - for key in ['gt_bbox'] if 'gt_bbox' in results else []: - bboxes = results[key] * results['scale_factor'] - if self.bbox_clip_border: - img_shape = results['im_shape'] - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - results[key] = bboxes - - def _resize_masks(self, results): - """Resize masks with ``results['scale']``""" - for key in ['mask'] if 'mask' in results else []: - if results[key] is None: - continue - if self.keep_ratio: - results[key] = results[key].rescale(results['scale']) - else: - results[key] = results[key].resize(results['im_shape'][:2]) - - def _resize_seg(self, results): - """Resize semantic segmentation map with ``results['scale']``.""" - for key in ['seg'] if 'seg' in results else []: - if self.keep_ratio: - gt_seg = imrescale( - results[key], - results['scale'], - interpolation='nearest', - backend=self.backend) - else: - gt_seg = imresize( - results[key], - results['scale'], - interpolation='nearest', - backend=self.backend) - results[key] = gt_seg - - def _resize_keypoints(self, results): - """Resize keypoints with ``results['scale_factor']``.""" - for key in ['gt_joints'] if 'gt_joints' in results else []: - keypoints = results[key].copy() - keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0] - keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1] - if self.keypoint_clip_border: - img_shape = 
results['im_shape'] - keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1]) - keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0]) - results[key] = keypoints - - def _resize_areas(self, results): - """Resize mask areas with ``results['scale_factor']``.""" - for key in ['gt_areas'] if 'gt_areas' in results else []: - areas = results[key].copy() - areas = areas * results['scale_factor'][0] * results[ - 'scale_factor'][1] - results[key] = areas - - def __call__(self, results): - """Call function to resize images, bounding boxes, masks, semantic - segmentation map. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Resized results, 'im_shape', 'pad_shape', 'scale_factor', \ - 'keep_ratio' keys are added into result dict. - """ - if 'scale' not in results: - if 'scale_factor' in results: - img_shape = results['image'].shape[:2] - scale_factor = results['scale_factor'][0] - # assert isinstance(scale_factor, float) - results['scale'] = [int(x * scale_factor) - for x in img_shape][::-1] - else: - self._random_scale(results) - else: - if not self.override: - assert 'scale_factor' not in results, ( - 'scale and scale_factor cannot be both set.') - else: - results.pop('scale') - if 'scale_factor' in results: - results.pop('scale_factor') - self._random_scale(results) - - self._resize_img(results) - self._resize_bboxes(results) - self._resize_masks(results) - self._resize_seg(results) - self._resize_keypoints(results) - self._resize_areas(results) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(img_scale={self.img_scale}, ' - repr_str += f'multiscale_mode={self.multiscale_mode}, ' - repr_str += f'ratio_range={self.ratio_range}, ' - repr_str += f'keep_ratio={self.keep_ratio}, ' - repr_str += f'bbox_clip_border={self.bbox_clip_border})' - repr_str += f'keypoint_clip_border={self.keypoint_clip_border})' - return repr_str diff --git a/pdfdet/models/Paddle/ppdet/data/transform/keypoints_3d_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/keypoints_3d_operators.py deleted file mode 100644 index 13337bc..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/keypoints_3d_operators.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
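As a quick illustration of the keep-ratio arithmetic implemented by the rescale_size helper deleted above: given a (long-edge, short-edge) bound, the factor is the largest one that keeps both edges within bounds. A standalone sketch (the function name is ours, not from the file):

# Keep-ratio factor for fitting a w x h image inside (max_long, max_short).
def keep_ratio_factor(w, h, max_long, max_short):
    return min(max_long / max(h, w), max_short / min(h, w))

f = keep_ratio_factor(1920, 1080, 1333, 800)
print(int(1920 * f + 0.5), int(1080 * f + 0.5))  # 1333 750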
- -from __future__ import absolute_import - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -import cv2 -import numpy as np -import math -import copy -import random -import uuid -from numbers import Number, Integral - -from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix -from ppdet.core.workspace import serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -registered_ops = [] - -__all__ = [ - 'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages' -] - -import matplotlib.pyplot as plt -from PIL import Image, ImageDraw -from mpl_toolkits.mplot3d import Axes3D - - -def register_keypointop(cls): - return serializable(cls) - - -def register_op(cls): - registered_ops.append(cls.__name__) - if not hasattr(BaseOperator, cls.__name__): - setattr(BaseOperator, cls.__name__, cls) - else: - raise KeyError("The {} class has been registered.".format(cls.__name__)) - return serializable(cls) - - -class BaseOperator(object): - def __init__(self, name=None): - if name is None: - name = self.__class__.__name__ - self._id = name + '_' + str(uuid.uuid4())[-6:] - - def apply(self, sample, context=None): - """ Process a sample. - Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - return sample - - def __call__(self, sample, context=None): - """ Process a sample. - Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - if isinstance(sample, Sequence): # for batch_size - for i in range(len(sample)): - sample[i] = self.apply(sample[i], context) - else: - # image.shape changed - sample = self.apply(sample, context) - return sample - - def __str__(self): - return str(self._id) - - -@register_keypointop -class CropAndFlipImages(object): - """Crop all images""" - - def __init__(self, crop_range, flip_pairs=None): - super(CropAndFlipImages, self).__init__() - self.crop_range = crop_range - self.flip_pairs = flip_pairs - - def __call__(self, records): # tuple - images = records["image"] - images = images[:, :, ::-1, :] - images = images[:, :, self.crop_range[0]:self.crop_range[1]] - records["image"] = images - - if "kps2d" in records.keys(): - kps2d = records["kps2d"] - - width, height = images.shape[2], images.shape[1] - kps2d = np.array(kps2d) - kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0] - - for pair in self.flip_pairs: - kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ - kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() - - records["kps2d"] = kps2d - - return records - - -@register_op -class PermuteImages(BaseOperator): - def __init__(self): - """ - Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920) - """ - super(PermuteImages, self).__init__() - - def apply(self, sample, context=None): - images = sample["image"] - images = images.transpose((0, 3, 1, 2)) - - sample["image"] = images - - return sample - - -@register_keypointop -class RandomFlipHalfBody3DTransformImages(object): - """apply data augment to images and coords - to achieve the flip, scale, rotate and half body transform effect for training image - Args: - trainsize (list):[w, h], Image target size - upper_body_ids (list): The upper body joint ids - flip_pairs (list): The left-right joints 
exchange order list - pixel_std (int): The pixel std of the scale - scale (float): The scale factor to transform the image - rot (int): The rotate factor to transform the image - num_joints_half_body (int): The joints threshold of the half body transform - prob_half_body (float): The threshold of the half body transform - flip (bool): Whether to flip the image - Returns: - records(dict): contain the image and coords after tranformed - """ - - def __init__(self, - trainsize, - upper_body_ids, - flip_pairs, - pixel_std, - scale=0.35, - rot=40, - num_joints_half_body=8, - prob_half_body=0.3, - flip=True, - rot_prob=0.6, - do_occlusion=False): - super(RandomFlipHalfBody3DTransformImages, self).__init__() - self.trainsize = trainsize - self.upper_body_ids = upper_body_ids - self.flip_pairs = flip_pairs - self.pixel_std = pixel_std - self.scale = scale - self.rot = rot - self.num_joints_half_body = num_joints_half_body - self.prob_half_body = prob_half_body - self.flip = flip - self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] - self.rot_prob = rot_prob - self.do_occlusion = do_occlusion - - def halfbody_transform(self, joints, joints_vis): - upper_joints = [] - lower_joints = [] - for joint_id in range(joints.shape[0]): - if joints_vis[joint_id][0] > 0: - if joint_id in self.upper_body_ids: - upper_joints.append(joints[joint_id]) - else: - lower_joints.append(joints[joint_id]) - if np.random.randn() < 0.5 and len(upper_joints) > 2: - selected_joints = upper_joints - else: - selected_joints = lower_joints if len( - lower_joints) > 2 else upper_joints - if len(selected_joints) < 2: - return None, None - selected_joints = np.array(selected_joints, dtype=np.float32) - center = selected_joints.mean(axis=0)[:2] - left_top = np.amin(selected_joints, axis=0) - right_bottom = np.amax(selected_joints, axis=0) - w = right_bottom[0] - left_top[0] - h = right_bottom[1] - left_top[1] - if w > self.aspect_ratio * h: - h = w * 1.0 / self.aspect_ratio - elif w < self.aspect_ratio * h: - w = h * self.aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - scale = scale * 1.5 - - return center, scale - - def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None): - # joints: (6, 24, 3),(num_frames, num_joints, 3) - - joints[:, :, 0] = width - joints[:, :, 0] - 1 # x - if kps2d is not None: - kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1 - - for pair in matched_parts: - joints[:, pair[0], :], joints[:,pair[1], :] = \ - joints[:,pair[1], :], joints[:,pair[0], :].copy() - - joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \ - joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy() - - if kps2d is not None: - kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ - kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() - - # move to zero - joints -= joints[:, [0], :] # (batch_size, 24, 3),numpy.ndarray - - return joints, joints_vis, kps2d - - def __call__(self, records): - images = records[ - 'image'] #kps3d, kps3d_vis, images. 
images.shape is (num_frames, height, width, 3)
-
-        joints = records['kps3d']
-        joints_vis = records['kps3d_vis']
-
-        kps2d = None
-        if 'kps2d' in records.keys():
-            kps2d = records['kps2d']
-
-        if self.flip and np.random.random() <= 0.5:
-            images = images[:, :, ::-1, :]  # flip the images horizontally, e.g. (6, 1080, 810, 3)
-            joints, joints_vis, kps2d = self.flip_joints(
-                joints, joints_vis, images.shape[2], self.flip_pairs,
-                kps2d)  # mirror the keypoints left-right
-        occlusion = False
-        if self.do_occlusion and random.random() <= 0.5:  # random occlusion
-            height = images[0].shape[0]
-            width = images[0].shape[1]
-            occlusion = True
-            while True:
-                area_min = 0.0
-                area_max = 0.2
-                synth_area = (random.random() *
-                              (area_max - area_min) + area_min) * width * height
-
-                ratio_min = 0.3
-                ratio_max = 1 / 0.3
-                synth_ratio = (random.random() *
-                               (ratio_max - ratio_min) + ratio_min)
-
-                synth_h = math.sqrt(synth_area * synth_ratio)
-                synth_w = math.sqrt(synth_area / synth_ratio)
-                synth_xmin = random.random() * (width - synth_w - 1)
-                synth_ymin = random.random() * (height - synth_h - 1)
-
-                if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:
-                    xmin = int(synth_xmin)
-                    ymin = int(synth_ymin)
-                    w = int(synth_w)
-                    h = int(synth_h)
-
-                    mask = np.random.rand(h, w, 3) * 255
-                    images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[
-                        None, :, :, :]
-                    break
-
-        records['image'] = images
-        records['kps3d'] = joints
-        records['kps3d_vis'] = joints_vis
-        if kps2d is not None:
-            records['kps2d'] = kps2d
-
-        return records
diff --git a/pdfdet/models/Paddle/ppdet/data/transform/mot_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/mot_operators.py
deleted file mode 100644
index e533ea3..0000000
--- a/pdfdet/models/Paddle/ppdet/data/transform/mot_operators.py
+++ /dev/null
@@ -1,627 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
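The occlusion branch in RandomFlipHalfBody3DTransformImages above samples the patch by area fraction and aspect ratio rather than by width and height directly. The same idea as a standalone sketch (the helper name is ours; the original retries until a box fits, whereas here a failed draw just returns None):

import math
import random

def sample_occlusion_box(width, height, area=(0.0, 0.2), ratio=(0.3, 1 / 0.3)):
    # Draw a patch area as a fraction of the image, then an aspect ratio.
    synth_area = random.uniform(*area) * width * height
    synth_ratio = random.uniform(*ratio)
    h = math.sqrt(synth_area * synth_ratio)
    w = math.sqrt(synth_area / synth_ratio)
    x = random.random() * (width - w - 1)
    y = random.random() * (height - h - 1)
    if x >= 0 and y >= 0 and x + w < width and y + h < height:
        return int(x), int(y), int(w), int(h)
    return None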
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from numbers import Integral - -import cv2 -import copy -import numpy as np -import random -import math - -from .operators import BaseOperator, register_op -from .batch_operators import Gt2TTFTarget -from ppdet.modeling.bbox_utils import bbox_iou_np_expand -from ppdet.utils.logger import setup_logger -from .op_helper import gaussian_radius -logger = setup_logger(__name__) - -__all__ = [ - 'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres', - 'Gt2JDETargetMax', 'Gt2FairMOTTarget' -] - - -@register_op -class RGBReverse(BaseOperator): - """RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine - """ - - def __init__(self): - super(RGBReverse, self).__init__() - - def apply(self, sample, context=None): - im = sample['image'] - sample['image'] = np.ascontiguousarray(im[:, :, ::-1]) - return sample - - -@register_op -class LetterBoxResize(BaseOperator): - def __init__(self, target_size): - """ - Resize image to target size, convert normalized xywh to pixel xyxy - format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). - Args: - target_size (int|list): image target size. - """ - super(LetterBoxResize, self).__init__() - if not isinstance(target_size, (Integral, Sequence)): - raise TypeError( - "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". - format(type(target_size))) - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - - def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)): - # letterbox: resize a rectangular image to a padded rectangular - shape = img.shape[:2] # [height, width] - ratio_h = float(height) / shape[0] - ratio_w = float(width) / shape[1] - ratio = min(ratio_h, ratio_w) - new_shape = (round(shape[1] * ratio), - round(shape[0] * ratio)) # [width, height] - padw = (width - new_shape[0]) / 2 - padh = (height - new_shape[1]) / 2 - top, bottom = round(padh - 0.1), round(padh + 0.1) - left, right = round(padw - 0.1), round(padw + 0.1) - - img = cv2.resize( - img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border - img = cv2.copyMakeBorder( - img, top, bottom, left, right, cv2.BORDER_CONSTANT, - value=color) # padded rectangular - return img, ratio, padw, padh - - def apply_bbox(self, bbox0, h, w, ratio, padw, padh): - bboxes = bbox0.copy() - bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw - bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh - bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw - bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh - return bboxes - - def apply(self, sample, context=None): - """ Resize the image numpy. 
- """ - im = sample['image'] - h, w = sample['im_shape'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - from PIL import UnidentifiedImageError - raise UnidentifiedImageError( - '{}: image is not 3-dimensional.'.format(self)) - - # apply image - height, width = self.target_size - img, ratio, padw, padh = self.apply_image( - im, height=height, width=width) - - sample['image'] = img - new_shape = (round(h * ratio), round(w * ratio)) - sample['im_shape'] = np.asarray(new_shape, dtype=np.float32) - sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio, - padw, padh) - return sample - - -@register_op -class MOTRandomAffine(BaseOperator): - """ - Affine transform to image and coords to achieve the rotate, scale and - shift effect for training image. - - Args: - degrees (list[2]): the rotate range to apply, transform range is [min, max] - translate (list[2]): the translate range to apply, transform range is [min, max] - scale (list[2]): the scale range to apply, transform range is [min, max] - shear (list[2]): the shear range to apply, transform range is [min, max] - borderValue (list[3]): value used in case of a constant border when appling - the perspective transformation - reject_outside (bool): reject warped bounding bboxes outside of image - - Returns: - records(dict): contain the image and coords after tranformed - - """ - - def __init__(self, - degrees=(-5, 5), - translate=(0.10, 0.10), - scale=(0.50, 1.20), - shear=(-2, 2), - borderValue=(127.5, 127.5, 127.5), - reject_outside=True): - super(MOTRandomAffine, self).__init__() - self.degrees = degrees - self.translate = translate - self.scale = scale - self.shear = shear - self.borderValue = borderValue - self.reject_outside = reject_outside - - def apply(self, sample, context=None): - # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 - border = 0 # width of added border (optional) - - img = sample['image'] - height, width = img.shape[0], img.shape[1] - - # Rotation and Scale - R = np.eye(3) - a = random.random() * (self.degrees[1] - self.degrees[0] - ) + self.degrees[0] - s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0] - R[:2] = cv2.getRotationMatrix2D( - angle=a, center=(width / 2, height / 2), scale=s) - - # Translation - T = np.eye(3) - T[0, 2] = ( - random.random() * 2 - 1 - ) * self.translate[0] * height + border # x translation (pixels) - T[1, 2] = ( - random.random() * 2 - 1 - ) * self.translate[1] * width + border # y translation (pixels) - - # Shear - S = np.eye(3) - S[0, 1] = math.tan((random.random() * - (self.shear[1] - self.shear[0]) + self.shear[0]) * - math.pi / 180) # x shear (deg) - S[1, 0] = math.tan((random.random() * - (self.shear[1] - self.shear[0]) + self.shear[0]) * - math.pi / 180) # y shear (deg) - - M = S @T @R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
- imw = cv2.warpPerspective( - img, - M, - dsize=(width, height), - flags=cv2.INTER_LINEAR, - borderValue=self.borderValue) # BGR order borderValue - - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - targets = sample['gt_bbox'] - n = targets.shape[0] - points = targets.copy() - area0 = (points[:, 2] - points[:, 0]) * ( - points[:, 3] - points[:, 1]) - - # warp points - xy = np.ones((n * 4, 3)) - xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( - n * 4, 2) # x1y1, x2y2, x1y2, x2y1 - xy = (xy @M.T)[:, :2].reshape(n, 8) - - # create new boxes - x = xy[:, [0, 2, 4, 6]] - y = xy[:, [1, 3, 5, 7]] - xy = np.concatenate( - (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T - - # apply angle-based reduction - radians = a * math.pi / 180 - reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5 - x = (xy[:, 2] + xy[:, 0]) / 2 - y = (xy[:, 3] + xy[:, 1]) / 2 - w = (xy[:, 2] - xy[:, 0]) * reduction - h = (xy[:, 3] - xy[:, 1]) * reduction - xy = np.concatenate( - (x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T - - # reject warped points outside of image - if self.reject_outside: - np.clip(xy[:, 0], 0, width, out=xy[:, 0]) - np.clip(xy[:, 2], 0, width, out=xy[:, 2]) - np.clip(xy[:, 1], 0, height, out=xy[:, 1]) - np.clip(xy[:, 3], 0, height, out=xy[:, 3]) - w = xy[:, 2] - xy[:, 0] - h = xy[:, 3] - xy[:, 1] - area = w * h - ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) - i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) - - if sum(i) > 0: - sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype) - sample['gt_class'] = sample['gt_class'][i] - if 'difficult' in sample: - sample['difficult'] = sample['difficult'][i] - if 'gt_ide' in sample: - sample['gt_ide'] = sample['gt_ide'][i] - if 'is_crowd' in sample: - sample['is_crowd'] = sample['is_crowd'][i] - sample['image'] = imw - return sample - else: - return sample - - -@register_op -class Gt2JDETargetThres(BaseOperator): - __shared__ = ['num_classes'] - """ - Generate JDE targets by groud truth data when training - Args: - anchors (list): anchors of JDE model - anchor_masks (list): anchor_masks of JDE model - downsample_ratios (list): downsample ratios of JDE model - ide_thresh (float): thresh of identity, higher is groud truth - fg_thresh (float): thresh of foreground, higher is foreground - bg_thresh (float): thresh of background, lower is background - num_classes (int): number of classes - """ - - def __init__(self, - anchors, - anchor_masks, - downsample_ratios, - ide_thresh=0.5, - fg_thresh=0.5, - bg_thresh=0.4, - num_classes=1): - super(Gt2JDETargetThres, self).__init__() - self.anchors = anchors - self.anchor_masks = anchor_masks - self.downsample_ratios = downsample_ratios - self.ide_thresh = ide_thresh - self.fg_thresh = fg_thresh - self.bg_thresh = bg_thresh - self.num_classes = num_classes - - def generate_anchor(self, nGh, nGw, anchor_hw): - nA = len(anchor_hw) - yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw)) - - mesh = np.stack([xx.T, yy.T], axis=0) # [2, nGh, nGw] - mesh = np.repeat(mesh[None, :], nA, axis=0) # [nA, 2, nGh, nGw] - - anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None] - anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2) - anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1) - - anchor_mesh = np.concatenate( - [mesh, anchor_offset_mesh], axis=1) # [nA, 4, nGh, nGw] - return anchor_mesh - - def encode_delta(self, gt_box_list, fg_anchor_list): - px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ - 
fg_anchor_list[:, 2], fg_anchor_list[:,3] - gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ - gt_box_list[:, 2], gt_box_list[:, 3] - dx = (gx - px) / pw - dy = (gy - py) / ph - dw = np.log(gw / pw) - dh = np.log(gh / ph) - return np.stack([dx, dy, dw, dh], axis=1) - - def pad_box(self, sample, num_max): - assert 'gt_bbox' in sample - bbox = sample['gt_bbox'] - gt_num = len(bbox) - pad_bbox = np.zeros((num_max, 4), dtype=np.float32) - if gt_num > 0: - pad_bbox[:gt_num, :] = bbox[:gt_num, :] - sample['gt_bbox'] = pad_bbox - if 'gt_score' in sample: - pad_score = np.zeros((num_max, ), dtype=np.float32) - if gt_num > 0: - pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] - sample['gt_score'] = pad_score - if 'difficult' in sample: - pad_diff = np.zeros((num_max, ), dtype=np.int32) - if gt_num > 0: - pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] - sample['difficult'] = pad_diff - if 'is_crowd' in sample: - pad_crowd = np.zeros((num_max, ), dtype=np.int32) - if gt_num > 0: - pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] - sample['is_crowd'] = pad_crowd - if 'gt_ide' in sample: - pad_ide = np.zeros((num_max, ), dtype=np.int32) - if gt_num > 0: - pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] - sample['gt_ide'] = pad_ide - return sample - - def __call__(self, samples, context=None): - assert len(self.anchor_masks) == len(self.downsample_ratios), \ - "anchor_masks', and 'downsample_ratios' should have same length." - h, w = samples[0]['image'].shape[1:3] - - num_max = 0 - for sample in samples: - num_max = max(num_max, len(sample['gt_bbox'])) - - for sample in samples: - gt_bbox = sample['gt_bbox'] - gt_ide = sample['gt_ide'] - for i, (anchor_hw, downsample_ratio - ) in enumerate(zip(self.anchors, self.downsample_ratios)): - anchor_hw = np.array( - anchor_hw, dtype=np.float32) / downsample_ratio - nA = len(anchor_hw) - nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio) - tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32) - tconf = np.zeros((nA, nGh, nGw), dtype=np.float32) - tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32) - - gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy() - gxy[:, 0] = gxy[:, 0] * nGw - gxy[:, 1] = gxy[:, 1] * nGh - gwh[:, 0] = gwh[:, 0] * nGw - gwh[:, 1] = gwh[:, 1] * nGh - gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1) - gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1) - tboxes = np.concatenate([gxy, gwh], axis=1) - - anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw) - - anchor_list = np.transpose(anchor_mesh, - (0, 2, 3, 1)).reshape(-1, 4) - iou_pdist = bbox_iou_np_expand( - anchor_list, tboxes, x1y1x2y2=False) - - iou_max = np.max(iou_pdist, axis=1) - max_gt_index = np.argmax(iou_pdist, axis=1) - - iou_map = iou_max.reshape(nA, nGh, nGw) - gt_index_map = max_gt_index.reshape(nA, nGh, nGw) - - id_index = iou_map > self.ide_thresh - fg_index = iou_map > self.fg_thresh - bg_index = iou_map < self.bg_thresh - ign_index = (iou_map < self.fg_thresh) * ( - iou_map > self.bg_thresh) - tconf[fg_index] = 1 - tconf[bg_index] = 0 - tconf[ign_index] = -1 - - gt_index = gt_index_map[fg_index] - gt_box_list = tboxes[gt_index] - gt_id_list = gt_ide[gt_index_map[id_index]] - - if np.sum(fg_index) > 0: - tid[id_index] = gt_id_list - - fg_anchor_list = anchor_list.reshape(nA, nGh, nGw, - 4)[fg_index] - delta_target = self.encode_delta(gt_box_list, - fg_anchor_list) - tbox[fg_index] = delta_target - - sample['tbox{}'.format(i)] = tbox - sample['tconf{}'.format(i)] = tconf - sample['tide{}'.format(i)] = tid - sample.pop('gt_class') - sample = 
self.pad_box(sample, num_max)
-        return samples
-
-
-@register_op
-class Gt2JDETargetMax(BaseOperator):
-    __shared__ = ['num_classes']
-    """
-    Generate JDE targets by ground truth data when evaluating
-    Args:
-        anchors (list): anchors of JDE model
-        anchor_masks (list): anchor_masks of JDE model
-        downsample_ratios (list): downsample ratios of JDE model
-        max_iou_thresh (float): iou thresh for high quality anchor
-        num_classes (int): number of classes
-    """
-
-    def __init__(self,
-                 anchors,
-                 anchor_masks,
-                 downsample_ratios,
-                 max_iou_thresh=0.60,
-                 num_classes=1):
-        super(Gt2JDETargetMax, self).__init__()
-        self.anchors = anchors
-        self.anchor_masks = anchor_masks
-        self.downsample_ratios = downsample_ratios
-        self.max_iou_thresh = max_iou_thresh
-        self.num_classes = num_classes
-
-    def __call__(self, samples, context=None):
-        assert len(self.anchor_masks) == len(self.downsample_ratios), \
-            "'anchor_masks' and 'downsample_ratios' should have the same length."
-        h, w = samples[0]['image'].shape[1:3]
-        for sample in samples:
-            gt_bbox = sample['gt_bbox']
-            gt_ide = sample['gt_ide']
-            for i, (anchor_hw, downsample_ratio
-                    ) in enumerate(zip(self.anchors, self.downsample_ratios)):
-                anchor_hw = np.array(
-                    anchor_hw, dtype=np.float32) / downsample_ratio
-                nA = len(anchor_hw)
-                nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
-                tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
-                tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
-                tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
-
-                gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
-                gxy[:, 0] = gxy[:, 0] * nGw
-                gxy[:, 1] = gxy[:, 1] * nGh
-                gwh[:, 0] = gwh[:, 0] * nGw
-                gwh[:, 1] = gwh[:, 1] * nGh
-                gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int)
-                gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int)
-
-                # iou of targets-anchors (using wh only)
-                box1 = gwh
-                box2 = anchor_hw[:, None, :]
-                inter_area = np.minimum(box1, box2).prod(2)
-                iou = inter_area / (
-                    box1.prod(1) + box2.prod(2) - inter_area + 1e-16)
-
-                # Select best iou_pred and anchor
-                iou_best = iou.max(0)  # best anchor [0-2] for each target
-                a = np.argmax(iou, axis=0)
-
-                # Select best unique target-anchor combinations
-                iou_order = np.argsort(-iou_best)  # best to worst
-
-                # Unique anchor selection
-                u = np.stack((gi, gj, a), 0)[:, iou_order]
-                _, first_unique = np.unique(u, axis=1, return_index=True)
-                mask = iou_order[first_unique]
-                # best anchor must share significant commonality (iou) with target
-                # TODO: examine arbitrary threshold
-                idx = mask[iou_best[mask] > self.max_iou_thresh]
-
-                if len(idx) > 0:
-                    a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx]
-                    t_box = gt_bbox[idx]
-                    t_id = gt_ide[idx]
-                    if len(t_box.shape) == 1:
-                        t_box = t_box.reshape(1, 4)
-
-                    gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy()
-                    gxy[:, 0] = gxy[:, 0] * nGw
-                    gxy[:, 1] = gxy[:, 1] * nGh
-                    gwh[:, 0] = gwh[:, 0] * nGw
-                    gwh[:, 1] = gwh[:, 1] * nGh
-
-                    # XY coordinates
-                    tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int)
-                    # Width and height in yolo method
-                    tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh /
-                                                                 anchor_hw[a_i])
-                    tconf[a_i, gj_i, gi_i] = 1
-                    tid[a_i, gj_i, gi_i] = t_id
-
-                sample['tbox{}'.format(i)] = tbox
-                sample['tconf{}'.format(i)] = tconf
-                sample['tide{}'.format(i)] = tid
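[Editor's note] The "wh-only" IoU in Gt2JDETargetMax above deliberately ignores box centers and compares shapes only, which is the usual YOLO-style anchor assignment: each target is claimed by the anchor whose width/height match it best. A minimal standalone sketch of that computation, with hypothetical anchor/target sizes (not taken from any config in this repo):

import numpy as np

anchor_hw = np.array([[8., 6.], [16., 12.], [32., 24.]])  # nA x 2, grid units
gwh = np.array([[10., 7.], [30., 20.]])                   # nT x 2, grid units

inter = np.minimum(gwh, anchor_hw[:, None, :]).prod(2)    # nA x nT overlap area
iou = inter / (gwh.prod(1) + anchor_hw[:, None, :].prod(2) - inter + 1e-16)
best_anchor = iou.argmax(0)  # index of the best-matching anchor per target

For these toy values the 8x6 anchor wins the 10x7 target and the 32x24 anchor wins the 30x20 one; targets whose best IoU stays below max_iou_thresh are simply left unassigned.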
-
-
-class Gt2FairMOTTarget(Gt2TTFTarget):
-    __shared__ = ['num_classes']
-    """
-    Generate FairMOT targets by ground truth data.
-    The differences between Gt2FairMOTTarget and Gt2TTFTarget are:
-        1. the Gaussian kernel radius used to generate a heatmap.
-        2. the targets needed during training.
-
-    Args:
-        num_classes (int): the number of classes.
-        down_ratio (int): the down ratio from images to heatmap, 4 by default.
-        max_objs (int): the maximum number of ground truth objects in an image, 500 by default.
-    """
-
-    def __init__(self, num_classes=1, down_ratio=4, max_objs=500):
-        super(Gt2TTFTarget, self).__init__()
-        self.down_ratio = down_ratio
-        self.num_classes = num_classes
-        self.max_objs = max_objs
-
-    def __call__(self, samples, context=None):
-        for b_id, sample in enumerate(samples):
-            output_h = sample['image'].shape[1] // self.down_ratio
-            output_w = sample['image'].shape[2] // self.down_ratio
-
-            heatmap = np.zeros(
-                (self.num_classes, output_h, output_w), dtype='float32')
-            bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32)
-            center_offset = np.zeros((self.max_objs, 2), dtype=np.float32)
-            index = np.zeros((self.max_objs, ), dtype=np.int64)
-            index_mask = np.zeros((self.max_objs, ), dtype=np.int32)
-            reid = np.zeros((self.max_objs, ), dtype=np.int64)
-            bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32)
-            if self.num_classes > 1:
-                # each category corresponds to a set of track ids
-                cls_tr_ids = np.zeros(
-                    (self.num_classes, output_h, output_w), dtype=np.int64)
-                cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64)
-
-            gt_bbox = sample['gt_bbox']
-            gt_class = sample['gt_class']
-            gt_ide = sample['gt_ide']
-
-            for k in range(len(gt_bbox)):
-                cls_id = gt_class[k][0]
-                bbox = gt_bbox[k]
-                ide = gt_ide[k][0]
-                bbox[[0, 2]] = bbox[[0, 2]] * output_w
-                bbox[[1, 3]] = bbox[[1, 3]] * output_h
-                bbox_amodal = copy.deepcopy(bbox)
-                bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2.
-                bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2.
-                bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2]
-                bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3]
-                bbox[0] = np.clip(bbox[0], 0, output_w - 1)
-                bbox[1] = np.clip(bbox[1], 0, output_h - 1)
-                h = bbox[3]
-                w = bbox[2]
-
-                bbox_xy = copy.deepcopy(bbox)
-                bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2
-                bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2
-                bbox_xy[2] = bbox_xy[0] + bbox_xy[2]
-                bbox_xy[3] = bbox_xy[1] + bbox_xy[3]
-
-                if h > 0 and w > 0:
-                    radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
-                    radius = max(0, int(radius))
-                    ct = np.array([bbox[0], bbox[1]], dtype=np.float32)
-                    ct_int = ct.astype(np.int32)
-                    self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius,
-                                                radius)
-                    bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \
-                        bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1]
-
-                    index[k] = ct_int[1] * output_w + ct_int[0]
-                    center_offset[k] = ct - ct_int
-                    index_mask[k] = 1
-                    reid[k] = ide
-                    bbox_xys[k] = bbox_xy
-                    if self.num_classes > 1:
-                        cls_id_map[ct_int[1], ct_int[0]] = cls_id
-                        cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1
-                        # track id start from 0
-
-            sample['heatmap'] = heatmap
-            sample['index'] = index
-            sample['offset'] = center_offset
-            sample['size'] = bbox_size
-            sample['index_mask'] = index_mask
-            sample['reid'] = reid
-            if self.num_classes > 1:
-                sample['cls_id_map'] = cls_id_map
-                sample['cls_tr_ids'] = cls_tr_ids
-            sample['bbox_xys'] = bbox_xys
-            sample.pop('is_crowd', None)
-            sample.pop('difficult', None)
-            sample.pop('gt_class', None)
-            sample.pop('gt_bbox', None)
-            sample.pop('gt_score', None)
-            sample.pop('gt_ide', None)
-        return samples
diff --git a/pdfdet/models/Paddle/ppdet/data/transform/op_helper.py b/pdfdet/models/Paddle/ppdet/data/transform/op_helper.py
deleted file mode 100644
index 6c40030..0000000
---
a/pdfdet/models/Paddle/ppdet/data/transform/op_helper.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# this file contains helper methods for BBOX processing - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import random -import math -import cv2 - - -def meet_emit_constraint(src_bbox, sample_bbox): - center_x = (src_bbox[2] + src_bbox[0]) / 2 - center_y = (src_bbox[3] + src_bbox[1]) / 2 - if center_x >= sample_bbox[0] and \ - center_x <= sample_bbox[2] and \ - center_y >= sample_bbox[1] and \ - center_y <= sample_bbox[3]: - return True - return False - - -def clip_bbox(src_bbox): - src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0) - src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0) - src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0) - src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0) - return src_bbox - - -def bbox_area(src_bbox): - if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]: - return 0. - else: - width = src_bbox[2] - src_bbox[0] - height = src_bbox[3] - src_bbox[1] - return width * height - - -def is_overlap(object_bbox, sample_bbox): - if object_bbox[0] >= sample_bbox[2] or \ - object_bbox[2] <= sample_bbox[0] or \ - object_bbox[1] >= sample_bbox[3] or \ - object_bbox[3] <= sample_bbox[1]: - return False - else: - return True - - -def filter_and_process(sample_bbox, bboxes, labels, scores=None, - keypoints=None): - new_bboxes = [] - new_labels = [] - new_scores = [] - new_keypoints = [] - new_kp_ignore = [] - for i in range(len(bboxes)): - new_bbox = [0, 0, 0, 0] - obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]] - if not meet_emit_constraint(obj_bbox, sample_bbox): - continue - if not is_overlap(obj_bbox, sample_bbox): - continue - sample_width = sample_bbox[2] - sample_bbox[0] - sample_height = sample_bbox[3] - sample_bbox[1] - new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width - new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height - new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width - new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height - new_bbox = clip_bbox(new_bbox) - if bbox_area(new_bbox) > 0: - new_bboxes.append(new_bbox) - new_labels.append([labels[i][0]]) - if scores is not None: - new_scores.append([scores[i][0]]) - if keypoints is not None: - sample_keypoint = keypoints[0][i] - for j in range(len(sample_keypoint)): - kp_len = sample_height if j % 2 else sample_width - sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0] - sample_keypoint[j] = ( - sample_keypoint[j] - sample_coord) / kp_len - sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0) - new_keypoints.append(sample_keypoint) - new_kp_ignore.append(keypoints[1][i]) - - bboxes = np.array(new_bboxes) - labels = np.array(new_labels) - scores = np.array(new_scores) - if keypoints is not None: - keypoints = np.array(new_keypoints) - new_kp_ignore = np.array(new_kp_ignore) - 
return bboxes, labels, scores, (keypoints, new_kp_ignore) - return bboxes, labels, scores - - -def bbox_area_sampling(bboxes, labels, scores, target_size, min_size): - new_bboxes = [] - new_labels = [] - new_scores = [] - for i, bbox in enumerate(bboxes): - w = float((bbox[2] - bbox[0]) * target_size) - h = float((bbox[3] - bbox[1]) * target_size) - if w * h < float(min_size * min_size): - continue - else: - new_bboxes.append(bbox) - new_labels.append(labels[i]) - if scores is not None and scores.size != 0: - new_scores.append(scores[i]) - bboxes = np.array(new_bboxes) - labels = np.array(new_labels) - scores = np.array(new_scores) - return bboxes, labels, scores - - -def generate_sample_bbox(sampler): - scale = np.random.uniform(sampler[2], sampler[3]) - aspect_ratio = np.random.uniform(sampler[4], sampler[5]) - aspect_ratio = max(aspect_ratio, (scale**2.0)) - aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) - bbox_width = scale * (aspect_ratio**0.5) - bbox_height = scale / (aspect_ratio**0.5) - xmin_bound = 1 - bbox_width - ymin_bound = 1 - bbox_height - xmin = np.random.uniform(0, xmin_bound) - ymin = np.random.uniform(0, ymin_bound) - xmax = xmin + bbox_width - ymax = ymin + bbox_height - sampled_bbox = [xmin, ymin, xmax, ymax] - return sampled_bbox - - -def generate_sample_bbox_square(sampler, image_width, image_height): - scale = np.random.uniform(sampler[2], sampler[3]) - aspect_ratio = np.random.uniform(sampler[4], sampler[5]) - aspect_ratio = max(aspect_ratio, (scale**2.0)) - aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) - bbox_width = scale * (aspect_ratio**0.5) - bbox_height = scale / (aspect_ratio**0.5) - if image_height < image_width: - bbox_width = bbox_height * image_height / image_width - else: - bbox_height = bbox_width * image_width / image_height - xmin_bound = 1 - bbox_width - ymin_bound = 1 - bbox_height - xmin = np.random.uniform(0, xmin_bound) - ymin = np.random.uniform(0, ymin_bound) - xmax = xmin + bbox_width - ymax = ymin + bbox_height - sampled_bbox = [xmin, ymin, xmax, ymax] - return sampled_bbox - - -def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array, - resize_width): - num_gt = len(bbox_labels) - # np.random.randint range: [low, high) - rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0 - - if num_gt != 0: - norm_xmin = bbox_labels[rand_idx][0] - norm_ymin = bbox_labels[rand_idx][1] - norm_xmax = bbox_labels[rand_idx][2] - norm_ymax = bbox_labels[rand_idx][3] - - xmin = norm_xmin * image_width - ymin = norm_ymin * image_height - wid = image_width * (norm_xmax - norm_xmin) - hei = image_height * (norm_ymax - norm_ymin) - range_size = 0 - - area = wid * hei - for scale_ind in range(0, len(scale_array) - 1): - if area > scale_array[scale_ind] ** 2 and area < \ - scale_array[scale_ind + 1] ** 2: - range_size = scale_ind + 1 - break - - if area > scale_array[len(scale_array) - 2]**2: - range_size = len(scale_array) - 2 - - scale_choose = 0.0 - if range_size == 0: - rand_idx_size = 0 - else: - # np.random.randint range: [low, high) - rng_rand_size = np.random.randint(0, range_size + 1) - rand_idx_size = rng_rand_size % (range_size + 1) - - if rand_idx_size == range_size: - min_resize_val = scale_array[rand_idx_size] / 2.0 - max_resize_val = min(2.0 * scale_array[rand_idx_size], - 2 * math.sqrt(wid * hei)) - scale_choose = random.uniform(min_resize_val, max_resize_val) - else: - min_resize_val = scale_array[rand_idx_size] / 2.0 - max_resize_val = 2.0 * scale_array[rand_idx_size] - scale_choose = 
random.uniform(min_resize_val, max_resize_val) - - sample_bbox_size = wid * resize_width / scale_choose - - w_off_orig = 0.0 - h_off_orig = 0.0 - if sample_bbox_size < max(image_height, image_width): - if wid <= sample_bbox_size: - w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, - xmin) - else: - w_off_orig = np.random.uniform(xmin, - xmin + wid - sample_bbox_size) - - if hei <= sample_bbox_size: - h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, - ymin) - else: - h_off_orig = np.random.uniform(ymin, - ymin + hei - sample_bbox_size) - - else: - w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) - h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0) - - w_off_orig = math.floor(w_off_orig) - h_off_orig = math.floor(h_off_orig) - - # Figure out top left coordinates. - w_off = float(w_off_orig / image_width) - h_off = float(h_off_orig / image_height) - - sampled_bbox = [ - w_off, h_off, w_off + float(sample_bbox_size / image_width), - h_off + float(sample_bbox_size / image_height) - ] - return sampled_bbox - else: - return 0 - - -def jaccard_overlap(sample_bbox, object_bbox): - if sample_bbox[0] >= object_bbox[2] or \ - sample_bbox[2] <= object_bbox[0] or \ - sample_bbox[1] >= object_bbox[3] or \ - sample_bbox[3] <= object_bbox[1]: - return 0 - intersect_xmin = max(sample_bbox[0], object_bbox[0]) - intersect_ymin = max(sample_bbox[1], object_bbox[1]) - intersect_xmax = min(sample_bbox[2], object_bbox[2]) - intersect_ymax = min(sample_bbox[3], object_bbox[3]) - intersect_size = (intersect_xmax - intersect_xmin) * ( - intersect_ymax - intersect_ymin) - sample_bbox_size = bbox_area(sample_bbox) - object_bbox_size = bbox_area(object_bbox) - overlap = intersect_size / ( - sample_bbox_size + object_bbox_size - intersect_size) - return overlap - - -def intersect_bbox(bbox1, bbox2): - if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \ - bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]: - intersection_box = [0.0, 0.0, 0.0, 0.0] - else: - intersection_box = [ - max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]), - min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3]) - ] - return intersection_box - - -def bbox_coverage(bbox1, bbox2): - inter_box = intersect_bbox(bbox1, bbox2) - intersect_size = bbox_area(inter_box) - - if intersect_size > 0: - bbox1_size = bbox_area(bbox1) - return intersect_size / bbox1_size - else: - return 0. 
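[Editor's note] jaccard_overlap and bbox_coverage above answer different questions: IoU normalizes the intersection by the union of both boxes, while coverage normalizes by the first box's own area, so a small object fully inside a large crop has low IoU but coverage 1.0. A standalone restatement with toy normalized boxes (it mirrors the helpers above rather than importing them):

def iou(b1, b2):
    # intersection over union of [xmin, ymin, xmax, ymax] boxes
    ix = max(0.0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
    iy = max(0.0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
    inter = ix * iy
    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    a2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    return inter / (a1 + a2 - inter)

def coverage(b1, b2):
    # fraction of b1's area covered by b2
    ix = max(0.0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
    iy = max(0.0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    return ix * iy / a1

gt = [0.40, 0.40, 0.50, 0.50]    # small object
crop = [0.25, 0.25, 0.75, 0.75]  # candidate crop window
print(iou(gt, crop))       # 0.04: tiny relative to the union
print(coverage(gt, crop))  # 1.0: yet the crop fully contains it

This is why satisfy_sample_constraint_coverage below checks sampler slots 6-7 (IoU bounds) and 8-9 (coverage bounds) separately.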
- - -def satisfy_sample_constraint(sampler, - sample_bbox, - gt_bboxes, - satisfy_all=False): - if sampler[6] == 0 and sampler[7] == 0: - return True - satisfied = [] - for i in range(len(gt_bboxes)): - object_bbox = [ - gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] - ] - overlap = jaccard_overlap(sample_bbox, object_bbox) - if sampler[6] != 0 and \ - overlap < sampler[6]: - satisfied.append(False) - continue - if sampler[7] != 0 and \ - overlap > sampler[7]: - satisfied.append(False) - continue - satisfied.append(True) - if not satisfy_all: - return True - - if satisfy_all: - return np.all(satisfied) - else: - return False - - -def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes): - if sampler[6] == 0 and sampler[7] == 0: - has_jaccard_overlap = False - else: - has_jaccard_overlap = True - if sampler[8] == 0 and sampler[9] == 0: - has_object_coverage = False - else: - has_object_coverage = True - - if not has_jaccard_overlap and not has_object_coverage: - return True - found = False - for i in range(len(gt_bboxes)): - object_bbox = [ - gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] - ] - if has_jaccard_overlap: - overlap = jaccard_overlap(sample_bbox, object_bbox) - if sampler[6] != 0 and \ - overlap < sampler[6]: - continue - if sampler[7] != 0 and \ - overlap > sampler[7]: - continue - found = True - if has_object_coverage: - object_coverage = bbox_coverage(object_bbox, sample_bbox) - if sampler[8] != 0 and \ - object_coverage < sampler[8]: - continue - if sampler[9] != 0 and \ - object_coverage > sampler[9]: - continue - found = True - if found: - return True - return found - - -def crop_image_sampling(img, sample_bbox, image_width, image_height, - target_size): - # no clipping here - xmin = int(sample_bbox[0] * image_width) - xmax = int(sample_bbox[2] * image_width) - ymin = int(sample_bbox[1] * image_height) - ymax = int(sample_bbox[3] * image_height) - - w_off = xmin - h_off = ymin - width = xmax - xmin - height = ymax - ymin - cross_xmin = max(0.0, float(w_off)) - cross_ymin = max(0.0, float(h_off)) - cross_xmax = min(float(w_off + width - 1.0), float(image_width)) - cross_ymax = min(float(h_off + height - 1.0), float(image_height)) - cross_width = cross_xmax - cross_xmin - cross_height = cross_ymax - cross_ymin - - roi_xmin = 0 if w_off >= 0 else abs(w_off) - roi_ymin = 0 if h_off >= 0 else abs(h_off) - roi_width = cross_width - roi_height = cross_height - - roi_y1 = int(roi_ymin) - roi_y2 = int(roi_ymin + roi_height) - roi_x1 = int(roi_xmin) - roi_x2 = int(roi_xmin + roi_width) - - cross_y1 = int(cross_ymin) - cross_y2 = int(cross_ymin + cross_height) - cross_x1 = int(cross_xmin) - cross_x2 = int(cross_xmin + cross_width) - - sample_img = np.zeros((height, width, 3)) - sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ - img[cross_y1: cross_y2, cross_x1: cross_x2] - - sample_img = cv2.resize( - sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) - - return sample_img - - -def is_poly(segm): - assert isinstance(segm, (list, dict)), \ - "Invalid segm type: {}".format(type(segm)) - return isinstance(segm, list) - - -def gaussian_radius(bbox_size, min_overlap): - height, width = bbox_size - - a1 = 1 - b1 = (height + width) - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = np.sqrt(b1**2 - 4 * a1 * c1) - radius1 = (b1 + sq1) / (2 * a1) - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = np.sqrt(b2**2 - 4 * a2 * c2) - radius2 = (b2 + sq2) / 2 - - a3 = 4 * 
min_overlap - b3 = -2 * min_overlap * (height + width) - c3 = (min_overlap - 1) * width * height - sq3 = np.sqrt(b3**2 - 4 * a3 * c3) - radius3 = (b3 + sq3) / 2 - return min(radius1, radius2, radius3) - - -def draw_gaussian(heatmap, center, radius, k=1, delte=6): - diameter = 2 * radius + 1 - sigma = diameter / delte - gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma) - - x, y = center - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - - -def gaussian2D(shape, sigma_x=1, sigma_y=1): - m, n = [(ss - 1.) / 2. for ss in shape] - y, x = np.ogrid[-m:m + 1, -n:n + 1] - - h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * - sigma_y))) - h[h < np.finfo(h.dtype).eps * h.max()] = 0 - return h - - -def draw_umich_gaussian(heatmap, center, radius, k=1): - """ - draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 - """ - diameter = 2 * radius + 1 - gaussian = gaussian2D( - (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - return heatmap - - -def get_border(border, size): - i = 1 - while size - border // i <= border // i: - i *= 2 - return border // i diff --git a/pdfdet/models/Paddle/ppdet/data/transform/operators.py b/pdfdet/models/Paddle/ppdet/data/transform/operators.py deleted file mode 100644 index 5c51a93..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/operators.py +++ /dev/null @@ -1,4148 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# function: -# operators to process sample, -# eg: decode/resize/crop image - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -from numbers import Number, Integral - -import uuid -import random -import math -import numpy as np -import os -import copy -import logging -import cv2 -from PIL import Image, ImageDraw, ImageEnhance -import pickle -import threading -MUTEX = threading.Lock() - -import paddle -from ppdet.core.workspace import serializable -from ..reader import Compose - -from .op_helper import (satisfy_sample_constraint, filter_and_process, - generate_sample_bbox, clip_bbox, data_anchor_sampling, - satisfy_sample_constraint_coverage, crop_image_sampling, - generate_sample_bbox_square, bbox_area_sampling, - is_poly, get_border) - -from ppdet.utils.logger import setup_logger -from ppdet.utils.compact import imagedraw_textsize_c - -from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform -logger = setup_logger(__name__) - -registered_ops = [] - - -def register_op(cls): - registered_ops.append(cls.__name__) - if not hasattr(BaseOperator, cls.__name__): - setattr(BaseOperator, cls.__name__, cls) - else: - raise KeyError("The {} class has been registered.".format(cls.__name__)) - return serializable(cls) - - -class BboxError(ValueError): - pass - - -class ImageError(ValueError): - pass - - -class BaseOperator(object): - def __init__(self, name=None): - if name is None: - name = self.__class__.__name__ - self._id = name + '_' + str(uuid.uuid4())[-6:] - - def apply(self, sample, context=None): - """ Process a sample. - Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - return sample - - def __call__(self, sample, context=None): - """ Process a sample. 
- Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - if isinstance(sample, Sequence): - for i in range(len(sample)): - sample[i] = self.apply(sample[i], context) - else: - sample = self.apply(sample, context) - return sample - - def __str__(self): - return str(self._id) - - -@register_op -class Decode(BaseOperator): - def __init__(self): - """ Transform the image data to numpy format following the rgb format - """ - super(Decode, self).__init__() - - def apply(self, sample, context=None): - """ load image if 'im_file' field is not empty but 'image' is""" - if 'image' not in sample: - with open(sample['im_file'], 'rb') as f: - sample['image'] = f.read() - sample.pop('im_file') - - try: - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - except: - im = sample['image'] - - sample['image'] = im - if 'h' not in sample: - sample['h'] = im.shape[0] - elif sample['h'] != im.shape[0]: - logger.warning( - "The actual image height: {} is not equal to the " - "height: {} in annotation, and update sample['h'] by actual " - "image height.".format(im.shape[0], sample['h'])) - sample['h'] = im.shape[0] - if 'w' not in sample: - sample['w'] = im.shape[1] - elif sample['w'] != im.shape[1]: - logger.warning( - "The actual image width: {} is not equal to the " - "width: {} in annotation, and update sample['w'] by actual " - "image width.".format(im.shape[1], sample['w'])) - sample['w'] = im.shape[1] - - sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) - sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) - return sample - - -def _make_dirs(dirname): - try: - from pathlib import Path - except ImportError: - from pathlib2 import Path - Path(dirname).mkdir(exist_ok=True) - - -@register_op -class DecodeCache(BaseOperator): - def __init__(self, cache_root=None): - '''decode image and caching - ''' - super(DecodeCache, self).__init__() - - self.use_cache = False if cache_root is None else True - self.cache_root = cache_root - - if cache_root is not None: - _make_dirs(cache_root) - - def apply(self, sample, context=None): - - if self.use_cache and os.path.exists( - self.cache_path(self.cache_root, sample['im_file'])): - path = self.cache_path(self.cache_root, sample['im_file']) - im = self.load(path) - - else: - if 'image' not in sample: - with open(sample['im_file'], 'rb') as f: - sample['image'] = f.read() - - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - - if self.use_cache and not os.path.exists( - self.cache_path(self.cache_root, sample['im_file'])): - path = self.cache_path(self.cache_root, sample['im_file']) - self.dump(im, path) - - sample['image'] = im - sample['h'] = im.shape[0] - sample['w'] = im.shape[1] - - sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) - sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) - - sample.pop('im_file') - - return sample - - @staticmethod - def cache_path(dir_oot, im_file): - return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') - - @staticmethod - def load(path): - with open(path, 'rb') as f: - 
im = pickle.load(f) - return im - - @staticmethod - def dump(obj, path): - MUTEX.acquire() - try: - with open(path, 'wb') as f: - pickle.dump(obj, f) - - except Exception as e: - logger.warning('dump {} occurs exception {}'.format(path, str(e))) - - finally: - MUTEX.release() - - -@register_op -class SniperDecodeCrop(BaseOperator): - def __init__(self): - super(SniperDecodeCrop, self).__init__() - - def __call__(self, sample, context=None): - if 'image' not in sample: - with open(sample['im_file'], 'rb') as f: - sample['image'] = f.read() - sample.pop('im_file') - - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - - chip = sample['chip'] - x1, y1, x2, y2 = [int(xi) for xi in chip] - im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[ - 1]), :] - - sample['image'] = im - h = im.shape[0] - w = im.shape[1] - # sample['im_info'] = [h, w, 1.0] - sample['h'] = h - sample['w'] = w - - sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) - sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) - return sample - - -@register_op -class Permute(BaseOperator): - def __init__(self): - """ - Change the channel to be (C, H, W) - """ - super(Permute, self).__init__() - - def apply(self, sample, context=None): - im = sample['image'] - im = im.transpose((2, 0, 1)) - sample['image'] = im - - if 'pre_image' in sample: - pre_im = sample['pre_image'] - pre_im = pre_im.transpose((2, 0, 1)) - sample['pre_image'] = pre_im - return sample - - -@register_op -class Lighting(BaseOperator): - """ - Lighting the image by eigenvalues and eigenvectors - Args: - eigval (list): eigenvalues - eigvec (list): eigenvectors - alphastd (float): random weight of lighting, 0.1 by default - """ - - def __init__(self, eigval, eigvec, alphastd=0.1): - super(Lighting, self).__init__() - self.alphastd = alphastd - self.eigval = np.array(eigval).astype('float32') - self.eigvec = np.array(eigvec).astype('float32') - - def apply(self, sample, context=None): - alpha = np.random.normal(scale=self.alphastd, size=(3, )) - sample['image'] += np.dot(self.eigvec, self.eigval * alpha) - - if 'pre_image' in sample: - sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha) - return sample - - -@register_op -class RandomErasingImage(BaseOperator): - def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3): - """ - Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896 - Args: - prob (float): probability to carry out random erasing - lower (float): lower limit of the erasing area ratio - higher (float): upper limit of the erasing area ratio - aspect_ratio (float): aspect ratio of the erasing region - """ - super(RandomErasingImage, self).__init__() - self.prob = prob - self.lower = lower - self.higher = higher - self.aspect_ratio = aspect_ratio - - def apply(self, sample, context=None): - gt_bbox = sample['gt_bbox'] - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image is not a numpy array.".format(self)) - if len(im.shape) != 3: - raise ImageError("{}: image is not 3-dimensional.".format(self)) - - for idx in range(gt_bbox.shape[0]): - if self.prob <= np.random.rand(): - continue - - x1, y1, x2, y2 = gt_bbox[idx, :] - w_bbox = x2 - x1 - h_bbox = y2 - y1 - area = w_bbox * h_bbox - - target_area = random.uniform(self.lower, 
self.higher) * area - aspect_ratio = random.uniform(self.aspect_ratio, - 1 / self.aspect_ratio) - - h = int(round(math.sqrt(target_area * aspect_ratio))) - w = int(round(math.sqrt(target_area / aspect_ratio))) - - if w < w_bbox and h < h_bbox: - off_y1 = random.randint(0, int(h_bbox - h)) - off_x1 = random.randint(0, int(w_bbox - w)) - im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int( - x1 + off_x1 + w), :] = 0 - sample['image'] = im - return sample - - -@register_op -class NormalizeImage(BaseOperator): - def __init__(self, - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225], - is_scale=True, - norm_type='mean_std'): - """ - Args: - mean (list): the pixel mean - std (list): the pixel variance - is_scale (bool): scale the pixel to [0,1] - norm_type (str): type in ['mean_std', 'none'] - """ - super(NormalizeImage, self).__init__() - self.mean = mean - self.std = std - self.is_scale = is_scale - self.norm_type = norm_type - if not (isinstance(self.mean, list) and isinstance(self.std, list) and - isinstance(self.is_scale, bool) and - self.norm_type in ['mean_std', 'none']): - raise TypeError("{}: input type is invalid.".format(self)) - from functools import reduce - if reduce(lambda x, y: x * y, self.std) == 0: - raise ValueError('{}: std is invalid!'.format(self)) - - def apply(self, sample, context=None): - """Normalize the image. - Operators: - 1.(optional) Scale the pixel to [0,1] - 2.(optional) Each pixel minus mean and is divided by std - """ - im = sample['image'] - - im = im.astype(np.float32, copy=False) - if self.is_scale: - scale = 1.0 / 255.0 - im *= scale - - if self.norm_type == 'mean_std': - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - im -= mean - im /= std - - sample['image'] = im - - if 'pre_image' in sample: - pre_im = sample['pre_image'] - pre_im = pre_im.astype(np.float32, copy=False) - if self.is_scale: - scale = 1.0 / 255.0 - pre_im *= scale - - if self.norm_type == 'mean_std': - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - pre_im -= mean - pre_im /= std - sample['pre_image'] = pre_im - - return sample - - -@register_op -class GridMask(BaseOperator): - def __init__(self, - use_h=True, - use_w=True, - rotate=1, - offset=False, - ratio=0.5, - mode=1, - prob=0.7, - upper_iter=360000): - """ - GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086 - Args: - use_h (bool): whether to mask vertically - use_w (boo;): whether to mask horizontally - rotate (float): angle for the mask to rotate - offset (float): mask offset - ratio (float): mask ratio - mode (int): gridmask mode - prob (float): max probability to carry out gridmask - upper_iter (int): suggested to be equal to global max_iter - """ - super(GridMask, self).__init__() - self.use_h = use_h - self.use_w = use_w - self.rotate = rotate - self.offset = offset - self.ratio = ratio - self.mode = mode - self.prob = prob - self.upper_iter = upper_iter - - from .gridmask_utils import Gridmask - self.gridmask_op = Gridmask( - use_h, - use_w, - rotate=rotate, - offset=offset, - ratio=ratio, - mode=mode, - prob=prob, - upper_iter=upper_iter) - - def apply(self, sample, context=None): - sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter']) - return sample - - -@register_op -class RandomDistort(BaseOperator): - """Random color distortion. - Args: - hue (list): hue settings. in [lower, upper, probability] format. - saturation (list): saturation settings. 
in [lower, upper, probability] format. - contrast (list): contrast settings. in [lower, upper, probability] format. - brightness (list): brightness settings. in [lower, upper, probability] format. - random_apply (bool): whether to apply in random (yolo) or fixed (SSD) order. - count (int): the number of doing distrot. - random_channel (bool): whether to swap channels randomly. - prob (float): the probability of enhancing the sample. - """ - - def __init__(self, - hue=[-18, 18, 0.5], - saturation=[0.5, 1.5, 0.5], - contrast=[0.5, 1.5, 0.5], - brightness=[0.5, 1.5, 0.5], - random_apply=True, - count=4, - random_channel=False, - prob=1.0): - super(RandomDistort, self).__init__() - self.hue = hue - self.saturation = saturation - self.contrast = contrast - self.brightness = brightness - self.random_apply = random_apply - self.count = count - self.random_channel = random_channel - self.prob = prob - - def apply_hue(self, img): - low, high, prob = self.hue - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = np.array(img.convert('HSV')) - img[:, :, 0] = img[:, :, 0] + delta - img = Image.fromarray(img, mode='HSV').convert('RGB') - return img - - def apply_saturation(self, img): - low, high, prob = self.saturation - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = ImageEnhance.Color(img).enhance(delta) - return img - - def apply_contrast(self, img): - low, high, prob = self.contrast - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = ImageEnhance.Contrast(img).enhance(delta) - return img - - def apply_brightness(self, img): - low, high, prob = self.brightness - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = ImageEnhance.Brightness(img).enhance(delta) - return img - - def apply(self, sample, context=None): - if random.random() > self.prob: - return sample - img = sample['image'] - img = Image.fromarray(img.astype(np.uint8)) - if self.random_apply: - functions = [ - self.apply_brightness, self.apply_contrast, - self.apply_saturation, self.apply_hue - ] - distortions = np.random.permutation(functions)[:self.count] - for func in distortions: - img = func(img) - img = np.asarray(img).astype(np.float32) - sample['image'] = img - return sample - - img = self.apply_brightness(img) - mode = np.random.randint(0, 2) - if mode: - img = self.apply_contrast(img) - img = self.apply_saturation(img) - img = self.apply_hue(img) - if not mode: - img = self.apply_contrast(img) - - img = np.asarray(img).astype(np.float32) - if self.random_channel: - if np.random.randint(0, 2): - img = img[..., np.random.permutation(3)] - sample['image'] = img - return sample - - -@register_op -class PhotoMetricDistortion(BaseOperator): - """Apply photometric distortion to image sequentially, every transformation - is applied with a probability of 0.5. The position of random contrast is in - second or second to last. - - 1. random brightness - 2. random contrast (mode 0) - 3. convert color from BGR to HSV - 4. random saturation - 5. random hue - 6. convert color from HSV to BGR - 7. random contrast (mode 1) - 8. randomly swap channels - - Args: - brightness_delta (int): delta of brightness. - contrast_range (tuple): range of contrast. - saturation_range (tuple): range of saturation. - hue_delta (int): delta of hue. 
- """ - - def __init__(self, - brightness_delta=32, - contrast_range=(0.5, 1.5), - saturation_range=(0.5, 1.5), - hue_delta=18): - super(PhotoMetricDistortion, self).__init__() - self.brightness_delta = brightness_delta - self.contrast_lower, self.contrast_upper = contrast_range - self.saturation_lower, self.saturation_upper = saturation_range - self.hue_delta = hue_delta - - def apply(self, results, context=None): - """Call function to perform photometric distortion on images. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Result dict with images distorted. - """ - - img = results['image'] - img = img.astype(np.float32) - # random brightness - if np.random.randint(2): - delta = np.random.uniform(-self.brightness_delta, - self.brightness_delta) - img += delta - - # mode == 0 --> do random contrast first - # mode == 1 --> do random contrast last - mode = np.random.randint(2) - if mode == 1: - if np.random.randint(2): - alpha = np.random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # convert color from BGR to HSV - img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) - - # random saturation - if np.random.randint(2): - img[..., 1] *= np.random.uniform(self.saturation_lower, - self.saturation_upper) - - # random hue - if np.random.randint(2): - img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta) - img[..., 0][img[..., 0] > 360] -= 360 - img[..., 0][img[..., 0] < 0] += 360 - - # convert color from HSV to BGR - img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR) - - # random contrast - if mode == 0: - if np.random.randint(2): - alpha = np.random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # randomly swap channels - if np.random.randint(2): - img = img[..., np.random.permutation(3)] - - results['image'] = img - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' - repr_str += 'contrast_range=' - repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' - repr_str += 'saturation_range=' - repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' - repr_str += f'hue_delta={self.hue_delta})' - return repr_str - - -@register_op -class AutoAugment(BaseOperator): - def __init__(self, autoaug_type="v1"): - """ - Args: - autoaug_type (str): autoaug type, support v0, v1, v2, v3, test - """ - super(AutoAugment, self).__init__() - self.autoaug_type = autoaug_type - - def apply(self, sample, context=None): - """ - Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172 - """ - im = sample['image'] - gt_bbox = sample['gt_bbox'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image is not a numpy array.".format(self)) - if len(im.shape) != 3: - raise ImageError("{}: image is not 3-dimensional.".format(self)) - if len(gt_bbox) == 0: - return sample - - height, width, _ = im.shape - norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32) - norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height) - norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width) - norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height) - norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width) - - from .autoaugment_utils import distort_image_with_autoaugment - im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox, - self.autoaug_type) - - gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width) - gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height) - gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width) - gt_bbox[:, 3] = 
norm_gt_bbox[:, 2] * float(height) - - sample['image'] = im - sample['gt_bbox'] = gt_bbox - return sample - - -@register_op -class RandomFlip(BaseOperator): - def __init__(self, prob=0.5): - """ - Args: - prob (float): the probability of flipping image - """ - super(RandomFlip, self).__init__() - self.prob = prob - if not (isinstance(self.prob, float)): - raise TypeError("{}: input type is invalid.".format(self)) - - def apply_segm(self, segms, height, width): - def _flip_poly(poly, width): - flipped_poly = np.array(poly) - flipped_poly[0::2] = width - np.array(poly[0::2]) - return flipped_poly.tolist() - - def _flip_rle(rle, height, width): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, height, width) - mask = mask_util.decode(rle) - mask = mask[:, ::-1] - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - flipped_segms = [] - for segm in segms: - if is_poly(segm): - # Polygon format - flipped_segms.append([_flip_poly(poly, width) for poly in segm]) - else: - # RLE format - import pycocotools.mask as mask_util - flipped_segms.append(_flip_rle(segm, height, width)) - return flipped_segms - - def apply_keypoint(self, gt_keypoint, width): - for i in range(gt_keypoint.shape[1]): - if i % 2 == 0: - old_x = gt_keypoint[:, i].copy() - gt_keypoint[:, i] = width - old_x - return gt_keypoint - - def apply_image(self, image): - return image[:, ::-1, :] - - def apply_bbox(self, bbox, width): - oldx1 = bbox[:, 0].copy() - oldx2 = bbox[:, 2].copy() - bbox[:, 0] = width - oldx2 - bbox[:, 2] = width - oldx1 - return bbox - - def apply(self, sample, context=None): - """Filp the image and bounding box. - Operators: - 1. Flip the image numpy. - 2. Transform the bboxes' x coordinates. - (Must judge whether the coordinates are normalized!) - 3. Transform the segmentations' x coordinates. - (Must judge whether the coordinates are normalized!) - Output: - sample: the image, bounding box and segmentation part - in sample are flipped. - """ - if np.random.uniform(0, 1) < self.prob: - im = sample['image'] - height, width = im.shape[:2] - im = self.apply_image(im) - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, - width) - if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: - sample['gt_keypoint'] = self.apply_keypoint( - sample['gt_keypoint'], width) - - if 'semantic' in sample and sample['semantic']: - sample['semantic'] = sample['semantic'][:, ::-1] - - if 'gt_segm' in sample and sample['gt_segm'].any(): - sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] - - sample['flipped'] = True - sample['image'] = im - return sample - - -@register_op -class Resize(BaseOperator): - def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): - """ - Resize image to target size. if keep_ratio is True, - resize the image's long side to the maximum of target_size - if keep_ratio is False, resize the image to target size(h, w) - Args: - target_size (int|list): image target size - keep_ratio (bool): whether keep_ratio or not, default true - interp (int): the interpolation method - """ - super(Resize, self).__init__() - self.keep_ratio = keep_ratio - self.interp = interp - if not isinstance(target_size, (Integral, Sequence)): - raise TypeError( - "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". 
- format(type(target_size))) - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - - def apply_image(self, image, scale): - im_scale_x, im_scale_y = scale - - return cv2.resize( - image, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - - def apply_bbox(self, bbox, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - bbox[:, 0::2] *= im_scale_x - bbox[:, 1::2] *= im_scale_y - bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) - bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) - return bbox - - def apply_area(self, area, scale): - im_scale_x, im_scale_y = scale - return area * im_scale_x * im_scale_y - - def apply_joints(self, joints, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - joints[..., 0] *= im_scale_x - joints[..., 1] *= im_scale_y - joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) - joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) - return joints - - def apply_segm(self, segms, im_size, scale): - def _resize_poly(poly, im_scale_x, im_scale_y): - resized_poly = np.array(poly).astype('float32') - resized_poly[0::2] *= im_scale_x - resized_poly[1::2] *= im_scale_y - return resized_poly.tolist() - - def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, im_h, im_w) - - mask = mask_util.decode(rle) - mask = cv2.resize( - mask, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - im_h, im_w = im_size - im_scale_x, im_scale_y = scale - resized_segms = [] - for segm in segms: - if is_poly(segm): - # Polygon format - resized_segms.append([ - _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm - ]) - else: - # RLE format - import pycocotools.mask as mask_util - resized_segms.append( - _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) - - return resized_segms - - def apply(self, sample, context=None): - """ Resize the image numpy. 
- """ - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - - # apply image - if len(im.shape) == 3: - im_shape = im.shape - else: - im_shape = im[0].shape - - if self.keep_ratio: - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - - target_size_min = np.min(self.target_size) - target_size_max = np.max(self.target_size) - - im_scale = min(target_size_min / im_size_min, - target_size_max / im_size_max) - - resize_h = int(im_scale * float(im_shape[0]) + 0.5) - resize_w = int(im_scale * float(im_shape[1]) + 0.5) - else: - resize_h, resize_w = self.target_size - - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - if len(im.shape) == 3: - im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im.astype(np.float32) - else: - resized_images = [] - for one_im in im: - applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y]) - resized_images.append(applied_im) - - sample['image'] = np.array(resized_images) - - # 2d keypoints resize - if 'kps2d' in sample.keys(): - kps2d = sample['kps2d'] - kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x - kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y - - sample['kps2d'] = kps2d - - sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - # apply areas - if 'gt_areas' in sample: - sample['gt_areas'] = self.apply_area(sample['gt_areas'], - [im_scale_x, im_scale_y]) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], - [im_scale_x, im_scale_y]) - - # apply semantic - if 'semantic' in sample and sample['semantic']: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic - - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - masks = [ - cv2.resize( - gt_segm, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=cv2.INTER_NEAREST) - for gt_segm in sample['gt_segm'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - - if 'gt_joints' in sample: - sample['gt_joints'] = self.apply_joints(sample['gt_joints'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - return sample - - -@register_op -class MultiscaleTestResize(BaseOperator): - def __init__(self, - origin_target_size=[800, 1333], - target_size=[], - interp=cv2.INTER_LINEAR, - use_flip=True): - """ - Rescale image to the each size in target size, and capped at max_size. - Args: - origin_target_size (list): origin target size of image - target_size (list): A list of target sizes of image. - interp (int): the interpolation method. - use_flip (bool): whether use flip augmentation. 
- """ - super(MultiscaleTestResize, self).__init__() - self.interp = interp - self.use_flip = use_flip - - if not isinstance(target_size, Sequence): - raise TypeError( - "Type of target_size is invalid. Must be List or Tuple, now is {}". - format(type(target_size))) - self.target_size = target_size - - if not isinstance(origin_target_size, Sequence): - raise TypeError( - "Type of origin_target_size is invalid. Must be List or Tuple, now is {}". - format(type(origin_target_size))) - - self.origin_target_size = origin_target_size - - def apply(self, sample, context=None): - """ Resize the image numpy for multi-scale test. - """ - samples = [] - resizer = Resize( - self.origin_target_size, keep_ratio=True, interp=self.interp) - samples.append(resizer(sample.copy(), context)) - if self.use_flip: - flipper = RandomFlip(1.1) - samples.append(flipper(sample.copy(), context=context)) - - for size in self.target_size: - resizer = Resize(size, keep_ratio=True, interp=self.interp) - samples.append(resizer(sample.copy(), context)) - - return samples - - -@register_op -class RandomResize(BaseOperator): - def __init__(self, - target_size, - keep_ratio=True, - interp=cv2.INTER_LINEAR, - random_range=False, - random_size=True, - random_interp=False): - """ - Resize image to target size randomly. random target_size and interpolation method - Args: - target_size (int, list, tuple): image target size, if random size is True, must be list or tuple - keep_ratio (bool): whether keep_raio or not, default true - interp (int): the interpolation method - random_range (bool): whether random select target size of image, the target_size must be - a [[min_short_edge, long_edge], [max_short_edge, long_edge]] - random_size (bool): whether random select target size of image - random_interp (bool): whether random select interpolation method - """ - super(RandomResize, self).__init__() - self.keep_ratio = keep_ratio - self.interp = interp - self.interps = [ - cv2.INTER_NEAREST, - cv2.INTER_LINEAR, - cv2.INTER_AREA, - cv2.INTER_CUBIC, - cv2.INTER_LANCZOS4, - ] - assert isinstance(target_size, ( - Integral, Sequence)), "target_size must be Integer, List or Tuple" - if (random_range or random_size) and not isinstance(target_size, - Sequence): - raise TypeError( - "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}". - format(type(target_size))) - if random_range and not len(target_size) == 2: - raise TypeError( - "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True." - ) - self.target_size = target_size - self.random_range = random_range - self.random_size = random_size - self.random_interp = random_interp - - def apply(self, sample, context=None): - """ Resize the image numpy. - """ - if self.random_range: - short_edge = np.random.randint(self.target_size[0][0], - self.target_size[1][0] + 1) - long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1) - target_size = [short_edge, long_edge] - else: - if self.random_size: - target_size = random.choice(self.target_size) - else: - target_size = self.target_size - - if self.random_interp: - interp = random.choice(self.interps) - else: - interp = self.interp - - resizer = Resize(target_size, self.keep_ratio, interp) - return resizer(sample, context=context) - - -@register_op -class RandomExpand(BaseOperator): - """Random expand the canvas. - Args: - ratio (float): maximum expansion ratio. - prob (float): probability to expand. 


-@register_op
-class RandomExpand(BaseOperator):
-    """Randomly expand the canvas.
-    Args:
-        ratio (float): maximum expansion ratio.
-        prob (float): probability to expand.
-        fill_value (list): color value used to fill the canvas, in RGB order.
-    """
-
-    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
-        super(RandomExpand, self).__init__()
-        assert ratio > 1.01, "expand ratio must be larger than 1.01"
-        self.ratio = ratio
-        self.prob = prob
-        assert isinstance(fill_value, (Number, Sequence)), \
-            "fill value must be either float or sequence"
-        if isinstance(fill_value, Number):
-            fill_value = (fill_value, ) * 3
-        if not isinstance(fill_value, tuple):
-            fill_value = tuple(fill_value)
-        self.fill_value = fill_value
-
-    def apply(self, sample, context=None):
-        if np.random.uniform(0., 1.) < self.prob:
-            return sample
-
-        im = sample['image']
-        height, width = im.shape[:2]
-        ratio = np.random.uniform(1., self.ratio)
-        h = int(height * ratio)
-        w = int(width * ratio)
-        if not h > height or not w > width:
-            return sample
-        y = np.random.randint(0, h - height)
-        x = np.random.randint(0, w - width)
-        offsets, size = [x, y], [h, w]
-
-        pad = Pad(size,
-                  pad_mode=-1,
-                  offsets=offsets,
-                  fill_value=self.fill_value)
-
-        return pad(sample, context=context)
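
[Editor's note] The batch_sampler tuples consumed by CropWithSampling below are positional: [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap]. A self-contained sketch of how one entry could be turned into a normalized candidate crop (a hypothetical helper mirroring, not reproducing, ppdet's generate_sample_bbox):

import numpy as np

def sample_crop(sampler):
    _max_sample, _max_trial, smin, smax, armin, armax, _omin, _omax = sampler
    scale = np.random.uniform(smin, smax)
    # clamp aspect ratio so the crop stays inside the unit square
    ar = np.random.uniform(max(armin, scale**2), min(armax, scale**-2))
    w, h = scale * np.sqrt(ar), scale / np.sqrt(ar)
    x = np.random.uniform(0., 1. - w)
    y = np.random.uniform(0., 1. - h)
    return [x, y, x + w, y + h]  # normalized [xmin, ymin, xmax, ymax]

print(sample_crop([1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0]))
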
- """ - assert 'image' in sample, "image data not found" - im = sample['image'] - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - im_height, im_width = im.shape[:2] - gt_score = None - if 'gt_score' in sample: - gt_score = sample['gt_score'] - sampled_bbox = [] - gt_bbox = gt_bbox.tolist() - for sampler in self.batch_sampler: - found = 0 - for i in range(sampler[1]): - if found >= sampler[0]: - break - sample_bbox = generate_sample_bbox(sampler) - if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox, - self.satisfy_all): - sampled_bbox.append(sample_bbox) - found = found + 1 - im = np.array(im) - while sampled_bbox: - idx = int(np.random.uniform(0, len(sampled_bbox))) - sample_bbox = sampled_bbox.pop(idx) - sample_bbox = clip_bbox(sample_bbox) - crop_bbox, crop_class, crop_score = \ - filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score) - if self.avoid_no_bbox: - if len(crop_bbox) < 1: - continue - xmin = int(sample_bbox[0] * im_width) - xmax = int(sample_bbox[2] * im_width) - ymin = int(sample_bbox[1] * im_height) - ymax = int(sample_bbox[3] * im_height) - im = im[ymin:ymax, xmin:xmax] - sample['image'] = im - sample['gt_bbox'] = crop_bbox - sample['gt_class'] = crop_class - sample['gt_score'] = crop_score - return sample - return sample - - -@register_op -class CropWithDataAchorSampling(BaseOperator): - def __init__(self, - batch_sampler, - anchor_sampler=None, - target_size=None, - das_anchor_scales=[16, 32, 64, 128], - sampling_prob=0.5, - min_size=8., - avoid_no_bbox=True): - """ - Args: - anchor_sampler (list): anchor_sampling sets of different - parameters for cropping. - batch_sampler (list): Multiple sets of different - parameters for cropping. - e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] - [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] - [max sample, max trial, min scale, max scale, - min aspect ratio, max aspect ratio, - min overlap, max overlap, min coverage, max coverage] - target_size (int): target image size. - das_anchor_scales (list[float]): a list of anchor scales in data - anchor smapling. - min_size (float): minimum size of sampled bbox. - avoid_no_bbox (bool): whether to avoid the - situation where the box does not appear. - """ - super(CropWithDataAchorSampling, self).__init__() - self.anchor_sampler = anchor_sampler - self.batch_sampler = batch_sampler - self.target_size = target_size - self.sampling_prob = sampling_prob - self.min_size = min_size - self.avoid_no_bbox = avoid_no_bbox - self.das_anchor_scales = np.array(das_anchor_scales) - - def apply(self, sample, context): - """ - Crop the image and modify bounding box. - Operators: - 1. Scale the image width and height. - 2. Crop the image according to a radom sample. - 3. Rescale the bounding box. - 4. Determine if the new bbox is satisfied in the new image. - Returns: - sample: the image, bounding box are replaced. 
- """ - assert 'image' in sample, "image data not found" - im = sample['image'] - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - image_height, image_width = im.shape[:2] - gt_bbox[:, 0] /= image_width - gt_bbox[:, 1] /= image_height - gt_bbox[:, 2] /= image_width - gt_bbox[:, 3] /= image_height - gt_score = None - if 'gt_score' in sample: - gt_score = sample['gt_score'] - sampled_bbox = [] - gt_bbox = gt_bbox.tolist() - - prob = np.random.uniform(0., 1.) - if prob > self.sampling_prob: # anchor sampling - assert self.anchor_sampler - for sampler in self.anchor_sampler: - found = 0 - for i in range(sampler[1]): - if found >= sampler[0]: - break - sample_bbox = data_anchor_sampling( - gt_bbox, image_width, image_height, - self.das_anchor_scales, self.target_size) - if sample_bbox == 0: - break - if satisfy_sample_constraint_coverage(sampler, sample_bbox, - gt_bbox): - sampled_bbox.append(sample_bbox) - found = found + 1 - im = np.array(im) - while sampled_bbox: - idx = int(np.random.uniform(0, len(sampled_bbox))) - sample_bbox = sampled_bbox.pop(idx) - - if 'gt_keypoint' in sample.keys(): - keypoints = (sample['gt_keypoint'], - sample['keypoint_ignore']) - crop_bbox, crop_class, crop_score, gt_keypoints = \ - filter_and_process(sample_bbox, gt_bbox, gt_class, - scores=gt_score, - keypoints=keypoints) - else: - crop_bbox, crop_class, crop_score = filter_and_process( - sample_bbox, gt_bbox, gt_class, scores=gt_score) - crop_bbox, crop_class, crop_score = bbox_area_sampling( - crop_bbox, crop_class, crop_score, self.target_size, - self.min_size) - - if self.avoid_no_bbox: - if len(crop_bbox) < 1: - continue - im = crop_image_sampling(im, sample_bbox, image_width, - image_height, self.target_size) - height, width = im.shape[:2] - crop_bbox[:, 0] *= width - crop_bbox[:, 1] *= height - crop_bbox[:, 2] *= width - crop_bbox[:, 3] *= height - sample['image'] = im - sample['gt_bbox'] = crop_bbox - sample['gt_class'] = crop_class - if 'gt_score' in sample: - sample['gt_score'] = crop_score - if 'gt_keypoint' in sample.keys(): - sample['gt_keypoint'] = gt_keypoints[0] - sample['keypoint_ignore'] = gt_keypoints[1] - return sample - return sample - - else: - for sampler in self.batch_sampler: - found = 0 - for i in range(sampler[1]): - if found >= sampler[0]: - break - sample_bbox = generate_sample_bbox_square( - sampler, image_width, image_height) - if satisfy_sample_constraint_coverage(sampler, sample_bbox, - gt_bbox): - sampled_bbox.append(sample_bbox) - found = found + 1 - im = np.array(im) - while sampled_bbox: - idx = int(np.random.uniform(0, len(sampled_bbox))) - sample_bbox = sampled_bbox.pop(idx) - sample_bbox = clip_bbox(sample_bbox) - - if 'gt_keypoint' in sample.keys(): - keypoints = (sample['gt_keypoint'], - sample['keypoint_ignore']) - crop_bbox, crop_class, crop_score, gt_keypoints = \ - filter_and_process(sample_bbox, gt_bbox, gt_class, - scores=gt_score, - keypoints=keypoints) - else: - crop_bbox, crop_class, crop_score = filter_and_process( - sample_bbox, gt_bbox, gt_class, scores=gt_score) - # sampling bbox according the bbox area - crop_bbox, crop_class, crop_score = bbox_area_sampling( - crop_bbox, crop_class, crop_score, self.target_size, - self.min_size) - - if self.avoid_no_bbox: - if len(crop_bbox) < 1: - continue - xmin = int(sample_bbox[0] * image_width) - xmax = int(sample_bbox[2] * image_width) - ymin = int(sample_bbox[1] * image_height) - ymax = int(sample_bbox[3] * image_height) - im = im[ymin:ymax, xmin:xmax] - height, width = im.shape[:2] - crop_bbox[:, 0] 
*= width - crop_bbox[:, 1] *= height - crop_bbox[:, 2] *= width - crop_bbox[:, 3] *= height - sample['image'] = im - sample['gt_bbox'] = crop_bbox - sample['gt_class'] = crop_class - if 'gt_score' in sample: - sample['gt_score'] = crop_score - if 'gt_keypoint' in sample.keys(): - sample['gt_keypoint'] = gt_keypoints[0] - sample['keypoint_ignore'] = gt_keypoints[1] - return sample - return sample - - -@register_op -class RandomCrop(BaseOperator): - """Random crop image and bboxes. - Args: - aspect_ratio (list): aspect ratio of cropped region. - in [min, max] format. - thresholds (list): iou thresholds for decide a valid bbox crop. - scaling (list): ratio between a cropped region and the original image. - in [min, max] format. - num_attempts (int): number of tries before giving up. - allow_no_crop (bool): allow return without actually cropping them. - cover_all_box (bool): ensure all bboxes are covered in the final crop. - is_mask_crop(bool): whether crop the segmentation. - """ - - def __init__(self, - aspect_ratio=[.5, 2.], - thresholds=[.0, .1, .3, .5, .7, .9], - scaling=[.3, 1.], - num_attempts=50, - allow_no_crop=True, - cover_all_box=False, - is_mask_crop=False, - ioumode="iou", - prob=1.0): - super(RandomCrop, self).__init__() - self.aspect_ratio = aspect_ratio - self.thresholds = thresholds - self.scaling = scaling - self.num_attempts = num_attempts - self.allow_no_crop = allow_no_crop - self.cover_all_box = cover_all_box - self.is_mask_crop = is_mask_crop - self.ioumode = ioumode - self.prob = prob - - def crop_segms(self, segms, valid_ids, crop, height, width): - def _crop_poly(segm, crop): - xmin, ymin, xmax, ymax = crop - crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] - crop_p = np.array(crop_coord).reshape(4, 2) - crop_p = Polygon(crop_p) - - crop_segm = list() - for poly in segm: - poly = np.array(poly).reshape(len(poly) // 2, 2) - polygon = Polygon(poly) - if not polygon.is_valid: - exterior = polygon.exterior - multi_lines = exterior.intersection(exterior) - polygons = shapely.ops.polygonize(multi_lines) - polygon = MultiPolygon(polygons) - multi_polygon = list() - if isinstance(polygon, MultiPolygon): - multi_polygon = copy.deepcopy(polygon) - else: - multi_polygon.append(copy.deepcopy(polygon)) - for per_polygon in multi_polygon: - inter = per_polygon.intersection(crop_p) - if not inter: - continue - if isinstance(inter, (MultiPolygon, GeometryCollection)): - for part in inter: - if not isinstance(part, Polygon): - continue - part = np.squeeze( - np.array(part.exterior.coords[:-1]).reshape(1, - -1)) - part[0::2] -= xmin - part[1::2] -= ymin - crop_segm.append(part.tolist()) - elif isinstance(inter, Polygon): - crop_poly = np.squeeze( - np.array(inter.exterior.coords[:-1]).reshape(1, -1)) - crop_poly[0::2] -= xmin - crop_poly[1::2] -= ymin - crop_segm.append(crop_poly.tolist()) - else: - continue - return crop_segm - - def _crop_rle(rle, crop, height, width): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, height, width) - mask = mask_util.decode(rle) - mask = mask[crop[1]:crop[3], crop[0]:crop[2]] - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - crop_segms = [] - for id in valid_ids: - segm = segms[id] - if is_poly(segm): - import copy - import shapely.ops - from shapely.geometry import Polygon, MultiPolygon, GeometryCollection - logging.getLogger("shapely").setLevel(logging.WARNING) - # Polygon format - crop_segms.append(_crop_poly(segm, crop)) - else: - # RLE format - import 
pycocotools.mask as mask_util - crop_segms.append(_crop_rle(segm, crop, height, width)) - return crop_segms - - def set_fake_bboxes(self, sample): - sample['gt_bbox'] = np.array( - [ - [32, 32, 128, 128], - [32, 32, 128, 256], - [32, 64, 128, 128], - [32, 64, 128, 256], - [64, 64, 128, 256], - [64, 64, 256, 256], - [64, 32, 128, 256], - [64, 32, 128, 256], - [96, 32, 128, 256], - [96, 32, 128, 256], - ], - dtype=np.float32) - sample['gt_class'] = np.array( - [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32) - return sample - - def apply(self, sample, context=None): - if random.random() > self.prob: - return sample - - if 'gt_bbox' not in sample: - # only used in semi-det as unsup data - sample = self.set_fake_bboxes(sample) - sample = self.random_crop(sample, fake_bboxes=True) - del sample['gt_bbox'] - del sample['gt_class'] - return sample - - if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: - return sample - sample = self.random_crop(sample) - return sample - - def random_crop(self, sample, fake_bboxes=False): - h, w = sample['image'].shape[:2] - gt_bbox = sample['gt_bbox'] - - # NOTE Original method attempts to generate one candidate for each - # threshold then randomly sample one from the resulting list. - # Here a short circuit approach is taken, i.e., randomly choose a - # threshold and attempt to find a valid crop, and simply return the - # first one found. - # The probability is not exactly the same, kinda resembling the - # "Monty Hall" problem. Actually carrying out the attempts will affect - # observability (just like opening doors in the "Monty Hall" game). - thresholds = list(self.thresholds) - if self.allow_no_crop: - thresholds.append('no_crop') - np.random.shuffle(thresholds) - - for thresh in thresholds: - if thresh == 'no_crop': - return sample - - found = False - for i in range(self.num_attempts): - scale = np.random.uniform(*self.scaling) - if self.aspect_ratio is not None: - min_ar, max_ar = self.aspect_ratio - aspect_ratio = np.random.uniform( - max(min_ar, scale**2), min(max_ar, scale**-2)) - h_scale = scale / np.sqrt(aspect_ratio) - w_scale = scale * np.sqrt(aspect_ratio) - else: - h_scale = np.random.uniform(*self.scaling) - w_scale = np.random.uniform(*self.scaling) - crop_h = h * h_scale - crop_w = w * w_scale - if self.aspect_ratio is None: - if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: - continue - - crop_h = int(crop_h) - crop_w = int(crop_w) - crop_y = np.random.randint(0, h - crop_h) - crop_x = np.random.randint(0, w - crop_w) - crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] - if self.ioumode == "iof": - iou = self._gtcropiou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - elif self.ioumode == "iou": - iou = self._iou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - if iou.max() < thresh: - continue - - if self.cover_all_box and iou.min() < thresh: - continue - - cropped_box, valid_ids = self._crop_box_with_center_constraint( - gt_bbox, np.array( - crop_box, dtype=np.float32)) - if valid_ids.size > 0: - found = True - break - - if found: - if self.is_mask_crop and 'gt_poly' in sample and len(sample[ - 'gt_poly']) > 0: - crop_polys = self.crop_segms( - sample['gt_poly'], - valid_ids, - np.array( - crop_box, dtype=np.int64), - h, - w) - if [] in crop_polys: - delete_id = list() - valid_polys = list() - for id, crop_poly in enumerate(crop_polys): - if crop_poly == []: - delete_id.append(id) - else: - valid_polys.append(crop_poly) - valid_ids = np.delete(valid_ids, delete_id) - if 
len(valid_polys) == 0: - return sample - sample['gt_poly'] = valid_polys - else: - sample['gt_poly'] = crop_polys - - if 'gt_segm' in sample: - sample['gt_segm'] = self._crop_segm(sample['gt_segm'], - crop_box) - sample['gt_segm'] = np.take( - sample['gt_segm'], valid_ids, axis=0) - - sample['image'] = self._crop_image(sample['image'], crop_box) - if fake_bboxes == True: - return sample - - sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) - sample['gt_class'] = np.take( - sample['gt_class'], valid_ids, axis=0) - if 'gt_score' in sample: - sample['gt_score'] = np.take( - sample['gt_score'], valid_ids, axis=0) - - if 'is_crowd' in sample: - sample['is_crowd'] = np.take( - sample['is_crowd'], valid_ids, axis=0) - - if 'difficult' in sample: - sample['difficult'] = np.take( - sample['difficult'], valid_ids, axis=0) - - if 'gt_joints' in sample: - sample['gt_joints'] = self._crop_joints(sample['gt_joints'], - crop_box) - - return sample - - return sample - - def _iou_matrix(self, a, b): - tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) - br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) - - area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) - area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) - area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) - area_o = (area_a[:, np.newaxis] + area_b - area_i) - return area_i / (area_o + 1e-10) - - def _gtcropiou_matrix(self, a, b): - tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) - br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) - - area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) - area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) - area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) - area_o = (area_a[:, np.newaxis] + area_b - area_i) - return area_i / (area_a + 1e-10) - - def _crop_box_with_center_constraint(self, box, crop): - cropped_box = box.copy() - - cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) - cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) - cropped_box[:, :2] -= crop[:2] - cropped_box[:, 2:] -= crop[:2] - - centers = (box[:, :2] + box[:, 2:]) / 2 - valid = np.logical_and(crop[:2] <= centers, - centers < crop[2:]).all(axis=1) - valid = np.logical_and( - valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) - - return cropped_box, np.where(valid)[0] - - def _crop_image(self, img, crop): - x1, y1, x2, y2 = crop - return img[y1:y2, x1:x2, :] - - def _crop_segm(self, segm, crop): - x1, y1, x2, y2 = crop - return segm[:, y1:y2, x1:x2] - - def _crop_joints(self, joints, crop): - x1, y1, x2, y2 = crop - joints[joints[..., 0] > x2, :] = 0 - joints[joints[..., 1] > y2, :] = 0 - joints[joints[..., 0] < x1, :] = 0 - joints[joints[..., 1] < y1, :] = 0 - joints[..., 0] -= x1 - joints[..., 1] -= y1 - return joints - - -@register_op -class RandomScaledCrop(BaseOperator): - """Resize image and bbox based on long side (with optional random scaling), - then crop or pad image to target size. - Args: - target_size (int|list): target size, "hw" format. - scale_range (list): random scale range. - interp (int): interpolation method, default to `cv2.INTER_LINEAR`. - fill_value (float|list|tuple): color value used to fill the canvas, - in RGB order. 
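
[Editor's note] The _crop_box_with_center_constraint helper above keeps a gt box only if its center falls inside the crop window, then clips and shifts its coordinates into the window's frame. A small standalone illustration (editor's names, not ppdet API):

import numpy as np

def crop_boxes_keep_centered(boxes, crop):
    crop = np.asarray(crop, dtype=np.float32)
    out = boxes.copy()
    out[:, :2] = np.maximum(boxes[:, :2], crop[:2])   # clip to window
    out[:, 2:] = np.minimum(boxes[:, 2:], crop[2:])
    out[:, :2] -= crop[:2]                            # shift to window frame
    out[:, 2:] -= crop[:2]
    centers = (boxes[:, :2] + boxes[:, 2:]) / 2
    keep = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1)
    keep &= (out[:, :2] < out[:, 2:]).all(axis=1)     # drop degenerate boxes
    return out[keep], np.where(keep)[0]

boxes = np.array([[10, 10, 50, 50], [90, 90, 120, 120]], dtype=np.float32)
print(crop_boxes_keep_centered(boxes, [0, 0, 60, 60]))  # keeps only the first
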
- """ - - def __init__(self, - target_size=512, - scale_range=[.1, 2.], - interp=cv2.INTER_LINEAR, - fill_value=(123.675, 116.28, 103.53)): - super(RandomScaledCrop, self).__init__() - assert isinstance(target_size, ( - Integral, Sequence)), "target_size must be Integer, List or Tuple" - if isinstance(target_size, Integral): - target_size = [target_size, ] * 2 - - self.target_size = target_size - self.scale_range = scale_range - self.interp = interp - assert isinstance(fill_value, (Number, Sequence)), \ - "fill value must be either float or sequence" - if isinstance(fill_value, Number): - fill_value = (fill_value, ) * 3 - if not isinstance(fill_value, tuple): - fill_value = tuple(fill_value) - self.fill_value = fill_value - - def apply_image(self, img, output_size, offset_x, offset_y): - th, tw = self.target_size - rh, rw = output_size - img = cv2.resize( - img, (rw, rh), interpolation=self.interp).astype(np.float32) - canvas = np.ones([th, tw, 3], dtype=np.float32) - canvas *= np.array(self.fill_value, dtype=np.float32) - canvas[:min(th, rh), :min(tw, rw)] = \ - img[offset_y:offset_y + th, offset_x:offset_x + tw] - return canvas - - def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y): - th, tw = self.target_size - shift_array = np.array( - [ - offset_x, - offset_y, - ] * 2, dtype=np.float32) - boxes = gt_bbox * scale - shift_array - boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw) - boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th) - # filter boxes with no area - area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1) - valid = (area > 1.).nonzero()[0] - return boxes[valid], gt_class[valid], valid - - def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None): - th, tw = self.target_size - rh, rw = output_size - out_segms = [] - for segm in segms: - segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST) - segm = segm.astype(np.float32) - canvas = np.zeros([th, tw], dtype=segm.dtype) - canvas[:min(th, rh), :min(tw, rw)] = \ - segm[offset_y:offset_y + th, offset_x:offset_x + tw] - out_segms.append(canvas) - out_segms = np.stack(out_segms) - return out_segms if valid is None else out_segms[valid] - - def apply(self, sample, context=None): - img = sample['image'] - h, w = img.shape[:2] - random_scale = np.random.uniform(*self.scale_range) - target_scale_size = [t * random_scale for t in self.target_size] - # Compute actual rescaling applied to image. 
-        scale = min(target_scale_size[0] / h, target_scale_size[1] / w)
-        output_size = [int(round(h * scale)), int(round(w * scale))]
-        # get offset
-        offset_x = int(
-            max(0, np.random.uniform(0., output_size[1] - self.target_size[1])))
-        offset_y = int(
-            max(0, np.random.uniform(0., output_size[0] - self.target_size[0])))
-
-        # apply to image
-        sample['image'] = self.apply_image(img, output_size, offset_x, offset_y)
-
-        # apply to bbox
-        valid = None
-        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
-            sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox(
-                sample['gt_bbox'], sample['gt_class'], scale, offset_x,
-                offset_y)
-
-        # apply to segm
-        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
-            sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size,
-                                                offset_x, offset_y, valid)
-
-        sample['im_shape'] = np.asarray(output_size, dtype=np.float32)
-        scale_factor = sample['scale_factor']
-        sample['scale_factor'] = np.asarray(
-            [scale_factor[0] * scale, scale_factor[1] * scale],
-            dtype=np.float32)
-
-        return sample
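
[Editor's note] In the Cutmix op below, the per-box gt_score encodes the mix weight: boxes from the first image keep weight factor, pasted boxes from the second get 1 - factor. A tiny sketch (editor's helper):

import numpy as np

def cutmix_scores(n1, n2, factor):
    # mirrors the gt_score concatenation in Cutmix.__call__ below
    return np.concatenate([np.full(n1, factor), np.full(n2, 1.0 - factor)])

print(cutmix_scores(2, 3, np.random.beta(1.5, 1.5)))
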


-@register_op
-class Cutmix(BaseOperator):
-    def __init__(self, alpha=1.5, beta=1.5):
-        """
-        CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899
-        Cutmix image and gt_bbox/gt_score
-        Args:
-            alpha (float): alpha parameter of the beta distribution
-            beta (float): beta parameter of the beta distribution
-        """
-        super(Cutmix, self).__init__()
-        self.alpha = alpha
-        self.beta = beta
-        if self.alpha <= 0.0:
-            raise ValueError("alpha should be positive in {}".format(self))
-        if self.beta <= 0.0:
-            raise ValueError("beta should be positive in {}".format(self))
-
-    def apply_image(self, img1, img2, factor):
-        """ _rand_bbox """
-        h = max(img1.shape[0], img2.shape[0])
-        w = max(img1.shape[1], img2.shape[1])
-        cut_rat = np.sqrt(1. - factor)
-
-        cut_w = np.int32(w * cut_rat)
-        cut_h = np.int32(h * cut_rat)
-
-        # uniform
-        cx = np.random.randint(w)
-        cy = np.random.randint(h)
-
-        bbx1 = np.clip(cx - cut_w // 2, 0, w - 1)
-        bby1 = np.clip(cy - cut_h // 2, 0, h - 1)
-        bbx2 = np.clip(cx + cut_w // 2, 0, w - 1)
-        bby2 = np.clip(cy + cut_h // 2, 0, h - 1)
-
-        img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32')
-        img_1_pad[:img1.shape[0], :img1.shape[1], :] = \
-            img1.astype('float32')
-        img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32')
-        img_2_pad[:img2.shape[0], :img2.shape[1], :] = \
-            img2.astype('float32')
-        img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :]
-        return img_1_pad
-
-    def __call__(self, sample, context=None):
-        if not isinstance(sample, Sequence):
-            return sample
-
-        assert len(sample) == 2, 'cutmix need two samples'
-
-        factor = np.random.beta(self.alpha, self.beta)
-        factor = max(0.0, min(1.0, factor))
-        if factor >= 1.0:
-            return sample[0]
-        if factor <= 0.0:
-            return sample[1]
-        img1 = sample[0]['image']
-        img2 = sample[1]['image']
-        img = self.apply_image(img1, img2, factor)
-        gt_bbox1 = sample[0]['gt_bbox']
-        gt_bbox2 = sample[1]['gt_bbox']
-        gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
-        gt_class1 = sample[0]['gt_class']
-        gt_class2 = sample[1]['gt_class']
-        gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
-        gt_score1 = np.ones_like(sample[0]['gt_class'])
-        gt_score2 = np.ones_like(sample[1]['gt_class'])
-        gt_score = np.concatenate(
-            (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
-        result = copy.deepcopy(sample[0])
-        result['image'] = img
-        result['gt_bbox'] = gt_bbox
-        result['gt_score'] = gt_score
-        result['gt_class'] = gt_class
-        if 'is_crowd' in sample[0]:
-            is_crowd1 = sample[0]['is_crowd']
-            is_crowd2 = sample[1]['is_crowd']
-            is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
-            result['is_crowd'] = is_crowd
-        if 'difficult' in sample[0]:
-            is_difficult1 = sample[0]['difficult']
-            is_difficult2 = sample[1]['difficult']
-            is_difficult = np.concatenate(
-                (is_difficult1, is_difficult2), axis=0)
-            result['difficult'] = is_difficult
-        return result
-
-
-@register_op
-class Mixup(BaseOperator):
-    def __init__(self, alpha=1.5, beta=1.5):
-        """ Mixup image and gt_bbox/gt_score
-        Args:
-            alpha (float): alpha parameter of the beta distribution
-            beta (float): beta parameter of the beta distribution
-        """
-        super(Mixup, self).__init__()
-        self.alpha = alpha
-        self.beta = beta
-        if self.alpha <= 0.0:
-            raise ValueError("alpha should be positive in {}".format(self))
-        if self.beta <= 0.0:
-            raise ValueError("beta should be positive in {}".format(self))
-
-    def apply_image(self, img1, img2, factor):
-        h = max(img1.shape[0], img2.shape[0])
-        w = max(img1.shape[1], img2.shape[1])
-        img = np.zeros((h, w, img1.shape[2]), 'float32')
-        img[:img1.shape[0], :img1.shape[1], :] = \
-            img1.astype('float32') * factor
-        img[:img2.shape[0], :img2.shape[1], :] += \
-            img2.astype('float32') * (1.0 - factor)
-        return img.astype('uint8')
-
-    def __call__(self, sample, context=None):
-        if not isinstance(sample, Sequence):
-            return sample
-
-        assert len(sample) == 2, 'mixup need two samples'
-
-        factor = np.random.beta(self.alpha, self.beta)
-        factor = max(0.0, min(1.0, factor))
-        if factor >= 1.0:
-            return sample[0]
-        if factor <= 0.0:
-            return sample[1]
-        im = self.apply_image(sample[0]['image'], sample[1]['image'], factor)
-        result = copy.deepcopy(sample[0])
-        result['image'] = im
-        # apply bbox and score
-        if 'gt_bbox' in sample[0]:
-            gt_bbox1 = sample[0]['gt_bbox']
-            gt_bbox2 = sample[1]['gt_bbox']
-            gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
-            result['gt_bbox'] = gt_bbox
-        if 'gt_class' in sample[0]:
-            gt_class1 = sample[0]['gt_class']
-            gt_class2 = sample[1]['gt_class']
-            gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
-            result['gt_class'] = gt_class
-
-            gt_score1 = np.ones_like(sample[0]['gt_class'])
-            gt_score2 = np.ones_like(sample[1]['gt_class'])
-            gt_score = np.concatenate(
-                (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
-            result['gt_score'] = gt_score.astype('float32')
-        if 'is_crowd' in sample[0]:
-            is_crowd1 = sample[0]['is_crowd']
-            is_crowd2 = sample[1]['is_crowd']
-            is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
-            result['is_crowd'] = is_crowd
-        if 'difficult' in sample[0]:
-            is_difficult1 = sample[0]['difficult']
-            is_difficult2 = sample[1]['difficult']
-            is_difficult = np.concatenate(
-                (is_difficult1, is_difficult2), axis=0)
-            result['difficult'] = is_difficult
-
-        if 'gt_ide' in sample[0]:
-            gt_ide1 = sample[0]['gt_ide']
-            gt_ide2 = sample[1]['gt_ide']
-            gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0)
-            result['gt_ide'] = gt_ide
-        return result
-
-
-@register_op
-class NormalizeBox(BaseOperator):
-    """Transform the bounding box's coordinates to [0, 1]."""
-
-    def __init__(self):
-        super(NormalizeBox, self).__init__()
-
-    def apply(self, sample, context):
-        im = sample['image']
-        if 'gt_bbox' in sample.keys():
-            gt_bbox = sample['gt_bbox']
-            height, width, _ = im.shape
-            for i in range(gt_bbox.shape[0]):
-                gt_bbox[i][0] = gt_bbox[i][0] / width
-                gt_bbox[i][1] = gt_bbox[i][1] / height
-                gt_bbox[i][2] = gt_bbox[i][2] / width
-                gt_bbox[i][3] = gt_bbox[i][3] / height
-            sample['gt_bbox'] = gt_bbox
-
-            if 'gt_keypoint' in sample.keys():
-                gt_keypoint = sample['gt_keypoint']
-
-                for i in range(gt_keypoint.shape[1]):
-                    if i % 2:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] / height
-                    else:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] / width
-                sample['gt_keypoint'] = gt_keypoint
-
-            return sample
-        else:
-            return sample
-
-
-@register_op
-class BboxXYXY2XYWH(BaseOperator):
-    """
-    Convert bbox XYXY format to XYWH format.
-    """
-
-    def __init__(self):
-        super(BboxXYXY2XYWH, self).__init__()
-
-    def apply(self, sample, context=None):
-        if 'gt_bbox' in sample.keys():
-            bbox = sample['gt_bbox']
-            bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]
-            bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2.
-            sample['gt_bbox'] = bbox
-            return sample
-        else:
-            return sample
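
[Editor's note] Despite its name, BboxXYXY2XYWH above converts corner boxes to center format [center_x, center_y, w, h]: width/height are computed first, then the top-left corner is shifted by half of them. BboxCXCYWH2XYXY later in this file is its inverse. A minimal demo (editor's helper):

import numpy as np

def xyxy_to_cxcywh(bbox):
    out = bbox.copy()
    out[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]      # w, h
    out[:, :2] = bbox[:, :2] + out[:, 2:4] / 2.   # cx, cy
    return out

box = np.array([[10., 20., 50., 60.]])
print(xyxy_to_cxcywh(box))  # [[30. 40. 40. 40.]]
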


-@register_op
-class PadBox(BaseOperator):
-    def __init__(self, num_max_boxes=50):
-        """
-        Pad zeros to bboxes if number of bboxes is less than num_max_boxes.
-        Args:
-            num_max_boxes (int): the max number of bboxes
-        """
-        self.num_max_boxes = num_max_boxes
-        super(PadBox, self).__init__()
-
-    def apply(self, sample, context=None):
-        assert 'gt_bbox' in sample
-        bbox = sample['gt_bbox']
-        gt_num = min(self.num_max_boxes, len(bbox))
-        num_max = self.num_max_boxes
-        # fields = context['fields'] if context else []
-        pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
-        if gt_num > 0:
-            pad_bbox[:gt_num, :] = bbox[:gt_num, :]
-        sample['gt_bbox'] = pad_bbox
-        if 'gt_class' in sample:
-            pad_class = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_class[:gt_num] = sample['gt_class'][:gt_num, 0]
-            sample['gt_class'] = pad_class
-        if 'gt_score' in sample:
-            pad_score = np.zeros((num_max, ), dtype=np.float32)
-            if gt_num > 0:
-                pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]
-            sample['gt_score'] = pad_score
-        # in training, for example in op ExpandImage,
-        # the bbox and gt_class are expanded, but difficult is not,
-        # so judge by its length
-        if 'difficult' in sample:
-            pad_diff = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]
-            sample['difficult'] = pad_diff
-        if 'is_crowd' in sample:
-            pad_crowd = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]
-            sample['is_crowd'] = pad_crowd
-        if 'gt_ide' in sample:
-            pad_ide = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]
-            sample['gt_ide'] = pad_ide
-        return sample
-
-
-@register_op
-class DebugVisibleImage(BaseOperator):
-    """
-    In debug mode, visualize images according to `gt_box`.
-    (Currently only supported when not cropping or flipping the image.)
-    """
-
-    def __init__(self, output_dir='output/debug', is_normalized=False):
-        super(DebugVisibleImage, self).__init__()
-        self.is_normalized = is_normalized
-        self.output_dir = output_dir
-        if not os.path.isdir(output_dir):
-            os.makedirs(output_dir)
-        if not isinstance(self.is_normalized, bool):
-            raise TypeError("{}: input type is invalid.".format(self))
-
-    def apply(self, sample, context=None):
-        image = Image.fromarray(sample['image'].astype(np.uint8))
-        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
-        width = sample['w']
-        height = sample['h']
-        gt_bbox = sample['gt_bbox']
-        gt_class = sample['gt_class']
-        draw = ImageDraw.Draw(image)
-        for i in range(gt_bbox.shape[0]):
-            if self.is_normalized:
-                gt_bbox[i][0] = gt_bbox[i][0] * width
-                gt_bbox[i][1] = gt_bbox[i][1] * height
-                gt_bbox[i][2] = gt_bbox[i][2] * width
-                gt_bbox[i][3] = gt_bbox[i][3] * height
-
-            xmin, ymin, xmax, ymax = gt_bbox[i]
-            draw.line(
-                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
-                 (xmin, ymin)],
-                width=2,
-                fill='green')
-            # draw label
-            text = str(gt_class[i][0])
-            tw, th = imagedraw_textsize_c(draw, text)
-            draw.rectangle(
-                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
-            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
-
-        if 'gt_keypoint' in sample.keys():
-            gt_keypoint = sample['gt_keypoint']
-            if self.is_normalized:
-                for i in range(gt_keypoint.shape[1]):
-                    if i % 2:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] * height
-                    else:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] * width
-            for i in range(gt_keypoint.shape[0]):
-                keypoint = gt_keypoint[i]
-                for j in range(int(keypoint.shape[0] / 2)):
-                    x1 = round(keypoint[2 * j]).astype(np.int32)
-                    y1 = round(keypoint[2 * j + 1]).astype(np.int32)
-                    draw.ellipse(
-                        (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
-        save_path = os.path.join(self.output_dir, out_file_name)
-        image.save(save_path, quality=95)
-        return sample
-
-
-@register_op
-class Pad(BaseOperator):
-    def __init__(self,
-                 size=None,
-                 size_divisor=32,
-                 pad_mode=0,
-                 offsets=None,
-                 fill_value=(127.5, 127.5, 127.5)):
-        """
-        Pad image to a specified size or multiple of size_divisor.
-        Args:
-            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
-            size_divisor (int): size divisor, default 32
-            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
-                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
-            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
-            fill_value (float|list|tuple): rgb value of pad area, default (127.5, 127.5, 127.5)
-        """
-        super(Pad, self).__init__()
-
-        if size is not None and not isinstance(size, (int, Sequence)):
-            raise TypeError(
-                "Type of size is invalid. Must be int, List or Tuple, now is {}".
-                format(type(size)))
-
-        if isinstance(size, int):
-            size = [size, size]
-
-        assert pad_mode in [
-            -1, 0, 1, 2
-        ], 'currently only supports four modes [-1, 0, 1, 2]'
-        if pad_mode == -1:
-            assert offsets, 'if pad_mode is -1, offsets should not be None'
-
-        self.size = size
-        self.size_divisor = size_divisor
-        self.pad_mode = pad_mode
-        self.fill_value = fill_value
-        self.offsets = offsets
-
-    def apply_segm(self, segms, offsets, im_size, size):
-        def _expand_poly(poly, x, y):
-            expanded_poly = np.array(poly)
-            expanded_poly[0::2] += x
-            expanded_poly[1::2] += y
-            return expanded_poly.tolist()
-
-        def _expand_rle(rle, x, y, height, width, h, w):
-            if 'counts' in rle and type(rle['counts']) == list:
-                rle = mask_util.frPyObjects(rle, height, width)
-            mask = mask_util.decode(rle)
-            expanded_mask = np.full((h, w), 0).astype(mask.dtype)
-            expanded_mask[y:y + height, x:x + width] = mask
-            rle = mask_util.encode(
-                np.array(
-                    expanded_mask, order='F', dtype=np.uint8))
-            return rle
-
-        x, y = offsets
-        height, width = im_size
-        h, w = size
-        expanded_segms = []
-        for segm in segms:
-            if is_poly(segm):
-                # Polygon format
-                expanded_segms.append(
-                    [_expand_poly(poly, x, y) for poly in segm])
-            else:
-                # RLE format
-                import pycocotools.mask as mask_util
-                expanded_segms.append(
-                    _expand_rle(segm, x, y, height, width, h, w))
-        return expanded_segms
-
-    def apply_bbox(self, bbox, offsets):
-        return bbox + np.array(offsets * 2, dtype=np.float32)
-
-    def apply_keypoint(self, keypoints, offsets):
-        n = len(keypoints[0]) // 2
-        return keypoints + np.array(offsets * n, dtype=np.float32)
-
-    def apply_image(self, image, offsets, im_size, size):
-        x, y = offsets
-        im_h, im_w = im_size
-        h, w = size
-        canvas = np.ones((h, w, 3), dtype=np.float32)
-        canvas *= np.array(self.fill_value, dtype=np.float32)
-        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
-        return canvas
-
-    def apply(self, sample, context=None):
-        im = sample['image']
-        im_h, im_w = im.shape[:2]
-        if self.size:
-            h, w = self.size
-            assert (
-                im_h <= h and im_w <= w
-            ), '(h, w) of target size should be greater than (im_h, im_w)'
-        else:
-            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
-            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
-
-        if h == im_h and w == im_w:
-            sample['image'] = im.astype(np.float32)
-            return sample
-
-        if self.pad_mode == -1:
-            offset_x, offset_y = self.offsets
-        elif self.pad_mode == 0:
-            offset_y,
offset_x = 0, 0 - elif self.pad_mode == 1: - offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 - else: - offset_y, offset_x = h - im_h, w - im_w - - offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] - - sample['image'] = self.apply_image(im, offsets, im_size, size) - - if self.pad_mode == 0: - return sample - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets) - - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets, - im_size, size) - - if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: - sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'], - offsets) - - return sample - - -@register_op -class Poly2Mask(BaseOperator): - """ - gt poly to mask annotations. - Args: - del_poly (bool): Whether to delete poly after generating mask. Default: False. - """ - - def __init__(self, del_poly=False): - super(Poly2Mask, self).__init__() - import pycocotools.mask as maskUtils - self.maskutils = maskUtils - self.del_poly = del_poly - - def _poly2mask(self, mask_ann, img_h, img_w): - if isinstance(mask_ann, list): - # polygon -- a single object might consist of multiple parts - # we merge all parts into one mask rle code - rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) - rle = self.maskutils.merge(rles) - elif isinstance(mask_ann['counts'], list): - # uncompressed RLE - rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) - else: - # rle - rle = mask_ann - mask = self.maskutils.decode(rle) - return mask - - def apply(self, sample, context=None): - assert 'gt_poly' in sample - im_h, im_w = sample['im_shape'] - masks = [ - self._poly2mask(gt_poly, im_h, im_w) - for gt_poly in sample['gt_poly'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - if self.del_poly: - del (sample['gt_poly']) - - return sample - - -@register_op -class AugmentHSV(BaseOperator): - """ - Augment the SV channel of image data. - Args: - fraction (float): the fraction for augment. Default: 0.5. - is_bgr (bool): whether the image is BGR mode. Default: True. 
-        hgain (float): H channel gains
-        sgain (float): S channel gains
-        vgain (float): V channel gains
-    """
-
-    def __init__(self,
-                 fraction=0.50,
-                 is_bgr=True,
-                 hgain=None,
-                 sgain=None,
-                 vgain=None):
-        super(AugmentHSV, self).__init__()
-        self.fraction = fraction
-        self.is_bgr = is_bgr
-        self.hgain = hgain
-        self.sgain = sgain
-        self.vgain = vgain
-        self.use_hsvgain = False if hgain is None else True
-
-    def apply(self, sample, context=None):
-        img = sample['image']
-        if self.is_bgr:
-            img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
-        else:
-            img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
-
-        if self.use_hsvgain:
-            hsv_augs = np.random.uniform(
-                -1, 1, 3) * [self.hgain, self.sgain, self.vgain]
-            # random selection of h, s, v
-            hsv_augs *= np.random.randint(0, 2, 3)
-            img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
-            img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
-            img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
-
-        else:
-            S = img_hsv[:, :, 1].astype(np.float32)
-            V = img_hsv[:, :, 2].astype(np.float32)
-
-            a = (random.random() * 2 - 1) * self.fraction + 1
-            S *= a
-            if a > 1:
-                np.clip(S, a_min=0, a_max=255, out=S)
-
-            a = (random.random() * 2 - 1) * self.fraction + 1
-            V *= a
-            if a > 1:
-                np.clip(V, a_min=0, a_max=255, out=V)
-
-            img_hsv[:, :, 1] = S.astype(np.uint8)
-            img_hsv[:, :, 2] = V.astype(np.uint8)
-
-        if self.is_bgr:
-            cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)
-        else:
-            cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img)
-
-        sample['image'] = img.astype(np.float32)
-        return sample
-
-
-@register_op
-class Norm2PixelBbox(BaseOperator):
-    """
-    Transform the bounding box's coordinates from [0, 1] to pixels.
-    """
-
-    def __init__(self):
-        super(Norm2PixelBbox, self).__init__()
-
-    def apply(self, sample, context=None):
-        assert 'gt_bbox' in sample
-        bbox = sample['gt_bbox']
-        height, width = sample['image'].shape[:2]
-        bbox[:, 0::2] = bbox[:, 0::2] * width
-        bbox[:, 1::2] = bbox[:, 1::2] * height
-        sample['gt_bbox'] = bbox
-        return sample
-
-
-@register_op
-class BboxCXCYWH2XYXY(BaseOperator):
-    """
-    Convert bbox CXCYWH format to XYXY format.
-    [center_x, center_y, width, height] -> [x0, y0, x1, y1]
-    """
-
-    def __init__(self):
-        super(BboxCXCYWH2XYXY, self).__init__()
-
-    def apply(self, sample, context=None):
-        assert 'gt_bbox' in sample
-        bbox0 = sample['gt_bbox']
-        bbox = bbox0.copy()
-
-        bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2.
-        bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2.
-        sample['gt_bbox'] = bbox
-        return sample
-
-
-@register_op
-class RandomResizeCrop(BaseOperator):
-    """Random resize and crop image and bboxes.
-    Args:
-        resizes (list): resize image to one of resizes. if keep_ratio is True and mode is
-            'long', resize the image's long side to the maximum of target_size, if keep_ratio is
-            True and mode is 'short', resize the image's short side to the minimum of target_size.
-        cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]
-        mode (str): resize mode, `long` or `short`. Details see resizes.
-        prob (float): probability of this op.
-        keep_ratio (bool): whether to keep the aspect ratio, default True
-        interp (int): the interpolation method
-        thresholds (list): iou thresholds for deciding a valid bbox crop.
-        num_attempts (int): number of tries before giving up.
-        allow_no_crop (bool): allow return without actually cropping them.
-        cover_all_box (bool): ensure all bboxes are covered in the final crop.
-        is_mask_crop (bool): whether to crop the segmentation masks.
- """ - - def __init__(self, - resizes, - cropsizes, - prob=0.5, - mode='short', - keep_ratio=True, - interp=cv2.INTER_LINEAR, - num_attempts=3, - cover_all_box=False, - allow_no_crop=False, - thresholds=[0.3, 0.5, 0.7], - is_mask_crop=False, - ioumode="iou"): - super(RandomResizeCrop, self).__init__() - - self.resizes = resizes - self.cropsizes = cropsizes - self.prob = prob - self.mode = mode - self.ioumode = ioumode - - self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp) - self.croper = RandomCrop( - num_attempts=num_attempts, - cover_all_box=cover_all_box, - thresholds=thresholds, - allow_no_crop=allow_no_crop, - is_mask_crop=is_mask_crop) - - def _format_size(self, size): - if isinstance(size, Integral): - size = (size, size) - return size - - def apply(self, sample, context=None): - if random.random() < self.prob: - _resize = self._format_size(random.choice(self.resizes)) - _cropsize = self._format_size(random.choice(self.cropsizes)) - sample = self._resize( - self.resizer, - sample, - size=_resize, - mode=self.mode, - context=context) - sample = self._random_crop( - self.croper, sample, size=_cropsize, context=context) - return sample - - @staticmethod - def _random_crop(croper, sample, size, context=None): - if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: - return sample - - self = croper - h, w = sample['image'].shape[:2] - gt_bbox = sample['gt_bbox'] - cropsize = size - min_crop = min(cropsize) - max_crop = max(cropsize) - - thresholds = list(self.thresholds) - np.random.shuffle(thresholds) - - for thresh in thresholds: - found = False - for _ in range(self.num_attempts): - - crop_h = random.randint(min_crop, min(h, max_crop)) - crop_w = random.randint(min_crop, min(w, max_crop)) - - crop_y = random.randint(0, h - crop_h) - crop_x = random.randint(0, w - crop_w) - - crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] - if self.ioumode == "iof": - iou = self._gtcropiou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - elif self.ioumode == "iou": - iou = self._iou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - if iou.max() < thresh: - continue - - if self.cover_all_box and iou.min() < thresh: - continue - - cropped_box, valid_ids = self._crop_box_with_center_constraint( - gt_bbox, np.array( - crop_box, dtype=np.float32)) - if valid_ids.size > 0: - found = True - break - - if found: - if self.is_mask_crop and 'gt_poly' in sample and len(sample[ - 'gt_poly']) > 0: - crop_polys = self.crop_segms( - sample['gt_poly'], - valid_ids, - np.array( - crop_box, dtype=np.int64), - h, - w) - if [] in crop_polys: - delete_id = list() - valid_polys = list() - for id, crop_poly in enumerate(crop_polys): - if crop_poly == []: - delete_id.append(id) - else: - valid_polys.append(crop_poly) - valid_ids = np.delete(valid_ids, delete_id) - if len(valid_polys) == 0: - return sample - sample['gt_poly'] = valid_polys - else: - sample['gt_poly'] = crop_polys - - if 'gt_segm' in sample: - sample['gt_segm'] = self._crop_segm(sample['gt_segm'], - crop_box) - sample['gt_segm'] = np.take( - sample['gt_segm'], valid_ids, axis=0) - - sample['image'] = self._crop_image(sample['image'], crop_box) - sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) - sample['gt_class'] = np.take( - sample['gt_class'], valid_ids, axis=0) - if 'gt_score' in sample: - sample['gt_score'] = np.take( - sample['gt_score'], valid_ids, axis=0) - - if 'is_crowd' in sample: - sample['is_crowd'] = np.take( - sample['is_crowd'], valid_ids, axis=0) - - if 'gt_areas' in sample: - 
sample['gt_areas'] = np.take( - sample['gt_areas'], valid_ids, axis=0) - - if 'gt_joints' in sample: - gt_joints = self._crop_joints(sample['gt_joints'], crop_box) - sample['gt_joints'] = gt_joints[valid_ids] - return sample - - return sample - - @staticmethod - def _resize(resizer, sample, size, mode='short', context=None): - self = resizer - im = sample['image'] - target_size = size - - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - raise ImageError('{}: image is not 3-dimensional.'.format(self)) - - # apply image - im_shape = im.shape - if self.keep_ratio: - - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - - target_size_min = np.min(target_size) - target_size_max = np.max(target_size) - - if mode == 'long': - im_scale = min(target_size_min / im_size_min, - target_size_max / im_size_max) - else: - im_scale = max(target_size_min / im_size_min, - target_size_max / im_size_max) - - resize_h = int(im_scale * float(im_shape[0]) + 0.5) - resize_w = int(im_scale * float(im_shape[1]) + 0.5) - - im_scale_x = im_scale - im_scale_y = im_scale - else: - resize_h, resize_w = target_size - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im - sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], - [im_scale_x, im_scale_y]) - - # apply semantic - if 'semantic' in sample and sample['semantic']: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic - - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - masks = [ - cv2.resize( - gt_segm, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=cv2.INTER_NEAREST) - for gt_segm in sample['gt_segm'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - - if 'gt_joints' in sample: - sample['gt_joints'] = self.apply_joints(sample['gt_joints'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - return sample - - -@register_op -class RandomSelect(BaseOperator): - """ - Randomly choose a transformation between transforms1 and transforms2, - and the probability of choosing transforms1 is p. 
-
-    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
-
-    """
-
-    def __init__(self, transforms1, transforms2, p=0.5):
-        super(RandomSelect, self).__init__()
-        self.transforms1 = Compose(transforms1)
-        self.transforms2 = Compose(transforms2)
-        self.p = p
-
-    def apply(self, sample, context=None):
-        if random.random() < self.p:
-            return self.transforms1(sample)
-        return self.transforms2(sample)
-
-
-@register_op
-class RandomSelects(BaseOperator):
-    """
-    Randomly choose one transformation from transforms_list. If p is given,
-    its entries act as cumulative probability thresholds checked against a
-    single uniform draw; otherwise the choice is uniform.
-
-    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
-
-    """
-
-    def __init__(self, transforms_list, p=None):
-        super(RandomSelects, self).__init__()
-        if p is not None:
-            assert isinstance(p, (list, tuple))
-            assert len(transforms_list) == len(p)
-        else:
-            assert len(transforms_list) > 0
-        self.transforms = [Compose(t) for t in transforms_list]
-        self.p = p
-
-    def apply(self, sample, context=None):
-        if self.p is None:
-            return random.choice(self.transforms)(sample)
-        else:
-            prob = random.random()
-            for p, t in zip(self.p, self.transforms):
-                if prob <= p:
-                    return t(sample)
-
-
-@register_op
-class RandomShortSideResize(BaseOperator):
-    def __init__(self,
-                 short_side_sizes,
-                 max_size=None,
-                 interp=cv2.INTER_LINEAR,
-                 random_interp=False):
-        """
-        Resize the image randomly according to the short side. If max_size is not None,
-        the long side is capped at max_size. The whole process keeps the aspect ratio.
-        Args:
-            short_side_sizes (list|tuple): Image target short side size.
-            max_size (int): The size of the longest side of image after resize.
-            interp (int): The interpolation method.
-            random_interp (bool): Whether to randomly select the interpolation method.
- """ - super(RandomShortSideResize, self).__init__() - - assert isinstance(short_side_sizes, - Sequence), "short_side_sizes must be List or Tuple" - - self.short_side_sizes = short_side_sizes - self.max_size = max_size - self.interp = interp - self.random_interp = random_interp - self.interps = [ - cv2.INTER_NEAREST, - cv2.INTER_LINEAR, - cv2.INTER_AREA, - cv2.INTER_CUBIC, - cv2.INTER_LANCZOS4, - ] - - def get_size_with_aspect_ratio(self, image_shape, size, max_size=None): - h, w = image_shape - max_clip = False - if max_size is not None: - min_original_size = float(min((w, h))) - max_original_size = float(max((w, h))) - if max_original_size / min_original_size * size > max_size: - size = int(max_size * min_original_size / max_original_size) - max_clip = True - - if (w <= h and w == size) or (h <= w and h == size): - return (w, h) - - if w < h: - ow = size - oh = int(round(size * h / w)) if not max_clip else max_size - else: - oh = size - ow = int(round(size * w / h)) if not max_clip else max_size - - return (ow, oh) - - def resize(self, - sample, - target_size, - max_size=None, - interp=cv2.INTER_LINEAR): - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - raise ImageError('{}: image is not 3-dimensional.'.format(self)) - - target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size, - max_size) - im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[ - 0] / im.shape[1] - - sample['image'] = cv2.resize(im, target_size, interpolation=interp) - sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox( - sample['gt_bbox'], [im_scale_x, im_scale_y], target_size) - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2], - [im_scale_x, im_scale_y]) - # apply semantic - if 'semantic' in sample and sample['semantic']: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - target_size, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - masks = [ - cv2.resize( - gt_segm, target_size, interpolation=cv2.INTER_NEAREST) - for gt_segm in sample['gt_segm'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - - if 'gt_joints' in sample: - sample['gt_joints'] = self.apply_joints( - sample['gt_joints'], [im_scale_x, im_scale_y], target_size) - - # apply areas - if 'gt_areas' in sample: - sample['gt_areas'] = self.apply_area(sample['gt_areas'], - [im_scale_x, im_scale_y]) - - return sample - - def apply_bbox(self, bbox, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - bbox[:, 0::2] *= im_scale_x - bbox[:, 1::2] *= im_scale_y - bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) - bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) - return bbox.astype('float32') - - def apply_joints(self, joints, scale, size): - im_scale_x, im_scale_y = scale - 
resize_w, resize_h = size - joints[..., 0] *= im_scale_x - joints[..., 1] *= im_scale_y - # joints[joints[..., 0] >= resize_w, :] = 0 - # joints[joints[..., 1] >= resize_h, :] = 0 - # joints[joints[..., 0] < 0, :] = 0 - # joints[joints[..., 1] < 0, :] = 0 - joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) - joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) - return joints - - def apply_area(self, area, scale): - im_scale_x, im_scale_y = scale - return area * im_scale_x * im_scale_y - - def apply_segm(self, segms, im_size, scale): - def _resize_poly(poly, im_scale_x, im_scale_y): - resized_poly = np.array(poly).astype('float32') - resized_poly[0::2] *= im_scale_x - resized_poly[1::2] *= im_scale_y - return resized_poly.tolist() - - def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, im_h, im_w) - - mask = mask_util.decode(rle) - mask = cv2.resize( - mask, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - im_h, im_w = im_size - im_scale_x, im_scale_y = scale - resized_segms = [] - for segm in segms: - if is_poly(segm): - # Polygon format - resized_segms.append([ - _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm - ]) - else: - # RLE format - import pycocotools.mask as mask_util - resized_segms.append( - _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) - - return resized_segms - - def apply(self, sample, context=None): - target_size = random.choice(self.short_side_sizes) - interp = random.choice( - self.interps) if self.random_interp else self.interp - - return self.resize(sample, target_size, self.max_size, interp) - - -@register_op -class RandomShortSideRangeResize(RandomShortSideResize): - def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False): - """ - Resize the image randomly according to the short side. If max_size is not None, - the long side is scaled according to max_size. The whole process will be keep ratio. - Args: - short_side_sizes (list|tuple): Image target short side size. - interp (int): The interpolation method. - random_interp (bool): Whether random select interpolation method. - """ - super(RandomShortSideRangeResize, self).__init__(scales, None, interp, - random_interp) - - assert isinstance(scales, - Sequence), "short_side_sizes must be List or Tuple" - - self.scales = scales - - def random_sample(self, img_scales): - img_scale_long = [max(s) for s in img_scales] - img_scale_short = [min(s) for s in img_scales] - long_edge = np.random.randint( - min(img_scale_long), max(img_scale_long) + 1) - short_edge = np.random.randint( - min(img_scale_short), max(img_scale_short) + 1) - img_scale = (long_edge, short_edge) - return img_scale - - def apply(self, sample, context=None): - long_edge, short_edge = self.random_sample(self.short_side_sizes) - # print("target size:{}".format((long_edge, short_edge))) - interp = random.choice( - self.interps) if self.random_interp else self.interp - - return self.resize(sample, short_edge, long_edge, interp) - - -@register_op -class RandomSizeCrop(BaseOperator): - """ - Cut the image randomly according to `min_size` and `max_size` - Args: - min_size (int): Min size for edges of cropped image. - max_size (int): Max size for edges of cropped image. If it - is set to larger than length of the input image, - the output will keep the origin length. 
- keep_empty (bool): Whether to keep the cropped result with no object. - If it is set to False, the no-object result will not - be returned, replaced by the original input. - """ - - def __init__(self, min_size, max_size, keep_empty=True): - super(RandomSizeCrop, self).__init__() - self.min_size = min_size - self.max_size = max_size - self.keep_empty = keep_empty - - from paddle.vision.transforms.functional import crop as paddle_crop - self.paddle_crop = paddle_crop - - @staticmethod - def get_crop_params(img_shape, output_size): - """Get parameters for ``crop`` for a random crop. - Args: - img_shape (list|tuple): Image's height and width. - output_size (list|tuple): Expected output size of the crop. - Returns: - tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. - """ - h, w = img_shape - th, tw = output_size - - if h + 1 < th or w + 1 < tw: - raise ValueError( - "Required crop size {} is larger then input image size {}". - format((th, tw), (h, w))) - - if w == tw and h == th: - return 0, 0, h, w - - i = random.randint(0, h - th + 1) - j = random.randint(0, w - tw + 1) - return i, j, th, tw - - def crop(self, sample, region): - keep_index = None - # apply bbox and check whether the cropped result is valid - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - croped_bbox = self.apply_bbox(sample['gt_bbox'], region) - bbox = croped_bbox.reshape([-1, 2, 2]) - area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1) - keep_index = np.where(area > 0)[0] - - if not self.keep_empty and len(keep_index) == 0: - # When keep_empty is set to False, cropped with no-object will - # not be used and return the origin content. - return sample - - sample['gt_bbox'] = croped_bbox[keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 4], dtype=np.float32) - sample['gt_class'] = sample['gt_class'][keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 1], dtype=np.float32) - if 'gt_score' in sample: - sample['gt_score'] = sample['gt_score'][keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 1], dtype=np.float32) - if 'is_crowd' in sample: - sample['is_crowd'] = sample['is_crowd'][keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 1], dtype=np.float32) - if 'gt_areas' in sample: - sample['gt_areas'] = np.take( - sample['gt_areas'], keep_index, axis=0) - - image_shape = sample['image'].shape[:2] - sample['image'] = self.paddle_crop(sample['image'], *region) - sample['im_shape'] = np.array( - sample['image'].shape[:2], dtype=np.float32) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region, - image_shape) - sample['gt_poly'] = np.array(sample['gt_poly']) - if keep_index is not None and len(keep_index) > 0: - sample['gt_poly'] = sample['gt_poly'][keep_index] - sample['gt_poly'] = sample['gt_poly'].tolist() - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - i, j, h, w = region - sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w] - if keep_index is not None and len(keep_index) > 0: - sample['gt_segm'] = sample['gt_segm'][keep_index] - - if 'gt_joints' in sample: - gt_joints = self._crop_joints(sample['gt_joints'], region) - sample['gt_joints'] = gt_joints - if keep_index is not None: - sample['gt_joints'] = sample['gt_joints'][keep_index] - - return sample - - def apply_bbox(self, bbox, region): - i, j, h, w = region - region_size = np.asarray([w, h]) - crop_bbox = bbox - np.asarray([j, i, j, i]) - crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), 
region_size) - crop_bbox = crop_bbox.clip(min=0) - return crop_bbox.reshape([-1, 4]).astype('float32') - - def _crop_joints(self, joints, region): - y1, x1, h, w = region - x2 = x1 + w - y2 = y1 + h - # x1, y1, x2, y2 = crop - joints[..., 0] -= x1 - joints[..., 1] -= y1 - joints[joints[..., 0] > w, :] = 0 - joints[joints[..., 1] > h, :] = 0 - joints[joints[..., 0] < 0, :] = 0 - joints[joints[..., 1] < 0, :] = 0 - return joints - - def apply_segm(self, segms, region, image_shape): - def _crop_poly(segm, crop): - xmin, ymin, xmax, ymax = crop - crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] - crop_p = np.array(crop_coord).reshape(4, 2) - crop_p = Polygon(crop_p) - - crop_segm = list() - for poly in segm: - poly = np.array(poly).reshape(len(poly) // 2, 2) - polygon = Polygon(poly) - if not polygon.is_valid: - exterior = polygon.exterior - multi_lines = exterior.intersection(exterior) - polygons = shapely.ops.polygonize(multi_lines) - polygon = MultiPolygon(polygons) - multi_polygon = list() - if isinstance(polygon, MultiPolygon): - multi_polygon = copy.deepcopy(polygon) - else: - multi_polygon.append(copy.deepcopy(polygon)) - for per_polygon in multi_polygon: - inter = per_polygon.intersection(crop_p) - if not inter: - continue - if isinstance(inter, (MultiPolygon, GeometryCollection)): - for part in inter: - if not isinstance(part, Polygon): - continue - part = np.squeeze( - np.array(part.exterior.coords[:-1]).reshape(1, - -1)) - part[0::2] -= xmin - part[1::2] -= ymin - crop_segm.append(part.tolist()) - elif isinstance(inter, Polygon): - crop_poly = np.squeeze( - np.array(inter.exterior.coords[:-1]).reshape(1, -1)) - crop_poly[0::2] -= xmin - crop_poly[1::2] -= ymin - crop_segm.append(crop_poly.tolist()) - else: - continue - return crop_segm - - def _crop_rle(rle, crop, height, width): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, height, width) - mask = mask_util.decode(rle) - mask = mask[crop[1]:crop[3], crop[0]:crop[2]] - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - i, j, h, w = region - crop = [j, i, j + w, i + h] - height, width = image_shape - crop_segms = [] - for segm in segms: - if is_poly(segm): - import copy - import shapely.ops - from shapely.geometry import Polygon, MultiPolygon, GeometryCollection - # Polygon format - crop_segms.append(_crop_poly(segm, crop)) - else: - # RLE format - import pycocotools.mask as mask_util - crop_segms.append(_crop_rle(segm, crop, height, width)) - return crop_segms - - def apply(self, sample, context=None): - h = random.randint(self.min_size, - min(sample['image'].shape[0], self.max_size)) - w = random.randint(self.min_size, - min(sample['image'].shape[1], self.max_size)) - - region = self.get_crop_params(sample['image'].shape[:2], [h, w]) - return self.crop(sample, region) - - -@register_op -class WarpAffine(BaseOperator): - def __init__(self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - scale=0.4, - shift=0.1, - down_ratio=4): - """WarpAffine - Warp affine the image - The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py - """ - super(WarpAffine, self).__init__() - self.keep_res = keep_res - self.pad = pad - self.input_h = input_h - self.input_w = input_w - self.scale = scale - self.shift = shift - self.down_ratio = down_ratio - - def apply(self, sample, context=None): - img = sample['image'] - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - h, w = img.shape[:2] - - if self.keep_res: 
- # True in detection eval/infer - input_h = (h | self.pad) + 1 - input_w = (w | self.pad) + 1 - s = np.array([input_w, input_h], dtype=np.float32) - c = np.array([w // 2, h // 2], dtype=np.float32) - else: - # False in centertrack eval_mot/eval_mot - s = max(h, w) * 1.0 - input_h, input_w = self.input_h, self.input_w - c = np.array([w / 2., h / 2.], dtype=np.float32) - - trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) - img = cv2.resize(img, (w, h)) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) - sample['image'] = inp - - if not self.keep_res: - out_h = input_h // self.down_ratio - out_w = input_w // self.down_ratio - trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) - - sample.update({ - 'center': c, - 'scale': s, - 'out_height': out_h, - 'out_width': out_w, - 'inp_height': input_h, - 'inp_width': input_w, - 'trans_input': trans_input, - 'trans_output': trans_output, - }) - return sample - - -@register_op -class FlipWarpAffine(BaseOperator): - def __init__(self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - not_rand_crop=False, - scale=0.4, - shift=0.1, - flip=0.5, - is_scale=True, - use_random=True, - add_pre_img=False): - """FlipWarpAffine - 1. Random Crop - 2. Flip the image horizontal - 3. Warp affine the image - 4. (Optinal) Add previous image - """ - super(FlipWarpAffine, self).__init__() - self.keep_res = keep_res - self.pad = pad - self.input_h = input_h - self.input_w = input_w - self.not_rand_crop = not_rand_crop - self.scale = scale - self.shift = shift - self.flip = flip - self.is_scale = is_scale - self.use_random = use_random - self.add_pre_img = add_pre_img - - def __call__(self, samples, context=None): - if self.add_pre_img: - assert isinstance(samples, Sequence) and len(samples) == 2 - sample, pre_sample = samples[0], samples[1] - else: - sample = samples - - img = sample['image'] - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: - return sample - - h, w = img.shape[:2] - flipped = 0 - - if self.keep_res: - input_h = (h | self.pad) + 1 - input_w = (w | self.pad) + 1 - s = np.array([input_w, input_h], dtype=np.float32) - c = np.array([w // 2, h // 2], dtype=np.float32) - else: - # centernet training default - s = max(h, w) * 1.0 - input_h, input_w = self.input_h, self.input_w - c = np.array([w / 2., h / 2.], dtype=np.float32) - - if self.use_random: - gt_bbox = sample['gt_bbox'] - if not self.not_rand_crop: - # centernet default - s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) - w_border = get_border(128, w) - h_border = get_border(128, h) - c[0] = np.random.randint(low=w_border, high=w - w_border) - c[1] = np.random.randint(low=h_border, high=h - h_border) - else: - sf = self.scale - cf = self.shift - c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) - c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) - s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - - if np.random.random() < self.flip: - img = img[:, ::-1, :] - c[0] = w - c[0] - 1 - oldx1 = gt_bbox[:, 0].copy() - oldx2 = gt_bbox[:, 2].copy() - gt_bbox[:, 0] = w - oldx2 - 1 - gt_bbox[:, 2] = w - oldx1 - 1 - flipped = 1 - sample['gt_bbox'] = gt_bbox - - trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) - if self.is_scale: - inp = (inp.astype(np.float32) / 255.) 
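
WarpAffine above leans on ppdet's get_affine_transform(center, scale, rot, output_size). As a rough sketch of the rotation-free case (my own reconstruction with plain OpenCV, not the patch's helper), the transform maps the scale x scale square around `center` onto the output canvas:

import cv2
import numpy as np

def center_scale_affine(center, scale, out_w, out_h):
    # Three point correspondences pin down the affine map from the square
    # [cx-s/2, cx+s/2] x [cy-s/2, cy+s/2] to [0, out_w] x [0, out_h].
    cx, cy = center
    src = np.float32([[cx, cy],
                      [cx, cy - scale / 2.],
                      [cx - scale / 2., cy - scale / 2.]])
    dst = np.float32([[out_w / 2., out_h / 2.],
                      [out_w / 2., 0.],
                      [0., 0.]])
    return cv2.getAffineTransform(src, dst)

# For a hypothetical 720x1280 frame: s = max(h, w) = 1280, c = (640, 360),
# M = center_scale_affine((640, 360), 1280, 512, 512)
# inp = cv2.warpAffine(img, M, (512, 512), flags=cv2.INTER_LINEAR)
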
- - sample['image'] = inp - sample['center'] = c - sample['scale'] = s - - if self.add_pre_img: - sample['trans_input'] = trans_input - - # previous image, use same aug trans_input as current image - pre_img = pre_sample['image'] - pre_img = cv2.cvtColor(pre_img, cv2.COLOR_RGB2BGR) - if flipped: - pre_img = pre_img[:, ::-1, :].copy() - pre_inp = cv2.warpAffine( - pre_img, - trans_input, (input_w, input_h), - flags=cv2.INTER_LINEAR) - if self.is_scale: - pre_inp = (pre_inp.astype(np.float32) / 255.) - sample['pre_image'] = pre_inp - - # if empty gt_bbox - if 'gt_bbox' in pre_sample and len(pre_sample['gt_bbox']) == 0: - return sample - pre_gt_bbox = pre_sample['gt_bbox'] - if flipped: - pre_oldx1 = pre_gt_bbox[:, 0].copy() - pre_oldx2 = pre_gt_bbox[:, 2].copy() - pre_gt_bbox[:, 0] = w - pre_oldx1 - 1 - pre_gt_bbox[:, 2] = w - pre_oldx2 - 1 - sample['pre_gt_bbox'] = pre_gt_bbox - - sample['pre_gt_class'] = pre_sample['gt_class'] - sample['pre_gt_track_id'] = pre_sample['gt_track_id'] - del pre_sample - - return sample - - -@register_op -class CenterRandColor(BaseOperator): - """Random color for CenterNet series models. - Args: - saturation (float): saturation settings. - contrast (float): contrast settings. - brightness (float): brightness settings. - """ - - def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4): - super(CenterRandColor, self).__init__() - self.saturation = saturation - self.contrast = contrast - self.brightness = brightness - - def apply_saturation(self, img, img_gray): - alpha = 1. + np.random.uniform( - low=-self.saturation, high=self.saturation) - self._blend(alpha, img, img_gray[:, :, None]) - return img - - def apply_contrast(self, img, img_gray): - alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast) - img_mean = img_gray.mean() - self._blend(alpha, img, img_mean) - return img - - def apply_brightness(self, img, img_gray): - alpha = 1 + np.random.uniform( - low=-self.brightness, high=self.brightness) - img *= alpha - return img - - def _blend(self, alpha, img, img_mean): - img *= alpha - img_mean *= (1 - alpha) - img += img_mean - - def apply(self, sample, context=None): - functions = [ - self.apply_brightness, - self.apply_contrast, - self.apply_saturation, - ] - - img = sample['image'] - img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - distortions = np.random.permutation(functions) - for func in distortions: - img = func(img, img_gray) - sample['image'] = img - - if 'pre_image' in sample: - pre_img = sample['pre_image'] - pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY) - pre_distortions = np.random.permutation(functions) - for func in pre_distortions: - pre_img = func(pre_img, pre_img_gray) - sample['pre_image'] = pre_img - - return sample - - -@register_op -class Mosaic(BaseOperator): - """ Mosaic operator for image and gt_bboxes - The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py - - 1. get mosaic coords - 2. clip bbox and get mosaic_labels - 3. random_affine augment - 4. 
Mixup augment as copypaste (optinal), not used in tiny/nano - - Args: - prob (float): probability of using Mosaic, 1.0 as default - input_dim (list[int]): input shape - degrees (list[2]): the rotate range to apply, transform range is [min, max] - translate (list[2]): the translate range to apply, transform range is [min, max] - scale (list[2]): the scale range to apply, transform range is [min, max] - shear (list[2]): the shear range to apply, transform range is [min, max] - enable_mixup (bool): whether to enable Mixup or not - mixup_prob (float): probability of using Mixup, 1.0 as default - mixup_scale (list[int]): scale range of Mixup - remove_outside_box (bool): whether remove outside boxes, False as - default in COCO dataset, True in MOT dataset - """ - - def __init__(self, - prob=1.0, - input_dim=[640, 640], - degrees=[-10, 10], - translate=[-0.1, 0.1], - scale=[0.1, 2], - shear=[-2, 2], - enable_mixup=True, - mixup_prob=1.0, - mixup_scale=[0.5, 1.5], - remove_outside_box=False): - super(Mosaic, self).__init__() - self.prob = prob - if isinstance(input_dim, Integral): - input_dim = [input_dim, input_dim] - self.input_dim = input_dim - self.degrees = degrees - self.translate = translate - self.scale = scale - self.shear = shear - self.enable_mixup = enable_mixup - self.mixup_prob = mixup_prob - self.mixup_scale = mixup_scale - self.remove_outside_box = remove_outside_box - - def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w): - # (x1, y1, x2, y2) means coords in large image, - # small_coords means coords in small image in mosaic aug. - if mosaic_idx == 0: - # top left - x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc - small_coords = w - (x2 - x1), h - (y2 - y1), w, h - elif mosaic_idx == 1: - # top right - x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc - small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h - elif mosaic_idx == 2: - # bottom left - x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) - small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h) - elif mosaic_idx == 3: - # bottom right - x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, - yc + h) - small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h) - - return (x1, y1, x2, y2), small_coords - - def random_affine_augment(self, - img, - labels=[], - input_dim=[640, 640], - degrees=[-10, 10], - scales=[0.1, 2], - shears=[-2, 2], - translates=[-0.1, 0.1]): - # random rotation and scale - degree = random.uniform(degrees[0], degrees[1]) - scale = random.uniform(scales[0], scales[1]) - assert scale > 0, "Argument scale should be positive." 
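
The quadrant bookkeeping in get_mosaic_coords above is easiest to see with concrete numbers. A small sketch (values hypothetical, mirroring the branches for mosaic_idx 0..3) of where each tile lands on the 2*input_h x 2*input_w canvas:

input_h = input_w = 640
xc, yc = 700, 600            # random mosaic center
w = h = 640                  # resized sub-image size
quads = {
    0: (max(xc - w, 0), max(yc - h, 0), xc, yc),                      # top left
    1: (xc, max(yc - h, 0), min(xc + w, input_w * 2), yc),            # top right
    2: (max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)),            # bottom left
    3: (xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h)),  # bottom right
}
# quads[0] == (60, 0, 700, 600): the first image contributes only its
# bottom-right 640x600 crop, matching the small_coords computed above.
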
- R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) - M = np.ones([2, 3]) - - # random shear - shear = random.uniform(shears[0], shears[1]) - shear_x = math.tan(shear * math.pi / 180) - shear_y = math.tan(shear * math.pi / 180) - M[0] = R[0] + shear_y * R[1] - M[1] = R[1] + shear_x * R[0] - - # random translation - translate = random.uniform(translates[0], translates[1]) - translation_x = translate * input_dim[0] - translation_y = translate * input_dim[1] - M[0, 2] = translation_x - M[1, 2] = translation_y - - # warpAffine - img = cv2.warpAffine( - img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) - - num_gts = len(labels) - if num_gts > 0: - # warp corner points - corner_points = np.ones((4 * num_gts, 3)) - corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( - 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 - # apply affine transform - corner_points = corner_points @M.T - corner_points = corner_points.reshape(num_gts, 8) - - # create new boxes - corner_xs = corner_points[:, 0::2] - corner_ys = corner_points[:, 1::2] - new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), - corner_xs.max(1), corner_ys.max(1))) - new_bboxes = new_bboxes.reshape(4, num_gts).T - - # clip boxes - new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) - new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) - labels[:, :4] = new_bboxes - - return img, labels - - def __call__(self, sample, context=None): - if not isinstance(sample, Sequence): - return sample - - assert len( - sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." - if np.random.uniform(0., 1.) > self.prob: - return sample[0] - - mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] - input_h, input_w = self.input_dim - yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) - xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) - mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) - - # 1. get mosaic coords - for mosaic_idx, sp in enumerate(sample[:4]): - img = sp['image'] - gt_bbox = sp['gt_bbox'] - h0, w0 = img.shape[:2] - scale = min(1. * input_h / h0, 1. * input_w / w0) - img = cv2.resize( - img, (int(w0 * scale), int(h0 * scale)), - interpolation=cv2.INTER_LINEAR) - (h, w, c) = img.shape[:3] - - # suffix l means large image, while s means small image in mosaic aug. - (l_x1, l_y1, l_x2, l_y2), ( - s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords( - mosaic_idx, xc, yc, w, h, input_h, input_w) - - mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] - padw, padh = l_x1 - s_x1, l_y1 - s_y1 - - # Normalized xywh to pixel xyxy format - _gt_bbox = gt_bbox.copy() - if len(gt_bbox) > 0: - _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw - _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh - _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw - _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh - - mosaic_gt_bbox.append(_gt_bbox) - mosaic_gt_class.append(sp['gt_class']) - if 'is_crowd' in sp: - mosaic_is_crowd.append(sp['is_crowd']) - if 'difficult' in sp: - mosaic_difficult.append(sp['difficult']) - - # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd]) - if len(mosaic_gt_bbox): - mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0) - mosaic_gt_class = np.concatenate(mosaic_gt_class, 0) - if mosaic_is_crowd: - mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0) - mosaic_labels = np.concatenate([ - mosaic_gt_bbox, - mosaic_gt_class.astype(mosaic_gt_bbox.dtype), - mosaic_is_crowd.astype(mosaic_gt_bbox.dtype) - ], 1) - elif mosaic_difficult: - mosaic_difficult = np.concatenate(mosaic_difficult, 0) - mosaic_labels = np.concatenate([ - mosaic_gt_bbox, - mosaic_gt_class.astype(mosaic_gt_bbox.dtype), - mosaic_difficult.astype(mosaic_gt_bbox.dtype) - ], 1) - else: - mosaic_labels = np.concatenate([ - mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype) - ], 1) - if self.remove_outside_box: - # for MOT dataset - flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w - flag2 = mosaic_gt_bbox[:, 2] > 0 - flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h - flag4 = mosaic_gt_bbox[:, 3] > 0 - flag_all = flag1 * flag2 * flag3 * flag4 - mosaic_labels = mosaic_labels[flag_all] - else: - mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0, - 2 * input_w) - mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0, - 2 * input_h) - mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0, - 2 * input_w) - mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0, - 2 * input_h) - else: - mosaic_labels = np.zeros((1, 6)) - - # 3. random_affine augment - mosaic_img, mosaic_labels = self.random_affine_augment( - mosaic_img, - mosaic_labels, - input_dim=self.input_dim, - degrees=self.degrees, - translates=self.translate, - scales=self.scale, - shears=self.shear) - - # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177 - # optinal, not used(enable_mixup=False) in tiny/nano - if (self.enable_mixup and not len(mosaic_labels) == 0 and - random.random() < self.mixup_prob): - sample_mixup = sample[4] - mixup_img = sample_mixup['image'] - if 'is_crowd' in sample_mixup: - cp_labels = np.concatenate([ - sample_mixup['gt_bbox'], - sample_mixup['gt_class'].astype(mosaic_labels.dtype), - sample_mixup['is_crowd'].astype(mosaic_labels.dtype) - ], 1) - elif 'difficult' in sample_mixup: - cp_labels = np.concatenate([ - sample_mixup['gt_bbox'], - sample_mixup['gt_class'].astype(mosaic_labels.dtype), - sample_mixup['difficult'].astype(mosaic_labels.dtype) - ], 1) - else: - cp_labels = np.concatenate([ - sample_mixup['gt_bbox'], - sample_mixup['gt_class'].astype(mosaic_labels.dtype) - ], 1) - mosaic_img, mosaic_labels = self.mixup_augment( - mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img) - - sample0 = sample[0] - sample0['image'] = mosaic_img.astype(np.uint8) # can not be float32 - sample0['h'] = float(mosaic_img.shape[0]) - sample0['w'] = float(mosaic_img.shape[1]) - sample0['im_shape'][0] = sample0['h'] - sample0['im_shape'][1] = sample0['w'] - sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32) - sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32) - if 'is_crowd' in sample[0]: - sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32) - if 'difficult' in sample[0]: - sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32) - return sample0 - - def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels, - img): - jit_factor = random.uniform(*self.mixup_scale) - FLIP = random.uniform(0, 1) > 0.5 - if len(img.shape) == 3: - cp_img = np.ones( - (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 - else: - cp_img = np.ones(input_dim, dtype=np.uint8) * 114 - - 
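
Downstream of the mosaic assembly above, labels travel as one float array with layout [x1, y1, x2, y2, class(, is_crowd|difficult)]. A tiny sketch with hypothetical values of how the columns are joined and later split back out:

import numpy as np

gt_bbox = np.array([[10., 20., 110., 220.]], dtype=np.float32)
gt_class = np.array([[3]], dtype=np.int32)
is_crowd = np.array([[0]], dtype=np.int32)

labels = np.concatenate(
    [gt_bbox, gt_class.astype(gt_bbox.dtype), is_crowd.astype(gt_bbox.dtype)],
    axis=1)
# labels -> [[ 10.  20. 110. 220.   3.   0.]]
# After augmentation the columns are split back out as gt_bbox = labels[:, :4],
# gt_class = labels[:, 4:5], is_crowd = labels[:, 5:6], as sample0 shows above.
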
cp_scale_ratio = min(input_dim[0] / img.shape[0], - input_dim[1] / img.shape[1]) - resized_img = cv2.resize( - img, (int(img.shape[1] * cp_scale_ratio), - int(img.shape[0] * cp_scale_ratio)), - interpolation=cv2.INTER_LINEAR) - - cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[ - 1] * cp_scale_ratio)] = resized_img - - cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor), - int(cp_img.shape[0] * jit_factor))) - cp_scale_ratio *= jit_factor - - if FLIP: - cp_img = cp_img[:, ::-1, :] - - origin_h, origin_w = cp_img.shape[:2] - target_h, target_w = origin_img.shape[:2] - padded_img = np.zeros( - (max(origin_h, target_h), max(origin_w, target_w), 3), - dtype=np.uint8) - padded_img[:origin_h, :origin_w] = cp_img - - x_offset, y_offset = 0, 0 - if padded_img.shape[0] > target_h: - y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) - if padded_img.shape[1] > target_w: - x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) - padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset: - x_offset + target_w] - - # adjust boxes - cp_bboxes_origin_np = cp_labels[:, :4].copy() - cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] * - cp_scale_ratio, 0, origin_w) - cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] * - cp_scale_ratio, 0, origin_h) - - if FLIP: - cp_bboxes_origin_np[:, 0::2] = ( - origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) - cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() - if self.remove_outside_box: - # for MOT dataset - cp_bboxes_transformed_np[:, 0::2] -= x_offset - cp_bboxes_transformed_np[:, 1::2] -= y_offset - else: - cp_bboxes_transformed_np[:, 0::2] = np.clip( - cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) - cp_bboxes_transformed_np[:, 1::2] = np.clip( - cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) - - cls_labels = cp_labels[:, 4:5].copy() - box_labels = cp_bboxes_transformed_np - if cp_labels.shape[-1] == 6: - crd_labels = cp_labels[:, 5:6].copy() - labels = np.hstack((box_labels, cls_labels, crd_labels)) - else: - labels = np.hstack((box_labels, cls_labels)) - if self.remove_outside_box: - labels = labels[labels[:, 0] < target_w] - labels = labels[labels[:, 2] > 0] - labels = labels[labels[:, 1] < target_h] - labels = labels[labels[:, 3] > 0] - - origin_labels = np.vstack((origin_labels, labels)) - origin_img = origin_img.astype(np.float32) - origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype( - np.float32) - - return origin_img.astype(np.uint8), origin_labels - - -@register_op -class PadResize(BaseOperator): - """ PadResize for image and gt_bbbox - - Args: - target_size (list[int]): input shape - fill_value (float): pixel value of padded image - """ - - def __init__(self, target_size, fill_value=114): - super(PadResize, self).__init__() - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - self.fill_value = fill_value - - def _resize(self, img, bboxes, labels): - ratio = min(self.target_size[0] / img.shape[0], - self.target_size[1] / img.shape[1]) - w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio) - resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) - - if len(bboxes) > 0: - bboxes *= ratio - mask = np.minimum(bboxes[:, 2] - bboxes[:, 0], - bboxes[:, 3] - bboxes[:, 1]) > 1 - bboxes = bboxes[mask] - labels = labels[mask] - return resized_img, bboxes, labels - - def _pad(self, img): - h, w, _ = img.shape - if h == self.target_size[0] and w == 
self.target_size[1]: - return img - padded_img = np.full( - (self.target_size[0], self.target_size[1], 3), - self.fill_value, - dtype=np.uint8) - padded_img[:h, :w] = img - return padded_img - - def apply(self, sample, context=None): - image = sample['image'] - bboxes = sample['gt_bbox'] - labels = sample['gt_class'] - image, bboxes, labels = self._resize(image, bboxes, labels) - sample['image'] = self._pad(image).astype(np.float32) - sample['gt_bbox'] = bboxes - sample['gt_class'] = labels - return sample - - -@register_op -class RandomShift(BaseOperator): - """ - Randomly shift image - - Args: - prob (float): probability to do random shift. - max_shift (int): max shift pixels - filter_thr (int): filter gt bboxes if one side is smaller than this - """ - - def __init__(self, prob=0.5, max_shift=32, filter_thr=1): - super(RandomShift, self).__init__() - self.prob = prob - self.max_shift = max_shift - self.filter_thr = filter_thr - - def calc_shift_coor(self, im_h, im_w, shift_h, shift_w): - return [ - max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w), - min(im_h, im_h + shift_h) - ] - - def apply(self, sample, context=None): - if random.random() > self.prob: - return sample - - im = sample['image'] - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - im_h, im_w = im.shape[:2] - shift_h = random.randint(-self.max_shift, self.max_shift) - shift_w = random.randint(-self.max_shift, self.max_shift) - - gt_bbox[:, 0::2] += shift_w - gt_bbox[:, 1::2] += shift_h - gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w) - gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h) - gt_bbox_h = gt_bbox[:, 2] - gt_bbox[:, 0] - gt_bbox_w = gt_bbox[:, 3] - gt_bbox[:, 1] - keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr) - if not keep.any(): - return sample - - gt_bbox = gt_bbox[keep] - gt_class = gt_class[keep] - - # shift image - coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w) - # shift frame to the opposite direction - coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w) - canvas = np.zeros_like(im) - canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \ - = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]] - - sample['image'] = canvas - sample['gt_bbox'] = gt_bbox - sample['gt_class'] = gt_class - return sample - - -@register_op -class StrongAugImage(BaseOperator): - def __init__(self, transforms): - super(StrongAugImage, self).__init__() - self.transforms = Compose(transforms) - - def apply(self, sample, context=None): - im = sample - im['image'] = sample['image'].astype('uint8') - results = self.transforms(im) - sample['image'] = results['image'].astype('uint8') - return sample - - -@register_op -class RandomColorJitter(BaseOperator): - def __init__(self, - prob=0.8, - brightness=0.4, - contrast=0.4, - saturation=0.4, - hue=0.1): - super(RandomColorJitter, self).__init__() - self.prob = prob - self.brightness = brightness - self.contrast = contrast - self.saturation = saturation - self.hue = hue - - def apply(self, sample, context=None): - if np.random.uniform(0, 1) < self.prob: - from paddle.vision.transforms import ColorJitter - transform = ColorJitter(self.brightness, self.contrast, - self.saturation, self.hue) - sample['image'] = transform(sample['image'].astype(np.uint8)) - sample['image'] = sample['image'].astype(np.float32) - return sample - - -@register_op -class RandomGrayscale(BaseOperator): - def __init__(self, prob=0.2): - super(RandomGrayscale, self).__init__() - self.prob = prob - - def apply(self, sample, 
context=None): - if np.random.uniform(0, 1) < self.prob: - from paddle.vision.transforms import Grayscale - transform = Grayscale(num_output_channels=3) - sample['image'] = transform(sample['image']) - return sample - - -@register_op -class RandomGaussianBlur(BaseOperator): - def __init__(self, prob=0.5, sigma=[0.1, 2.0]): - super(RandomGaussianBlur, self).__init__() - self.prob = prob - self.sigma = sigma - - def apply(self, sample, context=None): - if np.random.uniform(0, 1) < self.prob: - sigma = np.random.uniform(self.sigma[0], self.sigma[1]) - im = cv2.GaussianBlur(sample['image'], (23, 23), sigma) - sample['image'] = im - return sample - - -@register_op -class RandomErasing(BaseOperator): - def __init__(self, - prob=0.5, - scale=(0.02, 0.33), - ratio=(0.3, 3.3), - value=0, - inplace=False): - super(RandomErasing, self).__init__() - assert isinstance(scale, - (tuple, list)), "scale should be a tuple or list" - assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] - ), "scale should be of kind (min, max) and in range [0, 1]" - assert isinstance(ratio, - (tuple, list)), "ratio should be a tuple or list" - assert (ratio[0] >= 0 and - ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" - assert isinstance( - value, (Number, str, tuple, - list)), "value should be a number, tuple, list or str" - if isinstance(value, str) and value != "random": - raise ValueError("value must be 'random' when type is str") - self.prob = prob - self.scale = scale - self.ratio = ratio - self.value = value - self.inplace = inplace - - def _erase(self, img, i, j, h, w, v, inplace=False): - if not inplace: - img = img.copy() - img[i:i + h, j:j + w, ...] = v - return img - - def _get_param(self, img, scale, ratio, value): - shape = np.asarray(img).astype(np.uint8).shape - h, w, c = shape[-3], shape[-2], shape[-1] - img_area = h * w - log_ratio = np.log(ratio) - for _ in range(1): - erase_area = np.random.uniform(*scale) * img_area - aspect_ratio = np.exp(np.random.uniform(*log_ratio)) - erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) - erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) - if erase_h >= h or erase_w >= w: - continue - - if value is None: - v = np.random.normal(size=[erase_h, erase_w, c]) * 255 - else: - v = np.array(value)[None, None, :] - top = np.random.randint(0, h - erase_h + 1) - left = np.random.randint(0, w - erase_w + 1) - return top, left, erase_h, erase_w, v - return 0, 0, h, w, img - - def apply(self, sample, context=None): - if random.random() < self.prob: - if isinstance(self.value, Number): - value = [self.value] - elif isinstance(self.value, str): - value = None - else: - value = self.value - if value is not None and not (len(value) == 1 or len(value) == 3): - raise ValueError( - "Value should be a single number or a sequence with length equals to image's channel." 
- ) - im = sample['image'] - top, left, erase_h, erase_w, v = self._get_param(im, self.scale, - self.ratio, value) - im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace) - sample['image'] = im - return sample - - -@register_op -class RandomErasingCrop(BaseOperator): - def __init__(self): - super(RandomErasingCrop, self).__init__() - self.transform1 = RandomErasing( - prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random") - self.transform2 = RandomErasing( - prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random") - self.transform3 = RandomErasing( - prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random") - - def apply(self, sample, context=None): - sample = self.transform1(sample) - sample = self.transform2(sample) - sample = self.transform3(sample) - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/transform/rotated_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/rotated_operators.py deleted file mode 100644 index 5e9cebb..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/rotated_operators.py +++ /dev/null @@ -1,480 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -from numbers import Number, Integral - -import cv2 -import numpy as np -import math -import copy - -from .operators import register_op, BaseOperator -from ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np -from ppdet.utils.logger import setup_logger -from ppdet.utils.compact import imagedraw_textsize_c -logger = setup_logger(__name__) - - -@register_op -class RRotate(BaseOperator): - """ Rotate Image, Polygon, Box - - Args: - scale (float): rotate scale - angle (float): rotate angle - fill_value (int, tuple): fill color - auto_bound (bool): whether auto bound or not - """ - - def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True): - super(RRotate, self).__init__() - self.scale = scale - self.angle = angle - self.fill_value = fill_value - self.auto_bound = auto_bound - - def get_rotated_matrix(self, angle, scale, h, w): - center = ((w - 1) * 0.5, (h - 1) * 0.5) - matrix = cv2.getRotationMatrix2D(center, -angle, scale) - # calculate the new size - cos = np.abs(matrix[0, 0]) - sin = np.abs(matrix[0, 1]) - new_w = h * sin + w * cos - new_h = h * cos + w * sin - # calculate offset - n_w = int(np.round(new_w)) - n_h = int(np.round(new_h)) - if self.auto_bound: - ratio = min(w / n_w, h / n_h) - matrix = cv2.getRotationMatrix2D(center, -angle, ratio) - else: - matrix[0, 2] += (new_w - w) * 0.5 - matrix[1, 2] += (new_h - h) * 0.5 - w = n_w - h = n_h - return matrix, h, w - - def get_rect_from_pts(self, pts, h, w): - """ get minimum rectangle of points - """ - assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' - min_x, min_y = 
np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2], - axis=1) - max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2], - axis=1) - min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h) - max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h) - boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1) - return boxes - - def apply_image(self, image, matrix, h, w): - return cv2.warpAffine( - image, matrix, (w, h), borderValue=self.fill_value) - - def apply_pts(self, pts, matrix, h, w): - assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' - # n is number of samples and m is two times the number of points due to (x, y) - _, m = pts.shape - # transpose points - pts_ = pts.reshape(-1, 2).T - # pad 1 to convert the points to homogeneous coordinates - padding = np.ones((1, pts_.shape[1]), pts.dtype) - rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0)) - return rotated_pts[:2, :].T.reshape(-1, m) - - def apply(self, sample, context=None): - image = sample['image'] - h, w = image.shape[:2] - matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w) - sample['image'] = self.apply_image(image, matrix, h, w) - polys = sample['gt_poly'] - # TODO: segment or keypoint to be processed - if len(polys) > 0: - pts = self.apply_pts(polys, matrix, h, w) - sample['gt_poly'] = pts - sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w) - - return sample - - -@register_op -class RandomRRotate(BaseOperator): - """ Random Rotate Image - Args: - scale (float, tuple, list): rotate scale - scale_mode (str): mode of scale, [range, value, None] - angle (float, tuple, list): rotate angle - angle_mode (str): mode of angle, [range, value, None] - fill_value (float, tuple, list): fill value - rotate_prob (float): probability of rotation - auto_bound (bool): whether auto bound or not - """ - - def __init__(self, - scale=1.0, - scale_mode=None, - angle=0., - angle_mode=None, - fill_value=0., - rotate_prob=1.0, - auto_bound=True): - super(RandomRRotate, self).__init__() - self.scale = scale - self.scale_mode = scale_mode - self.angle = angle - self.angle_mode = angle_mode - self.fill_value = fill_value - self.rotate_prob = rotate_prob - self.auto_bound = auto_bound - - def get_angle(self, angle, angle_mode): - assert not angle_mode or angle_mode in [ - 'range', 'value' - ], 'angle mode should be in [range, value, None]' - if not angle_mode: - return angle - elif angle_mode == 'range': - low, high = angle - return np.random.rand() * (high - low) + low - elif angle_mode == 'value': - return np.random.choice(angle) - - def get_scale(self, scale, scale_mode): - assert not scale_mode or scale_mode in [ - 'range', 'value' - ], 'scale mode should be in [range, value, None]' - if not scale_mode: - return scale - elif scale_mode == 'range': - low, high = scale - return np.random.rand() * (high - low) + low - elif scale_mode == 'value': - return np.random.choice(scale) - - def apply(self, sample, context=None): - if np.random.rand() > self.rotate_prob: - return sample - - angle = self.get_angle(self.angle, self.angle_mode) - scale = self.get_scale(self.scale, self.scale_mode) - rotator = RRotate(scale, angle, self.fill_value, self.auto_bound) - return rotator(sample) - - -@register_op -class Poly2RBox(BaseOperator): - """ Polygon to Rotated Box, using new OpenCV definition since 4.5.1 - - Args: - filter_threshold (int, float): threshold to filter annotations - filter_mode (str): filter mode, ['area', 'edge'] - rbox_type (str): rbox type, ['le135', 'oc'] - - """ - - def 
__init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'): - super(Poly2RBox, self).__init__() - self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode) - self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np - - def filter(self, size, threshold, mode): - if mode == 'area': - if size[0] * size[1] < threshold: - return True - elif mode == 'edge': - if min(size) < threshold: - return True - return False - - def get_rbox(self, polys): - valid_ids, rboxes, bboxes = [], [], [] - for i, poly in enumerate(polys): - cx, cy, w, h, angle = self.rbox_fn(poly) - if self.filter_fn((w, h)): - continue - rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32)) - valid_ids.append(i) - xmin, ymin = min(poly[0::2]), min(poly[1::2]) - xmax, ymax = max(poly[0::2]), max(poly[1::2]) - bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32)) - - if len(valid_ids) == 0: - rboxes = np.zeros((0, 5), dtype=np.float32) - bboxes = np.zeros((0, 4), dtype=np.float32) - else: - rboxes = np.stack(rboxes) - bboxes = np.stack(bboxes) - - return rboxes, bboxes, valid_ids - - def apply(self, sample, context=None): - rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly']) - sample['gt_rbox'] = rboxes - sample['gt_bbox'] = bboxes - for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']: - if k in sample: - sample[k] = sample[k][valid_ids] - - return sample - - -@register_op -class Poly2Array(BaseOperator): - """ convert gt_poly to np.array for rotated bboxes - """ - - def __init__(self): - super(Poly2Array, self).__init__() - - def apply(self, sample, context=None): - if 'gt_poly' in sample: - sample['gt_poly'] = np.array( - sample['gt_poly'], dtype=np.float32).reshape((-1, 8)) - - return sample - - -@register_op -class RResize(BaseOperator): - def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): - """ - Resize image to target size. if keep_ratio is True, - resize the image's long side to the maximum of target_size - if keep_ratio is False, resize the image to target size(h, w) - Args: - target_size (int|list): image target size - keep_ratio (bool): whether keep_ratio or not, default true - interp (int): the interpolation method - """ - super(RResize, self).__init__() - self.keep_ratio = keep_ratio - self.interp = interp - if not isinstance(target_size, (Integral, Sequence)): - raise TypeError( - "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". - format(type(target_size))) - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - - def apply_image(self, image, scale): - im_scale_x, im_scale_y = scale - - return cv2.resize( - image, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - - def apply_pts(self, pts, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - pts[:, 0::2] *= im_scale_x - pts[:, 1::2] *= im_scale_y - pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w) - pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h) - return pts - - def apply(self, sample, context=None): - """ Resize the image numpy. 
- """ - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - raise ImageError('{}: image is not 3-dimensional.'.format(self)) - - # apply image - im_shape = im.shape - if self.keep_ratio: - - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - - target_size_min = np.min(self.target_size) - target_size_max = np.max(self.target_size) - - im_scale = min(target_size_min / im_size_min, - target_size_max / im_size_max) - - resize_h = im_scale * float(im_shape[0]) - resize_w = im_scale * float(im_shape[1]) - - im_scale_x = im_scale - im_scale_y = im_scale - else: - resize_h, resize_w = self.target_size - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im.astype(np.float32) - sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_pts(sample['gt_poly'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - return sample - - -@register_op -class RandomRFlip(BaseOperator): - def __init__(self, prob=0.5): - """ - Args: - prob (float): the probability of flipping image - """ - super(RandomRFlip, self).__init__() - self.prob = prob - if not (isinstance(self.prob, float)): - raise TypeError("{}: input type is invalid.".format(self)) - - def apply_image(self, image): - return image[:, ::-1, :] - - def apply_pts(self, pts, width): - oldx = pts[:, 0::2].copy() - pts[:, 0::2] = width - oldx - 1 - return pts - - def apply(self, sample, context=None): - """Filp the image and bounding box. - Operators: - 1. Flip the image numpy. - 2. Transform the bboxes' x coordinates. - (Must judge whether the coordinates are normalized!) - 3. Transform the segmentations' x coordinates. - (Must judge whether the coordinates are normalized!) - Output: - sample: the image, bounding box and segmentation part - in sample are flipped. - """ - if np.random.uniform(0, 1) < self.prob: - im = sample['image'] - height, width = im.shape[:2] - im = self.apply_image(im) - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width) - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width) - - sample['flipped'] = True - sample['image'] = im - return sample - - -@register_op -class VisibleRBox(BaseOperator): - """ - In debug mode, visualize images according to `gt_box`. - (Currently only supported when not cropping and flipping image.) 
- """ - - def __init__(self, output_dir='debug'): - super(VisibleRBox, self).__init__() - self.output_dir = output_dir - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - - def apply(self, sample, context=None): - image = Image.fromarray(sample['image'].astype(np.uint8)) - out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) - width = sample['w'] - height = sample['h'] - # gt_poly = sample['gt_rbox'] - gt_poly = sample['gt_poly'] - gt_class = sample['gt_class'] - draw = ImageDraw.Draw(image) - for i in range(gt_poly.shape[0]): - x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i] - draw.line( - [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], - width=2, - fill='green') - # draw label - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - text = str(gt_class[i][0]) - tw, th = imagedraw_textsize_c(draw, text) - draw.rectangle( - [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') - draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) - - if 'gt_keypoint' in sample.keys(): - gt_keypoint = sample['gt_keypoint'] - if self.is_normalized: - for i in range(gt_keypoint.shape[1]): - if i % 2: - gt_keypoint[:, i] = gt_keypoint[:, i] * height - else: - gt_keypoint[:, i] = gt_keypoint[:, i] * width - for i in range(gt_keypoint.shape[0]): - keypoint = gt_keypoint[i] - for j in range(int(keypoint.shape[0] / 2)): - x1 = round(keypoint[2 * j]).astype(np.int32) - y1 = round(keypoint[2 * j + 1]).astype(np.int32) - draw.ellipse( - (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') - save_path = os.path.join(self.output_dir, out_file_name) - image.save(save_path, quality=95) - return sample - - -@register_op -class Rbox2Poly(BaseOperator): - """ - Convert rbbox format to poly format. - """ - - def __init__(self): - super(Rbox2Poly, self).__init__() - - def apply(self, sample, context=None): - assert 'gt_rbox' in sample - assert sample['gt_rbox'].shape[1] == 5 - rboxes = sample['gt_rbox'] - polys = rbox2poly_np(rboxes) - sample['gt_poly'] = polys - xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1) - xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1) - sample['gt_bbox'] = np.stack([xmin, ymin, xmin, ymin], axis=1) - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/utils.py b/pdfdet/models/Paddle/ppdet/data/utils.py deleted file mode 100644 index 02573e6..0000000 --- a/pdfdet/models/Paddle/ppdet/data/utils.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import paddle
-import numbers
-import numpy as np
-
-try:
-    from collections.abc import Sequence, Mapping
-except:
-    from collections import Sequence, Mapping
-
-
-def default_collate_fn(batch):
-    """
-    Default batch collating function for :code:`paddle.io.DataLoader`.
-    It gets input data as a list of samples; each element in the list is
-    the data of one sample, and sample data should be composed of list,
-    dictionary, string, number or numpy array. This function parses input
-    data recursively and stacks number, numpy array and paddle.Tensor
-    fields into batch data. e.g. for the following input data:
-    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
-     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
-     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
-     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
-
-
-    This default collate function zips each number and numpy array
-    field together and stacks each field as a batch field as follows:
-    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
-    Args:
-        batch (list of sample data): batch should be a list of sample data.
-
-    Returns:
-        Batched data: each number, numpy array and paddle.Tensor in the
-                      input data is batched.
-    """
-    sample = batch[0]
-    if isinstance(sample, np.ndarray):
-        batch = np.stack(batch, axis=0)
-        return batch
-    elif isinstance(sample, numbers.Number):
-        batch = np.array(batch)
-        return batch
-    elif isinstance(sample, (str, bytes)):
-        return batch
-    elif isinstance(sample, Mapping):
-        return {
-            key: default_collate_fn([d[key] for d in batch])
-            for key in sample
-        }
-    elif isinstance(sample, Sequence):
-        sample_fields_num = len(sample)
-        if not all(len(sample) == sample_fields_num for sample in iter(batch)):
-            raise RuntimeError(
-                "fields number not the same among samples in a batch")
-        return [default_collate_fn(fields) for fields in zip(*batch)]
-
-    raise TypeError("batch data can only contain: tensor, numpy.ndarray, "
-                    "dict, list, number, but got {}".format(type(sample)))
diff --git a/pdfdet/models/Paddle/ppdet/engine/__init__.py b/pdfdet/models/Paddle/ppdet/engine/__init__.py
deleted file mode 100644
index 91166e8..0000000
--- a/pdfdet/models/Paddle/ppdet/engine/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import trainer
-from .trainer import *
-
-from . import trainer_cot
-from .trainer_cot import *
-
-from . import callbacks
-from .callbacks import *
-
-from . import env
-from .env import *
-
-__all__ = trainer.__all__ \
-          + callbacks.__all__ \
-          + env.__all__
-
-from . import tracker
-from .tracker import *
-__all__ = __all__ + tracker.__all__
-
-from .
import trainer_ssod -from .trainer_ssod import * -__all__ = __all__ + trainer_ssod.__all__ diff --git a/pdfdet/models/Paddle/ppdet/engine/callbacks.py b/pdfdet/models/Paddle/ppdet/engine/callbacks.py deleted file mode 100644 index 87dcd61..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/callbacks.py +++ /dev/null @@ -1,693 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import datetime -import six -import copy -import json - -import paddle -import paddle.distributed as dist - -from ppdet.utils.checkpoint import save_model, save_semi_model -from ppdet.metrics import get_infer_results - -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -__all__ = [ - 'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer', - 'VisualDLWriter', 'SniperProposalsGenerator' -] - - -class Callback(object): - def __init__(self, model): - self.model = model - - def on_step_begin(self, status): - pass - - def on_step_end(self, status): - pass - - def on_epoch_begin(self, status): - pass - - def on_epoch_end(self, status): - pass - - def on_train_begin(self, status): - pass - - def on_train_end(self, status): - pass - - -class ComposeCallback(object): - def __init__(self, callbacks): - callbacks = [c for c in list(callbacks) if c is not None] - for c in callbacks: - assert isinstance( - c, Callback), "callback should be subclass of Callback" - self._callbacks = callbacks - - def on_step_begin(self, status): - for c in self._callbacks: - c.on_step_begin(status) - - def on_step_end(self, status): - for c in self._callbacks: - c.on_step_end(status) - - def on_epoch_begin(self, status): - for c in self._callbacks: - c.on_epoch_begin(status) - - def on_epoch_end(self, status): - for c in self._callbacks: - c.on_epoch_end(status) - - def on_train_begin(self, status): - for c in self._callbacks: - c.on_train_begin(status) - - def on_train_end(self, status): - for c in self._callbacks: - c.on_train_end(status) - - -class LogPrinter(Callback): - def __init__(self, model): - super(LogPrinter, self).__init__(model) - - def on_step_end(self, status): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - mode = status['mode'] - if mode == 'train': - epoch_id = status['epoch_id'] - step_id = status['step_id'] - steps_per_epoch = status['steps_per_epoch'] - training_staus = status['training_staus'] - batch_time = status['batch_time'] - data_time = status['data_time'] - - epoches = self.model.cfg.epoch - batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( - ))]['batch_size'] - - logs = training_staus.log() - space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' - if step_id % self.model.cfg.log_iter == 0: - eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id - eta_sec = eta_steps * batch_time.global_avg - eta_str = str(datetime.timedelta(seconds=int(eta_sec))) - 
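
The Callback/ComposeCallback contract defined above means a custom hook only overrides the events it cares about, and ComposeCallback fans each event out to every registered callback. A minimal sketch (EpochPrinter and `model` are hypothetical, not part of the patch):

class EpochPrinter(Callback):
    def on_epoch_end(self, status):
        # Only this event is customized; all other hooks inherit the no-ops.
        if status.get('mode') == 'train':
            print('finished epoch', status.get('epoch_id'))

# compose = ComposeCallback([LogPrinter(model), EpochPrinter(model)])
# compose.on_epoch_end({'mode': 'train', 'epoch_id': 0})
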
ips = float(batch_size) / batch_time.avg - fmt = ' '.join([ - 'Epoch: [{}]', - '[{' + space_fmt + '}/{}]', - 'learning_rate: {lr:.6f}', - '{meters}', - 'eta: {eta}', - 'batch_cost: {btime}', - 'data_cost: {dtime}', - 'ips: {ips:.4f} images/s', - ]) - fmt = fmt.format( - epoch_id, - step_id, - steps_per_epoch, - lr=status['learning_rate'], - meters=logs, - eta=eta_str, - btime=str(batch_time), - dtime=str(data_time), - ips=ips) - logger.info(fmt) - if mode == 'eval': - step_id = status['step_id'] - if step_id % 100 == 0: - logger.info("Eval iter: {}".format(step_id)) - - def on_epoch_end(self, status): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - mode = status['mode'] - if mode == 'eval': - sample_num = status['sample_num'] - cost_time = status['cost_time'] - logger.info('Total sample number: {}, average FPS: {}'.format( - sample_num, sample_num / cost_time)) - - -class Checkpointer(Callback): - def __init__(self, model): - super(Checkpointer, self).__init__(model) - self.best_ap = -1000. - self.save_dir = self.model.cfg.save_dir - if hasattr(self.model.model, 'student_model'): - self.weight = self.model.model.student_model - else: - self.weight = self.model.model - - def on_epoch_end(self, status): - # Checkpointer only performed during training - mode = status['mode'] - epoch_id = status['epoch_id'] - weight = None - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - end_epoch = self.model.cfg.epoch - if ( - epoch_id + 1 - ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: - save_name = str( - epoch_id) if epoch_id != end_epoch - 1 else "model_final" - weight = self.weight.state_dict() - elif mode == 'eval': - if 'save_best_model' in status and status['save_best_model']: - for metric in self.model._metrics: - map_res = metric.get_results() - eval_func = "ap" - if 'pose3d' in map_res: - key = 'pose3d' - eval_func = "mpjpe" - elif 'bbox' in map_res: - key = 'bbox' - elif 'keypoint' in map_res: - key = 'keypoint' - else: - key = 'mask' - if key not in map_res: - logger.warning("Evaluation results empty, this may be due to " \ - "training iterations being too few or not " \ - "loading the correct weights.") - return - if map_res[key][0] >= self.best_ap: - self.best_ap = map_res[key][0] - save_name = 'best_model' - weight = self.weight.state_dict() - logger.info("Best test {} {} is {:0.3f}.".format( - key, eval_func, abs(self.best_ap))) - if weight: - if self.model.use_ema: - exchange_save_model = status.get('exchange_save_model', - False) - if not exchange_save_model: - # save model and ema_model - save_model( - status['weight'], - self.model.optimizer, - self.save_dir, - save_name, - epoch_id + 1, - ema_model=weight) - else: - # save model(student model) and ema_model(teacher model) - # in DenseTeacher SSOD, the teacher model will be higher, - # so exchange when saving pdparams - student_model = status['weight'] # model - teacher_model = weight # ema_model - save_model( - teacher_model, - self.model.optimizer, - self.save_dir, - save_name, - epoch_id + 1, - ema_model=student_model) - del teacher_model - del student_model - else: - save_model(weight, self.model.optimizer, self.save_dir, - save_name, epoch_id + 1) - - -class WiferFaceEval(Callback): - def __init__(self, model): - super(WiferFaceEval, self).__init__(model) - - def on_epoch_begin(self, status): - assert self.model.mode == 'eval', \ - "WiferFaceEval can only be set during evaluation" - for metric in self.model._metrics: - metric.update(self.model.model) 
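
Checkpointer above picks which metric drives best-model selection by a fixed key priority: pose3d, then bbox, then keypoint, falling back to mask. A short sketch of that selection with a hypothetical eval result (the if/elif chain is collapsed into a loop here for brevity):

map_res = {'bbox': [0.512]}          # shape of metric.get_results()
for key in ('pose3d', 'bbox', 'keypoint', 'mask'):
    if key in map_res:
        break                        # here key == 'bbox'
best_ap = -1000.
if map_res[key][0] >= best_ap:
    best_ap = map_res[key][0]        # 0.512 -> checkpoint saved as 'best_model'
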
- sys.exit() - - -class VisualDLWriter(Callback): - """ - Use VisualDL to log data or image - """ - - def __init__(self, model): - super(VisualDLWriter, self).__init__(model) - - assert six.PY3, "VisualDL requires Python >= 3.5" - try: - from visualdl import LogWriter - except Exception as e: - logger.error('visualdl not found, plaese install visualdl. ' - 'for example: `pip install visualdl`.') - raise e - self.vdl_writer = LogWriter( - model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')) - self.vdl_loss_step = 0 - self.vdl_mAP_step = 0 - self.vdl_image_step = 0 - self.vdl_image_frame = 0 - - def on_step_end(self, status): - mode = status['mode'] - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - training_staus = status['training_staus'] - for loss_name, loss_value in training_staus.get().items(): - self.vdl_writer.add_scalar(loss_name, loss_value, - self.vdl_loss_step) - self.vdl_loss_step += 1 - elif mode == 'test': - ori_image = status['original_image'] - result_image = status['result_image'] - self.vdl_writer.add_image( - "original/frame_{}".format(self.vdl_image_frame), ori_image, - self.vdl_image_step) - self.vdl_writer.add_image( - "result/frame_{}".format(self.vdl_image_frame), - result_image, self.vdl_image_step) - self.vdl_image_step += 1 - # each frame can display ten pictures at most. - if self.vdl_image_step % 10 == 0: - self.vdl_image_step = 0 - self.vdl_image_frame += 1 - - def on_epoch_end(self, status): - mode = status['mode'] - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'eval': - for metric in self.model._metrics: - for key, map_value in metric.get_results().items(): - self.vdl_writer.add_scalar("{}-mAP".format(key), - map_value[0], - self.vdl_mAP_step) - self.vdl_mAP_step += 1 - - -class WandbCallback(Callback): - def __init__(self, model): - super(WandbCallback, self).__init__(model) - - try: - import wandb - self.wandb = wandb - except Exception as e: - logger.error('wandb not found, please install wandb. ' - 'Use: `pip install wandb`.') - raise e - - self.wandb_params = model.cfg.get('wandb', None) - self.save_dir = self.model.cfg.save_dir - if self.wandb_params is None: - self.wandb_params = {} - for k, v in model.cfg.items(): - if k.startswith("wandb_"): - self.wandb_params.update({k.lstrip("wandb_"): v}) - - self._run = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - _ = self.run - self.run.config.update(self.model.cfg) - self.run.define_metric("epoch") - self.run.define_metric("eval/*", step_metric="epoch") - - self.best_ap = -1000. - self.fps = [] - - @property - def run(self): - if self._run is None: - if self.wandb.run is not None: - logger.info( - "There is an ongoing wandb run which will be used" - "for logging. 
Please use `wandb.finish()` to end that" - "if the behaviour is not intended") - self._run = self.wandb.run - else: - self._run = self.wandb.init(**self.wandb_params) - return self._run - - def save_model(self, - optimizer, - save_dir, - save_name, - last_epoch, - ema_model=None, - ap=None, - fps=None, - tags=None): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - model_path = os.path.join(save_dir, save_name) - metadata = {} - metadata["last_epoch"] = last_epoch - if ap: - metadata["ap"] = ap - - if fps: - metadata["fps"] = fps - - if ema_model is None: - ema_artifact = self.wandb.Artifact( - name="ema_model-{}".format(self.run.id), - type="model", - metadata=metadata) - model_artifact = self.wandb.Artifact( - name="model-{}".format(self.run.id), - type="model", - metadata=metadata) - - ema_artifact.add_file(model_path + ".pdema", name="model_ema") - model_artifact.add_file(model_path + ".pdparams", name="model") - - self.run.log_artifact(ema_artifact, aliases=tags) - self.run.log_artfact(model_artifact, aliases=tags) - else: - model_artifact = self.wandb.Artifact( - name="model-{}".format(self.run.id), - type="model", - metadata=metadata) - model_artifact.add_file(model_path + ".pdparams", name="model") - self.run.log_artifact(model_artifact, aliases=tags) - - def on_step_end(self, status): - - mode = status['mode'] - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - training_status = status['training_staus'].get() - for k, v in training_status.items(): - training_status[k] = float(v) - - # calculate ips, data_cost, batch_cost - batch_time = status['batch_time'] - data_time = status['data_time'] - batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( - ))]['batch_size'] - - ips = float(batch_size) / float(batch_time.avg) - data_cost = float(data_time.avg) - batch_cost = float(batch_time.avg) - - metrics = {"train/" + k: v for k, v in training_status.items()} - - metrics["train/ips"] = ips - metrics["train/data_cost"] = data_cost - metrics["train/batch_cost"] = batch_cost - - self.fps.append(ips) - self.run.log(metrics) - - def on_epoch_end(self, status): - mode = status['mode'] - epoch_id = status['epoch_id'] - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - fps = sum(self.fps) / len(self.fps) - self.fps = [] - - end_epoch = self.model.cfg.epoch - if ( - epoch_id + 1 - ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: - save_name = str( - epoch_id) if epoch_id != end_epoch - 1 else "model_final" - tags = ["latest", "epoch_{}".format(epoch_id)] - self.save_model( - self.model.optimizer, - self.save_dir, - save_name, - epoch_id + 1, - self.model.use_ema, - fps=fps, - tags=tags) - if mode == 'eval': - sample_num = status['sample_num'] - cost_time = status['cost_time'] - - fps = sample_num / cost_time - - merged_dict = {} - for metric in self.model._metrics: - for key, map_value in metric.get_results().items(): - merged_dict["eval/{}-mAP".format(key)] = map_value[0] - merged_dict["epoch"] = status["epoch_id"] - merged_dict["eval/fps"] = sample_num / cost_time - - self.run.log(merged_dict) - - if 'save_best_model' in status and status['save_best_model']: - for metric in self.model._metrics: - map_res = metric.get_results() - if 'pose3d' in map_res: - key = 'pose3d' - elif 'bbox' in map_res: - key = 'bbox' - elif 'keypoint' in map_res: - key = 'keypoint' - else: - key = 'mask' - if key not in map_res: - logger.warning("Evaluation results empty, this may be due to " \ - "training 
iterations being too few or not " \ - "loading the correct weights.") - return - if map_res[key][0] >= self.best_ap: - self.best_ap = map_res[key][0] - save_name = 'best_model' - tags = ["best", "epoch_{}".format(epoch_id)] - - self.save_model( - self.model.optimizer, - self.save_dir, - save_name, - last_epoch=epoch_id + 1, - ema_model=self.model.use_ema, - ap=abs(self.best_ap), - fps=fps, - tags=tags) - - def on_train_end(self, status): - self.run.finish() - - -class SniperProposalsGenerator(Callback): - def __init__(self, model): - super(SniperProposalsGenerator, self).__init__(model) - ori_dataset = self.model.dataset - self.dataset = self._create_new_dataset(ori_dataset) - self.loader = self.model.loader - self.cfg = self.model.cfg - self.infer_model = self.model.model - - def _create_new_dataset(self, ori_dataset): - dataset = copy.deepcopy(ori_dataset) - # init anno_cropper - dataset.init_anno_cropper() - # generate infer roidbs - ori_roidbs = dataset.get_ori_roidbs() - roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs) - # set new roidbs - dataset.set_roidbs(roidbs) - - return dataset - - def _eval_with_loader(self, loader): - results = [] - with paddle.no_grad(): - self.infer_model.eval() - for step_id, data in enumerate(loader): - outs = self.infer_model(data) - for key in ['im_shape', 'scale_factor', 'im_id']: - outs[key] = data[key] - for key, value in outs.items(): - if hasattr(value, 'numpy'): - outs[key] = value.numpy() - - results.append(outs) - - return results - - def on_train_end(self, status): - self.loader.dataset = self.dataset - results = self._eval_with_loader(self.loader) - results = self.dataset.anno_cropper.aggregate_chips_detections(results) - # sniper - proposals = [] - clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} - for outs in results: - batch_res = get_infer_results(outs, clsid2catid) - start = 0 - for i, im_id in enumerate(outs['im_id']): - bbox_num = outs['bbox_num'] - end = start + bbox_num[i] - bbox_res = batch_res['bbox'][start:end] \ - if 'bbox' in batch_res else None - if bbox_res: - proposals += bbox_res - logger.info("save proposals in {}".format(self.cfg.proposals_path)) - with open(self.cfg.proposals_path, 'w') as f: - json.dump(proposals, f) - - -class SemiLogPrinter(LogPrinter): - def __init__(self, model): - super(SemiLogPrinter, self).__init__(model) - - def on_step_end(self, status): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - mode = status['mode'] - if mode == 'train': - epoch_id = status['epoch_id'] - step_id = status['step_id'] - iter_id = status['iter_id'] - steps_per_epoch = status['steps_per_epoch'] - training_staus = status['training_staus'] - batch_time = status['batch_time'] - data_time = status['data_time'] - - epoches = self.model.cfg.epoch - batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( - ))]['batch_size'] - iters = epoches * steps_per_epoch - logs = training_staus.log() - iter_space_fmt = ':' + str(len(str(iters))) + 'd' - space_fmt = ':' + str(len(str(iters))) + 'd' - if step_id % self.model.cfg.log_iter == 0: - eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id - eta_sec = eta_steps * batch_time.global_avg - eta_str = str(datetime.timedelta(seconds=int(eta_sec))) - ips = float(batch_size) / batch_time.avg - fmt = ' '.join([ - '{' + iter_space_fmt + '}/{} iters', - 'Epoch: [{}]', - '[{' + space_fmt + '}/{}]', - 'learning_rate: {lr:.6f}', - '{meters}', - 'eta: {eta}', - 'batch_cost: {btime}', - 'data_cost: {dtime}', - 'ips: {ips:.4f} images/s', - ]) - fmt 
= fmt.format( - iter_id, - iters, - epoch_id, - step_id, - steps_per_epoch, - lr=status['learning_rate'], - meters=logs, - eta=eta_str, - btime=str(batch_time), - dtime=str(data_time), - ips=ips) - logger.info(fmt) - if mode == 'eval': - step_id = status['step_id'] - if step_id % 100 == 0: - logger.info("Eval iter: {}".format(step_id)) - - -class SemiCheckpointer(Checkpointer): - def __init__(self, model): - super(SemiCheckpointer, self).__init__(model) - cfg = self.model.cfg - self.best_ap = 0. - self.save_dir = os.path.join(self.model.cfg.save_dir, - self.model.cfg.filename) - if hasattr(self.model.model, 'student') and hasattr(self.model.model, - 'teacher'): - self.weight = (self.model.model.teacher, self.model.model.student) - elif hasattr(self.model.model, 'student') or hasattr(self.model.model, - 'teacher'): - raise AttributeError( - "model has no attribute 'student' or 'teacher'") - else: - raise AttributeError( - "model has no attribute 'student' and 'teacher'") - - def every_n_iters(self, iter_id, n): - return (iter_id + 1) % n == 0 if n > 0 else False - - def on_step_end(self, status): - # Checkpointer only performed during training - mode = status['mode'] - eval_interval = status['eval_interval'] - save_interval = status['save_interval'] - iter_id = status['iter_id'] - epoch_id = status['epoch_id'] - t_weight = None - s_weight = None - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if self.every_n_iters(iter_id, save_interval) and mode == 'train': - save_name = "last_epoch" - # save_name = str(iter_id + 1) - t_weight = self.weight[0].state_dict() - s_weight = self.weight[1].state_dict() - save_semi_model(t_weight, s_weight, self.model.optimizer, - self.save_dir, save_name, epoch_id + 1, - iter_id + 1) - - def on_epoch_end(self, status): - # Checkpointer only performed during training - mode = status['mode'] - eval_interval = status['eval_interval'] - save_interval = status['save_interval'] - iter_id = status['iter_id'] - epoch_id = status['epoch_id'] - t_weight = None - s_weight = None - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if self.every_n_iters(iter_id, eval_interval) and mode == 'eval': - if 'save_best_model' in status and status['save_best_model']: - for metric in self.model._metrics: - map_res = metric.get_results() - if 'bbox' in map_res: - key = 'bbox' - elif 'keypoint' in map_res: - key = 'keypoint' - else: - key = 'mask' - if key not in map_res: - logger.warning("Evaluation results empty, this may be due to " \ - "training iterations being too few or not " \ - "loading the correct weights.") - return - if map_res[key][0] > self.best_ap: - self.best_ap = map_res[key][0] - save_name = 'best_model' - t_weight = self.weight[0].state_dict() - s_weight = self.weight[1].state_dict() - logger.info("Best teacher test {} ap is {:0.3f}.". - format(key, self.best_ap)) - if t_weight and s_weight: - save_semi_model(t_weight, s_weight, - self.model.optimizer, self.save_dir, - save_name, epoch_id + 1, iter_id + 1) diff --git a/pdfdet/models/Paddle/ppdet/engine/env.py b/pdfdet/models/Paddle/ppdet/engine/env.py deleted file mode 100644 index 0a89657..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/env.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import random -import numpy as np - -import paddle -from paddle.distributed import fleet - -__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env'] - - -def init_fleet_env(find_unused_parameters=False): - strategy = fleet.DistributedStrategy() - strategy.find_unused_parameters = find_unused_parameters - fleet.init(is_collective=True, strategy=strategy) - - -def init_parallel_env(): - env = os.environ - dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env - if dist: - trainer_id = int(env['PADDLE_TRAINER_ID']) - local_seed = (99 + trainer_id) - random.seed(local_seed) - np.random.seed(local_seed) - - paddle.distributed.init_parallel_env() - - -def set_random_seed(seed): - paddle.seed(seed) - random.seed(seed) - np.random.seed(seed) diff --git a/pdfdet/models/Paddle/ppdet/engine/export_utils.py b/pdfdet/models/Paddle/ppdet/engine/export_utils.py deleted file mode 100644 index daaa39a..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/export_utils.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
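The env.py helpers deleted above center on reproducibility: every process seeds Python, NumPy and Paddle, and each distributed trainer offsets the seed by its rank (99 + trainer_id in init_parallel_env). A condensed sketch of the same idea, assuming Paddle is importable:

import random
import numpy as np
import paddle

def seed_everything(seed, rank=0):
    # One distinct but reproducible stream per worker, as in the removed file.
    local_seed = seed + rank
    random.seed(local_seed)
    np.random.seed(local_seed)
    paddle.seed(local_seed)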
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import yaml -from collections import OrderedDict - -import paddle -from ppdet.data.source.category import get_categories - -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -# Global dictionary -TRT_MIN_SUBGRAPH = { - 'YOLO': 3, - 'PPYOLOE': 3, - 'SSD': 60, - 'RCNN': 40, - 'RetinaNet': 40, - 'S2ANet': 80, - 'EfficientDet': 40, - 'Face': 3, - 'TTFNet': 60, - 'FCOS': 16, - 'SOLOv2': 60, - 'HigherHRNet': 3, - 'HRNet': 3, - 'DeepSORT': 3, - 'ByteTrack': 10, - 'CenterTrack': 5, - 'JDE': 10, - 'FairMOT': 5, - 'GFL': 16, - 'PicoDet': 3, - 'CenterNet': 5, - 'TOOD': 5, - 'YOLOX': 8, - 'YOLOF': 40, - 'METRO_Body': 3, - 'DETR': 3, - 'CLRNet': 3 -} - -KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet'] -MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] -LANE_ARCH = ['CLRNet'] - -TO_STATIC_SPEC = { - 'yolov3_darknet53_270e_coco': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'is_crowd': paddle.static.InputSpec( - name='is_crowd', shape=[-1, 50], dtype='float32'), - 'gt_bbox': paddle.static.InputSpec( - name='gt_bbox', shape=[-1, 50, 4], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'target0': paddle.static.InputSpec( - name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'), - 'target1': paddle.static.InputSpec( - name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'), - 'target2': paddle.static.InputSpec( - name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'), - }], - 'tinypose_128x96': [{ - 'center': paddle.static.InputSpec( - name='center', shape=[-1, 2], dtype='float32'), - 'scale': paddle.static.InputSpec( - name='scale', shape=[-1, 2], dtype='float32'), - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, 128, 96], dtype='float32'), - 'score': paddle.static.InputSpec( - name='score', shape=[-1], dtype='float32'), - 'rotate': paddle.static.InputSpec( - name='rotate', shape=[-1], dtype='float32'), - 'target': paddle.static.InputSpec( - name='target', shape=[-1, 17, 32, 24], dtype='float32'), - 'target_weight': paddle.static.InputSpec( - name='target_weight', shape=[-1, 17, 1], dtype='float32'), - }], - 'fcos_r50_fpn_1x_coco': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'reg_target0': paddle.static.InputSpec( - name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'), - 'labels0': paddle.static.InputSpec( - name='labels0', shape=[-1, 160, 160, 1], dtype='int32'), - 'centerness0': paddle.static.InputSpec( - name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'), - 'reg_target1': paddle.static.InputSpec( - 
name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'), - 'labels1': paddle.static.InputSpec( - name='labels1', shape=[-1, 80, 80, 1], dtype='int32'), - 'centerness1': paddle.static.InputSpec( - name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'), - 'reg_target2': paddle.static.InputSpec( - name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'), - 'labels2': paddle.static.InputSpec( - name='labels2', shape=[-1, 40, 40, 1], dtype='int32'), - 'centerness2': paddle.static.InputSpec( - name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'), - 'reg_target3': paddle.static.InputSpec( - name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'), - 'labels3': paddle.static.InputSpec( - name='labels3', shape=[-1, 20, 20, 1], dtype='int32'), - 'centerness3': paddle.static.InputSpec( - name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'), - 'reg_target4': paddle.static.InputSpec( - name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'), - 'labels4': paddle.static.InputSpec( - name='labels4', shape=[-1, 10, 10, 1], dtype='int32'), - 'centerness4': paddle.static.InputSpec( - name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'), - }], - 'picodet_s_320_coco_lcnet': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'is_crowd': paddle.static.InputSpec( - name='is_crowd', shape=[-1, -1, 1], dtype='float32'), - 'gt_class': paddle.static.InputSpec( - name='gt_class', shape=[-1, -1, 1], dtype='int32'), - 'gt_bbox': paddle.static.InputSpec( - name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'pad_gt_mask': paddle.static.InputSpec( - name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), - }], - 'ppyoloe_crn_s_300e_coco': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'is_crowd': paddle.static.InputSpec( - name='is_crowd', shape=[-1, -1, 1], dtype='float32'), - 'gt_class': paddle.static.InputSpec( - name='gt_class', shape=[-1, -1, 1], dtype='int32'), - 'gt_bbox': paddle.static.InputSpec( - name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'pad_gt_mask': paddle.static.InputSpec( - name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), - }], -} - - -def apply_to_static(config, model): - filename = config.get('filename', None) - spec = TO_STATIC_SPEC.get(filename, None) - model = paddle.jit.to_static(model, input_spec=spec) - logger.info("Successfully to apply @to_static with specs: {}".format(spec)) - return model - - -def _prune_input_spec(input_spec, program, targets): - # try to prune static program to figure out pruned input spec - # so we perform following operations in static mode - device = paddle.get_device() - paddle.enable_static() - paddle.set_device(device) - pruned_input_spec = [{}] - program = program.clone() - program = 
program._prune(targets=targets) - global_block = program.global_block() - for name, spec in input_spec[0].items(): - try: - v = global_block.var(name) - pruned_input_spec[0][name] = spec - except Exception: - pass - paddle.disable_static(place=device) - return pruned_input_spec - - -def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): - preprocess_list = [] - label_list = [] - if arch != "lane_arch": - anno_file = dataset_cfg.get_anno() - - clsid2catid, catid2name = get_categories(metric, anno_file, arch) - - label_list = [str(cat) for cat in catid2name.values()] - - fuse_normalize = reader_cfg.get('fuse_normalize', False) - sample_transforms = reader_cfg['sample_transforms'] - for st in sample_transforms[1:]: - for key, value in st.items(): - p = {'type': key} - if key == 'Resize': - if int(image_shape[1]) != -1: - value['target_size'] = image_shape[1:] - value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR - if fuse_normalize and key == 'NormalizeImage': - continue - p.update(value) - preprocess_list.append(p) - batch_transforms = reader_cfg.get('batch_transforms', None) - if batch_transforms: - for bt in batch_transforms: - for key, value in bt.items(): - # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride) - if key == 'PadBatch': - preprocess_list.append({ - 'type': 'PadStride', - 'stride': value['pad_to_stride'] - }) - break - elif key == "CULaneResize": - # cut and resize - p = {'type': key} - p.update(value) - p.update({"cut_height": dataset_cfg.cut_height}) - preprocess_list.append(p) - break - - return preprocess_list, label_list - - -def _parse_tracker(tracker_cfg): - tracker_params = {} - for k, v in tracker_cfg.items(): - tracker_params.update({k: v}) - return tracker_params - - -def _dump_infer_config(config, path, image_shape, model): - arch_state = False - from ppdet.core.config.yaml_helpers import setup_orderdict - setup_orderdict() - use_dynamic_shape = True if image_shape[2] == -1 else False - infer_cfg = OrderedDict({ - 'mode': 'paddle', - 'draw_threshold': 0.5, - 'metric': config['metric'], - 'use_dynamic_shape': use_dynamic_shape - }) - export_onnx = config.get('export_onnx', False) - export_eb = config.get('export_eb', False) - - infer_arch = config['architecture'] - if 'RCNN' in infer_arch and export_onnx: - logger.warning( - "Exporting RCNN model to ONNX only support batch_size = 1") - infer_cfg['export_onnx'] = True - infer_cfg['export_eb'] = export_eb - - if infer_arch in MOT_ARCH: - if infer_arch == 'DeepSORT': - tracker_cfg = config['DeepSORTTracker'] - elif infer_arch == 'CenterTrack': - tracker_cfg = config['CenterTracker'] - else: - tracker_cfg = config['JDETracker'] - infer_cfg['tracker'] = _parse_tracker(tracker_cfg) - - for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items(): - if arch in infer_arch: - infer_cfg['arch'] = arch - infer_cfg['min_subgraph_size'] = min_subgraph_size - arch_state = True - break - - if infer_arch == 'PPYOLOEWithAuxHead': - infer_arch = 'PPYOLOE' - - if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']: - infer_cfg['arch'] = infer_arch - infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] - arch_state = True - - if not arch_state: - logger.error( - 'Architecture: {} is not supported for exporting model now.\n'. 
- format(infer_arch) + - 'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py') - os._exit(0) - if 'mask_head' in config[config['architecture']] and config[config[ - 'architecture']]['mask_head']: - infer_cfg['mask'] = True - label_arch = 'detection_arch' - if infer_arch in KEYPOINT_ARCH: - label_arch = 'keypoint_arch' - - if infer_arch in LANE_ARCH: - infer_cfg['arch'] = infer_arch - infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] - infer_cfg['img_w'] = config['img_w'] - infer_cfg['ori_img_h'] = config['ori_img_h'] - infer_cfg['cut_height'] = config['cut_height'] - label_arch = 'lane_arch' - head_name = "CLRHead" - infer_cfg['conf_threshold'] = config[head_name]['conf_threshold'] - infer_cfg['nms_thres'] = config[head_name]['nms_thres'] - infer_cfg['max_lanes'] = config[head_name]['max_lanes'] - infer_cfg['num_points'] = config[head_name]['num_points'] - arch_state = True - - if infer_arch in MOT_ARCH: - if config['metric'] in ['COCO', 'VOC']: - # MOT model run as Detector - reader_cfg = config['TestReader'] - dataset_cfg = config['TestDataset'] - else: - # 'metric' in ['MOT', 'MCMOT', 'KITTI'] - label_arch = 'mot_arch' - reader_cfg = config['TestMOTReader'] - dataset_cfg = config['TestMOTDataset'] - else: - reader_cfg = config['TestReader'] - dataset_cfg = config['TestDataset'] - - infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader( - reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) - - if infer_arch == 'PicoDet': - if hasattr(config, 'export') and config['export'].get( - 'post_process', - False) and not config['export'].get('benchmark', False): - infer_cfg['arch'] = 'GFL' - head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead' - infer_cfg['NMS'] = config[head_name]['nms'] - # In order to speed up the prediction, the threshold of nms - # is adjusted here, which can be changed in infer_cfg.yml - config[head_name]['nms']["score_threshold"] = 0.3 - config[head_name]['nms']["nms_threshold"] = 0.5 - infer_cfg['fpn_stride'] = config[head_name]['fpn_stride'] - - yaml.dump(infer_cfg, open(path, 'w')) - logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/pdfdet/models/Paddle/ppdet/engine/tracker.py b/pdfdet/models/Paddle/ppdet/engine/tracker.py deleted file mode 100644 index 90eb0c5..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/tracker.py +++ /dev/null @@ -1,731 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
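_dump_infer_config above ultimately serializes a handful of deploy-time settings to infer_cfg.yml. The removed code registers a YAML representer for OrderedDict via setup_orderdict(); on Python 3.7+ a plain dict preserves insertion order, which keeps this sketch self-contained. The values here are illustrative only:

import yaml

infer_cfg = {
    'mode': 'paddle',
    'draw_threshold': 0.5,
    'arch': 'PicoDet',
    'min_subgraph_size': 3,  # the TRT_MIN_SUBGRAPH entry for PicoDet above
}
with open('infer_cfg.yml', 'w') as f:
    yaml.dump(infer_cfg, f)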
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import glob -import re -import paddle -import paddle.nn as nn -import numpy as np -from tqdm import tqdm -from collections import defaultdict - -from ppdet.core.workspace import create -from ppdet.utils.checkpoint import load_weight, load_pretrain_weight -from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box -from ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results -from ppdet.modeling.mot.tracker import JDETracker, CenterTracker -from ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker -from ppdet.modeling.architectures import YOLOX -from ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric -from ppdet.data.source.category import get_categories -import ppdet.utils.stats as stats - -from .callbacks import Callback, ComposeCallback - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] -MOT_ARCH_JDE = MOT_ARCH[:2] -MOT_ARCH_SDE = MOT_ARCH[2:4] -MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti'] - -__all__ = ['Tracker'] - - -class Tracker(object): - def __init__(self, cfg, mode='eval'): - self.cfg = cfg - assert mode.lower() in ['test', 'eval'], \ - "mode should be 'test' or 'eval'" - self.mode = mode.lower() - self.optimizer = None - - # build MOT data loader - self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())] - - # build model - self.model = create(cfg.architecture) - - if isinstance(self.model.detector, YOLOX): - for k, m in self.model.named_sublayers(): - if isinstance(m, nn.BatchNorm2D): - m._epsilon = 1e-3 # for amp(fp16) - m._momentum = 0.97 # 0.03 in pytorch - - anno_file = self.dataset.get_anno() - clsid2catid, catid2name = get_categories( - self.cfg.metric, anno_file=anno_file) - self.ids2names = [] - for k, v in catid2name.items(): - self.ids2names.append(v) - - self.status = {} - self.start_epoch = 0 - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def _init_callbacks(self): - self._callbacks = [] - self._compose_callback = None - - def _init_metrics(self): - if self.mode in ['test']: - self._metrics = [] - return - - if self.cfg.metric == 'MOT': - self._metrics = [MOTMetric(), ] - elif self.cfg.metric == 'MCMOT': - self._metrics = [MCMOTMetric(self.cfg.num_classes), ] - elif self.cfg.metric == 'KITTI': - self._metrics = [KITTIMOTMetric(), ] - else: - logger.warning("Metric not support for metric type {}".format( - self.cfg.metric)) - self._metrics = [] - - def _reset_metrics(self): - for metric in self._metrics: - metric.reset() - - def register_callbacks(self, callbacks): - callbacks = [h for h in list(callbacks) if h is not None] - for c in callbacks: - assert isinstance(c, Callback), \ - "metrics shoule be instances of subclass of Metric" - self._callbacks.extend(callbacks) - self._compose_callback = ComposeCallback(self._callbacks) - - def register_metrics(self, metrics): - metrics = [m for m in list(metrics) if m is not None] - for m in metrics: - assert isinstance(m, Metric), \ - "metrics shoule be instances of subclass of Metric" - self._metrics.extend(metrics) - - def load_weights_jde(self, weights): - load_weight(self.model, weights, self.optimizer) - - def load_weights_sde(self, det_weights, reid_weights): - with_detector = 
self.model.detector is not None - with_reid = self.model.reid is not None - - if with_detector: - load_weight(self.model.detector, det_weights) - if with_reid: - load_weight(self.model.reid, reid_weights) - else: - load_weight(self.model.reid, reid_weights) - - def _eval_seq_centertrack(self, - dataloader, - save_dir=None, - show_image=False, - frame_rate=30, - draw_threshold=0): - assert isinstance(self.model.tracker, CenterTracker) - if save_dir: - if not os.path.exists(save_dir): os.makedirs(save_dir) - tracker = self.model.tracker - - timer = MOTTimer() - frame_id = 0 - self.status['mode'] = 'track' - self.model.eval() - results = defaultdict(list) # only support single class now - - for step_id, data in enumerate(tqdm(dataloader)): - self.status['step_id'] = step_id - if step_id == 0: - self.model.reset_tracking() - - # forward - timer.tic() - pred_ret = self.model(data) - - online_targets = tracker.update(pred_ret) - online_tlwhs, online_scores, online_ids = [], [], [] - for t in online_targets: - bbox = t['bbox'] - tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]] - tscore = float(t['score']) - tid = int(t['tracking_id']) - if tlwh[2] * tlwh[3] > 0: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(tscore) - timer.toc() - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - frame_id += 1 - return results, frame_id, timer.average_time, timer.calls - - def _eval_seq_jde(self, - dataloader, - save_dir=None, - show_image=False, - frame_rate=30, - draw_threshold=0): - if save_dir: - if not os.path.exists(save_dir): os.makedirs(save_dir) - tracker = self.model.tracker - tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer) - - timer = MOTTimer() - frame_id = 0 - self.status['mode'] = 'track' - self.model.eval() - results = defaultdict(list) # support single class and multi classes - - for step_id, data in enumerate(tqdm(dataloader)): - self.status['step_id'] = step_id - # forward - timer.tic() - pred_dets, pred_embs = self.model(data) - - pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy() - online_targets_dict = self.model.tracker.update(pred_dets, - pred_embs) - online_tlwhs = defaultdict(list) - online_scores = defaultdict(list) - online_ids = defaultdict(list) - for cls_id in range(self.cfg.num_classes): - online_targets = online_targets_dict[cls_id] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - tscore = t.score - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs[cls_id].append(tlwh) - online_ids[cls_id].append(tid) - online_scores[cls_id].append(tscore) - # save results - results[cls_id].append( - (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], - online_ids[cls_id])) - - timer.toc() - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - frame_id += 1 - - return results, frame_id, timer.average_time, timer.calls - - def _eval_seq_sde(self, - dataloader, - save_dir=None, - show_image=False, - frame_rate=30, - seq_name='', - scaled=False, - det_file='', - draw_threshold=0): - if save_dir: - if not os.path.exists(save_dir): os.makedirs(save_dir) - use_detector = False if not 
self.model.detector else True - use_reid = hasattr(self.model, 'reid') - if use_reid and self.model.reid is not None: - use_reid = True - else: - use_reid = False - - timer = MOTTimer() - results = defaultdict(list) - frame_id = 0 - self.status['mode'] = 'track' - self.model.eval() - if use_reid: - self.model.reid.eval() - if not use_detector: - dets_list = load_det_results(det_file, len(dataloader)) - logger.info('Finish loading detection results file {}.'.format( - det_file)) - - tracker = self.model.tracker - for step_id, data in enumerate(tqdm(dataloader)): - self.status['step_id'] = step_id - ori_image = data['ori_image'] # [bs, H, W, 3] - ori_image_shape = data['ori_image'].shape[1:3] - # ori_image_shape: [H, W] - - input_shape = data['image'].shape[2:] - # input_shape: [h, w], before data transforms, set in model config - - im_shape = data['im_shape'][0].numpy() - # im_shape: [new_h, new_w], after data transforms - scale_factor = data['scale_factor'][0].numpy() - - empty_detections = False - # when it has no detected bboxes, will not inference reid model - # and if visualize, use original image instead - - # forward - timer.tic() - if not use_detector: - dets = dets_list[frame_id] - bbox_tlwh = np.array(dets['bbox'], dtype='float32') - if bbox_tlwh.shape[0] > 0: - # detector outputs: pred_cls_ids, pred_scores, pred_bboxes - pred_cls_ids = np.array(dets['cls_id'], dtype='float32') - pred_scores = np.array(dets['score'], dtype='float32') - pred_bboxes = np.concatenate( - (bbox_tlwh[:, 0:2], - bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]), - axis=1) - else: - logger.warning( - 'Frame {} has not object, try to modify score threshold.'. - format(frame_id)) - empty_detections = True - else: - outs = self.model.detector(data) - outs['bbox'] = outs['bbox'].numpy() - outs['bbox_num'] = outs['bbox_num'].numpy() - - if len(outs['bbox']) > 0 and empty_detections == False: - # detector outputs: pred_cls_ids, pred_scores, pred_bboxes - pred_cls_ids = outs['bbox'][:, 0:1] - pred_scores = outs['bbox'][:, 1:2] - if not scaled: - # Note: scaled=False only in JDE YOLOv3 or other detectors - # with LetterBoxResize and JDEBBoxPostProcess. - # - # 'scaled' means whether the coords after detector outputs - # have been scaled back to the original image, set True - # in general detector, set False in JDE YOLOv3. - pred_bboxes = scale_coords(outs['bbox'][:, 2:], - input_shape, im_shape, - scale_factor) - else: - pred_bboxes = outs['bbox'][:, 2:] - pred_dets_old = np.concatenate( - (pred_cls_ids, pred_scores, pred_bboxes), axis=1) - else: - logger.warning( - 'Frame {} has not detected object, try to modify score threshold.'. - format(frame_id)) - empty_detections = True - - if not empty_detections: - pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape) - if len(keep_idx[0]) == 0: - logger.warning( - 'Frame {} has not detected object left after clip_box.'. 
- format(frame_id)) - empty_detections = True - - if empty_detections: - timer.toc() - # if visualize, use original image instead - online_ids, online_tlwhs, online_scores = None, None, None - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - frame_id += 1 - # thus will not inference reid model - continue - - pred_cls_ids = pred_cls_ids[keep_idx[0]] - pred_scores = pred_scores[keep_idx[0]] - pred_dets = np.concatenate( - (pred_cls_ids, pred_scores, pred_xyxys), axis=1) - - if use_reid: - crops = get_crops( - pred_xyxys, - ori_image, - w=tracker.input_size[0], - h=tracker.input_size[1]) - crops = paddle.to_tensor(crops) - - data.update({'crops': crops}) - pred_embs = self.model(data)['embeddings'].numpy() - else: - pred_embs = None - - if isinstance(tracker, DeepSORTTracker): - online_tlwhs, online_scores, online_ids = [], [], [] - tracker.predict() - online_targets = tracker.update(pred_dets, pred_embs) - for t in online_targets: - if not t.is_confirmed() or t.time_since_update > 1: - continue - tlwh = t.to_tlwh() - tscore = t.score - tid = t.track_id - if tscore < draw_threshold: continue - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs.append(tlwh) - online_scores.append(tscore) - online_ids.append(tid) - timer.toc() - - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - elif isinstance(tracker, JDETracker): - # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set - tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams( - seq_name, tracker.track_buffer, tracker.conf_thres) - - online_targets_dict = tracker.update(pred_dets_old, pred_embs) - online_tlwhs = defaultdict(list) - online_scores = defaultdict(list) - online_ids = defaultdict(list) - for cls_id in range(self.cfg.num_classes): - online_targets = online_targets_dict[cls_id] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - tscore = t.score - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs[cls_id].append(tlwh) - online_ids[cls_id].append(tid) - online_scores[cls_id].append(tscore) - # save results - results[cls_id].append( - (frame_id + 1, online_tlwhs[cls_id], - online_scores[cls_id], online_ids[cls_id])) - timer.toc() - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - elif isinstance(tracker, OCSORTTracker): - # OC_SORT Tracker - online_targets = tracker.update(pred_dets_old, pred_embs) - online_tlwhs = [] - online_ids = [] - online_scores = [] - for t in online_targets: - tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]] - tscore = float(t[4]) - tid = int(t[5]) - if tlwh[2] * tlwh[3] > 0: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(tscore) - timer.toc() - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - 
elif isinstance(tracker, BOTSORTTracker): - # BOTSORT Tracker - online_targets = tracker.update( - pred_dets_old, img=ori_image.numpy()) - online_tlwhs = [] - online_ids = [] - online_scores = [] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - tscore = t.score - if tlwh[2] * tlwh[3] > 0: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(tscore) - timer.toc() - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - else: - raise ValueError(tracker) - frame_id += 1 - - return results, frame_id, timer.average_time, timer.calls - - def mot_evaluate(self, - data_root, - seqs, - output_dir, - data_type='mot', - model_type='JDE', - save_images=False, - save_videos=False, - show_image=False, - scaled=False, - det_results_dir=''): - if not os.path.exists(output_dir): os.makedirs(output_dir) - result_root = os.path.join(output_dir, 'mot_results') - if not os.path.exists(result_root): os.makedirs(result_root) - assert data_type in MOT_DATA_TYPE, \ - "data_type should be 'mot', 'mcmot' or 'kitti'" - assert model_type in MOT_ARCH, \ - "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" - - # run tracking - n_frame = 0 - timer_avgs, timer_calls = [], [] - for seq in seqs: - infer_dir = os.path.join(data_root, seq) - if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir): - logger.warning("Seq {} error, {} has no images.".format( - seq, infer_dir)) - continue - if os.path.exists(os.path.join(infer_dir, 'img1')): - infer_dir = os.path.join(infer_dir, 'img1') - - frame_rate = 30 - seqinfo = os.path.join(data_root, seq, 'seqinfo.ini') - if os.path.exists(seqinfo): - meta_info = open(seqinfo).read() - frame_rate = int(meta_info[meta_info.find('frameRate') + 10: - meta_info.find('\nseqLength')]) - - save_dir = os.path.join(output_dir, 'mot_outputs', - seq) if save_images or save_videos else None - logger.info('Evaluate seq: {}'.format(seq)) - - self.dataset.set_images(self.get_infer_images(infer_dir)) - dataloader = create('EvalMOTReader')(self.dataset, 0) - - result_filename = os.path.join(result_root, '{}.txt'.format(seq)) - - with paddle.no_grad(): - if model_type in MOT_ARCH_JDE: - results, nf, ta, tc = self._eval_seq_jde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate) - elif model_type in MOT_ARCH_SDE: - results, nf, ta, tc = self._eval_seq_sde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate, - seq_name=seq, - scaled=scaled, - det_file=os.path.join(det_results_dir, - '{}.txt'.format(seq))) - elif model_type == 'CenterTrack': - results, nf, ta, tc = self._eval_seq_centertrack( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate) - else: - raise ValueError(model_type) - - write_mot_results(result_filename, results, data_type, - self.cfg.num_classes) - n_frame += nf - timer_avgs.append(ta) - timer_calls.append(tc) - - if save_videos: - output_video_path = os.path.join(save_dir, '..', - '{}_vis.mp4'.format(seq)) - cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( - save_dir, output_video_path) - os.system(cmd_str) - logger.info('Save video in {}.'.format(output_video_path)) - - # update metrics - for metric in self._metrics: - metric.update(data_root, seq, data_type, result_root, - result_filename) - - timer_avgs = 
np.asarray(timer_avgs) - timer_calls = np.asarray(timer_calls) - all_time = np.dot(timer_avgs, timer_calls) - avg_time = all_time / np.sum(timer_calls) - logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format( - all_time, 1.0 / avg_time)) - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def get_infer_images(self, infer_dir): - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - images = set() - assert os.path.isdir(infer_dir), \ - "infer_dir {} is not a directory".format(infer_dir) - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - images.sort() - assert len(images) > 0, "no image found in {}".format(infer_dir) - logger.info("Found {} inference images in total.".format(len(images))) - return images - - def mot_predict_seq(self, - video_file, - frame_rate, - image_dir, - output_dir, - data_type='mot', - model_type='JDE', - save_images=False, - save_videos=True, - show_image=False, - scaled=False, - det_results_dir='', - draw_threshold=0.5): - assert video_file is not None or image_dir is not None, \ - "--video_file or --image_dir should be set." - assert video_file is None or os.path.isfile(video_file), \ - "{} is not a file".format(video_file) - assert image_dir is None or os.path.isdir(image_dir), \ - "{} is not a directory".format(image_dir) - - if not os.path.exists(output_dir): os.makedirs(output_dir) - result_root = os.path.join(output_dir, 'mot_results') - if not os.path.exists(result_root): os.makedirs(result_root) - assert data_type in MOT_DATA_TYPE, \ - "data_type should be 'mot', 'mcmot' or 'kitti'" - assert model_type in MOT_ARCH, \ - "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" - - # run tracking - if video_file: - seq = video_file.split('/')[-1].split('.')[0] - self.dataset.set_video(video_file, frame_rate) - logger.info('Starting tracking video {}'.format(video_file)) - elif image_dir: - seq = image_dir.split('/')[-1].split('.')[0] - if os.path.exists(os.path.join(image_dir, 'img1')): - image_dir = os.path.join(image_dir, 'img1') - images = [ - '{}/{}'.format(image_dir, x) for x in os.listdir(image_dir) - ] - images.sort() - self.dataset.set_images(images) - logger.info('Starting tracking folder {}, found {} images'.format( - image_dir, len(images))) - else: - raise ValueError('--video_file or --image_dir should be set.') - - save_dir = os.path.join(output_dir, 'mot_outputs', - seq) if save_images or save_videos else None - - dataloader = create('TestMOTReader')(self.dataset, 0) - result_filename = os.path.join(result_root, '{}.txt'.format(seq)) - if frame_rate == -1: - frame_rate = self.dataset.frame_rate - - with paddle.no_grad(): - if model_type in MOT_ARCH_JDE: - results, nf, ta, tc = self._eval_seq_jde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate, - draw_threshold=draw_threshold) - elif model_type in MOT_ARCH_SDE: - results, nf, ta, tc = self._eval_seq_sde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate, - seq_name=seq, - scaled=scaled, - det_file=os.path.join(det_results_dir, - '{}.txt'.format(seq)), - draw_threshold=draw_threshold) - elif model_type == 'CenterTrack': - results, nf, ta, tc = self._eval_seq_centertrack( - dataloader, - 
save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate) - else: - raise ValueError(model_type) - - if save_videos: - output_video_path = os.path.join(save_dir, '..', - '{}_vis.mp4'.format(seq)) - cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( - save_dir, output_video_path) - os.system(cmd_str) - logger.info('Save video in {}'.format(output_video_path)) - - write_mot_results(result_filename, results, data_type, - self.cfg.num_classes) - - -def get_trick_hyperparams(video_name, ori_buffer, ori_thresh): - if video_name[:3] != 'MOT': - # only used for MOTChallenge (MOT17, MOT20) Test-set - return ori_buffer, ori_thresh - - video_name = video_name[:8] - if 'MOT17-05' in video_name: - track_buffer = 14 - elif 'MOT17-13' in video_name: - track_buffer = 25 - else: - track_buffer = ori_buffer - - if 'MOT17-01' in video_name: - track_thresh = 0.65 - elif 'MOT17-06' in video_name: - track_thresh = 0.65 - elif 'MOT17-12' in video_name: - track_thresh = 0.7 - elif 'MOT17-14' in video_name: - track_thresh = 0.67 - else: - track_thresh = ori_thresh - - if 'MOT20-06' in video_name or 'MOT20-08' in video_name: - track_thresh = 0.3 - else: - track_thresh = ori_thresh - - return track_buffer, ori_thresh diff --git a/pdfdet/models/Paddle/ppdet/engine/trainer.py b/pdfdet/models/Paddle/ppdet/engine/trainer.py deleted file mode 100644 index f2d44d1..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/trainer.py +++ /dev/null @@ -1,1321 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
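The timing summary at the end of mot_evaluate above weights each sequence's average per-frame cost by its frame count before inverting into FPS. The same arithmetic on toy numbers:

import numpy as np

timer_avgs = np.asarray([0.02, 0.03])  # seconds per frame, per sequence
timer_calls = np.asarray([600, 400])   # frames tracked, per sequence
all_time = np.dot(timer_avgs, timer_calls)   # total seconds
avg_time = all_time / np.sum(timer_calls)    # weighted seconds per frame
print('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format(
    all_time, 1.0 / avg_time))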
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import copy -import time -from tqdm import tqdm - -import numpy as np -import typing -from PIL import Image, ImageOps, ImageFile - -ImageFile.LOAD_TRUNCATED_IMAGES = True - -import paddle -import paddle.nn as nn -import paddle.distributed as dist -from paddle.distributed import fleet -from paddle.static import InputSpec -from ppdet.optimizer import ModelEMA - -from ppdet.core.workspace import create -from ppdet.utils.checkpoint import load_weight, load_pretrain_weight -from ppdet.utils.visualizer import visualize_results, save_result -from ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval -from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric, CULaneMetric -from ppdet.data.source.sniper_coco import SniperCOCODataSet -from ppdet.data.source.category import get_categories -import ppdet.utils.stats as stats -from ppdet.utils.fuse_utils import fuse_conv_bn -from ppdet.utils import profiler -from ppdet.modeling.post_process import multiclass_nms -from ppdet.modeling.lane_utils import imshow_lanes - -from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback, SemiCheckpointer, SemiLogPrinter -from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static - -from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients - -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -__all__ = ['Trainer'] - -MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] - - -class Trainer(object): - def __init__(self, cfg, mode='train'): - self.cfg = cfg.copy() - assert mode.lower() in ['train', 'eval', 'test'], \ - "mode should be 'train', 'eval' or 'test'" - self.mode = mode.lower() - self.optimizer = None - self.is_loaded_weights = False - self.use_amp = self.cfg.get('amp', False) - self.amp_level = self.cfg.get('amp_level', 'O1') - self.custom_white_list = self.cfg.get('custom_white_list', None) - self.custom_black_list = self.cfg.get('custom_black_list', None) - self.use_master_grad = self.cfg.get('master_grad', False) - if 'slim' in cfg and cfg['slim_type'] == 'PTQ': - self.cfg['TestDataset'] = create('TestDataset')() - - # build data loader - capital_mode = self.mode.capitalize() - if cfg.architecture in MOT_ARCH and self.mode in [ - 'eval', 'test' - ] and cfg.metric not in ['COCO', 'VOC']: - self.dataset = self.cfg['{}MOTDataset'.format( - capital_mode)] = create('{}MOTDataset'.format(capital_mode))() - else: - self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( - '{}Dataset'.format(capital_mode))() - - if cfg.architecture == 'DeepSORT' and self.mode == 'train': - logger.error('DeepSORT has no need of training on mot dataset.') - sys.exit(1) - - if cfg.architecture == 'FairMOT' and self.mode == 'eval': - images = self.parse_mot_images(cfg) - self.dataset.set_images(images) - - if self.mode == 'train': - self.loader = create('{}Reader'.format(capital_mode))( - self.dataset, cfg.worker_num) - - if cfg.architecture == 'JDE' and self.mode == 'train': - self.cfg['JDEEmbeddingHead'][ - 'num_identities'] = self.dataset.num_identities_dict[0] - # JDE only support single class MOT now. 
- - if cfg.architecture == 'FairMOT' and self.mode == 'train': - self.cfg['FairMOTEmbeddingHead'][ - 'num_identities_dict'] = self.dataset.num_identities_dict - # FairMOT support single class and multi-class MOT now. - - # build model - if 'model' not in self.cfg: - self.model = create(cfg.architecture) - else: - self.model = self.cfg.model - self.is_loaded_weights = True - - if cfg.architecture == 'YOLOX': - for k, m in self.model.named_sublayers(): - if isinstance(m, nn.BatchNorm2D): - m._epsilon = 1e-3 # for amp(fp16) - m._momentum = 0.97 # 0.03 in pytorch - - #normalize params for deploy - if 'slim' in cfg and cfg['slim_type'] == 'OFA': - self.model.model.load_meanstd(cfg['TestReader'][ - 'sample_transforms']) - elif 'slim' in cfg and cfg['slim_type'] == 'Distill': - self.model.student_model.load_meanstd(cfg['TestReader'][ - 'sample_transforms']) - elif 'slim' in cfg and cfg[ - 'slim_type'] == 'DistillPrune' and self.mode == 'train': - self.model.student_model.load_meanstd(cfg['TestReader'][ - 'sample_transforms']) - else: - self.model.load_meanstd(cfg['TestReader']['sample_transforms']) - - # EvalDataset build with BatchSampler to evaluate in single device - # TODO: multi-device evaluate - if self.mode == 'eval': - if cfg.architecture == 'FairMOT': - self.loader = create('EvalMOTReader')(self.dataset, 0) - elif cfg.architecture == "METRO_Body": - reader_name = '{}Reader'.format(self.mode.capitalize()) - self.loader = create(reader_name)(self.dataset, cfg.worker_num) - else: - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - reader_name = '{}Reader'.format(self.mode.capitalize()) - # If metric is VOC, need to be set collate_batch=False. - if cfg.metric == 'VOC': - self.cfg[reader_name]['collate_batch'] = False - self.loader = create(reader_name)(self.dataset, cfg.worker_num, - self._eval_batch_sampler) - # TestDataset build after user set images, skip loader creation here - - # get Params - print_params = self.cfg.get('print_params', False) - if print_params: - params = sum([ - p.numel() for n, p in self.model.named_parameters() - if all([x not in n for x in ['_mean', '_variance', 'aux_']]) - ]) # exclude BatchNorm running status - logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[ - 0])) - - # build optimizer in train mode - if self.mode == 'train': - steps_per_epoch = len(self.loader) - if steps_per_epoch < 1: - logger.warning( - "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." - ) - self.lr = create('LearningRate')(steps_per_epoch) - self.optimizer = create('OptimizerBuilder')(self.lr, self.model) - - # Unstructured pruner is only enabled in the train mode. 
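The print_params branch above counts model size while skipping BatchNorm running statistics and auxiliary-head tensors, whose parameter names carry _mean, _variance or aux_. A sketch of that filter — the int() around numel() hedges against Paddle returning a 0-d Tensor there:

import paddle.nn as nn

def count_params_m(model, skip=('_mean', '_variance', 'aux_')):
    # Sum parameter sizes whose names match none of the skip markers.
    total = sum(int(p.numel()) for n, p in model.named_parameters()
                if not any(x in n for x in skip))
    return total / 1e6  # in millions, matching the removed log line

print('Model Params : {} M.'.format(count_params_m(nn.Linear(128, 64))))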
- if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) - if self.use_amp and self.amp_level == 'O2': - paddle_version = paddle.__version__[:3] - # paddle version >= 2.5.0 or develop - if paddle_version in ["2.5", "0.0"]: - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level, - master_grad=self.use_master_grad) - else: - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level) - self.use_ema = ('use_ema' in cfg and cfg['use_ema']) - if self.use_ema: - ema_decay = self.cfg.get('ema_decay', 0.9998) - ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') - cycle_epoch = self.cfg.get('cycle_epoch', -1) - ema_black_list = self.cfg.get('ema_black_list', None) - ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False) - self.ema = ModelEMA( - self.model, - decay=ema_decay, - ema_decay_type=ema_decay_type, - cycle_epoch=cycle_epoch, - ema_black_list=ema_black_list, - ema_filter_no_grad=ema_filter_no_grad) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - self.start_epoch = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def _init_callbacks(self): - if self.mode == 'train': - if self.cfg.get('ssod_method', - False) and self.cfg['ssod_method'] == 'Semi_RTDETR': - self._callbacks = [SemiLogPrinter(self), SemiCheckpointer(self)] - else: - self._callbacks = [LogPrinter(self), Checkpointer(self)] - if self.cfg.get('use_vdl', False): - self._callbacks.append(VisualDLWriter(self)) - if self.cfg.get('save_proposals', False): - self._callbacks.append(SniperProposalsGenerator(self)) - if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: - self._callbacks.append(WandbCallback(self)) - self._compose_callback = ComposeCallback(self._callbacks) - elif self.mode == 'eval': - self._callbacks = [LogPrinter(self)] - if self.cfg.metric == 'WiderFace': - self._callbacks.append(WiferFaceEval(self)) - self._compose_callback = ComposeCallback(self._callbacks) - elif self.mode == 'test' and self.cfg.get('use_vdl', False): - self._callbacks = [VisualDLWriter(self)] - self._compose_callback = ComposeCallback(self._callbacks) - else: - self._callbacks = [] - self._compose_callback = None - - def _init_metrics(self, validate=False): - if self.mode == 'test' or (self.mode == 'train' and not validate): - self._metrics = [] - return - classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False - if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO": - # TODO: bias should be unified - bias = 1 if self.cfg.get('bias', False) else 0 - output_eval = self.cfg['output_eval'] \ - if 'output_eval' in self.cfg else None - save_prediction_only = self.cfg.get('save_prediction_only', False) - - # pass clsid2catid info to metric instance to avoid multiple loading - # annotation file - clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ - if self.mode == 'eval' else None - - # when do validation in train, annotation file should be get from - # EvalReader instead of self.dataset(which is TrainReader) - if self.mode == 'train' and validate: - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - dataset = eval_dataset - else: 
- dataset = self.dataset - anno_file = dataset.get_anno() - - IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' - if self.cfg.metric == "COCO": - self._metrics = [ - COCOMetric( - anno_file=anno_file, - clsid2catid=clsid2catid, - classwise=classwise, - output_eval=output_eval, - bias=bias, - IouType=IouType, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == "SNIPERCOCO": # sniper - self._metrics = [ - SNIPERCOCOMetric( - anno_file=anno_file, - dataset=dataset, - clsid2catid=clsid2catid, - classwise=classwise, - output_eval=output_eval, - bias=bias, - IouType=IouType, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'RBOX': - # TODO: bias should be unified - bias = self.cfg['bias'] if 'bias' in self.cfg else 0 - output_eval = self.cfg['output_eval'] \ - if 'output_eval' in self.cfg else None - save_prediction_only = self.cfg.get('save_prediction_only', False) - imid2path = self.cfg.get('imid2path', None) - - # when do validation in train, annotation file should be get from - # EvalReader instead of self.dataset(which is TrainReader) - anno_file = self.dataset.get_anno() - if self.mode == 'train' and validate: - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - - self._metrics = [ - RBoxMetric( - anno_file=anno_file, - classwise=classwise, - output_eval=output_eval, - bias=bias, - save_prediction_only=save_prediction_only, - imid2path=imid2path) - ] - elif self.cfg.metric == 'VOC': - output_eval = self.cfg['output_eval'] \ - if 'output_eval' in self.cfg else None - save_prediction_only = self.cfg.get('save_prediction_only', False) - - self._metrics = [ - VOCMetric( - label_list=self.dataset.get_label_list(), - class_num=self.cfg.num_classes, - map_type=self.cfg.map_type, - classwise=classwise, - output_eval=output_eval, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'WiderFace': - multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True - self._metrics = [ - WiderFaceMetric( - image_dir=os.path.join(self.dataset.dataset_dir, - self.dataset.image_dir), - anno_file=self.dataset.get_anno(), - multi_scale=multi_scale) - ] - elif self.cfg.metric == 'KeyPointTopDownCOCOEval': - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - KeyPointTopDownCOCOEval( - anno_file, - len(eval_dataset), - self.cfg.num_joints, - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval': - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - KeyPointTopDownCOCOWholeBadyHandEval( - anno_file, - len(eval_dataset), - self.cfg.num_joints, - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'KeyPointTopDownMPIIEval': - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - KeyPointTopDownMPIIEval( - anno_file, - len(eval_dataset), - self.cfg.num_joints, - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 
'Pose3DEval': - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - Pose3DEval( - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'MOTDet': - self._metrics = [JDEDetMetric(), ] - elif self.cfg.metric == 'CULaneMetric': - output_eval = self.cfg.get('output_eval', None) - self._metrics = [ - CULaneMetric( - cfg=self.cfg, - output_eval=output_eval, - split=self.dataset.split, - dataset_dir=self.cfg.dataset_dir) - ] - else: - logger.warning("Metric not support for metric type {}".format( - self.cfg.metric)) - self._metrics = [] - - def _reset_metrics(self): - for metric in self._metrics: - metric.reset() - - def register_callbacks(self, callbacks): - callbacks = [c for c in list(callbacks) if c is not None] - for c in callbacks: - assert isinstance(c, Callback), \ - "metrics shoule be instances of subclass of Metric" - self._callbacks.extend(callbacks) - self._compose_callback = ComposeCallback(self._callbacks) - - def register_metrics(self, metrics): - metrics = [m for m in list(metrics) if m is not None] - for m in metrics: - assert isinstance(m, Metric), \ - "metrics shoule be instances of subclass of Metric" - self._metrics.extend(metrics) - - def load_weights(self, weights, ARSL_eval=False): - if self.is_loaded_weights: - return - self.start_epoch = 0 - load_pretrain_weight(self.model, weights, ARSL_eval) - logger.debug("Load weights {} to start training".format(weights)) - - def load_weights_sde(self, det_weights, reid_weights): - if self.model.detector: - load_weight(self.model.detector, det_weights) - if self.model.reid: - load_weight(self.model.reid, reid_weights) - else: - load_weight(self.model.reid, reid_weights) - - def resume_weights(self, weights): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer) - else: - self.start_epoch = load_weight(self.model, weights, self.optimizer, - self.ema if self.use_ema else None) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - - def train(self, validate=False): - assert self.mode == 'train', "Model not in 'train' mode" - Init_mark = False - if validate: - self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( - "EvalDataset")() - - model = self.model - if self.cfg.get('to_static', False): - model = apply_to_static(self.cfg, model) - sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and - (self.cfg.use_gpu or self.cfg.use_mlu) and self._nranks > 1) - if sync_bn: - model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) - - # enabel auto mixed precision mode - if self.use_amp: - scaler = paddle.amp.GradScaler( - enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, - init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) - # get distributed model - if self.cfg.get('fleet', False): - model = fleet.distributed_model(model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - model = paddle.DataParallel( - model, find_unused_parameters=find_unused_parameters) - - self.status.update({ - 'epoch_id': self.start_epoch, - 'step_id': 0, - 'steps_per_epoch': len(self.loader) - }) - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - 
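Note: the epoch/step loop that follows is dominated by Paddle AMP plumbing. Distilled into a standalone sketch, one AMP step looks like this; `amp_train_step` is a hypothetical helper, `model`, `optimizer` and `data` are placeholders, and `scaler` stands for the `paddle.amp.GradScaler` created at the top of `train()`:

    import paddle

    def amp_train_step(model, optimizer, scaler, data):
        # Forward under auto_cast so eligible ops run in fp16;
        # scale the loss before backward to avoid fp16 underflow.
        with paddle.amp.auto_cast(level='O1'):
            loss = model(data)['loss']
        scaled = scaler.scale(loss)
        scaled.backward()
        # In dygraph mode, scaler.minimize is equivalent to unscaling
        # followed by optimizer.step (per the comment in the loop below).
        scaler.minimize(optimizer, scaled)
        optimizer.clear_grad()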
self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) - - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num) - self._flops(flops_loader) - profiler_options = self.cfg.get('profiler_options', None) - - self._compose_callback.on_train_begin(self.status) - - use_fused_allreduce_gradients = self.cfg[ - 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False - - for epoch_id in range(self.start_epoch, self.cfg.epoch): - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset.set_epoch(epoch_id) - model.train() - iter_tic = time.time() - for step_id, data in enumerate(self.loader): - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - data['epoch_id'] = epoch_id - if self.cfg.get('to_static', - False) and 'image_file' in data.keys(): - data.pop('image_file') - - if self.use_amp: - if isinstance( - model, paddle. - DataParallel) and use_fused_allreduce_gradients: - with model.no_sync(): - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or - self.cfg.use_npu or self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - scaled_loss = scaler.scale(loss) - scaled_loss.backward() - fused_allreduce_gradients( - list(model.parameters()), None) - else: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_npu or - self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - scaled_loss = scaler.scale(loss) - scaled_loss.backward() - # in dygraph mode, optimizer.minimize is equal to optimizer.step - scaler.minimize(self.optimizer, scaled_loss) - else: - if isinstance( - model, paddle. 
- DataParallel) and use_fused_allreduce_gradients: - with model.no_sync(): - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() - fused_allreduce_gradients( - list(model.parameters()), None) - else: - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() - self.optimizer.step() - curr_lr = self.optimizer.get_lr() - self.lr.step() - if self.cfg.get('unstructured_prune'): - self.pruner.step() - self.optimizer.clear_grad() - self.status['learning_rate'] = curr_lr - - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(outputs) - - self.status['batch_time'].update(time.time() - iter_tic) - self._compose_callback.on_step_end(self.status) - if self.use_ema: - self.ema.update() - iter_tic = time.time() - - if self.cfg.get('unstructured_prune'): - self.pruner.update_params() - - is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) - if is_snapshot and self.use_ema: - # apply ema weight on model - weight = copy.deepcopy(self.model.state_dict()) - self.model.set_dict(self.ema.apply()) - self.status['weight'] = weight - - self._compose_callback.on_epoch_end(self.status) - - if validate and is_snapshot: - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. - if self.cfg.metric == 'VOC': - self.cfg['EvalReader']['collate_batch'] = False - if self.cfg.metric == "Pose3DEval": - self._eval_loader = create('EvalReader')( - self._eval_dataset, self.cfg.worker_num) - else: - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - # if validation in training is enabled, metrics should be re-init - # Init_mark makes sure this code will only execute once - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - - with paddle.no_grad(): - self.status['save_best_model'] = True - self._eval_with_loader(self._eval_loader) - - if is_snapshot and self.use_ema: - # reset original weight - self.model.set_dict(weight) - self.status.pop('weight') - - self._compose_callback.on_train_end(self.status) - - def _eval_with_loader(self, loader): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - if self.use_amp: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_npu or - self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - outs = self.model(data) - else: - outs = self.model(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - 
sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def evaluate(self): - # get distributed model - if self.cfg.get('fleet', False): - self.model = fleet.distributed_model(self.model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - with paddle.no_grad(): - self._eval_with_loader(self.loader) - - def _eval_with_loader_slice(self, - loader, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method='nms', - match_threshold=0.6, - match_metric='iou'): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - - merged_bboxs = [] - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - if self.use_amp: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_npu or - self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - outs = self.model(data) - else: - outs = self.model(data) - - shift_amount = data['st_pix'] - outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount - outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount - merged_bboxs.append(outs['bbox']) - - if data['is_last'] > 0: - # merge matching predictions - merged_results = {'bbox': []} - if combine_method == 'nms': - final_boxes = multiclass_nms( - np.concatenate(merged_bboxs), self.cfg.num_classes, - match_threshold, match_metric) - merged_results['bbox'] = np.concatenate(final_boxes) - elif combine_method == 'concat': - merged_results['bbox'] = np.concatenate(merged_bboxs) - else: - raise ValueError( - "Now only support 'nms' or 'concat' to fuse detection results." 
- ) - merged_results['im_id'] = np.array([[0]]) - merged_results['bbox_num'] = np.array( - [len(merged_results['bbox'])]) - - merged_bboxs = [] - data['im_id'] = data['ori_im_id'] - # update metrics - for metric in self._metrics: - metric.update(data, merged_results) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def evaluate_slice(self, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method='nms', - match_threshold=0.6, - match_metric='iou'): - with paddle.no_grad(): - self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, - combine_method, match_threshold, - match_metric) - - def slice_predict(self, - images, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method='nms', - match_threshold=0.6, - match_metric='iou', - draw_threshold=0.5, - output_dir='output', - save_results=False, - visualize=True): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - self.dataset.set_slice_images(images, slice_size, overlap_ratio) - loader = create('TestReader')(self.dataset, 0) - imid2path = self.dataset.get_imid2path() - - def setup_metrics_for_loader(): - # mem - metrics = copy.deepcopy(self._metrics) - mode = self.mode - save_prediction_only = self.cfg[ - 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None - output_eval = self.cfg[ - 'output_eval'] if 'output_eval' in self.cfg else None - - # modify - self.mode = '_test' - self.cfg['save_prediction_only'] = True - self.cfg['output_eval'] = output_dir - self.cfg['imid2path'] = imid2path - self._init_metrics() - - # restore - self.mode = mode - self.cfg.pop('save_prediction_only') - if save_prediction_only is not None: - self.cfg['save_prediction_only'] = save_prediction_only - - self.cfg.pop('output_eval') - if output_eval is not None: - self.cfg['output_eval'] = output_eval - - self.cfg.pop('imid2path') - - _metrics = copy.deepcopy(self._metrics) - self._metrics = metrics - - return _metrics - - if save_results: - metrics = setup_metrics_for_loader() - else: - metrics = [] - - anno_file = self.dataset.get_anno() - clsid2catid, catid2name = get_categories( - self.cfg.metric, anno_file=anno_file) - - # Run Infer - self.status['mode'] = 'test' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('TestReader')(self.dataset, 0) - self._flops(flops_loader) - - results = [] # all images - merged_bboxs = [] # single image - for step_id, data in enumerate(tqdm(loader)): - self.status['step_id'] = step_id - # forward - outs = self.model(data) - - outs['bbox'] = outs['bbox'].numpy() # only in test mode - shift_amount = data['st_pix'] - outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() - outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() - merged_bboxs.append(outs['bbox']) - - if data['is_last'] > 0: - # merge matching predictions - merged_results = {'bbox': []} - if combine_method == 'nms': - final_boxes = multiclass_nms( - np.concatenate(merged_bboxs), self.cfg.num_classes, - 
match_threshold, match_metric) - merged_results['bbox'] = np.concatenate(final_boxes) - elif combine_method == 'concat': - merged_results['bbox'] = np.concatenate(merged_bboxs) - else: - raise ValueError( - "Now only support 'nms' or 'concat' to fuse detection results." - ) - merged_results['im_id'] = np.array([[0]]) - merged_results['bbox_num'] = np.array( - [len(merged_results['bbox'])]) - - merged_bboxs = [] - data['im_id'] = data['ori_im_id'] - - for _m in metrics: - _m.update(data, merged_results) - - for key in ['im_shape', 'scale_factor', 'im_id']: - if isinstance(data, typing.Sequence): - merged_results[key] = data[0][key] - else: - merged_results[key] = data[key] - for key, value in merged_results.items(): - if hasattr(value, 'numpy'): - merged_results[key] = value.numpy() - results.append(merged_results) - - for _m in metrics: - _m.accumulate() - _m.reset() - - if visualize: - for outs in results: - batch_res = get_infer_results(outs, clsid2catid) - bbox_num = outs['bbox_num'] - - start = 0 - for i, im_id in enumerate(outs['im_id']): - image_path = imid2path[int(im_id)] - image = Image.open(image_path).convert('RGB') - image = ImageOps.exif_transpose(image) - self.status['original_image'] = np.array(image.copy()) - - end = start + bbox_num[i] - bbox_res = batch_res['bbox'][start:end] \ - if 'bbox' in batch_res else None - mask_res = batch_res['mask'][start:end] \ - if 'mask' in batch_res else None - segm_res = batch_res['segm'][start:end] \ - if 'segm' in batch_res else None - keypoint_res = batch_res['keypoint'][start:end] \ - if 'keypoint' in batch_res else None - pose3d_res = batch_res['pose3d'][start:end] \ - if 'pose3d' in batch_res else None - image = visualize_results( - image, bbox_res, mask_res, segm_res, keypoint_res, - pose3d_res, int(im_id), catid2name, draw_threshold) - self.status['result_image'] = np.array(image.copy()) - if self._compose_callback: - self._compose_callback.on_step_end(self.status) - # save image with detection - save_name = self._get_save_image_name(output_dir, - image_path) - logger.info("Detection bbox results save in {}".format( - save_name)) - image.save(save_name, quality=95) - - start = end - - def predict(self, - images): - - self.dataset.set_images(images) - loader = create('TestReader')(self.dataset, 0) - - # Run Infer - self.model.eval() - results = [] - for step_id, data in enumerate(loader): - # forward - if hasattr(self.model, 'modelTeacher'): - outs = self.model.modelTeacher(data) - else: - outs = self.model(data) - - for key in ['im_shape', 'scale_factor', 'im_id']: - if isinstance(data, typing.Sequence): - outs[key] = data[0][key] - else: - outs[key] = data[key] - for key, value in outs.items(): - if hasattr(value, 'numpy'): - outs[key] = value.numpy() - results.append(outs) - - return results - - def _get_save_image_name(self, output_dir, image_path): - """ - Get save image name from source image path. 
- """ - image_name = os.path.split(image_path)[-1] - name, ext = os.path.splitext(image_name) - return os.path.join(output_dir, "{}".format(name)) + ext - - def _get_infer_cfg_and_input_spec(self, - save_dir, - prune_input=True, - kl_quant=False, - yaml_name=None): - if yaml_name is None: - yaml_name = 'infer_cfg.yml' - image_shape = None - im_shape = [None, 2] - scale_factor = [None, 2] - if self.cfg.architecture in MOT_ARCH: - test_reader_name = 'TestMOTReader' - else: - test_reader_name = 'TestReader' - if 'inputs_def' in self.cfg[test_reader_name]: - inputs_def = self.cfg[test_reader_name]['inputs_def'] - image_shape = inputs_def.get('image_shape', None) - # set image_shape=[None, 3, -1, -1] as default - if image_shape is None: - image_shape = [None, 3, -1, -1] - - if len(image_shape) == 3: - image_shape = [None] + image_shape - else: - im_shape = [image_shape[0], 2] - scale_factor = [image_shape[0], 2] - - if hasattr(self.model, 'deploy'): - self.model.deploy = True - - if 'slim' not in self.cfg: - for layer in self.model.sublayers(): - if hasattr(layer, 'convert_to_deploy'): - layer.convert_to_deploy() - - if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ - 'export'] and self.cfg['export']['fuse_conv_bn']: - self.model = fuse_conv_bn(self.model) - - export_post_process = self.cfg['export'].get( - 'post_process', False) if hasattr(self.cfg, 'export') else True - export_nms = self.cfg['export'].get('nms', False) if hasattr( - self.cfg, 'export') else True - export_benchmark = self.cfg['export'].get( - 'benchmark', False) if hasattr(self.cfg, 'export') else False - if hasattr(self.model, 'fuse_norm'): - self.model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize', - False) - if hasattr(self.model, 'export_post_process'): - self.model.export_post_process = export_post_process if not export_benchmark else False - if hasattr(self.model, 'export_nms'): - self.model.export_nms = export_nms if not export_benchmark else False - if export_post_process and not export_benchmark: - image_shape = [None] + image_shape[1:] - - # Save infer cfg - _dump_infer_config(self.cfg, - os.path.join(save_dir, yaml_name), image_shape, - self.model) - - input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image'), - "im_shape": InputSpec( - shape=im_shape, name='im_shape'), - "scale_factor": InputSpec( - shape=scale_factor, name='scale_factor') - }] - if self.cfg.architecture == 'DeepSORT': - input_spec[0].update({ - "crops": InputSpec( - shape=[None, 3, 192, 64], name='crops') - }) - - if self.cfg.architecture == 'CLRNet': - input_spec[0].update({ - "full_img_path": str, - "img_name": str, - }) - if prune_input: - static_model = paddle.jit.to_static( - self.model, input_spec=input_spec) - # NOTE: dy2st do not pruned program, but jit.save will prune program - # input spec, prune input spec here and save with pruned input spec - pruned_input_spec = _prune_input_spec( - input_spec, static_model.forward.main_program, - static_model.forward.outputs) - else: - static_model = None - pruned_input_spec = input_spec - - # TODO: Hard code, delete it when support prune input_spec. 
- if self.cfg.architecture == 'PicoDet' and not export_post_process: - pruned_input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image') - }] - if kl_quant: - if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights: - pruned_input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image'), - "scale_factor": InputSpec( - shape=scale_factor, name='scale_factor') - }] - elif 'tinypose' in self.cfg.weights: - pruned_input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image') - }] - - return static_model, pruned_input_spec - - def export(self, output_dir='output_inference', for_fd=False): - if hasattr(self.model, 'aux_neck'): - self.model.__delattr__('aux_neck') - if hasattr(self.model, 'aux_head'): - self.model.__delattr__('aux_head') - self.model.eval() - - model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] - if for_fd: - save_dir = output_dir - save_name = 'inference' - yaml_name = 'inference.yml' - else: - save_dir = os.path.join(output_dir, model_name) - save_name = 'model' - yaml_name = None - - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec( - save_dir, yaml_name=yaml_name) - - # dy2st and save model - if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']: - paddle.jit.save( - static_model, - os.path.join(save_dir, save_name), - input_spec=pruned_input_spec) - else: - self.cfg.slim.save_quantized_model( - self.model, - os.path.join(save_dir, save_name), - input_spec=pruned_input_spec) - logger.info("Export model and saved in {}".format(save_dir)) - - def post_quant(self, output_dir='output_inference'): - model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] - save_dir = os.path.join(output_dir, model_name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - for idx, data in enumerate(self.loader): - self.model(data) - if idx == int(self.cfg.get('quant_batch_num', 10)): - break - - # TODO: support prune input_spec - kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False - _, pruned_input_spec = self._get_infer_cfg_and_input_spec( - save_dir, prune_input=False, kl_quant=kl_quant) - - self.cfg.slim.save_quantized_model( - self.model, - os.path.join(save_dir, 'model'), - input_spec=pruned_input_spec) - logger.info("Export Post-Quant model and saved in {}".format(save_dir)) - - def _flops(self, loader): - if hasattr(self.model, 'aux_neck'): - self.model.__delattr__('aux_neck') - if hasattr(self.model, 'aux_head'): - self.model.__delattr__('aux_head') - self.model.eval() - try: - import paddleslim - except Exception as e: - logger.warning( - 'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`' - ) - return - - from paddleslim.analysis import dygraph_flops as flops - input_data = None - for data in loader: - input_data = data - break - - input_spec = [{ - "image": input_data['image'][0].unsqueeze(0), - "im_shape": input_data['im_shape'][0].unsqueeze(0), - "scale_factor": input_data['scale_factor'][0].unsqueeze(0) - }] - flops = flops(self.model, input_spec) / (1000**3) - logger.info(" Model FLOPs : {:.6f}G. 
(image shape is {})".format( - flops, input_data['image'][0].unsqueeze(0).shape)) - - def parse_mot_images(self, cfg): - import glob - # for quant - dataset_dir = cfg['EvalMOTDataset'].dataset_dir - data_root = cfg['EvalMOTDataset'].data_root - data_root = '{}/{}'.format(dataset_dir, data_root) - seqs = os.listdir(data_root) - seqs.sort() - all_images = [] - for seq in seqs: - infer_dir = os.path.join(data_root, seq) - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - images = set() - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - images.sort() - assert len(images) > 0, "no image found in {}".format(infer_dir) - all_images.extend(images) - logger.info("Found {} inference images in total.".format( - len(images))) - return all_images - - def predict_culane(self, - images, - output_dir='output', - save_results=False, - visualize=True): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - self.dataset.set_images(images) - loader = create('TestReader')(self.dataset, 0) - - imid2path = self.dataset.get_imid2path() - - def setup_metrics_for_loader(): - # mem - metrics = copy.deepcopy(self._metrics) - mode = self.mode - save_prediction_only = self.cfg[ - 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None - output_eval = self.cfg[ - 'output_eval'] if 'output_eval' in self.cfg else None - - # modify - self.mode = '_test' - self.cfg['save_prediction_only'] = True - self.cfg['output_eval'] = output_dir - self.cfg['imid2path'] = imid2path - self._init_metrics() - - # restore - self.mode = mode - self.cfg.pop('save_prediction_only') - if save_prediction_only is not None: - self.cfg['save_prediction_only'] = save_prediction_only - - self.cfg.pop('output_eval') - if output_eval is not None: - self.cfg['output_eval'] = output_eval - - self.cfg.pop('imid2path') - - _metrics = copy.deepcopy(self._metrics) - self._metrics = metrics - - return _metrics - - if save_results: - metrics = setup_metrics_for_loader() - else: - metrics = [] - - # Run Infer - self.status['mode'] = 'test' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('TestReader')(self.dataset, 0) - self._flops(flops_loader) - results = [] - for step_id, data in enumerate(tqdm(loader)): - self.status['step_id'] = step_id - # forward - outs = self.model(data) - - for _m in metrics: - _m.update(data, outs) - - for key in ['im_shape', 'scale_factor', 'im_id']: - if isinstance(data, typing.Sequence): - outs[key] = data[0][key] - else: - outs[key] = data[key] - for key, value in outs.items(): - if hasattr(value, 'numpy'): - outs[key] = value.numpy() - results.append(outs) - - for _m in metrics: - _m.accumulate() - _m.reset() - - if visualize: - import cv2 - - for outs in results: - for i in range(len(outs['img_path'])): - lanes = outs['lanes'][i] - img_path = outs['img_path'][i] - img = cv2.imread(img_path) - out_file = os.path.join(output_dir, - os.path.basename(img_path)) - lanes = [ - lane.to_array( - sample_y_range=[ - self.cfg['sample_y']['start'], - self.cfg['sample_y']['end'], - self.cfg['sample_y']['step'] - ], - img_w=self.cfg.ori_img_w, - img_h=self.cfg.ori_img_h) for lane in lanes - ] - imshow_lanes(img, lanes, out_file=out_file) - - return results diff --git a/pdfdet/models/Paddle/ppdet/engine/trainer_cot.py b/pdfdet/models/Paddle/ppdet/engine/trainer_cot.py deleted file mode 100644 index 
38d95fa..0000000
--- a/pdfdet/models/Paddle/ppdet/engine/trainer_cot.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ppdet.core.workspace import create
-from ppdet.utils.logger import setup_logger
-logger = setup_logger('ppdet.engine')
-
-from . import Trainer
-__all__ = ['TrainerCot']
-
-class TrainerCot(Trainer):
-    """
-    Trainer for label-cotuning
-    calculate the relationship between base_classes and novel_classes
-    """
-    def __init__(self, cfg, mode='train'):
-        super(TrainerCot, self).__init__(cfg, mode)
-        self.cotuning_init()
-
-    def cotuning_init(self):
-        num_classes_novel = self.cfg['num_classes']
-
-        self.load_weights(self.cfg.pretrain_weights)
-
-        self.model.eval()
-        relationship = self.model.relationship_learning(self.loader, num_classes_novel)
-
-        self.model.init_cot_head(relationship)
-        self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
-
-
diff --git a/pdfdet/models/Paddle/ppdet/engine/trainer_ssod.py b/pdfdet/models/Paddle/ppdet/engine/trainer_ssod.py
deleted file mode 100644
index ab4a100..0000000
--- a/pdfdet/models/Paddle/ppdet/engine/trainer_ssod.py
+++ /dev/null
@@ -1,1192 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
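Note: both SSOD trainers deleted below maintain the teacher model as an exponential moving average of the student (via `ModelEMA` / `SimpleModelEMA`; the default decay of 0.9996 appears in the code that follows). A functional sketch of that update, assuming plain `paddle.nn.Layer` models; `ema_update` is a hypothetical helper, and calling it with `decay=0` copies the student outright, which is how the teacher is bootstrapped when semi-supervised training starts:

    import paddle

    def ema_update(teacher, student, decay=0.9996):
        # teacher <- decay * teacher + (1 - decay) * student,
        # applied parameter-wise to floating-point tensors only.
        t_state = teacher.state_dict()
        s_state = student.state_dict()
        for k in t_state:
            if paddle.is_floating_point(t_state[k]):
                t_state[k] = decay * t_state[k] + (1.0 - decay) * s_state[k]
        teacher.set_state_dict(t_state)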
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import copy
-import time
-import typing
-import numpy as np
-
-import paddle
-import paddle.nn as nn
-import paddle.distributed as dist
-from paddle.distributed import fleet
-from ppdet.optimizer import ModelEMA, SimpleModelEMA
-from ppdet.core.workspace import create
-from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model
-import ppdet.utils.stats as stats
-from ppdet.utils import profiler
-from ppdet.modeling.ssod.utils import align_weak_strong_shape
-from .trainer import Trainer
-from ppdet.utils.logger import setup_logger
-from paddle.static import InputSpec
-from ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec
-MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
-
-logger = setup_logger('ppdet.engine')
-
-__all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL', 'Trainer_Semi_RTDETR']
-
-
-class Trainer_DenseTeacher(Trainer):
-    def __init__(self, cfg, mode='train'):
-        self.cfg = cfg
-        assert mode.lower() in ['train', 'eval', 'test'], \
-            "mode should be 'train', 'eval' or 'test'"
-        self.mode = mode.lower()
-        self.optimizer = None
-        self.is_loaded_weights = False
-        self.use_amp = self.cfg.get('amp', False)
-        self.amp_level = self.cfg.get('amp_level', 'O1')
-        self.custom_white_list = self.cfg.get('custom_white_list', None)
-        self.custom_black_list = self.cfg.get('custom_black_list', None)
-
-        # build data loader
-        capital_mode = self.mode.capitalize()
-        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
-            '{}Dataset'.format(capital_mode))()
-
-        if self.mode == 'train':
-            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
-                'UnsupTrainDataset')
-            self.loader = create('SemiTrainReader')(
-                self.dataset, self.dataset_unlabel, cfg.worker_num)
-
-        # build model
-        if 'model' not in self.cfg:
-            self.model = create(cfg.architecture)
-        else:
-            self.model = self.cfg.model
-            self.is_loaded_weights = True
-
-        # EvalDataset build with BatchSampler to evaluate in single device
-        # TODO: multi-device evaluate
-        if self.mode == 'eval':
-            self._eval_batch_sampler = paddle.io.BatchSampler(
-                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
-            # If metric is VOC, need to be set collate_batch=False.
-            if cfg.metric == 'VOC':
-                cfg['EvalReader']['collate_batch'] = False
-            self.loader = create('EvalReader')(self.dataset, cfg.worker_num,
-                                               self._eval_batch_sampler)
-        # TestDataset build after user set images, skip loader creation here
-
-        # build optimizer in train mode
-        if self.mode == 'train':
-            steps_per_epoch = len(self.loader)
-            if steps_per_epoch < 1:
-                logger.warning(
-                    "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
-                )
-            self.lr = create('LearningRate')(steps_per_epoch)
-            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
-
-        # Unstructured pruner is only enabled in the train mode.
- if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) - if self.use_amp and self.amp_level == 'O2': - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level) - - self.use_ema = ('use_ema' in cfg and cfg['use_ema']) - if self.use_ema: - ema_decay = self.cfg.get('ema_decay', 0.9998) - ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') - cycle_epoch = self.cfg.get('cycle_epoch', -1) - ema_black_list = self.cfg.get('ema_black_list', None) - self.ema = ModelEMA( - self.model, - decay=ema_decay, - ema_decay_type=ema_decay_type, - cycle_epoch=cycle_epoch, - ema_black_list=ema_black_list) - self.ema_start_iters = self.cfg.get('ema_start_iters', 0) - - # simple_ema for SSOD - self.use_simple_ema = ('use_simple_ema' in cfg and - cfg['use_simple_ema']) - if self.use_simple_ema: - self.use_ema = True - ema_decay = self.cfg.get('ema_decay', 0.9996) - self.ema = SimpleModelEMA(self.model, decay=ema_decay) - self.ema_start_iters = self.cfg.get('ema_start_iters', 0) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - self.start_epoch = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def load_weights(self, weights): - if self.is_loaded_weights: - return - self.start_epoch = 0 - load_pretrain_weight(self.model, weights) - load_pretrain_weight(self.ema.model, weights) - logger.info("Load weights {} to start training for teacher and student". - format(weights)) - - def resume_weights(self, weights, exchange=True): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer, exchange) - else: - self.start_epoch = load_weight(self.model, weights, self.optimizer, - self.ema - if self.use_ema else None, exchange) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - - def train(self, validate=False): - self.semi_start_iters = self.cfg.get('semi_start_iters', 5000) - Init_mark = False - if validate: - self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( - "EvalDataset")() - - sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and - self.cfg.use_gpu and self._nranks > 1) - if sync_bn: - self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) - - if self.cfg.get('fleet', False): - self.model = fleet.distributed_model(self.model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - self.ema.model = paddle.DataParallel( - self.ema.model, find_unused_parameters=find_unused_parameters) - - self.status.update({ - 'epoch_id': self.start_epoch, - 'step_id': 0, - 'steps_per_epoch': len(self.loader), - 'exchange_save_model': True, - }) - # Note: exchange_save_model - # in DenseTeacher SSOD, the teacher model will be higher, so exchange when saving pdparams - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['training_staus'] = 
stats.TrainingStats(self.cfg.log_iter) - profiler_options = self.cfg.get('profiler_options', None) - self._compose_callback.on_train_begin(self.status) - - train_cfg = self.cfg.DenseTeacher['train_cfg'] - concat_sup_data = train_cfg.get('concat_sup_data', True) - - for param in self.ema.model.parameters(): - param.stop_gradient = True - - for epoch_id in range(self.start_epoch, self.cfg.epoch): - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset_label.set_epoch(epoch_id) - self.loader.dataset_unlabel.set_epoch(epoch_id) - iter_tic = time.time() - loss_dict = { - 'loss': paddle.to_tensor([0]), - 'loss_sup_sum': paddle.to_tensor([0]), - 'loss_unsup_sum': paddle.to_tensor([0]), - 'fg_sum': paddle.to_tensor([0]), - } - if self._nranks > 1: - for k in self.model._layers.get_loss_keys(): - loss_dict.update({k: paddle.to_tensor([0.])}) - for k in self.model._layers.get_loss_keys(): - loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) - else: - for k in self.model.get_loss_keys(): - loss_dict.update({k: paddle.to_tensor([0.])}) - for k in self.model.get_loss_keys(): - loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) - - # Note: for step_id, data in enumerate(self.loader): # enumerate bug - for step_id in range(len(self.loader)): - data = next(self.loader) - - self.model.train() - self.ema.model.eval() - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data - - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - - if data_sup_w['image'].shape != data_sup_s['image'].shape: - data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, - data_sup_s) - - data_sup_w['epoch_id'] = epoch_id - data_sup_s['epoch_id'] = epoch_id - if concat_sup_data: - for k, v in data_sup_s.items(): - if k in ['epoch_id']: - continue - data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) - loss_dict_sup = self.model(data_sup_s) - else: - loss_dict_sup_w = self.model(data_sup_w) - loss_dict_sup = self.model(data_sup_s) - for k, v in loss_dict_sup_w.items(): - loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5 - - losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight'] - losses_sup.backward() - - losses = losses_sup.detach() - loss_dict.update(loss_dict_sup) - loss_dict.update({'loss_sup_sum': loss_dict['loss']}) - - curr_iter = len(self.loader) * epoch_id + step_id - st_iter = self.semi_start_iters - if curr_iter == st_iter: - logger.info("***" * 30) - logger.info('Semi starting ...') - logger.info("***" * 30) - if curr_iter > st_iter: - unsup_weight = train_cfg['unsup_weight'] - if train_cfg['suppress'] == 'linear': - tar_iter = st_iter * 2 - if curr_iter <= tar_iter: - unsup_weight *= (curr_iter - st_iter) / st_iter - elif train_cfg['suppress'] == 'exp': - tar_iter = st_iter + 2000 - if curr_iter <= tar_iter: - scale = np.exp((curr_iter - tar_iter) / 1000) - unsup_weight *= scale - elif train_cfg['suppress'] == 'step': - tar_iter = st_iter * 2 - if curr_iter <= tar_iter: - unsup_weight *= 0.25 - else: - raise ValueError - - if data_unsup_w['image'].shape != data_unsup_s[ - 'image'].shape: - data_unsup_w, data_unsup_s = align_weak_strong_shape( - data_unsup_w, data_unsup_s) - - data_unsup_w['epoch_id'] = epoch_id - data_unsup_s['epoch_id'] = epoch_id - - data_unsup_s['get_data'] = True - student_preds = self.model(data_unsup_s) - - with paddle.no_grad(): - 
data_unsup_w['is_teacher'] = True - teacher_preds = self.ema.model(data_unsup_w) - - train_cfg['curr_iter'] = curr_iter - train_cfg['st_iter'] = st_iter - if self._nranks > 1: - loss_dict_unsup = self.model._layers.get_ssod_loss( - student_preds, teacher_preds, train_cfg) - else: - loss_dict_unsup = self.model.get_ssod_loss( - student_preds, teacher_preds, train_cfg) - - fg_num = loss_dict_unsup["fg_sum"] - del loss_dict_unsup["fg_sum"] - distill_weights = train_cfg['loss_weight'] - loss_dict_unsup = { - k: v * distill_weights[k] - for k, v in loss_dict_unsup.items() - } - - losses_unsup = sum([ - metrics_value - for metrics_value in loss_dict_unsup.values() - ]) * unsup_weight - losses_unsup.backward() - - loss_dict.update(loss_dict_unsup) - loss_dict.update({'loss_unsup_sum': losses_unsup}) - losses += losses_unsup.detach() - loss_dict.update({"fg_sum": fg_num}) - loss_dict['loss'] = losses - - self.optimizer.step() - curr_lr = self.optimizer.get_lr() - self.lr.step() - self.optimizer.clear_grad() - self.status['learning_rate'] = curr_lr - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(loss_dict) - - self.status['batch_time'].update(time.time() - iter_tic) - self._compose_callback.on_step_end(self.status) - # Note: ema_start_iters - if self.use_ema and curr_iter == self.ema_start_iters: - logger.info("***" * 30) - logger.info('EMA starting ...') - logger.info("***" * 30) - self.ema.update(self.model, decay=0) - elif self.use_ema and curr_iter > self.ema_start_iters: - self.ema.update(self.model) - iter_tic = time.time() - - is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) - if is_snapshot and self.use_ema: - # apply ema weight on model - weight = copy.deepcopy(self.ema.model.state_dict()) - for k, v in weight.items(): - if paddle.is_floating_point(v): - weight[k].stop_gradient = True - self.status['weight'] = weight - - self._compose_callback.on_epoch_end(self.status) - - if validate and is_snapshot: - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. 
- if self.cfg.metric == 'VOC': - self.cfg['EvalReader']['collate_batch'] = False - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - # if validation in training is enabled, metrics should be re-init - # Init_mark makes sure this code will only execute once - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - - with paddle.no_grad(): - self.status['save_best_model'] = True - self._eval_with_loader(self._eval_loader) - - if is_snapshot and self.use_ema: - self.status.pop('weight') - - self._compose_callback.on_train_end(self.status) - - def evaluate(self): - # get distributed model - if self.cfg.get('fleet', False): - self.model = fleet.distributed_model(self.model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - with paddle.no_grad(): - self._eval_with_loader(self.loader) - - def _eval_with_loader(self, loader): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - - test_cfg = self.cfg.DenseTeacher['test_cfg'] - if test_cfg['inference_on'] == 'teacher': - logger.info("***** teacher model evaluating *****") - eval_model = self.ema.model - else: - logger.info("***** student model evaluating *****") - eval_model = self.model - - eval_model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - if self.use_amp: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - outs = eval_model(data) - else: - outs = eval_model(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - self._reset_metrics() - - -class Trainer_ARSL(Trainer): - def __init__(self, cfg, mode='train'): - self.cfg = cfg - assert mode.lower() in ['train', 'eval', 'test'], \ - "mode should be 'train', 'eval' or 'test'" - self.mode = mode.lower() - self.optimizer = None - self.is_loaded_weights = False - capital_mode = self.mode.capitalize() - self.use_ema = False - self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( - '{}Dataset'.format(capital_mode))() - if self.mode == 'train': - self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( - 'UnsupTrainDataset') - self.loader = create('SemiTrainReader')( - self.dataset, self.dataset_unlabel, cfg.worker_num) - - # build model - if 'model' not in self.cfg: - 
self.student_model = create(cfg.architecture) - self.teacher_model = create(cfg.architecture) - self.model = EnsembleTSModel(self.teacher_model, self.student_model) - else: - self.model = self.cfg.model - self.is_loaded_weights = True - # save path for burn-in model - self.base_path = cfg.get('weights') - self.base_path = os.path.dirname(self.base_path) - - # EvalDataset build with BatchSampler to evaluate in single device - # TODO: multi-device evaluate - if self.mode == 'eval': - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - self.loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, cfg.worker_num, self._eval_batch_sampler) - # TestDataset build after user set images, skip loader creation here - - self.start_epoch = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - self.epoch_iter = self.cfg.epoch_iter # set fixed iter in each epoch to control checkpoint - - # build optimizer in train mode - if self.mode == 'train': - steps_per_epoch = self.epoch_iter - self.lr = create('LearningRate')(steps_per_epoch) - self.optimizer = create('OptimizerBuilder')(self.lr, - self.model.modelStudent) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - self.iter = 0 - - def resume_weights(self, weights): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer) - else: - self.start_epoch = load_weight(self.model, weights, self.optimizer) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - - def train(self, validate=False): - assert self.mode == 'train', "Model not in 'train' mode" - Init_mark = False - - # if validation in training is enabled, metrics should be re-init - if validate: - self._init_metrics(validate=validate) - self._reset_metrics() - - if self.cfg.get('fleet', False): - self.model.modelStudent = fleet.distributed_model( - self.model.modelStudent) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model.modelStudent = paddle.DataParallel( - self.model.modelStudent, - find_unused_parameters=find_unused_parameters) - - # set fixed iter in each epoch to control checkpoint - self.status.update({ - 'epoch_id': self.start_epoch, - 'step_id': 0, - 'steps_per_epoch': self.epoch_iter - }) - print('338 Len of DataLoader: {}'.format(len(self.loader))) - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) - - self._compose_callback.on_train_begin(self.status) - - epoch_id = self.start_epoch - self.iter = self.start_epoch * self.epoch_iter - # use iter rather than epoch to control training schedule - while self.iter < self.cfg.max_iter: - # epoch loop - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset_label.set_epoch(epoch_id) - self.loader.dataset_unlabel.set_epoch(epoch_id) - paddle.device.cuda.empty_cache() # clear GPU memory - # set model status - 
self.model.modelStudent.train() - self.model.modelTeacher.eval() - iter_tic = time.time() - - # iter loop in each epoch - for step_id in range(self.epoch_iter): - data = next(self.loader) - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - # profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - - # model forward and calculate loss - loss_dict = self.run_step_full_semisup(data) - - if (step_id + 1) % self.cfg.optimize_rate == 0: - self.optimizer.step() - self.optimizer.clear_grad() - curr_lr = self.optimizer.get_lr() - self.lr.step() - - # update log status - self.status['learning_rate'] = curr_lr - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(loss_dict) - self.status['batch_time'].update(time.time() - iter_tic) - self._compose_callback.on_step_end(self.status) - self.iter += 1 - iter_tic = time.time() - - self._compose_callback.on_epoch_end(self.status) - - if validate and (self._nranks < 2 or self._local_rank == 0) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \ - or epoch_id == self.end_epoch - 1): - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - with paddle.no_grad(): - self.status['save_best_model'] = True - # before the burn-in stage, eval the student; after it, eval the teacher - if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']: - print("start eval student model") - self._eval_with_loader( - self._eval_loader, mode="student") - else: - print("start eval teacher model") - self._eval_with_loader( - self._eval_loader, mode="teacher") - - epoch_id += 1 - - self._compose_callback.on_train_end(self.status) - - def merge_data(self, data1, data2): - data = copy.deepcopy(data1) - for k, v in data1.items(): - if type(v) is paddle.Tensor: - data[k] = paddle.concat(x=[data[k], data2[k]], axis=0) - elif type(v) is list: - data[k].extend(data2[k]) - return data - - def run_step_full_semisup(self, data): - label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data - data_merge = self.merge_data(label_data_k, label_data_q) - loss_sup_dict = self.model.modelStudent(data_merge, branch="supervised") - loss_dict = {} - for key in loss_sup_dict.keys(): - if key[:4] == "loss": - loss_dict[key] = loss_sup_dict[key] * 1 - losses_sup = paddle.add_n(list(loss_dict.values())) - # norm loss when using gradient accumulation - losses_sup = losses_sup / self.cfg.optimize_rate - losses_sup.backward() - - for key in loss_sup_dict.keys(): - loss_dict[key + "_pseudo"] = paddle.to_tensor([0]) - loss_dict["loss_tot"] = losses_sup - """ - semi-supervised training after burn-in stage - """ - if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']: - # init teacher model with burn-up weight - if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']: - print( - 'Starting semi-supervised learning and loading the teacher model.'
- ) - self._update_teacher_model(keep_rate=0.00) - # save burn-in model - if dist.get_world_size() < 2 or dist.get_rank() == 0: - print('saving burn-in model.') - save_name = 'burnIn' - epoch_id = self.iter // self.epoch_iter - save_model(self.model, self.optimizer, self.base_path, - save_name, epoch_id) - # Update teacher model with EMA - elif (self.iter + 1) % self.cfg.optimize_rate == 0: - self._update_teacher_model( - keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE']) - - #warm-up weight for pseudo loss - pseudo_weight = self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT'] - pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS'] - temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP'] - if temp <= pseudo_warmup_iter: - pseudo_weight *= (temp / pseudo_warmup_iter) - - # get teacher predictions on weak-augmented unlabeled data - with paddle.no_grad(): - teacher_pred = self.model.modelTeacher( - unlabel_data_k, branch='semi_supervised') - - # calculate unsupervised loss on strong-augmented unlabeled data - loss_unsup_dict = self.model.modelStudent( - unlabel_data_q, - branch="semi_supervised", - teacher_prediction=teacher_pred, ) - - for key in loss_unsup_dict.keys(): - if key[-6:] == "pseudo": - loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight - losses_unsup = paddle.add_n(list(loss_unsup_dict.values())) - # norm loss when using gradient accumulation - losses_unsup = losses_unsup / self.cfg.optimize_rate - losses_unsup.backward() - - loss_dict.update(loss_unsup_dict) - loss_dict["loss_tot"] += losses_unsup - return loss_dict - - def export(self, output_dir='output_inference'): - self.model.eval() - model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] - save_dir = os.path.join(output_dir, model_name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - image_shape = None - if self.cfg.architecture in MOT_ARCH: - test_reader_name = 'TestMOTReader' - else: - test_reader_name = 'TestReader' - if 'inputs_def' in self.cfg[test_reader_name]: - inputs_def = self.cfg[test_reader_name]['inputs_def'] - image_shape = inputs_def.get('image_shape', None) - # set image_shape=[3, -1, -1] as default - if image_shape is None: - image_shape = [3, -1, -1] - - self.model.modelTeacher.eval() - if hasattr(self.model.modelTeacher, 'deploy'): - self.model.modelTeacher.deploy = True - - # Save infer cfg - _dump_infer_config(self.cfg, - os.path.join(save_dir, 'infer_cfg.yml'), image_shape, - self.model.modelTeacher) - - input_spec = [{ - "image": InputSpec( - shape=[None] + image_shape, name='image'), - "im_shape": InputSpec( - shape=[None, 2], name='im_shape'), - "scale_factor": InputSpec( - shape=[None, 2], name='scale_factor') - }] - if self.cfg.architecture == 'DeepSORT': - input_spec[0].update({ - "crops": InputSpec( - shape=[None, 3, 192, 64], name='crops') - }) - - static_model = paddle.jit.to_static( - self.model.modelTeacher, input_spec=input_spec) - # NOTE: dy2st do not pruned program, but jit.save will prune program - # input spec, prune input spec here and save with pruned input spec - pruned_input_spec = _prune_input_spec(input_spec, - static_model.forward.main_program, - static_model.forward.outputs) - - # dy2st and save model - if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT': - paddle.jit.save( - static_model, - os.path.join(save_dir, 'model'), - input_spec=pruned_input_spec) - else: - self.cfg.slim.save_quantized_model( - self.model.modelTeacher, - os.path.join(save_dir, 'model'), - input_spec=pruned_input_spec) - logger.info("Export model and saved in 
{}".format(save_dir)) - - def _eval_with_loader(self, loader, mode="teacher"): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - # self.model.eval() - self.model.modelTeacher.eval() - self.model.modelStudent.eval() - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - if mode == "teacher": - outs = self.model.modelTeacher(data) - else: - outs = self.model.modelStudent(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def evaluate(self): - with paddle.no_grad(): - self._eval_with_loader(self.loader) - - @paddle.no_grad() - def _update_teacher_model(self, keep_rate=0.996): - student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict()) - new_teacher_dict = dict() - for key, value in self.model.modelTeacher.state_dict().items(): - if key in student_model_dict.keys(): - v = student_model_dict[key] * (1 - keep_rate - ) + value * keep_rate - v.stop_gradient = True - new_teacher_dict[key] = v - else: - raise Exception("{} is not found in student model".format(key)) - - self.model.modelTeacher.set_dict(new_teacher_dict) - - -class EnsembleTSModel(nn.Layer): - def __init__(self, modelTeacher, modelStudent): - super(EnsembleTSModel, self).__init__() - self.modelTeacher = modelTeacher - self.modelStudent = modelStudent - - -class Trainer_Semi_RTDETR(Trainer): - def __init__(self, cfg, mode='train'): - self.cfg = cfg - assert mode.lower() in ['train', 'eval', 'test'], \ - "mode should be 'train', 'eval' or 'test'" - self.mode = mode.lower() - self.optimizer = None - self.is_loaded_weights = False - self.use_amp = self.cfg.get('amp', False) - self.amp_level = self.cfg.get('amp_level', 'O1') - self.custom_white_list = self.cfg.get('custom_white_list', None) - self.custom_black_list = self.cfg.get('custom_black_list', None) - - # build data loader - capital_mode = self.mode.capitalize() - self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( - '{}Dataset'.format(capital_mode))() - - if self.mode == 'train': - self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( - 'UnsupTrainDataset') - self.loader = create('SemiTrainReader')( - self.dataset, self.dataset_unlabel, cfg.worker_num) - - # build model - if 'model' not in self.cfg: - self.model = create(cfg.SSOD) - else: - self.model = self.cfg.model - self.is_loaded_weights = True - - # EvalDataset build with BatchSampler to evaluate in single device - # TODO: multi-device evaluate - if self.mode == 'eval': - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. 
- if cfg.metric == 'VOC': - cfg['EvalReader']['collate_batch'] = False - self.loader = create('EvalReader')(self.dataset, cfg.worker_num, - self._eval_batch_sampler) - # TestDataset build after user set images, skip loader creation here - - # build optimizer in train mode - if self.mode == 'train': - steps_per_epoch = len(self.loader) - if steps_per_epoch < 1: - logger.warning( - "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." - ) - self.lr = create('LearningRate')(steps_per_epoch) - self.optimizer = create('OptimizerBuilder')(self.lr, self.model) - - # Unstructured pruner is only enabled in the train mode. - if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) - if self.use_amp and self.amp_level == 'O2': - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - self.start_epoch = 0 - self.start_iter = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def load_semi_weights(self, t_weights, s_weights): - if self.is_loaded_weights: - return - self.start_epoch = 0 - load_pretrain_weight(self.model.teacher, t_weights) - load_pretrain_weight(self.model.student, s_weights) - logger.info("Load teacher weights {} to start training".format( - t_weights)) - logger.info("Load student weights {} to start training".format( - s_weights)) - - def resume_weights(self, weights, exchange=True): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer, exchange) - else: - self.start_iter, self.start_epoch = load_weight( - self.model, weights, self.optimizer, self.ema - if self.use_ema else None, exchange) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - logger.debug("Resume weights of iter {}".format(self.start_iter)) - - def train(self, validate=False): - assert self.mode == 'train', "Model not in 'train' mode" - Init_mark = False - if validate: - self.cfg.EvalDataset = create("EvalDataset")() - - model = self.model - sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and - self.cfg.use_gpu and self._nranks > 1) - if sync_bn: - # self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - # self.model) - model.teacher = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - model.teacher) - model.student = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - self.model.student) - - if self.cfg.get('fleet', False): - # model = fleet.distributed_model(model) - model = fleet.distributed_model(model) - - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - model = paddle.DataParallel( - model, find_unused_parameters=find_unused_parameters) - - if self.cfg.get('amp', False): - scaler = amp.GradScaler( - enable=self.cfg.use_gpu or self.cfg.use_npu, - init_loss_scaling=1024) - - self.status.update({ - 'epoch_id': self.start_epoch, - 'iter_id': self.start_iter, - # 'step_id': self.start_step, - 'steps_per_epoch': len(self.loader), - }) - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, 
fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) - - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num) - self._flops(flops_loader) - profiler_options = self.cfg.get('profiler_options', None) - - self._compose_callback.on_train_begin(self.status) - iter_id = self.start_iter - self.status['iter_id'] = iter_id - self.status['eval_interval'] = self.cfg.eval_interval - self.status['save_interval'] = self.cfg.save_interval - for epoch_id in range(self.start_epoch, self.cfg.epoch): - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset_label.set_epoch(epoch_id) - self.loader.dataset_unlabel.set_epoch(epoch_id) - iter_tic = time.time() - if self._nranks > 1: - # print(model) - model._layers.teacher.eval() - model._layers.student.train() - else: - model.teacher.eval() - model.student.train() - iter_tic = time.time() - for step_id in range(len(self.loader)): - data = next(self.loader) - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data - data_sup_w['epoch_id'] = epoch_id - data_sup_s['epoch_id'] = epoch_id - data_unsup_w['epoch_id'] = epoch_id - data_unsup_s['epoch_id'] = epoch_id - data = [data_sup_w, data_sup_s, data_unsup_w, data_unsup_s] - iter_id += 1 - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - self.status['iter_id'] = iter_id - data.append(iter_id) - profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - if self.cfg.get('amp', False): - with amp.auto_cast(enable=self.cfg.use_gpu): - # model forward - if self._nranks > 1: - outputs = model._layers(data) - else: - outputs = model(data) - loss = outputs['loss'] - - scaled_loss = scaler.scale(loss) - scaled_loss.backward() - scaler.minimize(self.optimizer, scaled_loss) - else: - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() - self.optimizer.step() - curr_lr = self.optimizer.get_lr() - self.lr.step() - if self.cfg.get('unstructured_prune'): - self.pruner.step() - self.optimizer.clear_grad() - # print(outputs) - # outputs=reduce_dict(outputs) - # if self.model.debug: - # check_gradient(model) - # self.check_gradient() - self.status['learning_rate'] = curr_lr - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(outputs) - - self.status['batch_time'].update(time.time() - iter_tic) - - if validate and (self._nranks < 2 or self._local_rank == 0) and \ - ((iter_id + 1) % self.cfg.eval_interval == 0): - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. 
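The AMP branch above follows Paddle's scaled-loss recipe: run the forward under `amp.auto_cast`, scale the loss before `backward()`, then let the scaler unscale the gradients and drive the optimizer. A minimal sketch of that sequence with a toy model (illustrative names; the `GradScaler` settings mirror the ones used above):

```python
import paddle
from paddle import amp

use_gpu = paddle.device.is_compiled_with_cuda()
model = paddle.nn.Linear(16, 4)
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

x = paddle.randn([8, 16])
with amp.auto_cast(enable=use_gpu):
    loss = model(x).mean()       # forward runs in fp16 where it is safe
scaled = scaler.scale(loss)      # scale up to avoid fp16 gradient underflow
scaled.backward()
scaler.minimize(opt, scaled)     # unscale, step, and update the loss scaling
opt.clear_grad()
```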
- if self.cfg.metric == 'VOC': - self.cfg['EvalReader']['collate_batch'] = False - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - # if validation in training is enabled, metrics should be re-init - # Init_mark makes sure this code will only execute once - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - - with paddle.no_grad(): - self.status['save_best_model'] = True - self._eval_with_loader(self._eval_loader) - model._layers.student.train() - - self._compose_callback.on_step_end(self.status) - - iter_tic = time.time() - - if self.cfg.get('unstructured_prune'): - self.pruner.update_params() - self._compose_callback.on_epoch_end(self.status) - - self._compose_callback.on_train_end(self.status) - - def _eval_with_loader(self, loader): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - print("*****teacher evaluate*****") - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - outs = self.model.teacher(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states, since a metric may be evaluated multiple times - self._reset_metrics() - - print("*****student evaluate*****") - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - outs = self.model.student(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - # reset metric states, since a metric may be evaluated multiple times - self._reset_metrics() - self.status['mode'] = 'train' - - def evaluate(self): - with paddle.no_grad(): - self._eval_with_loader(self.loader) diff --git a/pdfdet/models/Paddle/ppdet/ext_op/README.md b/pdfdet/models/Paddle/ppdet/ext_op/README.md deleted file mode 100644 index 0d67062..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Compiling the custom OPs -The rotated-box IoU OPs follow the [custom external operator guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html). - -## 1. Requirements -- Paddle >= 2.0.1 -- gcc 8.2 - -## 2. Installation -``` -python setup.py install -``` - -Once compiled, the OPs can be used directly; the following is a usage example of `rbox_iou`: -``` -# import the custom op -import numpy as np -import paddle -from ext_op import rbox_iou - -paddle.set_device('gpu:0') -paddle.disable_static() - -rbox1 = np.random.rand(13000, 5) -rbox2 = np.random.rand(7, 5) - -pd_rbox1 = paddle.to_tensor(rbox1) -pd_rbox2 = paddle.to_tensor(rbox2) - -iou = rbox_iou(pd_rbox1, pd_rbox2) -print('iou', iou) -``` - -## 3. Unit tests -You can confirm that the custom operators work correctly by running the unit tests, for example: -``` -python unittest/test_matched_rbox_iou.py -``` diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc b/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc deleted file mode 100644 index b16e8c1..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -template <typename T> -void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr, - const T *rbox2_data_ptr, T *output_data_ptr) { - - int i; - for (i = 0; i < rbox_num; i++) { - output_data_ptr[i] = - rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5); - } -} - -#define CHECK_INPUT_CPU(x) \ - PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") - -std::vector<paddle::Tensor> -MatchedRboxIouCPUForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_CPU(rbox1); - CHECK_INPUT_CPU(rbox2); - PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must have the same dim"); - - auto rbox_num = rbox1.shape()[0]; - auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace()); - - PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] { - matched_rbox_iou_cpu_kernel<data_t>( - rbox_num, rbox1.data<data_t>(), - rbox2.data<data_t>(), output.data<data_t>()); - })); - - return {output}; -} - -#ifdef PADDLE_WITH_CUDA -std::vector<paddle::Tensor> -MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2); -#endif - -#define CHECK_INPUT_SAME(x1, x2) \ - PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.") - -std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_SAME(rbox1, rbox2); - if (rbox1.is_cpu()) { - return MatchedRboxIouCPUForward(rbox1, rbox2); -#ifdef PADDLE_WITH_CUDA - } else if (rbox1.is_gpu()) { - return MatchedRboxIouCUDAForward(rbox1, rbox2); -#endif - } -} - -std::vector<std::vector<int64_t>> -MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape, - std::vector<int64_t> rbox2_shape) { - return {{rbox1_shape[0]}}; -} - -std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1, - paddle::DataType t2) { - return {t1}; -} - -PD_BUILD_OP(matched_rbox_iou) - .Inputs({"RBOX1", "RBOX2"}) - .Outputs({"Output"}) - .SetKernelFn(PD_KERNEL(MatchedRboxIouForward)) -
.SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype)); diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu b/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu deleted file mode 100644 index 53454d1..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -template -__global__ void -matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr, - const T *rbox2_data_ptr, T *output_data_ptr) { - for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num; - tid += blockDim.x * gridDim.x) { - output_data_ptr[tid] = - rbox_iou_single(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5); - } -} - -#define CHECK_INPUT_GPU(x) \ - PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") - -std::vector -MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_GPU(rbox1); - CHECK_INPUT_GPU(rbox2); - PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim"); - - auto rbox_num = rbox1.shape()[0]; - - auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace()); - - const int thread_per_block = 512; - const int block_per_grid = CeilDiv(rbox_num, thread_per_block); - - PD_DISPATCH_FLOATING_TYPES( - rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] { - matched_rbox_iou_cuda_kernel< - data_t><<>>( - rbox_num, rbox1.data(), rbox2.data(), - output.data()); - })); - - return {output}; -} diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc b/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc deleted file mode 100644 index 44f4eb6..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
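A note on the two IoU operators in this extension: `rbox_iou` (defined later in this diff) produces the full pairwise matrix, while `matched_rbox_iou` above pairs row i of one tensor with row i of the other, which is why it checks that both inputs have the same first dimension. A hypothetical usage sketch, assuming `ext_op` has been built via `python setup.py install` as in the README (sizes are arbitrary):

```python
import numpy as np
import paddle
from ext_op import matched_rbox_iou, rbox_iou

a = paddle.to_tensor(np.random.rand(100, 5))  # rows are [x_ctr, y_ctr, w, h, angle]
b = paddle.to_tensor(np.random.rand(100, 5))

pairwise = rbox_iou(a, b)         # shape [100, 100]: IoU of every (i, j) pair
matched = matched_rbox_iou(a, b)  # shape [100]: IoU of row i with row i only
```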
- -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -template -void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold, - const int64_t num_boxes, int64_t *num_keep_boxes, - int64_t *output_data) { - - int num_masks = CeilDiv(num_boxes, 64); - std::vector masks(num_masks, 0); - for (int64_t i = 0; i < num_boxes; ++i) { - if (masks[i / 64] & 1ULL << (i % 64)) - continue; - T box_1[5]; - for (int k = 0; k < 5; ++k) { - box_1[k] = boxes_data[i * 5 + k]; - } - for (int64_t j = i + 1; j < num_boxes; ++j) { - if (masks[j / 64] & 1ULL << (j % 64)) - continue; - T box_2[5]; - for (int k = 0; k < 5; ++k) { - box_2[k] = boxes_data[j * 5 + k]; - } - if (rbox_iou_single(box_1, box_2) > threshold) { - masks[j / 64] |= 1ULL << (j % 64); - } - } - } - int64_t output_data_idx = 0; - for (int64_t i = 0; i < num_boxes; ++i) { - if (masks[i / 64] & 1ULL << (i % 64)) - continue; - output_data[output_data_idx++] = i; - } - *num_keep_boxes = output_data_idx; - for (; output_data_idx < num_boxes; ++output_data_idx) { - output_data[output_data_idx] = 0; - } -} - -#define CHECK_INPUT_CPU(x) \ - PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") - -std::vector NMSRotatedCPUForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold) { - CHECK_INPUT_CPU(boxes); - CHECK_INPUT_CPU(scores); - - auto num_boxes = boxes.shape()[0]; - - auto order_t = - std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true)); - auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0); - - auto keep = - paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace()); - int64_t num_keep_boxes = 0; - - PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] { - nms_rotated_cpu_kernel( - boxes_sorted.data(), threshold, - num_boxes, &num_keep_boxes, - keep.data()); - })); - - keep = keep.slice(0, num_keep_boxes); - return {paddle::gather(order_t, keep, /* axis=*/0)}; -} - -#ifdef PADDLE_WITH_CUDA -std::vector NMSRotatedCUDAForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold); -#endif - -std::vector NMSRotatedForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold) { - if (boxes.is_cpu()) { - return NMSRotatedCPUForward(boxes, scores, threshold); -#ifdef PADDLE_WITH_CUDA - } else if (boxes.is_gpu()) { - return NMSRotatedCUDAForward(boxes, scores, threshold); -#endif - } -} - -std::vector> -NMSRotatedInferShape(std::vector boxes_shape, - std::vector scores_shape) { - return {{-1}}; -} - -std::vector NMSRotatedInferDtype(paddle::DataType t1, - paddle::DataType t2) { - return {paddle::DataType::INT64}; -} - -PD_BUILD_OP(nms_rotated) - .Inputs({"Boxes", "Scores"}) - .Outputs({"Output"}) - .Attrs({"threshold: float"}) - .SetKernelFn(PD_KERNEL(NMSRotatedForward)) - .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype)); \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu b/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu deleted file mode 100644 index d20dddb..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -static const int64_t threadsPerBlock = sizeof(int64_t) * 8; - -template -__global__ void -nms_rotated_cuda_kernel(const T *boxes_data, const float threshold, - const int64_t num_boxes, int64_t *masks) { - auto raw_start = blockIdx.y; - auto col_start = blockIdx.x; - if (raw_start > col_start) - return; - const int raw_last_storage = - min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock); - const int col_last_storage = - min(num_boxes - col_start * threadsPerBlock, threadsPerBlock); - if (threadIdx.x < raw_last_storage) { - int64_t mask = 0; - auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x; - const T *current_box = boxes_data + current_box_idx * 5; - for (int i = 0; i < col_last_storage; ++i) { - const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5; - if (rbox_iou_single(current_box, target_box) > threshold) { - mask |= 1ULL << i; - } - } - const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); - masks[current_box_idx * blocks_per_line + col_start] = mask; - } -} - -#define CHECK_INPUT_GPU(x) \ - PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") - -std::vector NMSRotatedCUDAForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold) { - CHECK_INPUT_GPU(boxes); - CHECK_INPUT_GPU(scores); - - auto num_boxes = boxes.shape()[0]; - auto order_t = - std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true)); - auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0); - - const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); - dim3 block(threadsPerBlock); - dim3 grid(blocks_per_line, blocks_per_line); - auto mask_dev = paddle::empty({num_boxes * blocks_per_line}, - paddle::DataType::INT64, paddle::GPUPlace()); - - PD_DISPATCH_FLOATING_TYPES( - boxes.type(), "nms_rotated_cuda_kernel", ([&] { - nms_rotated_cuda_kernel<<>>( - boxes_sorted.data(), threshold, num_boxes, - mask_dev.data()); - })); - - auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true); - auto keep_host = - paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace()); - int64_t *keep_host_ptr = keep_host.data(); - int64_t *mask_host_ptr = mask_host.data(); - std::vector remv(blocks_per_line); - int64_t last_box_num = 0; - for (int64_t i = 0; i < num_boxes; ++i) { - auto remv_element_id = i / threadsPerBlock; - auto remv_bit_id = i % threadsPerBlock; - if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) { - keep_host_ptr[last_box_num++] = i; - int64_t *current_mask = mask_host_ptr + i * blocks_per_line; - for (auto j = remv_element_id; j < blocks_per_line; ++j) { - remv[j] |= current_mask[j]; - } - } - } - - keep_host = keep_host.slice(0, last_box_num); - auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true); - return {paddle::gather(order_t, keep_dev, /* axis=*/0)}; -} \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc b/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc deleted file mode 100644 index c8e7528..0000000 --- 
a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "paddle/extension.h" -#include "rbox_iou_utils.h" - -template -void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num, - const T *rbox1_data_ptr, const T *rbox2_data_ptr, - T *output_data_ptr) { - - int i, j; - for (i = 0; i < rbox1_num; i++) { - for (j = 0; j < rbox2_num; j++) { - int offset = i * rbox2_num + j; - output_data_ptr[offset] = - rbox_iou_single(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5); - } - } -} - -#define CHECK_INPUT_CPU(x) \ - PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") - -std::vector RboxIouCPUForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_CPU(rbox1); - CHECK_INPUT_CPU(rbox2); - - auto rbox1_num = rbox1.shape()[0]; - auto rbox2_num = rbox2.shape()[0]; - - auto output = - paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace()); - - PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] { - rbox_iou_cpu_kernel( - rbox1_num, rbox2_num, rbox1.data(), - rbox2.data(), output.data()); - })); - - return {output}; -} - -#ifdef PADDLE_WITH_CUDA -std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2); -#endif - -#define CHECK_INPUT_SAME(x1, x2) \ - PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") - -std::vector RboxIouForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_SAME(rbox1, rbox2); - if (rbox1.is_cpu()) { - return RboxIouCPUForward(rbox1, rbox2); -#ifdef PADDLE_WITH_CUDA - } else if (rbox1.is_gpu()) { - return RboxIouCUDAForward(rbox1, rbox2); -#endif - } -} - -std::vector> -RboxIouInferShape(std::vector rbox1_shape, - std::vector rbox2_shape) { - return {{rbox1_shape[0], rbox2_shape[0]}}; -} - -std::vector RboxIouInferDtype(paddle::DataType t1, - paddle::DataType t2) { - return {t1}; -} - -PD_BUILD_OP(rbox_iou) - .Inputs({"RBox1", "RBox2"}) - .Outputs({"Output"}) - .SetKernelFn(PD_KERNEL(RboxIouForward)) - .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype)); diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu b/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu deleted file mode 100644 index baedb6d..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "paddle/extension.h" -#include "rbox_iou_utils.h" - -// 2D block with 32 * 16 = 512 threads per block -const int BLOCK_DIM_X = 32; -const int BLOCK_DIM_Y = 16; - -template -__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num, - const T *rbox1_data_ptr, - const T *rbox2_data_ptr, - T *output_data_ptr) { - - // get row_start and col_start - const int rbox1_block_idx = blockIdx.x * blockDim.x; - const int rbox2_block_idx = blockIdx.y * blockDim.y; - - const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); - const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); - - __shared__ T block_boxes1[BLOCK_DIM_X * 5]; - __shared__ T block_boxes2[BLOCK_DIM_Y * 5]; - - // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y - if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { - block_boxes1[threadIdx.x * 5 + 0] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; - block_boxes1[threadIdx.x * 5 + 1] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; - block_boxes1[threadIdx.x * 5 + 2] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; - block_boxes1[threadIdx.x * 5 + 3] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; - block_boxes1[threadIdx.x * 5 + 4] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; - } - - // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as - // above: threadIdx.y == 0 - if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { - block_boxes2[threadIdx.x * 5 + 0] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; - block_boxes2[threadIdx.x * 5 + 1] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; - block_boxes2[threadIdx.x * 5 + 2] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; - block_boxes2[threadIdx.x * 5 + 3] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; - block_boxes2[threadIdx.x * 5 + 4] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; - } - - // sync - __syncthreads(); - - if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { - int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + - threadIdx.y; - output_data_ptr[offset] = rbox_iou_single( - block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); - } -} - -#define CHECK_INPUT_GPU(x) \ - PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") - -std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_GPU(rbox1); - CHECK_INPUT_GPU(rbox2); - - auto rbox1_num = rbox1.shape()[0]; - auto rbox2_num = rbox2.shape()[0]; - - auto output = - paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace()); - - const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); - const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); - - dim3 blocks(blocks_x, blocks_y); - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); - - PD_DISPATCH_FLOATING_TYPES( - rbox1.type(), 
"rbox_iou_cuda_kernel", ([&] { - rbox_iou_cuda_kernel<<>>( - rbox1_num, rbox2_num, rbox1.data(), rbox2.data(), - output.data()); - })); - - return {output}; -} diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h b/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h deleted file mode 100644 index 6f275dd..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#pragma once - -#include -#include -#include - -#ifdef __CUDACC__ -// Designates functions callable from the host (CPU) and the device (GPU) -#define HOST_DEVICE __host__ __device__ -#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ -#else -#include -#define HOST_DEVICE -#define HOST_DEVICE_INLINE HOST_DEVICE inline -#endif - -namespace { - -template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; - -template struct Point { - T x, y; - HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} - HOST_DEVICE_INLINE Point operator+(const Point &p) const { - return Point(x + p.x, y + p.y); - } - HOST_DEVICE_INLINE Point &operator+=(const Point &p) { - x += p.x; - y += p.y; - return *this; - } - HOST_DEVICE_INLINE Point operator-(const Point &p) const { - return Point(x - p.x, y - p.y); - } - HOST_DEVICE_INLINE Point operator*(const T coeff) const { - return Point(x * coeff, y * coeff); - } -}; - -template -HOST_DEVICE_INLINE T dot_2d(const Point &A, const Point &B) { - return A.x * B.x + A.y * B.y; -} - -template -HOST_DEVICE_INLINE T cross_2d(const Point &A, const Point &B) { - return A.x * B.y - B.x * A.y; -} - -template -HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox &box, - Point (&pts)[4]) { - // M_PI / 180. 
== 0.01745329251 - // double theta = box.a * 0.01745329251; - // MODIFIED - double theta = box.a; - T cosTheta2 = (T)cos(theta) * 0.5f; - T sinTheta2 = (T)sin(theta) * 0.5f; - - // y: top --> down; x: left --> right - pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - pts[2].x = 2 * box.x_ctr - pts[0].x; - pts[2].y = 2 * box.y_ctr - pts[0].y; - pts[3].x = 2 * box.x_ctr - pts[1].x; - pts[3].y = 2 * box.y_ctr - pts[1].y; -} - -template -HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], - const Point (&pts2)[4], - Point (&intersections)[24]) { - // Line vector - // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - Point vec1[4], vec2[4]; - for (int i = 0; i < 4; i++) { - vec1[i] = pts1[(i + 1) % 4] - pts1[i]; - vec2[i] = pts2[(i + 1) % 4] - pts2[i]; - } - - // Line test - test all line combos for intersection - int num = 0; // number of intersections - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // Solve for 2x2 Ax=b - T det = cross_2d(vec2[j], vec1[i]); - - // This takes care of parallel lines - if (fabs(det) <= 1e-14) { - continue; - } - - auto vec12 = pts2[j] - pts1[i]; - - T t1 = cross_2d(vec2[j], vec12) / det; - T t2 = cross_2d(vec1[i], vec12) / det; - - if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { - intersections[num++] = pts1[i] + vec1[i] * t1; - } - } - } - - // Check for vertices of rect1 inside rect2 - { - const auto &AB = vec2[0]; - const auto &DA = vec2[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. P's projection on AB lies within AB - // and P's projection on AD lies within AD - - auto AP = pts1[i] - pts2[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && - (APdotAD <= ADdotAD)) { - intersections[num++] = pts1[i]; - } - } - } - - // Reverse the check - check for vertices of rect2 inside rect1 - { - const auto &AB = vec1[0]; - const auto &DA = vec1[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - auto AP = pts2[i] - pts1[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && - (APdotAD <= ADdotAD)) { - intersections[num++] = pts2[i]; - } - } - } - - return num; -} - -template -HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], - const int &num_in, Point (&q)[24], - bool shift_to_zero = false) { - assert(num_in >= 2); - - // Step 1: - // Find point with minimum y - // if more than 1 points have the same minimum y, - // pick the one with the minimum x. 
- int t = 0; - for (int i = 1; i < num_in; i++) { - if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { - t = i; - } - } - auto &start = p[t]; // starting point - - // Step 2: - // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { - q[i] = p[i] - start; - } - - // Swap the starting point to position 0 - auto tmp = q[0]; - q[0] = q[t]; - q[t] = tmp; - - // Step 3: - // Sort point 1 ~ num_in according to their relative cross-product values - // (essentially sorting according to angles) - // If the angles are the same, sort according to their distance to origin - T dist[24]; - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - -#ifdef __CUDACC__ - // CUDA version - // In the future, we can potentially use thrust - // for sorting here to improve speed (though not guaranteed) - for (int i = 1; i < num_in - 1; i++) { - for (int j = i + 1; j < num_in; j++) { - T crossProduct = cross_2d(q[i], q[j]); - if ((crossProduct < -1e-6) || - (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { - auto q_tmp = q[i]; - q[i] = q[j]; - q[j] = q_tmp; - auto dist_tmp = dist[i]; - dist[i] = dist[j]; - dist[j] = dist_tmp; - } - } - } -#else - // CPU version - std::sort(q + 1, q + num_in, - [](const Point &A, const Point &B) -> bool { - T temp = cross_2d(A, B); - if (fabs(temp) < 1e-6) { - return dot_2d(A, A) < dot_2d(B, B); - } else { - return temp > 0; - } - }); -#endif - - // Step 4: - // Make sure there are at least 2 points (that don't overlap with each other) - // in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < num_in; k++) { - if (dist[k] > 1e-8) { - break; - } - } - if (k == num_in) { - // We reach the end, which means the convex hull is just one point - q[0] = p[t]; - return 1; - } - q[1] = q[k]; - int m = 2; // 2 points in the stack - // Step 5: - // Finally we can start the scanning process. - // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - for (int i = k + 1; i < num_in; i++) { - while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { - m--; - } - q[m++] = q[i]; - } - - // Step 6 (Optional): - // In general sense we need the original coordinates, so we - // need to shift the points back (reverting Step 2) - // But if we're only interested in getting the area/perimeter of the shape - // We can simply return. - if (!shift_to_zero) { - for (int i = 0; i < m; i++) { - q[i] += start; - } - } - - return m; -} - -template -HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int &m) { - if (m <= 2) { - return 0; - } - - T area = 0; - for (int i = 1; i < m - 1; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } - - return area / 2.0; -} - -template -HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox &box1, - const RotatedBox &box2) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; - - Point pts1[4]; - Point pts2[4]; - get_rotated_vertices(box1, pts1); - get_rotated_vertices(box2, pts2); - - int num = get_intersection_points(pts1, pts2, intersectPts); - - if (num <= 2) { - return 0.0; - } - - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. 
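`polygon_area` above evaluates the convex polygon's area by fanning triangles out from the first ordered vertex and summing the absolute cross products (the shoelace formula). The same computation in NumPy, as a quick cross-check (a sketch, not part of the repo):

```python
import numpy as np

def polygon_area(q: np.ndarray) -> float:
    # q: (m, 2) vertices ordered around the convex hull, as produced
    # by the Graham scan above; mirrors the C++ polygon_area().
    if len(q) <= 2:
        return 0.0
    v = q[1:] - q[0]  # fan of edge vectors rooted at q[0]
    cross = v[:-1, 0] * v[1:, 1] - v[1:, 0] * v[:-1, 1]
    return float(np.abs(cross).sum()) / 2.0
```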
- int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); -} - -} // namespace - -template -HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw, - T const *const box2_raw) { - // shift center to the middle point to achieve higher precision in result - RotatedBox box1, box2; - auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; - auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; - box1.x_ctr = box1_raw[0] - center_shift_x; - box1.y_ctr = box1_raw[1] - center_shift_y; - box1.w = box1_raw[2]; - box1.h = box1_raw[3]; - box1.a = box1_raw[4]; - box2.x_ctr = box2_raw[0] - center_shift_x; - box2.y_ctr = box2_raw[1] - center_shift_y; - box2.w = box2_raw[2]; - box2.h = box2_raw[3]; - box2.a = box2_raw[4]; - - if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) { - return 0.f; - } - const T area1 = box1.w * box1.h; - const T area2 = box2.w * box2.h; - - const T intersection = rboxes_intersection(box1, box2); - const T iou = intersection / (area1 + area2 - intersection); - return iou; -} - -/** - Computes ceil(a / b) -*/ - -HOST_DEVICE inline int CeilDiv(const int a, const int b) { - return (a + b - 1) / b; -} \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/ext_op/setup.py b/pdfdet/models/Paddle/ppdet/ext_op/setup.py deleted file mode 100644 index 5892f46..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import glob -import paddle -from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup - - -def get_extensions(): - root_dir = os.path.dirname(os.path.abspath(__file__)) - ext_root_dir = os.path.join(root_dir, 'csrc') - sources = [] - for ext_name in os.listdir(ext_root_dir): - ext_dir = os.path.join(ext_root_dir, ext_name) - source = glob.glob(os.path.join(ext_dir, '*.cc')) - kwargs = dict() - if paddle.device.is_compiled_with_cuda(): - source += glob.glob(os.path.join(ext_dir, '*.cu')) - - if not source: - continue - - sources += source - - if paddle.device.is_compiled_with_cuda(): - extension = CUDAExtension( - sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']}) - else: - extension = CppExtension(sources) - - return extension - - -if __name__ == "__main__": - setup(name='ext_op', ext_modules=get_extensions()) diff --git a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_matched_rbox_iou.py b/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_matched_rbox_iou.py deleted file mode 100644 index af7b076..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_matched_rbox_iou.py +++ /dev/null @@ -1,149 +0,0 @@ -import numpy as np -import sys -import time -from shapely.geometry import Polygon -import paddle -import unittest - -from ext_op import matched_rbox_iou - - -def rbox2poly_single(rrect, get_best_begin_point=False): - """ - rrect:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - x_ctr, y_ctr, width, height, angle = rrect[:5] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - # rect 2x4 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - # poly - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) - return poly - - -def intersection(g, p): - """ - Intersection. 
- """ - - g = g[:8].reshape((4, 2)) - p = p[:8].reshape((4, 2)) - - a = g - b = p - - use_filter = True - if use_filter: - # step1: - inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) - inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) - inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) - inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) - if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: - return 0. - x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) - x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) - y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) - y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) - if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: - return 0. - - g = Polygon(g) - p = Polygon(p) - if not g.is_valid or not p.is_valid: - return 0 - - inter = Polygon(g).intersection(Polygon(p)).area - union = g.area + p.area - inter - if union == 0: - return 0 - else: - return inter / union - - -def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False): - """ - - Args: - anchors: [M, 5] x1,y1,x2,y2,angle - gt_bboxes: [M, 5] x1,y1,x2,y2,angle - - Returns: - macthed_iou: [M] - """ - assert anchors.shape[1] == 5 - assert gt_bboxes.shape[1] == 5 - - gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] - anchors_ploy = [rbox2poly_single(e) for e in anchors] - - num = len(anchors_ploy) - iou = np.zeros((num, ), dtype=np.float64) - - start_time = time.time() - for i in range(num): - try: - iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i]) - except Exception as e: - print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], - 'anchors_ploy[j]', anchors_ploy[i], e) - return iou - - -def gen_sample(n): - rbox = np.random.rand(n, 5) - rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 - rbox[:, 4] = rbox[:, 4] - 0.5 - return rbox - - -class MatchedRBoxIoUTest(unittest.TestCase): - def setUp(self): - self.initTestCase() - self.rbox1 = gen_sample(self.n) - self.rbox2 = gen_sample(self.n) - - def initTestCase(self): - self.n = 1000 - - def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): - self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) - - def get_places(self): - places = [paddle.CPUPlace()] - if paddle.device.is_compiled_with_cuda(): - places.append(paddle.CUDAPlace(0)) - - return places - - def check_output(self, place): - paddle.disable_static() - pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) - pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) - actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy() - poly_rbox1 = self.rbox1 - poly_rbox2 = self.rbox2 - poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 - poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 - expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) - self.assertAllClose( - actual_t, - expect_t, - msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( - str(place), str(expect_t), str(actual_t))) - - def test_output(self): - places = self.get_places() - for place in places: - self.check_output(place) - - -if __name__ == "__main__": - unittest.main() diff --git a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_rbox_iou.py b/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_rbox_iou.py deleted file mode 100644 index 8ef19ae..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_rbox_iou.py +++ /dev/null @@ -1,151 +0,0 @@ -import numpy as np -import sys -import time -from shapely.geometry import Polygon -import paddle -import unittest - -from ext_op import rbox_iou - - -def rbox2poly_single(rrect, get_best_begin_point=False): - """ - 
rrect:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - x_ctr, y_ctr, width, height, angle = rrect[:5] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - # rect 2x4 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - # poly - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) - return poly - - -def intersection(g, p): - """ - Intersection. - """ - - g = g[:8].reshape((4, 2)) - p = p[:8].reshape((4, 2)) - - a = g - b = p - - use_filter = True - if use_filter: - # step1: - inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) - inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) - inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) - inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) - if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: - return 0. - x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) - x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) - y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) - y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) - if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: - return 0. - - g = Polygon(g) - p = Polygon(p) - if not g.is_valid or not p.is_valid: - return 0 - - inter = Polygon(g).intersection(Polygon(p)).area - union = g.area + p.area - inter - if union == 0: - return 0 - else: - return inter / union - - -def rbox_overlaps(anchors, gt_bboxes, use_cv2=False): - """ - - Args: - anchors: [NA, 5] x1,y1,x2,y2,angle - gt_bboxes: [M, 5] x1,y1,x2,y2,angle - - Returns: - iou: [NA, M] - """ - assert anchors.shape[1] == 5 - assert gt_bboxes.shape[1] == 5 - - gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] - anchors_ploy = [rbox2poly_single(e) for e in anchors] - - num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy) - iou = np.zeros((num_anchors, num_gt), dtype=np.float64) - - start_time = time.time() - for i in range(num_anchors): - for j in range(num_gt): - try: - iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j]) - except Exception as e: - print('cur anchors_ploy[i]', anchors_ploy[i], - 'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e) - return iou - - -def gen_sample(n): - rbox = np.random.rand(n, 5) - rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 - rbox[:, 4] = rbox[:, 4] - 0.5 - return rbox - - -class RBoxIoUTest(unittest.TestCase): - def setUp(self): - self.initTestCase() - self.rbox1 = gen_sample(self.n) - self.rbox2 = gen_sample(self.m) - - def initTestCase(self): - self.n = 13000 - self.m = 7 - - def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): - self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) - - def get_places(self): - places = [paddle.CPUPlace()] - if paddle.device.is_compiled_with_cuda(): - places.append(paddle.CUDAPlace(0)) - - return places - - def check_output(self, place): - paddle.disable_static() - pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) - pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) - actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy() - poly_rbox1 = self.rbox1 - poly_rbox2 = self.rbox2 - poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 - poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 - expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) - self.assertAllClose( - actual_t, - expect_t, - msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( - str(place), str(expect_t), 
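The Shapely ground truth that both deleted tests compare against can be restated compactly; a sketch, assuming boxes are given as [x_ctr, y_ctr, w, h, angle] with the angle in radians:

    import numpy as np
    from shapely.geometry import Polygon

    def rbox_to_polygon(rbox):
        # [x_ctr, y_ctr, w, h, angle] -> Polygon over the four rotated corners
        x, y, w, h, a = rbox
        corners = np.array([[-w / 2, -h / 2], [w / 2, -h / 2],
                            [w / 2, h / 2], [-w / 2, h / 2]])
        rot = np.array([[np.cos(a), -np.sin(a)], [np.sin(a), np.cos(a)]])
        return Polygon(corners @ rot.T + [x, y])

    def rbox_iou_ref(b1, b2):
        p1, p2 = rbox_to_polygon(b1), rbox_to_polygon(b2)
        if not (p1.is_valid and p2.is_valid):
            return 0.0
        inter = p1.intersection(p2).area
        union = p1.area + p2.area - inter
        return inter / union if union > 0 else 0.0

    # two 2x2 squares at the origin, one rotated 45 degrees -> IoU ~= 0.7071
    print(rbox_iou_ref([0, 0, 2, 2, 0], [0, 0, 2, 2, np.pi / 4]))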
str(actual_t))) - - def test_output(self): - places = self.get_places() - for place in places: - self.check_output(place) - - -if __name__ == "__main__": - unittest.main() diff --git a/pdfdet/models/Paddle/ppdet/metrics/__init__.py b/pdfdet/models/Paddle/ppdet/metrics/__init__.py deleted file mode 100644 index 288f158..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import metrics -from . import keypoint_metrics - -from .metrics import * -from .keypoint_metrics import * -from .pose3d_metrics import * - -__all__ = metrics.__all__ + keypoint_metrics.__all__ - -from . import mot_metrics -from .mot_metrics import * -__all__ = metrics.__all__ + mot_metrics.__all__ - -from . import mcmot_metrics -from .mcmot_metrics import * -__all__ = metrics.__all__ + mcmot_metrics.__all__ - -from . import culane_metrics -from .culane_metrics import * -__all__ = metrics.__all__ + culane_metrics.__all__ \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/metrics/coco_utils.py b/pdfdet/models/Paddle/ppdet/metrics/coco_utils.py deleted file mode 100644 index b7a4d7e..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/coco_utils.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import numpy as np -import itertools - -from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res -from ppdet.metrics.map_utils import draw_pr_curve - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -def get_infer_results(outs, catid, bias=0): - """ - Get result at the stage of inference. - The output format is dictionary containing bbox or mask result. - - For example, bbox result is a list and each element contains - image_id, category_id, bbox and score. - """ - if outs is None or len(outs) == 0: - raise ValueError( - 'The number of valid detection result if zero. Please use reasonable model and check input data.' 
- ) - - im_id = outs['im_id'] - - infer_res = {} - if 'bbox' in outs: - if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6: - infer_res['bbox'] = get_det_poly_res( - outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) - else: - infer_res['bbox'] = get_det_res( - outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) - - if 'mask' in outs: - # mask post process - infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'], - outs['bbox_num'], im_id, catid) - - if 'segm' in outs: - infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid) - - if 'keypoint' in outs: - infer_res['keypoint'] = get_keypoint_res(outs, im_id) - outs['bbox_num'] = [len(infer_res['keypoint'])] - - if 'pose3d' in outs: - infer_res['pose3d'] = get_pose3d_res(outs, im_id) - outs['bbox_num'] = [len(infer_res['pose3d'])] - - return infer_res - - -def cocoapi_eval(jsonfile, - style, - coco_gt=None, - anno_file=None, - max_dets=(100, 300, 1000), - classwise=False, - sigmas=None, - use_area=True): - """ - Args: - jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. - style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`. - coco_gt (str): Whether to load COCOAPI through anno_file, - eg: coco_gt = COCO(anno_file) - anno_file (str): COCO annotations file. - max_dets (tuple): COCO evaluation maxDets. - classwise (bool): Whether per-category AP and draw P-R Curve or not. - sigmas (nparray): keypoint labelling sigmas. - use_area (bool): If gt annotations (eg. CrowdPose, AIC) - do not have 'area', please set use_area=False. - """ - assert coco_gt != None or anno_file != None - if style == 'keypoints_crowd': - #please install xtcocotools==1.6 - from xtcocotools.coco import COCO - from xtcocotools.cocoeval import COCOeval - else: - from pycocotools.coco import COCO - from pycocotools.cocoeval import COCOeval - - if coco_gt == None: - coco_gt = COCO(anno_file) - logger.info("Start evaluate...") - coco_dt = coco_gt.loadRes(jsonfile) - if style == 'proposal': - coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') - coco_eval.params.useCats = 0 - coco_eval.params.maxDets = list(max_dets) - elif style == 'keypoints_crowd': - coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area) - else: - coco_eval = COCOeval(coco_gt, coco_dt, style) - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - if classwise: - # Compute per-category AP and PR curve - try: - from terminaltables import AsciiTable - except Exception as e: - logger.error( - 'terminaltables not found, plaese install terminaltables. 
' - 'for example: `pip install terminaltables`.') - raise e - precisions = coco_eval.eval['precision'] - cat_ids = coco_gt.getCatIds() - # precision: (iou, recall, cls, area range, max dets) - assert len(cat_ids) == precisions.shape[2] - results_per_category = [] - for idx, catId in enumerate(cat_ids): - # area range index 0: all area ranges - # max dets index -1: typically 100 per image - nm = coco_gt.loadCats(catId)[0] - precision = precisions[:, :, idx, 0, -1] - precision = precision[precision > -1] - if precision.size: - ap = np.mean(precision) - else: - ap = float('nan') - results_per_category.append( - (str(nm["name"]), '{:0.3f}'.format(float(ap)))) - pr_array = precisions[0, :, idx, 0, 2] - recall_array = np.arange(0.0, 1.01, 0.01) - draw_pr_curve( - pr_array, - recall_array, - out_dir=style + '_pr_curve', - file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) - - num_columns = min(6, len(results_per_category) * 2) - results_flatten = list(itertools.chain(*results_per_category)) - headers = ['category', 'AP'] * (num_columns // 2) - results_2d = itertools.zip_longest( - * [results_flatten[i::num_columns] for i in range(num_columns)]) - table_data = [headers] - table_data += [result for result in results_2d] - table = AsciiTable(table_data) - logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) - logger.info("per-category PR curve has output to {} folder.".format( - style + '_pr_curve')) - # flush coco evaluation result - sys.stdout.flush() - return coco_eval.stats - - -def json_eval_results(metric, json_directory, dataset): - """ - cocoapi eval with already exists proposal.json, bbox.json or mask.json - """ - assert metric == 'COCO' - anno_file = dataset.get_anno() - json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] - if json_directory: - assert os.path.exists( - json_directory), "The json directory:{} does not exist".format( - json_directory) - for k, v in enumerate(json_file_list): - json_file_list[k] = os.path.join(str(json_directory), v) - - coco_eval_style = ['proposal', 'bbox', 'segm'] - for i, v_json in enumerate(json_file_list): - if os.path.exists(v_json): - cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) - else: - logger.info("{} not exists!".format(v_json)) diff --git a/pdfdet/models/Paddle/ppdet/metrics/culane_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/culane_metrics.py deleted file mode 100644 index 848d2c1..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/culane_metrics.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -import cv2 -import numpy as np -import os.path as osp -from functools import partial -from .metrics import Metric -from scipy.interpolate import splprep, splev -from scipy.optimize import linear_sum_assignment -from shapely.geometry import LineString, Polygon -from ppdet.utils.logger import setup_logger - -logger = setup_logger(__name__) - -__all__ = [ - 'draw_lane', 'discrete_cross_iou', 'continuous_cross_iou', 'interp', - 'culane_metric', 'load_culane_img_data', 'load_culane_data', - 'eval_predictions', "CULaneMetric" -] - -LIST_FILE = { - 'train': 'list/train_gt.txt', - 'val': 'list/val.txt', - 'test': 'list/test.txt', -} - -CATEGORYS = { - 'normal': 'list/test_split/test0_normal.txt', - 'crowd': 'list/test_split/test1_crowd.txt', - 'hlight': 'list/test_split/test2_hlight.txt', - 'shadow': 'list/test_split/test3_shadow.txt', - 'noline': 'list/test_split/test4_noline.txt', - 'arrow': 'list/test_split/test5_arrow.txt', - 'curve': 'list/test_split/test6_curve.txt', - 'cross': 
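With coco_utils.py now fully removed, its central flow is worth restating: cocoapi_eval handed a result JSON plus the ground-truth annotations to pycocotools. A hedged usage sketch (both file paths are placeholders):

    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    coco_gt = COCO('annotations/instances_val.json')  # placeholder annotations
    coco_dt = coco_gt.loadRes('bbox.json')            # detections, COCO format
    coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()  # prints the standard COCO AP/AR table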
'list/test_split/test7_cross.txt', - 'night': 'list/test_split/test8_night.txt', -} - - -def draw_lane(lane, img=None, img_shape=None, width=30): - if img is None: - img = np.zeros(img_shape, dtype=np.uint8) - lane = lane.astype(np.int32) - for p1, p2 in zip(lane[:-1], lane[1:]): - cv2.line( - img, tuple(p1), tuple(p2), color=(255, 255, 255), thickness=width) - return img - - -def discrete_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): - xs = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in xs] - ys = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in ys] - - ious = np.zeros((len(xs), len(ys))) - for i, x in enumerate(xs): - for j, y in enumerate(ys): - ious[i, j] = (x & y).sum() / (x | y).sum() - return ious - - -def continuous_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): - h, w, _ = img_shape - image = Polygon([(0, 0), (0, h - 1), (w - 1, h - 1), (w - 1, 0)]) - xs = [ - LineString(lane).buffer( - distance=width / 2., cap_style=1, join_style=2).intersection(image) - for lane in xs - ] - ys = [ - LineString(lane).buffer( - distance=width / 2., cap_style=1, join_style=2).intersection(image) - for lane in ys - ] - - ious = np.zeros((len(xs), len(ys))) - for i, x in enumerate(xs): - for j, y in enumerate(ys): - ious[i, j] = x.intersection(y).area / x.union(y).area - - return ious - - -def interp(points, n=50): - x = [x for x, _ in points] - y = [y for _, y in points] - tck, u = splprep([x, y], s=0, t=n, k=min(3, len(points) - 1)) - - u = np.linspace(0., 1., num=(len(u) - 1) * n + 1) - return np.array(splev(u, tck)).T - - -def culane_metric(pred, - anno, - width=30, - iou_thresholds=[0.5], - official=True, - img_shape=(590, 1640, 3)): - _metric = {} - for thr in iou_thresholds: - tp = 0 - fp = 0 if len(anno) != 0 else len(pred) - fn = 0 if len(pred) != 0 else len(anno) - _metric[thr] = [tp, fp, fn] - - interp_pred = np.array( - [interp( - pred_lane, n=5) for pred_lane in pred], dtype=object) # (4, 50, 2) - interp_anno = np.array( - [interp( - anno_lane, n=5) for anno_lane in anno], dtype=object) # (4, 50, 2) - - if official: - ious = discrete_cross_iou( - interp_pred, interp_anno, width=width, img_shape=img_shape) - else: - ious = continuous_cross_iou( - interp_pred, interp_anno, width=width, img_shape=img_shape) - - row_ind, col_ind = linear_sum_assignment(1 - ious) - - _metric = {} - for thr in iou_thresholds: - tp = int((ious[row_ind, col_ind] > thr).sum()) - fp = len(pred) - tp - fn = len(anno) - tp - _metric[thr] = [tp, fp, fn] - return _metric - - -def load_culane_img_data(path): - with open(path, 'r') as data_file: - img_data = data_file.readlines() - img_data = [line.split() for line in img_data] - img_data = [list(map(float, lane)) for lane in img_data] - img_data = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)] - for lane in img_data] - img_data = [lane for lane in img_data if len(lane) >= 2] - - return img_data - - -def load_culane_data(data_dir, file_list_path): - with open(file_list_path, 'r') as file_list: - filepaths = [ - os.path.join(data_dir, - line[1 if line[0] == '/' else 0:].rstrip().replace( - '.jpg', '.lines.txt')) - for line in file_list.readlines() - ] - - data = [] - for path in filepaths: - img_data = load_culane_img_data(path) - data.append(img_data) - - return data - - -def eval_predictions(pred_dir, - anno_dir, - list_path, - iou_thresholds=[0.5], - width=30, - official=True, - sequential=False): - logger.info('Calculating metric for List: {}'.format(list_path)) - predictions = 
load_culane_data(pred_dir, list_path) - annotations = load_culane_data(anno_dir, list_path) - img_shape = (590, 1640, 3) - if sequential: - results = map(partial( - culane_metric, - width=width, - official=official, - iou_thresholds=iou_thresholds, - img_shape=img_shape), - predictions, - annotations) - else: - from multiprocessing import Pool, cpu_count - from itertools import repeat - with Pool(cpu_count()) as p: - results = p.starmap(culane_metric, - zip(predictions, annotations, - repeat(width), - repeat(iou_thresholds), - repeat(official), repeat(img_shape))) - - mean_f1, mean_prec, mean_recall, total_tp, total_fp, total_fn = 0, 0, 0, 0, 0, 0 - ret = {} - for thr in iou_thresholds: - tp = sum(m[thr][0] for m in results) - fp = sum(m[thr][1] for m in results) - fn = sum(m[thr][2] for m in results) - precision = float(tp) / (tp + fp) if tp != 0 else 0 - recall = float(tp) / (tp + fn) if tp != 0 else 0 - f1 = 2 * precision * recall / (precision + recall) if tp != 0 else 0 - logger.info('iou thr: {:.2f}, tp: {}, fp: {}, fn: {},' - 'precision: {}, recall: {}, f1: {}'.format( - thr, tp, fp, fn, precision, recall, f1)) - mean_f1 += f1 / len(iou_thresholds) - mean_prec += precision / len(iou_thresholds) - mean_recall += recall / len(iou_thresholds) - total_tp += tp - total_fp += fp - total_fn += fn - ret[thr] = { - 'TP': tp, - 'FP': fp, - 'FN': fn, - 'Precision': precision, - 'Recall': recall, - 'F1': f1 - } - if len(iou_thresholds) > 2: - logger.info( - 'mean result, total_tp: {}, total_fp: {}, total_fn: {},' - 'precision: {}, recall: {}, f1: {}'.format( - total_tp, total_fp, total_fn, mean_prec, mean_recall, mean_f1)) - ret['mean'] = { - 'TP': total_tp, - 'FP': total_fp, - 'FN': total_fn, - 'Precision': mean_prec, - 'Recall': mean_recall, - 'F1': mean_f1 - } - return ret - - -class CULaneMetric(Metric): - def __init__(self, - cfg, - output_eval=None, - split="test", - dataset_dir="dataset/CULane/"): - super(CULaneMetric, self).__init__() - self.output_eval = "evaluation" if output_eval is None else output_eval - self.dataset_dir = dataset_dir - self.split = split - self.list_path = osp.join(dataset_dir, LIST_FILE[split]) - self.predictions = [] - self.img_names = [] - self.lanes = [] - self.eval_results = {} - self.cfg = cfg - self.reset() - - def reset(self): - self.predictions = [] - self.img_names = [] - self.lanes = [] - self.eval_results = {} - - def get_prediction_string(self, pred): - ys = np.arange(270, 590, 8) / self.cfg.ori_img_h - out = [] - for lane in pred: - xs = lane(ys) - valid_mask = (xs >= 0) & (xs < 1) - xs = xs * self.cfg.ori_img_w - lane_xs = xs[valid_mask] - lane_ys = ys[valid_mask] * self.cfg.ori_img_h - lane_xs, lane_ys = lane_xs[::-1], lane_ys[::-1] - lane_str = ' '.join([ - '{:.5f} {:.5f}'.format(x, y) for x, y in zip(lane_xs, lane_ys) - ]) - if lane_str != '': - out.append(lane_str) - - return '\n'.join(out) - - def accumulate(self): - loss_lines = [[], [], [], []] - for idx, pred in enumerate(self.predictions): - output_dir = os.path.join(self.output_eval, - os.path.dirname(self.img_names[idx])) - output_filename = os.path.basename(self.img_names[ - idx])[:-3] + 'lines.txt' - os.makedirs(output_dir, exist_ok=True) - output = self.get_prediction_string(pred) - - # store loss lines - lanes = self.lanes[idx] - if len(lanes) - len(pred) in [1, 2, 3, 4]: - loss_lines[len(lanes) - len(pred) - 1].append(self.img_names[ - idx]) - - with open(os.path.join(output_dir, output_filename), - 'w') as out_file: - out_file.write(output) - - for i, names in enumerate(loss_lines): 
- with open( - os.path.join(output_dir, 'loss_{}_lines.txt'.format(i + 1)), - 'w') as f: - for name in names: - f.write(name + '\n') - - for cate, cate_file in CATEGORYS.items(): - result = eval_predictions( - self.output_eval, - self.dataset_dir, - os.path.join(self.dataset_dir, cate_file), - iou_thresholds=[0.5], - official=True) - - result = eval_predictions( - self.output_eval, - self.dataset_dir, - self.list_path, - iou_thresholds=np.linspace(0.5, 0.95, 10), - official=True) - self.eval_results['F1@50'] = result[0.5]['F1'] - self.eval_results['result'] = result - - def update(self, inputs, outputs): - assert len(inputs['img_name']) == len(outputs['lanes']) - self.predictions.extend(outputs['lanes']) - self.img_names.extend(inputs['img_name']) - self.lanes.extend(inputs['lane_line']) - - def log(self): - logger.info(self.eval_results) - - # abstract method for getting metric results - def get_results(self): - return self.eval_results diff --git a/pdfdet/models/Paddle/ppdet/metrics/json_results.py b/pdfdet/models/Paddle/ppdet/metrics/json_results.py deleted file mode 100644 index d2575af..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/json_results.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
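The deleted eval_predictions above reduces each IoU threshold to TP/FP/FN counts before averaging across thresholds; as a worked restatement of that final step:

    def f1_from_counts(tp, fp, fn):
        # precision/recall/F1 exactly as eval_predictions computes them
        precision = tp / (tp + fp) if tp else 0.0
        recall = tp / (tp + fn) if tp else 0.0
        f1 = 2 * precision * recall / (precision + recall) if tp else 0.0
        return precision, recall, f1

    # e.g. 800 matched lanes, 150 spurious, 120 missed:
    print(f1_from_counts(800, 150, 120))  # -> (~0.842, ~0.870, ~0.856)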
-import six -import numpy as np - - -def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): - det_res = [] - k = 0 - for i in range(len(bbox_nums)): - cur_image_id = int(image_id[i][0]) - det_nums = bbox_nums[i] - for j in range(det_nums): - dt = bboxes[k] - k = k + 1 - num_id, score, xmin, ymin, xmax, ymax = dt.tolist() - if int(num_id) < 0: - continue - category_id = label_to_cat_id_map[int(num_id)] - w = xmax - xmin + bias - h = ymax - ymin + bias - bbox = [xmin, ymin, w, h] - dt_res = { - 'image_id': cur_image_id, - 'category_id': category_id, - 'bbox': bbox, - 'score': score - } - det_res.append(dt_res) - return det_res - - -def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): - det_res = [] - k = 0 - for i in range(len(bbox_nums)): - cur_image_id = int(image_id[i][0]) - det_nums = bbox_nums[i] - for j in range(det_nums): - dt = bboxes[k] - k = k + 1 - num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() - if int(num_id) < 0: - continue - category_id = label_to_cat_id_map[int(num_id)] - rbox = [x1, y1, x2, y2, x3, y3, x4, y4] - dt_res = { - 'image_id': cur_image_id, - 'category_id': category_id, - 'bbox': rbox, - 'score': score - } - det_res.append(dt_res) - return det_res - - -def strip_mask(mask): - row = mask[0, 0, :] - col = mask[0, :, 0] - im_h = len(col) - np.count_nonzero(col == -1) - im_w = len(row) - np.count_nonzero(row == -1) - return mask[:, :im_h, :im_w] - - -def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): - import pycocotools.mask as mask_util - seg_res = [] - k = 0 - for i in range(len(mask_nums)): - cur_image_id = int(image_id[i][0]) - det_nums = mask_nums[i] - mask_i = masks[k:k + det_nums] - mask_i = strip_mask(mask_i) - for j in range(det_nums): - mask = mask_i[j].astype(np.uint8) - score = float(bboxes[k][1]) - label = int(bboxes[k][0]) - k = k + 1 - if label == -1: - continue - cat_id = label_to_cat_id_map[label] - rle = mask_util.encode( - np.array( - mask[:, :, None], order="F", dtype="uint8"))[0] - if six.PY3: - if 'counts' in rle: - rle['counts'] = rle['counts'].decode("utf8") - sg_res = { - 'image_id': cur_image_id, - 'category_id': cat_id, - 'segmentation': rle, - 'score': score - } - seg_res.append(sg_res) - return seg_res - - -def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): - import pycocotools.mask as mask_util - segm_res = [] - # for each batch - segms = results['segm'].astype(np.uint8) - clsid_labels = results['cate_label'] - clsid_scores = results['cate_score'] - lengths = segms.shape[0] - im_id = int(image_id[0][0]) - if lengths == 0 or segms is None: - return None - # for each sample - for i in range(lengths - 1): - clsid = int(clsid_labels[i]) - catid = num_id_to_cat_id_map[clsid] - score = float(clsid_scores[i]) - mask = segms[i] - segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] - segm['counts'] = segm['counts'].decode('utf8') - coco_res = { - 'image_id': im_id, - 'category_id': catid, - 'segmentation': segm, - 'score': score - } - segm_res.append(coco_res) - return segm_res - - -def get_keypoint_res(results, im_id): - anns = [] - preds = results['keypoint'] - for idx in range(im_id.shape[0]): - image_id = im_id[idx].item() - kpts, scores = preds[idx] - for kpt, score in zip(kpts, scores): - kpt = kpt.flatten() - ann = { - 'image_id': image_id, - 'category_id': 1, # XXX hard code - 'keypoints': kpt.tolist(), - 'score': float(score) - } - x = kpt[0::3] - y = kpt[1::3] - x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), 
np.min(y).item( - ), np.max(y).item() - ann['area'] = (x1 - x0) * (y1 - y0) - ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] - anns.append(ann) - return anns - - -def get_pose3d_res(results, im_id): - anns = [] - preds = results['pose3d'] - for idx in range(im_id.shape[0]): - image_id = im_id[idx].item() - pose3d = preds[idx] - ann = { - 'image_id': image_id, - 'category_id': 1, # XXX hard code - 'pose3d': pose3d.tolist(), - 'score': float(1.) - } - anns.append(ann) - return anns diff --git a/pdfdet/models/Paddle/ppdet/metrics/keypoint_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/keypoint_metrics.py deleted file mode 100644 index 26e9ecb..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/keypoint_metrics.py +++ /dev/null @@ -1,571 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -from collections import defaultdict, OrderedDict -import numpy as np -import paddle -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval -from ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe -from scipy.io import loadmat, savemat -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval', - 'KeyPointTopDownMPIIEval' -] - - -class KeyPointTopDownCOCOEval(object): - """refer to - https://github.com/leoxiaobin/deep-high-resolution-net.pytorch - Copyright (c) Microsoft, under the MIT License. 
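get_keypoint_res above derives a COCO-style bbox and area from the flattened keypoint triples; a sketch of that slice-and-extent step, assuming kpt is laid out as [x0, y0, v0, x1, y1, v1, ...]:

    import numpy as np

    kpt = np.array([100., 50., 1., 140., 90., 1., 120., 200., 1.])
    x, y = kpt[0::3], kpt[1::3]        # every third value, as in the code above
    x0, x1 = x.min(), x.max()
    y0, y1 = y.min(), y.max()
    bbox = [x0, y0, x1 - x0, y1 - y0]  # [100.0, 50.0, 40.0, 150.0]
    area = (x1 - x0) * (y1 - y0)       # 6000.0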
- """ - - def __init__(self, - anno_file, - num_samples, - num_joints, - output_eval, - iou_type='keypoints', - in_vis_thre=0.2, - oks_thre=0.9, - save_prediction_only=False): - super(KeyPointTopDownCOCOEval, self).__init__() - self.coco = COCO(anno_file) - self.num_samples = num_samples - self.num_joints = num_joints - self.iou_type = iou_type - self.in_vis_thre = in_vis_thre - self.oks_thre = oks_thre - self.output_eval = output_eval - self.res_file = os.path.join(output_eval, "keypoints_results.json") - self.save_prediction_only = save_prediction_only - self.reset() - - def reset(self): - self.results = { - 'all_preds': np.zeros( - (self.num_samples, self.num_joints, 3), dtype=np.float32), - 'all_boxes': np.zeros((self.num_samples, 6)), - 'image_path': [] - } - self.eval_results = {} - self.idx = 0 - - def update(self, inputs, outputs): - kpts, _ = outputs['keypoint'][0] - - num_images = inputs['image'].shape[0] - self.results['all_preds'][self.idx:self.idx + num_images, :, 0: - 3] = kpts[:, :, 0:3] - self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[ - 'center'].numpy()[:, 0:2] if isinstance( - inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2] - self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[ - 'scale'].numpy()[:, 0:2] if isinstance( - inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2] - self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod( - inputs['scale'].numpy() * 200, - 1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod( - inputs['scale'] * 200, 1) - self.results['all_boxes'][ - self.idx:self.idx + num_images, - 5] = np.squeeze(inputs['score'].numpy()) if isinstance( - inputs['score'], paddle.Tensor) else np.squeeze(inputs['score']) - if isinstance(inputs['im_id'], paddle.Tensor): - self.results['image_path'].extend(inputs['im_id'].numpy()) - else: - self.results['image_path'].extend(inputs['im_id']) - self.idx += num_images - - def _write_coco_keypoint_results(self, keypoints): - data_pack = [{ - 'cat_id': 1, - 'cls': 'person', - 'ann_type': 'keypoints', - 'keypoints': keypoints - }] - results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) - if not os.path.exists(self.output_eval): - os.makedirs(self.output_eval) - with open(self.res_file, 'w') as f: - json.dump(results, f, sort_keys=True, indent=4) - logger.info(f'The keypoint result is saved to {self.res_file}.') - try: - json.load(open(self.res_file)) - except Exception: - content = [] - with open(self.res_file, 'r') as f: - for line in f: - content.append(line) - content[-1] = ']' - with open(self.res_file, 'w') as f: - for c in content: - f.write(c) - - def _coco_keypoint_results_one_category_kernel(self, data_pack): - cat_id = data_pack['cat_id'] - keypoints = data_pack['keypoints'] - cat_results = [] - - for img_kpts in keypoints: - if len(img_kpts) == 0: - continue - - _key_points = np.array( - [img_kpts[k]['keypoints'] for k in range(len(img_kpts))]) - _key_points = _key_points.reshape(_key_points.shape[0], -1) - - result = [{ - 'image_id': img_kpts[k]['image'], - 'category_id': cat_id, - 'keypoints': _key_points[k].tolist(), - 'score': img_kpts[k]['score'], - 'center': list(img_kpts[k]['center']), - 'scale': list(img_kpts[k]['scale']) - } for k in range(len(img_kpts))] - cat_results.extend(result) - - return cat_results - - def get_final_results(self, preds, all_boxes, img_path): - _kpts = [] - for idx, kpt in enumerate(preds): - _kpts.append({ - 'keypoints': kpt, - 'center': all_boxes[idx][0:2], - 
'scale': all_boxes[idx][2:4], - 'area': all_boxes[idx][4], - 'score': all_boxes[idx][5], - 'image': int(img_path[idx]) - }) - # image x person x (keypoints) - kpts = defaultdict(list) - for kpt in _kpts: - kpts[kpt['image']].append(kpt) - - # rescoring and oks nms - num_joints = preds.shape[1] - in_vis_thre = self.in_vis_thre - oks_thre = self.oks_thre - oks_nmsed_kpts = [] - for img in kpts.keys(): - img_kpts = kpts[img] - for n_p in img_kpts: - box_score = n_p['score'] - kpt_score = 0 - valid_num = 0 - for n_jt in range(0, num_joints): - t_s = n_p['keypoints'][n_jt][2] - if t_s > in_vis_thre: - kpt_score = kpt_score + t_s - valid_num = valid_num + 1 - if valid_num != 0: - kpt_score = kpt_score / valid_num - # rescoring - n_p['score'] = kpt_score * box_score - - keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))], - oks_thre) - - if len(keep) == 0: - oks_nmsed_kpts.append(img_kpts) - else: - oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep]) - - self._write_coco_keypoint_results(oks_nmsed_kpts) - - def accumulate(self): - self.get_final_results(self.results['all_preds'], - self.results['all_boxes'], - self.results['image_path']) - if self.save_prediction_only: - logger.info(f'The keypoint result is saved to {self.res_file} ' - 'and do not evaluate the mAP.') - return - coco_dt = self.coco.loadRes(self.res_file) - coco_eval = COCOeval(self.coco, coco_dt, 'keypoints') - coco_eval.params.useSegm = None - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - - keypoint_stats = [] - for ind in range(len(coco_eval.stats)): - keypoint_stats.append((coco_eval.stats[ind])) - self.eval_results['keypoint'] = keypoint_stats - - def log(self): - if self.save_prediction_only: - return - stats_names = [ - 'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', - 'AR .75', 'AR (M)', 'AR (L)' - ] - num_values = len(stats_names) - print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') - print('|---' * (num_values + 1) + '|') - - print(' '.join([ - '| {:.3f}'.format(value) for value in self.eval_results['keypoint'] - ]) + ' |') - - def get_results(self): - return self.eval_results - - -class KeyPointTopDownCOCOWholeBadyHandEval(object): - def __init__(self, - anno_file, - num_samples, - num_joints, - output_eval, - save_prediction_only=False): - super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__() - self.coco = COCO(anno_file) - self.num_samples = num_samples - self.num_joints = num_joints - self.output_eval = output_eval - self.res_file = os.path.join(output_eval, "keypoints_results.json") - self.save_prediction_only = save_prediction_only - self.parse_dataset() - self.reset() - - def parse_dataset(self): - gt_db = [] - num_joints = self.num_joints - coco = self.coco - img_ids = coco.getImgIds() - for img_id in img_ids: - ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) - objs = coco.loadAnns(ann_ids) - - for obj in objs: - for type in ['left', 'right']: - if (obj[f'{type}hand_valid'] and - max(obj[f'{type}hand_kpts']) > 0): - - joints = np.zeros((num_joints, 3), dtype=np.float32) - joints_vis = np.zeros((num_joints, 3), dtype=np.float32) - - keypoints = np.array(obj[f'{type}hand_kpts']) - keypoints = keypoints.reshape(-1, 3) - joints[:, :2] = keypoints[:, :2] - joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) - - gt_db.append({ - 'bbox': obj[f'{type}hand_box'], - 'gt_joints': joints, - 'joints_vis': joints_vis, - }) - self.db = gt_db - - def reset(self): - self.results = { - 'preds': np.zeros( - (self.num_samples, self.num_joints, 3), 
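The rescoring step above replaces each detection's score with box_score times the mean confidence of its sufficiently visible joints, before OKS NMS is applied; a minimal restatement, with keypoints as a [K, 3] array of (x, y, confidence):

    import numpy as np

    def rescore(box_score, keypoints, in_vis_thre=0.2):
        # mean confidence over joints above the visibility threshold,
        # folded into the box score as in get_final_results above
        conf = keypoints[:, 2]
        visible = conf[conf > in_vis_thre]
        kpt_score = visible.mean() if visible.size else 0.0
        return float(kpt_score * box_score)

    pts = np.array([[10., 20., 0.8], [30., 40., 0.1], [50., 60., 0.6]])
    print(rescore(0.9, pts))  # only 0.8 and 0.6 pass 0.2 -> 0.9 * 0.7 = 0.63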
dtype=np.float32), - } - self.eval_results = {} - self.idx = 0 - - def update(self, inputs, outputs): - kpts, _ = outputs['keypoint'][0] - num_images = inputs['image'].shape[0] - self.results['preds'][self.idx:self.idx + num_images, :, 0: - 3] = kpts[:, :, 0:3] - self.idx += num_images - - def accumulate(self): - self.get_final_results(self.results['preds']) - if self.save_prediction_only: - logger.info(f'The keypoint result is saved to {self.res_file} ' - 'and do not evaluate the mAP.') - return - - self.eval_results = self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE')) - - def get_final_results(self, preds): - kpts = [] - for idx, kpt in enumerate(preds): - kpts.append({'keypoints': kpt.tolist()}) - - self._write_keypoint_results(kpts) - - def _write_keypoint_results(self, keypoints): - if not os.path.exists(self.output_eval): - os.makedirs(self.output_eval) - with open(self.res_file, 'w') as f: - json.dump(keypoints, f, sort_keys=True, indent=4) - logger.info(f'The keypoint result is saved to {self.res_file}.') - try: - json.load(open(self.res_file)) - except Exception: - content = [] - with open(self.res_file, 'r') as f: - for line in f: - content.append(line) - content[-1] = ']' - with open(self.res_file, 'w') as f: - for c in content: - f.write(c) - - def log(self): - if self.save_prediction_only: - return - for item, value in self.eval_results.items(): - print("{} : {}".format(item, value)) - - def get_results(self): - return self.eval_results - - def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30): - """Keypoint evaluation. - - Args: - res_file (str): Json file stored prediction results. - metrics (str | list[str]): Metric to be performed. - Options: 'PCK', 'AUC', 'EPE'. - pck_thr (float): PCK threshold, default as 0.2. - auc_nor (float): AUC normalization factor, default as 30 pixel. - - Returns: - List: Evaluation results for evaluation metric. 
- """ - info_str = [] - - with open(res_file, 'r') as fin: - preds = json.load(fin) - assert len(preds) == len(self.db) - - outputs = [] - gts = [] - masks = [] - threshold_bbox = [] - - for pred, item in zip(preds, self.db): - outputs.append(np.array(pred['keypoints'])[:, :-1]) - gts.append(np.array(item['gt_joints'])[:, :-1]) - masks.append((np.array(item['joints_vis'])[:, 0]) > 0) - if 'PCK' in metrics: - bbox = np.array(item['bbox']) - bbox_thr = np.max(bbox[2:]) - threshold_bbox.append(np.array([bbox_thr, bbox_thr])) - - outputs = np.array(outputs) - gts = np.array(gts) - masks = np.array(masks) - threshold_bbox = np.array(threshold_bbox) - - if 'PCK' in metrics: - _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, - threshold_bbox) - info_str.append(('PCK', pck)) - - if 'AUC' in metrics: - info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor))) - - if 'EPE' in metrics: - info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) - - name_value = OrderedDict(info_str) - - return name_value - - -class KeyPointTopDownMPIIEval(object): - def __init__(self, - anno_file, - num_samples, - num_joints, - output_eval, - oks_thre=0.9, - save_prediction_only=False): - super(KeyPointTopDownMPIIEval, self).__init__() - self.ann_file = anno_file - self.res_file = os.path.join(output_eval, "keypoints_results.json") - self.save_prediction_only = save_prediction_only - self.reset() - - def reset(self): - self.results = [] - self.eval_results = {} - self.idx = 0 - - def update(self, inputs, outputs): - kpts, _ = outputs['keypoint'][0] - - num_images = inputs['image'].shape[0] - results = {} - results['preds'] = kpts[:, :, 0:3] - results['boxes'] = np.zeros((num_images, 6)) - results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2] - results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2] - results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1) - results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy()) - results['image_path'] = inputs['image_file'] - - self.results.append(results) - - def accumulate(self): - self._mpii_keypoint_results_save() - if self.save_prediction_only: - logger.info(f'The keypoint result is saved to {self.res_file} ' - 'and do not evaluate the mAP.') - return - - self.eval_results = self.evaluate(self.results) - - def _mpii_keypoint_results_save(self): - results = [] - for res in self.results: - if len(res) == 0: - continue - result = [{ - 'preds': res['preds'][k].tolist(), - 'boxes': res['boxes'][k].tolist(), - 'image_path': res['image_path'][k], - } for k in range(len(res))] - results.extend(result) - with open(self.res_file, 'w') as f: - json.dump(results, f, sort_keys=True, indent=4) - logger.info(f'The keypoint result is saved to {self.res_file}.') - - def log(self): - if self.save_prediction_only: - return - for item, value in self.eval_results.items(): - print("{} : {}".format(item, value)) - - def get_results(self): - return self.eval_results - - def evaluate(self, outputs, savepath=None): - """Evaluate PCKh for MPII dataset. refer to - https://github.com/leoxiaobin/deep-high-resolution-net.pytorch - Copyright (c) Microsoft, under the MIT License. - - Args: - outputs(list(preds, boxes)): - - * preds (np.ndarray[N,K,3]): The first two dimensions are - coordinates, score is the third dimension of the array. 
- * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] - , scale[1],area, score] - - Returns: - dict: PCKh for each joint - """ - - kpts = [] - for output in outputs: - preds = output['preds'] - batch_size = preds.shape[0] - for i in range(batch_size): - kpts.append({'keypoints': preds[i]}) - - preds = np.stack([kpt['keypoints'] for kpt in kpts]) - - # convert 0-based index to 1-based index, - # and get the first two dimensions. - preds = preds[..., :2] + 1.0 - - if savepath is not None: - pred_file = os.path.join(savepath, 'pred.mat') - savemat(pred_file, mdict={'preds': preds}) - - SC_BIAS = 0.6 - threshold = 0.5 - - gt_file = os.path.join( - os.path.dirname(self.ann_file), 'mpii_gt_val.mat') - gt_dict = loadmat(gt_file) - dataset_joints = gt_dict['dataset_joints'] - jnt_missing = gt_dict['jnt_missing'] - pos_gt_src = gt_dict['pos_gt_src'] - headboxes_src = gt_dict['headboxes_src'] - - pos_pred_src = np.transpose(preds, [1, 2, 0]) - - head = np.where(dataset_joints == 'head')[1][0] - lsho = np.where(dataset_joints == 'lsho')[1][0] - lelb = np.where(dataset_joints == 'lelb')[1][0] - lwri = np.where(dataset_joints == 'lwri')[1][0] - lhip = np.where(dataset_joints == 'lhip')[1][0] - lkne = np.where(dataset_joints == 'lkne')[1][0] - lank = np.where(dataset_joints == 'lank')[1][0] - - rsho = np.where(dataset_joints == 'rsho')[1][0] - relb = np.where(dataset_joints == 'relb')[1][0] - rwri = np.where(dataset_joints == 'rwri')[1][0] - rkne = np.where(dataset_joints == 'rkne')[1][0] - rank = np.where(dataset_joints == 'rank')[1][0] - rhip = np.where(dataset_joints == 'rhip')[1][0] - - jnt_visible = 1 - jnt_missing - uv_error = pos_pred_src - pos_gt_src - uv_err = np.linalg.norm(uv_error, axis=1) - headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] - headsizes = np.linalg.norm(headsizes, axis=0) - headsizes *= SC_BIAS - scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) - scaled_uv_err = uv_err / scale - scaled_uv_err = scaled_uv_err * jnt_visible - jnt_count = np.sum(jnt_visible, axis=1) - less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible - PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count - - # save - rng = np.arange(0, 0.5 + 0.01, 0.01) - pckAll = np.zeros((len(rng), 16), dtype=np.float32) - - for r, threshold in enumerate(rng): - less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible - pckAll[r, :] = 100. 
* np.sum(less_than_threshold, - axis=1) / jnt_count - - PCKh = np.ma.array(PCKh, mask=False) - PCKh.mask[6:8] = True - - jnt_count = np.ma.array(jnt_count, mask=False) - jnt_count.mask[6:8] = True - jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) - - name_value = [ #noqa - ('Head', PCKh[head]), - ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), - ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), - ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), - ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), - ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), - ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), - ('PCKh', np.sum(PCKh * jnt_ratio)), - ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio)) - ] - name_value = OrderedDict(name_value) - - return name_value - - def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): - """sort kpts and remove the repeated ones.""" - kpts = sorted(kpts, key=lambda x: x[key]) - num = len(kpts) - for i in range(num - 1, 0, -1): - if kpts[i][key] == kpts[i - 1][key]: - del kpts[i] - - return kpts diff --git a/pdfdet/models/Paddle/ppdet/metrics/map_utils.py b/pdfdet/models/Paddle/ppdet/metrics/map_utils.py deleted file mode 100644 index 57f12d9..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/map_utils.py +++ /dev/null @@ -1,436 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import sys -import numpy as np -import itertools -import paddle -from ppdet.modeling.rbox_utils import poly2rbox_np - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'draw_pr_curve', - 'bbox_area', - 'jaccard_overlap', - 'prune_zero_padding', - 'DetectionMAP', - 'ap_per_class', - 'compute_ap', -] - - -def draw_pr_curve(precision, - recall, - iou=0.5, - out_dir='pr_curve', - file_name='precision_recall_curve.jpg'): - if not os.path.exists(out_dir): - os.makedirs(out_dir) - output_path = os.path.join(out_dir, file_name) - try: - import matplotlib.pyplot as plt - except Exception as e: - logger.error('Matplotlib not found, plaese install matplotlib.' - 'for example: `pip install matplotlib`.') - raise e - plt.cla() - plt.figure('P-R Curve') - plt.title('Precision/Recall Curve(IoU={})'.format(iou)) - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.grid(True) - plt.plot(recall, precision) - plt.savefig(output_path) - - -def bbox_area(bbox, is_bbox_normalized): - """ - Calculate area of a bounding box - """ - norm = 1. - float(is_bbox_normalized) - width = bbox[2] - bbox[0] + norm - height = bbox[3] - bbox[1] + norm - return width * height - - -def jaccard_overlap(pred, gt, is_bbox_normalized=False): - """ - Calculate jaccard overlap ratio between two bounding box - """ - if pred[0] >= gt[2] or pred[2] <= gt[0] or \ - pred[1] >= gt[3] or pred[3] <= gt[1]: - return 0. 
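The MPII PCKh evaluation above normalizes each joint error by 0.6 times the head-box diagonal and counts errors at or below the threshold as correct; a vectorized sketch, assuming predictions and ground truth as [N, K, 2] arrays and a boolean [N, K] visibility mask (the deleted code stores the transposed [K, N] layout):

    import numpy as np

    def pckh(pred, gt, head_sizes, visible, thr=0.5, sc_bias=0.6):
        # per-joint PCKh in percent, mirroring the deleted evaluate()
        norm = head_sizes * sc_bias                 # [N] head-box diagonals
        err = np.linalg.norm(pred - gt, axis=2)     # [N, K] pixel errors
        scaled = err / norm[:, None]
        correct = (scaled <= thr) & visible
        return 100.0 * correct.sum(axis=0) / visible.sum(axis=0)

    # usage: pckh(pred, gt, head_sizes, visible)[j] -> percent correct, joint j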
- inter_xmin = max(pred[0], gt[0]) - inter_ymin = max(pred[1], gt[1]) - inter_xmax = min(pred[2], gt[2]) - inter_ymax = min(pred[3], gt[3]) - inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], - is_bbox_normalized) - pred_size = bbox_area(pred, is_bbox_normalized) - gt_size = bbox_area(gt, is_bbox_normalized) - overlap = float(inter_size) / (pred_size + gt_size - inter_size) - return overlap - - -def calc_rbox_iou(pred, gt_poly): - """ - calc iou between rotated bbox - """ - # calc iou of bounding box for speedup - pred = np.array(pred, np.float32).reshape(-1, 2) - gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2) - pred_rect = [ - np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]), - np.max(pred[:, 1]) - ] - gt_rect = [ - np.min(gt_poly[:, 0]), np.min(gt_poly[:, 1]), np.max(gt_poly[:, 0]), - np.max(gt_poly[:, 1]) - ] - iou = jaccard_overlap(pred_rect, gt_rect, False) - - if iou <= 0: - return iou - - # calc rbox iou - pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5) - gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5) - try: - from ext_op import rbox_iou - except Exception as e: - print("import custom_ops error, try install ext_op " \ - "following ppdet/ext_op/README.md", e) - sys.stdout.flush() - sys.exit(-1) - pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32') - pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32') - iou = rbox_iou(pd_gt_rbox, pd_pred_rbox) - iou = iou.numpy() - return iou[0][0] - - -def prune_zero_padding(gt_box, gt_label, difficult=None): - valid_cnt = 0 - for i in range(len(gt_box)): - if (gt_box[i] == 0).all(): - break - valid_cnt += 1 - return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] - if difficult is not None else None) - - -class DetectionMAP(object): - """ - Calculate detection mean average precision. - Currently support two types: 11point and integral - - Args: - class_num (int): The class number. - overlap_thresh (float): The threshold of overlap - ratio between prediction bounding box and - ground truth bounding box for deciding - true/false positive. Default 0.5. - map_type (str): Calculation method of mean average - precision, currently support '11point' and - 'integral'. Default '11point'. - is_bbox_normalized (bool): Whether bounding boxes - is normalized to range[0, 1]. Default False. - evaluate_difficult (bool): Whether to evaluate - difficult bounding boxes. Default False. - catid2name (dict): Mapping between category id and category name. - classwise (bool): Whether per-category AP and draw - P-R Curve or not. - """ - - def __init__(self, - class_num, - overlap_thresh=0.5, - map_type='11point', - is_bbox_normalized=False, - evaluate_difficult=False, - catid2name=None, - classwise=False): - self.class_num = class_num - self.overlap_thresh = overlap_thresh - assert map_type in ['11point', 'integral'], \ - "map_type currently only support '11point' "\ - "and 'integral'" - self.map_type = map_type - self.is_bbox_normalized = is_bbox_normalized - self.evaluate_difficult = evaluate_difficult - self.classwise = classwise - self.classes = [] - for cname in catid2name.values(): - self.classes.append(cname) - self.reset() - - def update(self, bbox, score, label, gt_box, gt_label, difficult=None): - """ - Update metric statics from given prediction and ground - truth infomations. 
- """ - if difficult is None: - difficult = np.zeros_like(gt_label) - - # record class gt count - for gtl, diff in zip(gt_label, difficult): - if self.evaluate_difficult or int(diff) == 0: - self.class_gt_counts[int(np.array(gtl))] += 1 - - # record class score positive - visited = [False] * len(gt_label) - for b, s, l in zip(bbox, score, label): - pred = b.tolist() if isinstance(b, np.ndarray) else b - max_idx = -1 - max_overlap = -1.0 - for i, gl in enumerate(gt_label): - if int(gl) == int(l): - if len(gt_box[i]) == 8: - overlap = calc_rbox_iou(pred, gt_box[i]) - else: - overlap = jaccard_overlap(pred, gt_box[i], - self.is_bbox_normalized) - if overlap > max_overlap: - max_overlap = overlap - max_idx = i - - if max_overlap > self.overlap_thresh: - if self.evaluate_difficult or \ - int(np.array(difficult[max_idx])) == 0: - if not visited[max_idx]: - self.class_score_poss[int(l)].append([s, 1.0]) - visited[max_idx] = True - else: - self.class_score_poss[int(l)].append([s, 0.0]) - else: - self.class_score_poss[int(l)].append([s, 0.0]) - - def reset(self): - """ - Reset metric statics - """ - self.class_score_poss = [[] for _ in range(self.class_num)] - self.class_gt_counts = [0] * self.class_num - self.mAP = 0.0 - - def accumulate(self): - """ - Accumulate metric results and calculate mAP - """ - mAP = 0. - valid_cnt = 0 - eval_results = [] - for score_pos, count in zip(self.class_score_poss, - self.class_gt_counts): - if count == 0: continue - if len(score_pos) == 0: - valid_cnt += 1 - continue - - accum_tp_list, accum_fp_list = \ - self._get_tp_fp_accum(score_pos) - precision = [] - recall = [] - for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): - precision.append(float(ac_tp) / (ac_tp + ac_fp)) - recall.append(float(ac_tp) / count) - - one_class_ap = 0.0 - if self.map_type == '11point': - max_precisions = [0.] * 11 - start_idx = len(precision) - 1 - for j in range(10, -1, -1): - for i in range(start_idx, -1, -1): - if recall[i] < float(j) / 10.: - start_idx = i - if j > 0: - max_precisions[j - 1] = max_precisions[j] - break - else: - if max_precisions[j] < precision[i]: - max_precisions[j] = precision[i] - one_class_ap = sum(max_precisions) / 11. - mAP += one_class_ap - valid_cnt += 1 - elif self.map_type == 'integral': - import math - prev_recall = 0. - for i in range(len(precision)): - recall_gap = math.fabs(recall[i] - prev_recall) - if recall_gap > 1e-6: - one_class_ap += precision[i] * recall_gap - prev_recall = recall[i] - mAP += one_class_ap - valid_cnt += 1 - else: - logger.error("Unspported mAP type {}".format(self.map_type)) - sys.exit(1) - eval_results.append({ - 'class': self.classes[valid_cnt - 1], - 'ap': one_class_ap, - 'precision': precision, - 'recall': recall, - }) - self.eval_results = eval_results - self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP - - def get_map(self): - """ - Get mAP result - """ - if self.mAP is None: - logger.error("mAP is not calculated.") - if self.classwise: - # Compute per-category AP and PR curve - try: - from terminaltables import AsciiTable - except Exception as e: - logger.error( - 'terminaltables not found, plaese install terminaltables. 
' - 'for example: `pip install terminaltables`.') - raise e - results_per_category = [] - for eval_result in self.eval_results: - results_per_category.append( - (str(eval_result['class']), - '{:0.3f}'.format(float(eval_result['ap'])))) - draw_pr_curve( - eval_result['precision'], - eval_result['recall'], - out_dir='voc_pr_curve', - file_name='{}_precision_recall_curve.jpg'.format( - eval_result['class'])) - - num_columns = min(6, len(results_per_category) * 2) - results_flatten = list(itertools.chain(*results_per_category)) - headers = ['category', 'AP'] * (num_columns // 2) - results_2d = itertools.zip_longest(* [ - results_flatten[i::num_columns] for i in range(num_columns) - ]) - table_data = [headers] - table_data += [result for result in results_2d] - table = AsciiTable(table_data) - logger.info('Per-category of VOC AP: \n{}'.format(table.table)) - logger.info( - "per-category PR curve has output to voc_pr_curve folder.") - return self.mAP - - def _get_tp_fp_accum(self, score_pos_list): - """ - Calculate accumulating true/false positive results from - [score, pos] records - """ - sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) - accum_tp = 0 - accum_fp = 0 - accum_tp_list = [] - accum_fp_list = [] - for (score, pos) in sorted_list: - accum_tp += int(pos) - accum_tp_list.append(accum_tp) - accum_fp += 1 - int(pos) - accum_fp_list.append(accum_fp) - return accum_tp_list, accum_fp_list - - -def ap_per_class(tp, conf, pred_cls, target_cls): - """ - Computes the average precision, given the recall and precision curves. - Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. - - Args: - tp (list): True positives. - conf (list): Objectness value from 0-1. - pred_cls (list): Predicted object classes. - target_cls (list): Target object classes. - """ - tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array( - pred_cls), np.array(target_cls) - - # Sort by objectness - i = np.argsort(-conf) - tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] - - # Find unique classes - unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) - - # Create Precision-Recall curve and compute AP for each class - ap, p, r = [], [], [] - for c in unique_classes: - i = pred_cls == c - n_gt = sum(target_cls == c) # Number of ground truth objects - n_p = sum(i) # Number of predicted objects - - if (n_p == 0) and (n_gt == 0): - continue - elif (n_p == 0) or (n_gt == 0): - ap.append(0) - r.append(0) - p.append(0) - else: - # Accumulate FPs and TPs - fpc = np.cumsum(1 - tp[i]) - tpc = np.cumsum(tp[i]) - - # Recall - recall_curve = tpc / (n_gt + 1e-16) - r.append(tpc[-1] / (n_gt + 1e-16)) - - # Precision - precision_curve = tpc / (tpc + fpc) - p.append(tpc[-1] / (tpc[-1] + fpc[-1])) - - # AP from recall-precision curve - ap.append(compute_ap(recall_curve, precision_curve)) - - return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array( - p) - - -def compute_ap(recall, precision): - """ - Computes the average precision, given the recall and precision curves. - Code originally from https://github.com/rbgirshick/py-faster-rcnn. - - Args: - recall (list): The recall curve. - precision (list): The precision curve. - - Returns: - The average precision as computed in py-faster-rcnn. 
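The compute_ap body that follows is the standard "correct AP" computation: pad the curve with sentinels, take the running-maximum precision envelope, and sum precision times each recall step. A worked check under a toy curve (a sketch; voc_ap restates the same logic):

    import numpy as np

    def voc_ap(recall, precision):
        # same envelope-and-sum computation as the compute_ap below
        mrec = np.concatenate(([0.], recall, [1.]))
        mpre = np.concatenate(([0.], precision, [0.]))
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
        idx = np.where(mrec[1:] != mrec[:-1])[0]
        return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])

    # 0.1*1.0 + 0.3*0.8 + 0.5*0.5 + 0.1*0.0 = 0.59
    print(voc_ap([0.1, 0.4, 0.4, 0.9], [1.0, 0.8, 0.6, 0.5]))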
- """ - # correct AP calculation - # first append sentinel values at the end - mrec = np.concatenate(([0.], recall, [1.])) - mpre = np.concatenate(([0.], precision, [0.])) - - # compute the precision envelope - for i in range(mpre.size - 1, 0, -1): - mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) - - # to calculate area under PR curve, look for points - # where X axis (recall) changes value - i = np.where(mrec[1:] != mrec[:-1])[0] - - # and sum (\Delta recall) * prec - ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) - return ap diff --git a/pdfdet/models/Paddle/ppdet/metrics/mcmot_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/mcmot_metrics.py deleted file mode 100644 index bf74d32..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/mcmot_metrics.py +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import copy -import sys -import math -from collections import defaultdict - -import numpy as np -import pandas as pd - -from .metrics import Metric -try: - import motmetrics as mm - from motmetrics.math_util import quiet_divide - metrics = mm.metrics.motchallenge_metrics - mh = mm.metrics.create() -except: - pass -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['MCMOTEvaluator', 'MCMOTMetric'] - -METRICS_LIST = [ - 'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend', - 'num_migrate', 'num_false_positives', 'num_misses', 'num_detections', - 'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked', - 'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota', - 'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1' -] - -NAME_MAP = { - 'num_frames': 'num_frames', - 'num_matches': 'num_matches', - 'num_switches': 'IDs', - 'num_transfer': 'IDt', - 'num_ascend': 'IDa', - 'num_migrate': 'IDm', - 'num_false_positives': 'FP', - 'num_misses': 'FN', - 'num_detections': 'num_detections', - 'num_objects': 'num_objects', - 'num_predictions': 'num_predictions', - 'num_unique_objects': 'GT', - 'mostly_tracked': 'MT', - 'partially_tracked': 'partially_tracked', - 'mostly_lost': 'ML', - 'num_fragmentations': 'FM', - 'motp': 'MOTP', - 'mota': 'MOTA', - 'precision': 'Prcn', - 'recall': 'Rcll', - 'idfp': 'idfp', - 'idfn': 'idfn', - 'idtp': 'idtp', - 'idp': 'IDP', - 'idr': 'IDR', - 'idf1': 'IDF1' -} - - -def parse_accs_metrics(seq_acc, index_name, verbose=False): - """ - Parse the evaluation indicators of multiple MOTAccumulator - """ - mh = mm.metrics.create() - summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST) - summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \ - summary.loc['OVERALL', 'num_detections'] - if verbose: - strsummary = mm.io.render_summary( - summary, formatters=mh.formatters, namemap=NAME_MAP) - print(strsummary) - - 
return summary - - -def seqs_overall_metrics(summary_df, verbose=False): - """ - Calculate overall metrics for multiple sequences - """ - add_col = [ - 'num_frames', 'num_matches', 'num_switches', 'num_transfer', - 'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses', - 'num_detections', 'num_objects', 'num_predictions', - 'num_unique_objects', 'mostly_tracked', 'partially_tracked', - 'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp' - ] - calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1'] - calc_df = summary_df.copy() - - overall_dic = {} - for col in add_col: - overall_dic[col] = calc_df[col].sum() - - for col in calc_col: - overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')( - calc_df, overall_dic) - - overall_df = pd.DataFrame(overall_dic, index=['overall_calc']) - calc_df = pd.concat([calc_df, overall_df]) - - if verbose: - mh = mm.metrics.create() - str_calc_df = mm.io.render_summary( - calc_df, formatters=mh.formatters, namemap=NAME_MAP) - print(str_calc_df) - - return calc_df - - -class MCMOTMetricOverall(object): - def motp_overall(summary_df, overall_dic): - motp = quiet_divide((summary_df['motp'] * - summary_df['num_detections']).sum(), - overall_dic['num_detections']) - return motp - - def mota_overall(summary_df, overall_dic): - del summary_df - mota = 1. - quiet_divide( - (overall_dic['num_misses'] + overall_dic['num_switches'] + - overall_dic['num_false_positives']), overall_dic['num_objects']) - return mota - - def precision_overall(summary_df, overall_dic): - del summary_df - precision = quiet_divide(overall_dic['num_detections'], ( - overall_dic['num_false_positives'] + overall_dic['num_detections'])) - return precision - - def recall_overall(summary_df, overall_dic): - del summary_df - recall = quiet_divide(overall_dic['num_detections'], - overall_dic['num_objects']) - return recall - - def idp_overall(summary_df, overall_dic): - del summary_df - idp = quiet_divide(overall_dic['idtp'], - (overall_dic['idtp'] + overall_dic['idfp'])) - return idp - - def idr_overall(summary_df, overall_dic): - del summary_df - idr = quiet_divide(overall_dic['idtp'], - (overall_dic['idtp'] + overall_dic['idfn'])) - return idr - - def idf1_overall(summary_df, overall_dic): - del summary_df - idf1 = quiet_divide(2. 
* overall_dic['idtp'], ( - overall_dic['num_objects'] + overall_dic['num_predictions'])) - return idf1 - - -def read_mcmot_results_union(filename, is_gt, is_ignore): - results_dict = dict() - if os.path.isfile(filename): - all_result = np.loadtxt(filename, delimiter=',') - if all_result.shape[0] == 0 or all_result.shape[1] < 7: - return results_dict - if is_ignore: - return results_dict - if is_gt: - # only for test use - all_result = all_result[all_result[:, 7] != 0] - all_result[:, 7] = all_result[:, 7] - 1 - - if all_result.shape[0] == 0: - return results_dict - - class_unique = np.unique(all_result[:, 7]) - - last_max_id = 0 - result_cls_list = [] - for cls in class_unique: - result_cls_split = all_result[all_result[:, 7] == cls] - result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id - # make sure track id different between every category - last_max_id = max(np.unique(result_cls_split[:, 1])) + 1 - result_cls_list.append(result_cls_split) - - results_con = np.concatenate(result_cls_list) - - for line in range(len(results_con)): - linelist = results_con[line] - fid = int(linelist[0]) - if fid < 1: - continue - results_dict.setdefault(fid, list()) - - if is_gt: - score = 1 - else: - score = float(linelist[6]) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - cls = int(linelist[7]) - - results_dict[fid].append((tlwh, target_id, cls, score)) - - return results_dict - - -def read_mcmot_results(filename, is_gt, is_ignore): - results_dict = dict() - if os.path.isfile(filename): - with open(filename, 'r') as f: - for line in f.readlines(): - linelist = line.strip().split(',') - if len(linelist) < 7: - continue - fid = int(linelist[0]) - if fid < 1: - continue - cid = int(linelist[7]) - if is_gt: - score = 1 - # only for test use - cid -= 1 - else: - score = float(linelist[6]) - - cls_result_dict = results_dict.setdefault(cid, dict()) - cls_result_dict.setdefault(fid, list()) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - cls_result_dict[fid].append((tlwh, target_id, score)) - return results_dict - - -def read_results(filename, - data_type, - is_gt=False, - is_ignore=False, - multi_class=False, - union=False): - if data_type in ['mcmot', 'lab']: - if multi_class: - if union: - # The results are evaluated by union all the categories. - # Track IDs between different categories cannot be duplicate. - read_fun = read_mcmot_results_union - else: - # The results are evaluated separately by category. - read_fun = read_mcmot_results - else: - raise ValueError('multi_class: {}, MCMOT should have cls_id.'. 
- format(multi_class)) - else: - raise ValueError('Unknown data type: {}'.format(data_type)) - - return read_fun(filename, is_gt, is_ignore) - - -def unzip_objs(objs): - if len(objs) > 0: - tlwhs, ids, scores = zip(*objs) - else: - tlwhs, ids, scores = [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - return tlwhs, ids, scores - - -def unzip_objs_cls(objs): - if len(objs) > 0: - tlwhs, ids, cls, scores = zip(*objs) - else: - tlwhs, ids, cls, scores = [], [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - ids = np.array(ids) - cls = np.array(cls) - scores = np.array(scores) - return tlwhs, ids, cls, scores - - -class MCMOTEvaluator(object): - def __init__(self, data_root, seq_name, data_type, num_classes): - self.data_root = data_root - self.seq_name = seq_name - self.data_type = data_type - self.num_classes = num_classes - - self.load_annotations() - try: - import motmetrics as mm - mm.lap.default_solver = 'lap' - except Exception as e: - raise RuntimeError( - 'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' - ) - self.reset_accumulator() - - self.class_accs = [] - - def load_annotations(self): - assert self.data_type == 'mcmot' - self.gt_filename = os.path.join(self.data_root, '../', 'sequences', - '{}.txt'.format(self.seq_name)) - if not os.path.exists(self.gt_filename): - logger.warning( - "gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF." - ) - - def reset_accumulator(self): - self.acc = mm.MOTAccumulator(auto_id=True) - - def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False): - if union: - trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3] - gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3] - - # get distance matrix - iou_distance = mm.distances.iou_matrix( - gt_tlwhs, trk_tlwhs, max_iou=0.5) - - # Set the distance between objects of different categories to nan - gt_cls_len = len(gt_cls) - trk_cls_len = len(trk_cls) - # When the number of GT or Trk is 0, iou_distance dimension is (0,0) - if gt_cls_len != 0 and trk_cls_len != 0: - gt_cls = gt_cls.reshape(gt_cls_len, 1) - gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1) - trk_cls = trk_cls.reshape(1, trk_cls_len) - trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0) - iou_distance = np.where(gt_cls == trk_cls, iou_distance, np.nan) - - else: - trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] - gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] - - # get distance matrix - iou_distance = mm.distances.iou_matrix( - gt_tlwhs, trk_tlwhs, max_iou=0.5) - - self.acc.update(gt_ids, trk_ids, iou_distance) - - if rtn_events and iou_distance.size > 0 and hasattr(self.acc, - 'mot_events'): - events = self.acc.mot_events # only supported by https://github.com/longcw/py-motmetrics - else: - events = None - return events - - def eval_file(self, result_filename): - # evaluation of each category - gt_frame_dict = read_results( - self.gt_filename, - self.data_type, - is_gt=True, - multi_class=True, - union=False) - result_frame_dict = read_results( - result_filename, - self.data_type, - is_gt=False, - multi_class=True, - union=False) - - for cid in range(self.num_classes): - self.reset_accumulator() - cls_result_frame_dict = result_frame_dict.setdefault(cid, dict()) - cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict()) - - # only labeled frames will be evaluated - frames = sorted(list(set(cls_gt_frame_dict.keys()))) - - for frame_id in frames: - trk_objs = 
cls_result_frame_dict.get(frame_id, []) - gt_objs = cls_gt_frame_dict.get(frame_id, []) - self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False) - - self.class_accs.append(self.acc) - - return self.class_accs - - @staticmethod - def get_summary(accs, - names, - metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', - 'precision', 'recall')): - names = copy.deepcopy(names) - if metrics is None: - metrics = mm.metrics.motchallenge_metrics - metrics = copy.deepcopy(metrics) - - mh = mm.metrics.create() - summary = mh.compute_many( - accs, metrics=metrics, names=names, generate_overall=True) - - return summary - - @staticmethod - def save_summary(summary, filename): - import pandas as pd - writer = pd.ExcelWriter(filename) - summary.to_excel(writer) - writer.save() - - -class MCMOTMetric(Metric): - def __init__(self, num_classes, save_summary=False): - self.num_classes = num_classes - self.save_summary = save_summary - self.MCMOTEvaluator = MCMOTEvaluator - self.result_root = None - self.reset() - - self.seqs_overall = defaultdict(list) - - def reset(self): - self.accs = [] - self.seqs = [] - - def update(self, data_root, seq, data_type, result_root, result_filename): - evaluator = self.MCMOTEvaluator(data_root, seq, data_type, - self.num_classes) - seq_acc = evaluator.eval_file(result_filename) - self.accs.append(seq_acc) - self.seqs.append(seq) - self.result_root = result_root - - cls_index_name = [ - '{}_{}'.format(seq, i) for i in range(self.num_classes) - ] - summary = parse_accs_metrics(seq_acc, cls_index_name) - summary.rename( - index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True) - for row in range(len(summary)): - self.seqs_overall[row].append(summary.iloc[row:row + 1]) - - def accumulate(self): - self.cls_summary_list = [] - for row in range(self.num_classes): - seqs_cls_df = pd.concat(self.seqs_overall[row]) - seqs_cls_summary = seqs_overall_metrics(seqs_cls_df) - cls_summary_overall = seqs_cls_summary.iloc[-1:].copy() - cls_summary_overall.rename( - index={'overall_calc': 'overall_calc_{}'.format(row)}, - inplace=True) - self.cls_summary_list.append(cls_summary_overall) - - def log(self): - seqs_summary = seqs_overall_metrics( - pd.concat(self.seqs_overall[self.num_classes]), verbose=True) - class_summary = seqs_overall_metrics( - pd.concat(self.cls_summary_list), verbose=True) - - def get_results(self): - return 1 diff --git a/pdfdet/models/Paddle/ppdet/metrics/metrics.py b/pdfdet/models/Paddle/ppdet/metrics/metrics.py deleted file mode 100644 index b473509..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/metrics.py +++ /dev/null @@ -1,505 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
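The MCMOT evaluator deleted above drives everything through the motmetrics accumulator API (the same `pip install motmetrics` dependency it checks for). A minimal sketch of that flow with a single fabricated frame:

import numpy as np
import motmetrics as mm

acc = mm.MOTAccumulator(auto_id=True)
# one frame: gt ids [1, 2], tracker ids ['a', 'b'], pairwise IoU distances
acc.update([1, 2], ['a', 'b'], [[0.1, np.nan], [np.nan, 0.3]])
mh = mm.metrics.create()
summary = mh.compute_many([acc], metrics=['mota', 'idf1'],
                          names=['seq0'], generate_overall=True)
print(summary)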
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import json -import paddle -import numpy as np -import typing -from collections import defaultdict -from pathlib import Path - -from .map_utils import prune_zero_padding, DetectionMAP -from .coco_utils import get_infer_results, cocoapi_eval -from .widerface_utils import face_eval_run -from ppdet.data.source.category import get_categories -from ppdet.modeling.rbox_utils import poly2rbox_np - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results', - 'RBoxMetric', 'SNIPERCOCOMetric' -] - -COCO_SIGMAS = np.array([ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, - .89, .89 -]) / 10.0 -CROWD_SIGMAS = np.array( - [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, - .79]) / 10.0 - - -class Metric(paddle.metric.Metric): - def name(self): - return self.__class__.__name__ - - def reset(self): - pass - - def accumulate(self): - pass - - # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate` - # :metch:`reset`, in ppdet, we also need following 2 methods: - - # abstract method for logging metric results - def log(self): - pass - - # abstract method for getting metric results - def get_results(self): - pass - - -class COCOMetric(Metric): - def __init__(self, anno_file, **kwargs): - self.anno_file = anno_file - self.clsid2catid = kwargs.get('clsid2catid', None) - if self.clsid2catid is None: - self.clsid2catid, _ = get_categories('COCO', anno_file) - self.classwise = kwargs.get('classwise', False) - self.output_eval = kwargs.get('output_eval', None) - # TODO: bias should be unified - self.bias = kwargs.get('bias', 0) - self.save_prediction_only = kwargs.get('save_prediction_only', False) - self.iou_type = kwargs.get('IouType', 'bbox') - - if not self.save_prediction_only: - assert os.path.isfile(anno_file), \ - "anno_file {} not a file".format(anno_file) - - if self.output_eval is not None: - Path(self.output_eval).mkdir(exist_ok=True) - - self.reset() - - def reset(self): - # only bbox and mask evaluation support currently - self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} - self.eval_results = {} - - def update(self, inputs, outputs): - outs = {} - # outputs Tensor -> numpy.ndarray - for k, v in outputs.items(): - outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v - - # multi-scale inputs: all inputs have same im_id - if isinstance(inputs, typing.Sequence): - im_id = inputs[0]['im_id'] - else: - im_id = inputs['im_id'] - outs['im_id'] = im_id.numpy() if isinstance(im_id, - paddle.Tensor) else im_id - - infer_results = get_infer_results( - outs, self.clsid2catid, bias=self.bias) - self.results['bbox'] += infer_results[ - 'bbox'] if 'bbox' in infer_results else [] - self.results['mask'] += infer_results[ - 'mask'] if 'mask' in infer_results else [] - self.results['segm'] += infer_results[ - 'segm'] if 'segm' in infer_results else [] - self.results['keypoint'] += infer_results[ - 'keypoint'] if 'keypoint' in infer_results else [] - - def accumulate(self): - if len(self.results['bbox']) > 0: - output = "bbox.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['bbox'], f) - logger.info('The bbox result is saved to bbox.json.') - - if self.save_prediction_only: - logger.info('The bbox result 
is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - bbox_stats = cocoapi_eval( - output, - 'bbox', - anno_file=self.anno_file, - classwise=self.classwise) - self.eval_results['bbox'] = bbox_stats - sys.stdout.flush() - - if len(self.results['mask']) > 0: - output = "mask.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['mask'], f) - logger.info('The mask result is saved to mask.json.') - - if self.save_prediction_only: - logger.info('The mask result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - seg_stats = cocoapi_eval( - output, - 'segm', - anno_file=self.anno_file, - classwise=self.classwise) - self.eval_results['mask'] = seg_stats - sys.stdout.flush() - - if len(self.results['segm']) > 0: - output = "segm.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['segm'], f) - logger.info('The segm result is saved to segm.json.') - - if self.save_prediction_only: - logger.info('The segm result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - seg_stats = cocoapi_eval( - output, - 'segm', - anno_file=self.anno_file, - classwise=self.classwise) - self.eval_results['mask'] = seg_stats - sys.stdout.flush() - - if len(self.results['keypoint']) > 0: - output = "keypoint.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['keypoint'], f) - logger.info('The keypoint result is saved to keypoint.json.') - - if self.save_prediction_only: - logger.info('The keypoint result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - style = 'keypoints' - use_area = True - sigmas = COCO_SIGMAS - if self.iou_type == 'keypoints_crowd': - style = 'keypoints_crowd' - use_area = False - sigmas = CROWD_SIGMAS - keypoint_stats = cocoapi_eval( - output, - style, - anno_file=self.anno_file, - classwise=self.classwise, - sigmas=sigmas, - use_area=use_area) - self.eval_results['keypoint'] = keypoint_stats - sys.stdout.flush() - - def log(self): - pass - - def get_results(self): - return self.eval_results - - -class VOCMetric(Metric): - def __init__(self, - label_list, - class_num=20, - overlap_thresh=0.5, - map_type='11point', - is_bbox_normalized=False, - evaluate_difficult=False, - classwise=False, - output_eval=None, - save_prediction_only=False): - assert os.path.isfile(label_list), \ - "label_list {} not a file".format(label_list) - self.clsid2catid, self.catid2name = get_categories('VOC', label_list) - - self.overlap_thresh = overlap_thresh - self.map_type = map_type - self.evaluate_difficult = evaluate_difficult - self.output_eval = output_eval - self.save_prediction_only = save_prediction_only - self.detection_map = DetectionMAP( - class_num=class_num, - overlap_thresh=overlap_thresh, - map_type=map_type, - is_bbox_normalized=is_bbox_normalized, - evaluate_difficult=evaluate_difficult, - catid2name=self.catid2name, - classwise=classwise) - - self.reset() - - def reset(self): - self.results = {'bbox': [], 'score': [], 'label': []} - self.detection_map.reset() - - def update(self, inputs, outputs): - bbox_np = outputs['bbox'].numpy() if isinstance( - outputs['bbox'], paddle.Tensor) else outputs['bbox'] - bboxes = bbox_np[:, 2:] - scores = bbox_np[:, 1] - labels = bbox_np[:, 0] - bbox_lengths = outputs['bbox_num'].numpy() if isinstance( - outputs['bbox_num'], 
paddle.Tensor) else outputs['bbox_num'] - - self.results['bbox'].append(bboxes.tolist()) - self.results['score'].append(scores.tolist()) - self.results['label'].append(labels.tolist()) - - if bboxes is None or bboxes.shape == (1, 1): - return - if self.save_prediction_only: - return - - gt_boxes = inputs['gt_bbox'] - gt_labels = inputs['gt_class'] - difficults = inputs['difficult'] if not self.evaluate_difficult \ - else None - - if 'scale_factor' in inputs: - scale_factor = inputs['scale_factor'].numpy() if isinstance( - inputs['scale_factor'], - paddle.Tensor) else inputs['scale_factor'] - else: - scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') - - bbox_idx = 0 - for i in range(len(gt_boxes)): - gt_box = gt_boxes[i].numpy() if isinstance( - gt_boxes[i], paddle.Tensor) else gt_boxes[i] - h, w = scale_factor[i] - gt_box = gt_box / np.array([w, h, w, h]) - gt_label = gt_labels[i].numpy() if isinstance( - gt_labels[i], paddle.Tensor) else gt_labels[i] - if difficults is not None: - difficult = difficults[i].numpy() if isinstance( - difficults[i], paddle.Tensor) else difficults[i] - else: - difficult = None - bbox_num = bbox_lengths[i] - bbox = bboxes[bbox_idx:bbox_idx + bbox_num] - score = scores[bbox_idx:bbox_idx + bbox_num] - label = labels[bbox_idx:bbox_idx + bbox_num] - gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, - difficult) - self.detection_map.update(bbox, score, label, gt_box, gt_label, - difficult) - bbox_idx += bbox_num - - def accumulate(self): - output = "bbox.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results, f) - logger.info('The bbox result is saved to bbox.json.') - if self.save_prediction_only: - return - - logger.info("Accumulating evaluation results...") - self.detection_map.accumulate() - - def log(self): - map_stat = 100.
* self.detection_map.get_map() - logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, - self.map_type, map_stat)) - - def get_results(self): - return {'bbox': [self.detection_map.get_map()]} - - -class WiderFaceMetric(Metric): - def __init__(self, image_dir, anno_file, multi_scale=True): - self.image_dir = image_dir - self.anno_file = anno_file - self.multi_scale = multi_scale - self.clsid2catid, self.catid2name = get_categories('widerface') - - def update(self, model): - - face_eval_run( - model, - self.image_dir, - self.anno_file, - pred_dir='output/pred', - eval_mode='widerface', - multi_scale=self.multi_scale) - - -class RBoxMetric(Metric): - def __init__(self, anno_file, **kwargs): - self.anno_file = anno_file - self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file) - self.catid2clsid = {v: k for k, v in self.clsid2catid.items()} - self.classwise = kwargs.get('classwise', False) - self.output_eval = kwargs.get('output_eval', None) - self.save_prediction_only = kwargs.get('save_prediction_only', False) - self.overlap_thresh = kwargs.get('overlap_thresh', 0.5) - self.map_type = kwargs.get('map_type', '11point') - self.evaluate_difficult = kwargs.get('evaluate_difficult', False) - self.imid2path = kwargs.get('imid2path', None) - class_num = len(self.catid2name) - self.detection_map = DetectionMAP( - class_num=class_num, - overlap_thresh=self.overlap_thresh, - map_type=self.map_type, - is_bbox_normalized=False, - evaluate_difficult=self.evaluate_difficult, - catid2name=self.catid2name, - classwise=self.classwise) - - self.reset() - - def reset(self): - self.results = [] - self.detection_map.reset() - - def update(self, inputs, outputs): - outs = {} - # outputs Tensor -> numpy.ndarray - for k, v in outputs.items(): - outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v - - im_id = inputs['im_id'] - im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id - outs['im_id'] = im_id - - infer_results = get_infer_results(outs, self.clsid2catid) - infer_results = infer_results['bbox'] if 'bbox' in infer_results else [] - self.results += infer_results - if self.save_prediction_only: - return - - gt_boxes = inputs['gt_poly'] - gt_labels = inputs['gt_class'] - - if 'scale_factor' in inputs: - scale_factor = inputs['scale_factor'].numpy() if isinstance( - inputs['scale_factor'], - paddle.Tensor) else inputs['scale_factor'] - else: - scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') - - for i in range(len(gt_boxes)): - gt_box = gt_boxes[i].numpy() if isinstance( - gt_boxes[i], paddle.Tensor) else gt_boxes[i] - h, w = scale_factor[i] - gt_box = gt_box / np.array([w, h, w, h, w, h, w, h]) - gt_label = gt_labels[i].numpy() if isinstance( - gt_labels[i], paddle.Tensor) else gt_labels[i] - gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label) - bbox = [ - res['bbox'] for res in infer_results - if int(res['image_id']) == int(im_id[i]) - ] - score = [ - res['score'] for res in infer_results - if int(res['image_id']) == int(im_id[i]) - ] - label = [ - self.catid2clsid[int(res['category_id'])] - for res in infer_results - if int(res['image_id']) == int(im_id[i]) - ] - self.detection_map.update(bbox, score, label, gt_box, gt_label) - - def save_results(self, results, output_dir, imid2path): - if imid2path: - data_dicts = defaultdict(list) - for result in results: - image_id = result['image_id'] - data_dicts[image_id].append(result) - - for image_id, image_path in imid2path.items(): - basename = os.path.splitext(os.path.split(image_path)[-1])[0] 
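VOCMetric and RBoxMetric above both divide ground-truth boxes by the per-image scale_factor so that predictions and ground truth are compared at the same resolution. A toy numpy illustration of that convention, with fabricated values:

import numpy as np

scale_factor = np.array([2.0, 1.5])            # (h_scale, w_scale) from preprocessing
gt_box = np.array([[150., 100., 300., 200.]])  # xyxy at network-input resolution
h, w = scale_factor
print(gt_box / np.array([w, h, w, h]))         # [[100.  50. 200. 100.]]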
- output = os.path.join(output_dir, "{}.txt".format(basename)) - dets = data_dicts.get(image_id, []) - with open(output, 'w') as f: - for det in dets: - catid, bbox, score = det['category_id'], det[ - 'bbox'], det['score'] - bbox_pred = '{} {} '.format(self.catid2name[catid], - score) + ' '.join( - [str(e) for e in bbox]) - f.write(bbox_pred + '\n') - - logger.info('The bbox result is saved to {}.'.format(output_dir)) - else: - output = os.path.join(output_dir, "bbox.json") - with open(output, 'w') as f: - json.dump(results, f) - - logger.info('The bbox result is saved to {}.'.format(output)) - - def accumulate(self): - if self.output_eval: - self.save_results(self.results, self.output_eval, self.imid2path) - - if not self.save_prediction_only: - logger.info("Accumulating evaluatation results...") - self.detection_map.accumulate() - - def log(self): - map_stat = 100. * self.detection_map.get_map() - logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, - self.map_type, map_stat)) - - def get_results(self): - return {'bbox': [self.detection_map.get_map()]} - - -class SNIPERCOCOMetric(COCOMetric): - def __init__(self, anno_file, **kwargs): - super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs) - self.dataset = kwargs["dataset"] - self.chip_results = [] - - def reset(self): - # only bbox and mask evaluation support currently - self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} - self.eval_results = {} - self.chip_results = [] - - def update(self, inputs, outputs): - outs = {} - # outputs Tensor -> numpy.ndarray - for k, v in outputs.items(): - outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v - - im_id = inputs['im_id'] - outs['im_id'] = im_id.numpy() if isinstance(im_id, - paddle.Tensor) else im_id - - self.chip_results.append(outs) - - def accumulate(self): - results = self.dataset.anno_cropper.aggregate_chips_detections( - self.chip_results) - for outs in results: - infer_results = get_infer_results( - outs, self.clsid2catid, bias=self.bias) - self.results['bbox'] += infer_results[ - 'bbox'] if 'bbox' in infer_results else [] - - super(SNIPERCOCOMetric, self).accumulate() diff --git a/pdfdet/models/Paddle/ppdet/metrics/mot_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/mot_metrics.py deleted file mode 100644 index f61ae9c..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/mot_metrics.py +++ /dev/null @@ -1,1243 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
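The Metric subclasses deleted above (COCOMetric, VOCMetric, RBoxMetric, SNIPERCOCOMetric) all follow the same reset/update/accumulate/log/get_results lifecycle. A hedged sketch of the driver loop; the annotation path and the eval_batches iterable are hypothetical, not a verbatim ppdet API:

metric = COCOMetric(anno_file='annotations/instances_val.json')  # hypothetical path
metric.reset()
for inputs, outputs in eval_batches:  # hypothetical iterable of (inputs, model outputs)
    metric.update(inputs, outputs)
metric.accumulate()   # e.g. writes bbox.json and runs the COCO evaluation
metric.log()
results = metric.get_results()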
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import copy -import sys -import math -from collections import defaultdict -import numpy as np - -from ppdet.modeling.bbox_utils import bbox_iou_np_expand -from .map_utils import ap_per_class -from .metrics import Metric -from .munkres import Munkres - -try: - import motmetrics as mm - mm.lap.default_solver = 'lap' -except: - pass - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric'] - - -def read_mot_results(filename, is_gt=False, is_ignore=False): - valid_label = [1] - ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16' - if is_gt: - logger.info( - "In MOT16/17 dataset the valid_label of ground truth is '{}', " - "in other dataset it should be '0' for single classs MOT.".format( - valid_label[0])) - results_dict = dict() - if os.path.isfile(filename): - with open(filename, 'r') as f: - for line in f.readlines(): - linelist = line.split(',') - if len(linelist) < 7: - continue - fid = int(linelist[0]) - if fid < 1: - continue - results_dict.setdefault(fid, list()) - - if is_gt: - label = int(float(linelist[7])) - mark = int(float(linelist[6])) - if mark == 0 or label not in valid_label: - continue - score = 1 - elif is_ignore: - if 'MOT16-' in filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename: - label = int(float(linelist[7])) - vis_ratio = float(linelist[8]) - if label not in ignore_labels and vis_ratio >= 0: - continue - else: - continue - score = 1 - else: - score = float(linelist[6]) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - - results_dict[fid].append((tlwh, target_id, score)) - return results_dict - - -""" -MOT dataset label list, see in https://motchallenge.net -labels={'ped', ... % 1 - 'person_on_vhcl', ... % 2 - 'car', ... % 3 - 'bicycle', ... % 4 - 'mbike', ... % 5 - 'non_mot_vhcl', ... % 6 - 'static_person', ... % 7 - 'distractor', ... % 8 - 'occluder', ... % 9 - 'occluder_on_grnd', ... % 10 - 'occluder_full', ... % 11 - 'reflection', ... % 12 - 'crowd' ... % 13 -}; -""" - - -def unzip_objs(objs): - if len(objs) > 0: - tlwhs, ids, scores = zip(*objs) - else: - tlwhs, ids, scores = [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - return tlwhs, ids, scores - - -class MOTEvaluator(object): - def __init__(self, data_root, seq_name, data_type): - self.data_root = data_root - self.seq_name = seq_name - self.data_type = data_type - - self.load_annotations() - try: - import motmetrics as mm - mm.lap.default_solver = 'lap' - except Exception as e: - raise RuntimeError( - 'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' - ) - self.reset_accumulator() - - def load_annotations(self): - assert self.data_type == 'mot' - gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', - 'gt.txt') - if not os.path.exists(gt_filename): - logger.warning( - "gt_filename '{}' of MOTEvaluator is not exist, so the MOTA will be -INF." 
- ) - self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True) - self.gt_ignore_frame_dict = read_mot_results( - gt_filename, is_ignore=True) - - def reset_accumulator(self): - self.acc = mm.MOTAccumulator(auto_id=True) - - def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): - # results - trk_tlwhs = np.copy(trk_tlwhs) - trk_ids = np.copy(trk_ids) - - # gts - gt_objs = self.gt_frame_dict.get(frame_id, []) - gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] - - # ignore boxes - ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) - ignore_tlwhs = unzip_objs(ignore_objs)[0] - - # remove ignored results - keep = np.ones(len(trk_tlwhs), dtype=bool) - iou_distance = mm.distances.iou_matrix( - ignore_tlwhs, trk_tlwhs, max_iou=0.5) - if len(iou_distance) > 0: - match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) - match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) - match_ious = iou_distance[match_is, match_js] - - match_js = np.asarray(match_js, dtype=int) - match_js = match_js[np.logical_not(np.isnan(match_ious))] - keep[match_js] = False - trk_tlwhs = trk_tlwhs[keep] - trk_ids = trk_ids[keep] - - # get distance matrix - iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) - - # acc - self.acc.update(gt_ids, trk_ids, iou_distance) - - if rtn_events and iou_distance.size > 0 and hasattr(self.acc, - 'last_mot_events'): - events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics - else: - events = None - return events - - def eval_file(self, filename): - self.reset_accumulator() - - result_frame_dict = read_mot_results(filename, is_gt=False) - frames = sorted(list(set(result_frame_dict.keys()))) - for frame_id in frames: - trk_objs = result_frame_dict.get(frame_id, []) - trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] - self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) - - return self.acc - - @staticmethod - def get_summary(accs, - names, - metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', - 'precision', 'recall')): - names = copy.deepcopy(names) - if metrics is None: - metrics = mm.metrics.motchallenge_metrics - metrics = copy.deepcopy(metrics) - - mh = mm.metrics.create() - summary = mh.compute_many( - accs, metrics=metrics, names=names, generate_overall=True) - return summary - - @staticmethod - def save_summary(summary, filename): - import pandas as pd - writer = pd.ExcelWriter(filename) - summary.to_excel(writer) - writer.save() - - -class MOTMetric(Metric): - def __init__(self, save_summary=False): - self.save_summary = save_summary - self.MOTEvaluator = MOTEvaluator - self.result_root = None - self.reset() - - def reset(self): - self.accs = [] - self.seqs = [] - - def update(self, data_root, seq, data_type, result_root, result_filename): - evaluator = self.MOTEvaluator(data_root, seq, data_type) - self.accs.append(evaluator.eval_file(result_filename)) - self.seqs.append(seq) - self.result_root = result_root - - def accumulate(self): - metrics = mm.metrics.motchallenge_metrics - mh = mm.metrics.create() - summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics) - self.strsummary = mm.io.render_summary( - summary, - formatters=mh.formatters, - namemap=mm.io.motchallenge_metric_names) - if self.save_summary: - self.MOTEvaluator.save_summary( - summary, os.path.join(self.result_root, 'summary.xlsx')) - - def log(self): - print(self.strsummary) - - def get_results(self): - return self.strsummary - - -class JDEDetMetric(Metric): - # Note this 
detection AP metric is different from COCOMetric or VOCMetric, - # and the bbox coordinates are not scaled to the original image - def __init__(self, overlap_thresh=0.5): - self.overlap_thresh = overlap_thresh - self.reset() - - def reset(self): - self.AP_accum = np.zeros(1) - self.AP_accum_count = np.zeros(1) - - def update(self, inputs, outputs): - bboxes = outputs['bbox'][:, 2:].numpy() - scores = outputs['bbox'][:, 1].numpy() - labels = outputs['bbox'][:, 0].numpy() - bbox_lengths = outputs['bbox_num'].numpy() - if bboxes.shape[0] == 1 and bboxes.sum() == 0.0: - return - - gt_boxes = inputs['gt_bbox'].numpy()[0] - gt_labels = inputs['gt_class'].numpy()[0] - if gt_labels.shape[0] == 0: - return - - correct = [] - detected = [] - for i in range(bboxes.shape[0]): - obj_pred = 0 - pred_bbox = bboxes[i].reshape(1, 4) - # Compute iou with target boxes - iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0] - # Extract index of largest overlap - best_i = np.argmax(iou) - # If overlap exceeds threshold and classification is correct mark as correct - if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[ - best_i] and best_i not in detected: - correct.append(1) - detected.append(best_i) - else: - correct.append(0) - - # Compute Average Precision (AP) per class - target_cls = list(gt_labels.T[0]) - AP, AP_class, R, P = ap_per_class( - tp=correct, - conf=scores, - pred_cls=np.zeros_like(scores), - target_cls=target_cls) - self.AP_accum_count += np.bincount(AP_class, minlength=1) - self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP) - - def accumulate(self): - logger.info("Accumulating evaluation results...") - self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16) - - def log(self): - map_stat = 100. * self.map_stat - logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh, - map_stat)) - - def get_results(self): - return self.map_stat - - -""" -The following code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py -""" - - -class tData: - """ - Utility class to load data. - """ - def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\ - obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\ - X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1): - """ - Constructor, initializes the object given the parameters.
- """ - self.frame = frame - self.track_id = track_id - self.obj_type = obj_type - self.truncation = truncation - self.occlusion = occlusion - self.obs_angle = obs_angle - self.x1 = x1 - self.y1 = y1 - self.x2 = x2 - self.y2 = y2 - self.w = w - self.h = h - self.l = l - self.X = X - self.Y = Y - self.Z = Z - self.yaw = yaw - self.score = score - self.ignored = False - self.valid = False - self.tracker = -1 - - def __str__(self): - attrs = vars(self) - return '\n'.join("%s: %s" % item for item in attrs.items()) - - -class KITTIEvaluation(object): - """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall) - MOTA - Multi-object tracking accuracy in [0,100] - MOTP - Multi-object tracking precision in [0,100] (3D) / [td,100] (2D) - MOTAL - Multi-object tracking accuracy in [0,100] with log10(id-switches) - - id-switches - number of id switches - fragments - number of fragmentations - - MT, PT, ML - number of mostly tracked, partially tracked and mostly lost trajectories - - recall - recall = percentage of detected targets - precision - precision = percentage of correctly detected targets - FAR - number of false alarms per frame - falsepositives - number of false positives (FP) - missed - number of missed targets (FN) - """ - def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\ - min_height = 25, max_occlusion = 2, cls="car",\ - n_frames=[], seqs=[], n_sequences=0): - # get number of sequences and - # get number of frames per sequence from test mapping - # (created while extracting the benchmark) - self.gt_path = os.path.join(gt_path, "../labels") - self.n_frames = n_frames - self.sequence_name = seqs - self.n_sequences = n_sequences - - self.cls = cls # class to evaluate, i.e. pedestrian or car - - self.result_path = result_path - - # statistics and numbers for evaluation - self.n_gt = 0 # number of ground truth detections minus ignored false negatives and true positives - self.n_igt = 0 # number of ignored ground truth detections - self.n_gts = [ - ] # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE - self.n_igts = [ - ] # number of ground ignored truth detections PER SEQUENCE - self.n_gt_trajectories = 0 - self.n_gt_seq = [] - self.n_tr = 0 # number of tracker detections minus ignored tracker detections - self.n_trs = [ - ] # number of tracker detections minus ignored tracker detections PER SEQUENCE - self.n_itr = 0 # number of ignored tracker detections - self.n_itrs = [] # number of ignored tracker detections PER SEQUENCE - self.n_igttr = 0 # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored - self.n_tr_trajectories = 0 - self.n_tr_seq = [] - self.MOTA = 0 - self.MOTP = 0 - self.MOTAL = 0 - self.MODA = 0 - self.MODP = 0 - self.MODP_t = [] - self.recall = 0 - self.precision = 0 - self.F1 = 0 - self.FAR = 0 - self.total_cost = 0 - self.itp = 0 # number of ignored true positives - self.itps = [] # number of ignored true positives PER SEQUENCE - self.tp = 0 # number of true positives including ignored true positives! 
- self.tps = [ - ] # number of true positives including ignored true positives PER SEQUENCE - self.fn = 0 # number of false negatives WITHOUT ignored false negatives - self.fns = [ - ] # number of false negatives WITHOUT ignored false negatives PER SEQUENCE - self.ifn = 0 # number of ignored false negatives - self.ifns = [] # number of ignored false negatives PER SEQUENCE - self.fp = 0 # number of false positives - # a bit tricky, the number of ignored false negatives and ignored true positives - # is subtracted, but if both tracker detection and ground truth detection - # are ignored this number is added again to avoid double counting - self.fps = [] # above PER SEQUENCE - self.mme = 0 - self.fragments = 0 - self.id_switches = 0 - self.MT = 0 - self.PT = 0 - self.ML = 0 - - self.min_overlap = min_overlap # minimum bounding box overlap for 3rd party metrics - self.max_truncation = max_truncation # maximum truncation of an object for evaluation - self.max_occlusion = max_occlusion # maximum occlusion of an object for evaluation - self.min_height = min_height # minimum height of an object for evaluation - self.n_sample_points = 500 - - # this should be enough to hold all groundtruth trajectories - # is expanded if necessary and reduced in any case - self.gt_trajectories = [[] for x in range(self.n_sequences)] - self.ign_trajectories = [[] for x in range(self.n_sequences)] - - def loadGroundtruth(self): - try: - self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True) - except IOError: - return False - return True - - def loadTracker(self): - try: - if not self._loadData( - self.result_path, cls=self.cls, loading_groundtruth=False): - return False - except IOError: - return False - return True - - def _loadData(self, - root_dir, - cls, - min_score=-1000, - loading_groundtruth=False): - """ - Generic loader for ground truth and tracking data. - Use loadGroundtruth() or loadTracker() to load this data. - Loads detections in KITTI format from textfiles. - """ - # construct objectDetections object to hold detection data - t_data = tData() - data = [] - eval_2d = True - eval_3d = True - - seq_data = [] - n_trajectories = 0 - n_trajectories_seq = [] - for seq, s_name in enumerate(self.sequence_name): - i = 0 - filename = os.path.join(root_dir, "%s.txt" % s_name) - f = open(filename, "r") - - f_data = [ - [] for x in range(self.n_frames[seq]) - ] # current set has only 1059 entries, sufficient length is checked anyway - ids = [] - n_in_seq = 0 - id_frame_cache = [] - for line in f: - # KITTI tracking benchmark data format: - # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry) - line = line.strip() - fields = line.split(" ") - # classes that should be loaded (ignored neighboring classes) - if "car" in cls.lower(): - classes = ["car", "van"] - elif "pedestrian" in cls.lower(): - classes = ["pedestrian", "person_sitting"] - else: - classes = [cls.lower()] - classes += ["dontcare"] - if not any([s for s in classes if s in fields[2].lower()]): - continue - # get fields from table - t_data.frame = int(float(fields[0])) # frame - t_data.track_id = int(float(fields[1])) # id - t_data.obj_type = fields[ - 2].lower() # object type [car, pedestrian, cyclist, ...] 
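The field parsing above follows the KITTI tracking benchmark layout: frame, track id, object type, truncation, occlusion, alpha, 2D box (x1, y1, x2, y2), 3D dimensions (h, w, l), location (X, Y, Z), yaw, and an optional trailing score. A minimal sketch on a fabricated line:

line = "0 2 car 0 0 -1.57 599.41 156.40 629.75 189.25 1.52 1.64 3.69 2.57 1.57 9.71 -1.56"
fields = line.strip().split(" ")
frame, track_id = int(float(fields[0])), int(float(fields[1]))
obj_type = fields[2].lower()
x1, y1, x2, y2 = map(float, fields[6:10])   # 2D box [px]
h, w, l = map(float, fields[10:13])         # 3D dimensions [m]
print(frame, track_id, obj_type, (x1, y1, x2, y2), (h, w, l))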
- t_data.truncation = int( - float(fields[3])) # truncation [-1,0,1,2] - t_data.occlusion = int( - float(fields[4])) # occlusion [-1,0,1,2] - t_data.obs_angle = float(fields[5]) # observation angle [rad] - t_data.x1 = float(fields[6]) # left [px] - t_data.y1 = float(fields[7]) # top [px] - t_data.x2 = float(fields[8]) # right [px] - t_data.y2 = float(fields[9]) # bottom [px] - t_data.h = float(fields[10]) # height [m] - t_data.w = float(fields[11]) # width [m] - t_data.l = float(fields[12]) # length [m] - t_data.X = float(fields[13]) # X [m] - t_data.Y = float(fields[14]) # Y [m] - t_data.Z = float(fields[15]) # Z [m] - t_data.yaw = float(fields[16]) # yaw angle [rad] - if not loading_groundtruth: - if len(fields) == 17: - t_data.score = -1 - elif len(fields) == 18: - t_data.score = float(fields[17]) # detection score - else: - logger.info("file is not in KITTI format") - return - - # do not consider objects marked as invalid - if t_data.track_id is -1 and t_data.obj_type != "dontcare": - continue - - idx = t_data.frame - # check if length for frame data is sufficient - if idx >= len(f_data): - print("extend f_data", idx, len(f_data)) - f_data += [[] for x in range(max(500, idx - len(f_data)))] - try: - id_frame = (t_data.frame, t_data.track_id) - if id_frame in id_frame_cache and not loading_groundtruth: - logger.info( - "track ids are not unique for sequence %d: frame %d" - % (seq, t_data.frame)) - logger.info( - "track id %d occurred at least twice for this frame" - % t_data.track_id) - logger.info("Exiting...") - #continue # this allows to evaluate non-unique result files - return False - id_frame_cache.append(id_frame) - f_data[t_data.frame].append(copy.copy(t_data)) - except: - print(len(f_data), idx) - raise - - if t_data.track_id not in ids and t_data.obj_type != "dontcare": - ids.append(t_data.track_id) - n_trajectories += 1 - n_in_seq += 1 - - # check if uploaded data provides information for 2D and 3D evaluation - if not loading_groundtruth and eval_2d is True and ( - t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or - t_data.y2 == -1): - eval_2d = False - if not loading_groundtruth and eval_3d is True and ( - t_data.X == -1000 or t_data.Y == -1000 or - t_data.Z == -1000): - eval_3d = False - - # only add existing frames - n_trajectories_seq.append(n_in_seq) - seq_data.append(f_data) - f.close() - - if not loading_groundtruth: - self.tracker = seq_data - self.n_tr_trajectories = n_trajectories - self.eval_2d = eval_2d - self.eval_3d = eval_3d - self.n_tr_seq = n_trajectories_seq - if self.n_tr_trajectories == 0: - return False - else: - # split ground truth and DontCare areas - self.dcareas = [] - self.groundtruth = [] - for seq_idx in range(len(seq_data)): - seq_gt = seq_data[seq_idx] - s_g, s_dc = [], [] - for f in range(len(seq_gt)): - all_gt = seq_gt[f] - g, dc = [], [] - for gg in all_gt: - if gg.obj_type == "dontcare": - dc.append(gg) - else: - g.append(gg) - s_g.append(g) - s_dc.append(dc) - self.dcareas.append(s_dc) - self.groundtruth.append(s_g) - self.n_gt_seq = n_trajectories_seq - self.n_gt_trajectories = n_trajectories - return True - - def boxoverlap(self, a, b, criterion="union"): - """ - boxoverlap computes intersection over union for bbox a and b in KITTI format. - If the criterion is 'union', overlap = (a inter b) / a union b). - If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area. 
- """ - x1 = max(a.x1, b.x1) - y1 = max(a.y1, b.y1) - x2 = min(a.x2, b.x2) - y2 = min(a.y2, b.y2) - - w = x2 - x1 - h = y2 - y1 - - if w <= 0. or h <= 0.: - return 0. - inter = w * h - aarea = (a.x2 - a.x1) * (a.y2 - a.y1) - barea = (b.x2 - b.x1) * (b.y2 - b.y1) - # intersection over union overlap - if criterion.lower() == "union": - o = inter / float(aarea + barea - inter) - elif criterion.lower() == "a": - o = float(inter) / float(aarea) - else: - raise TypeError("Unkown type for criterion") - return o - - def compute3rdPartyMetrics(self): - """ - Computes the metrics defined in - - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance: The CLEAR MOT Metrics - MOTA, MOTAL, MOTP - - Nevatia 2008: Global Data Association for Multi-Object Tracking Using Network Flows - MT/PT/ML - """ - # construct Munkres object for Hungarian Method association - hm = Munkres() - max_cost = 1e9 - - # go through all frames and associate ground truth and tracker results - # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections - fr, ids = 0, 0 - for seq_idx in range(len(self.groundtruth)): - seq_gt = self.groundtruth[seq_idx] - seq_dc = self.dcareas[seq_idx] # don't care areas - seq_tracker = self.tracker[seq_idx] - seq_trajectories = defaultdict(list) - seq_ignored = defaultdict(list) - - # statistics over the current sequence, check the corresponding - # variable comments in __init__ to get their meaning - seqtp = 0 - seqitp = 0 - seqfn = 0 - seqifn = 0 - seqfp = 0 - seqigt = 0 - seqitr = 0 - - last_ids = [[], []] - n_gts = 0 - n_trs = 0 - - for f in range(len(seq_gt)): - g = seq_gt[f] - dc = seq_dc[f] - - t = seq_tracker[f] - # counting total number of ground truth and tracker objects - self.n_gt += len(g) - self.n_tr += len(t) - - n_gts += len(g) - n_trs += len(t) - - # use hungarian method to associate, using boxoverlap 0..1 as cost - # build cost matrix - cost_matrix = [] - this_ids = [[], []] - for gg in g: - # save current ids - this_ids[0].append(gg.track_id) - this_ids[1].append(-1) - gg.tracker = -1 - gg.id_switch = 0 - gg.fragmentation = 0 - cost_row = [] - for tt in t: - # overlap == 1 is cost ==0 - c = 1 - self.boxoverlap(gg, tt) - # gating for boxoverlap - if c <= self.min_overlap: - cost_row.append(c) - else: - cost_row.append(max_cost) # = 1e9 - cost_matrix.append(cost_row) - # all ground truth trajectories are initially not associated - # extend groundtruth trajectories lists (merge lists) - seq_trajectories[gg.track_id].append(-1) - seq_ignored[gg.track_id].append(False) - - if len(g) is 0: - cost_matrix = [[]] - # associate - association_matrix = hm.compute(cost_matrix) - - # tmp variables for sanity checks and MODP computation - tmptp = 0 - tmpfp = 0 - tmpfn = 0 - tmpc = 0 # this will sum up the overlaps for all true positives - tmpcs = [0] * len( - g) # this will save the overlaps for all true positives - # the reason is that some true positives might be ignored - # later such that the corrsponding overlaps can - # be subtracted from tmpc for MODP computation - - # mapping for tracker ids and ground truth ids - for row, col in association_matrix: - # apply gating on boxoverlap - c = cost_matrix[row][col] - if c < max_cost: - g[row].tracker = t[col].track_id - this_ids[1][row] = t[col].track_id - t[col].valid = True - g[row].distance = c - self.total_cost += 1 - c - tmpc += 1 - c - tmpcs[row] = 1 - c - seq_trajectories[g[row].track_id][-1] = t[col].track_id - - # true positives are only valid associations - self.tp += 1 - 
tmptp += 1 - else: - g[row].tracker = -1 - self.fn += 1 - tmpfn += 1 - - # associate tracker and DontCare areas - # ignore tracker in neighboring classes - nignoredtracker = 0 # number of ignored tracker detections - ignoredtrackers = dict() # will associate the track_id with -1 - # if it is not ignored and 1 if it is - # ignored; - # this is used to avoid double counting ignored - # cases, see the next loop - - for tt in t: - ignoredtrackers[tt.track_id] = -1 - # ignore detection if it belongs to a neighboring class or is - # smaller or equal to the minimum height - - tt_height = abs(tt.y1 - tt.y2) - if ((self.cls == "car" and tt.obj_type == "van") or - (self.cls == "pedestrian" and - tt.obj_type == "person_sitting") or - tt_height <= self.min_height) and not tt.valid: - nignoredtracker += 1 - tt.ignored = True - ignoredtrackers[tt.track_id] = 1 - continue - for d in dc: - overlap = self.boxoverlap(tt, d, "a") - if overlap > 0.5 and not tt.valid: - tt.ignored = True - nignoredtracker += 1 - ignoredtrackers[tt.track_id] = 1 - break - - # check for ignored FN/TP (truncation or neighboring object class) - ignoredfn = 0 # the number of ignored false negatives - nignoredtp = 0 # the number of ignored true positives - nignoredpairs = 0 # the number of ignored pairs, i.e. a true positive - # which is ignored but where the associated tracker - # detection has already been ignored - - gi = 0 - for gg in g: - if gg.tracker < 0: - if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\ - or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"): - seq_ignored[gg.track_id][-1] = True - gg.ignored = True - ignoredfn += 1 - - elif gg.tracker >= 0: - if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\ - or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"): - - seq_ignored[gg.track_id][-1] = True - gg.ignored = True - nignoredtp += 1 - - # if the associated tracker detection is already ignored, - # we want to avoid double counting ignored detections - if ignoredtrackers[gg.tracker] > 0: - nignoredpairs += 1 - - # for computing MODP, the overlaps from ignored detections - # are subtracted - tmpc -= tmpcs[gi] - gi += 1 - - # the below might be confusion, check the comments in __init__ - # to see what the individual statistics represent - - # correct TP by number of ignored TP due to truncation - # ignored TP are shown as tracked in visualization - tmptp -= nignoredtp - - # count the number of ignored true positives - self.itp += nignoredtp - - # adjust the number of ground truth objects considered - self.n_gt -= (ignoredfn + nignoredtp) - - # count the number of ignored ground truth objects - self.n_igt += ignoredfn + nignoredtp - - # count the number of ignored tracker objects - self.n_itr += nignoredtracker - - # count the number of ignored pairs, i.e. 
associated tracker and - # ground truth objects that are both ignored - self.n_igttr += nignoredpairs - - # false negatives = associated gt bboxes exceding association threshold + non-associated gt bboxes - tmpfn += len(g) - len(association_matrix) - ignoredfn - self.fn += len(g) - len(association_matrix) - ignoredfn - self.ifn += ignoredfn - - # false positives = tracker bboxes - associated tracker bboxes - # mismatches (mme_t) - tmpfp += len( - t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs - self.fp += len( - t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs - - # update sequence data - seqtp += tmptp - seqitp += nignoredtp - seqfp += tmpfp - seqfn += tmpfn - seqifn += ignoredfn - seqigt += ignoredfn + nignoredtp - seqitr += nignoredtracker - - # sanity checks - # - the number of true positives minues ignored true positives - # should be greater or equal to 0 - # - the number of false negatives should be greater or equal to 0 - # - the number of false positives needs to be greater or equal to 0 - # otherwise ignored detections might be counted double - # - the number of counted true positives (plus ignored ones) - # and the number of counted false negatives (plus ignored ones) - # should match the total number of ground truth objects - # - the number of counted true positives (plus ignored ones) - # and the number of counted false positives - # plus the number of ignored tracker detections should - # match the total number of tracker detections; note that - # nignoredpairs is subtracted here to avoid double counting - # of ignored detection sin nignoredtp and nignoredtracker - if tmptp < 0: - print(tmptp, nignoredtp) - raise NameError("Something went wrong! TP is negative") - if tmpfn < 0: - print(tmpfn, - len(g), - len(association_matrix), ignoredfn, nignoredpairs) - raise NameError("Something went wrong! FN is negative") - if tmpfp < 0: - print(tmpfp, - len(t), tmptp, nignoredtracker, nignoredtp, - nignoredpairs) - raise NameError("Something went wrong! FP is negative") - if tmptp + tmpfn is not len(g) - ignoredfn - nignoredtp: - print("seqidx", seq_idx) - print("frame ", f) - print("TP ", tmptp) - print("FN ", tmpfn) - print("FP ", tmpfp) - print("nGT ", len(g)) - print("nAss ", len(association_matrix)) - print("ign GT", ignoredfn) - print("ign TP", nignoredtp) - raise NameError( - "Something went wrong! nGroundtruth is not TP+FN") - if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs is not len( - t): - print(seq_idx, f, len(t), tmptp, tmpfp) - print(len(association_matrix), association_matrix) - raise NameError( - "Something went wrong! nTracker is not TP+FP") - - # check for id switches or fragmentations - for i, tt in enumerate(this_ids[0]): - if tt in last_ids[0]: - idx = last_ids[0].index(tt) - tid = this_ids[1][i] - lid = last_ids[1][idx] - if tid != lid and lid != -1 and tid != -1: - if g[i].truncation < self.max_truncation: - g[i].id_switch = 1 - ids += 1 - if tid != lid and lid != -1: - if g[i].truncation < self.max_truncation: - g[i].fragmentation = 1 - fr += 1 - - # save current index - last_ids = this_ids - # compute MOTP_t - MODP_t = 1 - if tmptp != 0: - MODP_t = tmpc / float(tmptp) - self.MODP_t.append(MODP_t) - - # remove empty lists for current gt trajectories - self.gt_trajectories[seq_idx] = seq_trajectories - self.ign_trajectories[seq_idx] = seq_ignored - - # gather statistics for "per sequence" statistics. 
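The id-switch bookkeeping above compares each ground-truth target's matched tracker id against its last non-ignored assignment. A simplified toy version of that counting; the real loop additionally checks truncation and the previous frame's assignment:

g = [5, 5, -1, 5, 7, 7]   # tracker id matched to one gt target per frame; -1 = missed
last_id, switches = g[0], 0
for f in range(1, len(g)):
    if g[f] != -1 and last_id != -1 and g[f] != last_id:
        switches += 1
    if g[f] != -1:
        last_id = g[f]
print(switches)  # 1: the 5 -> 7 change; the miss at frame 2 is not a switch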
- self.n_gts.append(n_gts) - self.n_trs.append(n_trs) - self.tps.append(seqtp) - self.itps.append(seqitp) - self.fps.append(seqfp) - self.fns.append(seqfn) - self.ifns.append(seqifn) - self.n_igts.append(seqigt) - self.n_itrs.append(seqitr) - - # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories - n_ignored_tr_total = 0 - for seq_idx, ( - seq_trajectories, seq_ignored - ) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)): - if len(seq_trajectories) == 0: - continue - tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5 - n_ignored_tr = 0 - for g, ign_g in zip(seq_trajectories.values(), - seq_ignored.values()): - # all frames of this gt trajectory are ignored - if all(ign_g): - n_ignored_tr += 1 - n_ignored_tr_total += 1 - continue - # all frames of this gt trajectory are not assigned to any detections - if all([this == -1 for this in g]): - tmpML += 1 - self.ML += 1 - continue - # compute tracked frames in trajectory - last_id = g[0] - # first detection (necessary to be in gt_trajectories) is always tracked - tracked = 1 if g[0] >= 0 else 0 - lgt = 0 if ign_g[0] else 1 - for f in range(1, len(g)): - if ign_g[f]: - last_id = -1 - continue - lgt += 1 - if last_id != g[f] and last_id != -1 and g[f] != -1 and g[ - f - 1] != -1: - tmpId_switches += 1 - self.id_switches += 1 - if f < len(g) - 1 and g[f - 1] != g[ - f] and last_id != -1 and g[f] != -1 and g[f + - 1] != -1: - tmpFragments += 1 - self.fragments += 1 - if g[f] != -1: - tracked += 1 - last_id = g[f] - # handle last frame; tracked state is handled in for loop (g[f]!=-1) - if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[ - f] != -1 and not ign_g[f]: - tmpFragments += 1 - self.fragments += 1 - - # compute MT/PT/ML - tracking_ratio = tracked / float(len(g) - sum(ign_g)) - if tracking_ratio > 0.8: - tmpMT += 1 - self.MT += 1 - elif tracking_ratio < 0.2: - tmpML += 1 - self.ML += 1 - else: # 0.2 <= tracking_ratio <= 0.8 - tmpPT += 1 - self.PT += 1 - - if (self.n_gt_trajectories - n_ignored_tr_total) == 0: - self.MT = 0. - self.PT = 0. - self.ML = 0. - else: - self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total) - self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total) - self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total) - - # precision/recall etc. - if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0: - self.recall = 0. - self.precision = 0. - else: - self.recall = self.tp / float(self.tp + self.fn) - self.precision = self.tp / float(self.fp + self.tp) - if (self.recall + self.precision) == 0: - self.F1 = 0. - else: - self.F1 = 2. 
* (self.precision * self.recall) / ( - self.precision + self.recall) - if sum(self.n_frames) == 0: - self.FAR = "n/a" - else: - self.FAR = self.fp / float(sum(self.n_frames)) - - # compute CLEARMOT - if self.n_gt == 0: - self.MOTA = -float("inf") - self.MODA = -float("inf") - else: - self.MOTA = 1 - (self.fn + self.fp + self.id_switches - ) / float(self.n_gt) - self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt) - if self.tp == 0: - self.MOTP = float("inf") - else: - self.MOTP = self.total_cost / float(self.tp) - if self.n_gt != 0: - if self.id_switches == 0: - self.MOTAL = 1 - (self.fn + self.fp + self.id_switches - ) / float(self.n_gt) - else: - self.MOTAL = 1 - (self.fn + self.fp + - math.log10(self.id_switches) - ) / float(self.n_gt) - else: - self.MOTAL = -float("inf") - if sum(self.n_frames) == 0: - self.MODP = "n/a" - else: - self.MODP = sum(self.MODP_t) / float(sum(self.n_frames)) - return True - - def createSummary(self): - summary = "" - summary += "tracking evaluation summary".center(80, "=") + "\n" - summary += self.printEntry("Multiple Object Tracking Accuracy (MOTA)", - self.MOTA) + "\n" - summary += self.printEntry("Multiple Object Tracking Precision (MOTP)", - self.MOTP) + "\n" - summary += self.printEntry("Multiple Object Tracking Accuracy (MOTAL)", - self.MOTAL) + "\n" - summary += self.printEntry("Multiple Object Detection Accuracy (MODA)", - self.MODA) + "\n" - summary += self.printEntry("Multiple Object Detection Precision (MODP)", - self.MODP) + "\n" - summary += "\n" - summary += self.printEntry("Recall", self.recall) + "\n" - summary += self.printEntry("Precision", self.precision) + "\n" - summary += self.printEntry("F1", self.F1) + "\n" - summary += self.printEntry("False Alarm Rate", self.FAR) + "\n" - summary += "\n" - summary += self.printEntry("Mostly Tracked", self.MT) + "\n" - summary += self.printEntry("Partly Tracked", self.PT) + "\n" - summary += self.printEntry("Mostly Lost", self.ML) + "\n" - summary += "\n" - summary += self.printEntry("True Positives", self.tp) + "\n" - #summary += self.printEntry("True Positives per Sequence", self.tps) + "\n" - summary += self.printEntry("Ignored True Positives", self.itp) + "\n" - #summary += self.printEntry("Ignored True Positives per Sequence", self.itps) + "\n" - - summary += self.printEntry("False Positives", self.fp) + "\n" - #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" - summary += self.printEntry("False Negatives", self.fn) + "\n" - #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" - summary += self.printEntry("ID-switches", self.id_switches) + "\n" - self.fp = self.fp / self.n_gt - self.fn = self.fn / self.n_gt - self.id_switches = self.id_switches / self.n_gt - summary += self.printEntry("False Positives Ratio", self.fp) + "\n" - #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" - summary += self.printEntry("False Negatives Ratio", self.fn) + "\n" - #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" - summary += self.printEntry("Ignored False Negatives Ratio", - self.ifn) + "\n" - - #summary += self.printEntry("Ignored False Negatives per Sequence", self.ifns) + "\n" - summary += self.printEntry("Missed Targets", self.fn) + "\n" - summary += self.printEntry("ID-switches", self.id_switches) + "\n" - summary += self.printEntry("Fragmentations", self.fragments) + "\n" - summary += "\n" - summary += self.printEntry("Ground Truth Objects (Total)", self.n_gt + - self.n_igt) + "\n" - #summary += 
self.printEntry("Ground Truth Objects (Total) per Sequence", self.n_gts) + "\n"
-        summary += self.printEntry("Ignored Ground Truth Objects",
-                                   self.n_igt) + "\n"
-        #summary += self.printEntry("Ignored Ground Truth Objects per Sequence", self.n_igts) + "\n"
-        summary += self.printEntry("Ground Truth Trajectories",
-                                   self.n_gt_trajectories) + "\n"
-        summary += "\n"
-        summary += self.printEntry("Tracker Objects (Total)", self.n_tr) + "\n"
-        #summary += self.printEntry("Tracker Objects (Total) per Sequence", self.n_trs) + "\n"
-        summary += self.printEntry("Ignored Tracker Objects", self.n_itr) + "\n"
-        #summary += self.printEntry("Ignored Tracker Objects per Sequence", self.n_itrs) + "\n"
-        summary += self.printEntry("Tracker Trajectories",
-                                   self.n_tr_trajectories) + "\n"
-        #summary += "\n"
-        #summary += self.printEntry("Ignored Tracker Objects with Associated Ignored Ground Truth Objects", self.n_igttr) + "\n"
-        summary += "=" * 80
-        return summary
-
-    def printEntry(self, key, val, width=(70, 10)):
-        """
-            Pretty-print an entry in a table fashion.
-        """
-        s_out = key.ljust(width[0])
-        if type(val) == int:
-            s = "%%%dd" % width[1]
-            s_out += s % val
-        elif type(val) == float:
-            s = "%%%df" % (width[1])
-            s_out += s % val
-        else:
-            s_out += ("%s" % val).rjust(width[1])
-        return s_out
-
-    def saveToStats(self, save_summary):
-        """
-            Save the statistics in a whitespace-separated file.
-        """
-        summary = self.createSummary()
-        if save_summary:
-            filename = os.path.join(self.result_path,
-                                    "summary_%s.txt" % self.cls)
-            dump = open(filename, "w+")
-            dump.write(summary)
-            dump.close()
-        return summary
-
-
-class KITTIMOTMetric(Metric):
-    def __init__(self, save_summary=True):
-        self.save_summary = save_summary
-        self.MOTEvaluator = KITTIEvaluation
-        self.result_root = None
-        self.reset()
-
-    def reset(self):
-        self.seqs = []
-        self.n_sequences = 0
-        self.n_frames = []
-        self.strsummary = ''
-
-    def update(self, data_root, seq, data_type, result_root, result_filename):
-        assert data_type == 'kitti', "data_type should be 'kitti'"
-        self.result_root = result_root
-        self.gt_path = data_root
-        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
-        gt = open(gt_path, "r")
-        max_frame = 0
-        for line in gt:
-            line = line.strip()
-            line_list = line.split(" ")
-            if int(line_list[0]) > max_frame:
-                max_frame = int(line_list[0])
-        rs = open(result_filename, "r")
-        for line in rs:
-            line = line.strip()
-            line_list = line.split(" ")
-            if int(line_list[0]) > max_frame:
-                max_frame = int(line_list[0])
-        gt.close()
-        rs.close()
-        self.n_frames.append(max_frame + 1)
-        self.seqs.append(seq)
-        self.n_sequences += 1
-
-    def accumulate(self):
-        logger.info("Processing Result for KITTI Tracking Benchmark")
-        e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path,\
-            n_frames=self.n_frames, seqs=self.seqs, n_sequences=self.n_sequences)
-        try:
-            if not e.loadTracker():
-                return
-            logger.info("Loading Results - Success")
-            logger.info("Evaluate Object Class: %s" % e.cls.upper())
-        except Exception:
-            logger.info("Caught exception while loading result data.")
-        if not e.loadGroundtruth():
-            raise ValueError("Ground truth not found.")
-        logger.info("Loading Groundtruth - Success")
-        # sanity checks
-        if len(e.groundtruth) != len(e.tracker):
-            logger.info(
-                "The uploaded data does not provide results for every sequence.")
-            return False
-        logger.info("Loaded %d Sequences."
% len(e.groundtruth)) - logger.info("Start Evaluation...") - - if e.compute3rdPartyMetrics(): - self.strsummary = e.saveToStats(self.save_summary) - else: - logger.info( - "There seem to be no true positives or false positives at all in the submitted data." - ) - - def log(self): - print(self.strsummary) - - def get_results(self): - return self.strsummary diff --git a/pdfdet/models/Paddle/ppdet/metrics/munkres.py b/pdfdet/models/Paddle/ppdet/metrics/munkres.py deleted file mode 100644 index fbd4a92..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/munkres.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py -""" - -import sys - -__all__ = ['Munkres', 'make_cost_matrix'] - - -class Munkres: - """ - Calculate the Munkres solution to the classical assignment problem. - See the module documentation for usage. - """ - - def __init__(self): - """Create a new instance""" - self.C = None - self.row_covered = [] - self.col_covered = [] - self.n = 0 - self.Z0_r = 0 - self.Z0_c = 0 - self.marked = None - self.path = None - - def make_cost_matrix(profit_matrix, inversion_function): - """ - **DEPRECATED** - - Please use the module function ``make_cost_matrix()``. - """ - import munkres - return munkres.make_cost_matrix(profit_matrix, inversion_function) - - make_cost_matrix = staticmethod(make_cost_matrix) - - def pad_matrix(self, matrix, pad_value=0): - """ - Pad a possibly non-square matrix to make it square. - - :Parameters: - matrix : list of lists - matrix to pad - - pad_value : int - value to use to pad the matrix - - :rtype: list of lists - :return: a new, possibly padded, matrix - """ - max_columns = 0 - total_rows = len(matrix) - - for row in matrix: - max_columns = max(max_columns, len(row)) - - total_rows = max(max_columns, total_rows) - - new_matrix = [] - for row in matrix: - row_len = len(row) - new_row = row[:] - if total_rows > row_len: - # Row too short. Pad it. - new_row += [0] * (total_rows - row_len) - new_matrix += [new_row] - - while len(new_matrix) < total_rows: - new_matrix += [[0] * total_rows] - - return new_matrix - - def compute(self, cost_matrix): - """ - Compute the indexes for the lowest-cost pairings between rows and - columns in the database. Returns a list of (row, column) tuples - that can be used to traverse the matrix. - - :Parameters: - cost_matrix : list of lists - The cost matrix. If this cost matrix is not square, it - will be padded with zeros, via a call to ``pad_matrix()``. - (This method does *not* modify the caller's matrix. It - operates on a copy of the matrix.) - - **WARNING**: This code handles square and rectangular - matrices. It does *not* handle irregular matrices. 
- - :rtype: list - :return: A list of ``(row, column)`` tuples that describe the lowest - cost path through the matrix - - """ - self.C = self.pad_matrix(cost_matrix) - self.n = len(self.C) - self.original_length = len(cost_matrix) - self.original_width = len(cost_matrix[0]) - self.row_covered = [False for i in range(self.n)] - self.col_covered = [False for i in range(self.n)] - self.Z0_r = 0 - self.Z0_c = 0 - self.path = self.__make_matrix(self.n * 2, 0) - self.marked = self.__make_matrix(self.n, 0) - - done = False - step = 1 - - steps = { - 1: self.__step1, - 2: self.__step2, - 3: self.__step3, - 4: self.__step4, - 5: self.__step5, - 6: self.__step6 - } - - while not done: - try: - func = steps[step] - step = func() - except KeyError: - done = True - - # Look for the starred columns - results = [] - for i in range(self.original_length): - for j in range(self.original_width): - if self.marked[i][j] == 1: - results += [(i, j)] - - return results - - def __copy_matrix(self, matrix): - """Return an exact copy of the supplied matrix""" - return copy.deepcopy(matrix) - - def __make_matrix(self, n, val): - """Create an *n*x*n* matrix, populating it with the specific value.""" - matrix = [] - for i in range(n): - matrix += [[val for j in range(n)]] - return matrix - - def __step1(self): - """ - For each row of the matrix, find the smallest element and - subtract it from every element in its row. Go to Step 2. - """ - C = self.C - n = self.n - for i in range(n): - minval = min(self.C[i]) - # Find the minimum value for this row and subtract that minimum - # from every element in the row. - for j in range(n): - self.C[i][j] -= minval - - return 2 - - def __step2(self): - """ - Find a zero (Z) in the resulting matrix. If there is no starred - zero in its row or column, star Z. Repeat for each element in the - matrix. Go to Step 3. - """ - n = self.n - for i in range(n): - for j in range(n): - if (self.C[i][j] == 0) and \ - (not self.col_covered[j]) and \ - (not self.row_covered[i]): - self.marked[i][j] = 1 - self.col_covered[j] = True - self.row_covered[i] = True - - self.__clear_covers() - return 3 - - def __step3(self): - """ - Cover each column containing a starred zero. If K columns are - covered, the starred zeros describe a complete set of unique - assignments. In this case, Go to DONE, otherwise, Go to Step 4. - """ - n = self.n - count = 0 - for i in range(n): - for j in range(n): - if self.marked[i][j] == 1: - self.col_covered[j] = True - count += 1 - - if count >= n: - step = 7 # done - else: - step = 4 - - return step - - def __step4(self): - """ - Find a noncovered zero and prime it. If there is no starred zero - in the row containing this primed zero, Go to Step 5. Otherwise, - cover this row and uncover the column containing the starred - zero. Continue in this manner until there are no uncovered zeros - left. Save the smallest uncovered value and Go to Step 6. - """ - step = 0 - done = False - row = -1 - col = -1 - star_col = -1 - while not done: - (row, col) = self.__find_a_zero() - if row < 0: - done = True - step = 6 - else: - self.marked[row][col] = 2 - star_col = self.__find_star_in_row(row) - if star_col >= 0: - col = star_col - self.row_covered[row] = True - self.col_covered[col] = False - else: - done = True - self.Z0_r = row - self.Z0_c = col - step = 5 - - return step - - def __step5(self): - """ - Construct a series of alternating primed and starred zeros as - follows. Let Z0 represent the uncovered primed zero found in Step 4. 
- Let Z1 denote the starred zero in the column of Z0 (if any). - Let Z2 denote the primed zero in the row of Z1 (there will always - be one). Continue until the series terminates at a primed zero - that has no starred zero in its column. Unstar each starred zero - of the series, star each primed zero of the series, erase all - primes and uncover every line in the matrix. Return to Step 3 - """ - count = 0 - path = self.path - path[count][0] = self.Z0_r - path[count][1] = self.Z0_c - done = False - while not done: - row = self.__find_star_in_col(path[count][1]) - if row >= 0: - count += 1 - path[count][0] = row - path[count][1] = path[count - 1][1] - else: - done = True - - if not done: - col = self.__find_prime_in_row(path[count][0]) - count += 1 - path[count][0] = path[count - 1][0] - path[count][1] = col - - self.__convert_path(path, count) - self.__clear_covers() - self.__erase_primes() - return 3 - - def __step6(self): - """ - Add the value found in Step 4 to every element of each covered - row, and subtract it from every element of each uncovered column. - Return to Step 4 without altering any stars, primes, or covered - lines. - """ - minval = self.__find_smallest() - for i in range(self.n): - for j in range(self.n): - if self.row_covered[i]: - self.C[i][j] += minval - if not self.col_covered[j]: - self.C[i][j] -= minval - return 4 - - def __find_smallest(self): - """Find the smallest uncovered value in the matrix.""" - minval = 2e9 # sys.maxint - for i in range(self.n): - for j in range(self.n): - if (not self.row_covered[i]) and (not self.col_covered[j]): - if minval > self.C[i][j]: - minval = self.C[i][j] - return minval - - def __find_a_zero(self): - """Find the first uncovered element with value 0""" - row = -1 - col = -1 - i = 0 - n = self.n - done = False - - while not done: - j = 0 - while True: - if (self.C[i][j] == 0) and \ - (not self.row_covered[i]) and \ - (not self.col_covered[j]): - row = i - col = j - done = True - j += 1 - if j >= n: - break - i += 1 - if i >= n: - done = True - - return (row, col) - - def __find_star_in_row(self, row): - """ - Find the first starred element in the specified row. Returns - the column index, or -1 if no starred element was found. - """ - col = -1 - for j in range(self.n): - if self.marked[row][j] == 1: - col = j - break - - return col - - def __find_star_in_col(self, col): - """ - Find the first starred element in the specified row. Returns - the row index, or -1 if no starred element was found. - """ - row = -1 - for i in range(self.n): - if self.marked[i][col] == 1: - row = i - break - - return row - - def __find_prime_in_row(self, row): - """ - Find the first prime element in the specified row. Returns - the column index, or -1 if no starred element was found. 
- """ - col = -1 - for j in range(self.n): - if self.marked[row][j] == 2: - col = j - break - - return col - - def __convert_path(self, path, count): - for i in range(count + 1): - if self.marked[path[i][0]][path[i][1]] == 1: - self.marked[path[i][0]][path[i][1]] = 0 - else: - self.marked[path[i][0]][path[i][1]] = 1 - - def __clear_covers(self): - """Clear all covered matrix cells""" - for i in range(self.n): - self.row_covered[i] = False - self.col_covered[i] = False - - def __erase_primes(self): - """Erase all prime markings""" - for i in range(self.n): - for j in range(self.n): - if self.marked[i][j] == 2: - self.marked[i][j] = 0 - - -def make_cost_matrix(profit_matrix, inversion_function): - """ - Create a cost matrix from a profit matrix by calling - 'inversion_function' to invert each value. The inversion - function must take one numeric argument (of any type) and return - another numeric argument which is presumed to be the cost inverse - of the original profit. - - This is a static method. Call it like this: - - .. python:: - - cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func) - - For example: - - .. python:: - - cost_matrix = Munkres.make_cost_matrix(matrix, lambda x : sys.maxint - x) - - :Parameters: - profit_matrix : list of lists - The matrix to convert from a profit to a cost matrix - - inversion_function : function - The function to use to invert each entry in the profit matrix - - :rtype: list of lists - :return: The converted matrix - """ - cost_matrix = [] - for row in profit_matrix: - cost_matrix.append([inversion_function(value) for value in row]) - return cost_matrix diff --git a/pdfdet/models/Paddle/ppdet/metrics/pose3d_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/pose3d_metrics.py deleted file mode 100644 index ea21de9..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/pose3d_metrics.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
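Before the deleted pose3d metrics continue below: the Munkres solver removed above is easiest to review with a usage sketch in mind. A minimal example with hypothetical cost values (the tuple order of the result may vary between equally optimal assignments):

    # Hungarian assignment with the deleted Munkres class: rows could be
    # tracks, columns detections, entries association costs.
    m = Munkres()
    cost = [[4, 1, 3],
            [2, 0, 5],
            [3, 2, 2]]
    indexes = m.compute(cost)                      # e.g. [(0, 1), (1, 0), (2, 2)]
    total = sum(cost[r][c] for r, c in indexes)    # minimum total cost: 5

    # make_cost_matrix() converts a profit matrix into a cost matrix:
    cost2 = make_cost_matrix([[5, 9], [10, 3]], lambda p: 10 - p)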
- -import paddle -from paddle.distributed import ParallelEnv -import os -import json -from collections import defaultdict, OrderedDict -import numpy as np -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['Pose3DEval'] - - -class AverageMeter(object): - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -def mean_per_joint_position_error(pred, gt, has_3d_joints): - """ - Compute mPJPE - """ - gt = gt[has_3d_joints == 1] - gt = gt[:, :, :3] - pred = pred[has_3d_joints == 1] - - with paddle.no_grad(): - gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2 - gt = gt - gt_pelvis[:, None, :] - pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2 - pred = pred - pred_pelvis[:, None, :] - error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy() - return error - - -def compute_similarity_transform(S1, S2): - """Computes a similarity transform (sR, t) that takes - a set of 3D points S1 (3 x N) closest to a set of 3D points S2, - where R is an 3x3 rotation matrix, t 3x1 translation, s scale. - i.e. solves the orthogonal Procrutes problem. - """ - transposed = False - if S1.shape[0] != 3 and S1.shape[0] != 2: - S1 = S1.T - S2 = S2.T - transposed = True - assert (S2.shape[1] == S1.shape[1]) - - # 1. Remove mean. - mu1 = S1.mean(axis=1, keepdims=True) - mu2 = S2.mean(axis=1, keepdims=True) - X1 = S1 - mu1 - X2 = S2 - mu2 - - # 2. Compute variance of X1 used for scale. - var1 = np.sum(X1**2) - - # 3. The outer product of X1 and X2. - K = X1.dot(X2.T) - - # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are - # singular vectors of K. - U, s, Vh = np.linalg.svd(K) - V = Vh.T - # Construct Z that fixes the orientation of R to get det(R)=1. - Z = np.eye(U.shape[0]) - Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T))) - # Construct R. - R = V.dot(Z.dot(U.T)) - - # 5. Recover scale. - scale = np.trace(R.dot(K)) / var1 - - # 6. Recover translation. - t = mu2 - scale * (R.dot(mu1)) - - # 7. 
Error: - S1_hat = scale * R.dot(S1) + t - - if transposed: - S1_hat = S1_hat.T - - return S1_hat - - -def compute_similarity_transform_batch(S1, S2): - """Batched version of compute_similarity_transform.""" - S1_hat = np.zeros_like(S1) - for i in range(S1.shape[0]): - S1_hat[i] = compute_similarity_transform(S1[i], S2[i]) - return S1_hat - - -def reconstruction_error(S1, S2, reduction='mean'): - """Do Procrustes alignment and compute reconstruction error.""" - S1_hat = compute_similarity_transform_batch(S1, S2) - re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1) - if reduction == 'mean': - re = re.mean() - elif reduction == 'sum': - re = re.sum() - return re - - -def all_gather(data): - if paddle.distributed.get_world_size() == 1: - return data - vlist = [] - paddle.distributed.all_gather(vlist, data) - data = paddle.concat(vlist, 0) - return data - - -class Pose3DEval(object): - def __init__(self, output_eval, save_prediction_only=False): - super(Pose3DEval, self).__init__() - self.output_eval = output_eval - self.res_file = os.path.join(output_eval, "pose3d_results.json") - self.save_prediction_only = save_prediction_only - self.reset() - - def reset(self): - self.PAmPJPE = AverageMeter() - self.mPJPE = AverageMeter() - self.eval_results = {} - - def get_human36m_joints(self, input): - J24_TO_J14 = paddle.to_tensor( - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]) - J24_TO_J17 = paddle.to_tensor( - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19]) - return paddle.index_select(input, J24_TO_J14, axis=1) - - def update(self, inputs, outputs): - gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv() - .local_rank)) - has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv() - .local_rank)) - pred_3d_joints = all_gather(outputs['pose3d']) - if gt_3d_joints.shape[1] == 24: - gt_3d_joints = self.get_human36m_joints(gt_3d_joints) - if pred_3d_joints.shape[1] == 24: - pred_3d_joints = self.get_human36m_joints(pred_3d_joints) - mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints, - has_3d_joints).mean() - PAmPJPE_val = reconstruction_error( - pred_3d_joints.numpy(), - gt_3d_joints[:, :, :3].numpy(), - reduction=None).mean() - count = int(np.sum(has_3d_joints.numpy())) - self.PAmPJPE.update(PAmPJPE_val * 1000., count) - self.mPJPE.update(mPJPE_val * 1000., count) - - def accumulate(self): - if self.save_prediction_only: - logger.info(f'The pose3d result is saved to {self.res_file} ' - 'and do not evaluate the model.') - return - self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg] - - def log(self): - if self.save_prediction_only: - return - stats_names = ['mPJPE', 'PAmPJPE'] - num_values = len(stats_names) - print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') - print('|---' * (num_values + 1) + '|') - - print(' '.join([ - '| {:.3f}'.format(abs(value)) - for value in self.eval_results['pose3d'] - ]) + ' |') - - def get_results(self): - return self.eval_results diff --git a/pdfdet/models/Paddle/ppdet/metrics/widerface_utils.py b/pdfdet/models/Paddle/ppdet/metrics/widerface_utils.py deleted file mode 100644 index 2f64bf6..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/widerface_utils.py +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import cv2 -import numpy as np -from collections import OrderedDict - -import paddle - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['face_eval_run', 'lmk2out'] - - -def face_eval_run(model, - image_dir, - gt_file, - pred_dir='output/pred', - eval_mode='widerface', - multi_scale=False): - # load ground truth files - with open(gt_file, 'r') as f: - gt_lines = f.readlines() - imid2path = [] - pos_gt = 0 - while pos_gt < len(gt_lines): - name_gt = gt_lines[pos_gt].strip('\n\t').split()[0] - imid2path.append(name_gt) - pos_gt += 1 - n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0]) - pos_gt += 1 + n_gt - logger.info('The ground truth file load {} images'.format(len(imid2path))) - - dets_dist = OrderedDict() - for iter_id, im_path in enumerate(imid2path): - image_path = os.path.join(image_dir, im_path) - if eval_mode == 'fddb': - image_path += '.jpg' - assert os.path.exists(image_path) - image = cv2.imread(image_path) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - if multi_scale: - shrink, max_shrink = get_shrink(image.shape[0], image.shape[1]) - det0 = detect_face(model, image, shrink) - det1 = flip_test(model, image, shrink) - [det2, det3] = multi_scale_test(model, image, max_shrink) - det4 = multi_scale_test_pyramid(model, image, max_shrink) - det = np.row_stack((det0, det1, det2, det3, det4)) - dets = bbox_vote(det) - else: - dets = detect_face(model, image, 1) - if eval_mode == 'widerface': - save_widerface_bboxes(image_path, dets, pred_dir) - else: - dets_dist[im_path] = dets - if iter_id % 100 == 0: - logger.info('Test iter {}'.format(iter_id)) - if eval_mode == 'fddb': - save_fddb_bboxes(dets_dist, pred_dir) - logger.info("Finish evaluation.") - - -def detect_face(model, image, shrink): - image_shape = [image.shape[0], image.shape[1]] - if shrink != 1: - h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink) - image = cv2.resize(image, (w, h)) - image_shape = [h, w] - - img = face_img_process(image) - image_shape = np.asarray([image_shape]) - scale_factor = np.asarray([[shrink, shrink]]) - data = { - "image": paddle.to_tensor( - img, dtype='float32'), - "im_shape": paddle.to_tensor( - image_shape, dtype='float32'), - "scale_factor": paddle.to_tensor( - scale_factor, dtype='float32') - } - model.eval() - detection = model(data) - detection = detection['bbox'].numpy() - # layout: xmin, ymin, xmax. 
ymax, score - if np.prod(detection.shape) == 1: - logger.info("No face detected") - return np.array([[0, 0, 0, 0, 0]]) - det_conf = detection[:, 1] - det_xmin = detection[:, 2] - det_ymin = detection[:, 3] - det_xmax = detection[:, 4] - det_ymax = detection[:, 5] - - det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) - return det - - -def flip_test(model, image, shrink): - img = cv2.flip(image, 1) - det_f = detect_face(model, img, shrink) - det_t = np.zeros(det_f.shape) - img_width = image.shape[1] - det_t[:, 0] = img_width - det_f[:, 2] - det_t[:, 1] = det_f[:, 1] - det_t[:, 2] = img_width - det_f[:, 0] - det_t[:, 3] = det_f[:, 3] - det_t[:, 4] = det_f[:, 4] - return det_t - - -def multi_scale_test(model, image, max_shrink): - # Shrink detecting is only used to detect big faces - st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink - det_s = detect_face(model, image, st) - index = np.where( - np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) - > 30)[0] - det_s = det_s[index, :] - # Enlarge one times - bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2 - det_b = detect_face(model, image, bt) - - # Enlarge small image x times for small faces - if max_shrink > 2: - bt *= 2 - while bt < max_shrink: - det_b = np.row_stack((det_b, detect_face(model, image, bt))) - bt *= 2 - det_b = np.row_stack((det_b, detect_face(model, image, max_shrink))) - - # Enlarged images are only used to detect small faces. - if bt > 1: - index = np.where( - np.minimum(det_b[:, 2] - det_b[:, 0] + 1, - det_b[:, 3] - det_b[:, 1] + 1) < 100)[0] - det_b = det_b[index, :] - # Shrinked images are only used to detect big faces. - else: - index = np.where( - np.maximum(det_b[:, 2] - det_b[:, 0] + 1, - det_b[:, 3] - det_b[:, 1] + 1) > 30)[0] - det_b = det_b[index, :] - return det_s, det_b - - -def multi_scale_test_pyramid(model, image, max_shrink): - # Use image pyramids to detect faces - det_b = detect_face(model, image, 0.25) - index = np.where( - np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1) - > 30)[0] - det_b = det_b[index, :] - - st = [0.75, 1.25, 1.5, 1.75] - for i in range(len(st)): - if st[i] <= max_shrink: - det_temp = detect_face(model, image, st[i]) - # Enlarged images are only used to detect small faces. - if st[i] > 1: - index = np.where( - np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, - det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0] - det_temp = det_temp[index, :] - # Shrinked images are only used to detect big faces. - else: - index = np.where( - np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, - det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0] - det_temp = det_temp[index, :] - det_b = np.row_stack((det_b, det_temp)) - return det_b - - -def to_chw(image): - """ - Transpose image from HWC to CHW. - Args: - image (np.array): an image with HWC layout. - """ - # HWC to CHW - if len(image.shape) == 3: - image = np.swapaxes(image, 1, 2) - image = np.swapaxes(image, 1, 0) - return image - - -def face_img_process(image, - mean=[104., 117., 123.], - std=[127.502231, 127.502231, 127.502231]): - img = np.array(image) - img = to_chw(img) - img = img.astype('float32') - img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32') - img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32') - img = [img] - img = np.array(img) - return img - - -def get_shrink(height, width): - """ - Args: - height (int): image height. - width (int): image width. 
-    """
-    # avoid out of memory
-    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
-    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
-
-    def get_round(x, loc):
-        str_x = str(x)
-        if '.' in str_x:
-            str_before, str_after = str_x.split('.')
-            len_after = len(str_after)
-            if len_after >= 3:
-                str_final = str_before + '.' + str_after[0:loc]
-                return float(str_final)
-            else:
-                return x
-        return x
-
-    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
-    if max_shrink >= 1.5 and max_shrink < 2:
-        max_shrink = max_shrink - 0.1
-    elif max_shrink >= 2 and max_shrink < 3:
-        max_shrink = max_shrink - 0.2
-    elif max_shrink >= 3 and max_shrink < 4:
-        max_shrink = max_shrink - 0.3
-    elif max_shrink >= 4 and max_shrink < 5:
-        max_shrink = max_shrink - 0.4
-    elif max_shrink >= 5:
-        max_shrink = max_shrink - 0.5
-    elif max_shrink <= 0.1:
-        max_shrink = 0.1
-
-    shrink = max_shrink if max_shrink < 1 else 1
-    return shrink, max_shrink
-
-
-def bbox_vote(det):
-    order = det[:, 4].ravel().argsort()[::-1]
-    det = det[order, :]
-    if det.shape[0] == 0:
-        dets = np.array([[10, 10, 20, 20, 0.002]])
-        det = np.empty(shape=[0, 5])
-    while det.shape[0] > 0:
-        # IOU
-        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
-        xx1 = np.maximum(det[0, 0], det[:, 0])
-        yy1 = np.maximum(det[0, 1], det[:, 1])
-        xx2 = np.minimum(det[0, 2], det[:, 2])
-        yy2 = np.minimum(det[0, 3], det[:, 3])
-        w = np.maximum(0.0, xx2 - xx1 + 1)
-        h = np.maximum(0.0, yy2 - yy1 + 1)
-        inter = w * h
-        o = inter / (area[0] + area[:] - inter)
-
-        # nms
-        merge_index = np.where(o >= 0.3)[0]
-        det_accu = det[merge_index, :]
-        det = np.delete(det, merge_index, 0)
-        if merge_index.shape[0] <= 1:
-            if det.shape[0] == 0:
-                try:
-                    dets = np.row_stack((dets, det_accu))
-                except:
-                    dets = det_accu
-            continue
-        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
-        max_score = np.max(det_accu[:, 4])
-        det_accu_sum = np.zeros((1, 5))
-        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
-                                      axis=0) / np.sum(det_accu[:, -1:])
-        det_accu_sum[:, 4] = max_score
-        try:
-            dets = np.row_stack((dets, det_accu_sum))
-        except:
-            dets = det_accu_sum
-    dets = dets[0:750, :]
-    keep_index = np.where(dets[:, 4] >= 0.01)[0]
-    dets = dets[keep_index, :]
-    return dets
-
-
-def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
-    image_name = image_path.split('/')[-1]
-    image_class = image_path.split('/')[-2]
-    odir = os.path.join(output_dir, image_class)
-    if not os.path.exists(odir):
-        os.makedirs(odir)
-
-    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
-    f = open(ofname, 'w')
-    f.write('{:s}\n'.format(image_class + '/' + image_name))
-    f.write('{:d}\n'.format(bboxes_scores.shape[0]))
-    for box_score in bboxes_scores:
-        xmin, ymin, xmax, ymax, score = box_score
-        f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
-            xmax - xmin + 1), (ymax - ymin + 1), score))
-    f.close()
-    logger.info("The predicted result is saved as {}".format(ofname))
-
-
-def save_fddb_bboxes(bboxes_scores,
-                     output_dir,
-                     output_fname='pred_fddb_res.txt'):
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-    predict_file = os.path.join(output_dir, output_fname)
-    f = open(predict_file, 'w')
-    for image_path, dets in bboxes_scores.items():
-        f.write('{:s}\n'.format(image_path))
-        f.write('{:d}\n'.format(dets.shape[0]))
-        for box_score in dets:
-            xmin, ymin, xmax, ymax, score = box_score
-            width, height = xmax - xmin, ymax - ymin
-            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
-                    .format(xmin, ymin,
width, height, score)) - logger.info("The predicted result is saved as {}".format(predict_file)) - return predict_file - - -def lmk2out(results, is_bbox_normalized=False): - """ - Args: - results: request a dict, should include: `landmark`, `im_id`, - if is_bbox_normalized=True, also need `im_shape`. - is_bbox_normalized: whether or not landmark is normalized. - """ - xywh_res = [] - for t in results: - bboxes = t['bbox'][0] - lengths = t['bbox'][1][0] - im_ids = np.array(t['im_id'][0]).flatten() - if bboxes.shape == (1, 1) or bboxes is None: - continue - face_index = t['face_index'][0] - prior_box = t['prior_boxes'][0] - predict_lmk = t['landmark'][0] - prior = np.reshape(prior_box, (-1, 4)) - predictlmk = np.reshape(predict_lmk, (-1, 10)) - - k = 0 - for a in range(len(lengths)): - num = lengths[a] - im_id = int(im_ids[a]) - for i in range(num): - score = bboxes[k][1] - theindex = face_index[i][0] - me_prior = prior[theindex, :] - lmk_pred = predictlmk[theindex, :] - prior_w = me_prior[2] - me_prior[0] - prior_h = me_prior[3] - me_prior[1] - prior_w_center = (me_prior[2] + me_prior[0]) / 2 - prior_h_center = (me_prior[3] + me_prior[1]) / 2 - lmk_decode = np.zeros((10)) - for j in [0, 2, 4, 6, 8]: - lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center - for j in [1, 3, 5, 7, 9]: - lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center - im_shape = t['im_shape'][0][a].tolist() - image_h, image_w = int(im_shape[0]), int(im_shape[1]) - if is_bbox_normalized: - lmk_decode = lmk_decode * np.array([ - image_w, image_h, image_w, image_h, image_w, image_h, - image_w, image_h, image_w, image_h - ]) - lmk_res = { - 'image_id': im_id, - 'landmark': lmk_decode, - 'score': score, - } - xywh_res.append(lmk_res) - k += 1 - return xywh_res diff --git a/pdfdet/models/Paddle/ppdet/model_zoo/.gitignore b/pdfdet/models/Paddle/ppdet/model_zoo/.gitignore deleted file mode 100644 index f296851..0000000 --- a/pdfdet/models/Paddle/ppdet/model_zoo/.gitignore +++ /dev/null @@ -1 +0,0 @@ -MODEL_ZOO diff --git a/pdfdet/models/Paddle/ppdet/model_zoo/__init__.py b/pdfdet/models/Paddle/ppdet/model_zoo/__init__.py deleted file mode 100644 index 6db6eb6..0000000 --- a/pdfdet/models/Paddle/ppdet/model_zoo/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import model_zoo -from .model_zoo import * - -__all__ = model_zoo.__all__ diff --git a/pdfdet/models/Paddle/ppdet/model_zoo/model_zoo.py b/pdfdet/models/Paddle/ppdet/model_zoo/model_zoo.py deleted file mode 100644 index 27581ef..0000000 --- a/pdfdet/models/Paddle/ppdet/model_zoo/model_zoo.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os.path as osp -import pkg_resources - -try: - from collections.abc import Sequence -except: - from collections import Sequence - -from ppdet.core.workspace import load_config, create -from ppdet.utils.checkpoint import load_weight -from ppdet.utils.download import get_config_path - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'list_model', 'get_config_file', 'get_weights_url', 'get_model', - 'MODEL_ZOO_FILENAME' -] - -MODEL_ZOO_FILENAME = 'MODEL_ZOO' - - -def list_model(filters=[]): - model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo', - MODEL_ZOO_FILENAME) - with open(model_zoo_file) as f: - model_names = f.read().splitlines() - - # filter model_name - def filt(name): - for f in filters: - if name.find(f) < 0: - return False - return True - - if isinstance(filters, str) or not isinstance(filters, Sequence): - filters = [filters] - model_names = [name for name in model_names if filt(name)] - if len(model_names) == 0 and len(filters) > 0: - raise ValueError("no model found, please check filters seeting, " - "filters can be set as following kinds:\n" - "\tDataset: coco, voc ...\n" - "\tArchitecture: yolo, rcnn, ssd ...\n" - "\tBackbone: resnet, vgg, darknet ...\n") - - model_str = "Available Models:\n" - for model_name in model_names: - model_str += "\t{}\n".format(model_name) - logger.info(model_str) - - -# models and configs save on bcebos under dygraph directory -def get_config_file(model_name): - return get_config_path("ppdet://configs/{}.yml".format(model_name)) - - -def get_weights_url(model_name): - return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1]) - - -def get_model(model_name, pretrained=True): - cfg_file = get_config_file(model_name) - cfg = load_config(cfg_file) - model = create(cfg.architecture) - - if pretrained: - load_weight(model, get_weights_url(model_name)) - - return model diff --git a/pdfdet/models/Paddle/ppdet/modeling/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/__init__.py deleted file mode 100644 index fc7caf4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -warnings.filterwarnings( - action='ignore', category=DeprecationWarning, module='ops') - -from . import ops -from . import backbones -from . import necks -from . import proposal_generator -from . import heads -from . import losses -from . import architectures -from . import post_process -from . import layers -from . import reid -from . 
import mot -from . import transformers -from . import assigners -from . import rbox_utils -from . import ssod - -from .ops import * -from .backbones import * -from .necks import * -from .proposal_generator import * -from .heads import * -from .losses import * -from .architectures import * -from .post_process import * -from .layers import * -from .reid import * -from .mot import * -from .transformers import * -from .assigners import * -from .rbox_utils import * -from .ssod import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/__init__.py deleted file mode 100644 index d22df32..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/__init__.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import meta_arch -from . import faster_rcnn -from . import mask_rcnn -from . import yolo -from . import ppyoloe -from . import cascade_rcnn -from . import ssd -from . import fcos -from . import solov2 -from . import ttfnet -from . import s2anet -from . import keypoint_hrhrnet -from . import keypoint_hrnet -from . import keypoint_vitpose -from . import jde -from . import deepsort -from . import fairmot -from . import centernet -from . import gfl -from . import picodet -from . import detr -from . import sparse_rcnn -from . import tood -from . import retinanet -from . import bytetrack -from . import yolox -from . import yolof -from . import pose3d_metro -from . import centertrack -from . import queryinst -from . import detr_ssod -from . import multi_stream_detector -from . import clrnet - -from .meta_arch import * -from .faster_rcnn import * -from .mask_rcnn import * -from .yolo import * -from .ppyoloe import * -from .cascade_rcnn import * -from .ssd import * -from .fcos import * -from .solov2 import * -from .ttfnet import * -from .s2anet import * -from .keypoint_hrhrnet import * -from .keypoint_hrnet import * -from .keypoint_vitpose import * -from .jde import * -from .deepsort import * -from .fairmot import * -from .centernet import * -from .blazeface import * -from .gfl import * -from .picodet import * -from .detr import * -from .sparse_rcnn import * -from .tood import * -from .retinanet import * -from .bytetrack import * -from .yolox import * -from .yolof import * -from .pose3d_metro import * -from .centertrack import * -from .queryinst import * -from .keypoint_petr import * -from .detr_ssod import * -from .multi_stream_detector import * -from .clrnet import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/blazeface.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/blazeface.py deleted file mode 100644 index 477732d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/blazeface.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import paddle -import paddle.nn.functional as F - -__all__ = ['BlazeFace'] - - -@register -class BlazeFace(BaseArch): - """ - BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs, - see https://arxiv.org/abs/1907.05047 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - blaze_head (nn.Layer): `blazeHead` instance - post_process (object): `BBoxPostProcess` instance - """ - - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, backbone, blaze_head, neck, post_process): - super(BlazeFace, self).__init__() - self.backbone = backbone - self.neck = neck - self.blaze_head = blaze_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - # head - kwargs = {'input_shape': neck.out_shape} - blaze_head = create(cfg['blaze_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'blaze_head': blaze_head, - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs) - # neck - neck_feats = self.neck(body_feats) - # blaze Head - if self.training: - return self.blaze_head(neck_feats, self.inputs['image'], - self.inputs['gt_bbox'], - self.inputs['gt_class']) - else: - preds, anchors = self.blaze_head(neck_feats, self.inputs['image']) - bbox, bbox_num, nms_keep_idx = self.post_process( - preds, anchors, self.inputs['im_shape'], - self.inputs['scale_factor']) - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - preds_logits = preds[1] # [[1xNumBBoxNumClass]] - extra_data['scores'] = F.softmax(paddle.concat( - preds_logits, axis=1)).transpose([0, 2, 1]) - extra_data['logits'] = paddle.concat( - preds_logits, axis=1).transpose([0, 2, 1]) - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return bbox, bbox_num, extra_data - else: - return bbox, bbox_num - - def get_loss(self, ): - return {"loss": self._forward()} - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, extra_data = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - "extra_data": extra_data - } - else: - bbox_pred, bbox_num = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - } - - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/bytetrack.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/bytetrack.py deleted file mode 100644 index 1f3d0d1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/bytetrack.py +++ /dev/null 
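Most of the architecture files removed in this range wire their components the same way in `from_config`: each part advertises an `out_shape`, which is handed to the next part as `input_shape` through `create(...)`. A schematic sketch of that chaining with hypothetical stand-in classes (not the real ppdet registry):

    # out_shape -> input_shape chaining, as in the deleted from_config methods.
    class StubNeck:
        def __init__(self, input_shape):
            self.out_shape = input_shape         # a real neck transforms the shapes

    class StubHead:
        def __init__(self, input_shape):
            self.input_shape = input_shape

    backbone_out_shape = [24, 48, 96]            # stand-in for backbone.out_shape
    neck = StubNeck(input_shape=backbone_out_shape)
    head = StubHead(input_shape=neck.out_shape)  # mirrors create(cfg['head'], **kwargs)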
@@ -1,83 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['ByteTrack'] - - -@register -class ByteTrack(BaseArch): - """ - ByteTrack network, see https://arxiv.org/abs/2110.06864 - - Args: - detector (object): detector model instance - reid (object): reid model instance, default None - tracker (object): tracker instance - """ - __category__ = 'architecture' - - def __init__(self, - detector='YOLOX', - reid=None, - tracker='JDETracker'): - super(ByteTrack, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - - if cfg['reid'] != 'None': - reid = create(cfg['reid']) - else: - reid = None - - tracker = create(cfg['tracker']) - - return { - "detector": detector, - "reid": reid, - "tracker": tracker, - } - - def _forward(self): - det_outs = self.detector(self.inputs) - - if self.training: - return det_outs - else: - if self.reid is not None: - assert 'crops' in self.inputs - crops = self.inputs['crops'] - pred_embs = self.reid(crops) - else: - pred_embs = None - det_outs['embeddings'] = pred_embs - return det_outs - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/cascade_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/cascade_rcnn.py deleted file mode 100644 index c5d454f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/cascade_rcnn.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
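The ByteTrack architecture deleted just above keeps the tracker itself out of the forward pass: at inference, the detector's output dict simply gains an `embeddings` entry, which stays `None` unless a ReID model is configured. A standalone sketch of that flow (function and argument names here are illustrative):

    # Inference-time composition from the deleted ByteTrack._forward().
    def bytetrack_infer(detector, reid, inputs):
        det_outs = detector(inputs)          # dict with 'bbox', 'bbox_num', ...
        if reid is not None:
            det_outs['embeddings'] = reid(inputs['crops'])   # appearance features
        else:
            det_outs['embeddings'] = None
        return det_outs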
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['CascadeRCNN'] - - -@register -class CascadeRCNN(BaseArch): - """ - Cascade R-CNN network, see https://arxiv.org/abs/1712.00726 - - Args: - backbone (object): backbone instance - rpn_head (object): `RPNHead` instance - bbox_head (object): `BBoxHead` instance - bbox_post_process (object): `BBoxPostProcess` instance - neck (object): 'FPN' instance - mask_head (object): `MaskHead` instance - mask_post_process (object): `MaskPostProcess` instance - """ - __category__ = 'architecture' - __inject__ = [ - 'bbox_post_process', - 'mask_post_process', - ] - - def __init__(self, - backbone, - rpn_head, - bbox_head, - bbox_post_process, - neck=None, - mask_head=None, - mask_post_process=None): - super(CascadeRCNN, self).__init__() - self.backbone = backbone - self.rpn_head = rpn_head - self.bbox_head = bbox_head - self.bbox_post_process = bbox_post_process - self.neck = neck - self.mask_head = mask_head - self.mask_post_process = mask_post_process - self.with_mask = mask_head is not None - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - bbox_head = create(cfg['bbox_head'], **kwargs) - - out_shape = neck and out_shape or bbox_head.get_head().out_shape - kwargs = {'input_shape': out_shape} - mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "rpn_head": rpn_head, - "bbox_head": bbox_head, - "mask_head": mask_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - - if self.training: - rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) - bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, - self.inputs) - rois, rois_num = self.bbox_head.get_assigned_rois() - bbox_targets = self.bbox_head.get_assigned_targets() - if self.with_mask: - mask_loss = self.mask_head(body_feats, rois, rois_num, - self.inputs, bbox_targets, bbox_feat) - return rpn_loss, bbox_loss, mask_loss - else: - return rpn_loss, bbox_loss, {} - else: - rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) - preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) - refined_rois = self.bbox_head.get_refined_rois() - - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - - bbox, bbox_num, nms_keep_idx = self.bbox_post_process( - preds, (refined_rois, rois_num), im_shape, scale_factor) - # rescale the prediction back to origin image - bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( - bbox, bbox_num, im_shape, scale_factor) - if not self.with_mask: - return bbox_pred, bbox_num, None - mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs) - origin_shape = self.bbox_post_process.get_origin_shape() - mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, - origin_shape) - return bbox_pred, bbox_num, mask_pred - - def get_loss(self, ): - rpn_loss, bbox_loss, mask_loss = self._forward() - loss = {} - loss.update(rpn_loss) - loss.update(bbox_loss) - if self.with_mask: - 
loss.update(mask_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - bbox_pred, bbox_num, mask_pred = self._forward() - output = { - 'bbox': bbox_pred, - 'bbox_num': bbox_num, - } - if self.with_mask: - output.update({'mask': mask_pred}) - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/centernet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/centernet.py deleted file mode 100644 index 439e5f8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/centernet.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['CenterNet'] - - -@register -class CenterNet(BaseArch): - """ - CenterNet network, see http://arxiv.org/abs/1904.07850 - - Args: - backbone (object): backbone instance - neck (object): FPN instance, default use 'CenterNetDLAFPN' - head (object): 'CenterNetHead' instance - post_process (object): 'CenterNetPostProcess' instance - for_mot (bool): whether return other features used in tracking model - - """ - __category__ = 'architecture' - __inject__ = ['post_process'] - __shared__ = ['for_mot'] - - def __init__(self, - backbone, - neck='CenterNetDLAFPN', - head='CenterNetHead', - post_process='CenterNetPostProcess', - for_mot=False): - super(CenterNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.post_process = post_process - self.for_mot = for_mot - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - head = create(cfg['head'], **kwargs) - - return {'backbone': backbone, 'neck': neck, "head": head} - - def _forward(self): - neck_feat = self.backbone(self.inputs) - if self.neck is not None: - neck_feat = self.neck(neck_feat) - head_out = self.head(neck_feat, self.inputs) - if self.for_mot: - head_out.update({'neck_feat': neck_feat}) - elif self.training: - head_out['loss'] = head_out.pop('det_loss') - return head_out - - def get_pred(self): - head_out = self._forward() - bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process( - head_out['heatmap'], - head_out['size'], - head_out['offset'], - im_shape=self.inputs['im_shape'], - scale_factor=self.inputs['scale_factor']) - - if self.for_mot: - output = { - "bbox": bbox, - "bbox_num": bbox_num, - "bbox_inds": bbox_inds, - "topk_clses": topk_clses, - "topk_ys": topk_ys, - "topk_xs": topk_xs, - "neck_feat": head_out['neck_feat'] - } - else: - output = {"bbox": bbox, "bbox_num": bbox_num} - return output - - 
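The deleted `get_loss()` implementations (Cascade R-CNN above, CenterTrack further below) share one pattern: per-component loss dicts are merged, then summed into a single scalar for the backward pass. A minimal sketch with illustrative values:

    # Merge per-head loss dicts, then reduce to one total with paddle.add_n.
    import paddle

    rpn_loss = {'loss_rpn_cls': paddle.to_tensor([0.3])}    # illustrative values
    bbox_loss = {'loss_bbox_reg': paddle.to_tensor([0.2])}
    loss = {}
    loss.update(rpn_loss)
    loss.update(bbox_loss)
    loss['loss'] = paddle.add_n(list(loss.values()))        # used for backward()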
def get_loss(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/centertrack.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/centertrack.py deleted file mode 100644 index b9880db..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/centertrack.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import math -import numpy as np -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -from ..keypoint_utils import affine_transform -from ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian - -__all__ = ['CenterTrack'] - - -@register -class CenterTrack(BaseArch): - """ - CenterTrack network, see http://arxiv.org/abs/2004.01177 - - Args: - detector (object): 'CenterNet' instance - plugin_head (object): 'CenterTrackHead' instance - tracker (object): 'CenterTracker' instance - """ - __category__ = 'architecture' - __shared__ = ['mot_metric'] - - def __init__(self, - detector='CenterNet', - plugin_head='CenterTrackHead', - tracker='CenterTracker', - mot_metric=False): - super(CenterTrack, self).__init__() - self.detector = detector - self.plugin_head = plugin_head - self.tracker = tracker - self.mot_metric = mot_metric - self.pre_image = None - self.deploy = False - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape - - kwargs = {'input_shape': detector_out_shape} - plugin_head = create(cfg['plugin_head'], **kwargs) - tracker = create(cfg['tracker']) - - return { - 'detector': detector, - 'plugin_head': plugin_head, - 'tracker': tracker, - } - - def _forward(self): - if self.training: - det_outs = self.detector(self.inputs) - neck_feat = det_outs['neck_feat'] - - losses = {} - for k, v in det_outs.items(): - if 'loss' not in k: continue - losses.update({k: v}) - - plugin_outs = self.plugin_head(neck_feat, self.inputs) - for k, v in plugin_outs.items(): - if 'loss' not in k: continue - losses.update({k: v}) - - losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss'] - return losses - - else: - if not self.mot_metric: - # detection, support bs>=1 - det_outs = self.detector(self.inputs) - return { - 'bbox': det_outs['bbox'], - 'bbox_num': det_outs['bbox_num'] - } - - else: - # MOT, only support bs=1 - if not self.deploy: - if self.pre_image is None: - self.pre_image = self.inputs['image'] - # initializing tracker for the first frame - self.tracker.init_track([]) - self.inputs['pre_image'] = self.pre_image - self.pre_image = self.inputs[ - 'image'] # Note: update for next image - - # render input heatmap from tracker status - pre_hm = self.get_additional_inputs( - self.tracker.tracks, 
self.inputs, with_hm=True) - self.inputs['pre_hm'] = paddle.to_tensor(pre_hm) - - # model inference - det_outs = self.detector(self.inputs) - neck_feat = det_outs['neck_feat'] - result = self.plugin_head( - neck_feat, self.inputs, det_outs['bbox'], - det_outs['bbox_inds'], det_outs['topk_clses'], - det_outs['topk_ys'], det_outs['topk_xs']) - - if not self.deploy: - # convert the cropped and 4x downsampled output coordinate system - # back to the input image coordinate system - result = self.plugin_head.centertrack_post_process( - result, self.inputs, self.tracker.out_thresh) - return result - - def get_pred(self): - return self._forward() - - def get_loss(self): - return self._forward() - - def reset_tracking(self): - self.tracker.reset() - self.pre_image = None - - def get_additional_inputs(self, dets, meta, with_hm=True): - # Render input heatmap from previous trackings. - trans_input = meta['trans_input'][0].numpy() - inp_width, inp_height = int(meta['inp_width'][0]), int(meta[ - 'inp_height'][0]) - input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32) - - for det in dets: - if det['score'] < self.tracker.pre_thresh: - continue - bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width, - inp_height) - h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] - if (h > 0 and w > 0): - radius = gaussian_radius( - (math.ceil(h), math.ceil(w)), min_overlap=0.7) - radius = max(0, int(radius)) - ct = np.array( - [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], - dtype=np.float32) - ct_int = ct.astype(np.int32) - if with_hm: - input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int, - radius) - if with_hm: - input_hm = input_hm[np.newaxis] - return input_hm - - -def affine_transform_bbox(bbox, trans, width, height): - bbox = np.array(copy.deepcopy(bbox), dtype=np.float32) - bbox[:2] = affine_transform(bbox[:2], trans) - bbox[2:] = affine_transform(bbox[2:], trans) - bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1) - bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1) - return bbox diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/clrnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/clrnet.py deleted file mode 100644 index 8336fd8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/clrnet.py +++ /dev/null @@ -1,67 +0,0 @@ -from .meta_arch import BaseArch -from ppdet.core.workspace import register, create -from paddle import in_dynamic_mode - -__all__ = ['CLRNet'] - - -@register -class CLRNet(BaseArch): - __category__ = 'architecture' - - def __init__(self, - backbone="CLRResNet", - neck="CLRFPN", - clr_head="CLRHead", - post_process=None): - super(CLRNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.heads = clr_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - # head - kwargs = {'input_shape': neck.out_shape} - clr_head = create(cfg['clr_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'clr_head': clr_head, - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs['image']) - # neck - neck_feats = self.neck(body_feats) - # CLR Head - - if self.training: - output = self.heads(neck_feats, self.inputs) - else: - output = self.heads(neck_feats) - output = {'lanes': output} - # TODO: hard-coded fix for the as_lanes=False problem in clrnet_head.py's "get_lanes" function in static mode - if
in_dynamic_mode(): - output = self.heads.get_lanes(output['lanes']) - output = { - "lanes": output, - "img_path": self.inputs['full_img_path'], - "img_name": self.inputs['img_name'] - } - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/deepsort.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/deepsort.py deleted file mode 100644 index 164c279..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/deepsort.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box - -__all__ = ['DeepSORT'] - - -@register -class DeepSORT(BaseArch): - """ - DeepSORT network, see https://arxiv.org/abs/1703.07402 - - Args: - detector (object): detector model instance - reid (object): reid model instance - tracker (object): tracker instance - """ - __category__ = 'architecture' - - def __init__(self, - detector='YOLOv3', - reid='PCBPyramid', - tracker='DeepSORTTracker'): - super(DeepSORT, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - if cfg['detector'] != 'None': - detector = create(cfg['detector']) - else: - detector = None - reid = create(cfg['reid']) - tracker = create(cfg['tracker']) - - return { - "detector": detector, - "reid": reid, - "tracker": tracker, - } - - def _forward(self): - crops = self.inputs['crops'] - outs = {} - outs['embeddings'] = self.reid(crops) - return outs - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/detr.py deleted file mode 100644 index 085f63f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
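Every architecture removed in this diff wires its stages the same way: from_config builds the backbone, passes backbone.out_shape to the neck as input_shape, and passes the neck's out_shape (or the backbone's, when there is no neck) on to the head. A minimal, framework-free sketch of that convention follows; the Toy* classes are hypothetical stand-ins, not ppdet components.

    # Sketch of the backbone -> neck -> head wiring convention shared by the
    # architectures in this diff. ToyBackbone/ToyNeck/ToyHead are made up.
    class ToyBackbone:
        out_shape = [64, 128, 256]  # per-level channel counts

        def __call__(self, inputs):
            return ["C3", "C4", "C5"]  # placeholder multi-scale features

    class ToyNeck:
        def __init__(self, input_shape):
            self.out_shape = [96] * len(input_shape)

        def __call__(self, feats):
            return ["P3", "P4", "P5"]

    class ToyHead:
        def __init__(self, input_shape):
            self.input_shape = input_shape

    def from_config(cfg):
        # Each stage consumes the previous stage's out_shape as its
        # input_shape, mirroring the classmethods in this diff.
        backbone = cfg["backbone"]()
        neck = cfg["neck"](input_shape=backbone.out_shape) if cfg.get("neck") else None
        out_shape = neck.out_shape if neck else backbone.out_shape
        head = cfg["head"](input_shape=out_shape)
        return {"backbone": backbone, "neck": neck, "head": head}

    parts = from_config({"backbone": ToyBackbone, "neck": ToyNeck, "head": ToyHead})
    print(parts["head"].input_shape)  # [96, 96, 96]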
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from .meta_arch import BaseArch -from ppdet.core.workspace import register, create - -__all__ = ['DETR'] -# Deformable DETR, DINO use the same architecture as DETR - - -@register -class DETR(BaseArch): - __category__ = 'architecture' - __inject__ = ['post_process', 'post_process_semi'] - __shared__ = ['with_mask', 'exclude_post_process'] - - def __init__(self, - backbone, - transformer='DETRTransformer', - detr_head='DETRHead', - neck=None, - post_process='DETRPostProcess', - post_process_semi=None, - with_mask=False, - exclude_post_process=False): - super(DETR, self).__init__() - self.backbone = backbone - self.transformer = transformer - self.detr_head = detr_head - self.neck = neck - self.post_process = post_process - self.with_mask = with_mask - self.exclude_post_process = exclude_post_process - self.post_process_semi = post_process_semi - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # neck - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None - - # transformer - if neck is not None: - kwargs = {'input_shape': neck.out_shape} - transformer = create(cfg['transformer'], **kwargs) - # head - kwargs = { - 'hidden_dim': transformer.hidden_dim, - 'nhead': transformer.nhead, - 'input_shape': backbone.out_shape - } - detr_head = create(cfg['detr_head'], **kwargs) - - return { - 'backbone': backbone, - 'transformer': transformer, - "detr_head": detr_head, - "neck": neck - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs) - - # Neck - if self.neck is not None: - body_feats = self.neck(body_feats) - - # Transformer - pad_mask = self.inputs.get('pad_mask', None) - out_transformer = self.transformer(body_feats, pad_mask, self.inputs) - - # DETR Head - if self.training: - detr_losses = self.detr_head(out_transformer, body_feats, - self.inputs) - detr_losses.update({ - 'loss': paddle.add_n( - [v for k, v in detr_losses.items() if 'log' not in k]) - }) - return detr_losses - else: - preds = self.detr_head(out_transformer, body_feats) - if self.exclude_post_process: - bbox, bbox_num, mask = preds - else: - bbox, bbox_num, mask = self.post_process( - preds, self.inputs['im_shape'], self.inputs['scale_factor'], - paddle.shape(self.inputs['image'])[2:]) - - output = {'bbox': bbox, 'bbox_num': bbox_num} - if self.with_mask: - output['mask'] = mask - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr_ssod.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/detr_ssod.py deleted file mode 100644 index 567c234..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr_ssod.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from ppdet.core.workspace import register, create, merge_config -import paddle - -import numpy as np -import paddle -import paddle.nn.functional as F -from ppdet.core.workspace import register, create -from ppdet.utils.logger import setup_logger -from ppdet.modeling.ssod.utils import filter_invalid -from .multi_stream_detector import MultiSteamDetector -logger = setup_logger(__name__) - -__all__ = ['DETR_SSOD'] -__shared__ = ['num_classes'] - - -@register -class DETR_SSOD(MultiSteamDetector): - def __init__(self, - teacher, - student, - train_cfg=None, - test_cfg=None, - RTDETRTransformer=None, - num_classes=80): - super(DETR_SSOD, self).__init__( - dict( - teacher=teacher, student=student), - train_cfg=train_cfg, - test_cfg=test_cfg, ) - self.ema_start_iters = train_cfg['ema_start_iters'] - self.momentum = 0.9996 - self.cls_thr = None - self.cls_thr_ig = None - self.num_classes = num_classes - if train_cfg is not None: - self.freeze("teacher") - self.unsup_weight = self.train_cfg['unsup_weight'] - self.sup_weight = self.train_cfg['sup_weight'] - self._teacher = None - self._student = None - self._transformer = None - - @classmethod - def from_config(cls, cfg): - teacher = create(cfg['teacher']) - merge_config(cfg) - student = create(cfg['student']) - train_cfg = cfg['train_cfg'] - test_cfg = cfg['test_cfg'] - RTDETRTransformer = cfg['RTDETRTransformer'] - return { - 'teacher': teacher, - 'student': student, - 'train_cfg': train_cfg, - 'test_cfg': test_cfg, - 'RTDETRTransformer': RTDETRTransformer - } - - def forward_train(self, inputs, **kwargs): - if isinstance(inputs, dict): - iter_id = inputs['iter_id'] - elif isinstance(inputs, list): - iter_id = inputs[-1] - if iter_id == self.ema_start_iters: - self.update_ema_model(momentum=0) - elif iter_id > self.ema_start_iters: - self.update_ema_model(momentum=self.momentum) - if iter_id > self.ema_start_iters: - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs - - if data_sup_w['image'].shape != data_sup_s['image'].shape: - data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, - data_sup_s) - - if 'gt_bbox' in data_unsup_s.keys(): - del data_unsup_s['gt_bbox'] - if 'gt_class' in data_unsup_s.keys(): - del data_unsup_s['gt_class'] - if 'gt_class' in data_unsup_w.keys(): - del data_unsup_w['gt_class'] - if 'gt_bbox' in data_unsup_w.keys(): - del data_unsup_w['gt_bbox'] - for k, v in data_sup_s.items(): - if k in ['epoch_id']: - continue - elif k in ['gt_class', 'gt_bbox', 'is_crowd']: - data_sup_s[k].extend(data_sup_w[k]) - else: - data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) - - loss = {} - body_feats = self.student.backbone(data_sup_s) - if self.student.neck is not None: - body_feats = self.student.neck(body_feats) - out_transformer = self.student.transformer(body_feats, None, - data_sup_s) - sup_loss = self.student.detr_head(out_transformer, body_feats, - data_sup_s) - sup_loss.update({ - 'loss': paddle.add_n( - [v for k, v in sup_loss.items() if 'log' not in k]) - }) - sup_loss = {"sup_" + k: v for k, v in sup_loss.items()} - - loss.update(**sup_loss) - unsup_loss = self.foward_unsup_train(data_unsup_w, data_unsup_s) - unsup_loss.update({ - 'loss': paddle.add_n( - [v for k, v in unsup_loss.items() if 'log' not in k]) - }) - unsup_loss = {"unsup_" + k: v for k, v in unsup_loss.items()} - 
unsup_loss.update({ - 'loss': paddle.add_n( - [v for k, v in unsup_loss.items() if 'log' not in k]) - }) - loss.update(**unsup_loss) - loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']}) - else: - if iter_id == self.ema_start_iters: - logger.info("start semi_supervised_training") - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs - - if data_sup_w['image'].shape != data_sup_s['image'].shape: - data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, - data_sup_s) - for k, v in data_sup_s.items(): - if k in ['epoch_id']: - continue - elif k in ['gt_class', 'gt_bbox', 'is_crowd']: - data_sup_s[k].extend(data_sup_w[k]) - else: - data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) - loss = {} - sup_loss = self.student(data_sup_s) - unsup_loss = { - "unsup_" + k: v * paddle.to_tensor(0) - for k, v in sup_loss.items() - } - sup_loss = {"sup_" + k: v for k, v in sup_loss.items()} - loss.update(**sup_loss) - unsup_loss.update({ - 'loss': paddle.add_n( - [v * 0 for k, v in sup_loss.items() if 'log' not in k]) - }) - unsup_loss = {"unsup_" + k: v * 0 for k, v in unsup_loss.items()} - loss.update(**unsup_loss) - loss.update({'loss': loss['sup_loss']}) - return loss - - def foward_unsup_train(self, data_unsup_w, data_unsup_s): - - with paddle.no_grad(): - body_feats = self.teacher.backbone(data_unsup_w) - if self.teacher.neck is not None: - body_feats = self.teacher.neck(body_feats, is_teacher=True) - out_transformer = self.teacher.transformer( - body_feats, None, data_unsup_w, is_teacher=True) - preds = self.teacher.detr_head(out_transformer, body_feats) - bbox, bbox_num = self.teacher.post_process_semi(preds) - self.place = body_feats[0].place - - proposal_bbox_list = bbox[:, -4:] - proposal_bbox_list = proposal_bbox_list.split( - tuple(np.array(bbox_num)), 0) - - proposal_label_list = paddle.cast(bbox[:, :1], np.float32) - proposal_label_list = proposal_label_list.split( - tuple(np.array(bbox_num)), 0) - proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1], - np.float32) - proposal_score_list = proposal_score_list.split( - tuple(np.array(bbox_num)), 0) - proposal_bbox_list = [ - paddle.to_tensor( - p, place=self.place) for p in proposal_bbox_list - ] - proposal_label_list = [ - paddle.to_tensor( - p, place=self.place) for p in proposal_label_list - ] - # filter invalid box roughly - if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float): - thr = self.train_cfg['pseudo_label_initial_score_thr'] - else: - # TODO: use dynamic threshold - raise NotImplementedError( - "Dynamic Threshold is not implemented yet.") - proposal_bbox_list, proposal_label_list, proposal_score_list = list( - zip(* [ - filter_invalid( - proposal[:, :4], - proposal_label, - proposal_score, - thr=thr, - min_size=self.train_cfg['min_pseduo_box_size'], ) - for proposal, proposal_label, proposal_score in - zip(proposal_bbox_list, proposal_label_list, - proposal_score_list) - ])) - - teacher_bboxes = list(proposal_bbox_list) - teacher_labels = proposal_label_list - teacher_info = [teacher_bboxes, teacher_labels] - student_unsup = data_unsup_s - return self.compute_pseudo_label_loss(student_unsup, teacher_info, - proposal_score_list) - - def compute_pseudo_label_loss(self, student_unsup, teacher_info, - proposal_score_list): - - pseudo_bboxes = list(teacher_info[0]) - pseudo_labels = list(teacher_info[1]) - losses = dict() - for i in range(len(pseudo_bboxes)): - if pseudo_labels[i].shape[0] == 0: - pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy() - pseudo_labels[i] = paddle.zeros([0,
1]).numpy() - else: - pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy() - pseudo_labels[i] = pseudo_labels[i].numpy() - for i in range(len(pseudo_bboxes)): - pseudo_labels[i] = paddle.to_tensor( - pseudo_labels[i], dtype=paddle.int32, place=self.place) - pseudo_bboxes[i] = paddle.to_tensor( - pseudo_bboxes[i], dtype=paddle.float32, place=self.place) - student_unsup.update({ - 'gt_bbox': pseudo_bboxes, - 'gt_class': pseudo_labels - }) - pseudo_sum = 0 - for i in range(len(pseudo_bboxes)): - pseudo_sum += pseudo_bboxes[i].sum() - if pseudo_sum == 0: #input fake data when there are no pseudo labels - pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5 - pseudo_labels[0] = paddle.ones([1, 1]).astype('int32') - student_unsup.update({ - 'gt_bbox': pseudo_bboxes, - 'gt_class': pseudo_labels - }) - body_feats = self.student.backbone(student_unsup) - if self.student.neck is not None: - body_feats = self.student.neck(body_feats) - out_transformer = self.student.transformer(body_feats, None, - student_unsup) - losses = self.student.detr_head(out_transformer, body_feats, - student_unsup) - for n, v in losses.items(): - losses[n] = v * 0 - else: - gt_bbox = [] - gt_class = [] - images = [] - proposal_score = [] - for i in range(len(pseudo_bboxes)): - if pseudo_labels[i].shape[0] == 0: - continue - else: - proposal_score.append(proposal_score_list[i].max(-1) - .unsqueeze(-1)) - gt_class.append(pseudo_labels[i]) - gt_bbox.append(pseudo_bboxes[i]) - images.append(student_unsup['image'][i]) - images = paddle.stack(images) - student_unsup.update({ - 'image': images, - 'gt_bbox': gt_bbox, - 'gt_class': gt_class - }) - body_feats = self.student.backbone(student_unsup) - if self.student.neck is not None: - body_feats = self.student.neck(body_feats) - out_transformer = self.student.transformer(body_feats, None, - student_unsup) - student_unsup.update({'gt_score': proposal_score}) - losses = self.student.detr_head(out_transformer, body_feats, - student_unsup) - return losses - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] - return paddle.stack(b, axis=-1) - - -def box_xyxy_to_cxcywh(x): - x0, y0, x1, y1 = x.unbind(-1) - b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] - return paddle.stack(b, axis=-1) - - -def get_size_with_aspect_ratio(image_size, size, max_size=None): - w, h = image_size - if max_size is not None: - min_original_size = float(min((w, h))) - max_original_size = float(max((w, h))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if (w <= h and w == size) or (h <= w and h == size): - return (w, h) - - if w < h: - ow = size - oh = int(size * h / w) - else: - oh = size - ow = int(size * w / h) - - return (ow, oh) - - -def align_weak_strong_shape(data_weak, data_strong): - shape_x = data_strong['image'].shape[2] - shape_y = data_strong['image'].shape[3] - - target_size = [shape_x, shape_y] - data_weak['image'] = F.interpolate( - data_weak['image'], - size=target_size, - mode='bilinear', - align_corners=False) - return data_weak, data_strong diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/fairmot.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/fairmot.py deleted file mode 100644 index 2714508..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/fairmot.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['FairMOT'] - - -@register -class FairMOT(BaseArch): - """ - FairMOT network, see http://arxiv.org/abs/2004.01888 - - Args: - detector (object): 'CenterNet' instance - reid (object): 'FairMOTEmbeddingHead' instance - tracker (object): 'JDETracker' instance - loss (object): 'FairMOTLoss' instance - - """ - - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, - detector='CenterNet', - reid='FairMOTEmbeddingHead', - tracker='JDETracker', - loss='FairMOTLoss'): - super(FairMOT, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - self.loss = loss - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape - - kwargs = {'input_shape': detector_out_shape} - reid = create(cfg['reid'], **kwargs) - loss = create(cfg['loss']) - tracker = create(cfg['tracker']) - - return { - 'detector': detector, - 'reid': reid, - 'loss': loss, - 'tracker': tracker - } - - def _forward(self): - loss = dict() - # det_outs keys: - # train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss) - # eval/infer: neck_feat, bbox, bbox_inds - det_outs = self.detector(self.inputs) - neck_feat = det_outs['neck_feat'] - if self.training: - reid_loss = self.reid(neck_feat, self.inputs) - - det_loss = det_outs['det_loss'] - loss = self.loss(det_loss, reid_loss) - for k, v in det_outs.items(): - if 'loss' not in k: - continue - loss.update({k: v}) - loss.update({'reid_loss': reid_loss}) - return loss - else: - pred_dets, pred_embs = self.reid( - neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'], - det_outs['topk_clses']) - return pred_dets, pred_embs - - def get_pred(self): - output = self._forward() - return output - - def get_loss(self): - loss = self._forward() - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/faster_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/faster_rcnn.py deleted file mode 100644 index 93fd0f9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/faster_rcnn.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import numpy as np - -__all__ = ['FasterRCNN'] - - -@register -class FasterRCNN(BaseArch): - """ - Faster R-CNN network, see https://arxiv.org/abs/1506.01497 - - Args: - backbone (object): backbone instance - rpn_head (object): `RPNHead` instance - bbox_head (object): `BBoxHead` instance - bbox_post_process (object): `BBoxPostProcess` instance - neck (object): 'FPN' instance - """ - __category__ = 'architecture' - __inject__ = ['bbox_post_process'] - - def __init__(self, - backbone, - rpn_head, - bbox_head, - bbox_post_process, - neck=None): - super(FasterRCNN, self).__init__() - self.backbone = backbone - self.neck = neck - self.rpn_head = rpn_head - self.bbox_head = bbox_head - self.bbox_post_process = bbox_post_process - - def init_cot_head(self, relationship): - self.bbox_head.init_cot_head(relationship) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - bbox_head = create(cfg['bbox_head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "rpn_head": rpn_head, - "bbox_head": bbox_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - if self.training: - rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) - bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num, - self.inputs) - return rpn_loss, bbox_loss - else: - rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) - preds, _ = self.bbox_head(body_feats, rois, rois_num, None) - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - bbox, bbox_num, nms_keep_idx = self.bbox_post_process( - preds, (rois, rois_num), im_shape, scale_factor) - - # rescale the prediction back to origin image - bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred( - bbox, bbox_num, im_shape, scale_factor) - - if self.use_extra_data: - extra_data = { - } # record the bbox output before nms, such as scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = preds[1] # predict scores (probability) - # TODO: get logits output - extra_data[ - 'nms_keep_idx'] = nms_keep_idx # bbox index before nms - return bbox_pred, bbox_num, extra_data - else: - return bbox_pred, bbox_num - - def get_loss(self, ): - rpn_loss, bbox_loss = self._forward() - loss = {} - loss.update(rpn_loss) - loss.update(bbox_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, extra_data = self._forward() - output = { - 'bbox': bbox_pred, - 'bbox_num': bbox_num, - 'extra_data': extra_data - } - else: - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - - def target_bbox_forward(self, data): - body_feats = self.backbone(data) - if self.neck is not None: - body_feats
= self.neck(body_feats) - rois = [roi for roi in data['gt_bbox']] - rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois]) - - preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True) - return preds - - def relationship_learning(self, loader, num_classes_novel): - print('computing relationship') - train_labels_list = [] - label_list = [] - - for step_id, data in enumerate(loader): - _, bbox_prob = self.target_bbox_forward(data) - batch_size = data['im_id'].shape[0] - for i in range(batch_size): - num_bbox = data['gt_class'][i].shape[0] - train_labels = data['gt_class'][i] - train_labels_list.append(train_labels.numpy().squeeze(1)) - base_labels = bbox_prob.detach().numpy()[:, :-1] - label_list.append(base_labels) - - labels = np.concatenate(train_labels_list, 0) - probabilities = np.concatenate(label_list, 0) - N_t = np.max(labels) + 1 - conditional = [] - for i in range(N_t): - this_class = probabilities[labels == i] - average = np.mean(this_class, axis=0, keepdims=True) - conditional.append(average) - return np.concatenate(conditional) diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/fcos.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/fcos.py deleted file mode 100644 index 8c338ca..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/fcos.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
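The relationship_learning method above boils down to a per-class average: for each novel-class label it averages the base-class probability rows of the boxes carrying that label, then stacks the results. A toy numpy sketch of just that reduction, with made-up shapes and data rather than the ppdet tensors:

    import numpy as np

    labels = np.array([0, 1, 0, 2, 1])    # novel-class label per box
    probabilities = np.random.rand(5, 4)  # base-class probabilities per box
    probabilities /= probabilities.sum(axis=1, keepdims=True)

    # One averaged base-class distribution per novel class, then stacked.
    conditional = [
        probabilities[labels == i].mean(axis=0, keepdims=True)
        for i in range(labels.max() + 1)
    ]
    relationship = np.concatenate(conditional)
    print(relationship.shape)  # (3, 4): [num_novel_classes, num_base_classes]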
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['FCOS', 'ARSL_FCOS'] - - -@register -class FCOS(BaseArch): - """ - FCOS network, see https://arxiv.org/abs/1904.01355 - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - fcos_head (object): 'FCOSHead' instance - ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det (SSOD) by DenseTeacher - """ - - __category__ = 'architecture' - __inject__ = ['ssod_loss'] - - def __init__(self, - backbone='ResNet', - neck='FPN', - fcos_head='FCOSHead', - ssod_loss='SSODFCOSLoss'): - super(FCOS, self).__init__() - self.backbone = backbone - self.neck = neck - self.fcos_head = fcos_head - - # for ssod, semi-det - self.is_teacher = False - self.ssod_loss = ssod_loss - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - fcos_head = create(cfg['fcos_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "fcos_head": fcos_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - - self.is_teacher = self.inputs.get('is_teacher', False) - if self.training or self.is_teacher: - losses = self.fcos_head(fpn_feats, self.inputs) - return losses - else: - fcos_head_outs = self.fcos_head(fpn_feats) - bbox_pred, bbox_num = self.fcos_head.post_process( - fcos_head_outs, self.inputs['scale_factor']) - return {'bbox': bbox_pred, 'bbox_num': bbox_num} - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - - def get_loss_keys(self): - return ['loss_cls', 'loss_box', 'loss_quality'] - - def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): - ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, - train_cfg) - return ssod_losses - - -@register -class ARSL_FCOS(BaseArch): - """ - FCOS ARSL network, see https://arxiv.org/abs/ - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - fcos_head (object): 'FCOSHead_ARSL' instance - fcos_cr_loss (object): 'FCOSLossCR' instance, only used for semi-det (SSOD) by ARSL - """ - - __category__ = 'architecture' - __inject__ = ['fcos_cr_loss'] - - def __init__(self, - backbone, - neck, - fcos_head='FCOSHead_ARSL', - fcos_cr_loss='FCOSLossCR'): - super(ARSL_FCOS, self).__init__() - self.backbone = backbone - self.neck = neck - self.fcos_head = fcos_head - self.fcos_cr_loss = fcos_cr_loss - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - fcos_head = create(cfg['fcos_head'], **kwargs) - - # consistency regularization loss - fcos_cr_loss = create(cfg['fcos_cr_loss']) - - return { - 'backbone': backbone, - 'neck': neck, - 'fcos_head': fcos_head, - 'fcos_cr_loss': fcos_cr_loss, - } - - def forward(self, inputs, branch="supervised", teacher_prediction=None): - assert branch in ['supervised', 'semi_supervised'], \ - 'In ARSL, branch must be supervised or semi_supervised.' - - if self.data_format == 'NHWC': - image = inputs['image'] - inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) -
self.inputs = inputs - - if self.training: - if branch == "supervised": - out = self.get_loss() - else: - out = self.get_pseudo_loss(teacher_prediction) - else: - # norm test - if branch == "supervised": - out = self.get_pred() - # predict pseudo labels - else: - out = self.get_pseudo_pred() - return out - - # model forward - def model_forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - fcos_head_outs = self.fcos_head(fpn_feats) - return fcos_head_outs - - # supervised loss for labeled data - def get_loss(self): - loss = {} - tag_labels, tag_bboxes, tag_centerness = [], [], [] - for i in range(len(self.fcos_head.fpn_stride)): - # labels, reg_target, centerness - k_lbl = 'labels{}'.format(i) - if k_lbl in self.inputs: - tag_labels.append(self.inputs[k_lbl]) - k_box = 'reg_target{}'.format(i) - if k_box in self.inputs: - tag_bboxes.append(self.inputs[k_box]) - k_ctn = 'centerness{}'.format(i) - if k_ctn in self.inputs: - tag_centerness.append(self.inputs[k_ctn]) - fcos_head_outs = self.model_forward() - loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels, - tag_bboxes, tag_centerness) - loss.update(loss_fcos) - return loss - - # unsupervised loss for unlabeled data - def get_pseudo_loss(self, teacher_prediction): - loss = {} - fcos_head_outs = self.model_forward() - unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction) - for k in unsup_loss.keys(): - loss[k + '_pseudo'] = unsup_loss[k] - return loss - - # get detection results for test, decode and rescale the results to original size - def get_pred(self): - fcos_head_outs = self.model_forward() - scale_factor = self.inputs['scale_factor'] - bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs, - scale_factor) - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - - # generate pseudo labels to guide student - def get_pseudo_pred(self): - fcos_head_outs = self.model_forward() - pred_cls, pred_loc, pred_iou = fcos_head_outs[1:] # 0 is locations - for lvl, _ in enumerate(pred_loc): - pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl] - - return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride] diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/gfl.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/gfl.py deleted file mode 100644 index 91c1307..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/gfl.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
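GFL below aggregates its losses with the same dict convention used by FasterRCNN and CascadeRCNN above: each component loss is stored under its own key, and 'loss' is their sum via paddle.add_n. A minimal sketch of that convention with dummy one-element tensors:

    import paddle

    loss = {
        "loss_cls": paddle.to_tensor([0.7]),
        "loss_box": paddle.to_tensor([1.2]),
    }
    # Sum the component losses first, then record the total under 'loss'.
    loss["loss"] = paddle.add_n(list(loss.values()))
    print(float(loss["loss"]))  # 1.9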
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['GFL'] - - -@register -class GFL(BaseArch): - """ - Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - head (object): 'GFLHead' instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, neck, head='GFLHead'): - super(GFL, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats) - if not self.training: - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - bboxes, bbox_num = self.head.post_process(head_outs, im_shape, - scale_factor) - return bboxes, bbox_num - else: - return head_outs - - def get_loss(self, ): - loss = {} - - head_outs = self._forward() - loss_gfl = self.head.get_loss(head_outs, self.inputs) - loss.update(loss_gfl) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/jde.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/jde.py deleted file mode 100644 index 11b45c8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/jde.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['JDE'] - - -@register -class JDE(BaseArch): - __category__ = 'architecture' - __shared__ = ['metric'] - """ - JDE network, see https://arxiv.org/abs/1909.12605v1 - - Args: - detector (object): detector model instance - reid (object): reid model instance - tracker (object): tracker instance - metric (str): 'MOTDet' for training and detection evaluation, 'ReID' - for ReID embedding evaluation, or 'MOT' for multi object tracking - evaluation. 
- """ - - def __init__(self, - detector='YOLOv3', - reid='JDEEmbeddingHead', - tracker='JDETracker', - metric='MOT'): - super(JDE, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - self.metric = metric - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - kwargs = {'input_shape': detector.neck.out_shape} - - reid = create(cfg['reid'], **kwargs) - - tracker = create(cfg['tracker']) - - return { - "detector": detector, - "reid": reid, - "tracker": tracker, - } - - def _forward(self): - det_outs = self.detector(self.inputs) - - if self.training: - emb_feats = det_outs['emb_feats'] - loss_confs = det_outs['det_losses']['loss_confs'] - loss_boxes = det_outs['det_losses']['loss_boxes'] - jde_losses = self.reid( - emb_feats, - self.inputs, - loss_confs=loss_confs, - loss_boxes=loss_boxes) - return jde_losses - else: - if self.metric == 'MOTDet': - det_results = { - 'bbox': det_outs['bbox'], - 'bbox_num': det_outs['bbox_num'], - } - return det_results - - elif self.metric == 'MOT': - emb_feats = det_outs['emb_feats'] - bboxes = det_outs['bbox'] - boxes_idx = det_outs['boxes_idx'] - nms_keep_idx = det_outs['nms_keep_idx'] - - pred_dets, pred_embs = self.reid( - emb_feats, - self.inputs, - bboxes=bboxes, - boxes_idx=boxes_idx, - nms_keep_idx=nms_keep_idx) - return pred_dets, pred_embs - - else: - raise ValueError("Unknown metric {} for multi object tracking.". - format(self.metric)) - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrhrnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrhrnet.py deleted file mode 100644 index 366e9e3..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrhrnet.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from scipy.optimize import linear_sum_assignment -from collections import abc, defaultdict -import numpy as np -import paddle - -from ppdet.core.workspace import register, create, serializable -from .meta_arch import BaseArch -from .. 
import layers as L -from ..keypoint_utils import transpred - -__all__ = ['HigherHRNet'] - - -@register -class HigherHRNet(BaseArch): - __category__ = 'architecture' - - def __init__(self, - backbone='HRNet', - hrhrnet_head='HrHRNetHead', - post_process='HrHRNetPostProcess', - eval_flip=True, - flip_perm=None, - max_num_people=30): - """ - HigherHRNet network, see https://arxiv.org/abs/1908.10357; - HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175 - - Args: - backbone (nn.Layer): backbone instance - hrhrnet_head (nn.Layer): keypoint_head instance - bbox_post_process (object): `BBoxPostProcess` instance - """ - super(HigherHRNet, self).__init__() - self.backbone = backbone - self.hrhrnet_head = hrhrnet_head - self.post_process = post_process - self.flip = eval_flip - self.flip_perm = paddle.to_tensor(flip_perm) - self.deploy = False - self.interpolate = L.Upsample(2, mode='bilinear') - self.pool = L.MaxPool(5, 1, 2) - self.max_num_people = max_num_people - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # head - kwargs = {'input_shape': backbone.out_shape} - hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs) - post_process = create(cfg['post_process']) - - return { - 'backbone': backbone, - "hrhrnet_head": hrhrnet_head, - "post_process": post_process, - } - - def _forward(self): - if self.flip and not self.training and not self.deploy: - self.inputs['image'] = paddle.concat( - (self.inputs['image'], paddle.flip(self.inputs['image'], [3]))) - body_feats = self.backbone(self.inputs) - - if self.training: - return self.hrhrnet_head(body_feats, self.inputs) - else: - outputs = self.hrhrnet_head(body_feats) - - if self.flip and not self.deploy: - outputs = [paddle.split(o, 2) for o in outputs] - output_rflip = [ - paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3]) - for o in outputs - ] - output1 = [o[0] for o in outputs] - heatmap = (output1[0] + output_rflip[0]) / 2. 
- tagmaps = [output1[1], output_rflip[1]] - outputs = [heatmap] + tagmaps - outputs = self.get_topk(outputs) - - if self.deploy: - return outputs - - res_lst = [] - h = self.inputs['im_shape'][0, 0].numpy().item() - w = self.inputs['im_shape'][0, 1].numpy().item() - kpts, scores = self.post_process(*outputs, h, w) - res_lst.append([kpts, scores]) - return res_lst - - def get_loss(self): - return self._forward() - - def get_pred(self): - outputs = {} - res_lst = self._forward() - outputs['keypoint'] = res_lst - return outputs - - def get_topk(self, outputs): - # resize to image size - outputs = [self.interpolate(x) for x in outputs] - if len(outputs) == 3: - tagmap = paddle.concat( - (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4) - else: - tagmap = outputs[1].unsqueeze(4) - - heatmap = outputs[0] - N, J = 1, self.hrhrnet_head.num_joints - heatmap_maxpool = self.pool(heatmap) - # topk - maxmap = heatmap * (heatmap == heatmap_maxpool) - maxmap = maxmap.reshape([N, J, -1]) - heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2) - - outputs = [heatmap, tagmap, heat_k, inds_k] - return outputs - - -@register -@serializable -class HrHRNetPostProcess(object): - ''' - HrHRNet postprocess contains: - 1) get topk keypoints in the output heatmap - 2) sample the tagmap's value corresponding to each of the topk coordinates - 3) match joints of different types into people with the Hungarian algorithm - 4) adjust each coordinate by +-0.25 to decrease the error std - 5) salvage missing joints by checking positivity of heatmap - tagdiff_norm - Args: - max_num_people (int): max number of people supported in postprocess - heat_thresh (float): topk values below this threshold will be ignored - tag_thresh (float): coords whose tag values sampled from the tagmap are within this threshold belong to the same person at init - - inputs(list[heatmap]): the output list of the model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk - original_height, original_width (float): the original image size - ''' - - def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.): - self.max_num_people = max_num_people - self.heat_thresh = heat_thresh - self.tag_thresh = tag_thresh - - def lerp(self, j, y, x, heatmap): - H, W = heatmap.shape[-2:] - left = np.clip(x - 1, 0, W - 1) - right = np.clip(x + 1, 0, W - 1) - up = np.clip(y - 1, 0, H - 1) - down = np.clip(y + 1, 0, H - 1) - offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, - -0.25) - offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, - -0.25) - return offset_y + 0.5, offset_x + 0.5 - - def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, - original_width): - - N, J, H, W = heatmap.shape - assert N == 1, "only support batch size 1" - heatmap = heatmap[0].cpu().detach().numpy() - tagmap = tagmap[0].cpu().detach().numpy() - heats = heat_k[0].cpu().detach().numpy() - inds_np = inds_k[0].cpu().detach().numpy() - y = inds_np // W - x = inds_np % W - tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), - y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1]) - coords = np.stack((y, x), axis=2) - # threshold - mask = heats > self.heat_thresh - # cluster - cluster = defaultdict(lambda: { - 'coords': np.zeros((J, 2), dtype=np.float32), - 'scores': np.zeros(J, dtype=np.float32), - 'tags': [] - }) - for jid, m in enumerate(mask): - num_valid = m.sum() - if num_valid == 0: - continue - valid_inds = np.where(m)[0] - valid_tags = tags[jid, m, :] - if len(cluster) == 0: # initialize - for i in valid_inds: - tag =
tags[jid, i] - key = tag[0] - cluster[key]['tags'].append(tag) - cluster[key]['scores'][jid] = heats[jid, i] - cluster[key]['coords'][jid] = coords[jid, i] - continue - candidates = list(cluster.keys())[:self.max_num_people] - centroids = [ - np.mean( - cluster[k]['tags'], axis=0) for k in candidates - ] - num_clusters = len(centroids) - # shape is (num_valid, num_clusters, tag_dim) - dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] - l2_dist = np.linalg.norm(dist, ord=2, axis=2) - # modulate dist with heat value, see `use_detection_val` - cost = np.round(l2_dist) * 100 - heats[jid, m, None] - # pad the cost matrix, otherwise new pose are ignored - if num_valid > num_clusters: - cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)), - 'constant', - constant_values=((0, 0), (0, 1e-10))) - rows, cols = linear_sum_assignment(cost) - for y, x in zip(rows, cols): - tag = tags[jid, y] - if y < num_valid and x < num_clusters and \ - l2_dist[y, x] < self.tag_thresh: - key = candidates[x] # merge to cluster - else: - key = tag[0] # initialize new cluster - cluster[key]['tags'].append(tag) - cluster[key]['scores'][jid] = heats[jid, y] - cluster[key]['coords'][jid] = coords[jid, y] - - # shape is [k, J, 2] and [k, J] - pose_tags = np.array([cluster[k]['tags'] for k in cluster]) - pose_coords = np.array([cluster[k]['coords'] for k in cluster]) - pose_scores = np.array([cluster[k]['scores'] for k in cluster]) - valid = pose_scores > 0 - - pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32) - if valid.sum() == 0: - return pose_kpts, pose_kpts - - # refine coords - valid_coords = pose_coords[valid].astype(np.int32) - y = valid_coords[..., 0].flatten() - x = valid_coords[..., 1].flatten() - _, j = np.nonzero(valid) - offsets = self.lerp(j, y, x, heatmap) - pose_coords[valid, 0] += offsets[0] - pose_coords[valid, 1] += offsets[1] - - # mean score before salvage - mean_score = pose_scores.mean(axis=1) - pose_kpts[valid, 2] = pose_scores[valid] - - # salvage missing joints - if True: - for pid, coords in enumerate(pose_coords): - tag_mean = np.array(pose_tags[pid]).mean(axis=0) - norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5 - score = heatmap - np.round(norm) # (J, H, W) - flat_score = score.reshape(J, -1) - max_inds = np.argmax(flat_score, axis=1) - max_scores = np.max(flat_score, axis=1) - salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0) - if salvage_joints.sum() == 0: - continue - y = max_inds[salvage_joints] // W - x = max_inds[salvage_joints] % W - offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap) - y = y.astype(np.float32) + offsets[0] - x = x.astype(np.float32) + offsets[1] - pose_coords[pid][salvage_joints, 0] = y - pose_coords[pid][salvage_joints, 1] = x - pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] - pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], - original_height, original_width, - min(H, W)) - return pose_kpts, mean_score diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrnet.py deleted file mode 100644 index 8d50502..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrnet.py +++ /dev/null @@ -1,468 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import numpy as np -import math -import cv2 -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from ..keypoint_utils import transform_preds -from .. import layers as L -from paddle.nn import functional as F - -__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet'] - - -@register -class TopDownHRNet(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, - width, - num_joints, - backbone='HRNet', - loss='KeyPointMSELoss', - post_process='HRNetPostProcess', - flip_perm=None, - flip=True, - shift_heatmap=True, - use_dark=True): - """ - HRNet network, see https://arxiv.org/abs/1902.09212 - - Args: - backbone (nn.Layer): backbone instance - post_process (object): `HRNetPostProcess` instance - flip_perm (list): The left-right joints exchange order list - use_dark(bool): Whether to use DARK in post processing - """ - super(TopDownHRNet, self).__init__() - self.backbone = backbone - self.post_process = HRNetPostProcess(use_dark) - self.loss = loss - self.flip_perm = flip_perm - self.flip = flip - self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) - self.shift_heatmap = shift_heatmap - self.deploy = False - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - return {'backbone': backbone, } - - def _forward(self): - feats = self.backbone(self.inputs) - hrnet_outputs = self.final_conv(feats[0]) - - if self.training: - return self.loss(hrnet_outputs, self.inputs) - elif self.deploy: - outshape = hrnet_outputs.shape - max_idx = paddle.argmax( - hrnet_outputs.reshape( - (outshape[0], outshape[1], outshape[2] * outshape[3])), - axis=-1) - return hrnet_outputs, max_idx - else: - if self.flip: - self.inputs['image'] = self.inputs['image'].flip([3]) - feats = self.backbone(self.inputs) - output_flipped = self.final_conv(feats[0]) - output_flipped = self.flip_back(output_flipped.numpy(), - self.flip_perm) - output_flipped = paddle.to_tensor(output_flipped.copy()) - if self.shift_heatmap: - output_flipped[:, :, :, 1:] = output_flipped.clone( - )[:, :, :, 0:-1] - hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5 - imshape = (self.inputs['im_shape'].numpy() - )[:, ::-1] if 'im_shape' in self.inputs else None - center = self.inputs['center'].numpy( - ) if 'center' in self.inputs else np.round(imshape / 2.) - scale = self.inputs['scale'].numpy( - ) if 'scale' in self.inputs else imshape / 200. 
- outputs = self.post_process(hrnet_outputs, center, scale) - return outputs - - def get_loss(self): - return self._forward() - - def get_pred(self): - res_lst = self._forward() - outputs = {'keypoint': res_lst} - return outputs - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4,\ - 'output_flipped should be [batch_size, num_joints, height, width]' - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped - - -class HRNetPostProcess(object): - def __init__(self, use_dark=True): - self.use_dark = use_dark - - def get_max_preds(self, heatmaps): - '''get predictions from score maps - - Args: - heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - ''' - assert isinstance(heatmaps, - np.ndarray), 'heatmaps should be numpy.ndarray' - assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' - - batch_size = heatmaps.shape[0] - num_joints = heatmaps.shape[1] - width = heatmaps.shape[3] - heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) - idx = np.argmax(heatmaps_reshaped, 2) - maxvals = np.amax(heatmaps_reshaped, 2) - - maxvals = maxvals.reshape((batch_size, num_joints, 1)) - idx = idx.reshape((batch_size, num_joints, 1)) - - preds = np.tile(idx, (1, 1, 2)).astype(np.float32) - - preds[:, :, 0] = (preds[:, :, 0]) % width - preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) - - pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) - pred_mask = pred_mask.astype(np.float32) - - preds *= pred_mask - - return preds, maxvals - - def gaussian_blur(self, heatmap, kernel): - border = (kernel - 1) // 2 - batch_size = heatmap.shape[0] - num_joints = heatmap.shape[1] - height = heatmap.shape[2] - width = heatmap.shape[3] - for i in range(batch_size): - for j in range(num_joints): - origin_max = np.max(heatmap[i, j]) - dr = np.zeros((height + 2 * border, width + 2 * border)) - dr[border:-border, border:-border] = heatmap[i, j].copy() - dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) - heatmap[i, j] = dr[border:-border, border:-border].copy() - heatmap[i, j] *= origin_max / np.max(heatmap[i, j]) - return heatmap - - def dark_parse(self, hm, coord): - heatmap_height = hm.shape[0] - heatmap_width = hm.shape[1] - px = int(coord[0]) - py = int(coord[1]) - if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2: - dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) - dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) - dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) - dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \ - + hm[py-1][px-1]) - dyy = 0.25 * ( - hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) - derivative = np.matrix([[dx], [dy]]) - hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) - if dxx * dyy - dxy**2 != 0: - hessianinv = hessian.I - offset = -hessianinv * derivative - offset = np.squeeze(np.array(offset.T), axis=0) - coord += offset - return coord - - def dark_postprocess(self, hm, coords, kernelsize): - '''DARK postprocessing, Zhang et al. Distribution-Aware Coordinate - Representation for Human Pose Estimation (CVPR 2020).
- ''' - - hm = self.gaussian_blur(hm, kernelsize) - hm = np.maximum(hm, 1e-10) - hm = np.log(hm) - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - coords[n, p] = self.dark_parse(hm[n][p], coords[n][p]) - return coords - - def get_final_preds(self, heatmaps, center, scale, kernelsize=3): - """the highest heatvalue location with a quarter offset in the - direction from the highest response to the second highest response. - - Args: - heatmaps (numpy.ndarray): The predicted heatmaps - center (numpy.ndarray): The boxes center - scale (numpy.ndarray): The scale factor - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - """ - coords, maxvals = self.get_max_preds(heatmaps) - - heatmap_height = heatmaps.shape[2] - heatmap_width = heatmaps.shape[3] - - if self.use_dark: - coords = self.dark_postprocess(heatmaps, coords, kernelsize) - else: - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - hm = heatmaps[n][p] - px = int(math.floor(coords[n][p][0] + 0.5)) - py = int(math.floor(coords[n][p][1] + 0.5)) - if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1: - diff = np.array([ - hm[py][px + 1] - hm[py][px - 1], - hm[py + 1][px] - hm[py - 1][px] - ]) - coords[n][p] += np.sign(diff) * .25 - preds = coords.copy() - - # Transform back - for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], - [heatmap_width, heatmap_height]) - - return preds, maxvals - - def __call__(self, output, center, scale): - preds, maxvals = self.get_final_preds(output.numpy(), center, scale) - outputs = [[ - np.concatenate( - (preds, maxvals), axis=-1), np.mean( - maxvals, axis=1) - ]] - return outputs - - -class TinyPose3DPostProcess(object): - def __init__(self): - pass - - def __call__(self, output, center, scale): - """ - Args: - output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords - scale (numpy.ndarray): The scale factor - Returns: - preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords - """ - - preds = output.numpy().copy() - - # Transform back - for i in range(output.shape[0]): # batch_size - preds[i][:, 0] = preds[i][:, 0] * scale[i][0] - preds[i][:, 1] = preds[i][:, 1] * scale[i][1] - - return preds - - -def soft_argmax(heatmaps, joint_num): - dims = heatmaps.shape - depth_dim = (int)(dims[1] / joint_num) - heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3])) - heatmaps = F.softmax(heatmaps, 2) - heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3])) - - accu_x = heatmaps.sum(axis=(2, 3)) - accu_y = heatmaps.sum(axis=(2, 4)) - accu_z = heatmaps.sum(axis=(3, 4)) - - accu_x = accu_x * paddle.arange(1, 33) - accu_y = accu_y * paddle.arange(1, 33) - accu_z = accu_z * paddle.arange(1, 33) - - accu_x = accu_x.sum(axis=2, keepdim=True) - 1 - accu_y = accu_y.sum(axis=2, keepdim=True) - 1 - accu_z = accu_z.sum(axis=2, keepdim=True) - 1 - - coord_out = paddle.concat( - (accu_x, accu_y, accu_z), axis=2) # [batch_size, joint_num, 3] - - return coord_out - - -@register -class TinyPose3DHRHeatmapNet(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__( - self, - width, # 40, the number of output channels of the backbone - num_joints, - backbone='HRNet', - loss='KeyPointRegressionMSELoss', - post_process=TinyPose3DPostProcess): - """ - Args: - backbone (nn.Layer): backbone instance - post_process (object): post process instance
- """ - super(TinyPose3DHRHeatmapNet, self).__init__() - - self.backbone = backbone - self.post_process = TinyPose3DPostProcess() - self.loss = loss - self.deploy = False - self.num_joints = num_joints - - self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - return {'backbone': backbone, } - - def _forward(self): - feats = self.backbone(self.inputs) # feats:[[batch_size, 40, 32, 24]] - - hrnet_outputs = self.final_conv(feats[0]) - res = soft_argmax(hrnet_outputs, self.num_joints) - return res - - def get_loss(self): - pose3d = self._forward() - loss = self.loss(pose3d, None, self.inputs) - outputs = {'loss': loss} - return outputs - - def get_pred(self): - res_lst = self._forward() - outputs = {'pose3d': res_lst} - return outputs - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4,\ - 'output_flipped should be [batch_size, num_joints, height, width]' - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped - - -@register -class TinyPose3DHRNet(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, - width, - num_joints, - fc_channel=768, - backbone='HRNet', - loss='KeyPointRegressionMSELoss', - post_process=TinyPose3DPostProcess): - """ - Args: - backbone (nn.Layer): backbone instance - post_process (object): post process instance - """ - super(TinyPose3DHRNet, self).__init__() - self.backbone = backbone - self.post_process = TinyPose3DPostProcess() - self.loss = loss - self.deploy = False - self.num_joints = num_joints - - self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) - - self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3) - self.fc1 = paddle.nn.Linear(fc_channel, 256) - self.act1 = paddle.nn.ReLU() - self.fc2 = paddle.nn.Linear(256, 64) - self.act2 = paddle.nn.ReLU() - self.fc3 = paddle.nn.Linear(64, 3) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - return {'backbone': backbone, } - - def _forward(self): - ''' - self.inputs is a dict - ''' - feats = self.backbone( - self.inputs) # feats:[[batch_size, 40, width/4, height/4]] - - hrnet_outputs = self.final_conv( - feats[0]) # hrnet_outputs: [batch_size, num_joints*32,32,32] - - flatten_res = self.flatten( - hrnet_outputs) # [batch_size,num_joints*32,32*32] - - res = self.fc1(flatten_res) - res = self.act1(res) - res = self.fc2(res) - res = self.act2(res) - res = self.fc3(res) - - if self.training: - return self.loss(res, self.inputs) - else: # export model need - return res - - def get_loss(self): - return self._forward() - - def get_pred(self): - res_lst = self._forward() - outputs = {'pose3d': res_lst} - return outputs - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4,\ - 'output_flipped should be [batch_size, num_joints, height, width]' - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_petr.py 
b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_petr.py deleted file mode 100644 index b587c1f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_petr.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register -from .meta_arch import BaseArch -from .. import layers as L - -__all__ = ['PETR'] - - -@register -class PETR(BaseArch): - __category__ = 'architecture' - __inject__ = ['backbone', 'neck', 'bbox_head'] - - def __init__(self, - backbone='ResNet', - neck='ChannelMapper', - bbox_head='PETRHead'): - """ - PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck between backbone and head - bbox_head (nn.Layer): model output and loss - """ - super(PETR, self).__init__() - self.backbone = backbone - if neck is not None: - self.with_neck = True - self.neck = neck - self.bbox_head = bbox_head - self.deploy = False - - def extract_feat(self, img): - """Directly extract features from the backbone+neck.""" - x = self.backbone(img) - if self.with_neck: - x = self.neck(x) - return x - - def get_inputs(self): - img_metas = [] - gt_bboxes = [] - gt_labels = [] - gt_keypoints = [] - gt_areas = [] - pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1) - for idx, im_shape in enumerate(self.inputs['im_shape']): - img_meta = { - 'img_shape': im_shape.astype("int32").tolist() + [1, ], - 'batch_input_shape': self.inputs['image'].shape[-2:], - 'image_name': self.inputs['image_file'][idx] - } - img_metas.append(img_meta) - if (not pad_gt_mask[idx].any()): - gt_keypoints.append(self.inputs['gt_joints'][idx][:1]) - gt_labels.append(self.inputs['gt_class'][idx][:1]) - gt_bboxes.append(self.inputs['gt_bbox'][idx][:1]) - gt_areas.append(self.inputs['gt_areas'][idx][:1]) - continue - - gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]]) - gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]]) - gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]]) - gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]]) - - return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas - - def get_loss(self): - """ - Args: - img (Tensor): Input images of shape (N, C, H, W). - Typically these should be mean centered and std scaled. - img_metas (list[dict]): A list of image info dicts where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
- For details on the values of these keys see - :class:`mmdet.datasets.pipelines.Collect`. - gt_bboxes (list[Tensor]): Each item holds the ground-truth boxes for each - image in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): Class indices corresponding to each box. - gt_keypoints (list[Tensor]): Each item holds the ground-truth keypoints for - each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, - p^{K}_y, p^{K}_v] format. - gt_areas (list[Tensor]): mask areas corresponding to each box. - gt_bboxes_ignore (None | list[Tensor]): Specify which bounding - boxes can be ignored when computing the loss. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - - img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs( - ) - gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None) - - x = self.extract_feat(self.inputs) - losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, - gt_labels, gt_keypoints, gt_areas, - gt_bboxes_ignore) - loss = 0 - for k, v in losses.items(): - loss += v - losses['loss'] = loss - - return losses - - def get_pred_numpy(self): - """Used for computing network flops. - """ - - img = self.inputs['image'] - batch_size, _, height, width = img.shape - dummy_img_metas = [ - dict( - batch_input_shape=(height, width), - img_shape=(height, width, 3), - scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size) - ] - x = self.extract_feat(img) - outs = self.bbox_head(x, img_metas=dummy_img_metas) - bbox_list = self.bbox_head.get_bboxes( - *outs, dummy_img_metas, rescale=True) - return bbox_list - - def get_pred(self): - """ - """ - img = self.inputs['image'] - batch_size, _, height, width = img.shape - img_metas = [ - dict( - batch_input_shape=(height, width), - img_shape=(height, width, 3), - scale_factor=self.inputs['scale_factor'][i]) - for i in range(batch_size) - ] - kptpred = self.simple_test( - self.inputs, img_metas=img_metas, rescale=True) - keypoints = kptpred[0][1][0] - bboxs = kptpred[0][0][0] - keypoints[..., 2] = bboxs[:, None, 4] - res_lst = [[keypoints, bboxs[:, 4]]] - outputs = {'keypoint': res_lst} - return outputs - - def simple_test(self, inputs, img_metas, rescale=False): - """Test function without test time augmentation. - - Args: - inputs (list[paddle.Tensor]): List of multiple images. - img_metas (list[dict]): List of image information. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[list[np.ndarray]]: BBox and keypoint results of each image - and classes. The outer list corresponds to each image. - The inner list corresponds to each class. - """ - batch_size = len(img_metas) - assert batch_size == 1, 'Currently only batch_size 1 for inference ' \ - f'mode is supported. Found batch_size {batch_size}.' - feat = self.extract_feat(inputs) - results_list = self.bbox_head.simple_test( - feat, img_metas, rescale=rescale) - - bbox_kpt_results = [ - self.bbox_kpt2result(det_bboxes, det_labels, det_kpts, - self.bbox_head.num_classes) - for det_bboxes, det_labels, det_kpts in results_list - ] - return bbox_kpt_results - - def bbox_kpt2result(self, bboxes, labels, kpts, num_classes): - """Convert detection results to a list of numpy arrays. - - Args: - bboxes (paddle.Tensor | np.ndarray): shape (n, 5). - labels (paddle.Tensor | np.ndarray): shape (n, ). - kpts (paddle.Tensor | np.ndarray): shape (n, K, 3). - num_classes (int): class number, including background class. - - Returns: - list(ndarray): bbox and keypoint results of each class.
- """ - if bboxes.shape[0] == 0: - return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \ - [np.zeros((0, kpts.size(1), 3), dtype=np.float32) - for i in range(num_classes)] - else: - if isinstance(bboxes, paddle.Tensor): - bboxes = bboxes.numpy() - labels = labels.numpy() - kpts = kpts.numpy() - return [bboxes[labels == i, :] for i in range(num_classes)], \ - [kpts[labels == i, :, :] for i in range(num_classes)] diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_vitpose.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_vitpose.py deleted file mode 100644 index b00226a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_vitpose.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import numpy as np -import math -import cv2 -from ppdet.core.workspace import register, create, serializable -from .meta_arch import BaseArch -from ..keypoint_utils import transform_preds -from .. import layers as L - -__all__ = ['VitPose_TopDown', 'VitPosePostProcess'] - - -@register -class VitPose_TopDown(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, backbone, head, loss, post_process, flip_test): - """ - VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf - - Args: - backbone (nn.Layer): backbone instance - post_process (object): `HRNetPostProcess` instance - - """ - super(VitPose_TopDown, self).__init__() - self.backbone = backbone - self.head = head - self.loss = loss - self.post_process = post_process - self.flip_test = flip_test - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - #head - head = create(cfg['head']) - #post_process - post_process = create(cfg['post_process']) - - return { - 'backbone': backbone, - 'head': head, - 'post_process': post_process - } - - def _forward_train(self): - - feats = self.backbone.forward_features(self.inputs['image']) - vitpost_output = self.head(feats) - return self.loss(vitpost_output, self.inputs) - - def _forward_test(self): - - feats = self.backbone.forward_features(self.inputs['image']) - output_heatmap = self.head(feats) - - if self.flip_test: - img_flipped = self.inputs['image'].flip(3) - features_flipped = self.backbone.forward_features(img_flipped) - output_flipped_heatmap = self.head.inference_model(features_flipped, - self.flip_test) - - output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 - - imshape = (self.inputs['im_shape'].numpy() - )[:, ::-1] if 'im_shape' in self.inputs else None - center = self.inputs['center'].numpy( - ) if 'center' in self.inputs else np.round(imshape / 2.) - scale = self.inputs['scale'].numpy( - ) if 'scale' in self.inputs else imshape / 200. 
- - result = self.post_process(output_heatmap.cpu().numpy(), center, scale) - - return result - - def get_loss(self): - return self._forward_train() - - def get_pred(self): - res_lst = self._forward_test() - outputs = {'keypoint': res_lst} - return outputs - - -@register -@serializable -class VitPosePostProcess(object): - def __init__(self, use_dark=False): - self.use_dark = use_dark - - def get_max_preds(self, heatmaps): - '''get predictions from score maps - - Args: - heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - ''' - assert isinstance(heatmaps, - np.ndarray), 'heatmaps should be numpy.ndarray' - assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' - - batch_size = heatmaps.shape[0] - num_joints = heatmaps.shape[1] - width = heatmaps.shape[3] - heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) - idx = np.argmax(heatmaps_reshaped, 2) - maxvals = np.amax(heatmaps_reshaped, 2) - - maxvals = maxvals.reshape((batch_size, num_joints, 1)) - idx = idx.reshape((batch_size, num_joints, 1)) - - preds = np.tile(idx, (1, 1, 2)).astype(np.float32) - - preds[:, :, 0] = (preds[:, :, 0]) % width - preds[:, :, 1] = np.floor((preds[:, :, 1]) // width) - - pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) - pred_mask = pred_mask.astype(np.float32) - - preds *= pred_mask - - return preds, maxvals - - def post_datk_udp(self, coords, batch_heatmaps, kernel=3): - """DARK post-processing, implemented by UDP. Paper ref: Huang et al. The - Devil is in the Details: Delving into Unbiased Data Processing for Human - Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate - Representation for Human Pose Estimation (CVPR 2020). - - Note: - - batch size: B - - num keypoints: K - - num persons: N - - height of heatmaps: H - - width of heatmaps: W - - B=1 for bottom_up paradigm where all persons share the same heatmap. - B=N for top_down paradigm where each person has its own heatmaps. - - Args: - coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. - batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps - kernel (int): Gaussian kernel size (K) for modulation. - - Returns: - np.ndarray([N, K, 2]): Refined coordinates.
- """ - if not isinstance(batch_heatmaps, np.ndarray): - batch_heatmaps = batch_heatmaps.cpu().numpy() - B, K, H, W = batch_heatmaps.shape - N = coords.shape[0] - assert (B == 1 or B == N) - for heatmaps in batch_heatmaps: - for heatmap in heatmaps: - cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) - np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) - np.log(batch_heatmaps, batch_heatmaps) - - batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), - (1, 1)), - mode='edge').flatten() - - index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) - index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) - index = index.astype(int).reshape(-1, 1) - i_ = batch_heatmaps_pad[index] - ix1 = batch_heatmaps_pad[index + 1] - iy1 = batch_heatmaps_pad[index + W + 2] - ix1y1 = batch_heatmaps_pad[index + W + 3] - ix1_y1_ = batch_heatmaps_pad[index - W - 3] - ix1_ = batch_heatmaps_pad[index - 1] - iy1_ = batch_heatmaps_pad[index - 2 - W] - - dx = 0.5 * (ix1 - ix1_) - dy = 0.5 * (iy1 - iy1_) - derivative = np.concatenate([dx, dy], axis=1) - derivative = derivative.reshape(N, K, 2, 1) - dxx = ix1 - 2 * i_ + ix1_ - dyy = iy1 - 2 * i_ + iy1_ - dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) - hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) - hessian = hessian.reshape(N, K, 2, 2) - hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) - coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() - return coords - - def transform_preds_udp(self, - coords, - center, - scale, - output_size, - use_udp=True): - """Get final keypoint predictions from heatmaps and apply scaling and - translation to map them back to the image. - - Note: - num_keypoints: K - - Args: - coords (np.ndarray[K, ndims]): - - * If ndims=2, corrds are predicted keypoint location. - * If ndims=4, corrds are composed of (x, y, scores, tags) - * If ndims=5, corrds are composed of (x, y, scores, tags, - flipped_tags) - - center (np.ndarray[2, ]): Center of the bounding box (x, y). - scale (np.ndarray[2, ]): Scale of the bounding box - wrt [width, height]. - output_size (np.ndarray[2, ] | list(2,)): Size of the - destination heatmaps. - use_udp (bool): Use unbiased data processing - - Returns: - np.ndarray: Predicted coordinates in the images. - """ - - assert coords.shape[1] in (2, 4, 5) - assert len(center) == 2 - assert len(scale) == 2 - assert len(output_size) == 2 - - # Recover the scale which is normalized by a factor of 200. - scale = scale * 200.0 - - if use_udp: - scale_x = scale[0] / (output_size[0] - 1.0) - scale_y = scale[1] / (output_size[1] - 1.0) - else: - scale_x = scale[0] / output_size[0] - scale_y = scale[1] / output_size[1] - - target_coords = np.ones_like(coords) - target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[ - 0] * 0.5 - target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[ - 1] * 0.5 - - return target_coords - - def get_final_preds(self, heatmaps, center, scale, kernelsize=11): - """the highest heatvalue location with a quarter offset in the - direction from the highest response to the second highest response. 
- - Args: - heatmaps (numpy.ndarray): The predicted heatmaps - center (numpy.ndarray): The boxes center - scale (numpy.ndarray): The scale factor - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - """ - coords, maxvals = self.get_max_preds(heatmaps) - - N, K, H, W = heatmaps.shape - - if self.use_dark: - coords = self.post_datk_udp(coords, heatmaps, kernelsize) - preds = coords.copy() - # Transform back to the image - for i in range(N): - preds[i] = self.transform_preds_udp(preds[i], center[i], - scale[i], [W, H]) - else: - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - hm = heatmaps[n][p] - px = int(math.floor(coords[n][p][0] + 0.5)) - py = int(math.floor(coords[n][p][1] + 0.5)) - if 1 < px < W - 1 and 1 < py < H - 1: - diff = np.array([ - hm[py][px + 1] - hm[py][px - 1], - hm[py + 1][px] - hm[py - 1][px] - ]) - coords[n][p] += np.sign(diff) * .25 - preds = coords.copy() - - # Transform back - for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], - [W, H]) - - return preds, maxvals - - def __call__(self, output, center, scale): - preds, maxvals = self.get_final_preds(output, center, scale) - outputs = [[ - np.concatenate( - (preds, maxvals), axis=-1), np.mean( - maxvals, axis=1) - ]] - return outputs \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/mask_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/mask_rcnn.py deleted file mode 100644 index 4f6a9ce..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/mask_rcnn.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['MaskRCNN'] - - -@register -class MaskRCNN(BaseArch): - """ - Mask R-CNN network, see https://arxiv.org/abs/1703.06870 - - Args: - backbone (object): backbone instance - rpn_head (object): `RPNHead` instance - bbox_head (object): `BBoxHead` instance - mask_head (object): `MaskHead` instance - bbox_post_process (object): `BBoxPostProcess` instance - mask_post_process (object): `MaskPostProcess` instance - neck (object): 'FPN' instance - """ - - __category__ = 'architecture' - __inject__ = [ - 'bbox_post_process', - 'mask_post_process', - ] - - def __init__(self, - backbone, - rpn_head, - bbox_head, - mask_head, - bbox_post_process, - mask_post_process, - neck=None): - super(MaskRCNN, self).__init__() - self.backbone = backbone - self.neck = neck - self.rpn_head = rpn_head - self.bbox_head = bbox_head - self.mask_head = mask_head - - self.bbox_post_process = bbox_post_process - self.mask_post_process = mask_post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - bbox_head = create(cfg['bbox_head'], **kwargs) - - out_shape = neck and out_shape or bbox_head.get_head().out_shape - kwargs = {'input_shape': out_shape} - mask_head = create(cfg['mask_head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "rpn_head": rpn_head, - "bbox_head": bbox_head, - "mask_head": mask_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - - if self.training: - rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) - bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, - self.inputs) - rois, rois_num = self.bbox_head.get_assigned_rois() - bbox_targets = self.bbox_head.get_assigned_targets() - # Mask Head needs bbox_feat in Mask RCNN - mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, - bbox_targets, bbox_feat) - return rpn_loss, bbox_loss, mask_loss - else: - rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) - preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None) - - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - - bbox, bbox_num, nms_keep_idx = self.bbox_post_process( - preds, (rois, rois_num), im_shape, scale_factor) - mask_out = self.mask_head( - body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func) - - # rescale the prediction back to origin image - bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( - bbox, bbox_num, im_shape, scale_factor) - origin_shape = self.bbox_post_process.get_origin_shape() - mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, - origin_shape) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = preds[1] # predict scores (probability) - # Todo: get logits output - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return 
bbox_pred, bbox_num, mask_pred, extra_data - else: - return bbox_pred, bbox_num, mask_pred - - def get_loss(self, ): - bbox_loss, mask_loss, rpn_loss = self._forward() - loss = {} - loss.update(rpn_loss) - loss.update(bbox_loss) - loss.update(mask_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, mask_pred, extra_data = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data} - else: - bbox_pred, bbox_num, mask_pred = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/meta_arch.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/meta_arch.py deleted file mode 100644 index 370b2b1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/meta_arch.py +++ /dev/null @@ -1,132 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn as nn -import typing - -from ppdet.core.workspace import register -from ppdet.modeling.post_process import nms - -__all__ = ['BaseArch'] - - -@register -class BaseArch(nn.Layer): - def __init__(self, data_format='NCHW', use_extra_data=False): - super(BaseArch, self).__init__() - self.data_format = data_format - self.inputs = {} - self.fuse_norm = False - self.use_extra_data = use_extra_data - - def load_meanstd(self, cfg_transform): - scale = 1. - mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) - std = np.array([0.229, 0.224, 0.225], dtype=np.float32) - for item in cfg_transform: - if 'NormalizeImage' in item: - mean = np.array( - item['NormalizeImage']['mean'], dtype=np.float32) - std = np.array(item['NormalizeImage']['std'], dtype=np.float32) - if item['NormalizeImage'].get('is_scale', True): - scale = 1. / 255. 
- break - if self.data_format == 'NHWC': - self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3)) - self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3)) - else: - self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1)) - self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1)) - - def forward(self, inputs): - if self.data_format == 'NHWC': - image = inputs['image'] - inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) - - if self.fuse_norm: - image = inputs['image'] - self.inputs['image'] = image * self.scale + self.bias - self.inputs['im_shape'] = inputs['im_shape'] - self.inputs['scale_factor'] = inputs['scale_factor'] - else: - self.inputs = inputs - - self.model_arch() - - if self.training: - out = self.get_loss() - else: - inputs_list = [] - # multi-scale input - if not isinstance(inputs, typing.Sequence): - inputs_list.append(inputs) - else: - inputs_list.extend(inputs) - outs = [] - for inp in inputs_list: - if self.fuse_norm: - self.inputs['image'] = inp['image'] * self.scale + self.bias - self.inputs['im_shape'] = inp['im_shape'] - self.inputs['scale_factor'] = inp['scale_factor'] - else: - self.inputs = inp - outs.append(self.get_pred()) - - # multi-scale test - if len(outs) > 1: - out = self.merge_multi_scale_predictions(outs) - else: - out = outs[0] - return out - - def merge_multi_scale_predictions(self, outs): - # default values for architectures not included in following list - num_classes = 80 - nms_threshold = 0.5 - keep_top_k = 100 - - if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'): - num_classes = self.bbox_head.num_classes - keep_top_k = self.bbox_post_process.nms.keep_top_k - nms_threshold = self.bbox_post_process.nms.nms_threshold - else: - raise Exception( - "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now" - ) - - final_boxes = [] - all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy() - for c in range(num_classes): - idxs = all_scale_outs[:, 0] == c - if np.count_nonzero(idxs) == 0: - continue - r = nms(all_scale_outs[idxs, 1:], nms_threshold) - final_boxes.append( - np.concatenate([np.full((r.shape[0], 1), c), r], 1)) - out = np.concatenate(final_boxes) - out = np.concatenate(sorted( - out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6)) - out = { - 'bbox': paddle.to_tensor(out), - 'bbox_num': paddle.to_tensor(np.array([out.shape[0], ])) - } - - return out - - def build_inputs(self, data, input_def): - inputs = {} - for i, k in enumerate(input_def): - inputs[k] = data[i] - return inputs - - def model_arch(self, ): - pass - - def get_loss(self, ): - raise NotImplementedError("Should implement get_loss method!") - - def get_pred(self, ): - raise NotImplementedError("Should implement get_pred method!") diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/multi_stream_detector.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/multi_stream_detector.py deleted file mode 100644 index 58c4fe0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/multi_stream_detector.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Dict -from collections import OrderedDict -from ppdet.modeling.architectures.meta_arch import BaseArch - - -class MultiSteamDetector(BaseArch): - def __init__(self, - model: Dict[str, BaseArch], - train_cfg=None, - test_cfg=None): - super(MultiSteamDetector, self).__init__() - self.submodules = list(model.keys()) - for k, v in model.items(): - setattr(self, k, v) - - self.train_cfg = train_cfg - self.test_cfg = test_cfg - 
self.inference_on = self.test_cfg.get("inference_on", - self.submodules[0]) - self.first_load = True - - def forward(self, inputs, return_loss=True, **kwargs): - """Calls either :func:`forward_train` or :func:`forward_test` depending - on whether ``return_loss`` is ``True``. - - Note this setting will change the expected inputs. When - ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor - and List[dict]), and when ``return_loss=False``, img and img_meta - should be double nested (i.e. List[Tensor], List[List[dict]]), with - the outer list indicating test time augmentations. - """ - if return_loss: - return self.forward_train(inputs, **kwargs) - else: - return self.forward_test(inputs, **kwargs) - - def get_loss(self, **kwargs): - # losses = self(**data) - - return self.forward_train(self, **kwargs) - - def model(self, **kwargs) -> BaseArch: - if "submodule" in kwargs: - assert (kwargs["submodule"] in self.submodules - ), "Detector does not contain submodule {}".format(kwargs[ - "submodule"]) - model: BaseArch = getattr(self, kwargs["submodule"]) - else: - model: BaseArch = getattr(self, self.inference_on) - return model - - def freeze(self, model_ref: str): - assert model_ref in self.submodules - model = getattr(self, model_ref) - model.eval() - for param in model.parameters(): - param.stop_gradient = True - - def update_ema_model(self, momentum=0.9996): - # print(momentum) - model_dict = self.student.state_dict() - new_dict = OrderedDict() - for key, value in self.teacher.state_dict().items(): - if key in model_dict.keys(): - new_dict[key] = (model_dict[key] * - (1 - momentum) + value * momentum) - else: - raise Exception("{} is not found in student model".format(key)) - self.teacher.set_dict(new_dict) diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/picodet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/picodet.py deleted file mode 100644 index b6f4447..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/picodet.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['PicoDet'] - - -@register -class PicoDet(BaseArch): - """ - Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - head (object): 'PicoHead' instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False): - super(PicoDet, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.export_post_process = True - self.export_nms = True - self.nms_cpu = nms_cpu - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats, self.export_post_process) - if self.training or not self.export_post_process: - return head_outs, None - else: - scale_factor = self.inputs['scale_factor'] - bboxes, bbox_num = self.head.post_process( - head_outs, - scale_factor, - export_nms=self.export_nms, - nms_cpu=self.nms_cpu) - return bboxes, bbox_num - - def get_loss(self, ): - loss = {} - - head_outs, _ = self._forward() - loss_gfl = self.head.get_loss(head_outs, self.inputs) - loss.update(loss_gfl) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - if not self.export_post_process: - return {'picodet': self._forward()[0]} - elif self.export_nms: - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - else: - bboxes, mlvl_scores = self._forward() - output = {'bbox': bboxes, 'scores': mlvl_scores} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/pose3d_metro.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/pose3d_metro.py deleted file mode 100644 index 4275154..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/pose3d_metro.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from .. 
import layers as L - -__all__ = ['METRO_Body'] - - -def orthographic_projection(X, camera): - """Perform orthographic projection of 3D points X using the camera parameters - Args: - X: size = [B, N, 3] - camera: size = [B, 3] - Returns: - Projected 2D points -- size = [B, N, 2] - """ - camera = camera.reshape((-1, 1, 3)) - X_trans = X[:, :, :2] + camera[:, :, 1:] - shape = paddle.shape(X_trans) - X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape) - return X_2d - - -@register -class METRO_Body(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__( - self, - num_joints, - backbone='HRNet', - trans_encoder='', - loss='Pose3DLoss', ): - """ - Modified from METRO network, see https://arxiv.org/abs/2012.09760 - - Args: - backbone (nn.Layer): backbone instance - """ - super(METRO_Body, self).__init__() - self.num_joints = num_joints - self.backbone = backbone - self.loss = loss - self.deploy = False - - self.trans_encoder = trans_encoder - self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1) - self.cam_param_fc = paddle.nn.Linear(3, 2) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - trans_encoder = create(cfg['trans_encoder']) - - return {'backbone': backbone, 'trans_encoder': trans_encoder} - - def _forward(self): - batch_size = self.inputs['image'].shape[0] - - image_feat = self.backbone(self.inputs) - image_feat_flatten = image_feat.reshape((batch_size, 2048, 49)) - image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1)) - # and apply a conv layer to learn image token for each 3d joint/vertex position - features = self.conv_learn_tokens(image_feat_flatten) # (B, J, C) - - if self.training: - # apply mask vertex/joint modeling - # meta_masks is a tensor of all the masks, randomly generated in dataloader - # we pre-define a [MASK] token, which is a floating-value vector with 0.01s - meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048)) - constant_tensor = paddle.ones_like(features) * 0.01 - features = features * meta_masks + constant_tensor * (1 - meta_masks - ) - pred_out = self.trans_encoder(features) - - pred_3d_joints = pred_out[:, :self.num_joints, :] - cam_features = pred_out[:, self.num_joints:, :] - - # learn camera parameters - pred_2d_joints = self.cam_param_fc(cam_features) - return pred_3d_joints, pred_2d_joints - - def get_loss(self): - preds_3d, preds_2d = self._forward() - loss = self.loss(preds_3d, preds_2d, self.inputs) - output = {'loss': loss} - return output - - def get_pred(self): - preds_3d, preds_2d = self._forward() - outputs = {'pose3d': preds_3d, 'pose2d': preds_2d} - return outputs diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/ppyoloe.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/ppyoloe.py deleted file mode 100644 index 330542b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/ppyoloe.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead'] -# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head -# PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head - - -@register -class PPYOLOE(BaseArch): - """ - PPYOLOE network, see https://arxiv.org/abs/2203.16250 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - yolo_head (nn.Layer): anchor_head instance - post_process (object): `BBoxPostProcess` instance - ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod) - for_distill (bool): whether for distillation - feat_distill_place (str): distill which feature for distillation - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - - __category__ = 'architecture' - __shared__ = ['for_distill'] - __inject__ = ['post_process', 'ssod_loss'] - - def __init__(self, - backbone='CSPResNet', - neck='CustomCSPPAN', - yolo_head='PPYOLOEHead', - post_process='BBoxPostProcess', - ssod_loss='SSODPPYOLOELoss', - for_distill=False, - feat_distill_place='neck_feats', - for_mot=False): - super(PPYOLOE, self).__init__() - self.backbone = backbone - self.neck = neck - self.yolo_head = yolo_head - self.post_process = post_process - self.for_mot = for_mot - - # for ssod, semi-det - self.is_teacher = False - self.ssod_loss = ssod_loss - - # distill - self.for_distill = for_distill - self.feat_distill_place = feat_distill_place - if for_distill: - assert feat_distill_place in ['backbone_feats', 'neck_feats'] - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - yolo_head = create(cfg['yolo_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "yolo_head": yolo_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - self.is_teacher = self.inputs.get('is_teacher', False) # for semi-det - if self.training or self.is_teacher: - yolo_losses = self.yolo_head(neck_feats, self.inputs) - - if self.for_distill: - if self.feat_distill_place == 'backbone_feats': - self.yolo_head.distill_pairs['backbone_feats'] = body_feats - elif self.feat_distill_place == 'neck_feats': - self.yolo_head.distill_pairs['neck_feats'] = neck_feats - else: - raise ValueError - return yolo_losses - else: - - yolo_head_outs = self.yolo_head(neck_feats) - - if self.post_process is not None: - bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - - else: - bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( - yolo_head_outs, self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = 
yolo_head_outs[0] # predict scores (probability) - extra_data['nms_keep_idx'] = nms_keep_idx - output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} - else: - output = {'bbox': bbox, 'bbox_num': bbox_num} - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - - def get_loss_keys(self): - return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast'] - - def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): - ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, - train_cfg) - return ssod_losses - - -@register -class PPYOLOEWithAuxHead(BaseArch): - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, - backbone='CSPResNet', - neck='CustomCSPPAN', - yolo_head='PPYOLOEHead', - aux_head='SimpleConvHead', - post_process='BBoxPostProcess', - for_mot=False, - detach_epoch=5): - """ - PPYOLOE network, see https://arxiv.org/abs/2203.16250 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - yolo_head (nn.Layer): anchor_head instance - post_process (object): `BBoxPostProcess` instance - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - super(PPYOLOEWithAuxHead, self).__init__() - self.backbone = backbone - self.neck = neck - self.aux_neck = copy.deepcopy(self.neck) - - self.yolo_head = yolo_head - self.aux_head = aux_head - self.post_process = post_process - self.for_mot = for_mot - self.detach_epoch = detach_epoch - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - aux_neck = copy.deepcopy(neck) - - # head - kwargs = {'input_shape': neck.out_shape} - yolo_head = create(cfg['yolo_head'], **kwargs) - aux_head = create(cfg['aux_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "yolo_head": yolo_head, - 'aux_head': aux_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - if self.training: - if self.inputs['epoch_id'] >= self.detach_epoch: - aux_neck_feats = self.aux_neck([f.detach() for f in body_feats]) - dual_neck_feats = (paddle.concat( - [f.detach(), aux_f], axis=1) for f, aux_f in - zip(neck_feats, aux_neck_feats)) - else: - aux_neck_feats = self.aux_neck(body_feats) - dual_neck_feats = (paddle.concat( - [f, aux_f], axis=1) for f, aux_f in - zip(neck_feats, aux_neck_feats)) - aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats) - loss = self.yolo_head( - neck_feats, - self.inputs, - aux_pred=[aux_cls_scores, aux_bbox_preds]) - return loss - else: - yolo_head_outs = self.yolo_head(neck_feats) - - if self.post_process is not None: - bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - else: - bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( - yolo_head_outs, self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) - # Todo: get logits output - extra_data['nms_keep_idx'] = nms_keep_idx - output = {'bbox': 
bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} - else: - output = {'bbox': bbox, 'bbox_num': bbox_num} - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/queryinst.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/queryinst.py deleted file mode 100644 index 76a65ed..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/queryinst.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['QueryInst'] - - -@register -class QueryInst(BaseArch): - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, - backbone, - neck, - rpn_head, - roi_head, - post_process='SparsePostProcess'): - super(QueryInst, self).__init__() - self.backbone = backbone - self.neck = neck - self.rpn_head = rpn_head - self.roi_head = roi_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - roi_head = create(cfg['roi_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'rpn_head': rpn_head, - "roi_head": roi_head - } - - def _forward(self, targets=None): - features = self.backbone(self.inputs) - features = self.neck(features) - - proposal_bboxes, proposal_features = self.rpn_head(self.inputs[ - 'img_whwh']) - outputs = self.roi_head(features, proposal_bboxes, proposal_features, - targets) - - if self.training: - return outputs - else: - bbox_pred, bbox_num, mask_pred = self.post_process( - outputs['class_logits'], outputs['bbox_pred'], - self.inputs['scale_factor_whwh'], self.inputs['ori_shape'], - outputs['mask_logits']) - return bbox_pred, bbox_num, mask_pred - - def get_loss(self): - targets = [] - for i in range(len(self.inputs['img_whwh'])): - boxes = self.inputs['gt_bbox'][i] - labels = self.inputs['gt_class'][i].squeeze(-1) - img_whwh = self.inputs['img_whwh'][i] - if boxes.shape[0] != 0: - img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1]) - else: - img_whwh_tgt = paddle.zeros_like(boxes) - gt_segm = self.inputs['gt_segm'][i].astype('float32') - targets.append({ - 'boxes': boxes, - 'labels': labels, - 'img_whwh': img_whwh, - 'img_whwh_tgt': img_whwh_tgt, - 'gt_segm': gt_segm - }) - losses = self._forward(targets) - losses.update({'loss': sum(losses.values())}) - return losses - - def get_pred(self): - bbox_pred, bbox_num, mask_pred = self._forward() - return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} diff --git 
a/pdfdet/models/Paddle/ppdet/modeling/architectures/retinanet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/retinanet.py deleted file mode 100644 index fc49f0e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/retinanet.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import paddle -import paddle.nn.functional as F - -__all__ = ['RetinaNet'] - - -@register -class RetinaNet(BaseArch): - __category__ = 'architecture' - - def __init__(self, backbone, neck, head): - super(RetinaNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'head': head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats) - - if self.training: - return self.head(neck_feats, self.inputs) - else: - head_outs = self.head(neck_feats) - bbox, bbox_num, nms_keep_idx = self.head.post_process( - head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - preds_logits = self.head.decode_cls_logits(head_outs[0]) - preds_scores = F.sigmoid(preds_logits) - extra_data['logits'] = preds_logits - extra_data['scores'] = preds_scores - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data} - else: - return {'bbox': bbox, 'bbox_num': bbox_num} - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/s2anet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/s2anet.py deleted file mode 100644 index 8fb71e2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/s2anet.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
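
RetinaNet's use_extra_data branch above stores raw logits, sigmoid scores, and nms_keep_idx alongside the usual batch-flattened detections. A minimal consumer-side sketch of that output contract (not part of the patch; the 6-column [label, score, x1, y1, x2, y2] row layout of 'bbox' is an assumption about ppdet's post-processed output):

def split_by_image(output):
    # 'bbox' holds all detections for the whole batch; 'bbox_num' gives the
    # per-image row counts, so per-image results are recovered by slicing.
    boxes, counts = output['bbox'], output['bbox_num']
    per_image, start = [], 0
    for n in counts.numpy().tolist():  # paddle Tensor -> python ints
        per_image.append(boxes[start:start + n])
        start += n
    return per_image
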
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['S2ANet'] - - -@register -class S2ANet(BaseArch): - __category__ = 'architecture' - __inject__ = ['head'] - - def __init__(self, backbone, neck, head): - """ - S2ANet, see https://arxiv.org/pdf/2008.09397.pdf - - Args: - backbone (object): backbone instance - neck (object): `FPN` instance - head (object): `Head` instance - """ - super(S2ANet, self).__init__() - self.backbone = backbone - self.neck = neck - self.s2anet_head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - head = create(cfg['head'], **kwargs) - - return {'backbone': backbone, 'neck': neck, "head": head} - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - if self.training: - loss = self.s2anet_head(body_feats, self.inputs) - return loss - else: - head_outs = self.s2anet_head(body_feats) - # post_process - bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs) - # rescale the prediction back to origin image - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape, - scale_factor) - # output - output = {'bbox': bboxes, 'bbox_num': bbox_num} - return output - - def get_loss(self, ): - loss = self._forward() - return loss - - def get_pred(self): - output = self._forward() - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/solov2.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/solov2.py deleted file mode 100644 index 4e5fc21..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/solov2.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
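
S2ANet.from_config above is the only from_config in these hunks where the neck is optional, and the `and`/`or` chaining is easy to misread. A distilled sketch of the idiom (not part of the patch; `create` stands in for ppdet.core.workspace.create):

def build_with_optional_neck(cfg, create):
    backbone = create(cfg['backbone'])
    # A falsy cfg['neck'] (e.g. None) short-circuits to no neck at all.
    neck = cfg['neck'] and create(cfg['neck'], input_shape=backbone.out_shape)
    # Head input falls back to the backbone's feature shapes when neck is None.
    out_shape = neck.out_shape if neck else backbone.out_shape
    head = create(cfg['head'], input_shape=out_shape)
    return {'backbone': backbone, 'neck': neck, 'head': head}

The other architectures deleted here (RetinaNet, SOLOv2, TOOD, TTFNet, ...) use the unconditional form of the same chain: each component is created with the out_shape of the one before it.
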
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['SOLOv2'] - - -@register -class SOLOv2(BaseArch): - """ - SOLOv2 network, see https://arxiv.org/abs/2003.10152 - - Args: - backbone (object): an backbone instance - solov2_head (object): an `SOLOv2Head` instance - mask_head (object): an `SOLOv2MaskHead` instance - neck (object): neck of network, such as feature pyramid network instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, solov2_head, mask_head, neck=None): - super(SOLOv2, self).__init__() - self.backbone = backbone - self.neck = neck - self.solov2_head = solov2_head - self.mask_head = mask_head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - solov2_head = create(cfg['solov2_head'], **kwargs) - mask_head = create(cfg['mask_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'solov2_head': solov2_head, - 'mask_head': mask_head, - } - - def model_arch(self): - body_feats = self.backbone(self.inputs) - - body_feats = self.neck(body_feats) - - self.seg_pred = self.mask_head(body_feats) - - self.cate_pred_list, self.kernel_pred_list = self.solov2_head( - body_feats) - - def get_loss(self, ): - loss = {} - # get gt_ins_labels, gt_cate_labels, etc. - gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], [] - fg_num = self.inputs['fg_num'] - for i in range(len(self.solov2_head.seg_num_grids)): - ins_label = 'ins_label{}'.format(i) - if ins_label in self.inputs: - gt_ins_labels.append(self.inputs[ins_label]) - cate_label = 'cate_label{}'.format(i) - if cate_label in self.inputs: - gt_cate_labels.append(self.inputs[cate_label]) - grid_order = 'grid_order{}'.format(i) - if grid_order in self.inputs: - gt_grid_orders.append(self.inputs[grid_order]) - - loss_solov2 = self.solov2_head.get_loss( - self.cate_pred_list, self.kernel_pred_list, self.seg_pred, - gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num) - loss.update(loss_solov2) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - seg_masks, cate_labels, cate_scores, bbox_num = self.solov2_head.get_prediction( - self.cate_pred_list, self.kernel_pred_list, self.seg_pred, - self.inputs['im_shape'], self.inputs['scale_factor']) - outs = { - "segm": seg_masks, - "bbox_num": bbox_num, - 'cate_label': cate_labels, - 'cate_score': cate_scores - } - return outs diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/sparse_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/sparse_rcnn.py deleted file mode 100644 index 2cbc853..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/sparse_rcnn.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ["SparseRCNN"] - - -@register -class SparseRCNN(BaseArch): - __category__ = 'architecture' - __inject__ = ["postprocess"] - - def __init__(self, - backbone, - neck, - head="SparsercnnHead", - postprocess="SparsePostProcess"): - super(SparseRCNN, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.postprocess = postprocess - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'roi_input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats, self.inputs["img_whwh"]) - - if not self.training: - bbox_pred, bbox_num = self.postprocess( - head_outs["pred_logits"], head_outs["pred_boxes"], - self.inputs["scale_factor_whwh"], self.inputs["ori_shape"]) - return bbox_pred, bbox_num - else: - return head_outs - - def get_loss(self): - batch_gt_class = self.inputs["gt_class"] - batch_gt_box = self.inputs["gt_bbox"] - batch_whwh = self.inputs["img_whwh"] - targets = [] - - for i in range(len(batch_gt_class)): - boxes = batch_gt_box[i] - labels = batch_gt_class[i].squeeze(-1) - img_whwh = batch_whwh[i] - img_whwh_tgt = img_whwh.unsqueeze(0).tile([int(boxes.shape[0]), 1]) - targets.append({ - "boxes": boxes, - "labels": labels, - "img_whwh": img_whwh, - "img_whwh_tgt": img_whwh_tgt - }) - - outputs = self._forward() - loss_dict = self.head.get_loss(outputs, targets) - acc = loss_dict["acc"] - loss_dict.pop("acc") - total_loss = sum(loss_dict.values()) - loss_dict.update({"loss": total_loss, "acc": acc}) - return loss_dict - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/ssd.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/ssd.py deleted file mode 100644 index b8669b7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/ssd.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
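
SparseRCNN.get_loss above builds one target dict per image, tiling that image's [w, h, w, h] vector once per ground-truth box. The loop in isolation (a sketch, not part of the patch); note that, unlike QueryInst.get_loss earlier in this file set, it does not special-case images with zero boxes:

def build_sparse_rcnn_targets(gt_class, gt_bbox, img_whwh):
    targets = []
    for labels, boxes, whwh in zip(gt_class, gt_bbox, img_whwh):
        targets.append({
            'boxes': boxes,                           # (num_gt, 4)
            'labels': labels.squeeze(-1),             # (num_gt,)
            'img_whwh': whwh,                         # (4,)
            'img_whwh_tgt': whwh.unsqueeze(0).tile(   # (num_gt, 4)
                [int(boxes.shape[0]), 1]),
        })
    return targets
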
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import paddle -import paddle.nn.functional as F - -__all__ = ['SSD'] - - -@register -class SSD(BaseArch): - """ - Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325 - - Args: - backbone (nn.Layer): backbone instance - ssd_head (nn.Layer): `SSDHead` instance - post_process (object): `BBoxPostProcess` instance - """ - - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, backbone, ssd_head, post_process, r34_backbone=False): - super(SSD, self).__init__() - self.backbone = backbone - self.ssd_head = ssd_head - self.post_process = post_process - self.r34_backbone = r34_backbone - if self.r34_backbone: - from ppdet.modeling.backbones.resnet import ResNet - assert isinstance(self.backbone, ResNet) and \ - self.backbone.depth == 34, \ - "If you set r34_backbone=True, please use ResNet-34 as backbone." - self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1] - self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1] - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # head - kwargs = {'input_shape': backbone.out_shape} - ssd_head = create(cfg['ssd_head'], **kwargs) - - return { - 'backbone': backbone, - "ssd_head": ssd_head, - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs) - - # SSD Head - if self.training: - return self.ssd_head(body_feats, self.inputs['image'], - self.inputs['gt_bbox'], - self.inputs['gt_class']) - else: - preds, anchors = self.ssd_head(body_feats, self.inputs['image']) - bbox, bbox_num, nms_keep_idx = self.post_process( - preds, anchors, self.inputs['im_shape'], - self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - preds_logits = preds[1] # [[1xNumBBoxNumClass]] - extra_data['scores'] = F.softmax(paddle.concat( - preds_logits, axis=1)).transpose([0, 2, 1]) - extra_data['logits'] = paddle.concat( - preds_logits, axis=1).transpose([0, 2, 1]) - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return bbox, bbox_num, extra_data - else: - return bbox, bbox_num - - def get_loss(self, ): - return {"loss": self._forward()} - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, extra_data = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - "extra_data": extra_data - } - else: - bbox_pred, bbox_num = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - } - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/tood.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/tood.py deleted file mode 100644 index 157ec6f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/tood.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['TOOD'] - - -@register -class TOOD(BaseArch): - """ - TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755 - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): 'FPN' instance - head (nn.Layer): 'TOODHead' instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, neck, head): - super(TOOD, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats) - if not self.training: - bboxes, bbox_num = self.head.post_process( - head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) - return bboxes, bbox_num - else: - loss = self.head.get_loss(head_outs, self.inputs) - return loss - - def get_loss(self): - return self._forward() - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/ttfnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/ttfnet.py deleted file mode 100644 index c3eb61c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/ttfnet.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
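
The use_extra_data branch of SSD._forward above is the one place in these hunks where scores come from a softmax rather than a sigmoid, and the axis juggling deserves a note. A sketch of that conversion (not part of the patch), assuming each element of preds_logits has shape [B, N_l, num_classes]:

import paddle
import paddle.nn.functional as F

def ssd_scores_from_logits(preds_logits):
    logits = paddle.concat(preds_logits, axis=1)  # [B, N, C], N = sum of N_l
    scores = F.softmax(logits, axis=-1)           # per-anchor class probabilities
    return scores.transpose([0, 2, 1])            # [B, C, N], as stored in extra_data
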
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['TTFNet'] - - -@register -class TTFNet(BaseArch): - """ - TTFNet network, see https://arxiv.org/abs/1909.00700 - - Args: - backbone (object): backbone instance - neck (object): 'TTFFPN' instance - ttf_head (object): 'TTFHead' instance - post_process (object): 'BBoxPostProcess' instance - """ - - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, - backbone='DarkNet', - neck='TTFFPN', - ttf_head='TTFHead', - post_process='BBoxPostProcess'): - super(TTFNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.ttf_head = ttf_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - ttf_head = create(cfg['ttf_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "ttf_head": ttf_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - body_feats = self.neck(body_feats) - hm, wh = self.ttf_head(body_feats) - if self.training: - return hm, wh - else: - bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'], - self.inputs['scale_factor']) - return bbox, bbox_num - - def get_loss(self, ): - loss = {} - heatmap = self.inputs['ttf_heatmap'] - box_target = self.inputs['ttf_box_target'] - reg_weight = self.inputs['ttf_reg_weight'] - hm, wh = self._forward() - head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target, - reg_weight) - loss.update(head_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - } - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolo.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/yolo.py deleted file mode 100644 index b004935..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolo.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
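
TTFNet.get_loss above (like SOLOv2 earlier) follows the ppdet convention of collecting named losses in a dict and appending their sum under the key 'loss'. The aggregation step in isolation (a sketch, not part of the patch):

import paddle

def finalize_loss(loss_dict):
    # The trainer steps on loss_dict['loss']; the named entries stay
    # available for per-component logging.
    loss_dict.update({'loss': paddle.add_n(list(loss_dict.values()))})
    return loss_dict
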
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from ..post_process import JDEBBoxPostProcess - -__all__ = ['YOLOv3'] -# YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3 -# PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py, especially when use distillation or aux head - - -@register -class YOLOv3(BaseArch): - __category__ = 'architecture' - __shared__ = ['data_format'] - __inject__ = ['post_process'] - - def __init__(self, - backbone='DarkNet', - neck='YOLOv3FPN', - yolo_head='YOLOv3Head', - post_process='BBoxPostProcess', - data_format='NCHW', - for_mot=False): - """ - YOLOv3 network, see https://arxiv.org/abs/1804.02767 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - yolo_head (nn.Layer): anchor_head instance - bbox_post_process (object): `BBoxPostProcess` instance - data_format (str): data format, NCHW or NHWC - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - super(YOLOv3, self).__init__(data_format=data_format) - self.backbone = backbone - self.neck = neck - self.yolo_head = yolo_head - self.post_process = post_process - self.for_mot = for_mot - self.return_idx = isinstance(post_process, JDEBBoxPostProcess) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - # head - kwargs = {'input_shape': neck.out_shape} - yolo_head = create(cfg['yolo_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "yolo_head": yolo_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.for_mot: - neck_feats = self.neck(body_feats, self.for_mot) - else: - neck_feats = self.neck(body_feats) - - if isinstance(neck_feats, dict): - assert self.for_mot == True - emb_feats = neck_feats['emb_feats'] - neck_feats = neck_feats['yolo_feats'] - - if self.training: - yolo_losses = self.yolo_head(neck_feats, self.inputs) - - if self.for_mot: - return {'det_losses': yolo_losses, 'emb_feats': emb_feats} - else: - return yolo_losses - - else: - yolo_head_outs = self.yolo_head(neck_feats) - - if self.for_mot: - # the detection part of JDE MOT model - boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors) - output = { - 'bbox': bbox, - 'bbox_num': bbox_num, - 'boxes_idx': boxes_idx, - 'nms_keep_idx': nms_keep_idx, - 'emb_feats': emb_feats, - } - else: - if self.return_idx: - # the detection part of JDE MOT model - _, bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors) - elif self.post_process is not None: - # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors - bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - else: - # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ - bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( - yolo_head_outs, self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ 
- extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) - # Todo: get logits output - extra_data['nms_keep_idx'] = nms_keep_idx - # Todo support for mask_anchors yolo - output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} - else: - output = {'bbox': bbox, 'bbox_num': bbox_num} - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolof.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/yolof.py deleted file mode 100644 index b6a2920..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolof.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['YOLOF'] - - -@register -class YOLOF(BaseArch): - __category__ = 'architecture' - - def __init__(self, - backbone='ResNet', - neck='DilatedEncoder', - head='YOLOFHead', - for_mot=False): - """ - YOLOF network, see https://arxiv.org/abs/2103.09460 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): DilatedEncoder instance - head (nn.Layer): YOLOFHead instance - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - super(YOLOF, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.for_mot = for_mot - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - # head - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - if self.training: - yolo_losses = self.head(neck_feats, self.inputs) - return yolo_losses - else: - yolo_head_outs = self.head(neck_feats) - bbox, bbox_num = self.head.post_process(yolo_head_outs, - self.inputs['im_shape'], - self.inputs['scale_factor']) - output = {'bbox': bbox, 'bbox_num': bbox_num} - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolox.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/yolox.py deleted file mode 100644 index 8e02e9e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolox.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -import random -import paddle -import paddle.nn.functional as F -import paddle.distributed as dist - -__all__ = ['YOLOX'] - - -@register -class YOLOX(BaseArch): - """ - YOLOX network, see https://arxiv.org/abs/2107.08430 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - head (nn.Layer): head instance - for_mot (bool): whether used for MOT or not - input_size (list[int]): initial scale, will be reset by self._preprocess() - size_stride (int): stride of the size range - size_range (list[int]): multi-scale range for training - random_interval (int): interval of iter to change self._input_size - """ - __category__ = 'architecture' - - def __init__(self, - backbone='CSPDarkNet', - neck='YOLOCSPPAN', - head='YOLOXHead', - for_mot=False, - input_size=[640, 640], - size_stride=32, - size_range=[15, 25], - random_interval=10): - super(YOLOX, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.for_mot = for_mot - - self.input_size = input_size - self._input_size = paddle.to_tensor(input_size) - self.size_stride = size_stride - self.size_range = size_range - self.random_interval = random_interval - self._step = 0 - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - # head - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - if self.training: - self._preprocess() - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - if self.training: - yolox_losses = self.head(neck_feats, self.inputs) - yolox_losses.update({'size': self._input_size[0]}) - return yolox_losses - else: - head_outs = self.head(neck_feats) - bbox, bbox_num = self.head.post_process( - head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) - return {'bbox': bbox, 'bbox_num': bbox_num} - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - - def _preprocess(self): - # YOLOX multi-scale training, interpolate resize before inputs of the network. 
- self._get_size() - scale_y = self._input_size[0] / self.input_size[0] - scale_x = self._input_size[1] / self.input_size[1] - if scale_x != 1 or scale_y != 1: - self.inputs['image'] = F.interpolate( - self.inputs['image'], - size=self._input_size, - mode='bilinear', - align_corners=False) - gt_bboxes = self.inputs['gt_bbox'] - for i in range(len(gt_bboxes)): - if len(gt_bboxes[i]) > 0: - gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x - gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y - self.inputs['gt_bbox'] = gt_bboxes - - def _get_size(self): - # random_interval = 10 as default, every 10 iters to change self._input_size - image_ratio = self.input_size[1] * 1.0 / self.input_size[0] - if self._step % self.random_interval == 0: - size_factor = random.randint(*self.size_range) - size = [ - self.size_stride * size_factor, - self.size_stride * int(size_factor * image_ratio) - ] - self._input_size = paddle.to_tensor(size) - self._step += 1 diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/__init__.py deleted file mode 100644 index f462a9f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import utils -from . import task_aligned_assigner -from . import atss_assigner -from . import simota_assigner -from . import max_iou_assigner -from . import fcosr_assigner -from . import rotated_task_aligned_assigner -from . import task_aligned_assigner_cr -from . import uniform_assigner - -from .utils import * -from .task_aligned_assigner import * -from .atss_assigner import * -from .simota_assigner import * -from .max_iou_assigner import * -from .fcosr_assigner import * -from .rotated_task_aligned_assigner import * -from .task_aligned_assigner_cr import * -from .uniform_assigner import * -from .hungarian_assigner import * -from .pose_utils import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/atss_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/atss_assigner.py deleted file mode 100644 index f1aae2b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/atss_assigner.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
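
Stepping back to YOLOX for a moment: _get_size above re-draws the training resolution every random_interval steps. A worked sketch of a single draw (not part of the patch):

import random

def draw_yolox_size(input_size=(640, 640), size_stride=32, size_range=(15, 25)):
    # Keep the aspect ratio of the base size and snap both sides to
    # multiples of the stride.
    image_ratio = input_size[1] / input_size[0]
    size_factor = random.randint(*size_range)
    return [size_stride * size_factor,
            size_stride * int(size_factor * image_ratio)]

With the defaults this yields square sizes from 480 to 800 in steps of 32, which is what _preprocess then interpolates both the images and the ground-truth boxes to.
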
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..bbox_utils import iou_similarity, batch_iou_similarity -from ..bbox_utils import bbox_center -from .utils import (check_points_inside_bboxes, compute_max_iou_anchor, - compute_max_iou_gt) - -__all__ = ['ATSSAssigner'] - - -@register -class ATSSAssigner(nn.Layer): - """Bridging the Gap Between Anchor-based and Anchor-free Detection - via Adaptive Training Sample Selection - """ - __shared__ = ['num_classes'] - - def __init__(self, - topk=9, - num_classes=80, - force_gt_matching=False, - eps=1e-9, - sm_use=False): - super(ATSSAssigner, self).__init__() - self.topk = topk - self.num_classes = num_classes - self.force_gt_matching = force_gt_matching - self.eps = eps - self.sm_use = sm_use - - def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, - pad_gt_mask): - gt2anchor_distances_list = paddle.split( - gt2anchor_distances, num_anchors_list, axis=-1) - num_anchors_index = np.cumsum(num_anchors_list).tolist() - num_anchors_index = [0, ] + num_anchors_index[:-1] - is_in_topk_list = [] - topk_idxs_list = [] - for distances, anchors_index in zip(gt2anchor_distances_list, - num_anchors_index): - num_anchors = distances.shape[-1] - _, topk_idxs = paddle.topk( - distances, self.topk, axis=-1, largest=False) - topk_idxs_list.append(topk_idxs + anchors_index) - is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( - axis=-2).astype(gt2anchor_distances.dtype) - is_in_topk_list.append(is_in_topk * pad_gt_mask) - is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1) - topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1) - return is_in_topk_list, topk_idxs_list - - @paddle.no_grad() - def forward(self, - anchor_bboxes, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None, - pred_bboxes=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py - - The assignment is done in following steps - 1. compute iou between all bbox (bbox of all pyramid levels) and gt - 2. compute center distance between all bbox and gt - 3. on each pyramid level, for each gt, select k bbox whose center - are closest to the gt center, so we total select k*l bbox as - candidates for each gt - 4. get corresponding iou for the these candidates, and compute the - mean and std, set mean + std as the iou threshold - 5. select these candidates whose iou are greater than or equal to - the threshold as positive - 6. limit the positive sample's center in gt - 7. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. 
- Args: - anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4), - "xmin, xmax, ymin, ymax" format - num_anchors_list (List): num of anchors in each level - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, - shape(B, n, 1), if None, then it will initialize with one_hot label - pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious - """ - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - num_anchors, _ = anchor_bboxes.shape - batch_size, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype='int32') - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, self.num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # 1. compute iou between gt and anchor bbox, [B, n, L] - ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes) - ious = ious.reshape([batch_size, -1, num_anchors]) - - # 2. compute center distance between all anchors and gt, [B, n, L] - gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1) - anchor_centers = bbox_center(anchor_bboxes) - gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \ - .norm(2, axis=-1).reshape([batch_size, -1, num_anchors]) - - # 3. on each pyramid level, selecting topk closest candidates - # based on the center distance, [B, n, L] - is_in_topk, topk_idxs = self._gather_topk_pyramid( - gt2anchor_distances, num_anchors_list, pad_gt_mask) - - # 4. get corresponding iou for the these candidates, and compute the - # mean and std, 5. set mean + std as the iou threshold - iou_candidates = ious * is_in_topk - iou_threshold = paddle.index_sample( - iou_candidates.flatten(stop_axis=-2), - topk_idxs.flatten(stop_axis=-2)) - iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1]) - iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \ - iou_threshold.std(axis=-1, keepdim=True) - is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk, - paddle.zeros_like(is_in_topk)) - - # 6. check the positive sample's center in gt, [B, n, L] - if self.sm_use: - is_in_gts = check_points_inside_bboxes( - anchor_centers, gt_bboxes, sm_use=True) - else: - is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes) - - # select positive sample, [B, n, L] - mask_positive = is_in_topk * is_in_gts * pad_gt_mask - - # 7. if an anchor box is assigned to multiple gts, - # the one with the highest iou will be selected. - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = ( - mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile( - [1, num_max_boxes, 1]).astype('bool') - if self.sm_use: - is_max_iou = compute_max_iou_anchor(ious * mask_positive) - else: - is_max_iou = compute_max_iou_anchor(ious) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - # 8. 
make sure every gt_bbox matches the anchor - if self.force_gt_matching: - is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask - mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile( - [1, num_max_boxes, 1]) - mask_positive = paddle.where(mask_max_iou, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - - assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) - ind = list(range(self.num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - if pred_bboxes is not None: - # assigned iou - ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive - ious = ious.max(axis=-2).unsqueeze(-1) - assigned_scores *= ious - elif gt_scores is not None: - gather_scores = paddle.gather( - gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) - gather_scores = gather_scores.reshape([batch_size, num_anchors]) - gather_scores = paddle.where(mask_positive_sum > 0, gather_scores, - paddle.zeros_like(gather_scores)) - assigned_scores *= gather_scores.unsqueeze(-1) - - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/clrnet_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/clrnet_assigner.py deleted file mode 100644 index 59c94a0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/clrnet_assigner.py +++ /dev/null @@ -1,147 +0,0 @@ -import paddle -import paddle.nn.functional as F -from ppdet.modeling.losses.clrnet_line_iou_loss import line_iou - - -def distance_cost(predictions, targets, img_w): - """ - repeat predictions and targets to generate all combinations - use the abs distance as the new distance cost - """ - num_priors = predictions.shape[0] - num_targets = targets.shape[0] - predictions = paddle.repeat_interleave( - predictions, num_targets, axis=0)[..., 6:] - targets = paddle.concat(x=num_priors * [targets])[..., 6:] - invalid_masks = (targets < 0) | (targets >= img_w) - lengths = (~invalid_masks).sum(axis=1) - distances = paddle.abs(x=targets - predictions) - distances[invalid_masks] = 0.0 - distances = distances.sum(axis=1) / (lengths.cast("float32") + 1e-09) - distances = distances.reshape([num_priors, num_targets]) - return distances - - -def focal_cost(cls_pred, gt_labels, alpha=0.25, gamma=2, eps=1e-12): - """ - Args: - cls_pred (Tensor): Predicted classification logits, shape - [num_query, num_class]. - gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 
- - Returns: - torch.Tensor: cls_cost value - """ - cls_pred = F.sigmoid(cls_pred) - neg_cost = -(1 - cls_pred + eps).log() * (1 - alpha) * cls_pred.pow(gamma) - pos_cost = -(cls_pred + eps).log() * alpha * (1 - cls_pred).pow(gamma) - cls_cost = pos_cost.index_select( - gt_labels, axis=1) - neg_cost.index_select( - gt_labels, axis=1) - return cls_cost - - -def dynamic_k_assign(cost, pair_wise_ious): - """ - Assign grouth truths with priors dynamically. - - Args: - cost: the assign cost. - pair_wise_ious: iou of grouth truth and priors. - - Returns: - prior_idx: the index of assigned prior. - gt_idx: the corresponding ground truth index. - """ - matching_matrix = paddle.zeros_like(cost) - ious_matrix = pair_wise_ious - ious_matrix[ious_matrix < 0] = 0.0 - n_candidate_k = 4 - topk_ious, _ = paddle.topk(ious_matrix, n_candidate_k, axis=0) - dynamic_ks = paddle.clip(x=topk_ious.sum(0).cast("int32"), min=1) - num_gt = cost.shape[1] - - for gt_idx in range(num_gt): - _, pos_idx = paddle.topk( - x=cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) - matching_matrix[pos_idx, gt_idx] = 1.0 - del topk_ious, dynamic_ks, pos_idx - matched_gt = matching_matrix.sum(axis=1) - - if (matched_gt > 1).sum() > 0: - matched_gt_indices = paddle.nonzero(matched_gt > 1)[:, 0] - cost_argmin = paddle.argmin( - cost.index_select(matched_gt_indices), axis=1) - matching_matrix[matched_gt_indices][0] *= 0.0 - matching_matrix[matched_gt_indices, cost_argmin] = 1.0 - - prior_idx = matching_matrix.sum(axis=1).nonzero() - gt_idx = matching_matrix[prior_idx].argmax(axis=-1) - return prior_idx.flatten(), gt_idx.flatten() - - -def cdist_paddle(x1, x2, p=2): - assert x1.shape[1] == x2.shape[1] - B, M = x1.shape - # if p == np.inf: - # dist = np.max(np.abs(x1[:, np.newaxis, :] - x2[np.newaxis, :, :]), axis=-1) - if p == 1: - dist = paddle.sum( - paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), axis=-1) - else: - dist = paddle.pow(paddle.sum(paddle.pow( - paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), p), - axis=-1), - 1 / p) - return dist - - -def assign(predictions, - targets, - img_w, - img_h, - distance_cost_weight=3.0, - cls_cost_weight=1.0): - """ - computes dynamicly matching based on the cost, including cls cost and lane similarity cost - Args: - predictions (Tensor): predictions predicted by each stage, shape: (num_priors, 78) - targets (Tensor): lane targets, shape: (num_targets, 78) - return: - matched_row_inds (Tensor): matched predictions, shape: (num_targets) - matched_col_inds (Tensor): matched targets, shape: (num_targets) - """ - predictions = predictions.detach().clone() - predictions[:, 3] *= img_w - 1 - predictions[:, 6:] *= img_w - 1 - - targets = targets.detach().clone() - distances_score = distance_cost(predictions, targets, img_w) - distances_score = 1 - distances_score / paddle.max(x=distances_score) + 0.01 - - cls_score = focal_cost(predictions[:, :2], targets[:, 1].cast('int64')) - - num_priors = predictions.shape[0] - num_targets = targets.shape[0] - target_start_xys = targets[:, 2:4] - target_start_xys[..., 0] *= (img_h - 1) - prediction_start_xys = predictions[:, 2:4] - prediction_start_xys[..., 0] *= (img_h - 1) - start_xys_score = cdist_paddle( - prediction_start_xys, target_start_xys, - p=2).reshape([num_priors, num_targets]) - - start_xys_score = 1 - start_xys_score / paddle.max(x=start_xys_score) + 0.01 - - target_thetas = targets[:, 4].unsqueeze(axis=-1) - theta_score = cdist_paddle( - predictions[:, 4].unsqueeze(axis=-1), target_thetas, - p=1).reshape([num_priors, 
num_targets]) * 180 - theta_score = 1 - theta_score / paddle.max(x=theta_score) + 0.01 - - cost = -(distances_score * start_xys_score * theta_score - )**2 * distance_cost_weight + cls_score * cls_cost_weight - iou = line_iou(predictions[..., 6:], targets[..., 6:], img_w, aligned=False) - - matched_row_inds, matched_col_inds = dynamic_k_assign(cost, iou) - return matched_row_inds, matched_col_inds diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/fcosr_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/fcosr_assigner.py deleted file mode 100644 index 46b743e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/fcosr_assigner.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather - -__all__ = ['FCOSRAssigner'] - -EPS = 1e-9 - - -@register -class FCOSRAssigner(nn.Layer): - """ FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details - - 1. compute normalized gaussian distribution score and refined gaussian distribution score - 2. refer to ellipse center sampling, sample points whose normalized gaussian distribution score is greater than threshold - 3. refer to multi-level sampling, assign ground truth to feature map which follows two conditions. - i). first, the ratio between the short edge of the target and the stride of the feature map is less than 2. - ii). second, the long edge of minimum bounding rectangle of the target is larger than the acceptance range of feature map - 4. 
refer to fuzzy sample label assignment, the points satisfying 2 and 3 will be assigned to the ground truth according to gaussian distribution score - """ - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - factor=12, - threshold=0.23, - boundary=[[-1, 128], [128, 320], [320, 10000]], - score_type='iou'): - super(FCOSRAssigner, self).__init__() - self.num_classes = num_classes - self.factor = factor - self.threshold = threshold - self.boundary = [ - paddle.to_tensor( - l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary - ] - self.score_type = score_type - - def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys): - # projecting points to coordinate system defined by each rbox - # [B, N, 4, 2] -> 4 * [B, N, 1, 2] - a, b, c, d = gt_polys.split(4, axis=2) - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - ab = b - a - ad = d - a - # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1] - xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1) - # [B, N, 2] -> [B, N, 1, 2] - xy = xy.unsqueeze(2) - # vector of points to center [B, N, L, 2] - vec = points - xy - # = |ab| * |vec| * cos(theta) [B, N, L] - vec_dot_ab = paddle.sum(vec * ab, axis=-1) - # = |ad| * |vec| * cos(theta) [B, N, L] - vec_dot_ad = paddle.sum(vec * ad, axis=-1) - # norm_ab [B, N, L] - norm_ab = paddle.sum(ab * ab, axis=-1).sqrt() - # norm_ad [B, N, L] - norm_ad = paddle.sum(ad * ad, axis=-1).sqrt() - # min(h, w), [B, N, 1] - min_edge = paddle.min(wh, axis=-1, keepdim=True) - # delta_x, delta_y [B, N, L] - delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS) - delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS) - # score [B, N, L] - norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y)) - - # simplified calculation - sigma = min_edge / self.factor - refined_score = norm_score / (2 * np.pi * sigma + EPS) - return norm_score, refined_score - - def get_rotated_inside_mask(self, points, gt_polys, scores): - inside_mask = check_points_in_polys(points, gt_polys) - center_mask = scores >= self.threshold - return (inside_mask & center_mask).cast(paddle.float32) - - def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor, - regress_range): - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - # [B, n, 4] -> [B, n, 1, 4] - x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1) - # [B, n, L, 2] - lt = points - x1y1 - rb = x2y2 - points - # [B, n, L, 4] - ltrb = paddle.concat([lt, rb], axis=-1) - # [B, n, L, 4] -> [B, n, L] - inside_mask = paddle.min(ltrb, axis=-1) > EPS - # regress_range [1, L, 2] -> [1, 1, L, 2] - regress_range = regress_range.unsqueeze(0) - # stride_tensor [1, L, 1] -> [1, 1, L] - stride_tensor = stride_tensor.transpose((0, 2, 1)) - # fcos range - # [B, n, L, 4] -> [B, n, L] - ltrb_max = paddle.max(ltrb, axis=-1) - # [1, 1, L, 2] -> [1, 1, L] - low, high = regress_range[..., 0], regress_range[..., 1] - # [B, n, L] - regress_mask = (ltrb_max >= low) & (ltrb_max <= high) - # mask for rotated - # [B, n, 1] - min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True) - # [B, n , L] - rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high) - mask = inside_mask & (regress_mask | rotated_mask) - return mask.cast(paddle.float32) - - @paddle.no_grad() - def forward(self, - anchor_points, - stride_tensor, - num_anchors_list, - gt_labels, - gt_bboxes, - gt_rboxes, - pad_gt_mask, - bg_index, - pred_rboxes=None): - r""" - - Args: - anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 
2), - "x, y" format - stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1) - num_anchors_list (List): num of anchors in each level - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5) - Returns: - assigned_labels (Tensor): (B, L) - assigned_rboxes (Tensor): (B, L, 5) - assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious - """ - - _, num_anchors, _ = anchor_points.shape - batch_size, num_max_boxes, _ = gt_rboxes.shape - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) - assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, self.num_classes]) - return assigned_labels, assigned_rboxes, assigned_scores - - # get normalized gaussian distribution score and refined distribution score - gt_polys = box2corners(gt_rboxes) - score, refined_score = self.get_gaussian_distribution_score( - anchor_points, gt_rboxes, gt_polys) - inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys, - score) - regress_ranges = [] - for num, bound in zip(num_anchors_list, self.boundary): - regress_ranges.append(bound.tile((1, num, 1))) - regress_ranges = paddle.concat(regress_ranges, axis=1) - regress_mask = self.get_inside_range_mask( - anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges) - # [B, n, L] - mask_positive = inside_mask * regress_mask * pad_gt_mask - refined_score = refined_score * mask_positive - (1. 
- mask_positive) - - argmax_refined_score = refined_score.argmax(axis=-2) - max_refined_score = refined_score.max(axis=-2) - assigned_gt_index = argmax_refined_score - - # assigned target - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - max_refined_score > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_rboxes = paddle.gather( - gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) - assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5]) - - assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) - ind = list(range(self.num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - - if self.score_type == 'gaussian': - selected_scores = paddle_gather( - score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2) - assigned_scores = assigned_scores * selected_scores.unsqueeze(-1) - elif self.score_type == 'iou': - assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None' - from ext_op import matched_rbox_iou - b, l = pred_rboxes.shape[:2] - iou_score = matched_rbox_iou( - pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape( - (-1, 5))).reshape((b, l, 1)) - assigned_scores = assigned_scores * iou_score - - return assigned_labels, assigned_rboxes, assigned_scores \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/hungarian_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/hungarian_assigner.py deleted file mode 100644 index 154c27c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/hungarian_assigner.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -try: - from scipy.optimize import linear_sum_assignment -except ImportError: - linear_sum_assignment = None - -import paddle - -from ppdet.core.workspace import register - -__all__ = ['PoseHungarianAssigner', 'PseudoSampler'] - - -class AssignResult: - """Stores assignments between predicted and truth boxes. - - Attributes: - num_gts (int): the number of truth boxes considered when computing this - assignment - - gt_inds (LongTensor): for each predicted box indicates the 1-based - index of the assigned truth box. 0 means unassigned and -1 means - ignore. - - max_overlaps (FloatTensor): the iou between the predicted box and its - assigned truth box. - - labels (None | LongTensor): If specified, for each predicted box - indicates the category label of the assigned truth box. 
- """ - - def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): - self.num_gts = num_gts - self.gt_inds = gt_inds - self.max_overlaps = max_overlaps - self.labels = labels - # Interface for possible user-defined properties - self._extra_properties = {} - - @property - def num_preds(self): - """int: the number of predictions in this assignment""" - return len(self.gt_inds) - - def set_extra_property(self, key, value): - """Set user-defined new property.""" - assert key not in self.info - self._extra_properties[key] = value - - def get_extra_property(self, key): - """Get user-defined property.""" - return self._extra_properties.get(key, None) - - @property - def info(self): - """dict: a dictionary of info about the object""" - basic_info = { - 'num_gts': self.num_gts, - 'num_preds': self.num_preds, - 'gt_inds': self.gt_inds, - 'max_overlaps': self.max_overlaps, - 'labels': self.labels, - } - basic_info.update(self._extra_properties) - return basic_info - - -@register -class PoseHungarianAssigner: - """Computes one-to-one matching between predictions and ground truth. - - This class computes an assignment between the targets and the predictions - based on the costs. The costs are weighted sum of three components: - classification cost, regression L1 cost and regression oks cost. The - targets don't include the no_object, so generally there are more - predictions than targets. After the one-to-one matching, the un-matched - are treated as backgrounds. Thus each query prediction will be assigned - with `0` or a positive integer indicating the ground truth index: - - - 0: negative sample, no assigned gt. - - positive integer: positive sample, index (1-based) of assigned gt. - - Args: - cls_weight (int | float, optional): The scale factor for classification - cost. Default 1.0. - kpt_weight (int | float, optional): The scale factor for regression - L1 cost. Default 1.0. - oks_weight (int | float, optional): The scale factor for regression - oks cost. Default 1.0. - """ - __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost'] - - def __init__(self, - cls_cost='ClassificationCost', - kpt_cost='KptL1Cost', - oks_cost='OksCost'): - self.cls_cost = cls_cost - self.kpt_cost = kpt_cost - self.oks_cost = oks_cost - - def assign(self, - cls_pred, - kpt_pred, - gt_labels, - gt_keypoints, - gt_areas, - img_meta, - eps=1e-7): - """Computes one-to-one matching based on the weighted costs. - - This method assign each query prediction to a ground truth or - background. The `assigned_gt_inds` with -1 means don't care, - 0 means negative sample, and positive number is the index (1-based) - of assigned gt. - The assignment is done in the following steps, the order matters. - - 1. assign every prediction to -1 - 2. compute the weighted costs - 3. do Hungarian matching on CPU based on the costs - 4. assign all to 0 (background) first, then for each matched pair - between predictions and gts, treat this prediction as foreground - and assign the corresponding gt index (plus 1) to it. - - Args: - cls_pred (Tensor): Predicted classification logits, shape - [num_query, num_class]. - kpt_pred (Tensor): Predicted keypoints with normalized coordinates - (x_{i}, y_{i}), which are all in range [0, 1]. Shape - [num_query, K*2]. - gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,). - gt_keypoints (Tensor): Ground truth keypoints with unnormalized - coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ - p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3]. - gt_areas (Tensor): Ground truth mask areas, shape (num_gt,). 
- img_meta (dict): Meta information for current image. - eps (int | float, optional): A value added to the denominator for - numerical stability. Default 1e-7. - - Returns: - :obj:`AssignResult`: The assigned result. - """ - num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0] - if not gt_keypoints.astype('bool').any(): - num_gts = 0 - - # 1. assign -1 by default - assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64") - assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64") - if num_gts == 0 or num_kpts == 0: - # No ground truth or keypoints, return empty assignment - if num_gts == 0: - # No ground truth, assign all to background - assigned_gt_inds[:] = 0 - return AssignResult( - num_gts, assigned_gt_inds, None, labels=assigned_labels) - img_h, img_w, _ = img_meta['img_shape'] - factor = paddle.to_tensor( - [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape( - (1, -1)) - - # 2. compute the weighted costs - # classification cost - cls_cost = self.cls_cost(cls_pred, gt_labels) - - # keypoint regression L1 cost - gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1, - 3)) - valid_kpt_flag = gt_keypoints_reshape[..., -1] - kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, - 2)) - normalize_gt_keypoints = gt_keypoints_reshape[ - ..., :2] / factor[:, :2].unsqueeze(0) - kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints, - valid_kpt_flag) - # keypoint OKS cost - kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, - 2)) - kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0) - oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2], - valid_kpt_flag, gt_areas) - # weighted sum of above three costs - cost = cls_cost + kpt_cost + oks_cost - - # 3. do Hungarian matching on CPU using linear_sum_assignment - cost = cost.detach().cpu() - if linear_sum_assignment is None: - raise ImportError('Please run "pip install scipy" ' - 'to install scipy first.') - matched_row_inds, matched_col_inds = linear_sum_assignment(cost) - matched_row_inds = paddle.to_tensor(matched_row_inds) - matched_col_inds = paddle.to_tensor(matched_col_inds) - - # 4. assign backgrounds and foregrounds - # assign all indices to backgrounds first - assigned_gt_inds[:] = 0 - # assign foregrounds based on matching results - assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 - assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][ - ..., 0].astype("int64") - return AssignResult( - num_gts, assigned_gt_inds, None, labels=assigned_labels) - - -class SamplingResult: - """Bbox sampling result. 
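A toy illustration of step 3 above (Hungarian matching on a CPU cost matrix), assuming scipy is installed; the cost values are made up:

import paddle
from scipy.optimize import linear_sum_assignment

cost = paddle.to_tensor([[0.2, 0.9], [0.8, 0.1], [0.5, 0.6]])  # [num_query, num_gt], toy values
matched_row_inds, matched_col_inds = linear_sum_assignment(cost.numpy())
# matched_row_inds -> query indices, matched_col_inds -> 0-based gt indices;
# the assigner stores matched_col_inds + 1 so that 0 can denote background.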
- """ - - def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, - gt_flags): - self.pos_inds = pos_inds - self.neg_inds = neg_inds - if pos_inds.size > 0: - self.pos_bboxes = bboxes[pos_inds] - self.neg_bboxes = bboxes[neg_inds] - self.pos_is_gt = gt_flags[pos_inds] - - self.num_gts = gt_bboxes.shape[0] - self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 - - if gt_bboxes.numel() == 0: - # hack for index error case - assert self.pos_assigned_gt_inds.numel() == 0 - self.pos_gt_bboxes = paddle.zeros( - gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4)) - else: - if len(gt_bboxes.shape) < 2: - gt_bboxes = gt_bboxes.reshape((-1, 4)) - - self.pos_gt_bboxes = paddle.index_select( - gt_bboxes, - self.pos_assigned_gt_inds.astype('int64'), - axis=0) - - if assign_result.labels is not None: - self.pos_gt_labels = assign_result.labels[pos_inds] - else: - self.pos_gt_labels = None - - @property - def bboxes(self): - """paddle.Tensor: concatenated positive and negative boxes""" - return paddle.concat([self.pos_bboxes, self.neg_bboxes]) - - def __nice__(self): - data = self.info.copy() - data['pos_bboxes'] = data.pop('pos_bboxes').shape - data['neg_bboxes'] = data.pop('neg_bboxes').shape - parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] - body = ' ' + ',\n '.join(parts) - return '{\n' + body + '\n}' - - @property - def info(self): - """Returns a dictionary of info about the object.""" - return { - 'pos_inds': self.pos_inds, - 'neg_inds': self.neg_inds, - 'pos_bboxes': self.pos_bboxes, - 'neg_bboxes': self.neg_bboxes, - 'pos_is_gt': self.pos_is_gt, - 'num_gts': self.num_gts, - 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, - } - - -@register -class PseudoSampler: - """A pseudo sampler that does not do sampling actually.""" - - def __init__(self, **kwargs): - pass - - def _sample_pos(self, **kwargs): - """Sample positive samples.""" - raise NotImplementedError - - def _sample_neg(self, **kwargs): - """Sample negative samples.""" - raise NotImplementedError - - def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs): - """Directly returns the positive and negative indices of samples. - - Args: - assign_result (:obj:`AssignResult`): Assigned results - bboxes (paddle.Tensor): Bounding boxes - gt_bboxes (paddle.Tensor): Ground truth boxes - - Returns: - :obj:`SamplingResult`: sampler results - """ - pos_inds = paddle.nonzero( - assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) - neg_inds = paddle.nonzero( - assign_result.gt_inds == 0, as_tuple=False).squeeze(-1) - gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32') - sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, - assign_result, gt_flags) - return sampling_result diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/max_iou_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/max_iou_assigner.py deleted file mode 100644 index 98a4fdf..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/max_iou_assigner.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register -from ppdet.modeling.proposal_generator.target import label_box - -__all__ = ['MaxIoUAssigner'] - -@register -class MaxIoUAssigner(object): - """A standard bbox assigner based on max IoU, using ppdet's label_box - as the backend. - Args: - positive_overlap (float): threshold for defining positive samples - negative_overlap (float): threshold for defining negative samples - allow_low_quality (bool): whether to lower IoU thr if a GT poorly - overlaps with candidate bboxes - """ - def __init__(self, - positive_overlap, - negative_overlap, - allow_low_quality=True): - self.positive_overlap = positive_overlap - self.negative_overlap = negative_overlap - self.allow_low_quality = allow_low_quality - - def __call__(self, bboxes, gt_bboxes): - matches, match_labels = label_box( - bboxes, - gt_bboxes, - positive_overlap=self.positive_overlap, - negative_overlap=self.negative_overlap, - allow_low_quality=self.allow_low_quality, - ignore_thresh=-1, - is_crowd=None, - assign_on_cpu=False) - return matches, match_labels diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/pose_utils.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/pose_utils.py deleted file mode 100644 index 313215a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/pose_utils.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn.functional as F - -from ppdet.core.workspace import register - -__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost'] - - -def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -@register -class KptL1Cost(object): - """KptL1Cost. - - this function is based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py - - Args: - weight (int | float, optional): loss_weight. - """ - - def __init__(self, weight=1.0): - self.weight = weight - - def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag): - """ - Args: - kpt_pred (Tensor): Predicted keypoints with normalized coordinates - (x_{i}, y_{i}), which are all in range [0, 1]. Shape - [num_query, K, 2]. - gt_keypoints (Tensor): Ground truth keypoints with normalized - coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2]. - valid_kpt_flag (Tensor): valid flag of ground truth keypoints. - Shape [num_gt, K]. - - Returns: - paddle.Tensor: kpt_cost value with weight.
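The masking-and-normalization recipe above can be reproduced in a few lines; a minimal sketch with toy shapes (3 queries, 2 gts, 4 keypoints), not the original implementation:

import paddle

kpt_pred = paddle.rand([3, 4, 2])    # normalized (x, y) predictions
gt_kpts = paddle.rand([2, 4, 2])     # normalized ground-truth keypoints
valid = paddle.to_tensor([[1., 1., 0., 1.], [1., 0., 0., 1.]])  # visibility flags

costs = []
for i in range(gt_kpts.shape[0]):
    mask = valid[i].unsqueeze(-1)                        # zero out invisible joints
    diff = (kpt_pred - gt_kpts[i].unsqueeze(0)) * mask
    avg_factor = paddle.clip(mask.sum() * 2, min=1.0)    # 2 coords per visible joint
    costs.append((diff.abs().sum(axis=[1, 2]) / avg_factor).unsqueeze(-1))
cost_matrix = paddle.concat(costs, axis=1)               # [num_query, num_gt] L1 cost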
- """ - kpt_cost = [] - for i in range(len(gt_keypoints)): - if gt_keypoints[i].size == 0: - kpt_cost.append(kpt_pred.sum() * 0) - kpt_pred_tmp = kpt_pred.clone() - valid_flag = valid_kpt_flag[i] > 0 - valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as( - kpt_pred_tmp) - if not valid_flag_expand.all(): - kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0) - cost = F.pairwise_distance( - kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)), - gt_keypoints[i].reshape((-1, )).unsqueeze(0), - p=1, - keepdim=True) - avg_factor = paddle.clip( - valid_flag.astype('float32').sum() * 2, 1.0) - cost = cost / avg_factor - kpt_cost.append(cost) - kpt_cost = paddle.concat(kpt_cost, axis=1) - return kpt_cost * self.weight - - -@register -class OksCost(object): - """OksCost. - - this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py - - Args: - num_keypoints (int): number of keypoints - weight (int | float, optional): loss_weight. - """ - - def __init__(self, num_keypoints=17, weight=1.0): - self.weight = weight - if num_keypoints == 17: - self.sigmas = np.array( - [ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, - 1.07, .87, .87, .89, .89 - ], - dtype=np.float32) / 10.0 - elif num_keypoints == 14: - self.sigmas = np.array( - [ - .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, - .89, .79, .79 - ], - dtype=np.float32) / 10.0 - else: - raise ValueError(f'Unsupported keypoints number {num_keypoints}') - - def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas): - """ - Args: - kpt_pred (Tensor): Predicted keypoints with unnormalized - coordinates (x_{i}, y_{i}). Shape [num_query, K, 2]. - gt_keypoints (Tensor): Ground truth keypoints with unnormalized - coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2]. - valid_kpt_flag (Tensor): valid flag of ground truth keypoints. - Shape [num_gt, K]. - gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,]. - - Returns: - paddle.Tensor: oks_cost value with weight. - """ - sigmas = paddle.to_tensor(self.sigmas) - variances = (sigmas * 2)**2 - - oks_cost = [] - assert len(gt_keypoints) == len(gt_areas) - for i in range(len(gt_keypoints)): - if gt_keypoints[i].size == 0: - oks_cost.append(kpt_pred.sum() * 0) - squared_distance = \ - (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \ - (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2 - vis_flag = (valid_kpt_flag[i] > 0).astype('int') - vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0] - num_vis_kpt = vis_ind.shape[0] - # assert num_vis_kpt > 0 - if num_vis_kpt == 0: - oks_cost.append(paddle.zeros((squared_distance.shape[0], 1))) - continue - area = gt_areas[i] - - squared_distance0 = squared_distance / (area * variances * 2) - squared_distance0 = paddle.index_select( - squared_distance0, vis_ind, axis=1) - squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1, - keepdim=True) - oks = squared_distance1 / num_vis_kpt - # The 1 is a constant that doesn't change the matching, so omitted. - oks_cost.append(-oks) - oks_cost = paddle.concat(oks_cost, axis=1) - return oks_cost * self.weight - - -@register -class ClassificationCost: - """ClsSoftmaxCost. - - Args: - weight (int | float, optional): loss_weight - """ - - def __init__(self, weight=1.): - self.weight = weight - - def __call__(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classification logits, shape - (num_query, num_class). 
- gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). - - Returns: - paddle.Tensor: cls_cost value with weight - """ - # Following the official DETR repo, contrary to the loss that - # NLL is used, we approximate it in 1 - cls_score[gt_label]. - # The 1 is a constant that doesn't change the matching, - # so it can be omitted. - cls_score = cls_pred.softmax(-1) - cls_cost = -cls_score[:, gt_labels] - return cls_cost * self.weight - - -@register -class FocalLossCost: - """FocalLossCost. - - Args: - weight (int | float, optional): loss_weight - alpha (int | float, optional): focal_loss alpha - gamma (int | float, optional): focal_loss gamma - eps (float, optional): default 1e-12 - binary_input (bool, optional): Whether the input is binary, - default False. - """ - - def __init__(self, - weight=1., - alpha=0.25, - gamma=2, - eps=1e-12, - binary_input=False): - self.weight = weight - self.alpha = alpha - self.gamma = gamma - self.eps = eps - self.binary_input = binary_input - - def _focal_loss_cost(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classification logits, shape - (num_query, num_class). - gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). - - Returns: - paddle.Tensor: cls_cost value with weight - """ - if gt_labels.size == 0: - return cls_pred.sum() * 0 - cls_pred = F.sigmoid(cls_pred) - neg_cost = -(1 - cls_pred + self.eps).log() * ( - 1 - self.alpha) * cls_pred.pow(self.gamma) - pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( - 1 - cls_pred).pow(self.gamma) - - cls_cost = paddle.index_select( - pos_cost, gt_labels, axis=1) - paddle.index_select( - neg_cost, gt_labels, axis=1) - return cls_cost * self.weight - - def _mask_focal_loss_cost(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classfication logits - in shape (num_query, d1, ..., dn), dtype=paddle.float32. - gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), - dtype=paddle.long. Labels should be binary. - - Returns: - Tensor: Focal cost matrix with weight in shape\ - (num_query, num_gt). - """ - cls_pred = cls_pred.flatten(1) - gt_labels = gt_labels.flatten(1).float() - n = cls_pred.shape[1] - cls_pred = F.sigmoid(cls_pred) - neg_cost = -(1 - cls_pred + self.eps).log() * ( - 1 - self.alpha) * cls_pred.pow(self.gamma) - pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( - 1 - cls_pred).pow(self.gamma) - - cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \ - paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) - return cls_cost / n * self.weight - - def __call__(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classfication logits. - gt_labels (Tensor)): Labels. - - Returns: - Tensor: Focal cost matrix with weight in shape\ - (num_query, num_gt). - """ - if self.binary_input: - return self._mask_focal_loss_cost(cls_pred, gt_labels) - else: - return self._focal_loss_cost(cls_pred, gt_labels) diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/rotated_task_aligned_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/rotated_task_aligned_assigner.py deleted file mode 100644 index eeb9a68..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/rotated_task_aligned_assigner.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes -from .utils import gather_topk_anchors, compute_max_iou_anchor - -__all__ = ['RotatedTaskAlignedAssigner'] - - -@register -class RotatedTaskAlignedAssigner(nn.Layer): - """TOOD: Task-aligned One-stage Object Detection - """ - - def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9): - super(RotatedTaskAlignedAssigner, self).__init__() - self.topk = topk - self.alpha = alpha - self.beta = beta - self.eps = eps - - @paddle.no_grad() - def forward(self, - pred_scores, - pred_bboxes, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py - - The assignment is done in following steps - 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt - 2. select top-k bbox as candidates for each gt - 3. limit the positive sample's center in gt (because the anchor-free detector - only can predict positive distance) - 4. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. 
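Step 1 above boils down to the task-alignment metric t = s**alpha * u**beta; a toy sketch with made-up shapes (batch 1, 2 gts, 5 anchors):

import paddle

alpha, beta = 1.0, 6.0
cls_score = paddle.rand([1, 2, 5])   # s: predicted probability of each gt's class
ious = paddle.rand([1, 2, 5])        # u: IoU between predicted rbox and gt
align_metric = cls_score.pow(alpha) * ious.pow(beta)
# step 2: top-k candidate anchors per gt along the anchor axis
topk_metric, topk_idx = paddle.topk(align_metric, k=3, axis=-1)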
- Args: - pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) - pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5) - anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format - num_anchors_list (List): num of anchors in each level, shape(L) - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 5) - assigned_scores (Tensor): (B, L, C) - """ - assert pred_scores.ndim == pred_bboxes.ndim - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - batch_size, num_anchors, num_classes = pred_scores.shape - _, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # compute iou between gt and pred bbox, [B, n, L] - ious = rotated_iou_similarity(gt_bboxes, pred_bboxes) - ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious) - ious.stop_gradient = True - # gather pred bboxes class score - pred_scores = pred_scores.transpose([0, 2, 1]) - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - gt_labels_ind = paddle.stack( - [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], - axis=-1) - bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) - # compute alignment metrics, [B, n, L] - alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( - self.beta) - - # check the positive sample's center in gt, [B, n, L] - is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes) - - # select topk largest alignment metrics pred bbox as candidates - # for each gt, [B, n, L] - is_in_topk = gather_topk_anchors( - alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) - - # select positive sample, [B, n, L] - mask_positive = is_in_topk * is_in_gts * pad_gt_mask - - # if an anchor box is assigned to multiple gts, - # the one with the highest iou will be selected, [B, n, L] - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( - [1, num_max_boxes, 1]) - is_max_iou = compute_max_iou_anchor(ious) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5]) - - assigned_scores = F.one_hot(assigned_labels, num_classes + 1) - ind = 
list(range(num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - # rescale alignment metrics - alignment_metrics *= mask_positive - max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) - max_ious_per_instance = (ious * mask_positive).max(axis=-1, - keepdim=True) - alignment_metrics = alignment_metrics / ( - max_metrics_per_instance + self.eps) * max_ious_per_instance - alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) - assigned_scores = assigned_scores * alignment_metrics - - assigned_bboxes.stop_gradient = True - assigned_scores.stop_gradient = True - assigned_labels.stop_gradient = True - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/simota_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/simota_assigner.py deleted file mode 100644 index 4ec87cb..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/simota_assigner.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py - -import paddle -import numpy as np -import paddle.nn.functional as F - -from ppdet.modeling.losses.varifocal_loss import varifocal_loss -from ppdet.modeling.bbox_utils import batch_bbox_overlaps -from ppdet.core.workspace import register - - -@register -class SimOTAAssigner(object): - """Computes matching between predictions and ground truth. - Args: - center_radius (int | float, optional): Ground truth center size - to judge whether a prior is in center. Default 2.5. - candidate_topk (int, optional): The candidate top-k which used to - get top-k ious to calculate dynamic-k. Default 10. - iou_weight (int | float, optional): The scale factor for regression - iou cost. Default 3.0. - cls_weight (int | float, optional): The scale factor for classification - cost. Default 1.0. - num_classes (int): The num_classes of dataset. - use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix. 
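A compact sketch of the dynamic-k rule this assigner applies below (each gt's k is the clipped integer sum of its top candidate IoUs); shapes and values are illustrative:

import paddle

candidate_topk = 10
pairwise_ious = paddle.rand([100, 3])                    # [num_candidates, num_gt], toy
topk_ious, _ = paddle.topk(pairwise_ious, k=candidate_topk, axis=0)
dynamic_ks = paddle.clip(topk_ious.sum(axis=0).cast('int32'), min=1)
# each gt g then keeps its dynamic_ks[g] lowest-cost candidates as positives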
- """ - __shared__ = ['num_classes'] - - def __init__(self, - center_radius=2.5, - candidate_topk=10, - iou_weight=3.0, - cls_weight=1.0, - num_classes=80, - use_vfl=True): - self.center_radius = center_radius - self.candidate_topk = candidate_topk - self.iou_weight = iou_weight - self.cls_weight = cls_weight - self.num_classes = num_classes - self.use_vfl = use_vfl - - def get_in_gt_and_in_center_info(self, flatten_center_and_stride, - gt_bboxes): - num_gt = gt_bboxes.shape[0] - - flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile( - [1, num_gt]) - flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile( - [1, num_gt]) - flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile( - [1, num_gt]) - flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile( - [1, num_gt]) - - # is prior centers in gt bboxes, shape: [n_center, n_gt] - l_ = flatten_x - gt_bboxes[:, 0] - t_ = flatten_y - gt_bboxes[:, 1] - r_ = gt_bboxes[:, 2] - flatten_x - b_ = gt_bboxes[:, 3] - flatten_y - - deltas = paddle.stack([l_, t_, r_, b_], axis=1) - is_in_gts = deltas.min(axis=1) > 0 - is_in_gts_all = is_in_gts.sum(axis=1) > 0 - - # is prior centers in gt centers - gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 - gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 - ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x - ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y - ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x - ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y - - cl_ = flatten_x - ct_bound_l - ct_ = flatten_y - ct_bound_t - cr_ = ct_bound_r - flatten_x - cb_ = ct_bound_b - flatten_y - - ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1) - is_in_cts = ct_deltas.min(axis=1) > 0 - is_in_cts_all = is_in_cts.sum(axis=1) > 0 - - # in any of gts or gt centers, shape: [n_center] - is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all, - is_in_cts_all) - - is_in_gts_or_centers_all_inds = paddle.nonzero( - is_in_gts_or_centers_all).squeeze(1) - - # both in gts and gt centers, shape: [num_fg, num_gt] - is_in_gts_and_centers = paddle.logical_and( - paddle.gather( - is_in_gts.cast('int'), is_in_gts_or_centers_all_inds, - axis=0).cast('bool'), - paddle.gather( - is_in_cts.cast('int'), is_in_gts_or_centers_all_inds, - axis=0).cast('bool')) - return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers - - def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt): - match_matrix = np.zeros_like(cost_matrix.numpy()) - # select candidate topk ious for dynamic-k calculation - topk_ious, _ = paddle.topk( - pairwise_ious, - min(self.candidate_topk, pairwise_ious.shape[0]), - axis=0) - # calculate dynamic k for each gt - dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1) - for gt_idx in range(num_gt): - _, pos_idx = paddle.topk( - cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) - match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0 - - del topk_ious, dynamic_ks, pos_idx - - # match points more than two gts - extra_match_gts_mask = match_matrix.sum(1) > 1 - if extra_match_gts_mask.sum() > 0: - cost_matrix = cost_matrix.numpy() - cost_argmin = np.argmin( - cost_matrix[extra_match_gts_mask, :], axis=1) - match_matrix[extra_match_gts_mask, :] *= 0.0 - match_matrix[extra_match_gts_mask, cost_argmin] = 1.0 - # get foreground mask - match_fg_mask_inmatrix = match_matrix.sum(1) > 0 - match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, :].argmax(1) - - return 
match_gt_inds_to_fg, match_fg_mask_inmatrix - - def get_sample(self, assign_gt_inds, gt_bboxes): - pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) - neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) - pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 - - if gt_bboxes.size == 0: - # hack for index error case - assert pos_assigned_gt_inds.size == 0 - pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) - else: - if len(gt_bboxes.shape) < 2: - gt_bboxes = gt_bboxes.resize(-1, 4) - pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] - return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds - - def __call__(self, - flatten_cls_pred_scores, - flatten_center_and_stride, - flatten_bboxes, - gt_bboxes, - gt_labels, - eps=1e-7): - """Assign gt to priors using SimOTA. - TODO: add comment. - Returns: - assign_result: The assigned result. - """ - num_gt = gt_bboxes.shape[0] - num_bboxes = flatten_bboxes.shape[0] - - if num_gt == 0 or num_bboxes == 0: - # No ground truth or boxes - label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes - label_weight = np.ones([num_bboxes], dtype=np.float32) - bbox_target = np.zeros_like(flatten_center_and_stride) - return 0, label, label_weight, bbox_target - - is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( - flatten_center_and_stride, gt_bboxes) - - # bboxes and scores to calculate matrix - valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds] - valid_cls_pred_scores = flatten_cls_pred_scores[ - is_in_gts_or_centers_all_inds] - num_valid_bboxes = valid_flatten_bboxes.shape[0] - - pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes, - gt_bboxes) # [num_points,num_gts] - if self.use_vfl: - gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile( - [num_valid_bboxes, 1]).reshape([-1]) - valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( - [1, num_gt, 1]).reshape([-1, self.num_classes]) - vfl_score = np.zeros(valid_pred_scores.shape) - vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy( - )] = pairwise_ious.reshape([-1]) - vfl_score = paddle.to_tensor(vfl_score) - losses_vfl = varifocal_loss( - valid_pred_scores, vfl_score, - use_sigmoid=False).reshape([num_valid_bboxes, num_gt]) - losses_giou = batch_bbox_overlaps( - valid_flatten_bboxes, gt_bboxes, mode='giou') - cost_matrix = ( - losses_vfl * self.cls_weight + losses_giou * self.iou_weight + - paddle.logical_not(is_in_boxes_and_center).cast('float32') * - 100000000) - else: - iou_cost = -paddle.log(pairwise_ious + eps) - gt_onehot_label = (F.one_hot( - gt_labels.squeeze(-1).cast(paddle.int64), - flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0) - .tile([num_valid_bboxes, 1, 1])) - - valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( - [1, num_gt, 1]) - cls_cost = F.binary_cross_entropy( - valid_pred_scores, gt_onehot_label, reduction='none').sum(-1) - - cost_matrix = ( - cls_cost * self.cls_weight + iou_cost * self.iou_weight + - paddle.logical_not(is_in_boxes_and_center).cast('float32') * - 100000000) - - match_gt_inds_to_fg, match_fg_mask_inmatrix = \ - self.dynamic_k_matching( - cost_matrix, pairwise_ious, num_gt) - - # sample and assign results - assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64) - match_fg_mask_inall = np.zeros_like(assigned_gt_inds) - match_fg_mask_inall[is_in_gts_or_centers_all.numpy( - )] = match_fg_mask_inmatrix - - assigned_gt_inds[match_fg_mask_inall.astype( - np.bool_)] = match_gt_inds_to_fg + 1 - - pos_inds, 
neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \ - = self.get_sample(assigned_gt_inds, gt_bboxes.numpy()) - - bbox_target = np.zeros(flatten_bboxes.shape, paddle.common_ops_import.convert_dtype(flatten_bboxes.dtype)) - bbox_weight = np.zeros_like(bbox_target) - label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes - label_weight = np.zeros([num_bboxes], dtype=np.float32) - - if len(pos_inds) > 0: - gt_labels = gt_labels.numpy() - pos_bbox_targets = pos_gt_bboxes - bbox_target[pos_inds, :] = pos_bbox_targets - bbox_weight[pos_inds, :] = 1.0 - if not np.any(gt_labels): - label[pos_inds] = 0 - else: - label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds] - - label_weight[pos_inds] = 1.0 - if len(neg_inds) > 0: - label_weight[neg_inds] = 1.0 - - pos_num = max(pos_inds.size, 1) - - return pos_num, label, label_weight, bbox_target diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner.py deleted file mode 100644 index 23af794..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..bbox_utils import batch_iou_similarity -from .utils import (gather_topk_anchors, check_points_inside_bboxes, - compute_max_iou_anchor) - -__all__ = ['TaskAlignedAssigner'] - - -def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.): - """Calculate distance ratio of box1 and box2 in batch for larger stride - anchors dist/stride to promote the survive of large distance match - Args: - anchor (Tensor): box with the shape [L, 2] - gt (Tensor): box with the shape [N, M2, 4] - Return: - dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2] - """ - center1 = anchor.unsqueeze(0) - center2 = (gt[..., :2] + gt[..., -2:]) / 2. - center1 = center1.unsqueeze(1) # [N, M1, 2] -> [N, 1, M1, 2] - center2 = center2.unsqueeze(2) # [N, M2, 2] -> [N, M2, 1, 2] - - stride = paddle.concat([ - paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst) - ]).unsqueeze(0).unsqueeze(0) - dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride - dist_ratio = dist - dist_ratio[dist < max_dist] = 1. - dist_ratio[dist >= max_dist] = 0. 
- return dist_ratio - - -@register -class TaskAlignedAssigner(nn.Layer): - """TOOD: Task-aligned One-stage Object Detection - """ - - def __init__(self, - topk=13, - alpha=1.0, - beta=6.0, - eps=1e-9, - is_close_gt=False): - super(TaskAlignedAssigner, self).__init__() - self.topk = topk - self.alpha = alpha - self.beta = beta - self.eps = eps - self.is_close_gt = is_close_gt - - @paddle.no_grad() - def forward(self, - pred_scores, - pred_bboxes, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py - - The assignment is done in following steps - 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt - 2. select top-k bbox as candidates for each gt - 3. limit the positive sample's center in gt (because the anchor-free detector - only can predict positive distance) - 4. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. - Args: - pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) - pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) - anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format - num_anchors_list (List): num of anchors in each level, shape(L) - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C) - """ - assert pred_scores.ndim == pred_bboxes.ndim - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - batch_size, num_anchors, num_classes = pred_scores.shape - _, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype='int32') - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # compute iou between gt and pred bbox, [B, n, L] - ious = batch_iou_similarity(gt_bboxes, pred_bboxes) - # gather pred bboxes class score - pred_scores = pred_scores.transpose([0, 2, 1]) - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - gt_labels_ind = paddle.stack( - [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], - axis=-1) - bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) - # compute alignment metrics, [B, n, L] - alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( - self.beta) - - # check the positive sample's center in gt, [B, n, L] - if self.is_close_gt: - is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list) - else: - is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes) - - # select topk largest alignment metrics pred bbox as candidates - # for each gt, [B, n, L] - is_in_topk = gather_topk_anchors( - alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) - - # select positive sample, [B, n, L] - mask_positive = is_in_topk * is_in_gts * pad_gt_mask - - # if an anchor box is assigned to multiple gts, - # the one with the highest iou 
will be selected, [B, n, L] - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( - [1, num_max_boxes, 1]) - is_max_iou = compute_max_iou_anchor(ious) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - - assigned_scores = F.one_hot(assigned_labels, num_classes + 1) - ind = list(range(num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - # rescale alignment metrics - alignment_metrics *= mask_positive - max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) - max_ious_per_instance = (ious * mask_positive).max(axis=-1, - keepdim=True) - alignment_metrics = alignment_metrics / ( - max_metrics_per_instance + self.eps) * max_ious_per_instance - alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) - assigned_scores = assigned_scores * alignment_metrics - - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner_cr.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner_cr.py deleted file mode 100644 index 5c50976..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner_cr.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
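Both task-aligned assigners above build soft classification targets the same way: one-hot over num_classes + 1, then drop the background column. A minimal sketch, assuming background is the appended last index:

import paddle
import paddle.nn.functional as F

num_classes, bg_index = 4, 4
assigned_labels = paddle.to_tensor([[0, 4, 2]])           # 4 == background here
scores = F.one_hot(assigned_labels, num_classes + 1)      # [1, 3, 5]
keep = [i for i in range(num_classes + 1) if i != bg_index]
scores = paddle.index_select(scores, paddle.to_tensor(keep), axis=-1)  # [1, 3, 4]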
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..bbox_utils import batch_iou_similarity -from .utils import (gather_topk_anchors, check_points_inside_bboxes, - compute_max_iou_anchor) - -__all__ = ['TaskAlignedAssigner_CR'] - - -@register -class TaskAlignedAssigner_CR(nn.Layer): - """TOOD: Task-aligned One-stage Object Detection with Center R - """ - - def __init__(self, - topk=13, - alpha=1.0, - beta=6.0, - center_radius=None, - eps=1e-9): - super(TaskAlignedAssigner_CR, self).__init__() - self.topk = topk - self.alpha = alpha - self.beta = beta - self.center_radius = center_radius - self.eps = eps - - @paddle.no_grad() - def forward(self, - pred_scores, - pred_bboxes, - anchor_points, - stride_tensor, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py - - The assignment is done in following steps - 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt - 2. select top-k bbox as candidates for each gt - 3. limit the positive sample's center in gt (because the anchor-free detector - only can predict positive distance) - 4. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. - Args: - pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) - pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) - anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format - stride_tensor (Tensor, float32): stride of feature map, shape(L, 1) - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C) - """ - assert pred_scores.ndim == pred_bboxes.ndim - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - batch_size, num_anchors, num_classes = pred_scores.shape - _, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype='int32') - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # compute iou between gt and pred bbox, [B, n, L] - ious = batch_iou_similarity(gt_bboxes, pred_bboxes) - # gather pred bboxes class score - pred_scores = pred_scores.transpose([0, 2, 1]) - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - gt_labels_ind = paddle.stack( - [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], - axis=-1) - bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) - # compute alignment metrics, [B, n, L] - alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( - self.beta) * pad_gt_mask - - # select positive sample, [B, n, L] - if self.center_radius is None: - # check the positive sample's center in gt, [B, n, L] - is_in_gts = 
check_points_inside_bboxes( - anchor_points, gt_bboxes, sm_use=True) - # select topk largest alignment metrics pred bbox as candidates - # for each gt, [B, n, L] - mask_positive = gather_topk_anchors( - alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts - else: - is_in_gts, is_in_center = check_points_inside_bboxes( - anchor_points, - gt_bboxes, - stride_tensor * self.center_radius, - sm_use=True) - is_in_gts *= pad_gt_mask - is_in_center *= pad_gt_mask - candidate_metrics = paddle.where( - is_in_gts.sum(-1, keepdim=True) == 0, - alignment_metrics + is_in_center, - alignment_metrics) - mask_positive = gather_topk_anchors( - candidate_metrics, self.topk, - topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) | - (is_in_gts > 0), 'float32') - - # if an anchor box is assigned to multiple gts, - # the one with the highest iou will be selected, [B, n, L] - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( - [1, num_max_boxes, 1]) - is_max_iou = compute_max_iou_anchor(ious * mask_positive) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - - assigned_scores = F.one_hot(assigned_labels, num_classes + 1) - ind = list(range(num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - # rescale alignment metrics - alignment_metrics *= mask_positive - max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) - max_ious_per_instance = (ious * mask_positive).max(axis=-1, - keepdim=True) - alignment_metrics = alignment_metrics / ( - max_metrics_per_instance + self.eps) * max_ious_per_instance - alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) - assigned_scores = assigned_scores * alignment_metrics - - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/uniform_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/uniform_assigner.py deleted file mode 100644 index 1c14805..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/uniform_assigner.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
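The center_radius branch above widens the candidate set to anchor points lying within radius * stride of a gt center; a toy sketch of that test with made-up boxes and strides:

import paddle

points = paddle.to_tensor([[10., 10.], [40., 40.]])   # [L, 2] anchor points
gt = paddle.to_tensor([[[5., 5., 30., 30.]]])         # [B, n, 4] "xmin, ymin, xmax, ymax"
stride = paddle.to_tensor([8., 16.])                  # [L] stride per point
radius = 2.5
cx = (gt[..., 0] + gt[..., 2]) / 2                    # [B, n] gt centers
cy = (gt[..., 1] + gt[..., 3]) / 2
dx = (points[:, 0] - cx.unsqueeze(-1)).abs()          # [B, n, L]
dy = (points[:, 1] - cy.unsqueeze(-1)).abs()
is_in_center = paddle.logical_and(dx < radius * stride, dy < radius * stride)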
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ppdet.modeling.bbox_utils import batch_bbox_overlaps -from ppdet.modeling.transformers import bbox_xyxy_to_cxcywh - -__all__ = ['UniformAssigner'] - - -def batch_p_dist(x, y, p=2): - """ - calculate pairwise p_dist, the first index of x and y are batch - return [x.shape[0], y.shape[0]] - """ - x = x.unsqueeze(1) - diff = x - y - return paddle.norm(diff, p=p, axis=list(range(2, diff.dim()))) - - -@register -class UniformAssigner(nn.Layer): - def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4): - super(UniformAssigner, self).__init__() - self.pos_ignore_thr = pos_ignore_thr - self.neg_ignore_thr = neg_ignore_thr - self.match_times = match_times - - def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None): - num_bboxes = bbox_pred.shape[0] - num_gts = gt_bboxes.shape[0] - match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32) - - pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes) - pred_max_iou = pred_ious.max(axis=1) - neg_ignore = pred_max_iou > self.neg_ignore_thr - # exclude potential ignored neg samples first, deal with pos samples later - #match_labels: -2(ignore), -1(neg) or >=0(pos_inds) - match_labels = paddle.where(neg_ignore, - paddle.full_like(match_labels, -2), - match_labels) - - bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred) - anchor_c = bbox_xyxy_to_cxcywh(anchor) - gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes) - bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1) - anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1) - - top_pred = bbox_pred_dist.topk( - k=self.match_times, axis=0, largest=False)[1] - top_anchor = anchor_dist.topk( - k=self.match_times, axis=0, largest=False)[1] - - tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts]) - tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts]) - pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1]) - pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1]) - - pos_anchor = anchor[pos_places] - pos_tar_bbox = gt_bboxes[pos_inds] - pos_ious = batch_bbox_overlaps( - pos_anchor, pos_tar_bbox, is_aligned=True) - pos_ignore = pos_ious < self.pos_ignore_thr - pos_inds = paddle.where(pos_ignore, - paddle.full_like(pos_inds, -2), pos_inds) - match_labels[pos_places] = pos_inds - match_labels.stop_gradient = True - pos_keep = ~pos_ignore - - if pos_keep.sum() > 0: - pos_places_keep = pos_places[pos_keep] - pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4]) - pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach() - else: - pos_bbox_pred = None - pos_bbox_tar = None - - return match_labels, pos_bbox_pred, pos_bbox_tar diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/utils.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/utils.py deleted file mode 100644 index 8fe7c93..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/utils.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn.functional as F - -__all__ = [ - 'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes', - 'compute_max_iou_anchor', 'compute_max_iou_gt', - 'generate_anchors_for_grid_cell' -] - - -def pad_gt(gt_labels, gt_bboxes, gt_scores=None): - r""" Pad 0 in gt_labels and gt_bboxes. - Args: - gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, - shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i) - gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, - shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) - gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes, - shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) - Returns: - pad_gt_labels (Tensor, int64): shape[B, n, 1] - pad_gt_bboxes (Tensor, float32): shape[B, n, 4] - pad_gt_scores (Tensor, float32): shape[B, n, 1] - pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox - """ - if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes, - paddle.Tensor): - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - pad_gt_mask = ( - gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype) - if gt_scores is None: - gt_scores = pad_gt_mask.clone() - assert gt_labels.ndim == gt_scores.ndim - - return gt_labels, gt_bboxes, gt_scores, pad_gt_mask - elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list): - assert len(gt_labels) == len(gt_bboxes), \ - 'The number of `gt_labels` and `gt_bboxes` is not equal. ' - num_max_boxes = max([len(a) for a in gt_bboxes]) - batch_size = len(gt_bboxes) - # pad label and bbox - pad_gt_labels = paddle.zeros( - [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype) - pad_gt_bboxes = paddle.zeros( - [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype) - pad_gt_scores = paddle.zeros( - [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) - pad_gt_mask = paddle.zeros( - [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) - for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)): - if len(label) > 0 and len(bbox) > 0: - pad_gt_labels[i, :len(label)] = label - pad_gt_bboxes[i, :len(bbox)] = bbox - pad_gt_mask[i, :len(bbox)] = 1. - if gt_scores is not None: - pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i] - if gt_scores is None: - pad_gt_scores = pad_gt_mask.clone() - return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask - else: - raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ') - - -def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): - r""" - Args: - metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors - topk (int): The number of top elements to look for along the axis. - largest (bool) : largest is a flag, if set to true, - algorithm will sort by descending order, otherwise sort by - ascending order. Default: True - topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask, - Default: None - eps (float): Default: 1e-9 - Returns: - is_in_topk (Tensor, float32): shape[B, n, L], value=1. 
means selected - """ - num_anchors = metrics.shape[-1] - topk_metrics, topk_idxs = paddle.topk( - metrics, topk, axis=-1, largest=largest) - if topk_mask is None: - topk_mask = ( - topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype) - is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( - axis=-2).astype(metrics.dtype) - return is_in_topk * topk_mask - - -def check_points_inside_bboxes(points, - bboxes, - center_radius_tensor=None, - eps=1e-9, - sm_use=False): - r""" - Args: - points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors - bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format - center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None. - eps (float): Default: 1e-9 - Returns: - is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected - """ - points = points.unsqueeze([0, 1]) - x, y = points.chunk(2, axis=-1) - xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1) - # check whether `points` is in `bboxes` - l = x - xmin - t = y - ymin - r = xmax - x - b = ymax - y - delta_ltrb = paddle.concat([l, t, r, b], axis=-1) - is_in_bboxes = (delta_ltrb.min(axis=-1) > eps) - if center_radius_tensor is not None: - # check whether `points` is in `center_radius` - center_radius_tensor = center_radius_tensor.unsqueeze([0, 1]) - cx = (xmin + xmax) * 0.5 - cy = (ymin + ymax) * 0.5 - l = x - (cx - center_radius_tensor) - t = y - (cy - center_radius_tensor) - r = (cx + center_radius_tensor) - x - b = (cy + center_radius_tensor) - y - delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1) - is_in_center = (delta_ltrb_c.min(axis=-1) > eps) - if sm_use: - return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype( - bboxes.dtype) - else: - return (paddle.logical_and(is_in_bboxes, is_in_center), - paddle.logical_or(is_in_bboxes, is_in_center)) - - return is_in_bboxes.astype(bboxes.dtype) - - -def compute_max_iou_anchor(ious): - r""" - For each anchor, find the GT with the largest IOU. - Args: - ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors - Returns: - is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected - """ - num_max_boxes = ious.shape[-2] - max_iou_index = ious.argmax(axis=-2) - is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1]) - return is_max_iou.astype(ious.dtype) - - -def compute_max_iou_gt(ious): - r""" - For each GT, find the anchor with the largest IOU. - Args: - ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors - Returns: - is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected - """ - num_anchors = ious.shape[-1] - max_iou_index = ious.argmax(axis=-1) - is_max_iou = F.one_hot(max_iou_index, num_anchors) - return is_max_iou.astype(ious.dtype) - - -def generate_anchors_for_grid_cell(feats, - fpn_strides, - grid_cell_size=5.0, - grid_cell_offset=0.5, - dtype='float32'): - r""" - Like ATSS, generate anchors based on grid size. - Args: - feats (List[Tensor]): shape[s, (b, c, h, w)] - fpn_strides (tuple|list): shape[s], stride for each scale feature - grid_cell_size (float): anchor size - grid_cell_offset (float): The range is between 0 and 1. - Returns: - anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format. - anchor_points (Tensor): shape[l, 2], "x, y" format. - num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...]. - stride_tensor (Tensor): shape[l, 1], contains the stride for each scale. 
- """ - assert len(feats) == len(fpn_strides) - anchors = [] - anchor_points = [] - num_anchors_list = [] - stride_tensor = [] - for feat, stride in zip(feats, fpn_strides): - _, _, h, w = feat.shape - cell_half_size = grid_cell_size * stride * 0.5 - shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride - shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor = paddle.stack( - [ - shift_x - cell_half_size, shift_y - cell_half_size, - shift_x + cell_half_size, shift_y + cell_half_size - ], - axis=-1).astype(dtype) - anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype) - - anchors.append(anchor.reshape([-1, 4])) - anchor_points.append(anchor_point.reshape([-1, 2])) - num_anchors_list.append(len(anchors[-1])) - stride_tensor.append( - paddle.full( - [num_anchors_list[-1], 1], stride, dtype=dtype)) - anchors = paddle.concat(anchors) - anchors.stop_gradient = True - anchor_points = paddle.concat(anchor_points) - anchor_points.stop_gradient = True - stride_tensor = paddle.concat(stride_tensor) - stride_tensor.stop_gradient = True - return anchors, anchor_points, num_anchors_list, stride_tensor diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/__init__.py deleted file mode 100644 index bc000c7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import vgg -from . import resnet -from . import darknet -from . import mobilenet_v1 -from . import mobilenet_v3 -from . import hrnet -from . import lite_hrnet -from . import blazenet -from . import ghostnet -from . import senet -from . import res2net -from . import dla -from . import shufflenet_v2 -from . import swin_transformer -from . import lcnet -from . import hardnet -from . import esnet -from . import cspresnet -from . import csp_darknet -from . import convnext -from . import vision_transformer -from . import mobileone -from . import trans_encoder -from . import focalnet -from . import vit_mae -from . import hgnet_v2 -from . 
import clrnet_resnet - -from .vgg import * -from .resnet import * -from .darknet import * -from .mobilenet_v1 import * -from .mobilenet_v3 import * -from .hrnet import * -from .lite_hrnet import * -from .blazenet import * -from .ghostnet import * -from .senet import * -from .res2net import * -from .dla import * -from .shufflenet_v2 import * -from .swin_transformer import * -from .lcnet import * -from .hardnet import * -from .esnet import * -from .cspresnet import * -from .csp_darknet import * -from .convnext import * -from .vision_transformer import * -from .mobileone import * -from .trans_encoder import * -from .focalnet import * -from .vitpose import * -from .vit_mae import * -from .hgnet_v2 import * -from .clrnet_resnet import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/blazenet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/blazenet.py deleted file mode 100644 index fbfdcec..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/blazenet.py +++ /dev/null @@ -1,319 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import KaimingNormal -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['BlazeNet'] - - -def hard_swish(x): - return x * F.relu6(x + 3) / 6. 
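hard_swish above is the standard hard-swish activation x * relu6(x + 3) / 6. A quick sanity sketch against the built-in activation (assuming a Paddle 2.x runtime where F.hardswish is available):

import paddle
import paddle.nn.functional as F

x = paddle.linspace(-5., 5., 11)
manual = x * F.relu6(x + 3) / 6.
print(paddle.allclose(manual, F.hardswish(x)))  # True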
- - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - num_groups=1, - act='relu', - conv_lr=0.1, - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=conv_lr, initializer=KaimingNormal()), - bias_attr=False) - - if norm_type in ['bn', 'sync_bn']: - self._batch_norm = nn.BatchNorm2D(out_channels) - - def forward(self, x): - x = self._conv(x) - x = self._batch_norm(x) - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - elif self.act == 'leaky': - x = F.leaky_relu(x) - elif self.act == 'hard_swish': - x = hard_swish(x) - return x - - -class BlazeBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels1, - out_channels2, - double_channels=None, - stride=1, - use_5x5kernel=True, - act='relu', - name=None): - super(BlazeBlock, self).__init__() - assert stride in [1, 2] - self.use_pool = not stride == 1 - self.use_double_block = double_channels is not None - self.conv_dw = [] - if use_5x5kernel: - self.conv_dw.append( - self.add_sublayer( - name + "1_dw", - ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels1, - kernel_size=5, - stride=stride, - padding=2, - num_groups=out_channels1, - name=name + "1_dw"))) - else: - self.conv_dw.append( - self.add_sublayer( - name + "1_dw_1", - ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels1, - kernel_size=3, - stride=1, - padding=1, - num_groups=out_channels1, - name=name + "1_dw_1"))) - self.conv_dw.append( - self.add_sublayer( - name + "1_dw_2", - ConvBNLayer( - in_channels=out_channels1, - out_channels=out_channels1, - kernel_size=3, - stride=stride, - padding=1, - num_groups=out_channels1, - name=name + "1_dw_2"))) - self.act = act if self.use_double_block else None - self.conv_pw = ConvBNLayer( - in_channels=out_channels1, - out_channels=out_channels2, - kernel_size=1, - stride=1, - padding=0, - act=self.act, - name=name + "1_sep") - if self.use_double_block: - self.conv_dw2 = [] - if use_5x5kernel: - self.conv_dw2.append( - self.add_sublayer( - name + "2_dw", - ConvBNLayer( - in_channels=out_channels2, - out_channels=out_channels2, - kernel_size=5, - stride=1, - padding=2, - num_groups=out_channels2, - name=name + "2_dw"))) - else: - self.conv_dw2.append( - self.add_sublayer( - name + "2_dw_1", - ConvBNLayer( - in_channels=out_channels2, - out_channels=out_channels2, - kernel_size=3, - stride=1, - padding=1, - num_groups=out_channels2, - name=name + "1_dw_1"))) - self.conv_dw2.append( - self.add_sublayer( - name + "2_dw_2", - ConvBNLayer( - in_channels=out_channels2, - out_channels=out_channels2, - kernel_size=3, - stride=1, - padding=1, - num_groups=out_channels2, - name=name + "2_dw_2"))) - self.conv_pw2 = ConvBNLayer( - in_channels=out_channels2, - out_channels=double_channels, - kernel_size=1, - stride=1, - padding=0, - name=name + "2_sep") - # shortcut - if self.use_pool: - shortcut_channel = double_channels or out_channels2 - self._shortcut = [] - self._shortcut.append( - self.add_sublayer( - name + '_shortcut_pool', - nn.MaxPool2D( - kernel_size=stride, stride=stride, ceil_mode=True))) - self._shortcut.append( - self.add_sublayer( - name + '_shortcut_conv', - ConvBNLayer( - in_channels=in_channels, - out_channels=shortcut_channel, - kernel_size=1, - 
stride=1, - padding=0, - name="shortcut" + name))) - - def forward(self, x): - y = x - for conv_dw_block in self.conv_dw: - y = conv_dw_block(y) - y = self.conv_pw(y) - if self.use_double_block: - for conv_dw2_block in self.conv_dw2: - y = conv_dw2_block(y) - y = self.conv_pw2(y) - if self.use_pool: - for shortcut in self._shortcut: - x = shortcut(x) - return F.relu(paddle.add(x, y)) - - -@register -@serializable -class BlazeNet(nn.Layer): - """ - BlazeFace, see https://arxiv.org/abs/1907.05047 - - Args: - blaze_filters (list): number of filter for each blaze block. - double_blaze_filters (list): number of filter for each double_blaze block. - use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv. - """ - - def __init__( - self, - blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]], - double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], - [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]], - use_5x5kernel=True, - act=None): - super(BlazeNet, self).__init__() - conv1_num_filters = blaze_filters[0][0] - self.conv1 = ConvBNLayer( - in_channels=3, - out_channels=conv1_num_filters, - kernel_size=3, - stride=2, - padding=1, - name="conv1") - in_channels = conv1_num_filters - self.blaze_block = [] - self._out_channels = [] - for k, v in enumerate(blaze_filters): - assert len(v) in [2, 3], \ - "blaze_filters {} not in [2, 3]" - if len(v) == 2: - self.blaze_block.append( - self.add_sublayer( - 'blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - use_5x5kernel=use_5x5kernel, - act=act, - name='blaze_{}'.format(k)))) - elif len(v) == 3: - self.blaze_block.append( - self.add_sublayer( - 'blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - stride=v[2], - use_5x5kernel=use_5x5kernel, - act=act, - name='blaze_{}'.format(k)))) - in_channels = v[1] - - for k, v in enumerate(double_blaze_filters): - assert len(v) in [3, 4], \ - "blaze_filters {} not in [3, 4]" - if len(v) == 3: - self.blaze_block.append( - self.add_sublayer( - 'double_blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - double_channels=v[2], - use_5x5kernel=use_5x5kernel, - act=act, - name='double_blaze_{}'.format(k)))) - elif len(v) == 4: - self.blaze_block.append( - self.add_sublayer( - 'double_blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - double_channels=v[2], - stride=v[3], - use_5x5kernel=use_5x5kernel, - act=act, - name='double_blaze_{}'.format(k)))) - in_channels = v[2] - self._out_channels.append(in_channels) - - def forward(self, inputs): - outs = [] - y = self.conv1(inputs['image']) - for block in self.blaze_block: - y = block(y) - outs.append(y) - return [outs[-4], outs[-1]] - - @property - def out_shape(self): - return [ - ShapeSpec(channels=c) - for c in [self._out_channels[-4], self._out_channels[-1]] - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/clrnet_resnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/clrnet_resnet.py deleted file mode 100644 index 00758df..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/clrnet_resnet.py +++ /dev/null @@ -1,697 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
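BlazeNet's forward returns exactly two scales, outs[-4] and outs[-1]. With the default blaze_filters/double_blaze_filters there are stride-2 blocks at overall indices 2, 5 and 8 on top of the stride-2 stem conv, so those two outputs land at 1/8 and 1/16 of the input resolution, both 96 channels wide. A sketch of that contract, using the import path as it existed before this deletion:

import paddle
from ppdet.modeling.backbones.blazenet import BlazeNet

net = BlazeNet()
feats = net({'image': paddle.rand([1, 3, 640, 640])})
print([list(f.shape) for f in feats])  # [[1, 96, 80, 80], [1, 96, 40, 40]]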
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn - -from paddle.utils.download import get_weights_path_from_url -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['CLRResNet'] - -model_urls = { - 'resnet18': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet18-pt.pdparams', - 'resnet34': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet34-pt.pdparams', - 'resnet50': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet50-pt.pdparams', - 'resnet101': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet101-pt.pdparams', - 'resnet152': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet152-pt.pdparams', - 'resnext50_32x4d': - 'https://x2paddle.bj.bcebos.com/vision/models/resnext50_32x4d-pt.pdparams', - 'resnext101_32x8d': - 'https://x2paddle.bj.bcebos.com/vision/models/resnext101_32x8d-pt.pdparams', - 'wide_resnet50_2': - 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet50_2-pt.pdparams', - 'wide_resnet101_2': - 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet101_2-pt.pdparams', -} - - -class BasicBlock(nn.Layer): - expansion = 1 - - def __init__(self, - inplanes, - planes, - stride=1, - downsample=None, - groups=1, - base_width=64, - dilation=1, - norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2D - - if dilation > 1: - raise NotImplementedError( - "Dilation > 1 not supported in BasicBlock") - - self.conv1 = nn.Conv2D( - inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU() - self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class BottleneckBlock(nn.Layer): - - expansion = 4 - - def __init__(self, - inplanes, - planes, - stride=1, - downsample=None, - groups=1, - base_width=64, - dilation=1, - norm_layer=None): - super(BottleneckBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2D - width = int(planes * (base_width / 64.)) * groups - - self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False) - self.bn1 = norm_layer(width) - - self.conv2 = nn.Conv2D( - width, - width, - 3, - padding=dilation, - stride=stride, - groups=groups, - dilation=dilation, - bias_attr=False) - self.bn2 = norm_layer(width) - - self.conv3 = nn.Conv2D( - width, planes * self.expansion, 1, bias_attr=False) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU() - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = 
self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Layer): - """ResNet model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - Block (BasicBlock|BottleneckBlock): Block module of model. - depth (int, optional): Layers of ResNet, Default: 50. - width (int, optional): Base width per convolution group for each convolution block, Default: 64. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer - will not be defined. Default: 1000. - with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. - groups (int, optional): Number of groups for each convolution block, Default: 1. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import ResNet - from paddle.vision.models.resnet import BottleneckBlock, BasicBlock - # build ResNet with 18 layers - resnet18 = ResNet(BasicBlock, 18) - # build ResNet with 50 layers - resnet50 = ResNet(BottleneckBlock, 50) - # build Wide ResNet model - wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2) - # build ResNeXt model - resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32) - x = paddle.rand([1, 3, 224, 224]) - out = resnet18(x) - print(out.shape) - # [1, 1000] - """ - - def __init__(self, block, depth=50, width=64, with_pool=True, groups=1): - super(ResNet, self).__init__() - layer_cfg = { - 18: [2, 2, 2, 2], - 34: [3, 4, 6, 3], - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3] - } - - layers = layer_cfg[depth] - self.groups = groups - self.base_width = width - self.with_pool = with_pool - self._norm_layer = nn.BatchNorm2D - - self.inplanes = 64 - self.dilation = 1 - - self.conv1 = nn.Conv2D( - 3, - self.inplanes, - kernel_size=7, - stride=2, - padding=3, - bias_attr=False) - self.bn1 = self._norm_layer(self.inplanes) - self.relu = nn.ReLU() - self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2) - if with_pool: - self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) - - ch_out_list = [64, 128, 256, 512] - block = BottleneckBlock if depth >= 50 else BasicBlock - - self._out_channels = [block.expansion * v for v in ch_out_list] - self._out_strides = [4, 8, 16, 32] - self.return_idx = [0, 1, 2, 3] - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2D( - self.inplanes, - planes * block.expansion, - 1, - stride=stride, - bias_attr=False), - norm_layer(planes * block.expansion), ) - - layers = [] - layers.append( - block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append( - block( - self.inplanes, - planes, - groups=self.groups, - base_width=self.base_width, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - @property - def out_shape(self): - return [ - 
ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - out_layers = [] - x = self.layer1(x) - out_layers.append(x) - x = self.layer2(x) - out_layers.append(x) - x = self.layer3(x) - out_layers.append(x) - x = self.layer4(x) - out_layers.append(x) - - if self.with_pool: - x = self.avgpool(x) - - return out_layers - - -@register -@serializable -class CLRResNet(nn.Layer): - def __init__(self, - resnet='resnet18', - pretrained=True, - out_conv=False, - fea_stride=8, - out_channel=128, - in_channels=[64, 128, 256, 512], - cfg=None): - super(CLRResNet, self).__init__() - self.cfg = cfg - self.in_channels = in_channels - - self.model = eval(resnet)(pretrained=pretrained) - self.out = None - if out_conv: - out_channel = 512 - for chan in reversed(self.in_channels): - if chan < 0: continue - out_channel = chan - break - self.out = nn.Conv2D( - out_channel * self.model.expansion, - cfg.featuremap_out_channel, - kernel_size=1, - bias_attr=False) - - @property - def out_shape(self): - return self.model.out_shape - - def forward(self, x): - x = self.model(x) - if self.out: - x[-1] = self.out(x[-1]) - return x - - -def _resnet(arch, Block, depth, pretrained, **kwargs): - model = ResNet(Block, depth, **kwargs) - if pretrained: - assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( - arch) - weight_path = get_weights_path_from_url(model_urls[arch]) - - param = paddle.load(weight_path) - model.set_dict(param) - - return model - - -def resnet18(pretrained=False, **kwargs): - """ResNet 18-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 18-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet18 - # build model - model = resnet18() - # build model and load imagenet pretrained weight - # model = resnet18(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs) - - -def resnet34(pretrained=False, **kwargs): - """ResNet 34-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 34-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet34 - # build model - model = resnet34() - # build model and load imagenet pretrained weight - # model = resnet34(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs) - - -def resnet50(pretrained=False, **kwargs): - """ResNet 50-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. 
If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet50 - # build model - model = resnet50() - # build model and load imagenet pretrained weight - # model = resnet50(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnet101(pretrained=False, **kwargs): - """ResNet 101-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 101-layer. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet101 - # build model - model = resnet101() - # build model and load imagenet pretrained weight - # model = resnet101(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs) - - -def resnet152(pretrained=False, **kwargs): - """ResNet 152-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 152-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet152 - # build model - model = resnet152() - # build model and load imagenet pretrained weight - # model = resnet152(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs) - - -def resnext50_32x4d(pretrained=False, **kwargs): - """ResNeXt-50 32x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 32x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext50_32x4d - # build model - model = resnext50_32x4d() - # build model and load imagenet pretrained weight - # model = resnext50_32x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 32 - kwargs['width'] = 4 - return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnext50_64x4d(pretrained=False, **kwargs): - """ResNeXt-50 64x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. 
If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 64x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext50_64x4d - # build model - model = resnext50_64x4d() - # build model and load imagenet pretrained weight - # model = resnext50_64x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 64 - kwargs['width'] = 4 - return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnext101_32x4d(pretrained=False, **kwargs): - """ResNeXt-101 32x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 32x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext101_32x4d - # build model - model = resnext101_32x4d() - # build model and load imagenet pretrained weight - # model = resnext101_32x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 32 - kwargs['width'] = 4 - return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained, - **kwargs) - - -def resnext101_64x4d(pretrained=False, **kwargs): - """ResNeXt-101 64x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 64x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext101_64x4d - # build model - model = resnext101_64x4d() - # build model and load imagenet pretrained weight - # model = resnext101_64x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 64 - kwargs['width'] = 4 - return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained, - **kwargs) - - -def resnext152_32x4d(pretrained=False, **kwargs): - """ResNeXt-152 32x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 32x4d model. - Examples: - .. 
code-block:: python - import paddle - from paddle.vision.models import resnext152_32x4d - # build model - model = resnext152_32x4d() - # build model and load imagenet pretrained weight - # model = resnext152_32x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 32 - kwargs['width'] = 4 - return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained, - **kwargs) - - -def resnext152_64x4d(pretrained=False, **kwargs): - """ResNeXt-152 64x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 64x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext152_64x4d - # build model - model = resnext152_64x4d() - # build model and load imagenet pretrained weight - # model = resnext152_64x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 64 - kwargs['width'] = 4 - return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained, - **kwargs) - - -def wide_resnet50_2(pretrained=False, **kwargs): - """Wide ResNet-50-2 model from - `"Wide Residual Networks" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-50-2 model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import wide_resnet50_2 - # build model - model = wide_resnet50_2() - # build model and load imagenet pretrained weight - # model = wide_resnet50_2(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['width'] = 64 * 2 - return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs) - - -def wide_resnet101_2(pretrained=False, **kwargs): - """Wide ResNet-101-2 model from - `"Wide Residual Networks" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-101-2 model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import wide_resnet101_2 - # build model - model = wide_resnet101_2() - # build model and load imagenet pretrained weight - # model = wide_resnet101_2(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['width'] = 64 * 2 - return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained, - **kwargs) diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/convnext.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/convnext.py deleted file mode 100644 index 476e12b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/convnext.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
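Every factory above funnels into the single ResNet constructor: depth picks the layer configuration, while groups and width are the only knobs separating plain ResNet, ResNeXt and Wide-ResNet. Note that the docstrings are inherited from paddle.vision and still advertise a [1, 1000] logits output, but this CLR variant's forward returns the four stage feature maps instead. A sketch, assuming the ResNet and BottleneckBlock classes from the deleted clrnet_resnet.py are in scope:

import paddle

resnet50 = ResNet(BottleneckBlock, 50)                       # plain ResNet-50
resnext50 = ResNet(BottleneckBlock, 50, width=4, groups=32)  # ResNeXt-50 32x4d
wide50 = ResNet(BottleneckBlock, 50, width=64 * 2)           # Wide ResNet-50-2

feats = resnet50(paddle.rand([1, 3, 224, 224]))
print([list(f.shape) for f in feats])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]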
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -''' -Modified from https://github.com/facebookresearch/ConvNeXt -Copyright (c) Meta Platforms, Inc. and affiliates. -All rights reserved. -This source code is licensed under the license found in the -LICENSE file in the root directory of this source tree. -''' - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant - -import numpy as np - -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec -from .transformer_utils import DropPath, trunc_normal_, zeros_ - -__all__ = ['ConvNeXt'] - - -class Block(nn.Layer): - r""" ConvNeXt Block. There are two equivalent implementations: - (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) - (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back - We use (2) as we find it slightly faster in Pypaddle - - Args: - dim (int): Number of input channels. - drop_path (float): Stochastic depth rate. Default: 0.0 - layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. - """ - - def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): - super().__init__() - self.dwconv = nn.Conv2D( - dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - self.norm = LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear( - dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(4 * dim, dim) - - if layer_scale_init_value > 0: - self.gamma = self.create_parameter( - shape=(dim, ), - attr=ParamAttr(initializer=Constant(layer_scale_init_value))) - else: - self.gamma = None - - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( - ) - - def forward(self, x): - input = x - x = self.dwconv(x) - x = x.transpose([0, 2, 3, 1]) - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose([0, 3, 1, 2]) - x = input + self.drop_path(x) - return x - - -class LayerNorm(nn.Layer): - r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. - The ordering of the dimensions in the inputs. channels_last corresponds to inputs with - shape (batch_size, height, width, channels) while channels_first corresponds to inputs - with shape (batch_size, channels, height, width). 
- """ - - def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): - super().__init__() - - self.weight = self.create_parameter( - shape=(normalized_shape, ), - attr=ParamAttr(initializer=Constant(1.))) - self.bias = self.create_parameter( - shape=(normalized_shape, ), - attr=ParamAttr(initializer=Constant(0.))) - - self.eps = eps - self.data_format = data_format - if self.data_format not in ["channels_last", "channels_first"]: - raise NotImplementedError - self.normalized_shape = (normalized_shape, ) - - def forward(self, x): - if self.data_format == "channels_last": - return F.layer_norm(x, self.normalized_shape, self.weight, - self.bias, self.eps) - elif self.data_format == "channels_first": - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / paddle.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -@register -@serializable -class ConvNeXt(nn.Layer): - r""" ConvNeXt - A Pypaddle impl of : `A ConvNet for the 2020s` - - https://arxiv.org/pdf/2201.03545.pdf - - Args: - in_chans (int): Number of input image channels. Default: 3 - depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] - dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] - drop_path_rate (float): Stochastic depth rate. Default: 0. - layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. - """ - - arch_settings = { - 'tiny': { - 'depths': [3, 3, 9, 3], - 'dims': [96, 192, 384, 768] - }, - 'small': { - 'depths': [3, 3, 27, 3], - 'dims': [96, 192, 384, 768] - }, - 'base': { - 'depths': [3, 3, 27, 3], - 'dims': [128, 256, 512, 1024] - }, - 'large': { - 'depths': [3, 3, 27, 3], - 'dims': [192, 384, 768, 1536] - }, - 'xlarge': { - 'depths': [3, 3, 27, 3], - 'dims': [256, 512, 1024, 2048] - }, - } - - def __init__( - self, - arch='tiny', - in_chans=3, - drop_path_rate=0., - layer_scale_init_value=1e-6, - return_idx=[1, 2, 3], - norm_output=True, - pretrained=None, ): - super().__init__() - depths = self.arch_settings[arch]['depths'] - dims = self.arch_settings[arch]['dims'] - self.downsample_layers = nn.LayerList( - ) # stem and 3 intermediate downsampling conv layers - stem = nn.Sequential( - nn.Conv2D( - in_chans, dims[0], kernel_size=4, stride=4), - LayerNorm( - dims[0], eps=1e-6, data_format="channels_first")) - self.downsample_layers.append(stem) - for i in range(3): - downsample_layer = nn.Sequential( - LayerNorm( - dims[i], eps=1e-6, data_format="channels_first"), - nn.Conv2D( - dims[i], dims[i + 1], kernel_size=2, stride=2), ) - self.downsample_layers.append(downsample_layer) - - self.stages = nn.LayerList( - ) # 4 feature resolution stages, each consisting of multiple residual blocks - dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] - cur = 0 - for i in range(4): - stage = nn.Sequential(* [ - Block( - dim=dims[i], - drop_path=dp_rates[cur + j], - layer_scale_init_value=layer_scale_init_value) - for j in range(depths[i]) - ]) - self.stages.append(stage) - cur += depths[i] - - self.return_idx = return_idx - self.dims = [dims[i] for i in return_idx] # [::-1] - - self.norm_output = norm_output - if norm_output: - self.norms = nn.LayerList([ - LayerNorm( - c, eps=1e-6, data_format="channels_first") - for c in self.dims - ]) - - self.apply(self._init_weights) - - if pretrained is not None: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - 
self.set_state_dict(paddle.load(path)) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv2D, nn.Linear)): - trunc_normal_(m.weight) - zeros_(m.bias) - - def forward_features(self, x): - output = [] - for i in range(4): - x = self.downsample_layers[i](x) - x = self.stages[i](x) - output.append(x) - - outputs = [output[i] for i in self.return_idx] - if self.norm_output: - outputs = [self.norms[i](out) for i, out in enumerate(outputs)] - - return outputs - - def forward(self, x): - x = self.forward_features(x['image']) - return x - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self.dims] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/csp_darknet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/csp_darknet.py deleted file mode 100644 index 4c225d1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/csp_darknet.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable -from ppdet.modeling.initializer import conv_init_ -from ..shape_spec import ShapeSpec - -__all__ = [ - 'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer' -] - - -class BaseConv(nn.Layer): - def __init__(self, - in_channels, - out_channels, - ksize, - stride, - groups=1, - bias=False, - act="silu"): - super(BaseConv, self).__init__() - self.conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=ksize, - stride=stride, - padding=(ksize - 1) // 2, - groups=groups, - bias_attr=bias) - self.bn = nn.BatchNorm2D( - out_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - self._init_weights() - - def _init_weights(self): - conv_init_(self.conv) - - def forward(self, x): - # use 'x * F.sigmoid(x)' replace 'silu' - x = self.bn(self.conv(x)) - y = x * F.sigmoid(x) - return y - - -class DWConv(nn.Layer): - """Depthwise Conv""" - - def __init__(self, - in_channels, - out_channels, - ksize, - stride=1, - bias=False, - act="silu"): - super(DWConv, self).__init__() - self.dw_conv = BaseConv( - in_channels, - in_channels, - ksize=ksize, - stride=stride, - groups=in_channels, - bias=bias, - act=act) - self.pw_conv = BaseConv( - in_channels, - out_channels, - ksize=1, - stride=1, - groups=1, - bias=bias, - act=act) - - def forward(self, x): - return self.pw_conv(self.dw_conv(x)) - - -class Focus(nn.Layer): - """Focus width and height information into channel space, used in YOLOX.""" - - def __init__(self, - in_channels, - out_channels, - ksize=3, - stride=1, - bias=False, - act="silu"): - super(Focus, self).__init__() - self.conv = BaseConv( - in_channels * 4, - out_channels, - ksize=ksize, - stride=stride, - bias=bias, - act=act) - - def forward(self, inputs): - # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] 
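# The four strided slices below pick the top-left / bottom-left / top-right /
# bottom-right pixel of every 2x2 neighborhood: a space-to-depth
# (pixel-unshuffle) step that halves H and W, quadruples C, and discards
# nothing before the stride-1 convolution that follows.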
- top_left = inputs[:, :, 0::2, 0::2] - top_right = inputs[:, :, 0::2, 1::2] - bottom_left = inputs[:, :, 1::2, 0::2] - bottom_right = inputs[:, :, 1::2, 1::2] - outputs = paddle.concat( - [top_left, bottom_left, top_right, bottom_right], 1) - return self.conv(outputs) - - -class BottleNeck(nn.Layer): - def __init__(self, - in_channels, - out_channels, - shortcut=True, - expansion=0.5, - depthwise=False, - bias=False, - act="silu"): - super(BottleNeck, self).__init__() - hidden_channels = int(out_channels * expansion) - Conv = DWConv if depthwise else BaseConv - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.conv2 = Conv( - hidden_channels, - out_channels, - ksize=3, - stride=1, - bias=bias, - act=act) - self.add_shortcut = shortcut and in_channels == out_channels - - def forward(self, x): - y = self.conv2(self.conv1(x)) - if self.add_shortcut: - y = y + x - return y - - -class SPPLayer(nn.Layer): - """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" - - def __init__(self, - in_channels, - out_channels, - kernel_sizes=(5, 9, 13), - bias=False, - act="silu"): - super(SPPLayer, self).__init__() - hidden_channels = in_channels // 2 - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.maxpoolings = nn.LayerList([ - nn.MaxPool2D( - kernel_size=ks, stride=1, padding=ks // 2) - for ks in kernel_sizes - ]) - conv2_channels = hidden_channels * (len(kernel_sizes) + 1) - self.conv2 = BaseConv( - conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) - - def forward(self, x): - x = self.conv1(x) - x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) - x = self.conv2(x) - return x - - -class SPPFLayer(nn.Layer): - """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, - equivalent to SPP(k=(5, 9, 13)) - """ - - def __init__(self, - in_channels, - out_channels, - ksize=5, - bias=False, - act='silu'): - super(SPPFLayer, self).__init__() - hidden_channels = in_channels // 2 - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.maxpooling = nn.MaxPool2D( - kernel_size=ksize, stride=1, padding=ksize // 2) - conv2_channels = hidden_channels * 4 - self.conv2 = BaseConv( - conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) - - def forward(self, x): - x = self.conv1(x) - y1 = self.maxpooling(x) - y2 = self.maxpooling(y1) - y3 = self.maxpooling(y2) - concats = paddle.concat([x, y1, y2, y3], axis=1) - out = self.conv2(concats) - return out - - -class CSPLayer(nn.Layer): - """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" - - def __init__(self, - in_channels, - out_channels, - num_blocks=1, - shortcut=True, - expansion=0.5, - depthwise=False, - bias=False, - act="silu"): - super(CSPLayer, self).__init__() - hidden_channels = int(out_channels * expansion) - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.conv2 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.bottlenecks = nn.Sequential(* [ - BottleNeck( - hidden_channels, - hidden_channels, - shortcut=shortcut, - expansion=1.0, - depthwise=depthwise, - bias=bias, - act=act) for _ in range(num_blocks) - ]) - self.conv3 = BaseConv( - hidden_channels * 2, - out_channels, - ksize=1, - stride=1, - bias=bias, - act=act) - - def forward(self, x): - x_1 = self.conv1(x) - x_1 = self.bottlenecks(x_1) - x_2 = self.conv2(x) 
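# Cross Stage Partial merge: x_1 carries the half that went through the
# bottleneck stack, x_2 is a plain 1x1 projection of the same input; the two
# halves are concatenated and fused by conv3, so only half the channels pay
# for the bottlenecks while the other half keeps a short gradient path.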
- x = paddle.concat([x_1, x_2], axis=1) - x = self.conv3(x) - return x - - -@register -@serializable -class CSPDarkNet(nn.Layer): - """ - CSPDarkNet backbone. - Args: - arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, - and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. - depth_mult (float): Depth multiplier, multiply number of channels in - each layer, default as 1.0. - width_mult (float): Width multiplier, multiply number of blocks in - CSPLayer, default as 1.0. - depthwise (bool): Whether to use depth-wise conv layer. - act (str): Activation function type, default as 'silu'. - return_idx (list): Index of stages whose feature maps are returned. - """ - - __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] - - # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) - # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. - arch_settings = { - 'X': [[64, 128, 3, True, False], [128, 256, 9, True, False], - [256, 512, 9, True, False], [512, 1024, 3, False, True]], - 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], - [256, 512, 9, True, False], [512, 1024, 3, True, True]], - 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], - [256, 512, 9, True, False], [512, 768, 3, True, False], - [768, 1024, 3, True, True]], - } - - def __init__(self, - arch='X', - depth_mult=1.0, - width_mult=1.0, - depthwise=False, - act='silu', - trt=False, - return_idx=[2, 3, 4]): - super(CSPDarkNet, self).__init__() - self.arch = arch - self.return_idx = return_idx - Conv = DWConv if depthwise else BaseConv - arch_setting = self.arch_settings[arch] - base_channels = int(arch_setting[0][0] * width_mult) - - # Note: differences between the latest YOLOv5 and the original YOLOX - # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX) - # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) - # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer - # 4. 
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX - if arch in ['P5', 'P6']: - # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) - self.stem = Conv( - 3, base_channels, ksize=6, stride=2, bias=False, act=act) - spp_kernal_sizes = 5 - elif arch in ['X']: - # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) - self.stem = Focus( - 3, base_channels, ksize=3, stride=1, bias=False, act=act) - spp_kernal_sizes = (5, 9, 13) - else: - raise AttributeError("Unsupported arch type: {}".format(arch)) - - _out_channels = [base_channels] - layers_num = 1 - self.csp_dark_blocks = [] - - for i, (in_channels, out_channels, num_blocks, shortcut, - use_spp) in enumerate(arch_setting): - in_channels = int(in_channels * width_mult) - out_channels = int(out_channels * width_mult) - _out_channels.append(out_channels) - num_blocks = max(round(num_blocks * depth_mult), 1) - stage = [] - - conv_layer = self.add_sublayer( - 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), - Conv( - in_channels, out_channels, 3, 2, bias=False, act=act)) - stage.append(conv_layer) - layers_num += 1 - - if use_spp and arch in ['X']: - # in YOLOX use SPPLayer - spp_layer = self.add_sublayer( - 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), - SPPLayer( - out_channels, - out_channels, - kernel_sizes=spp_kernal_sizes, - bias=False, - act=act)) - stage.append(spp_layer) - layers_num += 1 - - csp_layer = self.add_sublayer( - 'layers{}.stage{}.csp_layer'.format(layers_num, i + 1), - CSPLayer( - out_channels, - out_channels, - num_blocks=num_blocks, - shortcut=shortcut, - depthwise=depthwise, - bias=False, - act=act)) - stage.append(csp_layer) - layers_num += 1 - - if use_spp and arch in ['P5', 'P6']: - # in latest YOLOv5 use SPPFLayer instead of SPPLayer - sppf_layer = self.add_sublayer( - 'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1), - SPPFLayer( - out_channels, - out_channels, - ksize=5, - bias=False, - act=act)) - stage.append(sppf_layer) - layers_num += 1 - - self.csp_dark_blocks.append(nn.Sequential(*stage)) - - self._out_channels = [_out_channels[i] for i in self.return_idx] - self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] - - def forward(self, inputs): - x = inputs['image'] - outputs = [] - x = self.stem(x) - for i, layer in enumerate(self.csp_dark_blocks): - x = layer(x) - if i + 1 in self.return_idx: - outputs.append(x) - return outputs - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=c, stride=s) - for c, s in zip(self._out_channels, self.strides) - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/cspresnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/cspresnet.py deleted file mode 100644 index 5268ec8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/cspresnet.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
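Taken together, arch_settings plus depth_mult/width_mult fully determine the CSPDarkNet above, and return_idx selects which of the stride-8/16/32 stages are handed to the neck. A sketch of the YOLOX-s sizing, using the import path as it existed before this deletion:

import paddle
from ppdet.modeling.backbones.csp_darknet import CSPDarkNet

net = CSPDarkNet(arch='X', depth_mult=0.33, width_mult=0.50, return_idx=[2, 3, 4])
feats = net({'image': paddle.rand([1, 3, 640, 640])})
print([list(f.shape) for f in feats])
# [[1, 128, 80, 80], [1, 256, 40, 40], [1, 512, 20, 20]]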
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Constant - -from ppdet.modeling.ops import get_act_fn -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=1, - groups=1, - padding=0, - act=None): - super(ConvBNLayer, self).__init__() - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - bias_attr=False) - - self.bn = nn.BatchNorm2D( - ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.act(x) - - return x - - -class RepVggBlock(nn.Layer): - def __init__(self, ch_in, ch_out, act='relu', alpha=False): - super(RepVggBlock, self).__init__() - self.ch_in = ch_in - self.ch_out = ch_out - self.conv1 = ConvBNLayer( - ch_in, ch_out, 3, stride=1, padding=1, act=None) - self.conv2 = ConvBNLayer( - ch_in, ch_out, 1, stride=1, padding=0, act=None) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - if alpha: - self.alpha = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=1.)), - dtype="float32") - else: - self.alpha = None - - def forward(self, x): - if hasattr(self, 'conv'): - y = self.conv(x) - else: - if self.alpha: - y = self.conv1(x) + self.alpha * self.conv2(x) - else: - y = self.conv1(x) + self.conv2(x) - y = self.act(y) - return y - - def convert_to_deploy(self): - if not hasattr(self, 'conv'): - self.conv = nn.Conv2D( - in_channels=self.ch_in, - out_channels=self.ch_out, - kernel_size=3, - stride=1, - padding=1, - groups=1) - kernel, bias = self.get_equivalent_kernel_bias() - self.conv.weight.set_value(kernel) - self.conv.bias.set_value(bias) - self.__delattr__('conv1') - self.__delattr__('conv2') - - def get_equivalent_kernel_bias(self): - kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) - kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) - if self.alpha: - return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( - kernel1x1), bias3x3 + self.alpha * bias1x1 - else: - return kernel3x3 + self._pad_1x1_to_3x3_tensor( - kernel1x1), bias3x3 + bias1x1 - - def _pad_1x1_to_3x3_tensor(self, kernel1x1): - if kernel1x1 is None: - return 0 - else: - return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) - - def _fuse_bn_tensor(self, branch): - if branch is None: - return 0, 0 - kernel = branch.conv.weight - running_mean = branch.bn._mean - running_var = branch.bn._variance - gamma = branch.bn.weight - beta = branch.bn.bias - eps = branch.bn._epsilon - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - return kernel * t, beta - running_mean * gamma / std - - -class BasicBlock(nn.Layer): - def __init__(self, - ch_in, - ch_out, - act='relu', - shortcut=True, - use_alpha=False): - super(BasicBlock, self).__init__() - assert ch_in == ch_out - self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) - self.conv2 = 
RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) - self.shortcut = shortcut - - def forward(self, x): - y = self.conv1(x) - y = self.conv2(y) - if self.shortcut: - return paddle.add(x, y) - else: - return y - - -class EffectiveSELayer(nn.Layer): - """ Effective Squeeze-Excitation - From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - """ - - def __init__(self, channels, act='hardsigmoid'): - super(EffectiveSELayer, self).__init__() - self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc(x_se) - return x * self.act(x_se) - - -class CSPResStage(nn.Layer): - def __init__(self, - block_fn, - ch_in, - ch_out, - n, - stride, - act='relu', - attn='eca', - use_alpha=False): - super(CSPResStage, self).__init__() - - ch_mid = (ch_in + ch_out) // 2 - if stride == 2: - self.conv_down = ConvBNLayer( - ch_in, ch_mid, 3, stride=2, padding=1, act=act) - else: - self.conv_down = None - self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) - self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) - self.blocks = nn.Sequential(*[ - block_fn( - ch_mid // 2, - ch_mid // 2, - act=act, - shortcut=True, - use_alpha=use_alpha) for i in range(n) - ]) - if attn: - self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') - else: - self.attn = None - - self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) - - def forward(self, x): - if self.conv_down is not None: - x = self.conv_down(x) - y1 = self.conv1(x) - y2 = self.blocks(self.conv2(x)) - y = paddle.concat([y1, y2], axis=1) - if self.attn is not None: - y = self.attn(y) - y = self.conv3(y) - return y - - -@register -@serializable -class CSPResNet(nn.Layer): - __shared__ = ['width_mult', 'depth_mult', 'trt'] - - def __init__(self, - layers=[3, 6, 6, 3], - channels=[64, 128, 256, 512, 1024], - act='swish', - return_idx=[1, 2, 3], - depth_wise=False, - use_large_stem=False, - width_mult=1.0, - depth_mult=1.0, - trt=False, - use_checkpoint=False, - use_alpha=False, - **args): - super(CSPResNet, self).__init__() - self.use_checkpoint = use_checkpoint - channels = [max(round(c * width_mult), 1) for c in channels] - layers = [max(round(l * depth_mult), 1) for l in layers] - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - - if use_large_stem: - self.stem = nn.Sequential( - ('conv1', ConvBNLayer( - 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), - ('conv2', ConvBNLayer( - channels[0] // 2, - channels[0] // 2, - 3, - stride=1, - padding=1, - act=act)), ('conv3', ConvBNLayer( - channels[0] // 2, - channels[0], - 3, - stride=1, - padding=1, - act=act))) - else: - self.stem = nn.Sequential( - ('conv1', ConvBNLayer( - 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), - ('conv2', ConvBNLayer( - channels[0] // 2, - channels[0], - 3, - stride=1, - padding=1, - act=act))) - - n = len(channels) - 1 - self.stages = nn.Sequential(*[(str(i), CSPResStage( - BasicBlock, - channels[i], - channels[i + 1], - layers[i], - 2, - act=act, - use_alpha=use_alpha)) for i in range(n)]) - - self._out_channels = channels[1:] - self._out_strides = [4 * 2**i for i in range(n)] - self.return_idx = return_idx - if use_checkpoint: - paddle.seed(0) - - def forward(self, inputs): - x = inputs['image'] - x = self.stem(x) - outs = [] - for idx, stage in enumerate(self.stages): - if self.use_checkpoint and 
self.training: - x = paddle.distributed.fleet.utils.recompute( - stage, x, **{"preserve_rng_state": True}) - else: - x = stage(x) - if idx in self.return_idx: - outs.append(x) - - return outs - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/darknet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/darknet.py deleted file mode 100644 index c68c650..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/darknet.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.ops import batch_norm, mish -from ..shape_spec import ShapeSpec - -__all__ = ['DarkNet', 'ConvBNLayer'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=1, - groups=1, - padding=0, - norm_type='bn', - norm_decay=0., - act="leaky", - freeze_norm=False, - data_format='NCHW', - name=''): - """ - conv + bn + activation layer - - Args: - ch_in (int): input channel - ch_out (int): output channel - filter_size (int): filter size, default 3 - stride (int): stride, default 1 - groups (int): number of groups of conv layer, default 1 - padding (int): padding size, default 0 - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. - act (str): activation function type, default 'leaky', which means leaky_relu - freeze_norm (bool): whether to freeze norm, default False - data_format (str): data format, NCHW or NHWC - """ - super(ConvBNLayer, self).__init__() - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - data_format=data_format, - bias_attr=False) - self.batch_norm = batch_norm( - ch_out, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.act = act - - def forward(self, inputs): - out = self.conv(inputs) - out = self.batch_norm(out) - if self.act == 'leaky': - out = F.leaky_relu(out, 0.1) - else: - out = getattr(F, self.act)(out) - return out - - -class DownSample(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=2, - padding=1, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - data_format='NCHW'): - """ - downsample layer - - Args: - ch_in (int): input channel - ch_out (int): output channel - filter_size (int): filter size, default 3 - stride (int): stride, default 2 - padding (int): padding size, default 1 - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
- freeze_norm (bool): whether to freeze norm, default False - data_format (str): data format, NCHW or NHWC - """ - - super(DownSample, self).__init__() - - self.conv_bn_layer = ConvBNLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=filter_size, - stride=stride, - padding=padding, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.ch_out = ch_out - - def forward(self, inputs): - out = self.conv_bn_layer(inputs) - return out - - class BasicBlock(nn.Layer): - def __init__(self, - ch_in, - ch_out, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - data_format='NCHW'): - """ - BasicBlock layer of DarkNet - - Args: - ch_in (int): input channel - ch_out (int): output channel - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. - freeze_norm (bool): whether to freeze norm, default False - data_format (str): data format, NCHW or NHWC - """ - - super(BasicBlock, self).__init__() - - assert ch_in == ch_out and (ch_in % 2) == 0, \ - f"ch_in and ch_out should be the same even int, but got ch_in={ch_in}, ch_out={ch_out}" - # example: - # --------------{conv1} --> {conv2} - # channel route: 10-->5 --> 5-->10 - self.conv1 = ConvBNLayer( - ch_in=ch_in, - ch_out=int(ch_out / 2), - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.conv2 = ConvBNLayer( - ch_in=int(ch_out / 2), - ch_out=ch_out, - filter_size=3, - stride=1, - padding=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - - def forward(self, inputs): - conv1 = self.conv1(inputs) - conv2 = self.conv2(conv1) - out = paddle.add(x=inputs, y=conv2) - return out - - class Blocks(nn.Layer): - def __init__(self, - ch_in, - ch_out, - count, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None, - data_format='NCHW'): - """ - Blocks layer, which consists of several BasicBlock layers - - Args: - ch_in (int): input channel - ch_out (int): output channel - count (int): number of BasicBlock layers - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
- freeze_norm (bool): whether to freeze norm, default False - name (str): layer name - data_format (str): data format, NCHW or NHWC - """ - super(Blocks, self).__init__() - - self.basicblock0 = BasicBlock( - ch_in, - ch_out, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.res_out_list = [] - for i in range(1, count): - block_name = '{}.{}'.format(name, i) - res_out = self.add_sublayer( - block_name, - BasicBlock( - ch_out, - ch_out, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format)) - self.res_out_list.append(res_out) - self.ch_out = ch_out - - def forward(self, inputs): - y = self.basicblock0(inputs) - for basic_block_i in self.res_out_list: - y = basic_block_i(y) - return y - - -DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} - - -@register -@serializable -class DarkNet(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - depth=53, - freeze_at=-1, - return_idx=[2, 3, 4], - num_stages=5, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - data_format='NCHW'): - """ - Darknet, see https://pjreddie.com/darknet/yolo/ - - Args: - depth (int): depth of network - freeze_at (int): freeze the backbone at which stage - filter_size (int): filter size, default 3 - return_idx (list): index of stages whose feature maps are returned - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. - data_format (str): data format, NCHW or NHWC - """ - super(DarkNet, self).__init__() - self.depth = depth - self.freeze_at = freeze_at - self.return_idx = return_idx - self.num_stages = num_stages - self.stages = DarkNet_cfg[self.depth][0:num_stages] - - self.conv0 = ConvBNLayer( - ch_in=3, - ch_out=32, - filter_size=3, - stride=1, - padding=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - - self.downsample0 = DownSample( - ch_in=32, - ch_out=32 * 2, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - - self._out_channels = [] - self.darknet_conv_block_list = [] - self.downsample_list = [] - ch_in = [64, 128, 256, 512, 1024] - for i, stage in enumerate(self.stages): - name = 'stage.{}'.format(i) - conv_block = self.add_sublayer( - name, - Blocks( - int(ch_in[i]), - int(ch_in[i]), - stage, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.darknet_conv_block_list.append(conv_block) - if i in return_idx: - self._out_channels.append(int(ch_in[i])) - for i in range(num_stages - 1): - down_name = 'stage.{}.downsample'.format(i) - downsample = self.add_sublayer( - down_name, - DownSample( - ch_in=int(ch_in[i]), - ch_out=int(ch_in[i + 1]), - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format)) - self.downsample_list.append(downsample) - - def forward(self, inputs): - x = inputs['image'] - - out = self.conv0(x) - out = self.downsample0(out) - blocks = [] - for i, conv_block_i in enumerate(self.darknet_conv_block_list): - out = conv_block_i(out) - if i == self.freeze_at: - out.stop_gradient = True - if i in self.return_idx: - blocks.append(out) - if i < self.num_stages - 1: - out = self.downsample_list[i](out) - return blocks - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/dla.py 
b/pdfdet/models/Paddle/ppdet/modeling/backbones/dla.py deleted file mode 100644 index 51d1f07..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/dla.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ..shape_spec import ShapeSpec - -DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), } - - -class BasicBlock(nn.Layer): - def __init__(self, ch_in, ch_out, stride=1): - super(BasicBlock, self).__init__() - self.conv1 = ConvNormLayer( - ch_in, - ch_out, - filter_size=3, - stride=stride, - bias_on=False, - norm_decay=None) - self.conv2 = ConvNormLayer( - ch_out, - ch_out, - filter_size=3, - stride=1, - bias_on=False, - norm_decay=None) - - def forward(self, inputs, residual=None): - if residual is None: - residual = inputs - - out = self.conv1(inputs) - out = F.relu(out) - - out = self.conv2(out) - - out = paddle.add(x=out, y=residual) - out = F.relu(out) - - return out - - -class Root(nn.Layer): - def __init__(self, ch_in, ch_out, kernel_size, residual): - super(Root, self).__init__() - self.conv = ConvNormLayer( - ch_in, - ch_out, - filter_size=1, - stride=1, - bias_on=False, - norm_decay=None) - self.residual = residual - - def forward(self, inputs): - children = inputs - out = self.conv(paddle.concat(inputs, axis=1)) - if self.residual: - out = paddle.add(x=out, y=children[0]) - out = F.relu(out) - - return out - - -class Tree(nn.Layer): - def __init__(self, - level, - block, - ch_in, - ch_out, - stride=1, - level_root=False, - root_dim=0, - root_kernel_size=1, - root_residual=False): - super(Tree, self).__init__() - if root_dim == 0: - root_dim = 2 * ch_out - if level_root: - root_dim += ch_in - if level == 1: - self.tree1 = block(ch_in, ch_out, stride) - self.tree2 = block(ch_out, ch_out, 1) - else: - self.tree1 = Tree( - level - 1, - block, - ch_in, - ch_out, - stride, - root_dim=0, - root_kernel_size=root_kernel_size, - root_residual=root_residual) - self.tree2 = Tree( - level - 1, - block, - ch_out, - ch_out, - 1, - root_dim=root_dim + ch_out, - root_kernel_size=root_kernel_size, - root_residual=root_residual) - - if level == 1: - self.root = Root(root_dim, ch_out, root_kernel_size, root_residual) - self.level_root = level_root - self.root_dim = root_dim - self.downsample = None - self.project = None - self.level = level - if stride > 1: - self.downsample = nn.MaxPool2D(stride, stride=stride) - if ch_in != ch_out: - self.project = ConvNormLayer( - ch_in, - ch_out, - filter_size=1, - stride=1, - bias_on=False, - norm_decay=None) - - def forward(self, x, residual=None, children=None): - children = [] if children is None else children - bottom = self.downsample(x) if self.downsample else x - residual = self.project(bottom) if self.project else bottom - if self.level_root: - 
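        # Note on the control flow below: level_root stages prepend the
        # downsampled input (`bottom`) to `children`, so at level 1 the Root
        # layer concatenates [x2, x1] plus any accumulated children along the
        # channel axis before its 1x1 conv; deeper levels instead push x1 into
        # `children` for the recursive tree2 call.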
children.append(bottom) - x1 = self.tree1(x, residual) - if self.level == 1: - x2 = self.tree2(x1) - x = self.root([x2, x1] + children) - else: - children.append(x1) - x = self.tree2(x1, children=children) - return x - - -@register -@serializable -class DLA(nn.Layer): - """ - DLA, see https://arxiv.org/pdf/1707.06484.pdf - - Args: - depth (int): DLA depth, only supports 34 now. - residual_root (bool): whether to use a residual layer in the root block - pre_img (bool): add pre_img, only used in CenterTrack - pre_hm (bool): add pre_hm, only used in CenterTrack - """ - - def __init__(self, - depth=34, - residual_root=False, - pre_img=False, - pre_hm=False): - super(DLA, self).__init__() - assert depth == 34, 'Only support DLA with depth of 34 now.' - if depth == 34: - block = BasicBlock - levels, channels = DLA_cfg[depth] - self.channels = channels - self.num_levels = len(levels) - - self.base_layer = nn.Sequential( - ConvNormLayer( - 3, - channels[0], - filter_size=7, - stride=1, - bias_on=False, - norm_decay=None), - nn.ReLU()) - self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) - self.level1 = self._make_conv_level( - channels[0], channels[1], levels[1], stride=2) - self.level2 = Tree( - levels[2], - block, - channels[1], - channels[2], - 2, - level_root=False, - root_residual=residual_root) - self.level3 = Tree( - levels[3], - block, - channels[2], - channels[3], - 2, - level_root=True, - root_residual=residual_root) - self.level4 = Tree( - levels[4], - block, - channels[3], - channels[4], - 2, - level_root=True, - root_residual=residual_root) - self.level5 = Tree( - levels[5], - block, - channels[4], - channels[5], - 2, - level_root=True, - root_residual=residual_root) - - if pre_img: - self.pre_img_layer = nn.Sequential( - ConvNormLayer( - 3, - channels[0], - filter_size=7, - stride=1, - bias_on=False, - norm_decay=None), - nn.ReLU()) - if pre_hm: - self.pre_hm_layer = nn.Sequential( - ConvNormLayer( - 1, - channels[0], - filter_size=7, - stride=1, - bias_on=False, - norm_decay=None), - nn.ReLU()) - self.pre_img = pre_img - self.pre_hm = pre_hm - - def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1): - modules = [] - for i in range(conv_num): - modules.extend([ - ConvNormLayer( - ch_in, - ch_out, - filter_size=3, - stride=stride if i == 0 else 1, - bias_on=False, - norm_decay=None), nn.ReLU() - ]) - ch_in = ch_out - return nn.Sequential(*modules) - - @property - def out_shape(self): - return [ - ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels) - ] - - def forward(self, inputs): - outs = [] - feats = self.base_layer(inputs['image']) - - if self.pre_img and 'pre_image' in inputs and inputs[ - 'pre_image'] is not None: - feats = feats + self.pre_img_layer(inputs['pre_image']) - - if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None: - feats = feats + self.pre_hm_layer(inputs['pre_hm']) - - for i in range(self.num_levels): - feats = getattr(self, 'level{}'.format(i))(feats) - outs.append(feats) - - return outs diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/esnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/esnet.py deleted file mode 100644 index 2b3f3c5..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/esnet.py +++ /dev/null @@ -1,290 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm -from paddle.nn.initializer import KaimingNormal -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec -from ppdet.modeling.ops import channel_shuffle -from ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer - -__all__ = ['ESNet'] - - -def make_divisible(v, divisor=16, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class SEModule(nn.Layer): - def __init__(self, channel, reduction=4): - super(SEModule, self).__init__() - self.avg_pool = AdaptiveAvgPool2D(1) - self.conv1 = Conv2D( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(), - bias_attr=ParamAttr()) - self.conv2 = Conv2D( - in_channels=channel // reduction, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(), - bias_attr=ParamAttr()) - - def forward(self, inputs): - outputs = self.avg_pool(inputs) - outputs = self.conv1(outputs) - outputs = F.relu(outputs) - outputs = self.conv2(outputs) - outputs = F.hardsigmoid(outputs) - return paddle.multiply(x=inputs, y=outputs) - - -class InvertedResidual(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - stride, - act="relu"): - super(InvertedResidual, self).__init__() - self._conv_pw = ConvBNLayer( - in_channels=in_channels // 2, - out_channels=mid_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw = ConvBNLayer( - in_channels=mid_channels // 2, - out_channels=mid_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=mid_channels // 2, - act=None) - self._se = SEModule(mid_channels) - - self._conv_linear = ConvBNLayer( - in_channels=mid_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - - def forward(self, inputs): - x1, x2 = paddle.split( - inputs, - num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], - axis=1) - x2 = self._conv_pw(x2) - x3 = self._conv_dw(x2) - x3 = paddle.concat([x2, x3], axis=1) - x3 = self._se(x3) - x3 = self._conv_linear(x3) - out = paddle.concat([x1, x3], axis=1) - return channel_shuffle(out, 2) - - -class InvertedResidualDS(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - stride, - act="relu"): - super(InvertedResidualDS, self).__init__() - - # branch1 - self._conv_dw_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=3, - stride=stride, - padding=1, - groups=in_channels, - act=None) - self._conv_linear_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels // 2, 
- kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - # branch2 - self._conv_pw_2 = ConvBNLayer( - in_channels=in_channels, - out_channels=mid_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw_2 = ConvBNLayer( - in_channels=mid_channels // 2, - out_channels=mid_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=mid_channels // 2, - act=None) - self._se = SEModule(mid_channels // 2) - self._conv_linear_2 = ConvBNLayer( - in_channels=mid_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw_mv1 = ConvBNLayer( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=3, - stride=1, - padding=1, - groups=out_channels, - act="hard_swish") - self._conv_pw_mv1 = ConvBNLayer( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act="hard_swish") - - def forward(self, inputs): - x1 = self._conv_dw_1(inputs) - x1 = self._conv_linear_1(x1) - x2 = self._conv_pw_2(inputs) - x2 = self._conv_dw_2(x2) - x2 = self._se(x2) - x2 = self._conv_linear_2(x2) - out = paddle.concat([x1, x2], axis=1) - out = self._conv_dw_mv1(out) - out = self._conv_pw_mv1(out) - - return out - - -@register -@serializable -class ESNet(nn.Layer): - def __init__(self, - scale=1.0, - act="hard_swish", - feature_maps=[4, 11, 14], - channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]): - super(ESNet, self).__init__() - self.scale = scale - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - self.feature_maps = feature_maps - stage_repeats = [3, 7, 3] - - stage_out_channels = [ - -1, 24, make_divisible(128 * scale), make_divisible(256 * scale), - make_divisible(512 * scale), 1024 - ] - - self._out_channels = [] - self._feature_idx = 0 - # 1. conv1 - self._conv1 = ConvBNLayer( - in_channels=3, - out_channels=stage_out_channels[1], - kernel_size=3, - stride=2, - padding=1, - act=act) - self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) - self._feature_idx += 1 - - # 2. 
bottleneck sequences - self._block_list = [] - arch_idx = 0 - for stage_id, num_repeat in enumerate(stage_repeats): - for i in range(num_repeat): - channels_scales = channel_ratio[arch_idx] - mid_c = make_divisible( - int(stage_out_channels[stage_id + 2] * channels_scales), - divisor=8) - if i == 0: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidualDS( - in_channels=stage_out_channels[stage_id + 1], - mid_channels=mid_c, - out_channels=stage_out_channels[stage_id + 2], - stride=2, - act=act)) - else: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidual( - in_channels=stage_out_channels[stage_id + 2], - mid_channels=mid_c, - out_channels=stage_out_channels[stage_id + 2], - stride=1, - act=act)) - self._block_list.append(block) - arch_idx += 1 - self._feature_idx += 1 - self._update_out_channels(stage_out_channels[stage_id + 2], - self._feature_idx, self.feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - y = self._conv1(inputs['image']) - y = self._max_pool(y) - outs = [] - for i, inv in enumerate(self._block_list): - y = inv(y) - if i + 2 in self.feature_maps: - outs.append(y) - - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/focalnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/focalnet.py deleted file mode 100644 index 54c2877..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/focalnet.py +++ /dev/null @@ -1,720 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
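Before the FocalNet code, a concrete feel for the channel rounding that the ESNet definition above applies via make_divisible: it snaps a scaled width to a multiple of the divisor but refuses to shrink it by more than 10%. A small self-contained sketch mirroring the definition above (the printed values are worked examples, not library output):

# mirrors make_divisible as defined in esnet.py above
def make_divisible(v, divisor=16, min_value=None):
    if min_value is None:
        min_value = divisor
    # round to the nearest multiple of `divisor`, with a floor of `min_value`
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:  # never drop more than 10% below the request
        new_v += divisor
    return new_v

print(make_divisible(100))         # 96: nearest multiple of 16, within 10%
print(make_divisible(36))          # 48: 32 would lose >10%, so round up
print(make_divisible(512 * 0.75))  # 384: already a multiple of 16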
-""" -This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py -""" -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable -from .transformer_utils import DropPath, Identity -from .transformer_utils import add_parameter, to_2tuple -from .transformer_utils import ones_, zeros_, trunc_normal_ -from .swin_transformer import Mlp - -__all__ = ['FocalNet'] - -MODEL_cfg = { - 'focalnet_T_224_1k_srf': dict( - embed_dim=96, - depths=[2, 2, 6, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.2, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams', - ), - 'focalnet_S_224_1k_srf': dict( - embed_dim=96, - depths=[2, 2, 18, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.3, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams', - ), - 'focalnet_B_224_1k_srf': dict( - embed_dim=128, - depths=[2, 2, 18, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams', - ), - 'focalnet_T_224_1k_lrf': dict( - embed_dim=96, - depths=[2, 2, 6, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.2, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams', - ), - 'focalnet_S_224_1k_lrf': dict( - embed_dim=96, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.3, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams', - ), - 'focalnet_B_224_1k_lrf': dict( - embed_dim=128, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams', - ), - 'focalnet_L_384_22k_fl3': dict( - embed_dim=192, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[5, 5, 5, 5], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams', - ), - 'focalnet_L_384_22k_fl4': dict( - embed_dim=192, - depths=[2, 2, 18, 2], - focal_levels=[4, 4, 4, 4], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - 
use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=True, # - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams', - ), - 'focalnet_XL_384_22k_fl3': dict( - embed_dim=256, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[5, 5, 5, 5], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams', - ), - 'focalnet_XL_384_22k_fl4': dict( - embed_dim=256, - depths=[2, 2, 18, 2], - focal_levels=[4, 4, 4, 4], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams', - ), - 'focalnet_H_224_22k_fl3': dict( - embed_dim=352, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=True, # - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams', - ), - 'focalnet_H_224_22k_fl4': dict( - embed_dim=352, - depths=[2, 2, 18, 2], - focal_levels=[4, 4, 4, 4], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=True, # - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams', - ), -} - - -class FocalModulation(nn.Layer): - """ - Args: - dim (int): Number of input channels. - proj_drop (float, optional): Dropout ratio of output. Default: 0.0 - focal_level (int): Number of focal levels - focal_window (int): Focal window size at focal level 1 - focal_factor (int): Step to increase the focal window. Default: 2 - use_postln_in_modulation (bool): Whether use post-modulation layernorm - normalize_modulator (bool): Whether use normalize in modulator - """ - - def __init__(self, - dim, - proj_drop=0., - focal_level=2, - focal_window=7, - focal_factor=2, - use_postln_in_modulation=False, - normalize_modulator=False): - super().__init__() - self.dim = dim - - # specific args for focalv3 - self.focal_level = focal_level - self.focal_window = focal_window - self.focal_factor = focal_factor - self.use_postln_in_modulation = use_postln_in_modulation - self.normalize_modulator = normalize_modulator - - self.f = nn.Linear( - dim, 2 * dim + (self.focal_level + 1), bias_attr=True) - self.h = nn.Conv2D( - dim, - dim, - kernel_size=1, - stride=1, - padding=0, - groups=1, - bias_attr=True) - - self.act = nn.GELU() - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - self.focal_layers = nn.LayerList() - - if self.use_postln_in_modulation: - self.ln = nn.LayerNorm(dim) - - for k in range(self.focal_level): - kernel_size = self.focal_factor * k + self.focal_window - self.focal_layers.append( - nn.Sequential( - nn.Conv2D( - dim, - dim, - kernel_size=kernel_size, - stride=1, - groups=dim, - padding=kernel_size // 2, - bias_attr=False), - nn.GELU())) - - def forward(self, x): - """ Forward function. 
- Args: - x: input features with shape of (B, H, W, C) - """ - _, _, _, C = x.shape - x = self.f(x) - x = x.transpose([0, 3, 1, 2]) - q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1) - - ctx_all = 0 - for l in range(self.focal_level): - ctx = self.focal_layers[l](ctx) - ctx_all = ctx_all + ctx * gates[:, l:l + 1] - ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) - ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:] - if self.normalize_modulator: - ctx_all = ctx_all / (self.focal_level + 1) - - x_out = q * self.h(ctx_all) - x_out = x_out.transpose([0, 2, 3, 1]) - if self.use_postln_in_modulation: - x_out = self.ln(x_out) - x_out = self.proj(x_out) - x_out = self.proj_drop(x_out) - return x_out - - -class FocalModulationBlock(nn.Layer): - """ Focal Modulation Block. - Args: - dim (int): Number of input channels. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - drop (float, optional): Dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - focal_level (int): number of focal levels - focal_window (int): focal kernel size at level 1 - use_postln (bool): Whether use layernorm after modulation. Default: False. - use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. - normalize_modulator (bool): Whether use normalize in modulator - use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False - layerscale_value (float): Value for layer scale. Default: 1e-4 - """ - - def __init__(self, - dim, - mlp_ratio=4., - drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - focal_level=2, - focal_window=9, - use_postln=False, - use_postln_in_modulation=False, - normalize_modulator=False, - use_layerscale=False, - layerscale_value=1e-4): - super().__init__() - self.dim = dim - self.mlp_ratio = mlp_ratio - self.focal_window = focal_window - self.focal_level = focal_level - self.use_postln = use_postln - self.use_layerscale = use_layerscale - - self.norm1 = norm_layer(dim) - self.modulation = FocalModulation( - dim, - proj_drop=drop, - focal_level=self.focal_level, - focal_window=self.focal_window, - use_postln_in_modulation=use_postln_in_modulation, - normalize_modulator=normalize_modulator) - - self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - self.H = None - self.W = None - - self.gamma_1 = 1.0 - self.gamma_2 = 1.0 - if self.use_layerscale: - self.gamma_1 = add_parameter(self, - layerscale_value * paddle.ones([dim])) - self.gamma_2 = add_parameter(self, - layerscale_value * paddle.ones([dim])) - - def forward(self, x): - """ - Args: - x: Input feature, tensor size (B, H*W, C). 
- """ - B, L, C = x.shape - H, W = self.H, self.W - assert L == H * W, "input feature has wrong size" - - shortcut = x - if not self.use_postln: - x = self.norm1(x) - x = x.reshape([-1, H, W, C]) - - # FM - x = self.modulation(x).reshape([-1, H * W, C]) - if self.use_postln: - x = self.norm1(x) - - # FFN - x = shortcut + self.drop_path(self.gamma_1 * x) - - if self.use_postln: - x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) - else: - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class BasicLayer(nn.Layer): - """ A basic focal modulation layer for one stage. - Args: - dim (int): Number of feature channels - depth (int): Depths of this stage. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. - drop (float, optional): Dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None - focal_level (int): Number of focal levels - focal_window (int): Focal window size at focal level 1 - use_conv_embed (bool): Whether use overlapped convolution for patch embedding - use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False - layerscale_value (float): Value of layerscale - use_postln (bool): Whether use layernorm after modulation. Default: False. - use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. - normalize_modulator (bool): Whether use normalize in modulator - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. - """ - - def __init__(self, - dim, - depth, - mlp_ratio=4., - drop=0., - drop_path=0., - norm_layer=nn.LayerNorm, - downsample=None, - focal_level=2, - focal_window=9, - use_conv_embed=False, - use_layerscale=False, - layerscale_value=1e-4, - use_postln=False, - use_postln_in_modulation=False, - normalize_modulator=False, - use_checkpoint=False): - super().__init__() - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.LayerList([ - FocalModulationBlock( - dim=dim, - mlp_ratio=mlp_ratio, - drop=drop, - drop_path=drop_path[i] - if isinstance(drop_path, np.ndarray) else drop_path, - act_layer=nn.GELU, - norm_layer=norm_layer, - focal_level=focal_level, - focal_window=focal_window, - use_postln=use_postln, - use_postln_in_modulation=use_postln_in_modulation, - normalize_modulator=normalize_modulator, - use_layerscale=use_layerscale, - layerscale_value=layerscale_value) for i in range(depth) - ]) - - # patch merging layer - if downsample is not None: - self.downsample = downsample( - patch_size=2, - in_chans=dim, - embed_dim=2 * dim, - use_conv_embed=use_conv_embed, - norm_layer=norm_layer, - is_stem=False) - else: - self.downsample = None - - def forward(self, x, H, W): - """ - Args: - x: Input feature, tensor size (B, H*W, C). - """ - for blk in self.blocks: - blk.H, blk.W = H, W - x = blk(x) - - if self.downsample is not None: - x_reshaped = x.transpose([0, 2, 1]).reshape( - [x.shape[0], x.shape[-1], H, W]) - x_down = self.downsample(x_reshaped) - x_down = x_down.flatten(2).transpose([0, 2, 1]) - Wh, Ww = (H + 1) // 2, (W + 1) // 2 - return x, H, W, x_down, Wh, Ww - else: - return x, H, W, x, H, W - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - Args: - patch_size (int): Patch token size. Default: 4. - in_chans (int): Number of input image channels. 
Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Layer, optional): Normalization layer. Default: None - use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False - is_stem (bool): Is the stem block or not. - """ - - def __init__(self, - patch_size=4, - in_chans=3, - embed_dim=96, - norm_layer=None, - use_conv_embed=False, - is_stem=False): - super().__init__() - patch_size = to_2tuple(patch_size) - self.patch_size = patch_size - - self.in_chans = in_chans - self.embed_dim = embed_dim - - if use_conv_embed: - # if we choose to use conv embedding, then we treat the stem and non-stem differently - if is_stem: - kernel_size = 7 - padding = 2 - stride = 4 - else: - kernel_size = 3 - padding = 1 - stride = 2 - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=kernel_size, - stride=stride, - padding=padding) - else: - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - _, _, H, W = x.shape - - if W % self.patch_size[1] != 0: - # for 3D tensor: [pad_left, pad_right] - # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom] - x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) - W += W % self.patch_size[1] - if H % self.patch_size[0] != 0: - x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) - H += H % self.patch_size[0] - - x = self.proj(x) - if self.norm is not None: - _, _, Wh, Ww = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = self.norm(x) - x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) - - return x - - -@register -@serializable -class FocalNet(nn.Layer): - """ FocalNet backbone - Args: - arch (str): Architecture of FocalNet - out_indices (Sequence[int]): Output from which stages. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. - patch_size (int | tuple(int)): Patch size. Default: 4. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - depths (tuple[int]): Depths of each FocalNet Transformer stage. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. - drop_rate (float): Dropout rate. - drop_path_rate (float): Stochastic depth rate. Default: 0.2. - norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. - patch_norm (bool): If True, add normalization after patch embedding. Default: True. - focal_levels (Sequence[int]): Number of focal levels at four stages - focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages - use_conv_embed (bool): Whether use overlapped convolution for patch embedding - use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False - layerscale_value (float): Value of layerscale - use_postln (bool): Whether use layernorm after modulation. Default: False. - use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. - normalize_modulator (bool): Whether use normalize in modulator - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
- """ - - def __init__( - self, - arch='focalnet_T_224_1k_srf', - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - patch_size=4, - in_chans=3, - embed_dim=96, - depths=[2, 2, 6, 2], - mlp_ratio=4., - drop_rate=0., - drop_path_rate=0.2, # 0.5 better for large+ models - norm_layer=nn.LayerNorm, - patch_norm=True, - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - use_conv_embed=False, - use_layerscale=False, - layerscale_value=1e-4, - use_postln=False, - use_postln_in_modulation=False, - normalize_modulator=False, - use_checkpoint=False, - pretrained=None): - super(FocalNet, self).__init__() - assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) - - embed_dim = MODEL_cfg[arch]['embed_dim'] - depths = MODEL_cfg[arch]['depths'] - drop_path_rate = MODEL_cfg[arch]['drop_path_rate'] - focal_levels = MODEL_cfg[arch]['focal_levels'] - focal_windows = MODEL_cfg[arch]['focal_windows'] - use_conv_embed = MODEL_cfg[arch]['use_conv_embed'] - use_layerscale = MODEL_cfg[arch]['use_layerscale'] - use_postln = MODEL_cfg[arch]['use_postln'] - use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation'] - normalize_modulator = MODEL_cfg[arch]['normalize_modulator'] - if pretrained is None: - pretrained = MODEL_cfg[arch]['pretrained'] - - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.num_layers = len(depths) - self.patch_norm = patch_norm - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None, - use_conv_embed=use_conv_embed, - is_stem=True) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth decay rule - dpr = np.linspace(0, drop_path_rate, sum(depths)) - - # build layers - self.layers = nn.LayerList() - for i_layer in range(self.num_layers): - layer = BasicLayer( - dim=int(embed_dim * 2**i_layer), - depth=depths[i_layer], - mlp_ratio=mlp_ratio, - drop=drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchEmbed - if (i_layer < self.num_layers - 1) else None, - focal_level=focal_levels[i_layer], - focal_window=focal_windows[i_layer], - use_conv_embed=use_conv_embed, - use_layerscale=use_layerscale, - layerscale_value=layerscale_value, - use_postln=use_postln, - use_postln_in_modulation=use_postln_in_modulation, - normalize_modulator=normalize_modulator, - use_checkpoint=use_checkpoint) - self.layers.append(layer) - - num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] - self.num_features = num_features - - # add a norm layer for each output - for i_layer in out_indices: - layer = norm_layer(num_features[i_layer]) - layer_name = f'norm{i_layer}' - self.add_sublayer(layer_name, layer) - - self.apply(self._init_weights) - self._freeze_stages() - if pretrained: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - self.set_state_dict(paddle.load(path)) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.stop_gradient = True - - if self.frozen_stages >= 2: - self.pos_drop.eval() - for i in range(0, self.frozen_stages - 1): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.stop_gradient = True - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight) - if isinstance(m, 
nn.Linear) and m.bias is not None: - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - ones_(m.weight) - - def forward(self, x): - x = self.patch_embed(x['image']) - B, _, Wh, Ww = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = self.pos_drop(x) - outs = [] - for i in range(self.num_layers): - layer = self.layers[i] - x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) - if i in self.out_indices: - norm_layer = getattr(self, f'norm{i}') - x_out = norm_layer(x_out) - out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose( - (0, 3, 1, 2)) - outs.append(out) - - return outs - - @property - def out_shape(self): - out_strides = [4, 8, 16, 32] - return [ - ShapeSpec( - channels=self.num_features[i], stride=out_strides[i]) - for i in self.out_indices - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/ghostnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/ghostnet.py deleted file mode 100644 index cd333b4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/ghostnet.py +++ /dev/null @@ -1,470 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import AdaptiveAvgPool2D, Linear -from paddle.nn.initializer import Uniform - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec -from .mobilenet_v3 import make_divisible, ConvBNLayer - -__all__ = ['GhostNet'] - - -class ExtraBlockDW(nn.Layer): - def __init__(self, - in_c, - ch_1, - ch_2, - stride, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None): - super(ExtraBlockDW, self).__init__() - self.pointwise_conv = ConvBNLayer( - in_c=in_c, - out_c=ch_1, - filter_size=1, - stride=1, - padding=0, - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra1") - self.depthwise_conv = ConvBNLayer( - in_c=ch_1, - out_c=ch_2, - filter_size=3, - stride=stride, - padding=1, # - num_groups=int(ch_1), - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_dw") - self.normal_conv = ConvBNLayer( - in_c=ch_2, - out_c=ch_2, - filter_size=1, - stride=1, - padding=0, - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_sep") - - def forward(self, inputs): - x = self.pointwise_conv(inputs) - x = self.depthwise_conv(x) - x = self.normal_conv(x) - return x - - -class SEBlock(nn.Layer): - def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None): - super(SEBlock, self).__init__() - self.pool2d_gap = AdaptiveAvgPool2D(1) - self._num_channels = num_channels - stdv = 1.0 / math.sqrt(num_channels * 1.0) - med_ch = 
num_channels // reduction_ratio - self.squeeze = Linear( - num_channels, - med_ch, - weight_attr=ParamAttr( - learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), - bias_attr=ParamAttr(learning_rate=lr_mult)) - stdv = 1.0 / math.sqrt(med_ch * 1.0) - self.excitation = Linear( - med_ch, - num_channels, - weight_attr=ParamAttr( - learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - def forward(self, inputs): - pool = self.pool2d_gap(inputs) - pool = paddle.squeeze(pool, axis=[2, 3]) - squeeze = self.squeeze(pool) - squeeze = F.relu(squeeze) - excitation = self.excitation(squeeze) - excitation = paddle.clip(x=excitation, min=0, max=1) - excitation = paddle.unsqueeze(excitation, axis=[2, 3]) - out = paddle.multiply(inputs, excitation) - return out - - -class GhostModule(nn.Layer): - def __init__(self, - in_channels, - output_channels, - kernel_size=1, - ratio=2, - dw_size=3, - stride=1, - relu=True, - lr_mult=1., - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None): - super(GhostModule, self).__init__() - init_channels = int(math.ceil(output_channels / ratio)) - new_channels = int(init_channels * (ratio - 1)) - self.primary_conv = ConvBNLayer( - in_c=in_channels, - out_c=init_channels, - filter_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - num_groups=1, - act="relu" if relu else None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_primary_conv") - self.cheap_operation = ConvBNLayer( - in_c=init_channels, - out_c=new_channels, - filter_size=dw_size, - stride=1, - padding=int((dw_size - 1) // 2), - num_groups=init_channels, - act="relu" if relu else None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_cheap_operation") - - def forward(self, inputs): - x = self.primary_conv(inputs) - y = self.cheap_operation(x) - out = paddle.concat([x, y], axis=1) - return out - - -class GhostBottleneck(nn.Layer): - def __init__(self, - in_channels, - hidden_dim, - output_channels, - kernel_size, - stride, - use_se, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - return_list=False, - name=None): - super(GhostBottleneck, self).__init__() - self._stride = stride - self._use_se = use_se - self._num_channels = in_channels - self._output_channels = output_channels - self.return_list = return_list - - self.ghost_module_1 = GhostModule( - in_channels=in_channels, - output_channels=hidden_dim, - kernel_size=1, - stride=1, - relu=True, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_ghost_module_1") - if stride == 2: - self.depthwise_conv = ConvBNLayer( - in_c=hidden_dim, - out_c=hidden_dim, - filter_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - num_groups=hidden_dim, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + - "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. 
- ) - if use_se: - self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se") - self.ghost_module_2 = GhostModule( - in_channels=hidden_dim, - output_channels=output_channels, - kernel_size=1, - relu=False, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_ghost_module_2") - if stride != 1 or in_channels != output_channels: - self.shortcut_depthwise = ConvBNLayer( - in_c=in_channels, - out_c=in_channels, - filter_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - num_groups=in_channels, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + - "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. - ) - self.shortcut_conv = ConvBNLayer( - in_c=in_channels, - out_c=output_channels, - filter_size=1, - stride=1, - padding=0, - num_groups=1, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_shortcut_conv") - - def forward(self, inputs): - y = self.ghost_module_1(inputs) - x = y - if self._stride == 2: - x = self.depthwise_conv(x) - if self._use_se: - x = self.se_block(x) - x = self.ghost_module_2(x) - - if self._stride == 1 and self._num_channels == self._output_channels: - shortcut = inputs - else: - shortcut = self.shortcut_depthwise(inputs) - shortcut = self.shortcut_conv(shortcut) - x = paddle.add(x=x, y=shortcut) - - if self.return_list: - return [y, x] - else: - return x - - -@register -@serializable -class GhostNet(nn.Layer): - __shared__ = ['norm_type'] - - def __init__( - self, - scale=1.3, - feature_maps=[6, 12, 15], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - conv_decay=0., - norm_type='bn', - norm_decay=0.0, - freeze_norm=False): - super(GhostNet, self).__init__() - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - if norm_type == 'sync_bn' and freeze_norm: - raise ValueError( - "The norm_type should not be sync_bn when freeze_norm is True") - self.feature_maps = feature_maps - self.with_extra_blocks = with_extra_blocks - self.extra_block_filters = extra_block_filters - - inplanes = 16 - self.cfgs = [ - # k, t, c, SE, s - [3, 16, 16, 0, 1], - [3, 48, 24, 0, 2], - [3, 72, 24, 0, 1], - [5, 72, 40, 1, 2], - [5, 120, 40, 1, 1], - [3, 240, 80, 0, 2], - [3, 200, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 480, 112, 1, 1], - [3, 672, 112, 1, 1], - [5, 672, 160, 1, 2], # SSDLite output - [5, 960, 160, 0, 1], - [5, 960, 160, 1, 1], - [5, 960, 160, 0, 1], - [5, 960, 160, 1, 1] - ] - self.scale = scale - conv1_out_ch = int(make_divisible(inplanes * self.scale, 4)) - self.conv1 = ConvBNLayer( - in_c=3, - out_c=conv1_out_ch, - filter_size=3, - stride=2, - padding=1, - num_groups=1, - act="relu", - lr_mult=1., - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv1") - - # build inverted residual blocks - self._out_channels = [] - self.ghost_bottleneck_list = [] - idx = 0 - inplanes = conv1_out_ch - for k, exp_size, c, use_se, s in self.cfgs: - lr_idx = min(idx // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - # for SSD/SSDLite, first head input is after ResidualUnit expand_conv - return_list = self.with_extra_blocks and idx + 2 in self.feature_maps - - 
ghost_bottleneck = self.add_sublayer( - "_ghostbottleneck_" + str(idx), - sublayer=GhostBottleneck( - in_channels=inplanes, - hidden_dim=int(make_divisible(exp_size * self.scale, 4)), - output_channels=int(make_divisible(c * self.scale, 4)), - kernel_size=k, - stride=s, - use_se=use_se, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - return_list=return_list, - name="_ghostbottleneck_" + str(idx))) - self.ghost_bottleneck_list.append(ghost_bottleneck) - inplanes = int(make_divisible(c * self.scale, 4)) - idx += 1 - self._update_out_channels( - int(make_divisible(exp_size * self.scale, 4)) - if return_list else inplanes, idx + 1, feature_maps) - - if self.with_extra_blocks: - self.extra_block_list = [] - extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4)) - lr_idx = min(idx // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - conv_extra = self.add_sublayer( - "conv" + str(idx + 2), - sublayer=ConvBNLayer( - in_c=inplanes, - out_c=extra_out_c, - filter_size=1, - stride=1, - padding=0, - num_groups=1, - act="relu6", - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv" + str(idx + 2))) - self.extra_block_list.append(conv_extra) - idx += 1 - self._update_out_channels(extra_out_c, idx + 1, feature_maps) - - for j, block_filter in enumerate(self.extra_block_filters): - in_c = extra_out_c if j == 0 else self.extra_block_filters[j - - 1][1] - conv_extra = self.add_sublayer( - "conv" + str(idx + 2), - sublayer=ExtraBlockDW( - in_c, - block_filter[0], - block_filter[1], - stride=2, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name='conv' + str(idx + 2))) - self.extra_block_list.append(conv_extra) - idx += 1 - self._update_out_channels(block_filter[1], idx + 1, - feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - x = self.conv1(inputs['image']) - outs = [] - for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): - x = ghost_bottleneck(x) - if idx + 2 in self.feature_maps: - if isinstance(x, list): - outs.append(x[0]) - x = x[1] - else: - outs.append(x) - - if not self.with_extra_blocks: - return outs - - for i, block in enumerate(self.extra_block_list): - idx = i + len(self.ghost_bottleneck_list) - x = block(x) - if idx + 2 in self.feature_maps: - outs.append(x) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/hardnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/hardnet.py deleted file mode 100644 index 8615fb6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/hardnet.py +++ /dev/null @@ -1,226 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['HarDNet'] - - -def ConvLayer(in_channels, - out_channels, - kernel_size=3, - stride=1, - bias_attr=False): - layer = nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=kernel_size // 2, - groups=1, - bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), - ('relu', nn.ReLU6())) - return layer - - -def DWConvLayer(in_channels, - out_channels, - kernel_size=3, - stride=1, - bias_attr=False): - layer = nn.Sequential( - ('dwconv', nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=1, - groups=out_channels, - bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) - return layer - - -def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): - layer = nn.Sequential( - ('layer1', ConvLayer( - in_channels, out_channels, kernel_size=kernel_size)), - ('layer2', DWConvLayer( - out_channels, out_channels, stride=stride))) - return layer - - -class HarDBlock(nn.Layer): - def __init__(self, - in_channels, - growth_rate, - grmul, - n_layers, - keepBase=False, - residual_out=False, - dwconv=False): - super().__init__() - self.keepBase = keepBase - self.links = [] - layers_ = [] - self.out_channels = 0 - for i in range(n_layers): - outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, - grmul) - self.links.append(link) - if dwconv: - layers_.append(CombConvLayer(inch, outch)) - else: - layers_.append(ConvLayer(inch, outch)) - - if (i % 2 == 0) or (i == n_layers - 1): - self.out_channels += outch - self.layers = nn.LayerList(layers_) - - def get_out_ch(self): - return self.out_channels - - def get_link(self, layer, base_ch, growth_rate, grmul): - if layer == 0: - return base_ch, 0, [] - out_channels = growth_rate - - link = [] - for i in range(10): - dv = 2**i - if layer % dv == 0: - k = layer - dv - link.append(k) - if i > 0: - out_channels *= grmul - - out_channels = int(int(out_channels + 1) / 2) * 2 - in_channels = 0 - - for i in link: - ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) - in_channels += ch - - return out_channels, in_channels, link - - def forward(self, x): - layers_ = [x] - - for layer in range(len(self.layers)): - link = self.links[layer] - tin = [] - for i in link: - tin.append(layers_[i]) - if len(tin) > 1: - x = paddle.concat(tin, 1) - else: - x = tin[0] - out = self.layers[layer](x) - layers_.append(out) - - t = len(layers_) - out_ = [] - for i in range(t): - if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): - out_.append(layers_[i]) - out = paddle.concat(out_, 1) - - return out - - -@register -class HarDNet(nn.Layer): - def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85): - super(HarDNet, self).__init__() - assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch) - if arch == 85: - first_ch = [48, 96] - second_kernel = 3 - ch_list = [192, 256, 320, 480, 720] - grmul = 1.7 - gr = [24, 24, 28, 36, 48] - n_layers = [8, 16, 16, 16, 16] - elif arch == 68: - first_ch = [32, 64] - second_kernel = 3 - ch_list = [128, 256, 320, 640] - grmul = 1.7 - gr = [14, 16, 20, 40] - n_layers = [8, 16, 16, 16] - else: - raise ValueError("HarDNet-{} is not supported.".format(arch)) - - self.return_idx = return_idx - self._out_channels = [96, 214, 458, 784] - - 
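-        # Editor's note (hedged): the four hard-coded widths above correspond
-        # to the feature maps picked by the default return_idx=[1, 3, 8, 13]
-        # for HarDNet-85; they are not derived from ch_list/gr, so out_shape
-        # would misreport channels if arch or return_idx were changed.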
avg_pool = True - if depth_wise: - second_kernel = 1 - avg_pool = False - - blks = len(n_layers) - self.base = nn.LayerList([]) - - # First Layer: Standard Conv3x3, Stride=2 - self.base.append( - ConvLayer( - in_channels=3, - out_channels=first_ch[0], - kernel_size=3, - stride=2, - bias_attr=False)) - - # Second Layer - self.base.append( - ConvLayer( - first_ch[0], first_ch[1], kernel_size=second_kernel)) - - # Avgpooling or DWConv3x3 downsampling - if avg_pool: - self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1)) - else: - self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) - - # Build all HarDNet blocks - ch = first_ch[1] - for i in range(blks): - blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) - ch = blk.out_channels - self.base.append(blk) - - if i != blks - 1: - self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) - ch = ch_list[i] - if i == 0: - self.base.append( - nn.AvgPool2D( - kernel_size=2, stride=2, ceil_mode=True)) - elif i != blks - 1 and i != 1 and i != 3: - self.base.append(nn.AvgPool2D(kernel_size=2, stride=2)) - - def forward(self, inputs): - x = inputs['image'] - outs = [] - for i, layer in enumerate(self.base): - x = layer(x) - if i in self.return_idx: - outs.append(x) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/hgnet_v2.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/hgnet_v2.py deleted file mode 100644 index 88f989a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/hgnet_v2.py +++ /dev/null @@ -1,447 +0,0 @@ -# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import KaimingNormal, Constant -from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D -from paddle.regularizer import L2Decay -from paddle import ParamAttr - -import copy - -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['PPHGNetV2'] - -kaiming_normal_ = KaimingNormal() -zeros_ = Constant(value=0.) -ones_ = Constant(value=1.) 
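-
-# Editor's sketch (not in the original file): LearnableAffineBlock below
-# applies a scalar affine transform y = scale * x + bias with two learnable
-# parameters trained at a reduced rate (lr_mult * lab_lr). Minimal usage:
-#
-#     lab = LearnableAffineBlock(scale_value=1.0, bias_value=0.0)
-#     y = lab(paddle.ones([2, 4]))  # identity until the scalars are trained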
- - -class LearnableAffineBlock(nn.Layer): - def __init__(self, - scale_value=1.0, - bias_value=0.0, - lr_mult=1.0, - lab_lr=0.01): - super().__init__() - self.scale = self.create_parameter( - shape=[1, ], - default_initializer=Constant(value=scale_value), - attr=ParamAttr(learning_rate=lr_mult * lab_lr)) - self.add_parameter("scale", self.scale) - self.bias = self.create_parameter( - shape=[1, ], - default_initializer=Constant(value=bias_value), - attr=ParamAttr(learning_rate=lr_mult * lab_lr)) - self.add_parameter("bias", self.bias) - - def forward(self, x): - return self.scale * x + self.bias - - -class ConvBNAct(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1, - groups=1, - use_act=True, - use_lab=False, - lr_mult=1.0): - super().__init__() - self.use_act = use_act - self.use_lab = use_lab - self.conv = Conv2D( - in_channels, - out_channels, - kernel_size, - stride, - padding=padding - if isinstance(padding, str) else (kernel_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=False) - self.bn = BatchNorm2D( - out_channels, - weight_attr=ParamAttr( - regularizer=L2Decay(0.0), learning_rate=lr_mult), - bias_attr=ParamAttr( - regularizer=L2Decay(0.0), learning_rate=lr_mult)) - if self.use_act: - self.act = ReLU() - if self.use_lab: - self.lab = LearnableAffineBlock(lr_mult=lr_mult) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.use_act: - x = self.act(x) - if self.use_lab: - x = self.lab(x) - return x - - -class LightConvBNAct(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - groups=1, - use_lab=False, - lr_mult=1.0): - super().__init__() - self.conv1 = ConvBNAct( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - use_act=False, - use_lab=use_lab, - lr_mult=lr_mult) - self.conv2 = ConvBNAct( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=kernel_size, - groups=out_channels, - use_act=True, - use_lab=use_lab, - lr_mult=lr_mult) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - return x - - -class StemBlock(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - use_lab=False, - lr_mult=1.0): - super().__init__() - self.stem1 = ConvBNAct( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=3, - stride=2, - use_lab=use_lab, - lr_mult=lr_mult) - self.stem2a = ConvBNAct( - in_channels=mid_channels, - out_channels=mid_channels // 2, - kernel_size=2, - stride=1, - padding="SAME", - use_lab=use_lab, - lr_mult=lr_mult) - self.stem2b = ConvBNAct( - in_channels=mid_channels // 2, - out_channels=mid_channels, - kernel_size=2, - stride=1, - padding="SAME", - use_lab=use_lab, - lr_mult=lr_mult) - self.stem3 = ConvBNAct( - in_channels=mid_channels * 2, - out_channels=mid_channels, - kernel_size=3, - stride=2, - use_lab=use_lab, - lr_mult=lr_mult) - self.stem4 = ConvBNAct( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - use_lab=use_lab, - lr_mult=lr_mult) - self.pool = nn.MaxPool2D( - kernel_size=2, stride=1, ceil_mode=True, padding="SAME") - - def forward(self, x): - x = self.stem1(x) - x2 = self.stem2a(x) - x2 = self.stem2b(x2) - x1 = self.pool(x) - x = paddle.concat([x1, x2], 1) - x = self.stem3(x) - x = self.stem4(x) - - return x - - -class HG_Block(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - kernel_size=3, - layer_num=6, - identity=False, - light_block=True, 
-                 use_lab=False,
-                 lr_mult=1.0):
-        super().__init__()
-        self.identity = identity
-
-        self.layers = nn.LayerList()
-        block_type = "LightConvBNAct" if light_block else "ConvBNAct"
-        for i in range(layer_num):
-            self.layers.append(
-                eval(block_type)(in_channels=in_channels
-                                 if i == 0 else mid_channels,
-                                 out_channels=mid_channels,
-                                 stride=1,
-                                 kernel_size=kernel_size,
-                                 use_lab=use_lab,
-                                 lr_mult=lr_mult))
-        # feature aggregation
-        total_channels = in_channels + layer_num * mid_channels
-        self.aggregation_squeeze_conv = ConvBNAct(
-            in_channels=total_channels,
-            out_channels=out_channels // 2,
-            kernel_size=1,
-            stride=1,
-            use_lab=use_lab,
-            lr_mult=lr_mult)
-        self.aggregation_excitation_conv = ConvBNAct(
-            in_channels=out_channels // 2,
-            out_channels=out_channels,
-            kernel_size=1,
-            stride=1,
-            use_lab=use_lab,
-            lr_mult=lr_mult)
-
-    def forward(self, x):
-        identity = x
-        output = []
-        output.append(x)
-        for layer in self.layers:
-            x = layer(x)
-            output.append(x)
-        x = paddle.concat(output, axis=1)
-        x = self.aggregation_squeeze_conv(x)
-        x = self.aggregation_excitation_conv(x)
-        if self.identity:
-            x += identity
-        return x
-
-
-class HG_Stage(nn.Layer):
-    def __init__(self,
-                 in_channels,
-                 mid_channels,
-                 out_channels,
-                 block_num,
-                 layer_num=6,
-                 downsample=True,
-                 light_block=True,
-                 kernel_size=3,
-                 use_lab=False,
-                 lr_mult=1.0):
-        super().__init__()
-        self.downsample = downsample
-        if downsample:
-            self.downsample = ConvBNAct(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                kernel_size=3,
-                stride=2,
-                groups=in_channels,
-                use_act=False,
-                use_lab=use_lab,
-                lr_mult=lr_mult)
-
-        blocks_list = []
-        for i in range(block_num):
-            blocks_list.append(
-                HG_Block(
-                    in_channels=in_channels if i == 0 else out_channels,
-                    mid_channels=mid_channels,
-                    out_channels=out_channels,
-                    kernel_size=kernel_size,
-                    layer_num=layer_num,
-                    identity=False if i == 0 else True,
-                    light_block=light_block,
-                    use_lab=use_lab,
-                    lr_mult=lr_mult))
-        self.blocks = nn.Sequential(*blocks_list)
-
-    def forward(self, x):
-        if self.downsample:
-            x = self.downsample(x)
-        x = self.blocks(x)
-        return x
-
-
-def _freeze_norm(m: nn.BatchNorm2D):
-    param_attr = ParamAttr(
-        learning_rate=0., regularizer=L2Decay(0.), trainable=False)
-    bias_attr = ParamAttr(
-        learning_rate=0., regularizer=L2Decay(0.), trainable=False)
-    global_stats = True
-    norm = nn.BatchNorm2D(
-        m._num_features,
-        weight_attr=param_attr,
-        bias_attr=bias_attr,
-        use_global_stats=global_stats)
-    for param in norm.parameters():
-        param.stop_gradient = True
-    return norm
-
-
-def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
-    if isinstance(model, nn.BatchNorm2D):
-        model = reset_func(model)
-    else:
-        for name, child in model.named_children():
-            _child = reset_bn(child, reset_func)
-            if _child is not child:
-                setattr(model, name, _child)
-    return model
-
-
-@register
-@serializable
-class PPHGNetV2(nn.Layer):
-    """
-    PPHGNetV2
-    Args:
-        arch: str. Architecture key, 'L' or 'X'; selects the stem channels and
-            stage configuration (channel widths, block counts, kernel sizes).
-        use_lab: boolean. Whether to use LearnableAffineBlock in the network.
-        lr_mult_list: list. Controls the learning rate of different stages.
-        return_idx: list. Indices of the stages whose outputs are returned.
-    Returns:
-        model: nn.Layer. A PPHGNetV2 instance configured by the args above.
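-    Example (editor's sketch, not from the original file; in PaddleDetection
-    this backbone is normally built from a YAML config rather than directly):
-        import paddle
-        backbone = PPHGNetV2(arch='L', return_idx=[1, 2, 3])
-        feats = backbone({'image': paddle.rand([1, 3, 640, 640])})
-        # -> list of 3 tensors at strides 8, 16, 32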
- """ - - arch_configs = { - 'L': { - 'stem_channels': [3, 32, 48], - 'stage_config': { - # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num - "stage1": [48, 48, 128, 1, False, False, 3, 6], - "stage2": [128, 96, 512, 1, True, False, 3, 6], - "stage3": [512, 192, 1024, 3, True, True, 5, 6], - "stage4": [1024, 384, 2048, 1, True, True, 5, 6], - } - }, - 'X': { - 'stem_channels': [3, 32, 64], - 'stage_config': { - # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num - "stage1": [64, 64, 128, 1, False, False, 3, 6], - "stage2": [128, 128, 512, 2, True, False, 3, 6], - "stage3": [512, 256, 1024, 5, True, True, 5, 6], - "stage4": [1024, 512, 2048, 2, True, True, 5, 6], - } - } - } - - def __init__(self, - arch, - use_lab=False, - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - return_idx=[1, 2, 3], - freeze_stem_only=True, - freeze_at=0, - freeze_norm=True): - super().__init__() - self.use_lab = use_lab - self.return_idx = return_idx - - stem_channels = self.arch_configs[arch]['stem_channels'] - stage_config = self.arch_configs[arch]['stage_config'] - - self._out_strides = [4, 8, 16, 32] - self._out_channels = [stage_config[k][2] for k in stage_config] - - # stem - self.stem = StemBlock( - in_channels=stem_channels[0], - mid_channels=stem_channels[1], - out_channels=stem_channels[2], - use_lab=use_lab, - lr_mult=lr_mult_list[0]) - - # stages - self.stages = nn.LayerList() - for i, k in enumerate(stage_config): - in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ - k] - self.stages.append( - HG_Stage( - in_channels, - mid_channels, - out_channels, - block_num, - layer_num, - downsample, - light_block, - kernel_size, - use_lab, - lr_mult=lr_mult_list[i + 1])) - - if freeze_at >= 0: - self._freeze_parameters(self.stem) - if not freeze_stem_only: - for i in range(min(freeze_at + 1, len(self.stages))): - self._freeze_parameters(self.stages[i]) - - if freeze_norm: - reset_bn(self, reset_func=_freeze_norm) - - self._init_weights() - - def _freeze_parameters(self, m): - for p in m.parameters(): - p.stop_gradient = True - - def _init_weights(self): - for m in self.sublayers(): - if isinstance(m, nn.Conv2D): - kaiming_normal_(m.weight) - elif isinstance(m, (nn.BatchNorm2D)): - ones_(m.weight) - zeros_(m.bias) - elif isinstance(m, nn.Linear): - zeros_(m.bias) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] - - def forward(self, inputs): - x = inputs['image'] - x = self.stem(x) - outs = [] - for idx, stage in enumerate(self.stages): - x = stage(x) - if idx in self.return_idx: - outs.append(x) - return outs diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/hrnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/hrnet.py deleted file mode 100644 index 977edd6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/hrnet.py +++ /dev/null @@ -1,869 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import AdaptiveAvgPool2D, Linear -from paddle.regularizer import L2Decay -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Uniform -from numbers import Integral -import math - -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['HRNet'] - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride=1, - norm_type='bn', - norm_groups=32, - use_dcn=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=False, - act=None, - name=None): - super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn'] - - self.act = act - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=False) - - norm_lr = 0. if freeze_norm else 1. - - param_attr = ParamAttr( - learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) - bias_attr = ParamAttr( - learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) - global_stats = True if freeze_norm else None - if norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D( - ch_out, - momentum=norm_momentum, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats) - elif norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=norm_groups, - num_channels=ch_out, - weight_attr=param_attr, - bias_attr=bias_attr) - norm_params = self.norm.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - def forward(self, inputs): - out = self.conv(inputs) - out = self.norm(out) - - if self.act == 'relu': - out = F.relu(out) - return out - - -class Layer1(nn.Layer): - def __init__(self, - num_channels, - has_se=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(Layer1, self).__init__() - - self.bottleneck_block_list = [] - - for i in range(4): - bottleneck_block = self.add_sublayer( - "block_{}_{}".format(name, i + 1), - BottleneckBlock( - num_channels=num_channels if i == 0 else 256, - num_filters=64, - has_se=has_se, - stride=1, - downsample=True if i == 0 else False, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_' + str(i + 1))) - self.bottleneck_block_list.append(bottleneck_block) - - def forward(self, input): - conv = input - for block_func in self.bottleneck_block_list: - conv = block_func(conv) - return conv - - -class TransitionLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(TransitionLayer, self).__init__() - - num_in = len(in_channels) - num_out = len(out_channels) - out = [] - self.conv_bn_func_list = [] - for i in range(num_out): - residual = None - if i < num_in: - if in_channels[i] != out_channels[i]: - residual = self.add_sublayer( - "transition_{}_layer_{}".format(name, i + 1), - ConvNormLayer( - ch_in=in_channels[i], 
- ch_out=out_channels[i], - filter_size=3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name=name + '_layer_' + str(i + 1))) - else: - residual = self.add_sublayer( - "transition_{}_layer_{}".format(name, i + 1), - ConvNormLayer( - ch_in=in_channels[-1], - ch_out=out_channels[i], - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name=name + '_layer_' + str(i + 1))) - self.conv_bn_func_list.append(residual) - - def forward(self, input): - outs = [] - for idx, conv_bn_func in enumerate(self.conv_bn_func_list): - if conv_bn_func is None: - outs.append(input[idx]) - else: - if idx < len(input): - outs.append(conv_bn_func(input[idx])) - else: - outs.append(conv_bn_func(input[-1])) - return outs - - -class Branches(nn.Layer): - def __init__(self, - block_num, - in_channels, - out_channels, - has_se=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(Branches, self).__init__() - - self.basic_block_list = [] - for i in range(len(out_channels)): - self.basic_block_list.append([]) - for j in range(block_num): - in_ch = in_channels[i] if j == 0 else out_channels[i] - basic_block_func = self.add_sublayer( - "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), - BasicBlock( - num_channels=in_ch, - num_filters=out_channels[i], - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_branch_layer_' + str(i + 1) + '_' + - str(j + 1))) - self.basic_block_list[i].append(basic_block_func) - - def forward(self, inputs): - outs = [] - for idx, input in enumerate(inputs): - conv = input - basic_block_list = self.basic_block_list[idx] - for basic_block_func in basic_block_list: - conv = basic_block_func(conv) - outs.append(conv) - return outs - - -class BottleneckBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - has_se, - stride=1, - downsample=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(BottleneckBlock, self).__init__() - - self.has_se = has_se - self.downsample = downsample - - self.conv1 = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act="relu", - name=name + "_conv1") - self.conv2 = ConvNormLayer( - ch_in=num_filters, - ch_out=num_filters, - filter_size=3, - stride=stride, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act="relu", - name=name + "_conv2") - self.conv3 = ConvNormLayer( - ch_in=num_filters, - ch_out=num_filters * 4, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + "_conv3") - - if self.downsample: - self.conv_down = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters * 4, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + "_downsample") - - if self.has_se: - self.se = SELayer( - num_channels=num_filters * 4, - num_filters=num_filters * 4, - reduction_ratio=16, - name='fc' + name) - - def forward(self, input): - residual = input - conv1 = self.conv1(input) - conv2 = self.conv2(conv1) - conv3 = self.conv3(conv2) - - if self.downsample: - residual = self.conv_down(input) - - if self.has_se: - conv3 = self.se(conv3) - - y = paddle.add(x=residual, y=conv3) - y = F.relu(y) - return y - - -class 
BasicBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride=1, - has_se=False, - downsample=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(BasicBlock, self).__init__() - - self.has_se = has_se - self.downsample = downsample - self.conv1 = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters, - filter_size=3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - stride=stride, - act="relu", - name=name + "_conv1") - self.conv2 = ConvNormLayer( - ch_in=num_filters, - ch_out=num_filters, - filter_size=3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - stride=1, - act=None, - name=name + "_conv2") - - if self.downsample: - self.conv_down = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters * 4, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + "_downsample") - - if self.has_se: - self.se = SELayer( - num_channels=num_filters, - num_filters=num_filters, - reduction_ratio=16, - name='fc' + name) - - def forward(self, input): - residual = input - conv1 = self.conv1(input) - conv2 = self.conv2(conv1) - - if self.downsample: - residual = self.conv_down(input) - - if self.has_se: - conv2 = self.se(conv2) - - y = paddle.add(x=residual, y=conv2) - y = F.relu(y) - return y - - -class SELayer(nn.Layer): - def __init__(self, num_channels, num_filters, reduction_ratio, name=None): - super(SELayer, self).__init__() - - self.pool2d_gap = AdaptiveAvgPool2D(1) - - self._num_channels = num_channels - - med_ch = int(num_channels / reduction_ratio) - stdv = 1.0 / math.sqrt(num_channels * 1.0) - self.squeeze = Linear( - num_channels, - med_ch, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) - - stdv = 1.0 / math.sqrt(med_ch * 1.0) - self.excitation = Linear( - med_ch, - num_filters, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) - - def forward(self, input): - pool = self.pool2d_gap(input) - pool = paddle.squeeze(pool, axis=[2, 3]) - squeeze = self.squeeze(pool) - squeeze = F.relu(squeeze) - excitation = self.excitation(squeeze) - excitation = F.sigmoid(excitation) - excitation = paddle.unsqueeze(excitation, axis=[2, 3]) - out = input * excitation - return out - - -class Stage(nn.Layer): - def __init__(self, - num_channels, - num_modules, - num_filters, - has_se=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - multi_scale_output=True, - name=None): - super(Stage, self).__init__() - - self._num_modules = num_modules - self.stage_func_list = [] - for i in range(num_modules): - if i == num_modules - 1 and not multi_scale_output: - stage_func = self.add_sublayer( - "stage_{}_{}".format(name, i + 1), - HighResolutionModule( - num_channels=num_channels, - num_filters=num_filters, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - multi_scale_output=False, - name=name + '_' + str(i + 1))) - else: - stage_func = self.add_sublayer( - "stage_{}_{}".format(name, i + 1), - HighResolutionModule( - num_channels=num_channels, - num_filters=num_filters, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_' + str(i + 1))) - - self.stage_func_list.append(stage_func) - - def forward(self, input): - out = input - for idx in range(self._num_modules): - out = self.stage_func_list[idx](out) - return out - - -class HighResolutionModule(nn.Layer): - def __init__(self, - 
num_channels, - num_filters, - has_se=False, - multi_scale_output=True, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(HighResolutionModule, self).__init__() - self.branches_func = Branches( - block_num=4, - in_channels=num_channels, - out_channels=num_filters, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name) - - self.fuse_func = FuseLayers( - in_channels=num_filters, - out_channels=num_filters, - multi_scale_output=multi_scale_output, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name) - - def forward(self, input): - out = self.branches_func(input) - out = self.fuse_func(out) - return out - - -class FuseLayers(nn.Layer): - def __init__(self, - in_channels, - out_channels, - multi_scale_output=True, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(FuseLayers, self).__init__() - - self._actual_ch = len(in_channels) if multi_scale_output else 1 - self._in_channels = in_channels - - self.residual_func_list = [] - for i in range(self._actual_ch): - for j in range(len(in_channels)): - residual_func = None - if j > i: - residual_func = self.add_sublayer( - "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), - ConvNormLayer( - ch_in=in_channels[j], - ch_out=out_channels[i], - filter_size=1, - stride=1, - act=None, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_layer_' + str(i + 1) + '_' + - str(j + 1))) - self.residual_func_list.append(residual_func) - elif j < i: - pre_num_filters = in_channels[j] - for k in range(i - j): - if k == i - j - 1: - residual_func = self.add_sublayer( - "residual_{}_layer_{}_{}_{}".format( - name, i + 1, j + 1, k + 1), - ConvNormLayer( - ch_in=pre_num_filters, - ch_out=out_channels[i], - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + '_layer_' + str(i + 1) + '_' + - str(j + 1) + '_' + str(k + 1))) - pre_num_filters = out_channels[i] - else: - residual_func = self.add_sublayer( - "residual_{}_layer_{}_{}_{}".format( - name, i + 1, j + 1, k + 1), - ConvNormLayer( - ch_in=pre_num_filters, - ch_out=out_channels[j], - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act="relu", - name=name + '_layer_' + str(i + 1) + '_' + - str(j + 1) + '_' + str(k + 1))) - pre_num_filters = out_channels[j] - self.residual_func_list.append(residual_func) - - def forward(self, input): - outs = [] - residual_func_idx = 0 - for i in range(self._actual_ch): - residual = input[i] - for j in range(len(self._in_channels)): - if j > i: - y = self.residual_func_list[residual_func_idx](input[j]) - residual_func_idx += 1 - y = F.interpolate(y, scale_factor=2**(j - i)) - residual = paddle.add(x=residual, y=y) - elif j < i: - y = input[j] - for k in range(i - j): - y = self.residual_func_list[residual_func_idx](y) - residual_func_idx += 1 - residual = paddle.add(x=residual, y=y) - residual = F.relu(residual) - outs.append(residual) - - return outs - - -@register -class HRNet(nn.Layer): - """ - HRNet, see https://arxiv.org/abs/1908.07919 - - Args: - width (int): the width of HRNet - has_se (bool): whether to add SE block for each stage - freeze_at (int): the stage to freeze - freeze_norm (bool): whether to freeze norm in HRNet - norm_momentum (float): momentum of BatchNorm - norm_decay (float): weight decay for normalization layer 
weights - return_idx (List): the stage to return - upsample (bool): whether to upsample and concat the backbone feats - """ - - def __init__(self, - width=18, - has_se=False, - freeze_at=0, - freeze_norm=True, - norm_momentum=0.9, - norm_decay=0., - return_idx=[0, 1, 2, 3], - upsample=False, - downsample=False): - super(HRNet, self).__init__() - - self.width = width - self.has_se = has_se - if isinstance(return_idx, Integral): - return_idx = [return_idx] - - assert len(return_idx) > 0, "need one or more return index" - self.freeze_at = freeze_at - self.return_idx = return_idx - self.upsample = upsample - self.downsample = downsample - - self.channels = { - 18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]], - 30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]], - 32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]], - 40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - 44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]], - 48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]], - 60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]], - 64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]] - } - - channels_2, channels_3, channels_4 = self.channels[width] - num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3 - self._out_channels = [sum(channels_4)] if self.upsample else channels_4 - self._out_strides = [4] if self.upsample else [4, 8, 16, 32] - - self.conv_layer1_1 = ConvNormLayer( - ch_in=3, - ch_out=64, - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name="layer1_1") - - self.conv_layer1_2 = ConvNormLayer( - ch_in=64, - ch_out=64, - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name="layer1_2") - - self.la1 = Layer1( - num_channels=64, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="layer2") - - self.tr1 = TransitionLayer( - in_channels=[256], - out_channels=channels_2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="tr1") - - self.st2 = Stage( - num_channels=channels_2, - num_modules=num_modules_2, - num_filters=channels_2, - has_se=self.has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="st2") - - self.tr2 = TransitionLayer( - in_channels=channels_2, - out_channels=channels_3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="tr2") - - self.st3 = Stage( - num_channels=channels_3, - num_modules=num_modules_3, - num_filters=channels_3, - has_se=self.has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="st3") - - self.tr3 = TransitionLayer( - in_channels=channels_3, - out_channels=channels_4, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="tr3") - self.st4 = Stage( - num_channels=channels_4, - num_modules=num_modules_4, - num_filters=channels_4, - has_se=self.has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - multi_scale_output=len(return_idx) > 1, - name="st4") - - if self.downsample: - self.incre_modules, self.downsamp_modules, \ - self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se) - - def _make_layer(self, - block, - inplanes, - planes, - blocks, - stride=1, - norm_momentum=0.9, - has_se=False, - name=None): - downsample = None - if stride != 1 or inplanes != planes * 4: - downsample = 
True - - layers = [] - layers.append( - block( - inplanes, - planes, - has_se, - stride, - downsample, - norm_momentum=norm_momentum, - freeze_norm=False, - name=name + "_s0")) - inplanes = planes * 4 - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - has_se, - norm_momentum=norm_momentum, - freeze_norm=False, - name=name + "_s" + str(i))) - - return nn.Sequential(*layers) - - def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False): - head_block = BottleneckBlock - head_channels = [32, 64, 128, 256] - - # Increasing the #channels on each resolution - # from C, 2C, 4C, 8C to 128, 256, 512, 1024 - incre_modules = [] - for i, channels in enumerate(pre_stage_channels): - incre_module = self._make_layer( - head_block, - channels, - head_channels[i], - 1, - stride=1, - norm_momentum=norm_momentum, - has_se=has_se, - name='incre' + str(i)) - incre_modules.append(incre_module) - incre_modules = nn.LayerList(incre_modules) - - # downsampling modules - downsamp_modules = [] - for i in range(len(pre_stage_channels) - 1): - in_channels = head_channels[i] * 4 - out_channels = head_channels[i + 1] * 4 - - downsamp_module = nn.Sequential( - nn.Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=3, - stride=2, - padding=1), - nn.BatchNorm2D( - out_channels, momentum=norm_momentum), - nn.ReLU()) - - downsamp_modules.append(downsamp_module) - downsamp_modules = nn.LayerList(downsamp_modules) - - final_layer = nn.Sequential( - nn.Conv2D( - in_channels=head_channels[3] * 4, - out_channels=2048, - kernel_size=1, - stride=1, - padding=0), - nn.BatchNorm2D( - 2048, momentum=norm_momentum), - nn.ReLU()) - - return incre_modules, downsamp_modules, final_layer - - def forward(self, inputs): - x = inputs['image'] - conv1 = self.conv_layer1_1(x) - conv2 = self.conv_layer1_2(conv1) - - la1 = self.la1(conv2) - tr1 = self.tr1([la1]) - st2 = self.st2(tr1) - tr2 = self.tr2(st2) - - st3 = self.st3(tr2) - tr3 = self.tr3(st3) - - st4 = self.st4(tr3) - - if self.upsample: - # Upsampling - x0_h, x0_w = st4[0].shape[2:4] - x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear') - x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear') - x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear') - x = paddle.concat([st4[0], x1, x2, x3], 1) - return x - - if self.downsample: - y = self.incre_modules[0](st4[0]) - for i in range(len(self.downsamp_modules)): - y = self.incre_modules[i+1](st4[i+1]) + \ - self.downsamp_modules[i](y) - y = self.final_layer(y) - return y - - res = [] - for i, layer in enumerate(st4): - if i == self.freeze_at: - layer.stop_gradient = True - if i in self.return_idx: - res.append(layer) - - return res - - @property - def out_shape(self): - if self.upsample: - self.return_idx = [0] - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/lcnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/lcnet.py deleted file mode 100644 index 76da139..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/lcnet.py +++ /dev/null @@ -1,271 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.nn import AdaptiveAvgPool2D, Conv2D -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingNormal - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec - -__all__ = ['LCNet'] - -NET_CONFIG = { - "blocks2": - #k, in_c, out_c, s, use_se - [[3, 16, 32, 1, False], ], - "blocks3": [ - [3, 32, 64, 2, False], - [3, 64, 64, 1, False], - ], - "blocks4": [ - [3, 64, 128, 2, False], - [3, 128, 128, 1, False], - ], - "blocks5": [ - [3, 128, 256, 2, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - ], - "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] -} - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - num_groups=1, - act='hard_swish'): - super().__init__() - - self.conv = Conv2D( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=num_groups, - weight_attr=ParamAttr(initializer=KaimingNormal()), - bias_attr=False) - - self.bn = nn.BatchNorm2D( - num_filters, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - if act == 'hard_swish': - self.act = nn.Hardswish() - elif act == 'relu6': - self.act = nn.ReLU6() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.act(x) - return x - - -class DepthwiseSeparable(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - dw_size=3, - use_se=False, - act='hard_swish'): - super().__init__() - self.use_se = use_se - self.dw_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_channels, - filter_size=dw_size, - stride=stride, - num_groups=num_channels, - act=act) - if use_se: - self.se = SEModule(num_channels) - self.pw_conv = ConvBNLayer( - num_channels=num_channels, - filter_size=1, - num_filters=num_filters, - stride=1, - act=act) - - def forward(self, x): - x = self.dw_conv(x) - if self.use_se: - x = self.se(x) - x = self.pw_conv(x) - return x - - -class SEModule(nn.Layer): - def __init__(self, channel, reduction=4): - super().__init__() - self.avg_pool = AdaptiveAvgPool2D(1) - self.conv1 = Conv2D( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0) - self.relu = nn.ReLU() - self.conv2 = Conv2D( - in_channels=channel // reduction, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0) - self.hardsigmoid = nn.Hardsigmoid() - - def forward(self, x): - identity = x - x = self.avg_pool(x) - x = self.conv1(x) - x = self.relu(x) - x 
= self.conv2(x) - x = self.hardsigmoid(x) - x = paddle.multiply(x=identity, y=x) - return x - - -@register -@serializable -class LCNet(nn.Layer): - def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'): - super().__init__() - self.scale = scale - self.feature_maps = feature_maps - - out_channels = [] - - self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - num_filters=make_divisible(16 * scale), - stride=2, - act=act) - - self.blocks2 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) - ]) - - self.blocks3 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks3"][-1][2] * scale)) - - self.blocks4 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks4"][-1][2] * scale)) - - self.blocks5 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks5"][-1][2] * scale)) - - self.blocks6 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks6"][-1][2] * scale)) - self._out_channels = [ - ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps - ] - - def forward(self, inputs): - x = inputs['image'] - outs = [] - - x = self.conv1(x) - x = self.blocks2(x) - x = self.blocks3(x) - outs.append(x) - x = self.blocks4(x) - outs.append(x) - x = self.blocks5(x) - outs.append(x) - x = self.blocks6(x) - outs.append(x) - outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps] - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/lite_hrnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/lite_hrnet.py deleted file mode 100644 index 95e3a26..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/lite_hrnet.py +++ /dev/null @@ -1,891 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on -https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py -""" - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from numbers import Integral -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Normal, Constant -from ppdet.core.workspace import register -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.modeling.ops import channel_shuffle -from .. import layers as L - -__all__ = ['LiteHRNet'] - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride=1, - groups=1, - norm_type=None, - norm_groups=32, - norm_decay=0., - freeze_norm=False, - act=None): - super(ConvNormLayer, self).__init__() - self.act = act - norm_lr = 0. if freeze_norm else 1. - if norm_type is not None: - assert norm_type in ['bn', 'sync_bn', 'gn'], \ - "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type) - param_attr = ParamAttr( - initializer=Constant(1.0), - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), ) - bias_attr = ParamAttr( - learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) - global_stats = True if freeze_norm else None - if norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D( - ch_out, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats, ) - elif norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=norm_groups, - num_channels=ch_out, - weight_attr=param_attr, - bias_attr=bias_attr) - norm_params = self.norm.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - conv_bias_attr = False - else: - conv_bias_attr = True - self.norm = None - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.001)), - bias_attr=conv_bias_attr) - - def forward(self, inputs): - out = self.conv(inputs) - if self.norm is not None: - out = self.norm(out) - - if self.act == 'relu': - out = F.relu(out) - elif self.act == 'sigmoid': - out = F.sigmoid(out) - return out - - -class DepthWiseSeparableConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride=1, - dw_norm_type=None, - pw_norm_type=None, - norm_decay=0., - freeze_norm=False, - dw_act=None, - pw_act=None): - super(DepthWiseSeparableConvNormLayer, self).__init__() - self.depthwise_conv = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_in, - filter_size=filter_size, - stride=stride, - groups=ch_in, - norm_type=dw_norm_type, - act=dw_act, - norm_decay=norm_decay, - freeze_norm=freeze_norm, ) - self.pointwise_conv = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - norm_type=pw_norm_type, - act=pw_act, - norm_decay=norm_decay, - freeze_norm=freeze_norm, ) - - def forward(self, x): - x = self.depthwise_conv(x) - x = self.pointwise_conv(x) - return x - - -class CrossResolutionWeightingModule(nn.Layer): - def __init__(self, - channels, - ratio=16, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(CrossResolutionWeightingModule, self).__init__() - self.channels = channels - total_channel = sum(channels) - self.conv1 = ConvNormLayer( - ch_in=total_channel, - ch_out=total_channel // ratio, - filter_size=1, - stride=1, - norm_type=norm_type, - 
act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.conv2 = ConvNormLayer( - ch_in=total_channel // ratio, - ch_out=total_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='sigmoid', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - def forward(self, x): - mini_size = x[-1].shape[-2:] - out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] - out = paddle.concat(out, 1) - out = self.conv1(out) - out = self.conv2(out) - out = paddle.split(out, self.channels, 1) - out = [ - s * F.interpolate( - a, s.shape[-2:], mode='nearest') for s, a in zip(x, out) - ] - return out - - -class SpatialWeightingModule(nn.Layer): - def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.): - super(SpatialWeightingModule, self).__init__() - self.global_avgpooling = nn.AdaptiveAvgPool2D(1) - self.conv1 = ConvNormLayer( - ch_in=in_channel, - ch_out=in_channel // ratio, - filter_size=1, - stride=1, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.conv2 = ConvNormLayer( - ch_in=in_channel // ratio, - ch_out=in_channel, - filter_size=1, - stride=1, - act='sigmoid', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - def forward(self, x): - out = self.global_avgpooling(x) - out = self.conv1(out) - out = self.conv2(out) - return x * out - - -class ConditionalChannelWeightingBlock(nn.Layer): - def __init__(self, - in_channels, - stride, - reduce_ratio, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(ConditionalChannelWeightingBlock, self).__init__() - assert stride in [1, 2] - branch_channels = [channel // 2 for channel in in_channels] - - self.cross_resolution_weighting = CrossResolutionWeightingModule( - branch_channels, - ratio=reduce_ratio, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.depthwise_convs = nn.LayerList([ - ConvNormLayer( - channel, - channel, - filter_size=3, - stride=stride, - groups=channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay) for channel in branch_channels - ]) - - self.spatial_weighting = nn.LayerList([ - SpatialWeightingModule( - channel, - ratio=4, - freeze_norm=freeze_norm, - norm_decay=norm_decay) for channel in branch_channels - ]) - - def forward(self, x): - x = [s.chunk(2, axis=1) for s in x] - x1 = [s[0] for s in x] - x2 = [s[1] for s in x] - - x2 = self.cross_resolution_weighting(x2) - x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] - x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] - - out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)] - out = [channel_shuffle(s, groups=2) for s in out] - return out - - -class ShuffleUnit(nn.Layer): - def __init__(self, - in_channel, - out_channel, - stride, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(ShuffleUnit, self).__init__() - branch_channel = out_channel // 2 - self.stride = stride - if self.stride == 1: - assert in_channel == branch_channel * 2, \ - "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2) - if stride > 1: - self.branch1 = nn.Sequential( - ConvNormLayer( - ch_in=in_channel, - ch_out=in_channel, - filter_size=3, - stride=self.stride, - groups=in_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=in_channel, - ch_out=branch_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), ) - self.branch2 = nn.Sequential( - 
ConvNormLayer( - ch_in=branch_channel if stride == 1 else in_channel, - ch_out=branch_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=branch_channel, - ch_out=branch_channel, - filter_size=3, - stride=self.stride, - groups=branch_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=branch_channel, - ch_out=branch_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), ) - - def forward(self, x): - if self.stride > 1: - x1 = self.branch1(x) - x2 = self.branch2(x) - else: - x1, x2 = x.chunk(2, axis=1) - x2 = self.branch2(x2) - out = paddle.concat([x1, x2], axis=1) - out = channel_shuffle(out, groups=2) - return out - - -class IterativeHead(nn.Layer): - def __init__(self, - in_channels, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(IterativeHead, self).__init__() - num_branches = len(in_channels) - self.in_channels = in_channels[::-1] - - projects = [] - for i in range(num_branches): - if i != num_branches - 1: - projects.append( - DepthWiseSeparableConvNormLayer( - ch_in=self.in_channels[i], - ch_out=self.in_channels[i + 1], - filter_size=3, - stride=1, - dw_act=None, - pw_act='relu', - dw_norm_type=norm_type, - pw_norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - else: - projects.append( - DepthWiseSeparableConvNormLayer( - ch_in=self.in_channels[i], - ch_out=self.in_channels[i], - filter_size=3, - stride=1, - dw_act=None, - pw_act='relu', - dw_norm_type=norm_type, - pw_norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - self.projects = nn.LayerList(projects) - - def forward(self, x): - x = x[::-1] - y = [] - last_x = None - for i, s in enumerate(x): - if last_x is not None: - last_x = F.interpolate( - last_x, - size=s.shape[-2:], - mode='bilinear', - align_corners=True) - s = s + last_x - s = self.projects[i](s) - y.append(s) - last_x = s - - return y[::-1] - - -class Stem(nn.Layer): - def __init__(self, - in_channel, - stem_channel, - out_channel, - expand_ratio, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(Stem, self).__init__() - self.conv1 = ConvNormLayer( - in_channel, - stem_channel, - filter_size=3, - stride=2, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - mid_channel = int(round(stem_channel * expand_ratio)) - branch_channel = stem_channel // 2 - if stem_channel == out_channel: - inc_channel = out_channel - branch_channel - else: - inc_channel = out_channel - stem_channel - self.branch1 = nn.Sequential( - ConvNormLayer( - ch_in=branch_channel, - ch_out=branch_channel, - filter_size=3, - stride=2, - groups=branch_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=branch_channel, - ch_out=inc_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), ) - self.expand_conv = ConvNormLayer( - ch_in=branch_channel, - ch_out=mid_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.depthwise_conv = ConvNormLayer( - ch_in=mid_channel, - ch_out=mid_channel, - filter_size=3, - stride=2, - groups=mid_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.linear_conv = ConvNormLayer( - 
ch_in=mid_channel, - ch_out=branch_channel - if stem_channel == out_channel else stem_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - def forward(self, x): - x = self.conv1(x) - x1, x2 = x.chunk(2, axis=1) - x1 = self.branch1(x1) - x2 = self.expand_conv(x2) - x2 = self.depthwise_conv(x2) - x2 = self.linear_conv(x2) - out = paddle.concat([x1, x2], axis=1) - out = channel_shuffle(out, groups=2) - - return out - - -class LiteHRNetModule(nn.Layer): - def __init__(self, - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=False, - with_fuse=True, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(LiteHRNetModule, self).__init__() - assert num_branches == len(in_channels),\ - "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels)) - assert module_type in [ - 'LITE', 'NAIVE' - ], "module_type should be one of ['LITE', 'NAIVE']" - self.num_branches = num_branches - self.in_channels = in_channels - self.multiscale_output = multiscale_output - self.with_fuse = with_fuse - self.norm_type = 'bn' - self.module_type = module_type - - if self.module_type == 'LITE': - self.layers = self._make_weighting_blocks( - num_blocks, - reduce_ratio, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - elif self.module_type == 'NAIVE': - self.layers = self._make_naive_branches( - num_branches, - num_blocks, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - if self.with_fuse: - self.fuse_layers = self._make_fuse_layers( - freeze_norm=freeze_norm, norm_decay=norm_decay) - self.relu = nn.ReLU() - - def _make_weighting_blocks(self, - num_blocks, - reduce_ratio, - stride=1, - freeze_norm=False, - norm_decay=0.): - layers = [] - for i in range(num_blocks): - layers.append( - ConditionalChannelWeightingBlock( - self.in_channels, - stride=stride, - reduce_ratio=reduce_ratio, - norm_type=self.norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - return nn.Sequential(*layers) - - def _make_naive_branches(self, - num_branches, - num_blocks, - freeze_norm=False, - norm_decay=0.): - branches = [] - for branch_idx in range(num_branches): - layers = [] - for i in range(num_blocks): - layers.append( - ShuffleUnit( - self.in_channels[branch_idx], - self.in_channels[branch_idx], - stride=1, - norm_type=self.norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - branches.append(nn.Sequential(*layers)) - return nn.LayerList(branches) - - def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.): - if self.num_branches == 1: - return None - fuse_layers = [] - num_out_branches = self.num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(self.num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - L.Conv2d( - self.in_channels[j], - self.in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(self.in_channels[i]), - nn.Upsample( - scale_factor=2**(j - i), mode='nearest'))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - L.Conv2d( - self.in_channels[j], - self.in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=self.in_channels[j], - bias=False, ), - nn.BatchNorm2D(self.in_channels[j]), - L.Conv2d( - self.in_channels[j], - self.in_channels[i], - kernel_size=1, - stride=1, - padding=0, - 
bias=False, ), - nn.BatchNorm2D(self.in_channels[i]))) - else: - conv_downsamples.append( - nn.Sequential( - L.Conv2d( - self.in_channels[j], - self.in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=self.in_channels[j], - bias=False, ), - nn.BatchNorm2D(self.in_channels[j]), - L.Conv2d( - self.in_channels[j], - self.in_channels[j], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(self.in_channels[j]), - nn.ReLU())) - - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.LayerList(fuse_layer)) - - return nn.LayerList(fuse_layers) - - def forward(self, x): - if self.num_branches == 1: - return [self.layers[0](x[0])] - if self.module_type == 'LITE': - out = self.layers(x) - elif self.module_type == 'NAIVE': - for i in range(self.num_branches): - x[i] = self.layers[i](x[i]) - out = x - if self.with_fuse: - out_fuse = [] - for i in range(len(self.fuse_layers)): - y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) - for j in range(self.num_branches): - if j == 0: - y += y - elif i == j: - y += out[j] - else: - y += self.fuse_layers[i][j](out[j]) - if i == 0: - out[i] = y - out_fuse.append(self.relu(y)) - out = out_fuse - elif not self.multiscale_output: - out = [out[0]] - return out - - -@register -class LiteHRNet(nn.Layer): - """ - @inproceedings{Yulitehrnet21, - title={Lite-HRNet: A Lightweight High-Resolution Network}, - author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, - booktitle={CVPR},year={2021} - } - Args: - network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"], - "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet. - "wider_naive": Naive network with wider channels in each block. - "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting. - "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18. 
- freeze_at (int): the stage to freeze - freeze_norm (bool): whether to freeze norm in HRNet - norm_decay (float): weight decay for normalization layer weights - return_idx (List): the stage to return - """ - - def __init__(self, - network_type, - freeze_at=0, - freeze_norm=True, - norm_decay=0., - return_idx=[0, 1, 2, 3]): - super(LiteHRNet, self).__init__() - if isinstance(return_idx, Integral): - return_idx = [return_idx] - assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \ - "the network_type should be one of [lite_18, lite_30, naive, wider_naive]" - assert len(return_idx) > 0, "need one or more return index" - self.freeze_at = freeze_at - self.freeze_norm = freeze_norm - self.norm_decay = norm_decay - self.return_idx = return_idx - self.norm_type = 'bn' - - self.module_configs = { - "lite_18": { - "num_modules": [2, 4, 2], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["LITE", "LITE", "LITE"], - "reduce_ratios": [8, 8, 8], - "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - }, - "lite_30": { - "num_modules": [3, 8, 3], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["LITE", "LITE", "LITE"], - "reduce_ratios": [8, 8, 8], - "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - }, - "naive": { - "num_modules": [2, 4, 2], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["NAIVE", "NAIVE", "NAIVE"], - "reduce_ratios": [1, 1, 1], - "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]], - }, - "wider_naive": { - "num_modules": [2, 4, 2], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["NAIVE", "NAIVE", "NAIVE"], - "reduce_ratios": [1, 1, 1], - "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - }, - } - - self.stages_config = self.module_configs[network_type] - - self.stem = Stem(3, 32, 32, 1) - num_channels_pre_layer = [32] - for stage_idx in range(3): - num_channels = self.stages_config["num_channels"][stage_idx] - setattr(self, 'transition{}'.format(stage_idx), - self._make_transition_layer(num_channels_pre_layer, - num_channels, self.freeze_norm, - self.norm_decay)) - stage, num_channels_pre_layer = self._make_stage( - self.stages_config, stage_idx, num_channels, True, - self.freeze_norm, self.norm_decay) - setattr(self, 'stage{}'.format(stage_idx), stage) - self.head_layer = IterativeHead(num_channels_pre_layer, 'bn', - self.freeze_norm, self.norm_decay) - - def _make_transition_layer(self, - num_channels_pre_layer, - num_channels_cur_layer, - freeze_norm=False, - norm_decay=0.): - num_branches_pre = len(num_channels_pre_layer) - num_branches_cur = len(num_channels_cur_layer) - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - L.Conv2d( - num_channels_pre_layer[i], - num_channels_pre_layer[i], - kernel_size=3, - stride=1, - padding=1, - groups=num_channels_pre_layer[i], - bias=False), - nn.BatchNorm2D(num_channels_pre_layer[i]), - L.Conv2d( - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(num_channels_cur_layer[i]), - nn.ReLU())) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - conv_downsamples.append( - nn.Sequential( - L.Conv2d( - num_channels_pre_layer[-1], - num_channels_pre_layer[-1], - 
groups=num_channels_pre_layer[-1], - kernel_size=3, - stride=2, - padding=1, - bias=False, ), - nn.BatchNorm2D(num_channels_pre_layer[-1]), - L.Conv2d( - num_channels_pre_layer[-1], - num_channels_cur_layer[i] - if j == i - num_branches_pre else - num_channels_pre_layer[-1], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(num_channels_cur_layer[i] - if j == i - num_branches_pre else - num_channels_pre_layer[-1]), - nn.ReLU())) - transition_layers.append(nn.Sequential(*conv_downsamples)) - return nn.LayerList(transition_layers) - - def _make_stage(self, - stages_config, - stage_idx, - in_channels, - multiscale_output, - freeze_norm=False, - norm_decay=0.): - num_modules = stages_config["num_modules"][stage_idx] - num_branches = stages_config["num_branches"][stage_idx] - num_blocks = stages_config["num_blocks"][stage_idx] - reduce_ratio = stages_config['reduce_ratios'][stage_idx] - module_type = stages_config['module_type'][stage_idx] - - modules = [] - for i in range(num_modules): - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - modules.append( - LiteHRNetModule( - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=reset_multiscale_output, - with_fuse=True, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - in_channels = modules[-1].in_channels - return nn.Sequential(*modules), in_channels - - def forward(self, inputs): - x = inputs['image'] - dims = x.shape - if len(dims) == 5: - x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3], - dims[4])) # [6, 3, 128, 96] - - x = self.stem(x) - y_list = [x] - for stage_idx in range(3): - x_list = [] - transition = getattr(self, 'transition{}'.format(stage_idx)) - for j in range(self.stages_config["num_branches"][stage_idx]): - if transition[j] is not None: - if j >= len(y_list): - x_list.append(transition[j](y_list[-1])) - else: - x_list.append(transition[j](y_list[j])) - else: - x_list.append(y_list[j]) - y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list) - x = self.head_layer(y_list) - res = [] - for i, layer in enumerate(x): - if i == self.freeze_at: - layer.stop_gradient = True - if i in self.return_idx: - res.append(layer) - return res - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v1.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v1.py deleted file mode 100644 index a39435b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v1.py +++ /dev/null @@ -1,402 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
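The `ShuffleUnit`, `Stem`, and `ConditionalChannelWeightingBlock` layers removed from lite_hrnet.py above all finish by remixing their two concatenated branches with `channel_shuffle(out, groups=2)`. That helper is defined earlier in the deleted file; for reference, a minimal standalone sketch of the standard ShuffleNet-style formulation it follows (assuming paddle 2.x dygraph):

```python
import paddle

def channel_shuffle(x, groups):
    # [N, C, H, W] -> [N, groups, C // groups, H, W]
    n, c, h, w = x.shape
    x = paddle.reshape(x, [n, groups, c // groups, h, w])
    # swap the group axis with the per-group channel axis, then flatten back
    x = paddle.transpose(x, [0, 2, 1, 3, 4])
    return paddle.reshape(x, [n, c, h, w])

# Two concatenated 4-channel branches: shuffling with groups=2 interleaves
# them, so the next grouped/depthwise conv sees information from both.
x = paddle.concat(
    [paddle.zeros([1, 4, 8, 8]), paddle.ones([1, 4, 8, 8])], axis=1)
print(channel_shuffle(x, 2)[0, :, 0, 0].numpy())  # [0. 1. 0. 1. 0. 1. 0. 1.]
```

The shuffle costs no parameters and no FLOPs beyond a reshape/transpose, which is why every split-and-concat block in the file can afford to apply it.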
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingNormal -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec - -__all__ = ['MobileNet'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - num_groups=1, - act='relu', - conv_lr=1., - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=conv_lr, - initializer=KaimingNormal(), - regularizer=L2Decay(conv_decay)), - bias_attr=False) - - param_attr = ParamAttr(regularizer=L2Decay(norm_decay)) - bias_attr = ParamAttr(regularizer=L2Decay(norm_decay)) - if norm_type in ['sync_bn', 'bn']: - self._batch_norm = nn.BatchNorm2D( - out_channels, weight_attr=param_attr, bias_attr=bias_attr) - - def forward(self, x): - x = self._conv(x) - x = self._batch_norm(x) - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - return x - - -class DepthwiseSeparable(nn.Layer): - def __init__(self, - in_channels, - out_channels1, - out_channels2, - num_groups, - stride, - scale, - conv_lr=1., - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(DepthwiseSeparable, self).__init__() - - self._depthwise_conv = ConvBNLayer( - in_channels, - int(out_channels1 * scale), - kernel_size=3, - stride=stride, - padding=1, - num_groups=int(num_groups * scale), - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_dw") - - self._pointwise_conv = ConvBNLayer( - int(out_channels1 * scale), - int(out_channels2 * scale), - kernel_size=1, - stride=1, - padding=0, - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_sep") - - def forward(self, x): - x = self._depthwise_conv(x) - x = self._pointwise_conv(x) - return x - - -class ExtraBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels1, - out_channels2, - num_groups=1, - stride=2, - conv_lr=1., - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ExtraBlock, self).__init__() - - self.pointwise_conv = ConvBNLayer( - in_channels, - int(out_channels1), - kernel_size=1, - stride=1, - padding=0, - num_groups=int(num_groups), - act='relu6', - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_extra1") - - self.normal_conv = ConvBNLayer( - int(out_channels1), - int(out_channels2), - kernel_size=3, - stride=stride, - padding=1, - num_groups=int(num_groups), - act='relu6', - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_extra2") - - def forward(self, x): - x = self.pointwise_conv(x) - x = self.normal_conv(x) - return x - - -@register -@serializable -class MobileNet(nn.Layer): - __shared__ = ['norm_type'] - - def __init__(self, - norm_type='bn', - norm_decay=0., - conv_decay=0., - scale=1, - conv_learning_rate=1.0, - feature_maps=[4, 6, 13], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], 
- [64, 128]]): - super(MobileNet, self).__init__() - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - self.feature_maps = feature_maps - self.with_extra_blocks = with_extra_blocks - self.extra_block_filters = extra_block_filters - - self._out_channels = [] - - self.conv1 = ConvBNLayer( - in_channels=3, - out_channels=int(32 * scale), - kernel_size=3, - stride=2, - padding=1, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv1") - - self.dwsl = [] - dws21 = self.add_sublayer( - "conv2_1", - sublayer=DepthwiseSeparable( - in_channels=int(32 * scale), - out_channels1=32, - out_channels2=64, - num_groups=32, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv2_1")) - self.dwsl.append(dws21) - self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps) - dws22 = self.add_sublayer( - "conv2_2", - sublayer=DepthwiseSeparable( - in_channels=int(64 * scale), - out_channels1=64, - out_channels2=128, - num_groups=64, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv2_2")) - self.dwsl.append(dws22) - self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) - # 1/4 - dws31 = self.add_sublayer( - "conv3_1", - sublayer=DepthwiseSeparable( - in_channels=int(128 * scale), - out_channels1=128, - out_channels2=128, - num_groups=128, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv3_1")) - self.dwsl.append(dws31) - self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) - dws32 = self.add_sublayer( - "conv3_2", - sublayer=DepthwiseSeparable( - in_channels=int(128 * scale), - out_channels1=128, - out_channels2=256, - num_groups=128, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv3_2")) - self.dwsl.append(dws32) - self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) - # 1/8 - dws41 = self.add_sublayer( - "conv4_1", - sublayer=DepthwiseSeparable( - in_channels=int(256 * scale), - out_channels1=256, - out_channels2=256, - num_groups=256, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv4_1")) - self.dwsl.append(dws41) - self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) - dws42 = self.add_sublayer( - "conv4_2", - sublayer=DepthwiseSeparable( - in_channels=int(256 * scale), - out_channels1=256, - out_channels2=512, - num_groups=256, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv4_2")) - self.dwsl.append(dws42) - self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) - # 1/16 - for i in range(5): - tmp = self.add_sublayer( - "conv5_" + str(i + 1), - sublayer=DepthwiseSeparable( - in_channels=int(512 * scale), - out_channels1=512, - out_channels2=512, - num_groups=512, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv5_" + str(i + 1))) - self.dwsl.append(tmp) - self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) - dws56 = self.add_sublayer( - "conv5_6", - 
sublayer=DepthwiseSeparable( - in_channels=int(512 * scale), - out_channels1=512, - out_channels2=1024, - num_groups=512, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv5_6")) - self.dwsl.append(dws56) - self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) - # 1/32 - dws6 = self.add_sublayer( - "conv6", - sublayer=DepthwiseSeparable( - in_channels=int(1024 * scale), - out_channels1=1024, - out_channels2=1024, - num_groups=1024, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv6")) - self.dwsl.append(dws6) - self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) - - if self.with_extra_blocks: - self.extra_blocks = [] - for i, block_filter in enumerate(self.extra_block_filters): - in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1] - conv_extra = self.add_sublayer( - "conv7_" + str(i + 1), - sublayer=ExtraBlock( - in_c, - block_filter[0], - block_filter[1], - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv7_" + str(i + 1))) - self.extra_blocks.append(conv_extra) - self._update_out_channels( - block_filter[1], - len(self.dwsl) + len(self.extra_blocks), feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - outs = [] - y = self.conv1(inputs['image']) - for i, block in enumerate(self.dwsl): - y = block(y) - if i + 1 in self.feature_maps: - outs.append(y) - - if not self.with_extra_blocks: - return outs - - y = outs[-1] - for i, block in enumerate(self.extra_blocks): - idx = i + len(self.dwsl) - y = block(y) - if idx + 1 in self.feature_maps: - outs.append(y) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v3.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v3.py deleted file mode 100644 index 2bd8856..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v3.py +++ /dev/null @@ -1,478 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
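The `DepthwiseSeparable` block deleted from mobilenet_v1.py above is the MobileNet v1 primitive: a 3x3 depthwise `ConvBNLayer` (groups equal to channels) followed by a 1x1 pointwise one. A standalone sketch of the parameter saving against a dense 3x3 convolution (`n_params` is an ad-hoc helper here, not part of the deleted code):

```python
import math
import paddle.nn as nn

cin, cout, k = 256, 512, 3

dense = nn.Conv2D(cin, cout, k, padding=1, bias_attr=False)
separable = nn.Sequential(
    nn.Conv2D(cin, cin, k, padding=1, groups=cin, bias_attr=False),  # depthwise
    nn.Conv2D(cin, cout, 1, bias_attr=False),                        # pointwise
)

def n_params(layer):
    return sum(math.prod(p.shape) for p in layer.parameters())

print(n_params(dense))      # 1179648 = 256 * 512 * 3 * 3
print(n_params(separable))  # 133376  = 256 * 3 * 3 + 256 * 512
```

At these widths the factorization is roughly a 9x reduction in weights (and similarly in FLOPs), which is what lets the deleted backbone stack thirteen such blocks from conv2_1 through conv6 and stay lightweight.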
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec - -__all__ = ['MobileNetV3'] - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_c, - out_c, - filter_size, - stride, - padding, - num_groups=1, - act=None, - lr_mult=1., - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=""): - super(ConvBNLayer, self).__init__() - self.act = act - self.conv = nn.Conv2D( - in_channels=in_c, - out_channels=out_c, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), - bias_attr=False) - - norm_lr = 0. if freeze_norm else lr_mult - param_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - global_stats = True if freeze_norm else None - if norm_type in ['sync_bn', 'bn']: - self.bn = nn.BatchNorm2D( - out_c, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats) - norm_params = self.bn.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.act is not None: - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - elif self.act == "hard_swish": - x = F.hardswish(x) - else: - raise NotImplementedError( - "The activation function is selected incorrectly.") - return x - - -class ResidualUnit(nn.Layer): - def __init__(self, - in_c, - mid_c, - out_c, - filter_size, - stride, - use_se, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - act=None, - return_list=False, - name=''): - super(ResidualUnit, self).__init__() - self.if_shortcut = stride == 1 and in_c == out_c - self.use_se = use_se - self.return_list = return_list - - self.expand_conv = ConvBNLayer( - in_c=in_c, - out_c=mid_c, - filter_size=1, - stride=1, - padding=0, - act=act, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_expand") - self.bottleneck_conv = ConvBNLayer( - in_c=mid_c, - out_c=mid_c, - filter_size=filter_size, - stride=stride, - padding=int((filter_size - 1) // 2), - num_groups=mid_c, - act=act, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_depthwise") - if self.use_se: - self.mid_se = SEModule( - mid_c, lr_mult, conv_decay, name=name + "_se") - self.linear_conv = ConvBNLayer( - in_c=mid_c, - out_c=out_c, - filter_size=1, - stride=1, - padding=0, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_linear") - - def forward(self, inputs): - y = self.expand_conv(inputs) - x = 
self.bottleneck_conv(y) - if self.use_se: - x = self.mid_se(x) - x = self.linear_conv(x) - if self.if_shortcut: - x = paddle.add(inputs, x) - if self.return_list: - return [y, x] - else: - return x - - -class SEModule(nn.Layer): - def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""): - super(SEModule, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2D(1) - mid_channels = int(channel // reduction) - self.conv1 = nn.Conv2D( - in_channels=channel, - out_channels=mid_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), - bias_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) - self.conv2 = nn.Conv2D( - in_channels=mid_channels, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), - bias_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) - - def forward(self, inputs): - outputs = self.avg_pool(inputs) - outputs = self.conv1(outputs) - outputs = F.relu(outputs) - outputs = self.conv2(outputs) - outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) - return paddle.multiply(x=inputs, y=outputs) - - -class ExtraBlockDW(nn.Layer): - def __init__(self, - in_c, - ch_1, - ch_2, - stride, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None): - super(ExtraBlockDW, self).__init__() - self.pointwise_conv = ConvBNLayer( - in_c=in_c, - out_c=ch_1, - filter_size=1, - stride=1, - padding='SAME', - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra1") - self.depthwise_conv = ConvBNLayer( - in_c=ch_1, - out_c=ch_2, - filter_size=3, - stride=stride, - padding='SAME', - num_groups=int(ch_1), - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_dw") - self.normal_conv = ConvBNLayer( - in_c=ch_2, - out_c=ch_2, - filter_size=1, - stride=1, - padding='SAME', - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_sep") - - def forward(self, inputs): - x = self.pointwise_conv(inputs) - x = self.depthwise_conv(x) - x = self.normal_conv(x) - return x - - -@register -@serializable -class MobileNetV3(nn.Layer): - __shared__ = ['norm_type'] - - def __init__( - self, - scale=1.0, - model_name="large", - feature_maps=[6, 12, 15], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - conv_decay=0.0, - multiplier=1.0, - norm_type='bn', - norm_decay=0.0, - freeze_norm=False): - super(MobileNetV3, self).__init__() - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - if norm_type == 'sync_bn' and freeze_norm: - raise ValueError( - "The norm_type should not be sync_bn when freeze_norm is True") - self.feature_maps = feature_maps - self.with_extra_blocks = with_extra_blocks - self.extra_block_filters = extra_block_filters - - inplanes = 16 - if model_name == "large": - self.cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, False, "relu", 1], - [3, 64, 24, False, "relu", 2], - [3, 72, 24, False, "relu", 1], - [5, 72, 40, True, "relu", 2], # RCNN output - [5, 120, 40, True, "relu", 1], - [5, 120, 40, True, "relu", 1], # YOLOv3 output 
- [3, 240, 80, False, "hard_swish", 2], # RCNN output - [3, 200, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 480, 112, True, "hard_swish", 1], - [3, 672, 112, True, "hard_swish", 1], # YOLOv3 output - [5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output - [5, 960, 160, True, "hard_swish", 1], - [5, 960, 160, True, "hard_swish", 1], # YOLOv3 output - ] - elif model_name == "small": - self.cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, True, "relu", 2], - [3, 72, 24, False, "relu", 2], # RCNN output - [3, 88, 24, False, "relu", 1], # YOLOv3 output - [5, 96, 40, True, "hard_swish", 2], # RCNN output - [5, 240, 40, True, "hard_swish", 1], - [5, 240, 40, True, "hard_swish", 1], - [5, 120, 48, True, "hard_swish", 1], - [5, 144, 48, True, "hard_swish", 1], # YOLOv3 output - [5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output - [5, 576, 96, True, "hard_swish", 1], - [5, 576, 96, True, "hard_swish", 1], # YOLOv3 output - ] - else: - raise NotImplementedError( - "mode[{}_model] is not implemented!".format(model_name)) - - if multiplier != 1.0: - self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier) - self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier) - self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier) - self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier) - self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier) - - self.conv1 = ConvBNLayer( - in_c=3, - out_c=make_divisible(inplanes * scale), - filter_size=3, - stride=2, - padding=1, - num_groups=1, - act="hard_swish", - lr_mult=lr_mult_list[0], - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv1") - - self._out_channels = [] - self.block_list = [] - i = 0 - inplanes = make_divisible(inplanes * scale) - for (k, exp, c, se, nl, s) in self.cfg: - lr_idx = min(i // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - # for SSD/SSDLite, first head input is after ResidualUnit expand_conv - return_list = self.with_extra_blocks and i + 2 in self.feature_maps - - block = self.add_sublayer( - "conv" + str(i + 2), - sublayer=ResidualUnit( - in_c=inplanes, - mid_c=make_divisible(scale * exp), - out_c=make_divisible(scale * c), - filter_size=k, - stride=s, - use_se=se, - act=nl, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - return_list=return_list, - name="conv" + str(i + 2))) - self.block_list.append(block) - inplanes = make_divisible(scale * c) - i += 1 - self._update_out_channels( - make_divisible(scale * exp) - if return_list else inplanes, i + 1, feature_maps) - - if self.with_extra_blocks: - self.extra_block_list = [] - extra_out_c = make_divisible(scale * self.cfg[-1][1]) - lr_idx = min(i // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - conv_extra = self.add_sublayer( - "conv" + str(i + 2), - sublayer=ConvBNLayer( - in_c=inplanes, - out_c=extra_out_c, - filter_size=1, - stride=1, - padding=0, - num_groups=1, - act="hard_swish", - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv" + str(i + 2))) - self.extra_block_list.append(conv_extra) - i += 1 - self._update_out_channels(extra_out_c, i + 1, feature_maps) - - for j, block_filter in enumerate(self.extra_block_filters): - in_c = extra_out_c if j == 0 else self.extra_block_filters[j - - 1][1] - conv_extra = self.add_sublayer( - "conv" + str(i + 2), - sublayer=ExtraBlockDW( - 
in_c, - block_filter[0], - block_filter[1], - stride=2, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name='conv' + str(i + 2))) - self.extra_block_list.append(conv_extra) - i += 1 - self._update_out_channels(block_filter[1], i + 1, feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - x = self.conv1(inputs['image']) - outs = [] - for idx, block in enumerate(self.block_list): - x = block(x) - if idx + 2 in self.feature_maps: - if isinstance(x, list): - outs.append(x[0]) - x = x[1] - else: - outs.append(x) - - if not self.with_extra_blocks: - return outs - - for i, block in enumerate(self.extra_block_list): - idx = i + len(self.block_list) - x = block(x) - if idx + 2 in self.feature_maps: - outs.append(x) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobileone.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/mobileone.py deleted file mode 100644 index e548bad..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobileone.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
-Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py -Ths copyright of microsoft/Swin-Transformer is as follows: -MIT License [see LICENSE for details] -""" - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.ops import get_act_fn -from ppdet.modeling.layers import ConvNormLayer - - -class MobileOneBlock(nn.Layer): - def __init__( - self, - ch_in, - ch_out, - stride, - kernel_size, - conv_num=1, - norm_type='bn', - norm_decay=0., - norm_groups=32, - bias_on=False, - lr_scale=1., - freeze_norm=False, - initializer=Normal( - mean=0., std=0.01), - skip_quant=False, - act='relu', ): - super(MobileOneBlock, self).__init__() - - self.ch_in = ch_in - self.ch_out = ch_out - self.kernel_size = kernel_size - self.stride = stride - self.padding = (kernel_size - 1) // 2 - self.k = conv_num - - self.depth_conv = nn.LayerList() - self.point_conv = nn.LayerList() - for _ in range(self.k): - self.depth_conv.append( - ConvNormLayer( - ch_in, - ch_in, - kernel_size, - stride=stride, - groups=ch_in, - norm_type=norm_type, - norm_decay=norm_decay, - norm_groups=norm_groups, - bias_on=bias_on, - lr_scale=lr_scale, - freeze_norm=freeze_norm, - initializer=initializer, - skip_quant=skip_quant)) - self.point_conv.append( - ConvNormLayer( - ch_in, - ch_out, - 1, - stride=1, - groups=1, - norm_type=norm_type, - norm_decay=norm_decay, - norm_groups=norm_groups, - bias_on=bias_on, - lr_scale=lr_scale, - freeze_norm=freeze_norm, - initializer=initializer, - skip_quant=skip_quant)) - self.rbr_1x1 = ConvNormLayer( - ch_in, - ch_in, - 1, - stride=self.stride, - groups=ch_in, - norm_type=norm_type, - norm_decay=norm_decay, - norm_groups=norm_groups, - bias_on=bias_on, - lr_scale=lr_scale, - freeze_norm=freeze_norm, - initializer=initializer, - skip_quant=skip_quant) - self.rbr_identity_st1 = nn.BatchNorm2D( - num_features=ch_in, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay( - 0.0))) if ch_in == ch_out and self.stride == 1 else None - self.rbr_identity_st2 = nn.BatchNorm2D( - num_features=ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay( - 0.0))) if ch_in == ch_out and self.stride == 1 else None - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - if hasattr(self, "conv1") and hasattr(self, "conv2"): - y = self.act(self.conv2(self.act(self.conv1(x)))) - else: - if self.rbr_identity_st1 is None: - id_out_st1 = 0 - else: - id_out_st1 = self.rbr_identity_st1(x) - - x1_1 = 0 - for i in range(self.k): - x1_1 += self.depth_conv[i](x) - - x1_2 = self.rbr_1x1(x) - x1 = self.act(x1_1 + x1_2 + id_out_st1) - - if self.rbr_identity_st2 is None: - id_out_st2 = 0 - else: - id_out_st2 = self.rbr_identity_st2(x1) - - x2_1 = 0 - for i in range(self.k): - x2_1 += self.point_conv[i](x1) - y = self.act(x2_1 + id_out_st2) - - return y - - def convert_to_deploy(self): - if not hasattr(self, 'conv1'): - self.conv1 = nn.Conv2D( - in_channels=self.ch_in, - out_channels=self.ch_in, - kernel_size=self.kernel_size, - stride=self.stride, - padding=self.padding, - groups=self.ch_in, - bias_attr=ParamAttr( - initializer=Constant(value=0.), learning_rate=1.)) - if not hasattr(self, 'conv2'): - self.conv2 = nn.Conv2D( - in_channels=self.ch_in, - out_channels=self.ch_out, - kernel_size=1, - stride=1, - padding='SAME', - groups=1, - 
bias_attr=ParamAttr( - initializer=Constant(value=0.), learning_rate=1.)) - - conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias( - ) - self.conv1.weight.set_value(conv1_kernel) - self.conv1.bias.set_value(conv1_bias) - self.conv2.weight.set_value(conv2_kernel) - self.conv2.bias.set_value(conv2_bias) - self.__delattr__('depth_conv') - self.__delattr__('point_conv') - self.__delattr__('rbr_1x1') - if hasattr(self, 'rbr_identity_st1'): - self.__delattr__('rbr_identity_st1') - if hasattr(self, 'rbr_identity_st2'): - self.__delattr__('rbr_identity_st2') - - def get_equivalent_kernel_bias(self): - st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv) - st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) - st1_kernelid, st1_biasid = self._fuse_bn_tensor( - self.rbr_identity_st1, kernel_size=self.kernel_size) - - st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv) - st2_kernelid, st2_biasid = self._fuse_bn_tensor( - self.rbr_identity_st2, kernel_size=1) - - conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor( - st1_kernel1x1) + st1_kernelid - - conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid - - conv2_kernel = st2_kernel1x1 + st2_kernelid - conv2_bias = st2_bias1x1 + st2_biasid - - return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias - - def _pad_1x1_to_3x3_tensor(self, kernel1x1): - if kernel1x1 is None: - return 0 - else: - padding_size = (self.kernel_size - 1) // 2 - return nn.functional.pad( - kernel1x1, - [padding_size, padding_size, padding_size, padding_size]) - - def _fuse_bn_tensor(self, branch, kernel_size=3): - if branch is None: - return 0, 0 - - if isinstance(branch, nn.LayerList): - fused_kernels = [] - fused_bias = [] - for block in branch: - kernel = block.conv.weight - running_mean = block.norm._mean - running_var = block.norm._variance - gamma = block.norm.weight - beta = block.norm.bias - eps = block.norm._epsilon - - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - - fused_kernels.append(kernel * t) - fused_bias.append(beta - running_mean * gamma / std) - - return sum(fused_kernels), sum(fused_bias) - - elif isinstance(branch, ConvNormLayer): - kernel = branch.conv.weight - running_mean = branch.norm._mean - running_var = branch.norm._variance - gamma = branch.norm.weight - beta = branch.norm.bias - eps = branch.norm._epsilon - else: - assert isinstance(branch, nn.BatchNorm2D) - input_dim = self.ch_in if kernel_size == 1 else 1 - kernel_value = paddle.zeros( - shape=[self.ch_in, input_dim, kernel_size, kernel_size], - dtype='float32') - if kernel_size > 1: - for i in range(self.ch_in): - kernel_value[i, i % input_dim, (kernel_size - 1) // 2, ( - kernel_size - 1) // 2] = 1 - elif kernel_size == 1: - for i in range(self.ch_in): - kernel_value[i, i % input_dim, 0, 0] = 1 - else: - raise ValueError("Invalid kernel size recieved!") - kernel = paddle.to_tensor(kernel_value, place=branch.weight.place) - running_mean = branch._mean - running_var = branch._variance - gamma = branch.weight - beta = branch.bias - eps = branch._epsilon - - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - - return kernel * t, beta - running_mean * gamma / std diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/name_adapter.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/name_adapter.py deleted file mode 100644 index 4afbb9b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/name_adapter.py +++ /dev/null @@ -1,69 +0,0 @@ -class 
NameAdapter(object): - """Fix the backbones variable names for pretrained weight""" - - def __init__(self, model): - super(NameAdapter, self).__init__() - self.model = model - - @property - def model_type(self): - return getattr(self.model, '_model_type', '') - - @property - def variant(self): - return getattr(self.model, 'variant', '') - - def fix_conv_norm_name(self, name): - if name == "conv1": - bn_name = "bn_" + name - else: - bn_name = "bn" + name[3:] - # the naming rule is same as pretrained weight - if self.model_type == 'SEResNeXt': - bn_name = name + "_bn" - return bn_name - - def fix_shortcut_name(self, name): - if self.model_type == 'SEResNeXt': - name = 'conv' + name + '_prj' - return name - - def fix_bottleneck_name(self, name): - if self.model_type == 'SEResNeXt': - conv_name1 = 'conv' + name + '_x1' - conv_name2 = 'conv' + name + '_x2' - conv_name3 = 'conv' + name + '_x3' - shortcut_name = name - else: - conv_name1 = name + "_branch2a" - conv_name2 = name + "_branch2b" - conv_name3 = name + "_branch2c" - shortcut_name = name + "_branch1" - return conv_name1, conv_name2, conv_name3, shortcut_name - - def fix_basicblock_name(self, name): - if self.model_type == 'SEResNeXt': - conv_name1 = 'conv' + name + '_x1' - conv_name2 = 'conv' + name + '_x2' - shortcut_name = name - else: - conv_name1 = name + "_branch2a" - conv_name2 = name + "_branch2b" - shortcut_name = name + "_branch1" - return conv_name1, conv_name2, shortcut_name - - def fix_layer_warp_name(self, stage_num, count, i): - name = 'res' + str(stage_num) - if count > 10 and stage_num == 4: - if i == 0: - conv_name = name + "a" - else: - conv_name = name + "b" + str(i) - else: - conv_name = name + chr(ord("a") + i) - if self.model_type == 'SEResNeXt': - conv_name = str(stage_num + 2) + '_' + str(i + 1) - return conv_name - - def fix_c1_stage_name(self): - return "res_conv1" if self.model_type == 'ResNeXt' else "conv1" diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/res2net.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/res2net.py deleted file mode 100644 index 9e76772..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/res2net.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
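`MobileOneBlock.convert_to_deploy` in the mobileone.py diff above reparameterizes the block for inference: `_fuse_bn_tensor` folds each Conv+BN branch into a bare convolution using `W' = W * gamma / sqrt(var + eps)` and `b' = beta - mean * gamma / sqrt(var + eps)`, and the per-branch results are then summed into the deploy-time `conv1`/`conv2`. A standalone numeric check of that folding identity (same arithmetic as the deleted `_fuse_bn_tensor`, assuming paddle 2.x; the private `_mean`/`_variance`/`_epsilon` attributes are the same ones the deleted code reads):

```python
import paddle
import paddle.nn as nn

conv = nn.Conv2D(8, 16, 3, padding=1, bias_attr=False)
bn = nn.BatchNorm2D(16)
# give BN non-trivial running statistics and affine parameters
bn._mean.set_value(paddle.rand([16]))
bn._variance.set_value(paddle.rand([16]) + 0.5)
bn.weight.set_value(paddle.randn([16]))
bn.bias.set_value(paddle.randn([16]))
bn.eval()  # folding assumes inference-mode (running) statistics

# fold BN into the conv, exactly as _fuse_bn_tensor does
std = (bn._variance + bn._epsilon).sqrt()
t = (bn.weight / std).reshape((-1, 1, 1, 1))
fused = nn.Conv2D(8, 16, 3, padding=1)
fused.weight.set_value(conv.weight * t)
fused.bias.set_value(bn.bias - bn._mean * bn.weight / std)

x = paddle.randn([1, 8, 32, 32])
print(float((bn(conv(x)) - fused(x)).abs().max()))  # ~1e-6
```

The identity-branch case in the deleted code is the same fold applied to a synthetic identity kernel, which is why `_fuse_bn_tensor` materializes a one-hot `kernel_value` before fusing a bare `nn.BatchNorm2D`.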
- -from numbers import Integral - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec -from .resnet import ConvNormLayer - -__all__ = ['Res2Net', 'Res2NetC5'] - -Res2Net_cfg = { - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], - 200: [3, 12, 48, 3] -} - - -class BottleNeck(nn.Layer): - def __init__(self, - ch_in, - ch_out, - stride, - shortcut, - width, - scales=4, - variant='b', - groups=1, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False): - super(BottleNeck, self).__init__() - - self.shortcut = shortcut - self.scales = scales - self.stride = stride - if not shortcut: - if variant == 'd' and stride == 2: - self.branch1 = nn.Sequential() - self.branch1.add_sublayer( - 'pool', - nn.AvgPool2D( - kernel_size=2, stride=2, padding=0, ceil_mode=True)) - self.branch1.add_sublayer( - 'conv', - ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr)) - else: - self.branch1 = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=stride, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2a = ConvNormLayer( - ch_in=ch_in, - ch_out=width * scales, - filter_size=1, - stride=stride if variant == 'a' else 1, - groups=1, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2b = nn.LayerList([ - ConvNormLayer( - ch_in=width, - ch_out=width, - filter_size=3, - stride=1 if variant == 'a' else stride, - groups=groups, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr, - dcn_v2=dcn_v2) for _ in range(self.scales - 1) - ]) - - self.branch2c = ConvNormLayer( - ch_in=width * scales, - ch_out=ch_out, - filter_size=1, - stride=1, - groups=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - def forward(self, inputs): - - out = self.branch2a(inputs) - feature_split = paddle.split(out, self.scales, 1) - out_split = [] - for i in range(self.scales - 1): - if i == 0 or self.stride == 2: - out_split.append(self.branch2b[i](feature_split[i])) - else: - out_split.append(self.branch2b[i](paddle.add(feature_split[i], - out_split[-1]))) - if self.stride == 1: - out_split.append(feature_split[-1]) - else: - out_split.append(F.avg_pool2d(feature_split[-1], 3, self.stride, 1)) - out = self.branch2c(paddle.concat(out_split, 1)) - - if self.shortcut: - short = inputs - else: - short = self.branch1(inputs) - - out = paddle.add(out, short) - out = F.relu(out) - - return out - - -class Blocks(nn.Layer): - def __init__(self, - ch_in, - ch_out, - count, - stage_num, - width, - scales=4, - variant='b', - groups=1, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False): - super(Blocks, self).__init__() - - self.blocks = nn.Sequential() - for i in range(count): - self.blocks.add_sublayer( - str(i), - BottleNeck( - ch_in=ch_in if i == 0 else ch_out, - ch_out=ch_out, - stride=2 if i == 0 and stage_num != 2 else 1, - shortcut=False if i == 0 else True, - width=width * (2**(stage_num - 2)), - scales=scales, - variant=variant, - groups=groups, - lr=lr, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - dcn_v2=dcn_v2)) - - def forward(self, inputs): - return self.blocks(inputs) - - -@register -@serializable -class 
Res2Net(nn.Layer): - """ - Res2Net, see https://arxiv.org/abs/1904.01169 - Args: - depth (int): Res2Net depth, should be 50, 101, 152, 200. - width (int): Res2Net width - scales (int): Res2Net scale - variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently - lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), - lower learning rate ratio is need for pretrained model - got using distillation(default as [1.0, 1.0, 1.0, 1.0]). - groups (int): The groups number of the Conv Layer. - norm_type (str): normalization type, 'bn' or 'sync_bn' - norm_decay (float): weight decay for normalization layer weights - freeze_norm (bool): freeze normalization layers - freeze_at (int): freeze the backbone at which stage - return_idx (list): index of stages whose feature maps are returned, - index 0 stands for res2 - dcn_v2_stages (list): index of stages who select deformable conv v2 - num_stages (int): number of stages created - - """ - __shared__ = ['norm_type'] - - def __init__(self, - depth=50, - width=26, - scales=4, - variant='b', - lr_mult_list=[1.0, 1.0, 1.0, 1.0], - groups=1, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - freeze_at=0, - return_idx=[0, 1, 2, 3], - dcn_v2_stages=[-1], - num_stages=4): - super(Res2Net, self).__init__() - - self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt' - - assert depth in [50, 101, 152, 200], \ - "depth {} not in [50, 101, 152, 200]" - assert variant in ['a', 'b', 'c', 'd'], "invalid Res2Net variant" - assert num_stages >= 1 and num_stages <= 4 - - self.depth = depth - self.variant = variant - self.norm_type = norm_type - self.norm_decay = norm_decay - self.freeze_norm = freeze_norm - self.freeze_at = freeze_at - if isinstance(return_idx, Integral): - return_idx = [return_idx] - assert max(return_idx) < num_stages, \ - 'the maximum return index must smaller than num_stages, ' \ - 'but received maximum return index is {} and num_stages ' \ - 'is {}'.format(max(return_idx), num_stages) - self.return_idx = return_idx - self.num_stages = num_stages - assert len(lr_mult_list) == 4, \ - "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) - if isinstance(dcn_v2_stages, Integral): - dcn_v2_stages = [dcn_v2_stages] - assert max(dcn_v2_stages) < num_stages - self.dcn_v2_stages = dcn_v2_stages - - block_nums = Res2Net_cfg[depth] - - # C1 stage - if self.variant in ['c', 'd']: - conv_def = [ - [3, 32, 3, 2, "conv1_1"], - [32, 32, 3, 1, "conv1_2"], - [32, 64, 3, 1, "conv1_3"], - ] - else: - conv_def = [[3, 64, 7, 2, "conv1"]] - self.res1 = nn.Sequential() - for (c_in, c_out, k, s, _name) in conv_def: - self.res1.add_sublayer( - _name, - ConvNormLayer( - ch_in=c_in, - ch_out=c_out, - filter_size=k, - stride=s, - groups=1, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=1.0)) - - self._in_channels = [64, 256, 512, 1024] - self._out_channels = [256, 512, 1024, 2048] - self._out_strides = [4, 8, 16, 32] - - # C2-C5 stages - self.res_layers = [] - for i in range(num_stages): - lr_mult = lr_mult_list[i] - stage_num = i + 2 - self.res_layers.append( - self.add_sublayer( - "res{}".format(stage_num), - Blocks( - self._in_channels[i], - self._out_channels[i], - count=block_nums[i], - stage_num=stage_num, - width=width, - scales=scales, - groups=groups, - lr=lr_mult, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - dcn_v2=(i in self.dcn_v2_stages)))) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], 
stride=self._out_strides[i]) - for i in self.return_idx - ] - - def forward(self, inputs): - x = inputs['image'] - res1 = self.res1(x) - x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1) - outs = [] - for idx, stage in enumerate(self.res_layers): - x = stage(x) - if idx == self.freeze_at: - x.stop_gradient = True - if idx in self.return_idx: - outs.append(x) - return outs - - -@register -class Res2NetC5(nn.Layer): - def __init__(self, depth=50, width=26, scales=4, variant='b'): - super(Res2NetC5, self).__init__() - feat_in, feat_out = [1024, 2048] - self.res5 = Blocks( - feat_in, - feat_out, - count=3, - stage_num=5, - width=width, - scales=scales, - variant=variant) - self.feat_out = feat_out - - @property - def out_shape(self): - return [ShapeSpec( - channels=self.feat_out, - stride=32, )] - - def forward(self, roi_feat, stage=0): - y = self.res5(roi_feat) - return y diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/resnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/resnet.py deleted file mode 100644 index a64f400..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/resnet.py +++ /dev/null @@ -1,611 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
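The Res2Net `BottleNeck` deleted above implements the paper's multi-scale residual: the 1x1 `branch2a` output is split into `scales` chunks, each chunk after the first is added to the previous branch's output before its own 3x3 conv, and with stride 1 the final chunk passes through unchanged. A functional sketch of that stride-1 cascade (standalone, mirroring the deleted `forward`; `res2net_split_forward` is an illustrative name, and `scales=4, width=26` match the class defaults):

```python
import paddle
import paddle.nn as nn

scales, width = 4, 26
convs = nn.LayerList(
    [nn.Conv2D(width, width, 3, padding=1) for _ in range(scales - 1)])

def res2net_split_forward(feat):
    # feat: [N, width * scales, H, W], i.e. the output of branch2a
    chunks = paddle.split(feat, scales, axis=1)
    outs = [convs[0](chunks[0])]
    for i in range(1, scales - 1):
        # each branch sees its own chunk plus the previous branch's output,
        # so the receptive field grows scale by scale within one block
        outs.append(convs[i](chunks[i] + outs[-1]))
    outs.append(chunks[-1])  # last chunk is passed through when stride == 1
    return paddle.concat(outs, axis=1)

y = res2net_split_forward(paddle.randn([2, width * scales, 16, 16]))
print(y.shape)  # [2, 104, 16, 16]
```

In the stride-2 path the deleted code instead convolves every chunk independently and average-pools the last one, since the spatial sizes would no longer match for the additive reuse.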
- -import math -from numbers import Integral - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Uniform -from paddle import ParamAttr -from paddle.nn.initializer import Constant -from paddle.vision.ops import DeformConv2D -from .name_adapter import NameAdapter -from ..shape_spec import ShapeSpec - -__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck'] - -ResNet_cfg = { - 18: [2, 2, 2, 2], - 34: [3, 4, 6, 3], - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], -} - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride, - groups=1, - act=None, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - lr=1.0, - dcn_v2=False): - super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn'] - self.norm_type = norm_type - self.act = act - self.dcn_v2 = dcn_v2 - - if not self.dcn_v2: - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(learning_rate=lr), - bias_attr=False) - else: - self.offset_channel = 2 * filter_size**2 - self.mask_channel = filter_size**2 - - self.conv_offset = nn.Conv2D( - in_channels=ch_in, - out_channels=3 * filter_size**2, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - weight_attr=ParamAttr(initializer=Constant(0.)), - bias_attr=ParamAttr(initializer=Constant(0.))) - self.conv = DeformConv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - dilation=1, - groups=groups, - weight_attr=ParamAttr(learning_rate=lr), - bias_attr=False) - - norm_lr = 0. 
if freeze_norm else lr - param_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - - global_stats = True if freeze_norm else None - if norm_type in ['sync_bn', 'bn']: - self.norm = nn.BatchNorm2D( - ch_out, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats) - norm_params = self.norm.parameters() - - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - def forward(self, inputs): - if not self.dcn_v2: - out = self.conv(inputs) - else: - offset_mask = self.conv_offset(inputs) - offset, mask = paddle.split( - offset_mask, - num_or_sections=[self.offset_channel, self.mask_channel], - axis=1) - mask = F.sigmoid(mask) - out = self.conv(inputs, offset, mask=mask) - - if self.norm_type in ['bn', 'sync_bn']: - out = self.norm(out) - if self.act: - out = getattr(F, self.act)(out) - return out - - -class SELayer(nn.Layer): - def __init__(self, ch, reduction_ratio=16): - super(SELayer, self).__init__() - self.pool = nn.AdaptiveAvgPool2D(1) - stdv = 1.0 / math.sqrt(ch) - c_ = ch // reduction_ratio - self.squeeze = nn.Linear( - ch, - c_, - weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=True) - - stdv = 1.0 / math.sqrt(c_) - self.extract = nn.Linear( - c_, - ch, - weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=True) - - def forward(self, inputs): - out = self.pool(inputs) - out = paddle.squeeze(out, axis=[2, 3]) - out = self.squeeze(out) - out = F.relu(out) - out = self.extract(out) - out = F.sigmoid(out) - out = paddle.unsqueeze(out, axis=[2, 3]) - scale = out * inputs - return scale - - -class BasicBlock(nn.Layer): - - expansion = 1 - - def __init__(self, - ch_in, - ch_out, - stride, - shortcut, - variant='b', - groups=1, - base_width=64, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False, - std_senet=False): - super(BasicBlock, self).__init__() - assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64' - - self.shortcut = shortcut - if not shortcut: - if variant == 'd' and stride == 2: - self.short = nn.Sequential() - self.short.add_sublayer( - 'pool', - nn.AvgPool2D( - kernel_size=2, stride=2, padding=0, ceil_mode=True)) - self.short.add_sublayer( - 'conv', - ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr)) - else: - self.short = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=stride, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2a = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=3, - stride=stride, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2b = ConvNormLayer( - ch_in=ch_out, - ch_out=ch_out, - filter_size=3, - stride=1, - act=None, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr, - dcn_v2=dcn_v2) - - self.std_senet = std_senet - if self.std_senet: - self.se = SELayer(ch_out) - - def forward(self, inputs): - out = self.branch2a(inputs) - out = self.branch2b(out) - if self.std_senet: - out = self.se(out) - - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - - out = paddle.add(x=out, y=short) - out = 
F.relu(out) - - return out - - -class BottleNeck(nn.Layer): - - expansion = 4 - - def __init__(self, - ch_in, - ch_out, - stride, - shortcut, - variant='b', - groups=1, - base_width=4, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False, - std_senet=False): - super(BottleNeck, self).__init__() - if variant == 'a': - stride1, stride2 = stride, 1 - else: - stride1, stride2 = 1, stride - - # ResNeXt - width = int(ch_out * (base_width / 64.)) * groups - - self.branch2a = ConvNormLayer( - ch_in=ch_in, - ch_out=width, - filter_size=1, - stride=stride1, - groups=1, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2b = ConvNormLayer( - ch_in=width, - ch_out=width, - filter_size=3, - stride=stride2, - groups=groups, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr, - dcn_v2=dcn_v2) - - self.branch2c = ConvNormLayer( - ch_in=width, - ch_out=ch_out * self.expansion, - filter_size=1, - stride=1, - groups=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.shortcut = shortcut - if not shortcut: - if variant == 'd' and stride == 2: - self.short = nn.Sequential() - self.short.add_sublayer( - 'pool', - nn.AvgPool2D( - kernel_size=2, stride=2, padding=0, ceil_mode=True)) - self.short.add_sublayer( - 'conv', - ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out * self.expansion, - filter_size=1, - stride=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr)) - else: - self.short = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out * self.expansion, - filter_size=1, - stride=stride, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.std_senet = std_senet - if self.std_senet: - self.se = SELayer(ch_out * self.expansion) - - def forward(self, inputs): - - out = self.branch2a(inputs) - out = self.branch2b(out) - out = self.branch2c(out) - - if self.std_senet: - out = self.se(out) - - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - - out = paddle.add(x=out, y=short) - out = F.relu(out) - - return out - - -class Blocks(nn.Layer): - def __init__(self, - block, - ch_in, - ch_out, - count, - name_adapter, - stage_num, - variant='b', - groups=1, - base_width=64, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False, - std_senet=False): - super(Blocks, self).__init__() - - self.blocks = [] - for i in range(count): - conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i) - layer = self.add_sublayer( - conv_name, - block( - ch_in=ch_in, - ch_out=ch_out, - stride=2 if i == 0 and stage_num != 2 else 1, - shortcut=False if i == 0 else True, - variant=variant, - groups=groups, - base_width=base_width, - lr=lr, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - dcn_v2=dcn_v2, - std_senet=std_senet)) - self.blocks.append(layer) - if i == 0: - ch_in = ch_out * block.expansion - - def forward(self, inputs): - block_out = inputs - for block in self.blocks: - block_out = block(block_out) - return block_out - - -@register -@serializable -class ResNet(nn.Layer): - __shared__ = ['norm_type'] - - def __init__(self, - depth=50, - ch_in=64, - variant='b', - lr_mult_list=[1.0, 1.0, 1.0, 1.0], - groups=1, - base_width=64, - norm_type='bn', - norm_decay=0, - freeze_norm=True, - freeze_at=0, - return_idx=[0, 1, 2, 3], - dcn_v2_stages=[-1], - num_stages=4, - std_senet=False, - freeze_stem_only=False): - """ - 
-        Residual Network, see https://arxiv.org/abs/1512.03385
-
-        Args:
-            depth (int): ResNet depth, should be 18, 34, 50, 101 or 152.
-            ch_in (int): output channel of the first stage, default 64
-            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
-            lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
-                a lower learning rate ratio is needed for pretrained models
-                obtained via distillation (default [1.0, 1.0, 1.0, 1.0]).
-            groups (int): group convolution cardinality
-            base_width (int): base width of each group convolution
-            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
-            norm_decay (float): weight decay for normalization layer weights
-            freeze_norm (bool): freeze normalization layers
-            freeze_at (int): freeze the backbone at which stage
-            return_idx (list): index of the stages whose feature maps are returned
-            dcn_v2_stages (list): indices of the stages that use deformable conv v2
-            num_stages (int): total number of stages
-            std_senet (bool): whether to use SENet blocks, default False
-            freeze_stem_only (bool): only freeze the stem when freeze_at >= 0, default False
-        """
-        super(ResNet, self).__init__()
-        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
-        assert num_stages >= 1 and num_stages <= 4
-        self.depth = depth
-        self.variant = variant
-        self.groups = groups
-        self.base_width = base_width
-        self.norm_type = norm_type
-        self.norm_decay = norm_decay
-        self.freeze_norm = freeze_norm
-        self.freeze_at = freeze_at
-        if isinstance(return_idx, Integral):
-            return_idx = [return_idx]
-        assert max(return_idx) < num_stages, \
-            'the maximum return index must be smaller than num_stages, ' \
-            'but received maximum return index is {} and num_stages ' \
-            'is {}'.format(max(return_idx), num_stages)
-        self.return_idx = return_idx
-        self.num_stages = num_stages
-        assert len(lr_mult_list) == 4, \
-            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
-        if isinstance(dcn_v2_stages, Integral):
-            dcn_v2_stages = [dcn_v2_stages]
-        assert max(dcn_v2_stages) < num_stages
-        self.dcn_v2_stages = dcn_v2_stages
-
-        block_nums = ResNet_cfg[depth]
-        na = NameAdapter(self)
-
-        conv1_name = na.fix_c1_stage_name()
-        if variant in ['c', 'd']:
-            conv_def = [
-                [3, ch_in // 2, 3, 2, "conv1_1"],
-                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
-                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
-            ]
-        else:
-            conv_def = [[3, ch_in, 7, 2, conv1_name]]
-        self.conv1 = nn.Sequential()
-        for (c_in, c_out, k, s, _name) in conv_def:
-            self.conv1.add_sublayer(
-                _name,
-                ConvNormLayer(
-                    ch_in=c_in,
-                    ch_out=c_out,
-                    filter_size=k,
-                    stride=s,
-                    groups=1,
-                    act='relu',
-                    norm_type=norm_type,
-                    norm_decay=norm_decay,
-                    freeze_norm=freeze_norm,
-                    lr=1.0))
-
-        self.ch_in = ch_in
-        ch_out_list = [64, 128, 256, 512]
-        block = BottleNeck if depth >= 50 else BasicBlock
-
-        self._out_channels = [block.expansion * v for v in ch_out_list]
-        self._out_strides = [4, 8, 16, 32]
-
-        self.res_layers = []
-        for i in range(num_stages):
-            lr_mult = lr_mult_list[i]
-            stage_num = i + 2
-            res_name = "res{}".format(stage_num)
-            res_layer = self.add_sublayer(
-                res_name,
-                Blocks(
-                    block,
-                    self.ch_in,
-                    ch_out_list[i],
-                    count=block_nums[i],
-                    name_adapter=na,
-                    stage_num=stage_num,
-                    variant=variant,
-                    groups=groups,
-                    base_width=base_width,
-                    lr=lr_mult,
-                    norm_type=norm_type,
-                    norm_decay=norm_decay,
-                    freeze_norm=freeze_norm,
-                    dcn_v2=(i in self.dcn_v2_stages),
-                    std_senet=std_senet))
-            self.res_layers.append(res_layer)
-            self.ch_in = self._out_channels[i]
-
-        if freeze_at >= 0:
-            self._freeze_parameters(self.conv1)
-            if not freeze_stem_only:
-                for i in range(min(freeze_at + 1, num_stages)):
-                    self._freeze_parameters(self.res_layers[i])
-
-    def _freeze_parameters(self, m):
-        for p in m.parameters():
-            p.stop_gradient = True
-
-    @property
-    def out_shape(self):
-        return [
-            ShapeSpec(
-                channels=self._out_channels[i], stride=self._out_strides[i])
-            for i in self.return_idx
-        ]
-
-    def forward(self, inputs):
-        x = inputs['image']
-        conv1 = self.conv1(x)
-        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
-        outs = []
-        for idx, stage in enumerate(self.res_layers):
-            x = stage(x)
-            if idx in self.return_idx:
-                outs.append(x)
-        return outs
-
-
-@register
-class Res5Head(nn.Layer):
-    def __init__(self, depth=50):
-        super(Res5Head, self).__init__()
-        feat_in, feat_out = [1024, 512]
-        if depth < 50:
-            feat_in = 256
-        na = NameAdapter(self)
-        block = BottleNeck if depth >= 50 else BasicBlock
-        self.res5 = Blocks(
-            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
-        self.feat_out = feat_out if depth < 50 else feat_out * 4
-
-    @property
-    def out_shape(self):
-        return [ShapeSpec(
-            channels=self.feat_out,
-            stride=16, )]
-
-    def forward(self, roi_feat, stage=0):
-        y = self.res5(roi_feat)
-        return y
diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/senet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/senet.py
deleted file mode 100644
index db1e29b..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/backbones/senet.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.nn as nn
-
-from ppdet.core.workspace import register, serializable
-from .resnet import ResNet, Blocks, BasicBlock, BottleNeck
-from ..shape_spec import ShapeSpec
-from .name_adapter import NameAdapter
-
-__all__ = ['SENet', 'SERes5Head']
-
-
-@register
-@serializable
-class SENet(ResNet):
-    __shared__ = ['norm_type']
-
-    def __init__(self,
-                 depth=50,
-                 variant='b',
-                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
-                 groups=1,
-                 base_width=64,
-                 norm_type='bn',
-                 norm_decay=0,
-                 freeze_norm=True,
-                 freeze_at=0,
-                 return_idx=[0, 1, 2, 3],
-                 dcn_v2_stages=[-1],
-                 std_senet=True,
-                 num_stages=4):
-        """
-        Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507
-
-        Args:
-            depth (int): SENet depth, should be 50, 101 or 152
-            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
-            lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
-                a lower learning rate ratio is needed for pretrained models
-                obtained via distillation (default [1.0, 1.0, 1.0, 1.0]).
-            groups (int): group convolution cardinality
-            base_width (int): base width of each group convolution
-            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
-            norm_decay (float): weight decay for normalization layer weights
-            freeze_norm (bool): freeze normalization layers
-            freeze_at (int): freeze the backbone at which stage
-            return_idx (list): index of the stages whose feature maps are returned
-            dcn_v2_stages (list): indices of the stages that use deformable conv v2
-            std_senet (bool): whether to use SENet blocks, default True
-            num_stages (int): total number of stages
-        """
-
-        super(SENet, self).__init__(
-            depth=depth,
-            variant=variant,
-            lr_mult_list=lr_mult_list,
-            ch_in=128,
-            groups=groups,
-            base_width=base_width,
-            norm_type=norm_type,
-            norm_decay=norm_decay,
-            freeze_norm=freeze_norm,
-            freeze_at=freeze_at,
-            return_idx=return_idx,
-            dcn_v2_stages=dcn_v2_stages,
-            std_senet=std_senet,
-            num_stages=num_stages)
-
-
-@register
-class SERes5Head(nn.Layer):
-    def __init__(self,
-                 depth=50,
-                 variant='b',
-                 lr_mult=1.0,
-                 groups=1,
-                 base_width=64,
-                 norm_type='bn',
-                 norm_decay=0,
-                 dcn_v2=False,
-                 freeze_norm=False,
-                 std_senet=True):
-        """
-        SERes5Head layer
-
-        Args:
-            depth (int): SENet depth, should be 50, 101 or 152
-            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
-            lr_mult (float): learning rate ratio of SERes5Head, default 1.0.
-            groups (int): group convolution cardinality
-            base_width (int): base width of each group convolution
-            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
-            norm_decay (float): weight decay for normalization layer weights
-            dcn_v2 (bool): whether to use deformable conv v2, default False
-            freeze_norm (bool): freeze normalization layers, default False
-            std_senet (bool): whether to use SENet blocks, default True
-
-        """
-        super(SERes5Head, self).__init__()
-        ch_out = 512
-        ch_in = 256 if depth < 50 else 1024
-        na = NameAdapter(self)
-        block = BottleNeck if depth >= 50 else BasicBlock
-        self.res5 = Blocks(
-            block,
-            ch_in,
-            ch_out,
-            count=3,
-            name_adapter=na,
-            stage_num=5,
-            variant=variant,
-            groups=groups,
-            base_width=base_width,
-            lr=lr_mult,
-            norm_type=norm_type,
-            norm_decay=norm_decay,
-            freeze_norm=freeze_norm,
-            dcn_v2=dcn_v2,
-            std_senet=std_senet)
-        self.ch_out = ch_out * block.expansion
-
-    @property
-    def out_shape(self):
-        return [ShapeSpec(
-            channels=self.ch_out,
-            stride=16, )]
-
-    def forward(self, roi_feat):
-        y = self.res5(roi_feat)
-        return y
diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/shufflenet_v2.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/shufflenet_v2.py
deleted file mode 100644
index ca7ebb9..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/backbones/shufflenet_v2.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
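The squeeze-and-excitation gating used by SELayer in resnet.py and by the SENet/SERes5Head classes above pools each channel to a scalar, passes it through a two-layer bottleneck, and rescales the feature map with the resulting sigmoid gate. A minimal standalone sketch of the idea (names and shapes are illustrative, not the ppdet implementation):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class SEGate(nn.Layer):
    # Squeeze: global average pool each channel to a scalar.
    # Excite: two-layer bottleneck producing a per-channel sigmoid gate.
    def __init__(self, ch, reduction_ratio=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        self.fc1 = nn.Linear(ch, ch // reduction_ratio)
        self.fc2 = nn.Linear(ch // reduction_ratio, ch)

    def forward(self, x):
        s = paddle.squeeze(self.pool(x), axis=[2, 3])    # (N, C)
        s = F.sigmoid(self.fc2(F.relu(self.fc1(s))))     # per-channel gate in (0, 1)
        return x * paddle.unsqueeze(s, axis=[2, 3])      # reweight the feature maps

# e.g. SEGate(64)(paddle.rand([2, 64, 32, 32])) keeps shape [2, 64, 32, 32]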
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -import paddle.nn.functional as F -from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D -from paddle.nn.initializer import KaimingNormal -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec -from ppdet.modeling.ops import channel_shuffle - -__all__ = ['ShuffleNetV2'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - groups=1, - act=None): - super(ConvBNLayer, self).__init__() - self._conv = Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=groups, - weight_attr=ParamAttr(initializer=KaimingNormal()), - bias_attr=False) - - self._batch_norm = BatchNorm2D( - out_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - if act == "hard_swish": - act = 'hardswish' - self.act = act - - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - if self.act: - y = getattr(F, self.act)(y) - return y - - -class InvertedResidual(nn.Layer): - def __init__(self, in_channels, out_channels, stride, act="relu"): - super(InvertedResidual, self).__init__() - self._conv_pw = ConvBNLayer( - in_channels=in_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=out_channels // 2, - act=None) - self._conv_linear = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - - def forward(self, inputs): - x1, x2 = paddle.split( - inputs, - num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], - axis=1) - x2 = self._conv_pw(x2) - x2 = self._conv_dw(x2) - x2 = self._conv_linear(x2) - out = paddle.concat([x1, x2], axis=1) - return channel_shuffle(out, 2) - - -class InvertedResidualDS(nn.Layer): - def __init__(self, in_channels, out_channels, stride, act="relu"): - super(InvertedResidualDS, self).__init__() - - # branch1 - self._conv_dw_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=3, - stride=stride, - padding=1, - groups=in_channels, - act=None) - self._conv_linear_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - # branch2 - self._conv_pw_2 = ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw_2 = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=out_channels // 2, - act=None) - self._conv_linear_2 = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - - def forward(self, inputs): - x1 = self._conv_dw_1(inputs) - x1 = self._conv_linear_1(x1) - x2 = self._conv_pw_2(inputs) - x2 = self._conv_dw_2(x2) - x2 = self._conv_linear_2(x2) - out = paddle.concat([x1, x2], 
axis=1) - - return channel_shuffle(out, 2) - - -@register -@serializable -class ShuffleNetV2(nn.Layer): - def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]): - super(ShuffleNetV2, self).__init__() - self.scale = scale - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - self.feature_maps = feature_maps - stage_repeats = [4, 8, 4] - - if scale == 0.25: - stage_out_channels = [-1, 24, 24, 48, 96, 512] - elif scale == 0.33: - stage_out_channels = [-1, 24, 32, 64, 128, 512] - elif scale == 0.5: - stage_out_channels = [-1, 24, 48, 96, 192, 1024] - elif scale == 1.0: - stage_out_channels = [-1, 24, 116, 232, 464, 1024] - elif scale == 1.5: - stage_out_channels = [-1, 24, 176, 352, 704, 1024] - elif scale == 2.0: - stage_out_channels = [-1, 24, 244, 488, 976, 2048] - else: - raise NotImplementedError("This scale size:[" + str(scale) + - "] is not implemented!") - self._out_channels = [] - self._feature_idx = 0 - # 1. conv1 - self._conv1 = ConvBNLayer( - in_channels=3, - out_channels=stage_out_channels[1], - kernel_size=3, - stride=2, - padding=1, - act=act) - self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) - self._feature_idx += 1 - - # 2. bottleneck sequences - self._block_list = [] - for stage_id, num_repeat in enumerate(stage_repeats): - for i in range(num_repeat): - if i == 0: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidualDS( - in_channels=stage_out_channels[stage_id + 1], - out_channels=stage_out_channels[stage_id + 2], - stride=2, - act=act)) - else: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidual( - in_channels=stage_out_channels[stage_id + 2], - out_channels=stage_out_channels[stage_id + 2], - stride=1, - act=act)) - self._block_list.append(block) - self._feature_idx += 1 - self._update_out_channels(stage_out_channels[stage_id + 2], - self._feature_idx, self.feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - y = self._conv1(inputs['image']) - y = self._max_pool(y) - outs = [] - for i, inv in enumerate(self._block_list): - y = inv(y) - if i + 2 in self.feature_maps: - outs.append(y) - - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/swin_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/swin_transformer.py deleted file mode 100644 index 64aabab..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/swin_transformer.py +++ /dev/null @@ -1,752 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
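The ShuffleNetV2 blocks deleted above transform only half of the channels in each unit and rely on channel_shuffle (imported from ppdet.modeling.ops) to mix information between the two halves afterwards. A minimal sketch of what that op does, assuming NCHW layout (the function name here is illustrative, not the ppdet source):

import paddle

def channel_shuffle_sketch(x, groups=2):
    # (N, C, H, W) -> (N, groups, C // groups, H, W): view channels as a grid
    n, c, h, w = x.shape
    x = x.reshape([n, groups, c // groups, h, w])
    # transpose the grid so channels from different groups interleave
    x = x.transpose([0, 2, 1, 3, 4])
    return x.reshape([n, c, h, w])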
-""" -This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py -Ths copyright of microsoft/Swin-Transformer is as follows: -MIT License [see LICENSE for details] -""" -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable -from .transformer_utils import DropPath, Identity -from .transformer_utils import add_parameter, to_2tuple -from .transformer_utils import ones_, zeros_, trunc_normal_ - -__all__ = ['SwinTransformer'] - -MODEL_cfg = { - # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config - 'swin_T_224': dict( - pretrain_img_size=224, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_S_224': dict( - pretrain_img_size=224, - embed_dim=96, - depths=[2, 2, 18, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_B_224': dict( - pretrain_img_size=224, - embed_dim=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_L_224': dict( - pretrain_img_size=224, - embed_dim=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_B_384': dict( - pretrain_img_size=384, - embed_dim=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=12, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams', - ), - 'swin_L_384': dict( - pretrain_img_size=384, - embed_dim=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=12, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams', - ), -} - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -def window_partition(x, window_size): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.reshape( - [-1, H // window_size, window_size, W // window_size, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) - return windows - - -def window_reverse(windows, window_size, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - _, _, _, C = windows.shape - 
B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.reshape( - [-1, H // window_size, W // window_size, window_size, window_size, C]) - x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) - return x - - -class WindowAttention(nn.Layer): - """ Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both of shifted and non-shifted window. - - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. Default: 0.0 - """ - - def __init__(self, - dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0., - proj_drop=0.): - - super().__init__() - self.dim = dim - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = add_parameter( - self, - paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads))) # 2*Wh-1 * 2*Ww-1, nH - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(self.window_size[0]) - coords_w = paddle.arange(self.window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - coords_flatten_1 = coords_flatten.unsqueeze(axis=2) - coords_flatten_2 = coords_flatten.unsqueeze(axis=1) - relative_coords = coords_flatten_1 - coords_flatten_2 - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[ - 0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - trunc_normal_(self.relative_position_bias_table) - self.softmax = nn.Softmax(axis=-1) - - def forward(self, x, mask=None): - """ Forward function. 
- Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = self.qkv(x).reshape( - [-1, N, 3, self.num_heads, C // self.num_heads]).transpose( - [2, 0, 3, 1, 4]) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) - - index = self.relative_position_index.flatten() - - relative_position_bias = paddle.index_select( - self.relative_position_bias_table, index) - relative_position_bias = relative_position_bias.reshape([ - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], -1 - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.reshape([-1, nW, self.num_heads, N, N - ]) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.reshape([-1, self.num_heads, N, N]) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) - x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class SwinTransformerBlock(nn.Layer): - """ Swin Transformer Block. - Args: - dim (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (int): Window size. - shift_size (int): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, - dim, - num_heads, - window_size=7, - shift_size=0, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.num_heads = num_heads - self.window_size = window_size - self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" - - self.norm1 = norm_layer(dim) - self.attn = WindowAttention( - dim, - window_size=to_2tuple(self.window_size), - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop) - - self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - - self.H = None - self.W = None - - def forward(self, x, mask_matrix): - """ Forward function. - Args: - x: Input feature, tensor size (B, H*W, C). - H, W: Spatial resolution of the input feature. - mask_matrix: Attention mask for cyclic shift. 
- """ - B, L, C = x.shape - H, W = self.H, self.W - assert L == H * W, "input feature has wrong size" - - shortcut = x - x = self.norm1(x) - x = x.reshape([-1, H, W, C]) - - # pad feature maps to multiples of window size - pad_l = pad_t = 0 - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t], - data_format='NHWC') - _, Hp, Wp, _ = x.shape - - # cyclic shift - if self.shift_size > 0: - shifted_x = paddle.roll( - x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) - attn_mask = mask_matrix - else: - shifted_x = x - attn_mask = None - - # partition windows - x_windows = window_partition( - shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.reshape( - [x_windows.shape[0], self.window_size * self.window_size, - C]) # nW*B, window_size*window_size, C - - # W-MSA/SW-MSA - attn_windows = self.attn( - x_windows, mask=attn_mask) # nW*B, window_size*window_size, C - - # merge windows - attn_windows = attn_windows.reshape( - [x_windows.shape[0], self.window_size, self.window_size, C]) - shifted_x = window_reverse(attn_windows, self.window_size, Hp, - Wp) # B H' W' C - - # reverse cyclic shift - if self.shift_size > 0: - x = paddle.roll( - shifted_x, - shifts=(self.shift_size, self.shift_size), - axis=(1, 2)) - else: - x = shifted_x - - if pad_r > 0 or pad_b > 0: - x = x[:, :H, :W, :] - - x = x.reshape([-1, H * W, C]) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - -class PatchMerging(nn.Layer): - r""" Patch Merging Layer. - Args: - dim (int): Number of input channels. - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, dim, norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) - self.norm = norm_layer(4 * dim) - - def forward(self, x, H, W): - """ Forward function. - Args: - x: Input feature, tensor size (B, H*W, C). - H, W: Spatial resolution of the input feature. - """ - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - x = x.reshape([-1, H, W, C]) - - # padding - pad_input = (H % 2 == 1) or (W % 2 == 1) - if pad_input: - # paddle F.pad default data_format is 'NCHW' - x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC') - H += H % 2 - W += W % 2 - - x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C - x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C - x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C - x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C - x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - -class BasicLayer(nn.Layer): - """ A basic Swin Transformer layer for one stage. - Args: - dim (int): Number of input channels. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - window_size (int): Local window size. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. 
Default: 0.0 - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None - """ - - def __init__(self, - dim, - depth, - num_heads, - window_size=7, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - norm_layer=nn.LayerNorm, - downsample=None): - super().__init__() - self.window_size = window_size - self.shift_size = window_size // 2 - self.depth = depth - - # build blocks - self.blocks = nn.LayerList([ - SwinTransformerBlock( - dim=dim, - num_heads=num_heads, - window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] - if isinstance(drop_path, np.ndarray) else drop_path, - norm_layer=norm_layer) for i in range(depth) - ]) - - # patch merging layer - if downsample is not None: - self.downsample = downsample(dim=dim, norm_layer=norm_layer) - else: - self.downsample = None - - def forward(self, x, H, W): - """ Forward function. - Args: - x: Input feature, tensor size (B, H*W, C). - H, W: Spatial resolution of the input feature. - """ - - # calculate attention mask for SW-MSA - Hp = int(np.ceil(H / self.window_size)) * self.window_size - Wp = int(np.ceil(W / self.window_size)) * self.window_size - img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - - cnt += 1 - - mask_windows = window_partition( - img_mask, self.window_size) # nW, window_size, window_size, 1 - mask_windows = mask_windows.reshape( - [-1, self.window_size * self.window_size]) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - huns = -100.0 * paddle.ones_like(attn_mask) - attn_mask = huns * (attn_mask != 0).astype("float32") - - for blk in self.blocks: - blk.H, blk.W = H, W - x = blk(x, attn_mask) - if self.downsample is not None: - x_down = self.downsample(x, H, W) - Wh, Ww = (H + 1) // 2, (W + 1) // 2 - return x, H, W, x_down, Wh, Ww - else: - return x, H, W, x, H, W - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - Args: - patch_size (int): Patch token size. Default: 4. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Layer, optional): Normalization layer. 
Default: None - """ - - def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): - super().__init__() - patch_size = to_2tuple(patch_size) - self.patch_size = patch_size - - self.in_chans = in_chans - self.embed_dim = embed_dim - - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - # TODO # export dynamic shape - B, C, H, W = x.shape - # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1]) - if W % self.patch_size[1] != 0: - x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) - if H % self.patch_size[0] != 0: - x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) - - x = self.proj(x) - if self.norm is not None: - _, _, Wh, Ww = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = self.norm(x) - x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) - - return x - - -@register -@serializable -class SwinTransformer(nn.Layer): - """ Swin Transformer backbone - Args: - arch (str): Architecture of FocalNet - pretrain_img_size (int | tuple(int)): Input image size. Default 224 - patch_size (int | tuple(int)): Patch size. Default: 4 - in_chans (int): Number of input image channels. Default: 3 - embed_dim (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each Swin Transformer layer. - num_heads (tuple(int)): Number of attention heads in different layers. - window_size (int): Window size. Default: 7 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. 
Default: True - """ - - def __init__(self, - arch='swin_T_224', - pretrain_img_size=224, - patch_size=4, - in_chans=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.2, - norm_layer=nn.LayerNorm, - ape=False, - patch_norm=True, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - pretrained=None): - super(SwinTransformer, self).__init__() - assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) - - pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size'] - embed_dim = MODEL_cfg[arch]['embed_dim'] - depths = MODEL_cfg[arch]['depths'] - num_heads = MODEL_cfg[arch]['num_heads'] - window_size = MODEL_cfg[arch]['window_size'] - if pretrained is None: - pretrained = MODEL_cfg[arch]['pretrained'] - - self.num_layers = len(depths) - self.ape = ape - self.patch_norm = patch_norm - self.out_indices = out_indices - self.frozen_stages = frozen_stages - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None) - - # absolute position embedding - if self.ape: - pretrain_img_size = to_2tuple(pretrain_img_size) - patch_size = to_2tuple(patch_size) - patches_resolution = [ - pretrain_img_size[0] // patch_size[0], - pretrain_img_size[1] // patch_size[1] - ] - - self.absolute_pos_embed = add_parameter( - self, - paddle.zeros((1, embed_dim, patches_resolution[0], - patches_resolution[1]))) - trunc_normal_(self.absolute_pos_embed) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = np.linspace(0, drop_path_rate, - sum(depths)) # stochastic depth decay rule - - # build layers - self.layers = nn.LayerList() - for i_layer in range(self.num_layers): - layer = BasicLayer( - dim=int(embed_dim * 2**i_layer), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - window_size=window_size, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging - if (i_layer < self.num_layers - 1) else None) - self.layers.append(layer) - - num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] - self.num_features = num_features - - # add a norm layer for each output - for i_layer in out_indices: - layer = norm_layer(num_features[i_layer]) - layer_name = f'norm{i_layer}' - self.add_sublayer(layer_name, layer) - - self.apply(self._init_weights) - self._freeze_stages() - if pretrained: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - self.set_state_dict(paddle.load(path)) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.stop_gradient = True - - if self.frozen_stages >= 1 and self.ape: - self.absolute_pos_embed.stop_gradient = True - - if self.frozen_stages >= 2: - self.pos_drop.eval() - for i in range(0, self.frozen_stages - 1): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.stop_gradient = True - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight) - if isinstance(m, nn.Linear) and m.bias is not None: - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - 
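# LayerNorm is initialized to the identity affine transform:
# bias zeroed above, scale set to one below.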
ones_(m.weight) - - def forward(self, x): - """Forward function.""" - x = self.patch_embed(x['image']) - B, _, Wh, Ww = x.shape - if self.ape: - # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate( - self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') - x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) - else: - x = x.flatten(2).transpose([0, 2, 1]) - x = self.pos_drop(x) - outs = [] - for i in range(self.num_layers): - layer = self.layers[i] - x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) - if i in self.out_indices: - norm_layer = getattr(self, f'norm{i}') - x_out = norm_layer(x_out) - out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( - (0, 3, 1, 2)) - outs.append(out) - - return outs - - @property - def out_shape(self): - out_strides = [4, 8, 16, 32] - return [ - ShapeSpec( - channels=self.num_features[i], stride=out_strides[i]) - for i in self.out_indices - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/trans_encoder.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/trans_encoder.py deleted file mode 100644 index 1a45e0f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/trans_encoder.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
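The out_shape property above falls out of Swin's hierarchy: patch embedding downsamples by 4, and each PatchMerging halves the resolution while doubling the channel width. A short sketch of the stage geometry for the swin_T_224 configuration, using the values from MODEL_cfg above:

import numpy as np

embed_dim, depths = 96, [2, 2, 6, 2]                               # swin_T_224
num_features = [embed_dim * 2 ** i for i in range(len(depths))]    # [96, 192, 384, 768]
out_strides = [4 * 2 ** i for i in range(len(depths))]             # [4, 8, 16, 32]
drop_path = np.linspace(0, 0.2, sum(depths))  # per-block stochastic depth, linearly increasing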
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import ReLU, Swish, GELU -import math - -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['TransEncoder'] - - -class BertEmbeddings(nn.Layer): - def __init__(self, word_size, position_embeddings_size, word_type_size, - hidden_size, dropout_prob): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding( - word_size, hidden_size, padding_idx=0) - self.position_embeddings = nn.Embedding(position_embeddings_size, - hidden_size) - self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size) - self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) - self.dropout = nn.Dropout(dropout_prob) - - def forward(self, x, token_type_ids=None, position_ids=None): - seq_len = paddle.shape(x)[1] - if position_ids is None: - position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x) - if token_type_ids is None: - token_type_ids = paddle.zeros(paddle.shape(x)) - - word_embs = self.word_embeddings(x) - position_embs = self.position_embeddings(position_ids) - token_type_embs = self.token_type_embeddings(token_type_ids) - - embs_cmb = word_embs + position_embs + token_type_embs - embs_out = self.layernorm(embs_cmb) - embs_out = self.dropout(embs_out) - return embs_out - - -class BertSelfAttention(nn.Layer): - def __init__(self, - hidden_size, - num_attention_heads, - attention_probs_dropout_prob, - output_attentions=False): - super(BertSelfAttention, self).__init__() - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden_size must be a multiple of the number of attention " - "heads, but got {} % {} != 0" % - (hidden_size, num_attention_heads)) - - self.num_attention_heads = num_attention_heads - self.attention_head_size = int(hidden_size / num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(hidden_size, self.all_head_size) - self.key = nn.Linear(hidden_size, self.all_head_size) - self.value = nn.Linear(hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(attention_probs_dropout_prob) - self.output_attentions = output_attentions - - def forward(self, x, attention_mask, head_mask=None): - query = self.query(x) - key = self.key(x) - value = self.value(x) - - query_dim1, query_dim2 = paddle.shape(query)[:-1] - new_shape = [ - query_dim1, query_dim2, self.num_attention_heads, - self.attention_head_size - ] - query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) - key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1)) - value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) - - attention = paddle.matmul(query, - key) / math.sqrt(self.attention_head_size) - attention = attention + attention_mask - attention_value = F.softmax(attention, axis=-1) - attention_value = self.dropout(attention_value) - - if head_mask is not None: - attention_value = attention_value * head_mask - - context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1, - 3)) - ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2] - new_context_shape = [ - ctx_dim1, - ctx_dim2, - self.all_head_size, - ] - context = context.reshape(new_context_shape) - - if self.output_attentions: - return (context, attention_value) - else: - return (context, ) - - -class BertAttention(nn.Layer): - def __init__(self, - hidden_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - output_attentions=False): - super(BertAttention, self).__init__() - 
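# Attention sublayer wiring: self-attention, then a linear projection with
# dropout, then residual add + layernorm (applied in forward below).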
self.bert_selfattention = BertSelfAttention( - hidden_size, num_attention_heads, attention_probs_dropout_prob, - output_attentions) - self.fc = nn.Linear(hidden_size, hidden_size) - self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) - self.dropout = nn.Dropout(fc_dropout_prob) - - def forward(self, x, attention_mask, head_mask=None): - attention_feats = self.bert_selfattention(x, attention_mask, head_mask) - features = self.fc(attention_feats[0]) - features = self.dropout(features) - features = self.layernorm(features + x) - if len(attention_feats) == 2: - return (features, attention_feats[1]) - else: - return (features, ) - - -class BertFeedForward(nn.Layer): - def __init__(self, - hidden_size, - intermediate_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False): - super(BertFeedForward, self).__init__() - self.fc1 = nn.Linear(hidden_size, intermediate_size) - self.act_fn = eval(act_fn) - self.fc2 = nn.Linear(intermediate_size, hidden_size) - self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) - self.dropout = nn.Dropout(fc_dropout_prob) - - def forward(self, x): - features = self.fc1(x) - features = self.act_fn(features) - features = self.fc2(features) - features = self.dropout(features) - features = self.layernorm(features + x) - return features - - -class BertLayer(nn.Layer): - def __init__(self, - hidden_size, - intermediate_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False): - super(BertLayer, self).__init__() - self.attention = BertAttention(hidden_size, num_attention_heads, - attention_probs_dropout_prob, - output_attentions) - self.feed_forward = BertFeedForward( - hidden_size, intermediate_size, num_attention_heads, - attention_probs_dropout_prob, fc_dropout_prob, act_fn, - output_attentions) - - def forward(self, x, attention_mask, head_mask=None): - attention_feats = self.attention(x, attention_mask, head_mask) - features = self.feed_forward(attention_feats[0]) - if len(attention_feats) == 2: - return (features, attention_feats[1]) - else: - return (features, ) - - -class BertEncoder(nn.Layer): - def __init__(self, - num_hidden_layers, - hidden_size, - intermediate_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False, - output_hidden_feats=False): - super(BertEncoder, self).__init__() - self.output_attentions = output_attentions - self.output_hidden_feats = output_hidden_feats - self.layers = nn.LayerList([ - BertLayer(hidden_size, intermediate_size, num_attention_heads, - attention_probs_dropout_prob, fc_dropout_prob, act_fn, - output_attentions) for _ in range(num_hidden_layers) - ]) - - def forward(self, x, attention_mask, head_mask=None): - all_features = (x, ) - all_attentions = () - - for i, layer in enumerate(self.layers): - mask = head_mask[i] if head_mask is not None else None - layer_out = layer(x, attention_mask, mask) - - if self.output_hidden_feats: - all_features = all_features + (x, ) - x = layer_out[0] - if self.output_attentions: - all_attentions = all_attentions + (layer_out[1], ) - - outputs = (x, ) - if self.output_hidden_feats: - outputs += (all_features, ) - if self.output_attentions: - outputs += (all_attentions, ) - return outputs - - -class BertPooler(nn.Layer): - def __init__(self, hidden_size): - super(BertPooler, self).__init__() - self.fc = nn.Linear(hidden_size, hidden_size) - self.act = nn.Tanh() - - def forward(self, x): - 
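# Pool the sequence by taking the hidden state of the first token
# (the [CLS] position in BERT-style models).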
first_token = x[:, 0] - pooled_output = self.fc(first_token) - pooled_output = self.act(pooled_output) - return pooled_output - - -class METROEncoder(nn.Layer): - def __init__(self, - vocab_size, - num_hidden_layers, - features_dims, - position_embeddings_size, - hidden_size, - intermediate_size, - output_feature_dim, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False, - output_hidden_feats=False, - use_img_layernorm=False): - super(METROEncoder, self).__init__() - self.img_dims = features_dims - self.num_hidden_layers = num_hidden_layers - self.use_img_layernorm = use_img_layernorm - self.output_attentions = output_attentions - self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2, - hidden_size, fc_dropout_prob) - self.encoder = BertEncoder( - num_hidden_layers, hidden_size, intermediate_size, - num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, - act_fn, output_attentions, output_hidden_feats) - self.pooler = BertPooler(hidden_size) - self.position_embeddings = nn.Embedding(position_embeddings_size, - hidden_size) - self.img_embedding = nn.Linear( - features_dims, hidden_size, bias_attr=True) - self.dropout = nn.Dropout(fc_dropout_prob) - self.cls_head = nn.Linear(hidden_size, output_feature_dim) - self.residual = nn.Linear(features_dims, output_feature_dim) - - self.apply(self.init_weights) - - def init_weights(self, module): - """ Initialize the weights. - """ - if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.set_value( - paddle.normal( - mean=0.0, std=0.02, shape=module.weight.shape)) - elif isinstance(module, nn.LayerNorm): - module.bias.set_value(paddle.zeros(shape=module.bias.shape)) - module.weight.set_value( - paddle.full( - shape=module.weight.shape, fill_value=1.0)) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.set_value(paddle.zeros(shape=module.bias.shape)) - - def forward(self, x): - batchsize, seq_len = paddle.shape(x)[:2] - input_ids = paddle.zeros((batchsize, seq_len), dtype="int64") - position_ids = paddle.arange( - seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids) - - attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2) - head_mask = [None] * self.num_hidden_layers - - position_embs = self.position_embeddings(position_ids) - attention_mask = (1.0 - attention_mask) * -10000.0 - - img_features = self.img_embedding(x) - - # We empirically observe that adding an additional learnable position embedding leads to more stable training - embeddings = position_embs + img_features - if self.use_img_layernorm: - embeddings = self.layernorm(embeddings) - embeddings = self.dropout(embeddings) - - encoder_outputs = self.encoder( - embeddings, attention_mask, head_mask=head_mask) - - pred_score = self.cls_head(encoder_outputs[0]) - res_img_feats = self.residual(x) - pred_score = pred_score + res_img_feats - - if self.output_attentions and self.output_hidden_feats: - return pred_score, encoder_outputs[1], encoder_outputs[-1] - else: - return pred_score - - -def gelu(x): - """Implementation of the gelu activation function. 
- https://arxiv.org/abs/1606.08415 - """ - return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) - - -@register -class TransEncoder(nn.Layer): - def __init__(self, - vocab_size=30522, - num_hidden_layers=4, - num_attention_heads=4, - position_embeddings_size=512, - intermediate_size=3072, - input_feat_dim=[2048, 512, 128], - hidden_feat_dim=[1024, 256, 128], - attention_probs_dropout_prob=0.1, - fc_dropout_prob=0.1, - act_fn='gelu', - output_attentions=False, - output_hidden_feats=False): - super(TransEncoder, self).__init__() - output_feat_dim = input_feat_dim[1:] + [3] - trans_encoder = [] - for i in range(len(output_feat_dim)): - features_dims = input_feat_dim[i] - output_feature_dim = output_feat_dim[i] - hidden_size = hidden_feat_dim[i] - - # init a transformer encoder and append it to a list - assert hidden_size % num_attention_heads == 0 - model = METROEncoder(vocab_size, num_hidden_layers, features_dims, - position_embeddings_size, hidden_size, - intermediate_size, output_feature_dim, - num_attention_heads, - attention_probs_dropout_prob, fc_dropout_prob, - act_fn, output_attentions, output_hidden_feats) - trans_encoder.append(model) - self.trans_encoder = paddle.nn.Sequential(*trans_encoder) - - def forward(self, x): - out = self.trans_encoder(x) - return out diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/transformer_utils.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/transformer_utils.py deleted file mode 100644 index a0783e1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/transformer_utils.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddle.nn.initializer import TruncatedNormal, Constant, Assign - -# Common initializations -ones_ = Constant(value=1.) -zeros_ = Constant(value=0.) -trunc_normal_ = TruncatedNormal(std=.02) - - -# Common Layers -def drop_path(x, drop_prob=0., training=False): - """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... - """ - if drop_prob == 0. 
or not training: - return x - keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Identity(nn.Layer): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -# common funcs - - -def to_2tuple(x): - if isinstance(x, (list, tuple)): - return x - return tuple([x] * 2) - - -def add_parameter(layer, datas, name=None): - parameter = layer.create_parameter( - shape=(datas.shape), default_initializer=Assign(datas)) - if name: - layer.add_parameter(name, parameter) - return parameter - - -def window_partition(x, window_size): - """ - Partition into non-overlapping windows with padding if needed. - Args: - x (tensor): input tokens with [B, H, W, C]. - window_size (int): window size. - Returns: - windows: windows after partition with [B * num_windows, window_size, window_size, C]. - (Hp, Wp): padded height and width before partition - """ - B, H, W, C = paddle.shape(x) - - pad_h = (window_size - H % window_size) % window_size - pad_w = (window_size - W % window_size) % window_size - x = F.pad(x.transpose([0, 3, 1, 2]), - paddle.to_tensor( - [0, int(pad_w), 0, int(pad_h)], - dtype='int32')).transpose([0, 2, 3, 1]) - Hp, Wp = H + pad_h, W + pad_w - - num_h, num_w = Hp // window_size, Wp // window_size - - x = x.reshape([B, num_h, window_size, num_w, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) - return windows, (Hp, Wp), (num_h, num_w) - - -def window_unpartition(x, pad_hw, num_hw, hw): - """ - Window unpartition into original sequences and removing padding. - Args: - x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. - pad_hw (Tuple): padded height and width (Hp, Wp). - hw (Tuple): original height and width (H, W) before padding. - Returns: - x: unpartitioned sequences with [B, H, W, C]. 
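The drop_path above divides the surviving activations by keep_prob so the expected value is unchanged at train time. A small numpy sketch of that invariant (sample count and drop probability are illustrative):

import numpy as np

rng = np.random.default_rng(0)
drop_prob = 0.2
keep_prob = 1.0 - drop_prob
x = np.ones((10000, 1), dtype=np.float32)

# Per-sample binary mask, exactly as in drop_path: floor(keep_prob + U[0, 1)).
random_tensor = np.floor(keep_prob + rng.random(x.shape)).astype(np.float32)
out = (x / keep_prob) * random_tensor

print(random_tensor.mean())   # ~0.8: about 80% of residual branches survive
print(out.mean())             # ~1.0: the rescaling preserves the expectation of x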
- """ - Hp, Wp = pad_hw - num_h, num_w = num_hw - H, W = hw - B, window_size, _, C = paddle.shape(x) - B = B // (num_h * num_w) - x = x.reshape([B, num_h, num_w, window_size, window_size, C]) - x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C]) - - return x[:, :H, :W, :] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vgg.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vgg.py deleted file mode 100644 index e057532..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vgg.py +++ /dev/null @@ -1,210 +0,0 @@ -from __future__ import division - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn import Conv2D, MaxPool2D -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['VGG'] - -VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]} - - -class ConvBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels, - groups, - pool_size=2, - pool_stride=2, - pool_padding=0, - name=None): - super(ConvBlock, self).__init__() - - self.groups = groups - self.conv0 = nn.Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=3, - stride=1, - padding=1) - self.conv_out_list = [] - for i in range(1, groups): - conv_out = self.add_sublayer( - 'conv{}'.format(i), - Conv2D( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=3, - stride=1, - padding=1)) - self.conv_out_list.append(conv_out) - - self.pool = MaxPool2D( - kernel_size=pool_size, - stride=pool_stride, - padding=pool_padding, - ceil_mode=True) - - def forward(self, inputs): - out = self.conv0(inputs) - out = F.relu(out) - for conv_i in self.conv_out_list: - out = conv_i(out) - out = F.relu(out) - pool = self.pool(out) - return out, pool - - -class ExtraBlock(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - padding, - stride, - kernel_size, - name=None): - super(ExtraBlock, self).__init__() - - self.conv0 = Conv2D( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=1, - stride=1, - padding=0) - self.conv1 = Conv2D( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding) - - def forward(self, inputs): - out = self.conv0(inputs) - out = F.relu(out) - out = self.conv1(out) - out = F.relu(out) - return out - - -class L2NormScale(nn.Layer): - def __init__(self, num_channels, scale=1.0): - super(L2NormScale, self).__init__() - self.scale = self.create_parameter( - attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)), - shape=[num_channels]) - - def forward(self, inputs): - out = F.normalize(inputs, axis=1, epsilon=1e-10) - # out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( - # out) * out - out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out - return out - - -@register -@serializable -class VGG(nn.Layer): - def __init__(self, - depth=16, - normalizations=[20., -1, -1, -1, -1, -1], - extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], - [128, 256, 0, 1, 3], - [128, 256, 0, 1, 3]]): - super(VGG, self).__init__() - - assert depth in [16, 19], \ - "depth as 16/19 supported currently, but got {}".format(depth) - self.depth = depth - self.groups = VGG_cfg[depth] - self.normalizations = normalizations - self.extra_block_filters = extra_block_filters - - self._out_channels = [] - - self.conv_block_0 = ConvBlock( - 3, 64, self.groups[0], 2, 2, 0, name="conv1_") - self.conv_block_1 = ConvBlock( - 64, 128, 
self.groups[1], 2, 2, 0, name="conv2_") - self.conv_block_2 = ConvBlock( - 128, 256, self.groups[2], 2, 2, 0, name="conv3_") - self.conv_block_3 = ConvBlock( - 256, 512, self.groups[3], 2, 2, 0, name="conv4_") - self.conv_block_4 = ConvBlock( - 512, 512, self.groups[4], 3, 1, 1, name="conv5_") - self._out_channels.append(512) - - self.fc6 = Conv2D( - in_channels=512, - out_channels=1024, - kernel_size=3, - stride=1, - padding=6, - dilation=6) - self.fc7 = Conv2D( - in_channels=1024, - out_channels=1024, - kernel_size=1, - stride=1, - padding=0) - self._out_channels.append(1024) - - # extra block - self.extra_convs = [] - last_channels = 1024 - for i, v in enumerate(self.extra_block_filters): - assert len(v) == 5, "extra_block_filters size not fix" - extra_conv = self.add_sublayer("conv{}".format(6 + i), - ExtraBlock(last_channels, v[0], v[1], - v[2], v[3], v[4])) - last_channels = v[1] - self.extra_convs.append(extra_conv) - self._out_channels.append(last_channels) - - self.norms = [] - for i, n in enumerate(self.normalizations): - if n != -1: - norm = self.add_sublayer("norm{}".format(i), - L2NormScale( - self.extra_block_filters[i][1], n)) - else: - norm = None - self.norms.append(norm) - - def forward(self, inputs): - outputs = [] - - conv, pool = self.conv_block_0(inputs['image']) - conv, pool = self.conv_block_1(pool) - conv, pool = self.conv_block_2(pool) - conv, pool = self.conv_block_3(pool) - outputs.append(conv) - - conv, pool = self.conv_block_4(pool) - out = self.fc6(pool) - out = F.relu(out) - out = self.fc7(out) - out = F.relu(out) - outputs.append(out) - - if not self.extra_block_filters: - return outputs - - # extra block - for extra_conv in self.extra_convs: - out = extra_conv(out) - outputs.append(out) - - for i, n in enumerate(self.normalizations): - if n != -1: - outputs[i] = self.norms[i](outputs[i]) - - return outputs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vision_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vision_transformer.py deleted file mode 100644 index a21eefc..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vision_transformer.py +++ /dev/null @@ -1,652 +0,0 @@ -# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
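Before the transformer backbones, a quick arithmetic check on the removed vgg.py: VGG_cfg counts conv layers per ConvBlock, and the recorded output channels follow conv4, fc7, then each ExtraBlock's out_channels. A sketch using the defaults above:

# Values copied from the removed vgg.py defaults.
VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]}
extra_block_filters = [[256, 512, 1, 2, 3], [128, 256, 1, 2, 3],
                       [128, 256, 0, 1, 3], [128, 256, 0, 1, 3]]

print(sum(VGG_cfg[16]))   # 13 conv layers across the five ConvBlocks (VGG-16)

# _out_channels: 512 from conv4, 1024 from fc7, then v[1] per ExtraBlock.
out_channels = [512, 1024] + [v[1] for v in extra_block_filters]
print(out_channels)       # [512, 1024, 512, 256, 256, 256]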
- -import math - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import numpy as np -from paddle.nn.initializer import Constant - -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable - -from .transformer_utils import zeros_, DropPath, Identity - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0., - window_size=None): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) - - if qkv_bias: - self.q_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - self.v_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - else: - self.q_bias = None - self.v_bias = None - if window_size: - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - shape=(self.num_relative_distance, num_heads), - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2) - coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1) - relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone( - ) - - #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh - relative_coords = relative_coords.transpose( - (1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[ - 0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", - relative_position_index) - # trunc_normal_(self.relative_position_bias_table, std=.0) - else: - self.window_size = None - self.relative_position_bias_table = None - self.relative_position_index = None - - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, rel_pos_bias=None): - x_shape = paddle.shape(x) - N, C = x_shape[1], x_shape[2] - - qkv_bias = None - if 
self.q_bias is not None: - qkv_bias = paddle.concat( - (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) - qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) - - qkv = qkv.reshape((-1, N, 3, self.num_heads, - C // self.num_heads)).transpose((2, 0, 3, 1, 4)) - q, k, v = qkv[0], qkv[1], qkv[2] - attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - - if self.relative_position_bias_table is not None: - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1 - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - (2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - if rel_pos_bias is not None: - attn = attn + rel_pos_bias - - attn = nn.functional.softmax(attn, axis=-1) - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - window_size=None, - init_values=None, - act_layer=nn.GELU, - norm_layer='nn.LayerNorm', - epsilon=1e-5): - super().__init__() - self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() - self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - if init_values is not None: - self.gamma_1 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - self.gamma_2 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - else: - self.gamma_1, self.gamma_2 = None, None - - def forward(self, x, rel_pos_bias=None): - - if self.gamma_1 is None: - x = x + self.drop_path( - self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ - - def __init__(self, - img_size=[224, 224], - patch_size=16, - in_chans=3, - embed_dim=768): - super().__init__() - self.num_patches_w = img_size[0] // patch_size - self.num_patches_h = img_size[1] // patch_size - - num_patches = self.num_patches_w * self.num_patches_h - self.patch_shape = (img_size[0] // patch_size, - img_size[1] // patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - @property - def num_patches_in_h(self): - return self.img_size[1] // self.patch_size - - @property - def num_patches_in_w(self): - return self.img_size[0] // self.patch_size - - def forward(self, x, mask=None): - B, C, H, W = x.shape - return self.proj(x) - - -class RelativePositionBias(nn.Layer): - def 
__init__(self, window_size, num_heads): - super().__init__() - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - shape=(self.num_relative_distance, num_heads), - default_initialize=zeros_) - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = coords.flatten(1) # 2, Wh*Ww - - relative_coords = coords_flatten[:, :, - None] - coords_flatten[:, - None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpos( - (1, 2, 0)) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", relative_position_index) - - def forward(self): - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH - return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww - - -def get_sinusoid_encoding_table(n_position, d_hid, token=False): - ''' Sinusoid position encoding table ''' - - def get_position_angle_vec(position): - return [ - position / np.power(10000, 2 * (hid_j // 2) / d_hid) - for hid_j in range(d_hid) - ] - - sinusoid_table = np.array( - [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - if token: - sinusoid_table = np.concatenate( - [sinusoid_table, np.zeros([1, d_hid])], dim=0) - - return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0) - - -@register -@serializable -class VisionTransformer(nn.Layer): - """ Vision Transformer with support for patch input - """ - - def __init__(self, - img_size=[672, 1092], - patch_size=16, - in_chans=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_layer='nn.LayerNorm', - init_values=None, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - epsilon=1e-5, - final_norm=False, - pretrained=None, - out_indices=[3, 5, 7, 11], - use_abs_pos_emb=False, - use_sincos_pos_emb=True, - with_fpn=True, - num_fpn_levels=4, - use_checkpoint=False, - **args): - super().__init__() - self.img_size = img_size - self.embed_dim = embed_dim - self.with_fpn = with_fpn - self.use_checkpoint = use_checkpoint - self.use_sincos_pos_emb = use_sincos_pos_emb - self.use_rel_pos_bias = use_rel_pos_bias - self.final_norm = final_norm - self.out_indices = out_indices - self.num_fpn_levels = num_fpn_levels - - if use_checkpoint: - paddle.seed(0) - - self.patch_embed = 
PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim) - - self.pos_w = self.patch_embed.num_patches_in_w - self.pos_h = self.patch_embed.num_patches_in_h - - self.cls_token = self.create_parameter( - shape=(1, 1, embed_dim), - default_initializer=paddle.nn.initializer.Constant(value=0.)) - - if use_abs_pos_emb: - self.pos_embed = self.create_parameter( - shape=(1, self.pos_w * self.pos_h + 1, embed_dim), - default_initializer=paddle.nn.initializer.TruncatedNormal( - std=.02)) - elif use_sincos_pos_emb: - pos_embed = self.build_2d_sincos_position_embedding(embed_dim) - - self.pos_embed = pos_embed - self.pos_embed = self.create_parameter(shape=pos_embed.shape) - self.pos_embed.set_value(pos_embed.numpy()) - self.pos_embed.stop_gradient = True - - else: - self.pos_embed = None - - self.pos_drop = nn.Dropout(p=drop_rate) - - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias( - window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - dpr = np.linspace(0, drop_path_rate, depth) - - self.blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape - if use_rel_pos_bias else None, - epsilon=epsilon) for i in range(depth) - ]) - - self.pretrained = pretrained - self.init_weight() - - assert len(out_indices) <= 4, '' - self.out_indices = out_indices - self.out_channels = [embed_dim for _ in range(num_fpn_levels)] - self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [ - patch_size for _ in range(len(out_indices)) - ] - - self.norm = Identity() - - if self.with_fpn: - assert num_fpn_levels <= 4, '' - self.init_fpn( - embed_dim=embed_dim, - patch_size=patch_size, ) - - def init_weight(self): - pretrained = self.pretrained - - if pretrained: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - - load_state_dict = paddle.load(path) - model_state_dict = self.state_dict() - pos_embed_name = "pos_embed" - - if pos_embed_name in load_state_dict.keys(): - load_pos_embed = paddle.to_tensor( - load_state_dict[pos_embed_name], dtype="float32") - if self.pos_embed.shape != load_pos_embed.shape: - pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) - model_state_dict[pos_embed_name] = self.resize_pos_embed( - load_pos_embed, (pos_size, pos_size), - (self.pos_h, self.pos_w)) - - # self.set_state_dict(model_state_dict) - load_state_dict[pos_embed_name] = model_state_dict[ - pos_embed_name] - - print("Load pos_embed and resize it from {} to {} .".format( - load_pos_embed.shape, self.pos_embed.shape)) - - self.set_state_dict(load_state_dict) - print("Load load_state_dict....") - - def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): - if patch_size == 16: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), - nn.BatchNorm2D(embed_dim), - nn.GELU(), - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn3 = Identity() - - self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) - elif patch_size == 8: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - 
embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = Identity() - - self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) - - self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) - - if not out_with_norm: - self.norm = Identity() - else: - self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) - - def interpolate_pos_encoding(self, x, w, h): - npatch = x.shape[1] - 1 - N = self.pos_embed.shape[1] - 1 - w0 = w // self.patch_embed.patch_size - h0 = h // self.patch_embed.patch_size - if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h: - return self.pos_embed - class_pos_embed = self.pos_embed[:, 0] - patch_pos_embed = self.pos_embed[:, 1:] - dim = x.shape[-1] - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - # w0, h0 = w0 + 0.1, h0 + 0.1 - # patch_pos_embed = nn.functional.interpolate( - # patch_pos_embed.reshape([ - # 1, self.patch_embed.num_patches_w, - # self.patch_embed.num_patches_h, dim - # ]).transpose((0, 3, 1, 2)), - # scale_factor=(w0 / self.patch_embed.num_patches_w, - # h0 / self.patch_embed.num_patches_h), - # mode='bicubic', ) - - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.reshape([ - 1, self.patch_embed.num_patches_w, - self.patch_embed.num_patches_h, dim - ]).transpose((0, 3, 1, 2)), - (w0, h0), - mode='bicubic', ) - - assert int(w0) == patch_pos_embed.shape[-2] and int( - h0) == patch_pos_embed.shape[-1] - patch_pos_embed = patch_pos_embed.transpose( - (0, 2, 3, 1)).reshape([1, -1, dim]) - return paddle.concat( - (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1) - - def resize_pos_embed(self, pos_embed, old_hw, new_hw): - """ - Resize pos_embed weight. - Args: - pos_embed (Tensor): the pos_embed weight - old_hw (list[int]): the height and width of old pos_embed - new_hw (list[int]): the height and width of new pos_embed - Returns: - Tensor: the resized pos_embed weight - """ - cls_pos_embed = pos_embed[:, :1, :] - pos_embed = pos_embed[:, 1:, :] - - pos_embed = pos_embed.transpose([0, 2, 1]) - pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) - pos_embed = F.interpolate( - pos_embed, new_hw, mode='bicubic', align_corners=False) - pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) - pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) - - return pos_embed - - def build_2d_sincos_position_embedding( - self, - embed_dim=768, - temperature=10000., ): - h, w = self.patch_embed.patch_shape - grid_w = paddle.arange(w, dtype=paddle.float32) - grid_h = paddle.arange(h, dtype=paddle.float32) - grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) - assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = 1. 
/ (temperature**omega) - - out_w = grid_w.flatten()[..., None] @omega[None] - out_h = grid_h.flatten()[..., None] @omega[None] - - pos_emb = paddle.concat( - [ - paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), - paddle.cos(out_h) - ], - axis=1)[None, :, :] - - pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) - pos_embed = paddle.concat([pe_token, pos_emb], axis=1) - # pos_embed.stop_gradient = True - - return pos_embed - - def forward(self, x): - x = x['image'] if isinstance(x, dict) else x - _, _, h, w = x.shape - - x = self.patch_embed(x) - - B, D, Hp, Wp = x.shape # b * c * h * w - - cls_tokens = self.cls_token.expand( - (B, self.cls_token.shape[-2], self.cls_token.shape[-1])) - x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c - x = paddle.concat([cls_tokens, x], axis=1) - - if self.pos_embed is not None: - # x = x + self.interpolate_pos_encoding(x, w, h) - x = x + self.interpolate_pos_encoding(x, h, w) - - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias( - ) if self.rel_pos_bias is not None else None - - feats = [] - for idx, blk in enumerate(self.blocks): - if self.use_checkpoint and self.training: - x = paddle.distributed.fleet.utils.recompute( - blk, x, rel_pos_bias, **{"preserve_rng_state": True}) - else: - x = blk(x, rel_pos_bias) - - if idx in self.out_indices: - xp = paddle.reshape( - paddle.transpose( - self.norm(x[:, 1:, :]), perm=[0, 2, 1]), - shape=[B, D, Hp, Wp]) - feats.append(xp) - - if self.with_fpn: - fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][ - -self.num_fpn_levels:] - assert len(fpns) == len(feats) or len(feats) == 1, '' - outputs = [] - for i, m in enumerate(fpns): - outputs.append( - m(feats[i] if len(feats) == len(fpns) else feats[-1])) - - return outputs - - return feats - - @property - def num_layers(self): - return len(self.blocks) - - @property - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=c, stride=s) - for c, s in zip(self.out_channels, self.out_strides) - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vit_mae.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vit_mae.py deleted file mode 100644 index 8d00da7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vit_mae.py +++ /dev/null @@ -1,749 +0,0 @@ -# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
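The build_2d_sincos_position_embedding removed above splits embed_dim into four quarters: sin/cos over the w grid and sin/cos over the h grid. A numpy sketch with tiny, assumed dimensions:

import numpy as np

h, w, embed_dim, temperature = 2, 3, 8, 10000.0
assert embed_dim % 4 == 0
pos_dim = embed_dim // 4

# 'ij' indexing mirrors paddle.meshgrid(grid_w, grid_h).
gw, gh = np.meshgrid(np.arange(w, dtype=np.float32),
                     np.arange(h, dtype=np.float32), indexing='ij')
omega = 1.0 / temperature ** (np.arange(pos_dim, dtype=np.float32) / pos_dim)

out_w = gw.reshape(-1, 1) @ omega[None]   # [h*w, pos_dim]
out_h = gh.reshape(-1, 1) @ omega[None]
pos_emb = np.concatenate([np.sin(out_w), np.cos(out_w),
                          np.sin(out_h), np.cos(out_h)], axis=1)
print(pos_emb.shape)   # (6, 8): one embed_dim vector per patch; the cls slot is prepended separately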
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import numpy as np -import math -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Constant, TruncatedNormal - -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable - -from .transformer_utils import (zeros_, DropPath, Identity, window_partition, - window_unpartition) -from ..initializer import linear_init_ - -__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid'] - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer='nn.GELU', - drop=0., - lr_factor=1.0): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear( - in_features, - hidden_features, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - self.act = eval(act_layer)() - self.fc2 = nn.Linear( - hidden_features, - out_features, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - self.drop = nn.Dropout(drop) - - self._init_weights() - - def _init_weights(self): - linear_init_(self.fc1) - linear_init_(self.fc2) - - def forward(self, x): - x = self.drop(self.act(self.fc1(x))) - x = self.drop(self.fc2(x)) - return x - - -class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - attn_bias=False, - attn_drop=0., - proj_drop=0., - use_rel_pos=False, - rel_pos_zero_init=True, - window_size=None, - input_size=None, - qk_scale=None, - lr_factor=1.0): - super().__init__() - self.num_heads = num_heads - self.head_dim = dim // num_heads - self.scale = qk_scale or self.head_dim**-0.5 - self.use_rel_pos = use_rel_pos - self.input_size = input_size - self.rel_pos_zero_init = rel_pos_zero_init - self.window_size = window_size - self.lr_factor = lr_factor - - self.qkv = nn.Linear( - dim, - dim * 3, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor) - if attn_bias else False) - if qkv_bias: - self.q_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - self.v_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - else: - self.q_bias = None - self.v_bias = None - self.proj = nn.Linear( - dim, - dim, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - self.attn_drop = nn.Dropout(attn_drop) - if window_size is None: - self.window_size = self.input_size[0] - - self._init_weights() - - def _init_weights(self): - linear_init_(self.qkv) - linear_init_(self.proj) - - if self.use_rel_pos: - self.rel_pos_h = self.create_parameter( - [2 * self.window_size - 1, self.head_dim], - attr=ParamAttr(learning_rate=self.lr_factor), - default_initializer=Constant(value=0.)) - self.rel_pos_w = self.create_parameter( - [2 * self.window_size - 1, self.head_dim], - attr=ParamAttr(learning_rate=self.lr_factor), - default_initializer=Constant(value=0.)) - - if not self.rel_pos_zero_init: - TruncatedNormal(self.rel_pos_h, std=0.02) - TruncatedNormal(self.rel_pos_w, std=0.02) - - def get_rel_pos(self, seq_size, rel_pos): - max_rel_dist = int(2 * seq_size - 1) - # Interpolate rel pos if needed. - if rel_pos.shape[0] != max_rel_dist: - # Interpolate rel pos. 
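The get_rel_pos being defined here indexes a (2S - 1)-row table with (i - j) + (S - 1), so every offset between query position i and key position j gets its own row. A tiny numpy sketch for S = 4:

import numpy as np

S = 4
coords = np.arange(S)
rel = coords[:, None] - coords[None, :] + (S - 1)
print(rel)
# [[3 2 1 0]
#  [4 3 2 1]
#  [5 4 3 2]
#  [6 5 4 3]]  -> valid rows 0..6 of a table with 2*4 - 1 = 7 entries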
- rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1]) - rel_pos = rel_pos.transpose([0, 2, 1]) - rel_pos_resized = F.interpolate( - rel_pos, - size=(max_rel_dist, ), - mode="linear", - data_format='NCW') - rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]) - rel_pos_resized = rel_pos_resized.transpose([1, 0]) - else: - rel_pos_resized = rel_pos - - coords = paddle.arange(seq_size, dtype='float32') - relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0) - relative_coords += (seq_size - 1) - relative_coords = relative_coords.astype('int64').flatten() - - return paddle.index_select(rel_pos_resized, relative_coords).reshape( - [seq_size, seq_size, self.head_dim]) - - def add_decomposed_rel_pos(self, attn, q, h, w): - """ - Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. - Args: - attn (Tensor): attention map. - q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). - Returns: - attn (Tensor): attention map with added relative positional embeddings. - """ - Rh = self.get_rel_pos(h, self.rel_pos_h) - Rw = self.get_rel_pos(w, self.rel_pos_w) - - B, _, dim = q.shape - r_q = q.reshape([B, h, w, dim]) - # bhwc, hch->bhwh1 - # bwhc, wcw->bhw1w - rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1) - rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2) - - attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w - return attn.reshape([B, h * w, h * w]) - - def forward(self, x): - B, H, W, C = paddle.shape(x) - - if self.q_bias is not None: - qkv_bias = paddle.concat( - (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) - qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) - else: - qkv = self.qkv(x).reshape( - [B, H * W, 3, self.num_heads, self.head_dim]).transpose( - [2, 0, 3, 1, 4]).reshape( - [3, B * self.num_heads, H * W, self.head_dim]) - - q, k, v = qkv[0], qkv[1], qkv[2] - attn = q.matmul(k.transpose([0, 2, 1])) * self.scale - - if self.use_rel_pos: - attn = self.add_decomposed_rel_pos(attn, q, H, W) - - attn = F.softmax(attn, axis=-1) - attn = self.attn_drop(attn) - x = attn.matmul(v).reshape( - [B, self.num_heads, H * W, self.head_dim]).transpose( - [0, 2, 1, 3]).reshape([B, H, W, C]) - x = self.proj(x) - return x - - -class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - attn_bias=False, - qk_scale=None, - init_values=None, - drop=0., - attn_drop=0., - drop_path=0., - use_rel_pos=True, - rel_pos_zero_init=True, - window_size=None, - input_size=None, - act_layer='nn.GELU', - norm_layer='nn.LayerNorm', - lr_factor=1.0, - epsilon=1e-5): - super().__init__() - self.window_size = window_size - - self.norm1 = eval(norm_layer)(dim, - weight_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - bias_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - epsilon=epsilon) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - attn_bias=attn_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - use_rel_pos=use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, - window_size=window_size, - input_size=input_size, - lr_factor=lr_factor) - - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() - self.norm2 = eval(norm_layer)(dim, - weight_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - bias_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - epsilon=epsilon) - self.mlp = Mlp(in_features=dim, - hidden_features=int(dim * mlp_ratio), - act_layer=act_layer, - drop=drop, - lr_factor=lr_factor) - if init_values is not None: - self.gamma_1 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - self.gamma_2 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - else: - self.gamma_1, self.gamma_2 = None, None - - def forward(self, x): - y = self.norm1(x) - if self.window_size is not None: - y, pad_hw, num_hw = window_partition(y, self.window_size) - y = self.attn(y) - if self.gamma_1 is not None: - y = self.gamma_1 * y - - if self.window_size is not None: - y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2])) - x = x + self.drop_path(y) - if self.gamma_2 is None: - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - - return x - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ - - def __init__(self, - img_size=(224, 224), - patch_size=16, - in_chans=3, - embed_dim=768, - lr_factor=0.01): - super().__init__() - self.img_size = img_size - self.patch_size = patch_size - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=patch_size, - stride=patch_size, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - - @property - def num_patches_in_h(self): - return self.img_size[1] // self.patch_size - - @property - def num_patches_in_w(self): - return self.img_size[0] // self.patch_size - - def forward(self, x): - out = self.proj(x) - return out - - -@register -@serializable -class VisionTransformer2D(nn.Layer): - """ Vision Transformer with support for patch input - """ - - def __init__(self, - img_size=(1024, 1024), - patch_size=16, - in_chans=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=False, - attn_bias=False, - qk_scale=None, - init_values=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - act_layer='nn.GELU', - norm_layer='nn.LayerNorm', - lr_decay_rate=1.0, - global_attn_indexes=(2, 5, 8, 11), - use_abs_pos=False, - use_rel_pos=False, - use_abs_pos_emb=False, - use_sincos_pos_emb=False, - rel_pos_zero_init=True, - epsilon=1e-5, - final_norm=False, - pretrained=None, - window_size=None, - out_indices=(11, ), - with_fpn=False, - use_checkpoint=False, - *args, - **kwargs): - super().__init__() - self.img_size = img_size - self.patch_size = patch_size - self.embed_dim = embed_dim - self.num_heads = num_heads - self.depth = depth - self.global_attn_indexes = global_attn_indexes - self.epsilon = epsilon - self.with_fpn = with_fpn - self.use_checkpoint = use_checkpoint - - self.patch_h = img_size[0] // patch_size - self.patch_w = img_size[1] // patch_size - self.num_patches = self.patch_h * self.patch_w - self.use_abs_pos = use_abs_pos - self.use_abs_pos_emb = use_abs_pos_emb - - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim) - - dpr = np.linspace(0, drop_path_rate, depth) - if use_checkpoint: - paddle.seed(0) - - if use_abs_pos_emb: - self.pos_w = self.patch_embed.num_patches_in_w - self.pos_h = self.patch_embed.num_patches_in_h - self.pos_embed = self.create_parameter( - shape=(1, 
self.pos_w * self.pos_h + 1, embed_dim), - default_initializer=paddle.nn.initializer.TruncatedNormal( - std=.02)) - elif use_sincos_pos_emb: - pos_embed = self.get_2d_sincos_position_embedding(self.patch_h, - self.patch_w) - - self.pos_embed = pos_embed - self.pos_embed = self.create_parameter(shape=pos_embed.shape) - self.pos_embed.set_value(pos_embed.numpy()) - self.pos_embed.stop_gradient = True - else: - self.pos_embed = None - - self.blocks = nn.LayerList([ - Block( - embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - attn_bias=attn_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - use_rel_pos=use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, - window_size=None - if i in self.global_attn_indexes else window_size, - input_size=[self.patch_h, self.patch_w], - act_layer=act_layer, - lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate), - norm_layer=norm_layer, - init_values=init_values, - epsilon=epsilon) for i in range(depth) - ]) - - assert len(out_indices) <= 4, 'out_indices out of bound' - self.out_indices = out_indices - self.pretrained = pretrained - self.init_weight() - - self.out_channels = [embed_dim for _ in range(len(out_indices))] - self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ - patch_size for _ in range(len(out_indices)) - ] - self.norm = Identity() - if self.with_fpn: - self.init_fpn( - embed_dim=embed_dim, - patch_size=patch_size, - out_with_norm=final_norm) - - def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate): - return lr_decay_rate**(self.depth - layer_id) - - def init_weight(self): - pretrained = self.pretrained - if pretrained: - if 'http' in pretrained: - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: - path = pretrained - - load_state_dict = paddle.load(path) - model_state_dict = self.state_dict() - pos_embed_name = "pos_embed" - - if pos_embed_name in load_state_dict.keys( - ) and self.use_abs_pos_emb: - load_pos_embed = paddle.to_tensor( - load_state_dict[pos_embed_name], dtype="float32") - if self.pos_embed.shape != load_pos_embed.shape: - pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) - model_state_dict[pos_embed_name] = self.resize_pos_embed( - load_pos_embed, (pos_size, pos_size), - (self.pos_h, self.pos_w)) - - # self.set_state_dict(model_state_dict) - load_state_dict[pos_embed_name] = model_state_dict[ - pos_embed_name] - - print("Load pos_embed and resize it from {} to {} .".format( - load_pos_embed.shape, self.pos_embed.shape)) - - self.set_state_dict(load_state_dict) - print("Load load_state_dict....") - - def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): - if patch_size == 16: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), - nn.BatchNorm2D(embed_dim), - nn.GELU(), - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn3 = Identity() - - self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) - elif patch_size == 8: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = Identity() - - self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) - - self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) - - if not out_with_norm: - self.norm = Identity() - else: - self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon) - - 
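get_vit_lr_decay_rate above implements layer-wise learning-rate decay: the multiplier shrinks geometrically with distance from the last block. A quick sketch with an assumed depth of 12 and decay rate 0.9:

depth, lr_decay_rate = 12, 0.9
factors = [lr_decay_rate ** (depth - layer_id) for layer_id in range(depth)]
print([round(f, 3) for f in factors])
# [0.282, 0.314, 0.349, ..., 0.81, 0.9]: early blocks train with a much
# smaller effective lr than the blocks nearest the head.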
def resize_pos_embed(self, pos_embed, old_hw, new_hw): - """ - Resize pos_embed weight. - Args: - pos_embed (Tensor): the pos_embed weight - old_hw (list[int]): the height and width of old pos_embed - new_hw (list[int]): the height and width of new pos_embed - Returns: - Tensor: the resized pos_embed weight - """ - cls_pos_embed = pos_embed[:, :1, :] - pos_embed = pos_embed[:, 1:, :] - - pos_embed = pos_embed.transpose([0, 2, 1]) - pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) - pos_embed = F.interpolate( - pos_embed, new_hw, mode='bicubic', align_corners=False) - pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) - pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) - - return pos_embed - - def get_2d_sincos_position_embedding(self, h, w, temperature=10000.): - grid_y, grid_x = paddle.meshgrid( - paddle.arange( - h, dtype=paddle.float32), - paddle.arange( - w, dtype=paddle.float32)) - assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = self.embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = (1. / (temperature**omega)).unsqueeze(0) - - out_x = grid_x.reshape([-1, 1]).matmul(omega) - out_y = grid_y.reshape([-1, 1]).matmul(omega) - - pos_emb = paddle.concat( - [ - paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x), - paddle.cos(out_x) - ], - axis=1) - - return pos_emb.reshape([1, h, w, self.embed_dim]) - - def forward(self, inputs): - x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1]) - B, Hp, Wp, _ = paddle.shape(x) - - if self.use_abs_pos: - x = x + self.get_2d_sincos_position_embedding(Hp, Wp) - - if self.use_abs_pos_emb: - x = x + self.resize_pos_embed(self.pos_embed, - (self.pos_h, self.pos_w), (Hp, Wp)) - - feats = [] - for idx, blk in enumerate(self.blocks): - if self.use_checkpoint and self.training: - x = paddle.distributed.fleet.utils.recompute( - blk, x, **{"preserve_rng_state": True}) - else: - x = blk(x) - if idx in self.out_indices: - feats.append(self.norm(x.transpose([0, 3, 1, 2]))) - - if self.with_fpn: - fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] - for i in range(len(feats)): - feats[i] = fpns[i](feats[i]) - return feats - - @property - def num_layers(self): - return len(self.blocks) - - @property - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=c, stride=s) - for c, s in zip(self.out_channels, self.out_strides) - ] - - -class LayerNorm(nn.Layer): - """ - A LayerNorm variant, popularized by Transformers, that performs point-wise mean and - variance normalization over the channel dimension for inputs that have shape - (batch_size, channels, height, width). - Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid. 
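The LayerNorm variant defined here normalizes over the channel axis of an NCHW tensor, unlike nn.LayerNorm, which normalizes the trailing dimensions. A numpy rendering of its forward pass (random data, assumed shapes):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 8, 4, 4)).astype(np.float32)   # N, C, H, W
weight = np.ones(8, dtype=np.float32)
bias = np.zeros(8, dtype=np.float32)
eps = 1e-6

u = x.mean(axis=1, keepdims=True)                 # per-position mean over C
s = ((x - u) ** 2).mean(axis=1, keepdims=True)    # per-position variance over C
y = (x - u) / np.sqrt(s + eps)
y = weight[None, :, None, None] * y + bias[None, :, None, None]

print(np.abs(y.mean(axis=1)).max())   # ~0: each spatial position is normalized over channels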
- - In ViT, we use the nn.LayerNorm - """ - - def __init__(self, normalized_shape, eps=1e-6): - super().__init__() - self.weight = self.create_parameter([normalized_shape]) - self.bias = self.create_parameter([normalized_shape]) - self.eps = eps - self.normalized_shape = (normalized_shape, ) - - def forward(self, x): - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / paddle.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -@register -@serializable -class SimpleFeaturePyramid(nn.Layer): - def __init__(self, - in_channels, - out_channels, - spatial_scales, - num_levels=4, - use_bias=False): - """ - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config - out_channel (int): output channel of each level. - spatial_scales (list[float]): list of scaling factors to upsample or downsample - the input features for creating pyramid features which can be derived from - the output shape of backbone by from_config - num_levels (int): number of levels of output features. - use_bias (bool): whether use bias or not. - """ - super(SimpleFeaturePyramid, self).__init__() - - self.in_channels = in_channels[0] - self.out_channels = out_channels - self.num_levels = num_levels - - self.stages = [] - dim = self.in_channels - if num_levels == 4: - scale_factors = [2.0, 1.0, 0.5] - elif num_levels == 5: - scale_factors = [4.0, 2.0, 1.0, 0.5] - else: - raise NotImplementedError( - f"num_levels={num_levels} is not supported yet.") - - dim = in_channels[0] - for idx, scale in enumerate(scale_factors): - out_dim = dim - if scale == 4.0: - layers = [ - nn.Conv2DTranspose( - dim, dim // 2, kernel_size=2, stride=2), - nn.LayerNorm(dim // 2), - nn.GELU(), - nn.Conv2DTranspose( - dim // 2, dim // 4, kernel_size=2, stride=2), - ] - out_dim = dim // 4 - elif scale == 2.0: - layers = [ - nn.Conv2DTranspose( - dim, dim // 2, kernel_size=2, stride=2) - ] - out_dim = dim // 2 - elif scale == 1.0: - layers = [] - elif scale == 0.5: - layers = [nn.MaxPool2D(kernel_size=2, stride=2)] - - layers.extend([ - nn.Conv2D( - out_dim, - out_channels, - kernel_size=1, - bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D( - out_channels, - out_channels, - kernel_size=3, - padding=1, - bias_attr=use_bias, ), LayerNorm(out_channels) - ]) - layers = nn.Sequential(*layers) - - stage = -int(math.log2(spatial_scales[0] * scale_factors[idx])) - self.add_sublayer(f"simfp_{stage}", layers) - self.stages.append(layers) - - # top block output feature maps. - self.top_block = nn.Sequential( - nn.MaxPool2D( - kernel_size=1, stride=2, padding=0)) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'spatial_scales': [1.0 / i.stride for i in input_shape], - } - - @property - def out_shape(self): - return [ - ShapeSpec(channels=self.out_channels) - for _ in range(self.num_levels) - ] - - def forward(self, feats): - """ - Args: - x: Tensor of shape (N,C,H,W). 
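SimpleFeaturePyramid builds every level from one ViT feature map: with a stride-16 backbone and num_levels=4, the scale factors 2.0/1.0/0.5 yield strides 8/16/32, the top block adds stride 64, and the sublayer name comes from -log2(spatial_scale * scale). A quick check of that arithmetic:

import math

spatial_scale = 1.0 / 16   # stride-16 ViT feature, as produced by from_config
for scale in [2.0, 1.0, 0.5]:
    stage = -int(math.log2(spatial_scale * scale))
    stride = int(1 / (spatial_scale * scale))
    print(f"scale {scale} -> stride {stride}, sublayer simfp_{stage}")
# scale 2.0 -> stride 8, simfp_3; 1.0 -> stride 16, simfp_4; 0.5 -> stride 32, simfp_5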
- """ - features = feats[0] - results = [] - - for stage in self.stages: - results.append(stage(features)) - - top_block_in_feature = results[-1] - results.append(self.top_block(top_block_in_feature)) - assert self.num_levels == len(results) - - return results diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vitpose.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vitpose.py deleted file mode 100644 index 23e00be..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vitpose.py +++ /dev/null @@ -1,320 +0,0 @@ -# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# reference: https://arxiv.org/abs/2010.11929 - -from collections.abc import Callable - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.nn.initializer import TruncatedNormal, Constant, Normal -from ppdet.core.workspace import register, serializable - -trunc_normal_ = TruncatedNormal(std=.02) - - -def to_2tuple(x): - if isinstance(x, (list, tuple)): - return x - return tuple([x] * 2) - - -def drop_path(x, drop_prob=0., training=False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... - """ - if drop_prob == 0. or not training: - return x - keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
- """ - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Identity(nn.Layer): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - - N, C = x.shape[1:] - qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // - self.num_heads)).transpose((2, 0, 3, 1, 4)) - - q, k, v = qkv[0], qkv[1], qkv[2] - - attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - attn = nn.functional.softmax(attn, axis=-1) - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) - x = self.proj(x) - - x = self.proj_drop(x) - return x - - -class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer='nn.LayerNorm', - epsilon=1e-5): - super().__init__() - if isinstance(norm_layer, str): - self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm1 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() - if isinstance(norm_layer, str): - self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm2 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - - def forward(self, x): - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ - - def __init__(self, - img_size=224, - patch_size=16, - in_chans=3, - embed_dim=768, - ratio=1): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - - num_patches = (img_size[1] // patch_size[1]) * ( - img_size[0] // patch_size[0]) * (ratio**2) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=patch_size, - stride=(patch_size[0] // ratio), - padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1))) - - def forward(self, x): - B, C, H, W = x.shape - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." - - x = self.proj(x) - return x - - -@register -@serializable -class ViT(nn.Layer): - """ Vision Transformer with support for patch input - - This module is different from ppdet's VisionTransformer (from ppdet/modeling/backbones/visio_transformer.py), - the main differences are: - 1.the module PatchEmbed.proj has padding set,padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1), - VisionTransformer dose not - 2.Attention module qkv is standard.but VisionTransformer provide more options - 3.MLP module only one Dropout,and VisionTransformer twice; - 4.VisionTransformer provide fpn layer,but the module does not. 
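The ratio argument in the removed vitpose PatchEmbed shrinks the projection stride and adjusts padding, so the patch grid densifies by ratio**2. A sketch of that bookkeeping with an assumed 256x192 pose input (the class default is 224):

img_size, patch_size = (256, 192), (16, 16)   # assumed pose-style input
for ratio in (1, 2):
    stride = patch_size[0] // ratio
    padding = 4 + 2 * (ratio // 2 - 1)
    num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * ratio ** 2
    print(ratio, stride, padding, num_patches)
# ratio=1 -> stride 16, padding 2, 192 patches
# ratio=2 -> stride 8,  padding 4, 768 patches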
- - """ - - def __init__(self, - img_size=224, - patch_size=16, - in_chans=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_layer='nn.LayerNorm', - epsilon=1e-5, - ratio=1, - pretrained=None, - **kwargs): - super().__init__() - - self.pretrained = pretrained - self.num_features = self.embed_dim = embed_dim - - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - ratio=ratio) - num_patches = self.patch_embed.num_patches - - self.pos_embed = self.create_parameter( - shape=(1, num_patches + 1, embed_dim), - default_initializer=trunc_normal_) - self.add_parameter("pos_embed", self.pos_embed) - - dpr = np.linspace(0, drop_path_rate, depth, dtype='float32') - - self.blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - epsilon=epsilon) for i in range(depth) - ]) - - self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon) - trunc_normal_(self.pos_embed) - self._init_weights() - - def _init_weights(self): - pretrained = self.pretrained - - if pretrained: - - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - - load_state_dict = paddle.load(path) - self.set_state_dict(load_state_dict) - print("Load load_state_dict:", path) - - def forward_features(self, x): - - B = paddle.shape(x)[0] - x = self.patch_embed(x) - B, D, Hp, Wp = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] - - for blk in self.blocks: - x = blk(x) - - x = self.last_norm(x) - xp = paddle.reshape( - paddle.transpose( - x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp]) - - return xp diff --git a/pdfdet/models/Paddle/ppdet/modeling/bbox_utils.py b/pdfdet/models/Paddle/ppdet/modeling/bbox_utils.py deleted file mode 100644 index 576cbbf..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/bbox_utils.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import numpy as np - - -def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]): - """Encode bboxes to deltas. 
- """ - src_w = src_boxes[:, 2] - src_boxes[:, 0] - src_h = src_boxes[:, 3] - src_boxes[:, 1] - src_ctr_x = src_boxes[:, 0] + 0.5 * src_w - src_ctr_y = src_boxes[:, 1] + 0.5 * src_h - - tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] - tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] - tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w - tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h - - wx, wy, ww, wh = weights - dx = wx * (tgt_ctr_x - src_ctr_x) / src_w - dy = wy * (tgt_ctr_y - src_ctr_y) / src_h - dw = ww * paddle.log(tgt_w / src_w) - dh = wh * paddle.log(tgt_h / src_h) - - deltas = paddle.stack((dx, dy, dw, dh), axis=1) - return deltas - - -def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): - """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. - Note: return tensor shape [n,1,4] - If you want to add a reshape, please add after the calling code instead of here. - """ - clip_scale = math.log(1000.0 / 16) - - widths = boxes[:, 2] - boxes[:, 0] - heights = boxes[:, 3] - boxes[:, 1] - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - wx, wy, ww, wh = weights - dx = deltas[:, 0::4] / wx - dy = deltas[:, 1::4] / wy - dw = deltas[:, 2::4] / ww - dh = deltas[:, 3::4] / wh - # Prevent sending too large values into paddle.exp() - dw = paddle.clip(dw, max=clip_scale) - dh = paddle.clip(dh, max=clip_scale) - - pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) - pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) - pred_w = paddle.exp(dw) * widths.unsqueeze(1) - pred_h = paddle.exp(dh) * heights.unsqueeze(1) - - pred_boxes = [] - pred_boxes.append(pred_ctr_x - 0.5 * pred_w) - pred_boxes.append(pred_ctr_y - 0.5 * pred_h) - pred_boxes.append(pred_ctr_x + 0.5 * pred_w) - pred_boxes.append(pred_ctr_y + 0.5 * pred_h) - pred_boxes = paddle.stack(pred_boxes, axis=-1) - - if max_shape is not None: - pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( - min=0, max=max_shape[1]) - pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( - min=0, max=max_shape[0]) - return pred_boxes - - -def bbox2delta_v2(src_boxes, - tgt_boxes, - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[1.0, 1.0, 1.0, 1.0]): - """Encode bboxes to deltas. - Modified from bbox2delta() which just use weight parameters to multiply deltas. - """ - src_w = src_boxes[:, 2] - src_boxes[:, 0] - src_h = src_boxes[:, 3] - src_boxes[:, 1] - src_ctr_x = src_boxes[:, 0] + 0.5 * src_w - src_ctr_y = src_boxes[:, 1] + 0.5 * src_h - - tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] - tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] - tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w - tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h - - dx = (tgt_ctr_x - src_ctr_x) / src_w - dy = (tgt_ctr_y - src_ctr_y) / src_h - dw = paddle.log(tgt_w / src_w) - dh = paddle.log(tgt_h / src_h) - - deltas = paddle.stack((dx, dy, dw, dh), axis=1) - deltas = ( - deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std) - return deltas - - -def delta2bbox_v2(deltas, - boxes, - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[1.0, 1.0, 1.0, 1.0], - max_shape=None, - ctr_clip=32.0): - """Decode deltas to bboxes. - Modified from delta2bbox() which just use weight parameters to be divided by deltas. - Used in YOLOFHead. - Note: return tensor shape [n,1,4] - If you want to add a reshape, please add after the calling code instead of here. 
- """ - clip_scale = math.log(1000.0 / 16) - - widths = boxes[:, 2] - boxes[:, 0] - heights = boxes[:, 3] - boxes[:, 1] - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean) - dx = deltas[:, 0::4] - dy = deltas[:, 1::4] - dw = deltas[:, 2::4] - dh = deltas[:, 3::4] - - # Prevent sending too large values into paddle.exp() - dx = dx * widths.unsqueeze(1) - dy = dy * heights.unsqueeze(1) - if ctr_clip is not None: - dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip) - dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip) - dw = paddle.clip(dw, max=clip_scale) - dh = paddle.clip(dh, max=clip_scale) - else: - dw = dw.clip(min=-clip_scale, max=clip_scale) - dh = dh.clip(min=-clip_scale, max=clip_scale) - - pred_ctr_x = dx + ctr_x.unsqueeze(1) - pred_ctr_y = dy + ctr_y.unsqueeze(1) - pred_w = paddle.exp(dw) * widths.unsqueeze(1) - pred_h = paddle.exp(dh) * heights.unsqueeze(1) - - pred_boxes = [] - pred_boxes.append(pred_ctr_x - 0.5 * pred_w) - pred_boxes.append(pred_ctr_y - 0.5 * pred_h) - pred_boxes.append(pred_ctr_x + 0.5 * pred_w) - pred_boxes.append(pred_ctr_y + 0.5 * pred_h) - pred_boxes = paddle.stack(pred_boxes, axis=-1) - - if max_shape is not None: - pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( - min=0, max=max_shape[1]) - pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( - min=0, max=max_shape[0]) - return pred_boxes - - -def expand_bbox(bboxes, scale): - w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 - h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 - x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 - y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 - - w_half *= scale - h_half *= scale - - bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) - bboxes_exp[:, 0] = x_c - w_half - bboxes_exp[:, 2] = x_c + w_half - bboxes_exp[:, 1] = y_c - h_half - bboxes_exp[:, 3] = y_c + h_half - - return bboxes_exp - - -def clip_bbox(boxes, im_shape): - h, w = im_shape[0], im_shape[1] - x1 = boxes[:, 0].clip(0, w) - y1 = boxes[:, 1].clip(0, h) - x2 = boxes[:, 2].clip(0, w) - y2 = boxes[:, 3].clip(0, h) - return paddle.stack([x1, y1, x2, y2], axis=1) - - -def nonempty_bbox(boxes, min_size=0, return_mask=False): - w = boxes[:, 2] - boxes[:, 0] - h = boxes[:, 3] - boxes[:, 1] - mask = paddle.logical_and(h > min_size, w > min_size) - if return_mask: - return mask - keep = paddle.nonzero(mask).flatten() - return keep - - -def bbox_area(boxes): - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -def bbox_overlaps(boxes1, boxes2): - """ - Calculate overlaps between boxes1 and boxes2 - - Args: - boxes1 (Tensor): boxes with shape [M, 4] - boxes2 (Tensor): boxes with shape [N, 4] - - Return: - overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] - """ - M = boxes1.shape[0] - N = boxes2.shape[0] - if M * N == 0: - return paddle.zeros([M, N], dtype='float32') - area1 = bbox_area(boxes1) - area2 = bbox_area(boxes2) - - xy_max = paddle.minimum( - paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) - xy_min = paddle.maximum( - paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) - width_height = xy_max - xy_min - width_height = width_height.clip(min=0) - inter = width_height.prod(axis=2) - - overlaps = paddle.where(inter > 0, inter / - (paddle.unsqueeze(area1, 1) + area2 - inter), - paddle.zeros_like(inter)) - return overlaps - - -def batch_bbox_overlaps(bboxes1, - bboxes2, - mode='iou', - is_aligned=False, - eps=1e-6): - """Calculate overlap between two set of bboxes. 
-    If ``is_aligned`` is ``False``, then calculate the overlaps between each
-    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
-    pair of bboxes1 and bboxes2.
-    Args:
-        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
-        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
-            B indicates the batch dim, in shape (B1, B2, ..., Bn).
-            If ``is_aligned`` is ``True``, then m and n must be equal.
-        mode (str): "iou" (intersection over union) or "iof" (intersection over
-            foreground).
-        is_aligned (bool, optional): If True, then m and n must be equal.
-            Default False.
-        eps (float, optional): A value added to the denominator for numerical
-            stability. Default 1e-6.
-    Returns:
-        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
-    """
-    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
-    # Either the boxes are empty or the length of the boxes' last dimension is 4
-    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
-    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
-
-    # Batch dim must be the same
-    # Batch dim: (B1, B2, ... Bn)
-    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
-    batch_shape = bboxes1.shape[:-2]
-
-    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
-    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
-    if is_aligned:
-        assert rows == cols
-
-    if rows * cols == 0:
-        if is_aligned:
-            return paddle.full(batch_shape + (rows, ), 1)
-        else:
-            return paddle.full(batch_shape + (rows, cols), 1)
-
-    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
-    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
-
-    if is_aligned:
-        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [B, rows, 2]
-        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [B, rows, 2]
-
-        wh = (rb - lt).clip(min=0)  # [B, rows, 2]
-        overlap = wh[:, 0] * wh[:, 1]
-
-        if mode in ['iou', 'giou']:
-            union = area1 + area2 - overlap
-        else:
-            union = area1
-        if mode == 'giou':
-            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
-            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
-    else:
-        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
-                            bboxes2[:, :2])  # [B, rows, cols, 2]
-        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
-                            bboxes2[:, 2:])  # [B, rows, cols, 2]
-
-        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]
-        overlap = wh[:, :, 0] * wh[:, :, 1]
-
-        if mode in ['iou', 'giou']:
-            union = area1.reshape([rows,1]) \
-                + area2.reshape([1,cols]) - overlap
-        else:
-            union = area1[:, None]
-        if mode == 'giou':
-            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
-                                         bboxes2[:, :2])
-            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
-                                         bboxes2[:, 2:])
-
-    eps = paddle.to_tensor([eps])
-    union = paddle.maximum(union, eps)
-    ious = overlap / union
-    if mode in ['iou', 'iof']:
-        return ious
-    # calculate gious
-    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
-    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]
-    enclose_area = paddle.maximum(enclose_area, eps)
-    gious = ious - (enclose_area - union) / enclose_area
-    return 1 - gious
-
-
-def xywh2xyxy(box):
-    x, y, w, h = box
-    x1 = x - w * 0.5
-    y1 = y - h * 0.5
-    x2 = x + w * 0.5
-    y2 = y + h * 0.5
-    return [x1, y1, x2, y2]
-
-
-def make_grid(h, w, dtype):
-    yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])
-    return paddle.stack((xv, yv), 2).cast(dtype=dtype)
-
-
-def decode_yolo(box, anchor, downsample_ratio):
-    """decode yolo box
-
-    Args:
-        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        anchor (list): anchor with the shape [na, 2]
-        downsample_ratio (int): downsample ratio, default 32
-
-    Return:
-        box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]
-    """
-    x, y, w, h = box
-    na, grid_h, grid_w = x.shape[1:4]
-    grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
-    x1 = (x + grid[:, :, :, :, 0:1]) / grid_w
-    y1 = (y + grid[:, :, :, :, 1:2]) / grid_h
-
-    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
-    anchor = anchor.reshape((1, na, 1, 1, 2))
-    w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
-    h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
-
-    return [x1, y1, w1, h1]
-
-
-def batch_iou_similarity(box1, box2, eps=1e-9):
-    """Calculate iou of box1 and box2 in batch
-
-    Args:
-        box1 (Tensor): box with the shape [N, M1, 4]
-        box2 (Tensor): box with the shape [N, M2, 4]
-
-    Return:
-        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
-    """
-    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
-    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
-    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
-    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
-    x1y1 = paddle.maximum(px1y1, gx1y1)
-    x2y2 = paddle.minimum(px2y2, gx2y2)
-    overlap = (x2y2 - x1y1).clip(0).prod(-1)
-    area1 = (px2y2 - px1y1).clip(0).prod(-1)
-    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
-    union = area1 + area2 - overlap + eps
-    return overlap / union
-
-
-def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
-    """calculate the iou of box1 and box2
-
-    Args:
-        box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        giou (bool): whether to use giou or not, default False
-        diou (bool): whether to use diou or not, default False
-        ciou (bool): whether to use ciou or not, default False
-        eps (float): epsilon to avoid divide by zero
-
-    Return:
-        iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]
-    """
-    px1, py1, px2, py2 = box1
-    gx1, gy1, gx2, gy2 = box2
-    x1 = paddle.maximum(px1, gx1)
-    y1 = paddle.maximum(py1, gy1)
-    x2 = paddle.minimum(px2, gx2)
-    y2 = paddle.minimum(py2, gy2)
-
-    overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))
-
-    area1 = (px2 - px1) * (py2 - py1)
-    area1 = area1.clip(0)
-
-    area2 = (gx2 - gx1) * (gy2 - gy1)
-    area2 = area2.clip(0)
-
-    union = area1 + area2 - overlap + eps
-    iou = overlap / union
-
-    if giou or ciou or diou:
-        # convex w, h
-        cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
-        ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
-        if giou:
-            c_area = cw * ch + eps
-            return iou - (c_area - union) / c_area
-        else:
-            # convex diagonal squared
-            c2 = cw**2 + ch**2 + eps
-            # center distance
-            rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
-            if diou:
-                return iou - rho2 / c2
-            else:
-                w1, h1 = px2 - px1, py2 - py1 + eps
-                w2, h2 = gx2 - gx1, gy2 - gy1 + eps
-                delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
-                v = (4 / math.pi**2) * paddle.pow(delta, 2)
-                alpha = v / (1 + eps - iou + v)
-                alpha.stop_gradient = True
-                return iou - (rho2 / c2 + v * alpha)
-    else:
-        return iou
-
-
-def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
-    """
-    Calculate the iou of box1 and box2 with numpy.
-
-    Args:
-        box1 (ndarray): [N, 4]
-        box2 (ndarray): [M, 4], usually N != M
-        x1y1x2y2 (bool): whether in x1y1x2y2 style, default True
-        eps (float): epsilon to avoid divide by zero
-    Return:
-        iou (ndarray): iou of box1 and box2, [N, M]
-    """
-    N, M = len(box1), len(box2)  # usually N != M
-    if x1y1x2y2:
-        b1_x1, b1_y1 = box1[:, 0], box1[:, 1]
-        b1_x2, b1_y2 = box1[:, 2], box1[:, 3]
-        b2_x1, b2_y1 = box2[:, 0], box2[:, 1]
-        b2_x2, b2_y2 = box2[:, 2], box2[:, 3]
-    else:
-        # cxcywh style
-        # Transform from center and width to exact coordinates
-        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
-        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
-        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
-        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
-
-    # get the coordinates of the intersection rectangle
-    inter_rect_x1 = np.zeros((N, M), dtype=np.float32)
-    inter_rect_y1 = np.zeros((N, M), dtype=np.float32)
-    inter_rect_x2 = np.zeros((N, M), dtype=np.float32)
-    inter_rect_y2 = np.zeros((N, M), dtype=np.float32)
-    for i in range(len(box2)):
-        inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i])
-        inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i])
-        inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i])
-        inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i])
-    # Intersection area
-    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(
-        inter_rect_y2 - inter_rect_y1, 0)
-    # Union Area
-    b1_area = np.repeat(
-        ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1)
-    b2_area = np.repeat(
-        ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0)
-
-    ious = inter_area / (b1_area + b2_area - inter_area + eps)
-    return ious
-
-
-def bbox2distance(points, bbox, max_dis=None, eps=0.1):
-    """Encode bounding boxes as distances from the given points to the
-    four box boundaries.
-    Args:
-        points (Tensor): Shape (n, 2), [x, y].
-        bbox (Tensor): Shape (n, 4), "xyxy" format
-        max_dis (float): Upper bound of the distance.
-        eps (float): a small value to ensure target < max_dis instead of <=
-    Returns:
-        Tensor: Encoded distances.
-    """
-    left = points[:, 0] - bbox[:, 0]
-    top = points[:, 1] - bbox[:, 1]
-    right = bbox[:, 2] - points[:, 0]
-    bottom = bbox[:, 3] - points[:, 1]
-    if max_dis is not None:
-        left = left.clip(min=0, max=max_dis - eps)
-        top = top.clip(min=0, max=max_dis - eps)
-        right = right.clip(min=0, max=max_dis - eps)
-        bottom = bottom.clip(min=0, max=max_dis - eps)
-    return paddle.stack([left, top, right, bottom], -1)
-
-
-def distance2bbox(points, distance, max_shape=None):
-    """Decode distance prediction to bounding box.
-    Args:
-        points (Tensor): Shape (n, 2), [x, y].
-        distance (Tensor): Distance from the given point to 4
-            boundaries (left, top, right, bottom).
-        max_shape (tuple): Shape of the image.
-    Returns:
-        Tensor: Decoded bboxes.
-    """
-    x1 = points[:, 0] - distance[:, 0]
-    y1 = points[:, 1] - distance[:, 1]
-    x2 = points[:, 0] + distance[:, 2]
-    y2 = points[:, 1] + distance[:, 3]
-    if max_shape is not None:
-        x1 = x1.clip(min=0, max=max_shape[1])
-        y1 = y1.clip(min=0, max=max_shape[0])
-        x2 = x2.clip(min=0, max=max_shape[1])
-        y2 = y2.clip(min=0, max=max_shape[0])
-    return paddle.stack([x1, y1, x2, y2], -1)
-
-
-def bbox_center(boxes):
-    """Get bbox centers from boxes.
-    Args:
-        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
-    Returns:
-        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
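# --- editor's sketch (not part of the original diff) -----------------------
# How the bbox2distance()/distance2bbox() pair above fits together (the
# point-to-boundary encoding used by GFL/FCOS-style heads). Assumes a
# checkout from before this change, since the diff deletes
# ppdet/modeling/bbox_utils.py; values are made up.
#
#   import paddle
#   from ppdet.modeling.bbox_utils import bbox2distance, distance2bbox
#
#   points = paddle.to_tensor([[6., 6.]])           # anchor point, xy
#   boxes = paddle.to_tensor([[2., 4., 10., 12.]])  # target box, xyxy
#   ltrb = bbox2distance(points, boxes)             # [[4., 2., 4., 6.]]
#   print(distance2bbox(points, ltrb))              # back to [[2., 4., 10., 12.]]
# ----------------------------------------------------------------------------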
- """ - boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2 - boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2 - return paddle.stack([boxes_cx, boxes_cy], axis=-1) - - -def batch_distance2bbox(points, distance, max_shapes=None): - """Decode distance prediction to bounding box for batch. - Args: - points (Tensor): [B, ..., 2], "xy" format - distance (Tensor): [B, ..., 4], "ltrb" format - max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image. - Returns: - Tensor: Decoded bboxes, "x1y1x2y2" format. - """ - lt, rb = paddle.split(distance, 2, -1) - # while tensor add parameters, parameters should be better placed on the second place - x1y1 = -lt + points - x2y2 = rb + points - out_bbox = paddle.concat([x1y1, x2y2], -1) - if max_shapes is not None: - max_shapes = max_shapes.flip(-1).tile([1, 2]) - delta_dim = out_bbox.ndim - max_shapes.ndim - for _ in range(delta_dim): - max_shapes.unsqueeze_(1) - out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes) - out_bbox = paddle.where(out_bbox > 0, out_bbox, - paddle.zeros_like(out_bbox)) - return out_bbox - - -def iou_similarity(box1, box2, eps=1e-10): - """Calculate iou of box1 and box2 - - Args: - box1 (Tensor): box with the shape [M1, 4] - box2 (Tensor): box with the shape [M2, 4] - - Return: - iou (Tensor): iou between box1 and box2 with the shape [M1, M2] - """ - box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] - box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] - px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] - gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] - x1y1 = paddle.maximum(px1y1, gx1y1) - x2y2 = paddle.minimum(px2y2, gx2y2) - overlap = (x2y2 - x1y1).clip(0).prod(-1) - area1 = (px2y2 - px1y1).clip(0).prod(-1) - area2 = (gx2y2 - gx1y1).clip(0).prod(-1) - union = area1 + area2 - overlap + eps - return overlap / union diff --git a/pdfdet/models/Paddle/ppdet/modeling/clrnet_utils.py b/pdfdet/models/Paddle/ppdet/modeling/clrnet_utils.py deleted file mode 100644 index 24ece5c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/clrnet_utils.py +++ /dev/null @@ -1,309 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.modeling.initializer import constant_ -from paddle.nn.initializer import KaimingNormal - - -class ConvModule(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=False, - norm_type='bn', - wtih_act=True): - super(ConvModule, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn', None] - self.with_norm = norm_type is not None - self.wtih_act = wtih_act - self.conv = nn.Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias_attr=bias, - weight_attr=KaimingNormal()) - if self.with_norm: - if norm_type == 'bn': - self.bn = nn.BatchNorm2D(out_channels) - elif norm_type == 'gn': - self.bn = nn.GroupNorm(out_channels, out_channels) - - if self.wtih_act: - self.act = nn.ReLU() - - def forward(self, inputs): - x = self.conv(inputs) - if self.with_norm: - x = self.bn(x) - if self.wtih_act: - x = self.act(x) - return x - - -def LinearModule(hidden_dim): - return nn.LayerList( - [nn.Linear( - hidden_dim, hidden_dim, bias_attr=True), nn.ReLU()]) - - -class FeatureResize(nn.Layer): - def __init__(self, size=(10, 25)): - super(FeatureResize, self).__init__() - self.size = size - - def forward(self, x): - x = F.interpolate(x, self.size) - return x.flatten(2) - - -class 
ROIGather(nn.Layer):
    '''
-    ROIGather module for gathering global information
-    Args:
-        in_channels: prior feature channels
-        num_priors: prior numbers we predefined
-        sample_points: the number of sampled points when we extract feature from line
-        fc_hidden_dim: the fc output channel
-        refine_layers: the total number of layers to build refine
-    '''
-
-    def __init__(self,
-                 in_channels,
-                 num_priors,
-                 sample_points,
-                 fc_hidden_dim,
-                 refine_layers,
-                 mid_channels=48):
-        super(ROIGather, self).__init__()
-        self.in_channels = in_channels
-        self.num_priors = num_priors
-        self.f_key = ConvModule(
-            in_channels=self.in_channels,
-            out_channels=self.in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            norm_type='bn')
-
-        self.f_query = nn.Sequential(
-            nn.Conv1D(
-                in_channels=num_priors,
-                out_channels=num_priors,
-                kernel_size=1,
-                stride=1,
-                padding=0,
-                groups=num_priors),
-            nn.ReLU(), )
-        self.f_value = nn.Conv2D(
-            in_channels=self.in_channels,
-            out_channels=self.in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0)
-        self.W = nn.Conv1D(
-            in_channels=num_priors,
-            out_channels=num_priors,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            groups=num_priors)
-
-        self.resize = FeatureResize()
-        constant_(self.W.weight, 0)
-        constant_(self.W.bias, 0)
-
-        self.convs = nn.LayerList()
-        self.catconv = nn.LayerList()
-        for i in range(refine_layers):
-            self.convs.append(
-                ConvModule(
-                    in_channels,
-                    mid_channels, (9, 1),
-                    padding=(4, 0),
-                    bias=False,
-                    norm_type='bn'))
-
-            self.catconv.append(
-                ConvModule(
-                    mid_channels * (i + 1),
-                    in_channels, (9, 1),
-                    padding=(4, 0),
-                    bias=False,
-                    norm_type='bn'))
-
-        self.fc = nn.Linear(
-            sample_points * fc_hidden_dim, fc_hidden_dim, bias_attr=True)
-
-        self.fc_norm = nn.LayerNorm(fc_hidden_dim)
-
-    def roi_fea(self, x, layer_index):
-        feats = []
-        for i, feature in enumerate(x):
-            feat_trans = self.convs[i](feature)
-            feats.append(feat_trans)
-        cat_feat = paddle.concat(feats, axis=1)
-        cat_feat = self.catconv[layer_index](cat_feat)
-        return cat_feat
-
-    def forward(self, roi_features, x, layer_index):
-        '''
-        Args:
-            roi_features: prior feature, shape: (Batch * num_priors, prior_feat_channel, sample_point, 1)
-            x: feature map
-            layer_index: currently on which layer to refine
-        Return:
-            roi: prior features with gathered global information, shape: (Batch, num_priors, fc_hidden_dim)
-        '''
-
-        roi = self.roi_fea(roi_features, layer_index)
-        bs = x.shape[0]
-        roi = roi.reshape([bs * self.num_priors, -1])
-        roi = self.fc(roi)
-        roi = F.relu(self.fc_norm(roi))
-        roi = roi.reshape([bs, self.num_priors, -1])
-        query = roi
-
-        value = self.resize(self.f_value(x))  # (B, C, N) global feature
-        query = self.f_query(
-            query)  # (B, N, 1) sample context feature from prior roi
-        key = self.f_key(x)
-        value = value.transpose(perm=[0, 2, 1])
-        key = self.resize(key)  # (B, C, N) global feature
-        sim_map = paddle.matmul(query, key)
-        sim_map = (self.in_channels**-.5) * sim_map
-        sim_map = F.softmax(sim_map, axis=-1)
-
-        context = paddle.matmul(sim_map, value)
-        context = self.W(context)
-
-        roi = roi + F.dropout(context, p=0.1, training=self.training)
-
-        return roi
-
-
-class SegDecoder(nn.Layer):
-    '''
-    Optional segmentation decoder
-    '''
-
-    def __init__(self,
-                 image_height,
-                 image_width,
-                 num_class,
-                 prior_feat_channels=64,
-                 refine_layers=3):
-        super().__init__()
-        self.dropout = nn.Dropout2D(0.1)
-        self.conv = nn.Conv2D(prior_feat_channels * refine_layers, num_class, 1)
-        self.image_height = image_height
-        self.image_width = image_width
-
-    def forward(self, x):
-        x = self.dropout(x)
-        x = self.conv(x)
-        x = F.interpolate(
-            x,
-            size=[self.image_height, self.image_width],
-            mode='bilinear',
-            align_corners=False)
-        return x
-
-
-def accuracy(pred, target, topk=1, thresh=None):
-    """Calculate accuracy according to the prediction and target.
-
-    Args:
-        pred (paddle.Tensor): The model prediction, shape (N, num_class)
-        target (paddle.Tensor): The target of each prediction, shape (N, )
-        topk (int | tuple[int], optional): If the predictions in ``topk``
-            match the target, the predictions will be regarded as
-            correct ones. Defaults to 1.
-        thresh (float, optional): If not None, predictions with scores under
-            this threshold are considered incorrect. Defaults to None.
-
-    Returns:
-        float | tuple[float]: If the input ``topk`` is a single integer,
-            the function will return a single float as accuracy. If
-            ``topk`` is a tuple containing multiple integers, the
-            function will return a tuple containing accuracies of
-            each ``topk`` number.
-    """
-    assert isinstance(topk, (int, tuple))
-    if isinstance(topk, int):
-        topk = (topk, )
-        return_single = True
-    else:
-        return_single = False
-
-    maxk = max(topk)
-    if pred.shape[0] == 0:
-        accu = [pred.new_tensor(0.) for i in range(len(topk))]
-        return accu[0] if return_single else accu
-    assert pred.ndim == 2 and target.ndim == 1
-    assert pred.shape[0] == target.shape[0]
-    assert maxk <= pred.shape[1], \
-        f'maxk {maxk} exceeds pred dimension {pred.shape[1]}'
-    pred_value, pred_label = pred.topk(maxk, axis=1)
-    pred_label = pred_label.t()  # transpose to shape (maxk, N)
-    correct = pred_label.equal(target.reshape([1, -1]).expand_as(pred_label))
-    if thresh is not None:
-        # Only prediction values larger than thresh are counted as correct
-        correct = correct & (pred_value > thresh).t()
-    res = []
-    for k in topk:
-        correct_k = correct[:k].reshape([-1]).cast("float32").sum(0,
-                                                                  keepdim=True)
-        correct_k = correct_k * (100.0 / pred.shape[0])
-        res.append(correct_k)
-    return res[0] if return_single else res
-
-
-class Accuracy(nn.Layer):
-    def __init__(self, topk=(1, ), thresh=None):
-        """Module to calculate the accuracy.
-
-        Args:
-            topk (tuple, optional): The criterion used to calculate the
-                accuracy. Defaults to (1,).
-            thresh (float, optional): If not None, predictions with scores
-                under this threshold are considered incorrect. Defaults to None.
-        """
-        super().__init__()
-        self.topk = topk
-        self.thresh = thresh
-
-    def forward(self, pred, target):
-        """Forward function to calculate accuracy.
-
-        Args:
-            pred (paddle.Tensor): Prediction of models.
-            target (paddle.Tensor): Target for each prediction.
-
-        Returns:
-            tuple[float]: The accuracies under different topk criterions.
-        """
-        return accuracy(pred, target, self.topk, self.thresh)
diff --git a/pdfdet/models/Paddle/ppdet/modeling/cls_utils.py b/pdfdet/models/Paddle/ppdet/modeling/cls_utils.py
deleted file mode 100644
index 3ae8d11..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/cls_utils.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
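# --- editor's sketch (not part of the original diff) -----------------------
# Behaviour of the accuracy() helper deleted just above (clrnet_utils.py),
# on a checkout from before this change; values are made up.
#
#   import paddle
#   from ppdet.modeling.clrnet_utils import accuracy
#
#   pred = paddle.to_tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
#   target = paddle.to_tensor([1, 0, 0])
#   # top-1: two of the three predictions match -> ~66.67 (a percentage)
#   print(accuracy(pred, target, topk=1))
# ----------------------------------------------------------------------------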
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def _get_class_default_kwargs(cls, *args, **kwargs): - """ - Get default arguments of a class in dict format, if args and - kwargs is specified, it will replace default arguments - """ - varnames = cls.__init__.__code__.co_varnames - argcount = cls.__init__.__code__.co_argcount - keys = varnames[:argcount] - assert keys[0] == 'self' - keys = keys[1:] - - values = list(cls.__init__.__defaults__) - assert len(values) == len(keys) - - if len(args) > 0: - for i, arg in enumerate(args): - values[i] = arg - - default_kwargs = dict(zip(keys, values)) - - if len(kwargs) > 0: - for k, v in kwargs.items(): - default_kwargs[k] = v - - return default_kwargs diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/heads/__init__.py deleted file mode 100644 index 0d126c0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/__init__.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import bbox_head -from . import mask_head -from . import yolo_head -from . import roi_extractor -from . import ssd_head -from . import fcos_head -from . import solov2_head -from . import ttf_head -from . import cascade_head -from . import face_head -from . import s2anet_head -from . import keypoint_hrhrnet_head -from . import centernet_head -from . import gfl_head -from . import simota_head -from . import pico_head -from . import detr_head -from . import sparsercnn_head -from . import tood_head -from . import retina_head -from . import ppyoloe_head -from . import fcosr_head -from . import ppyoloe_r_head -from . import yolof_head -from . import ppyoloe_contrast_head -from . import centertrack_head -from . import sparse_roi_head -from . import vitpose_head -from . 
import clrnet_head - -from .bbox_head import * -from .mask_head import * -from .yolo_head import * -from .roi_extractor import * -from .ssd_head import * -from .fcos_head import * -from .solov2_head import * -from .ttf_head import * -from .cascade_head import * -from .face_head import * -from .s2anet_head import * -from .keypoint_hrhrnet_head import * -from .centernet_head import * -from .gfl_head import * -from .simota_head import * -from .pico_head import * -from .detr_head import * -from .sparsercnn_head import * -from .tood_head import * -from .retina_head import * -from .ppyoloe_head import * -from .fcosr_head import * -from .ppyoloe_r_head import * -from .yolof_head import * -from .ppyoloe_contrast_head import * -from .centertrack_head import * -from .sparse_roi_head import * -from .petr_head import * -from .vitpose_head import * -from .clrnet_head import * \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/bbox_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/bbox_head.py deleted file mode 100644 index 3ce4798..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/bbox_head.py +++ /dev/null @@ -1,443 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
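# --- editor's sketch (not part of the original diff) -----------------------
# What the _get_class_default_kwargs() helper deleted above (cls_utils.py)
# returns: the __init__ defaults of a class as a dict, with positional
# overrides applied. This is how BBoxHead below seeds its RoIAlign config
# (roi_extractor=_get_class_default_kwargs(RoIAlign)). The toy class here
# is hypothetical; a pre-change checkout is assumed.
#
#   from ppdet.modeling.cls_utils import _get_class_default_kwargs
#
#   class RoI:
#       def __init__(self, resolution=14, sampling_ratio=0, aligned=True):
#           pass
#
#   print(_get_class_default_kwargs(RoI))
#   # {'resolution': 14, 'sampling_ratio': 0, 'aligned': True}
#   print(_get_class_default_kwargs(RoI, 7))
#   # {'resolution': 7, 'sampling_ratio': 0, 'aligned': True}
# ----------------------------------------------------------------------------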
- -import numpy as np - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, XavierUniform, KaimingNormal -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, create -from .roi_extractor import RoIAlign -from ..shape_spec import ShapeSpec -from ..bbox_utils import bbox2delta -from ..cls_utils import _get_class_default_kwargs -from ppdet.modeling.layers import ConvNormLayer - -__all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] - - -@register -class TwoFCHead(nn.Layer): - """ - RCNN bbox head with Two fc layers to extract feature - - Args: - in_channel (int): Input channel which can be derived by from_config - out_channel (int): Output channel - resolution (int): Resolution of input feature map, default 7 - """ - - def __init__(self, in_channel=256, out_channel=1024, resolution=7): - super(TwoFCHead, self).__init__() - self.in_channel = in_channel - self.out_channel = out_channel - fan = in_channel * resolution * resolution - self.fc6 = nn.Linear( - in_channel * resolution * resolution, - out_channel, - weight_attr=paddle.ParamAttr( - initializer=XavierUniform(fan_out=fan))) - self.fc6.skip_quant = True - - self.fc7 = nn.Linear( - out_channel, - out_channel, - weight_attr=paddle.ParamAttr(initializer=XavierUniform())) - self.fc7.skip_quant = True - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat): - rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) - fc6 = self.fc6(rois_feat) - fc6 = F.relu(fc6) - fc7 = self.fc7(fc6) - fc7 = F.relu(fc7) - return fc7 - - -@register -class XConvNormHead(nn.Layer): - __shared__ = ['norm_type', 'freeze_norm'] - """ - RCNN bbox head with serveral convolution layers - - Args: - in_channel (int): Input channels which can be derived by from_config - num_convs (int): The number of conv layers - conv_dim (int): The number of channels for the conv layers - out_channel (int): Output channels - resolution (int): Resolution of input feature map - norm_type (string): Norm type, bn, gn, sync_bn are available, - default `gn` - freeze_norm (bool): Whether to freeze the norm - stage_name (string): Prefix name for conv layer, '' by default - """ - - def __init__(self, - in_channel=256, - num_convs=4, - conv_dim=256, - out_channel=1024, - resolution=7, - norm_type='gn', - freeze_norm=False, - stage_name=''): - super(XConvNormHead, self).__init__() - self.in_channel = in_channel - self.num_convs = num_convs - self.conv_dim = conv_dim - self.out_channel = out_channel - self.norm_type = norm_type - self.freeze_norm = freeze_norm - - self.bbox_head_convs = [] - fan = conv_dim * 3 * 3 - initializer = KaimingNormal(fan_in=fan) - for i in range(self.num_convs): - in_c = in_channel if i == 0 else conv_dim - head_conv_name = stage_name + 'bbox_head_conv{}'.format(i) - head_conv = self.add_sublayer( - head_conv_name, - ConvNormLayer( - ch_in=in_c, - ch_out=conv_dim, - filter_size=3, - stride=1, - norm_type=self.norm_type, - freeze_norm=self.freeze_norm, - initializer=initializer)) - self.bbox_head_convs.append(head_conv) - - fan = conv_dim * resolution * resolution - self.fc6 = nn.Linear( - conv_dim * resolution * resolution, - out_channel, - weight_attr=paddle.ParamAttr( - initializer=XavierUniform(fan_out=fan)), - bias_attr=paddle.ParamAttr( - 
learning_rate=2., regularizer=L2Decay(0.))) - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat): - for i in range(self.num_convs): - rois_feat = F.relu(self.bbox_head_convs[i](rois_feat)) - rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) - fc6 = F.relu(self.fc6(rois_feat)) - return fc6 - - -@register -class BBoxHead(nn.Layer): - __shared__ = ['num_classes', 'use_cot'] - __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot'] - """ - RCNN bbox head - - Args: - head (nn.Layer): Extract feature in bbox head - in_channel (int): Input channel after RoI extractor - roi_extractor (object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the - box. - with_pool (bool): Whether to use pooling for the RoI feature. - num_classes (int): The number of classes - bbox_weight (List[float]): The weight to get the decode box - cot_classes (int): The number of base classes - loss_cot (object): The module of Label-cotuning - use_cot(bool): whether to use Label-cotuning - """ - - def __init__(self, - head, - in_channel, - roi_extractor=_get_class_default_kwargs(RoIAlign), - bbox_assigner='BboxAssigner', - with_pool=False, - num_classes=80, - bbox_weight=[10., 10., 5., 5.], - bbox_loss=None, - loss_normalize_pos=False, - cot_classes=None, - loss_cot='COTLoss', - use_cot=False): - super(BBoxHead, self).__init__() - self.head = head - self.roi_extractor = roi_extractor - if isinstance(roi_extractor, dict): - self.roi_extractor = RoIAlign(**roi_extractor) - self.bbox_assigner = bbox_assigner - - self.with_pool = with_pool - self.num_classes = num_classes - self.bbox_weight = bbox_weight - self.bbox_loss = bbox_loss - self.loss_normalize_pos = loss_normalize_pos - - self.loss_cot = loss_cot - self.cot_relation = None - self.cot_classes = cot_classes - self.use_cot = use_cot - if use_cot: - self.cot_bbox_score = nn.Linear( - in_channel, - self.num_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01))) - - self.bbox_score = nn.Linear( - in_channel, - self.cot_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01))) - self.cot_bbox_score.skip_quant = True - else: - self.bbox_score = nn.Linear( - in_channel, - self.num_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01))) - self.bbox_score.skip_quant = True - - self.bbox_delta = nn.Linear( - in_channel, - 4 * self.num_classes, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.001))) - self.bbox_delta.skip_quant = True - self.assigned_label = None - self.assigned_rois = None - - def init_cot_head(self, relationship): - self.cot_relation = relationship - - @classmethod - def from_config(cls, cfg, input_shape): - roi_pooler = cfg['roi_extractor'] - assert isinstance(roi_pooler, dict) - kwargs = RoIAlign.from_config(cfg, input_shape) - roi_pooler.update(kwargs) - kwargs = {'input_shape': input_shape} - head = create(cfg['head'], **kwargs) - return { - 'roi_extractor': roi_pooler, - 'head': head, - 'in_channel': head.out_shape[0].channels - } - - def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False): - """ - body_feats (list[Tensor]): Feature maps from backbone - rois (list[Tensor]): RoIs generated from RPN module - rois_num (Tensor): 
The number of RoIs in each image - inputs (dict{Tensor}): The ground-truth of image - """ - if self.training: - rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) - self.assigned_rois = (rois, rois_num) - self.assigned_targets = targets - - rois_feat = self.roi_extractor(body_feats, rois, rois_num) - bbox_feat = self.head(rois_feat) - if self.with_pool: - feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1) - feat = paddle.squeeze(feat, axis=[2, 3]) - else: - feat = bbox_feat - if self.use_cot: - scores = self.cot_bbox_score(feat) - cot_scores = self.bbox_score(feat) - else: - scores = self.bbox_score(feat) - deltas = self.bbox_delta(feat) - - if self.training: - loss = self.get_loss( - scores, - deltas, - targets, - rois, - self.bbox_weight, - loss_normalize_pos=self.loss_normalize_pos) - - if self.cot_relation is not None: - loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation) - loss.update(loss_cot) - return loss, bbox_feat - else: - if cot: - pred = self.get_prediction(cot_scores, deltas) - else: - pred = self.get_prediction(scores, deltas) - return pred, self.head - - - def get_loss(self, - scores, - deltas, - targets, - rois, - bbox_weight, - loss_normalize_pos=False): - """ - scores (Tensor): scores from bbox head outputs - deltas (Tensor): deltas from bbox head outputs - targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds - rois (List[Tensor]): RoIs generated in each batch - """ - cls_name = 'loss_bbox_cls' - reg_name = 'loss_bbox_reg' - loss_bbox = {} - - # TODO: better pass args - tgt_labels, tgt_bboxes, tgt_gt_inds = targets - - # bbox cls - tgt_labels = paddle.concat(tgt_labels) if len( - tgt_labels) > 1 else tgt_labels[0] - valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() - if valid_inds.shape[0] == 0: - loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') - else: - tgt_labels = tgt_labels.cast('int64') - tgt_labels.stop_gradient = True - - if not loss_normalize_pos: - loss_bbox_cls = F.cross_entropy( - input=scores, label=tgt_labels, reduction='mean') - else: - loss_bbox_cls = F.cross_entropy( - input=scores, label=tgt_labels, - reduction='none').sum() / (tgt_labels.shape[0] + 1e-7) - - loss_bbox[cls_name] = loss_bbox_cls - - # bbox reg - - cls_agnostic_bbox_reg = deltas.shape[1] == 4 - - fg_inds = paddle.nonzero( - paddle.logical_and(tgt_labels >= 0, tgt_labels < - self.num_classes)).flatten() - - if fg_inds.numel() == 0: - loss_bbox[reg_name] = paddle.zeros([1], dtype='float32') - return loss_bbox - - if cls_agnostic_bbox_reg: - reg_delta = paddle.gather(deltas, fg_inds) - else: - fg_gt_classes = paddle.gather(tgt_labels, fg_inds) - - reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1) - reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1]) - - reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4) - - reg_col_inds = reg_col_inds.reshape([-1, 1]) - reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1) - - reg_delta = paddle.gather(deltas, fg_inds) - reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4]) - rois = paddle.concat(rois) if len(rois) > 1 else rois[0] - tgt_bboxes = paddle.concat(tgt_bboxes) if len( - tgt_bboxes) > 1 else tgt_bboxes[0] - - reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight) - reg_target = paddle.gather(reg_target, fg_inds) - reg_target.stop_gradient = True - - if self.bbox_loss is not None: - reg_delta = self.bbox_transform(reg_delta) - reg_target = self.bbox_transform(reg_target) - - if not 
loss_normalize_pos: - loss_bbox_reg = self.bbox_loss( - reg_delta, reg_target).sum() / tgt_labels.shape[0] - loss_bbox_reg *= self.num_classes - - else: - loss_bbox_reg = self.bbox_loss( - reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7) - - else: - loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum( - ) / tgt_labels.shape[0] - - loss_bbox[reg_name] = loss_bbox_reg - - return loss_bbox - - def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]): - wx, wy, ww, wh = weights - - deltas = paddle.reshape(deltas, shape=(0, -1, 4)) - - dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx - dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy - dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww - dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh - - dw = paddle.clip(dw, -1.e10, np.log(1000. / 16)) - dh = paddle.clip(dh, -1.e10, np.log(1000. / 16)) - - pred_ctr_x = dx - pred_ctr_y = dy - pred_w = paddle.exp(dw) - pred_h = paddle.exp(dh) - - x1 = pred_ctr_x - 0.5 * pred_w - y1 = pred_ctr_y - 0.5 * pred_h - x2 = pred_ctr_x + 0.5 * pred_w - y2 = pred_ctr_y + 0.5 * pred_h - - x1 = paddle.reshape(x1, shape=(-1, )) - y1 = paddle.reshape(y1, shape=(-1, )) - x2 = paddle.reshape(x2, shape=(-1, )) - y2 = paddle.reshape(y2, shape=(-1, )) - - return paddle.concat([x1, y1, x2, y2]) - - def get_prediction(self, score, delta): - bbox_prob = F.softmax(score) - return delta, bbox_prob - - def get_head(self, ): - return self.head - - def get_assigned_targets(self, ): - return self.assigned_targets - - def get_assigned_rois(self, ): - return self.assigned_rois diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/cascade_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/cascade_head.py deleted file mode 100644 index d6f21d2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/cascade_head.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
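# --- editor's sketch (not part of the original diff) -----------------------
# The test-time contract of the BBoxHead deleted above: get_prediction()
# returns the raw deltas plus softmax class probabilities, and the caller
# (ppdet's bbox post-processing) decodes the deltas against the RoIs with
# delta2bbox(). A hedged illustration on a pre-change checkout, with
# made-up values:
#
#   import paddle
#   import paddle.nn.functional as F
#   from ppdet.modeling.bbox_utils import delta2bbox
#
#   rois = paddle.to_tensor([[0., 0., 10., 10.]])
#   deltas = paddle.to_tensor([[0.1, 0.1, 0.0, 0.0]])  # one class, [n, 4]
#   scores = paddle.to_tensor([[2.0, 0.5]])            # fg/bg logits
#   probs = F.softmax(scores)                          # what get_prediction applies
#   boxes = delta2bbox(deltas, rois, weights=[10., 10., 5., 5.])
#   print(probs, boxes.reshape([-1, 4]))
# ----------------------------------------------------------------------------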
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal - -from ppdet.core.workspace import register -from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead -from .roi_extractor import RoIAlign -from ..shape_spec import ShapeSpec -from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox -from ..cls_utils import _get_class_default_kwargs - -__all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] - - -@register -class CascadeTwoFCHead(nn.Layer): - __shared__ = ['num_cascade_stage'] - """ - Cascade RCNN bbox head with Two fc layers to extract feature - - Args: - in_channel (int): Input channel which can be derived by from_config - out_channel (int): Output channel - resolution (int): Resolution of input feature map, default 7 - num_cascade_stage (int): The number of cascade stage, default 3 - """ - - def __init__(self, - in_channel=256, - out_channel=1024, - resolution=7, - num_cascade_stage=3): - super(CascadeTwoFCHead, self).__init__() - - self.in_channel = in_channel - self.out_channel = out_channel - - self.head_list = [] - for stage in range(num_cascade_stage): - head_per_stage = self.add_sublayer( - str(stage), TwoFCHead(in_channel, out_channel, resolution)) - self.head_list.append(head_per_stage) - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat, stage=0): - out = self.head_list[stage](rois_feat) - return out - - -@register -class CascadeXConvNormHead(nn.Layer): - __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage'] - """ - Cascade RCNN bbox head with serveral convolution layers - - Args: - in_channel (int): Input channels which can be derived by from_config - num_convs (int): The number of conv layers - conv_dim (int): The number of channels for the conv layers - out_channel (int): Output channels - resolution (int): Resolution of input feature map - norm_type (string): Norm type, bn, gn, sync_bn are available, - default `gn` - freeze_norm (bool): Whether to freeze the norm - num_cascade_stage (int): The number of cascade stage, default 3 - """ - - def __init__(self, - in_channel=256, - num_convs=4, - conv_dim=256, - out_channel=1024, - resolution=7, - norm_type='gn', - freeze_norm=False, - num_cascade_stage=3): - super(CascadeXConvNormHead, self).__init__() - self.in_channel = in_channel - self.out_channel = out_channel - - self.head_list = [] - for stage in range(num_cascade_stage): - head_per_stage = self.add_sublayer( - str(stage), - XConvNormHead( - in_channel, - num_convs, - conv_dim, - out_channel, - resolution, - norm_type, - freeze_norm, - stage_name='stage{}_'.format(stage))) - self.head_list.append(head_per_stage) - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat, stage=0): - out = self.head_list[stage](rois_feat) - return out - - -@register -class CascadeHead(BBoxHead): - __shared__ = ['num_classes', 'num_cascade_stages'] - __inject__ = ['bbox_assigner', 'bbox_loss'] - """ - Cascade RCNN bbox head - - Args: - head (nn.Layer): Extract feature in bbox head - in_channel (int): Input channel after RoI extractor - roi_extractor 
(object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the - box. - num_classes (int): The number of classes - bbox_weight (List[List[float]]): The weight to get the decode box and the - length of weight is the number of cascade stage - num_cascade_stages (int): THe number of stage to refine the box - """ - - def __init__(self, - head, - in_channel, - roi_extractor=_get_class_default_kwargs(RoIAlign), - bbox_assigner='BboxAssigner', - num_classes=80, - bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0], - [30.0, 30.0, 15.0, 15.0]], - num_cascade_stages=3, - bbox_loss=None, - reg_class_agnostic=True, - stage_loss_weights=None, - loss_normalize_pos=False, - add_gt_as_proposals=[True, False, False]): - - nn.Layer.__init__(self, ) - self.head = head - self.roi_extractor = roi_extractor - if isinstance(roi_extractor, dict): - self.roi_extractor = RoIAlign(**roi_extractor) - self.bbox_assigner = bbox_assigner - - self.num_classes = num_classes - self.bbox_weight = bbox_weight - self.num_cascade_stages = num_cascade_stages - self.bbox_loss = bbox_loss - self.stage_loss_weights = [ - 1. / num_cascade_stages for _ in range(num_cascade_stages) - ] if stage_loss_weights is None else stage_loss_weights - self.add_gt_as_proposals = add_gt_as_proposals - - assert len( - self.stage_loss_weights - ) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) do not equal to num_cascade_stages({num_cascade_stages})' - - self.reg_class_agnostic = reg_class_agnostic - num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes - self.loss_normalize_pos = loss_normalize_pos - - self.bbox_score_list = [] - self.bbox_delta_list = [] - for i in range(num_cascade_stages): - score_name = 'bbox_score_stage{}'.format(i) - delta_name = 'bbox_delta_stage{}'.format(i) - bbox_score = self.add_sublayer( - score_name, - nn.Linear( - in_channel, - self.num_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01)))) - - bbox_delta = self.add_sublayer( - delta_name, - nn.Linear( - in_channel, - num_bbox_delta, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.001)))) - self.bbox_score_list.append(bbox_score) - self.bbox_delta_list.append(bbox_delta) - self.assigned_label = None - self.assigned_rois = None - - def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None): - """ - body_feats (list[Tensor]): Feature maps from backbone - rois (Tensor): RoIs generated from RPN module - rois_num (Tensor): The number of RoIs in each image - inputs (dict{Tensor}): The ground-truth of image - """ - targets = [] - if self.training: - rois, rois_num, targets = self.bbox_assigner( - rois, - rois_num, - inputs, - add_gt_as_proposals=self.add_gt_as_proposals[0]) - targets_list = [targets] - self.assigned_rois = (rois, rois_num) - self.assigned_targets = targets - - pred_bbox = None - head_out_list = [] - for i in range(self.num_cascade_stages): - if i > 0: - rois, rois_num = self._get_rois_from_boxes(pred_bbox, - inputs['im_shape']) - if self.training: - rois, rois_num, targets = self.bbox_assigner( - rois, - rois_num, - inputs, - i, - is_cascade=True, - add_gt_as_proposals=self.add_gt_as_proposals[i]) - targets_list.append(targets) - - rois_feat = self.roi_extractor(body_feats, rois, rois_num) - bbox_feat = self.head(rois_feat, i) - scores = self.bbox_score_list[i](bbox_feat) - deltas = self.bbox_delta_list[i](bbox_feat) - - # TODO (lyuwenyu) Is it correct for only one class ? 
-            if not self.reg_class_agnostic and i < self.num_cascade_stages - 1:
-                deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4])
-                labels = scores[:, :-1].argmax(axis=-1)
-
-                if self.training:
-                    deltas = deltas[paddle.arange(deltas.shape[0]), labels]
-                else:
-                    deltas = deltas[((deltas + 10000) * F.one_hot(
-                        labels, num_classes=self.num_classes).unsqueeze(-1) != 0
-                                     ).nonzero(as_tuple=True)].reshape(
-                                         [deltas.shape[0], 4])
-
-            head_out_list.append([scores, deltas, rois])
-            pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i])
-
-        if self.training:
-            loss = {}
-            for stage, value in enumerate(zip(head_out_list, targets_list)):
-                (scores, deltas, rois), targets = value
-                loss_stage = self.get_loss(
-                    scores,
-                    deltas,
-                    targets,
-                    rois,
-                    self.bbox_weight[stage],
-                    loss_normalize_pos=self.loss_normalize_pos)
-                for k, v in loss_stage.items():
-                    loss[k + "_stage{}".format(
-                        stage)] = v * self.stage_loss_weights[stage]
-
-            return loss, bbox_feat
-        else:
-            scores, deltas, self.refined_rois = self.get_prediction(
-                head_out_list)
-            return (deltas, scores), self.head
-
-    def _get_rois_from_boxes(self, boxes, im_shape):
-        rois = []
-        for i, boxes_per_image in enumerate(boxes):
-            clip_box = clip_bbox(boxes_per_image, im_shape[i])
-            if self.training:
-                keep = nonempty_bbox(clip_box)
-                if keep.shape[0] == 0:
-                    keep = paddle.zeros([1], dtype='int32')
-                clip_box = paddle.gather(clip_box, keep)
-            rois.append(clip_box)
-        rois_num = paddle.concat([paddle.shape(r)[0:1] for r in rois])
-        return rois, rois_num
-
-    def _get_pred_bbox(self, deltas, proposals, weights):
-        pred_proposals = paddle.concat(proposals) if len(
-            proposals) > 1 else proposals[0]
-        pred_bbox = delta2bbox(deltas, pred_proposals, weights)
-        pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]])
-        num_prop = []
-        for p in proposals:
-            num_prop.append(p.shape[0])
-
-        # NOTE(dev): num_prop will be tagged as LoDTensorArray because it
-        # depends on batch_size under @to_static. However the argument
-        # num_or_sections in paddle.split does not support LoDTensorArray,
-        # so we use [-1] to replace it if num_prop is not a list. This
-        # ensures the correctness of both dynamic and static graphs.
-        if not isinstance(num_prop, list):
-            num_prop = [-1]
-        return pred_bbox.split(num_prop)
-
-    def get_prediction(self, head_out_list):
-        """
-        head_out_list(List[Tensor]): scores, deltas, rois
-        """
-        pred_list = []
-        scores_list = [F.softmax(head[0]) for head in head_out_list]
-        scores = paddle.add_n(scores_list) / self.num_cascade_stages
-        # Get deltas and rois from the last stage
-        _, deltas, rois = head_out_list[-1]
-        return scores, deltas, rois
-
-    def get_refined_rois(self, ):
-        return self.refined_rois
diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/centernet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/centernet_head.py
deleted file mode 100644
index 7657774..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/heads/centernet_head.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
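# --- editor's sketch (not part of the original diff) -----------------------
# CascadeHead.get_prediction() above averages the per-stage softmax scores
# and keeps the last stage's deltas/rois. The averaging step in isolation,
# with made-up logits for the three cascade stages:
#
#   import paddle
#   import paddle.nn.functional as F
#
#   stage_logits = [paddle.to_tensor([[2.0, 0.1]]),
#                   paddle.to_tensor([[1.5, 0.3]]),
#                   paddle.to_tensor([[1.8, 0.2]])]
#   scores_list = [F.softmax(s) for s in stage_logits]
#   scores = paddle.add_n(scores_list) / len(stage_logits)
#   print(scores)  # the ensemble of the three stages
# ----------------------------------------------------------------------------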
-# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Constant, Uniform -from ppdet.core.workspace import register -from ppdet.modeling.losses import CTFocalLoss, GIoULoss - - -class ConvLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=False): - super(ConvLayer, self).__init__() - bias_attr = False - fan_in = ch_in * kernel_size**2 - bound = 1 / math.sqrt(fan_in) - param_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) - if bias: - bias_attr = paddle.ParamAttr(initializer=Constant(0.)) - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - weight_attr=param_attr, - bias_attr=bias_attr) - - def forward(self, inputs): - out = self.conv(inputs) - return out - - -@register -class CenterNetHead(nn.Layer): - """ - Args: - in_channels (int): the channel number of input to CenterNetHead. - num_classes (int): the number of classes, 80 (COCO dataset) by default. - head_planes (int): the channel number in all head, 256 by default. - prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack - regress_ltrb (bool): whether to regress left/top/right/bottom or - width/height for a box, True by default. - size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'. - loss_weight (dict): the weight of each loss. - add_iou (bool): whether to add iou branch, False by default. - """ - - __shared__ = ['num_classes'] - - def __init__(self, - in_channels, - num_classes=80, - head_planes=256, - prior_bias=-2.19, - regress_ltrb=True, - size_loss='L1', - loss_weight={ - 'heatmap': 1.0, - 'size': 0.1, - 'offset': 1.0, - 'iou': 0.0, - }, - add_iou=False): - super(CenterNetHead, self).__init__() - self.regress_ltrb = regress_ltrb - self.loss_weight = loss_weight - self.add_iou = add_iou - - # heatmap head - self.heatmap = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - num_classes, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - with paddle.no_grad(): - self.heatmap[2].conv.bias[:] = prior_bias - - # size(ltrb or wh) head - self.size = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - 4 if regress_ltrb else 2, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - self.size_loss = size_loss - - # offset head - self.offset = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) - - # iou head (optinal) - if self.add_iou and 'iou' in self.loss_weight: - self.iou = nn.Sequential( - ConvLayer( - in_channels, - head_planes, - kernel_size=3, - padding=1, - bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - 4 if regress_ltrb else 2, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels} - - def forward(self, feat, inputs): - heatmap = F.sigmoid(self.heatmap(feat)) - size = self.size(feat) - offset = 
self.offset(feat) - head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset} - if self.add_iou and 'iou' in self.loss_weight: - iou = self.iou(feat) - head_outs.update({'iou': iou}) - - if self.training: - losses = self.get_loss(inputs, self.loss_weight, head_outs) - return losses - else: - return head_outs - - def get_loss(self, inputs, weights, head_outs): - # 1.heatmap(hm) head loss: CTFocalLoss - heatmap = head_outs['heatmap'] - heatmap_target = inputs['heatmap'] - heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4) - ctfocal_loss = CTFocalLoss() - heatmap_loss = ctfocal_loss(heatmap, heatmap_target) - - # 2.size(wh) head loss: L1 loss or GIoU loss - size = head_outs['size'] - index = inputs['index'] - mask = inputs['index_mask'] - size = paddle.transpose(size, perm=[0, 2, 3, 1]) - size_n, _, _, size_c = size.shape - size = paddle.reshape(size, shape=[size_n, -1, size_c]) - index = paddle.unsqueeze(index, 2) - batch_inds = list() - for i in range(size_n): - batch_ind = paddle.full( - shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - pos_size = paddle.gather_nd(size, index=index) - mask = paddle.unsqueeze(mask, axis=2) - size_mask = paddle.expand_as(mask, pos_size) - size_mask = paddle.cast(size_mask, dtype=pos_size.dtype) - pos_num = size_mask.sum() - size_mask.stop_gradient = True - if self.size_loss == 'L1': - if self.regress_ltrb: - size_target = inputs['size'] - # shape: [bs, max_per_img, 4] - else: - if inputs['size'].shape[-1] == 2: - # inputs['size'] is wh, and regress as wh - # shape: [bs, max_per_img, 2] - size_target = inputs['size'] - else: - # inputs['size'] is ltrb, but regress as wh - # shape: [bs, max_per_img, 4] - size_target = inputs['size'][:, :, 0:2] + inputs[ - 'size'][:, :, 2:] - - size_target.stop_gradient = True - size_loss = F.l1_loss( - pos_size * size_mask, size_target * size_mask, reduction='sum') - size_loss = size_loss / (pos_num + 1e-4) - elif self.size_loss == 'giou': - size_target = inputs['bbox_xys'] - size_target.stop_gradient = True - centers_x = (size_target[:, :, 0:1] + size_target[:, :, 2:3]) / 2.0 - centers_y = (size_target[:, :, 1:2] + size_target[:, :, 3:4]) / 2.0 - x1 = centers_x - pos_size[:, :, 0:1] - y1 = centers_y - pos_size[:, :, 1:2] - x2 = centers_x + pos_size[:, :, 2:3] - y2 = centers_y + pos_size[:, :, 3:4] - pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) - giou_loss = GIoULoss(reduction='sum') - size_loss = giou_loss( - pred_boxes * size_mask, - size_target * size_mask, - iou_weight=size_mask, - loc_reweight=None) - size_loss = size_loss / (pos_num + 1e-4) - - # 3.offset(reg) head loss: L1 loss - offset = head_outs['offset'] - offset_target = inputs['offset'] - offset = paddle.transpose(offset, perm=[0, 2, 3, 1]) - offset_n, _, _, offset_c = offset.shape - offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c]) - pos_offset = paddle.gather_nd(offset, index=index) - offset_mask = paddle.expand_as(mask, pos_offset) - offset_mask = paddle.cast(offset_mask, dtype=pos_offset.dtype) - pos_num = offset_mask.sum() - offset_mask.stop_gradient = True - offset_target.stop_gradient = True - offset_loss = F.l1_loss( - pos_offset * offset_mask, - offset_target * offset_mask, - reduction='sum') - offset_loss = offset_loss / (pos_num + 1e-4) - - # 4.iou head loss: GIoU loss (optinal) - if self.add_iou and 'iou' in self.loss_weight: - iou = head_outs['iou'] - iou = paddle.transpose(iou, 
perm=[0, 2, 3, 1]) - iou_n, _, _, iou_c = iou.shape - iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c]) - pos_iou = paddle.gather_nd(iou, index=index) - iou_mask = paddle.expand_as(mask, pos_iou) - iou_mask = paddle.cast(iou_mask, dtype=pos_iou.dtype) - pos_num = iou_mask.sum() - iou_mask.stop_gradient = True - gt_bbox_xys = inputs['bbox_xys'] - gt_bbox_xys.stop_gradient = True - centers_x = (gt_bbox_xys[:, :, 0:1] + gt_bbox_xys[:, :, 2:3]) / 2.0 - centers_y = (gt_bbox_xys[:, :, 1:2] + gt_bbox_xys[:, :, 3:4]) / 2.0 - x1 = centers_x - pos_size[:, :, 0:1] - y1 = centers_y - pos_size[:, :, 1:2] - x2 = centers_x + pos_size[:, :, 2:3] - y2 = centers_y + pos_size[:, :, 3:4] - pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) - giou_loss = GIoULoss(reduction='sum') - iou_loss = giou_loss( - pred_boxes * iou_mask, - gt_bbox_xys * iou_mask, - iou_weight=iou_mask, - loc_reweight=None) - iou_loss = iou_loss / (pos_num + 1e-4) - - losses = { - 'heatmap_loss': heatmap_loss, - 'size_loss': size_loss, - 'offset_loss': offset_loss, - } - det_loss = weights['heatmap'] * heatmap_loss + weights[ - 'size'] * size_loss + weights['offset'] * offset_loss - - if self.add_iou and 'iou' in self.loss_weight: - losses.update({'iou_loss': iou_loss}) - det_loss += weights['iou'] * iou_loss - losses.update({'det_loss': det_loss}) - return losses diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/centertrack_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/centertrack_head.py deleted file mode 100644 index dc35336..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/centertrack_head.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from .centernet_head import ConvLayer -from ..keypoint_utils import get_affine_transform - -__all__ = ['CenterTrackHead'] - - -@register -class CenterTrackHead(nn.Layer): - """ - Args: - in_channels (int): the channel number of input to CenterNetHead. - num_classes (int): the number of classes, 1 (MOT17 dataset) by default. - head_planes (int): the channel number in all head, 256 by default. - task (str): the type of task for regression, 'tracking' by default. - loss_weight (dict): the weight of each loss. - add_ltrb_amodal (bool): whether to add ltrb_amodal branch, False by default. 
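# The CenterNet losses above and the CenterTrack losses below gather the
# predictions at annotated object centers with one shared indexing pattern:
# flatten the HxW map, prepend a batch-index column, then paddle.gather_nd.
# A toy sketch with assumed sizes (not part of the deleted files):
import paddle

bs, c, h, w, max_objs = 2, 2, 4, 4, 5
pred = paddle.randn([bs, c, h, w])
pred = pred.transpose([0, 2, 3, 1]).reshape([bs, h * w, c])
index = paddle.randint(0, h * w, shape=[bs, max_objs, 1], dtype='int64')
batch_inds = paddle.concat([
    paddle.full([1, max_objs, 1], i, dtype='int64') for i in range(bs)
])
pos_pred = paddle.gather_nd(pred, paddle.concat([batch_inds, index], axis=2))
# pos_pred: [bs, max_objs, c], one prediction vector per object center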
- """ - - __shared__ = ['num_classes'] - - def __init__(self, - in_channels, - num_classes=1, - head_planes=256, - task='tracking', - loss_weight={ - 'tracking': 1.0, - 'ltrb_amodal': 0.1, - }, - add_ltrb_amodal=True): - super(CenterTrackHead, self).__init__() - self.task = task - self.loss_weight = loss_weight - self.add_ltrb_amodal = add_ltrb_amodal - - # tracking head - self.tracking = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) - - # ltrb_amodal head - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - self.ltrb_amodal = nn.Sequential( - ConvLayer( - in_channels, - head_planes, - kernel_size=3, - padding=1, - bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - 4, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - - # TODO: add more tasks - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels} - - def forward(self, - feat, - inputs, - bboxes=None, - bbox_inds=None, - topk_clses=None, - topk_ys=None, - topk_xs=None): - tracking = self.tracking(feat) - head_outs = {'tracking': tracking} - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - ltrb_amodal = self.ltrb_amodal(feat) - head_outs.update({'ltrb_amodal': ltrb_amodal}) - - if self.training: - losses = self.get_loss(inputs, self.loss_weight, head_outs) - return losses - else: - ret = self.generic_decode(head_outs, bboxes, bbox_inds, topk_ys, - topk_xs) - return ret - - def get_loss(self, inputs, weights, head_outs): - index = inputs['index'].unsqueeze(2) - mask = inputs['index_mask'].unsqueeze(2) - batch_inds = list() - for i in range(head_outs['tracking'].shape[0]): - batch_ind = paddle.full( - shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - - # 1.tracking head loss: L1 loss - tracking = head_outs['tracking'].transpose([0, 2, 3, 1]) - tracking_target = inputs['tracking'] - bs, _, _, c = tracking.shape - tracking = tracking.reshape([bs, -1, c]) - pos_tracking = paddle.gather_nd(tracking, index=index) - tracking_mask = paddle.cast( - paddle.expand_as(mask, pos_tracking), dtype=pos_tracking.dtype) - pos_num = tracking_mask.sum() - tracking_mask.stop_gradient = True - tracking_target.stop_gradient = True - tracking_loss = F.l1_loss( - pos_tracking * tracking_mask, - tracking_target * tracking_mask, - reduction='sum') - tracking_loss = tracking_loss / (pos_num + 1e-4) - - # 2.ltrb_amodal head loss(optinal): L1 loss - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - ltrb_amodal = head_outs['ltrb_amodal'].transpose([0, 2, 3, 1]) - ltrb_amodal_target = inputs['ltrb_amodal'] - bs, _, _, c = ltrb_amodal.shape - ltrb_amodal = ltrb_amodal.reshape([bs, -1, c]) - pos_ltrb_amodal = paddle.gather_nd(ltrb_amodal, index=index) - ltrb_amodal_mask = paddle.cast( - paddle.expand_as(mask, pos_ltrb_amodal), - dtype=pos_ltrb_amodal.dtype) - pos_num = ltrb_amodal_mask.sum() - ltrb_amodal_mask.stop_gradient = True - ltrb_amodal_target.stop_gradient = True - ltrb_amodal_loss = F.l1_loss( - pos_ltrb_amodal * ltrb_amodal_mask, - ltrb_amodal_target * ltrb_amodal_mask, - reduction='sum') - ltrb_amodal_loss = ltrb_amodal_loss / (pos_num + 1e-4) - - losses = {'tracking_loss': tracking_loss, } - plugin_loss 
= weights['tracking'] * tracking_loss - - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - losses.update({'ltrb_amodal_loss': ltrb_amodal_loss}) - plugin_loss += weights['ltrb_amodal'] * ltrb_amodal_loss - losses.update({'plugin_loss': plugin_loss}) - return losses - - def generic_decode(self, head_outs, bboxes, bbox_inds, topk_ys, topk_xs): - topk_ys = paddle.floor(topk_ys) # note: More accurate - topk_xs = paddle.floor(topk_xs) - cts = paddle.concat([topk_xs, topk_ys], 1) - ret = {'bboxes': bboxes, 'cts': cts} - - regression_heads = ['tracking'] # todo: add more tasks - for head in regression_heads: - if head in head_outs: - ret[head] = _tranpose_and_gather_feat(head_outs[head], - bbox_inds) - - if 'ltrb_amodal' in head_outs: - ltrb_amodal = head_outs['ltrb_amodal'] - ltrb_amodal = _tranpose_and_gather_feat(ltrb_amodal, bbox_inds) - bboxes_amodal = paddle.concat( - [ - topk_xs * 1.0 + ltrb_amodal[..., 0:1], - topk_ys * 1.0 + ltrb_amodal[..., 1:2], - topk_xs * 1.0 + ltrb_amodal[..., 2:3], - topk_ys * 1.0 + ltrb_amodal[..., 3:4] - ], - axis=1) - ret['bboxes'] = paddle.concat([bboxes[:, 0:2], bboxes_amodal], 1) - # cls_id, score, x0, y0, x1, y1 - - return ret - - def centertrack_post_process(self, dets, meta, out_thresh): - if not ('bboxes' in dets): - return [{}] - - preds = [] - c, s = meta['center'].numpy(), meta['scale'].numpy() - h, w = meta['out_height'].numpy(), meta['out_width'].numpy() - trans = get_affine_transform( - center=c[0], - input_size=s[0], - rot=0, - output_size=[w[0], h[0]], - shift=(0., 0.), - inv=True).astype(np.float32) - for i, dets_bbox in enumerate(dets['bboxes']): - if dets_bbox[1] < out_thresh: - break - item = {} - item['score'] = dets_bbox[1] - item['class'] = int(dets_bbox[0]) + 1 - item['ct'] = transform_preds_with_trans( - dets['cts'][i].reshape([1, 2]), trans).reshape(2) - - if 'tracking' in dets: - tracking = transform_preds_with_trans( - (dets['tracking'][i] + dets['cts'][i]).reshape([1, 2]), - trans).reshape(2) - item['tracking'] = tracking - item['ct'] - - if 'bboxes' in dets: - bbox = transform_preds_with_trans( - dets_bbox[2:6].reshape([2, 2]), trans).reshape(4) - item['bbox'] = bbox - - preds.append(item) - return preds - - -def transform_preds_with_trans(coords, trans): - target_coords = np.ones((coords.shape[0], 3), np.float32) - target_coords[:, :2] = coords - target_coords = np.dot(trans, target_coords.transpose()).transpose() - return target_coords[:, :2] - - -def _tranpose_and_gather_feat(feat, bbox_inds): - feat = feat.transpose([0, 2, 3, 1]) - feat = feat.reshape([-1, feat.shape[3]]) - feat = paddle.gather(feat, bbox_inds) - return feat diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/clrnet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/clrnet_head.py deleted file mode 100644 index 14760b9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/clrnet_head.py +++ /dev/null @@ -1,399 +0,0 @@ -import math -import paddle -import numpy as np -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ppdet.modeling.initializer import normal_ -from ppdet.modeling.lane_utils import Lane -from ppdet.modeling.losses import line_iou -from ppdet.modeling.clrnet_utils import ROIGather, LinearModule, SegDecoder - -__all__ = ['CLRHead'] - - -@register -class CLRHead(nn.Layer): - __inject__ = ['loss'] - __shared__ = [ - 'img_w', 'img_h', 'ori_img_h', 'num_classes', 'cut_height', - 'num_points', "max_lanes" - ] - - def __init__(self, - num_points=72, - prior_feat_channels=64, - 
fc_hidden_dim=64, - num_priors=192, - img_w=800, - img_h=320, - ori_img_h=590, - cut_height=270, - num_classes=5, - num_fc=2, - refine_layers=3, - sample_points=36, - conf_threshold=0.4, - nms_thres=0.5, - max_lanes=4, - loss='CLRNetLoss'): - super(CLRHead, self).__init__() - self.img_w = img_w - self.img_h = img_h - self.n_strips = num_points - 1 - self.n_offsets = num_points - self.num_priors = num_priors - self.sample_points = sample_points - self.refine_layers = refine_layers - self.num_classes = num_classes - self.fc_hidden_dim = fc_hidden_dim - self.ori_img_h = ori_img_h - self.cut_height = cut_height - self.conf_threshold = conf_threshold - self.nms_thres = nms_thres - self.max_lanes = max_lanes - self.prior_feat_channels = prior_feat_channels - self.loss = loss - self.register_buffer( - name='sample_x_indexs', - tensor=(paddle.linspace( - start=0, stop=1, num=self.sample_points, - dtype=paddle.float32) * self.n_strips).astype(dtype='int64')) - self.register_buffer( - name='prior_feat_ys', - tensor=paddle.flip( - x=(1 - self.sample_x_indexs.astype('float32') / self.n_strips), - axis=[-1])) - self.register_buffer( - name='prior_ys', - tensor=paddle.linspace( - start=1, stop=0, num=self.n_offsets).astype('float32')) - self.prior_feat_channels = prior_feat_channels - self._init_prior_embeddings() - init_priors, priors_on_featmap = self.generate_priors_from_embeddings() - self.register_buffer(name='priors', tensor=init_priors) - self.register_buffer(name='priors_on_featmap', tensor=priors_on_featmap) - self.seg_decoder = SegDecoder(self.img_h, self.img_w, self.num_classes, - self.prior_feat_channels, - self.refine_layers) - reg_modules = list() - cls_modules = list() - for _ in range(num_fc): - reg_modules += [*LinearModule(self.fc_hidden_dim)] - cls_modules += [*LinearModule(self.fc_hidden_dim)] - self.reg_modules = nn.LayerList(sublayers=reg_modules) - self.cls_modules = nn.LayerList(sublayers=cls_modules) - self.roi_gather = ROIGather(self.prior_feat_channels, self.num_priors, - self.sample_points, self.fc_hidden_dim, - self.refine_layers) - self.reg_layers = nn.Linear( - in_features=self.fc_hidden_dim, - out_features=self.n_offsets + 1 + 2 + 1, - bias_attr=True) - self.cls_layers = nn.Linear( - in_features=self.fc_hidden_dim, out_features=2, bias_attr=True) - self.init_weights() - - def init_weights(self): - for m in self.cls_layers.parameters(): - normal_(m, mean=0.0, std=0.001) - for m in self.reg_layers.parameters(): - normal_(m, mean=0.0, std=0.001) - - def pool_prior_features(self, batch_features, num_priors, prior_xs): - """ - pool prior feature from feature map. 
- Args: - batch_features (Tensor): Input feature maps, shape: (B, C, H, W) - """ - batch_size = batch_features.shape[0] - prior_xs = prior_xs.reshape([batch_size, num_priors, -1, 1]) - - prior_ys = self.prior_feat_ys.tile(repeat_times=[ - batch_size * num_priors - ]).reshape([batch_size, num_priors, -1, 1]) - prior_xs = prior_xs * 2.0 - 1.0 - prior_ys = prior_ys * 2.0 - 1.0 - grid = paddle.concat(x=(prior_xs, prior_ys), axis=-1) - feature = F.grid_sample( - x=batch_features, grid=grid, - align_corners=True).transpose(perm=[0, 2, 1, 3]) - feature = feature.reshape([ - batch_size * num_priors, self.prior_feat_channels, - self.sample_points, 1 - ]) - return feature - - def generate_priors_from_embeddings(self): - predictions = self.prior_embeddings.weight - # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, 72 coordinates, score[0] = negative prob, score[1] = positive prob - priors = paddle.zeros( - (self.num_priors, 2 + 2 + 2 + self.n_offsets), - dtype=predictions.dtype) - priors[:, 2:5] = predictions.clone() - priors[:, 6:] = ( - priors[:, 3].unsqueeze(1).clone().tile([1, self.n_offsets]) * - (self.img_w - 1) + - ((1 - self.prior_ys.tile([self.num_priors, 1]) - - priors[:, 2].unsqueeze(1).clone().tile([1, self.n_offsets])) * - self.img_h / paddle.tan(x=priors[:, 4].unsqueeze(1).clone().tile( - [1, self.n_offsets]) * math.pi + 1e-05))) / (self.img_w - 1) - priors_on_featmap = paddle.index_select( - priors, 6 + self.sample_x_indexs, axis=-1) - return priors, priors_on_featmap - - def _init_prior_embeddings(self): - self.prior_embeddings = nn.Embedding(self.num_priors, 3) - bottom_priors_nums = self.num_priors * 3 // 4 - left_priors_nums, _ = self.num_priors // 8, self.num_priors // 8 - strip_size = 0.5 / (left_priors_nums // 2 - 1) - bottom_strip_size = 1 / (bottom_priors_nums // 4 + 1) - - with paddle.no_grad(): - for i in range(left_priors_nums): - self.prior_embeddings.weight[i, 0] = i // 2 * strip_size - self.prior_embeddings.weight[i, 1] = 0.0 - self.prior_embeddings.weight[i, - 2] = 0.16 if i % 2 == 0 else 0.32 - - for i in range(left_priors_nums, - left_priors_nums + bottom_priors_nums): - self.prior_embeddings.weight[i, 0] = 0.0 - self.prior_embeddings.weight[i, 1] = ( - (i - left_priors_nums) // 4 + 1) * bottom_strip_size - self.prior_embeddings.weight[i, 2] = 0.2 * (i % 4 + 1) - - for i in range(left_priors_nums + bottom_priors_nums, - self.num_priors): - self.prior_embeddings.weight[i, 0] = ( - i - left_priors_nums - bottom_priors_nums) // 2 * strip_size - self.prior_embeddings.weight[i, 1] = 1.0 - self.prior_embeddings.weight[i, - 2] = 0.68 if i % 2 == 0 else 0.84 - - def forward(self, x, inputs=None): - """ - Take pyramid features as input to perform Cross Layer Refinement and finally output the prediction lanes. - Each feature is a 4D tensor. 
- Args: - x: input features (list[Tensor]) - Return: - prediction_list: each layer's prediction result - seg: segmentation result for auxiliary loss - """ - batch_features = list(x[len(x) - self.refine_layers:]) - batch_features.reverse() - batch_size = batch_features[-1].shape[0] - - if self.training: - self.priors, self.priors_on_featmap = self.generate_priors_from_embeddings( - ) - priors, priors_on_featmap = self.priors.tile( - [batch_size, 1, - 1]), self.priors_on_featmap.tile([batch_size, 1, 1]) - predictions_lists = [] - prior_features_stages = [] - - for stage in range(self.refine_layers): - num_priors = priors_on_featmap.shape[1] - prior_xs = paddle.flip(x=priors_on_featmap, axis=[2]) - batch_prior_features = self.pool_prior_features( - batch_features[stage], num_priors, prior_xs) - prior_features_stages.append(batch_prior_features) - - fc_features = self.roi_gather(prior_features_stages, - batch_features[stage], stage) - # return fc_features - fc_features = fc_features.reshape( - [num_priors, batch_size, -1]).reshape( - [batch_size * num_priors, self.fc_hidden_dim]) - cls_features = fc_features.clone() - reg_features = fc_features.clone() - - for cls_layer in self.cls_modules: - cls_features = cls_layer(cls_features) - - # return cls_features - for reg_layer in self.reg_modules: - reg_features = reg_layer(reg_features) - cls_logits = self.cls_layers(cls_features) - reg = self.reg_layers(reg_features) - - cls_logits = cls_logits.reshape( - [batch_size, -1, cls_logits.shape[1]]) - reg = reg.reshape([batch_size, -1, reg.shape[1]]) - predictions = priors.clone() - predictions[:, :, :2] = cls_logits - predictions[:, :, 2:5] += reg[:, :, :3] - predictions[:, :, 5] = reg[:, :, 3] - - def tran_tensor(t): - return t.unsqueeze(axis=2).clone().tile([1, 1, self.n_offsets]) - - predictions[..., 6:] = ( - tran_tensor(predictions[..., 3]) * (self.img_w - 1) + - ((1 - self.prior_ys.tile([batch_size, num_priors, 1]) - - tran_tensor(predictions[..., 2])) * self.img_h / paddle.tan( - tran_tensor(predictions[..., 4]) * math.pi + 1e-05))) / ( - self.img_w - 1) - - prediction_lines = predictions.clone() - predictions[..., 6:] += reg[..., 4:] - predictions_lists.append(predictions) - - if stage != self.refine_layers - 1: - priors = prediction_lines.detach().clone() - priors_on_featmap = priors.index_select( - 6 + self.sample_x_indexs, axis=-1) - - if self.training: - seg = None - seg_features = paddle.concat( - [ - F.interpolate( - feature, - size=[ - batch_features[-1].shape[2], - batch_features[-1].shape[3] - ], - mode='bilinear', - align_corners=False) for feature in batch_features - ], - axis=1) - - seg = self.seg_decoder(seg_features) - - output = {'predictions_lists': predictions_lists, 'seg': seg} - return self.loss(output, inputs) - return predictions_lists[-1] - - def predictions_to_pred(self, predictions): - """ - Convert predictions to internal Lane structure for evaluation. - """ - self.prior_ys = paddle.to_tensor(self.prior_ys) - self.prior_ys = self.prior_ys.astype('float64') - lanes = [] - for lane in predictions: - lane_xs = lane[6:].clone() - start = min( - max(0, int(round(lane[2].item() * self.n_strips))), - self.n_strips) - length = int(round(lane[5].item())) - end = start + length - 1 - end = min(end, len(self.prior_ys) - 1) - if start > 0: - mask = ((lane_xs[:start] >= 0.) 
&
-                        (lane_xs[:start] <= 1.)).cpu().detach().numpy()[::-1]
-                mask = ~((mask.cumprod()[::-1]).astype(bool))
-                lane_xs[:start][mask] = -2
-            if end < len(self.prior_ys) - 1:
-                lane_xs[end + 1:] = -2
-
-            lane_ys = self.prior_ys[lane_xs >= 0].clone()
-            lane_xs = lane_xs[lane_xs >= 0]
-            lane_xs = lane_xs.flip(axis=0).astype('float64')
-            lane_ys = lane_ys.flip(axis=0)
-
-            lane_ys = (lane_ys *
-                       (self.ori_img_h - self.cut_height) + self.cut_height
-                       ) / self.ori_img_h
-            if len(lane_xs) <= 1:
-                continue
-            points = paddle.stack(
-                x=(lane_xs.reshape([-1, 1]), lane_ys.reshape([-1, 1])),
-                axis=1).squeeze(axis=2)
-            lane = Lane(
-                points=points.cpu().numpy(),
-                metadata={
-                    'start_x': lane[3],
-                    'start_y': lane[2],
-                    'conf': lane[1]
-                })
-            lanes.append(lane)
-        return lanes
-
-    def lane_nms(self, predictions, scores, nms_overlap_thresh, top_k):
-        """
-        NMS for lane detection.
-        predictions: paddle.Tensor [num_lanes, conf, y, x, length, 72 offsets], e.g. [12, 77]
-        scores: paddle.Tensor [num_lanes]
-        nms_overlap_thresh: float
-        top_k: int
-        """
-        # sort by scores to get idx
-        idx = scores.argsort(descending=True)
-        keep = []
-
-        candidates = predictions.clone()
-        candidates = candidates.index_select(idx)
-
-        while len(candidates) > 0:
-            keep.append(idx[0])
-            if len(keep) >= top_k or len(candidates) == 1:
-                break
-
-            ious = []
-            for i in range(1, len(candidates)):
-                ious.append(1 - line_iou(
-                    candidates[i].unsqueeze(0),
-                    candidates[0].unsqueeze(0),
-                    img_w=self.img_w,
-                    length=15))
-            ious = paddle.to_tensor(ious)
-
-            mask = ious <= nms_overlap_thresh
-            id = paddle.where(mask == False)[0]
-
-            if id.shape[0] == 0:
-                break
-            candidates = candidates[1:].index_select(id)
-            idx = idx[1:].index_select(id)
-        keep = paddle.stack(keep)
-
-        return keep
-
-    def get_lanes(self, output, as_lanes=True):
-        """
-        Convert model output to lanes.
-        """
-        softmax = nn.Softmax(axis=1)
-        decoded = []
-
-        for predictions in output:
-            threshold = self.conf_threshold
-            scores = softmax(predictions[:, :2])[:, 1]
-            keep_inds = scores >= threshold
-            predictions = predictions[keep_inds]
-            scores = scores[keep_inds]
-
-            if predictions.shape[0] == 0:
-                decoded.append([])
-                continue
-            nms_predictions = predictions.detach().clone()
-            nms_predictions = paddle.concat(
-                x=[nms_predictions[..., :4], nms_predictions[..., 5:]], axis=-1)
-
-            nms_predictions[..., 4] = nms_predictions[..., 4] * self.n_strips
-            nms_predictions[..., 5:] = nms_predictions[..., 5:] * (
-                self.img_w - 1)
-
-            keep = self.lane_nms(
-                nms_predictions[..., 5:],
-                scores,
-                nms_overlap_thresh=self.nms_thres,
-                top_k=self.max_lanes)
-
-            predictions = predictions.index_select(keep)
-
-            if predictions.shape[0] == 0:
-                decoded.append([])
-                continue
-            predictions[:, 5] = paddle.round(predictions[:, 5] * self.n_strips)
-            if as_lanes:
-                pred = self.predictions_to_pred(predictions)
-            else:
-                pred = predictions
-            decoded.append(pred)
-        return decoded
diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/detr_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/detr_head.py
deleted file mode 100644
index d3c093f..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/heads/detr_head.py
+++ /dev/null
@@ -1,536 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
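# lane_nms above is ordinary greedy NMS with line IoU substituted for box IoU.
# The generic skeleton, written against a hypothetical pairwise iou_fn purely
# for illustration (not part of the deleted files):
import paddle

def greedy_nms(candidates, scores, iou_fn, thresh, top_k):
    # candidates: [N, D] tensor, scores: [N] tensor
    order = scores.argsort(descending=True).tolist()
    keep = []
    while order and len(keep) < top_k:
        best = order.pop(0)
        keep.append(best)
        # drop every remaining candidate that overlaps the kept one too much
        order = [i for i in order
                 if float(iou_fn(candidates[best], candidates[i])) <= thresh]
    return keep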
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -import pycocotools.mask as mask_util -from ..initializer import linear_init_, constant_ -from ..transformers.utils import inverse_sigmoid - -__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead'] - - -class MLP(nn.Layer): - """This code is based on - https://github.com/facebookresearch/detr/blob/main/models/detr.py - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.LayerList( - nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - self._reset_parameters() - - def _reset_parameters(self): - for l in self.layers: - linear_init_(l) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -class MultiHeadAttentionMap(nn.Layer): - """This code is based on - https://github.com/facebookresearch/detr/blob/main/models/segmentation.py - - This is a 2D attention module, which only returns the attention softmax (no multiplication by value) - """ - - def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, - bias=True): - super().__init__() - self.num_heads = num_heads - self.hidden_dim = hidden_dim - self.dropout = nn.Dropout(dropout) - - weight_attr = paddle.ParamAttr( - initializer=paddle.nn.initializer.XavierUniform()) - bias_attr = paddle.framework.ParamAttr( - initializer=paddle.nn.initializer.Constant()) if bias else False - - self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr) - self.k_proj = nn.Conv2D( - query_dim, - hidden_dim, - 1, - weight_attr=weight_attr, - bias_attr=bias_attr) - - self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5 - - def forward(self, q, k, mask=None): - q = self.q_proj(q) - k = self.k_proj(k) - bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\ - self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] - qh = q.reshape([bs, num_queries, n, c]) - kh = k.reshape([bs, n, c, h, w]) - # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) - qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c]) - kh = kh.reshape([-1, c, h * w]) - weights = paddle.bmm(qh * self.normalize_fact, kh).reshape( - [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4]) - - if mask is not None: - weights += mask - # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247 - weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape) - weights = self.dropout(weights) - return weights - - -class MaskHeadFPNConv(nn.Layer): - """This code is based on - https://github.com/facebookresearch/detr/blob/main/models/segmentation.py - - Simple convolutional head, using group norm. 
- Upsampling is done using a FPN approach - """ - - def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): - super().__init__() - - inter_dims = [input_dim, - ] + [context_dim // (2**i) for i in range(1, 5)] - weight_attr = paddle.ParamAttr( - initializer=paddle.nn.initializer.KaimingUniform()) - bias_attr = paddle.framework.ParamAttr( - initializer=paddle.nn.initializer.Constant()) - - self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups, - weight_attr, bias_attr) - self.conv_inter = nn.LayerList() - for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): - self.conv_inter.append( - self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr, - bias_attr)) - - self.conv_out = nn.Conv2D( - inter_dims[-1], - 1, - 3, - padding=1, - weight_attr=weight_attr, - bias_attr=bias_attr) - - self.adapter = nn.LayerList() - for i in range(len(fpn_dims)): - self.adapter.append( - nn.Conv2D( - fpn_dims[i], - inter_dims[i + 1], - 1, - weight_attr=weight_attr, - bias_attr=bias_attr)) - - def _make_layers(self, - in_dims, - out_dims, - kernel_size, - num_groups, - weight_attr=None, - bias_attr=None): - return nn.Sequential( - nn.Conv2D( - in_dims, - out_dims, - kernel_size, - padding=kernel_size // 2, - weight_attr=weight_attr, - bias_attr=bias_attr), - nn.GroupNorm(num_groups, out_dims), - nn.ReLU()) - - def forward(self, x, bbox_attention_map, fpns): - x = paddle.concat([ - x.tile([bbox_attention_map.shape[1], 1, 1, 1]), - bbox_attention_map.flatten(0, 1) - ], 1) - x = self.conv0(x) - for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1], - self.adapter, fpns): - feat = adapter_layer(feat).tile( - [bbox_attention_map.shape[1], 1, 1, 1]) - x = inter_layer(x) - x = feat + F.interpolate(x, size=feat.shape[-2:]) - - x = self.conv_inter[-1](x) - x = self.conv_out(x) - return x - - -@register -class DETRHead(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss'] - __inject__ = ['loss'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - nhead=8, - num_mlp_layers=3, - loss='DETRLoss', - fpn_dims=[1024, 512, 256], - with_mask_head=False, - use_focal_loss=False): - super(DETRHead, self).__init__() - # add background class - self.num_classes = num_classes if use_focal_loss else num_classes + 1 - self.hidden_dim = hidden_dim - self.loss = loss - self.with_mask_head = with_mask_head - self.use_focal_loss = use_focal_loss - - self.score_head = nn.Linear(hidden_dim, self.num_classes) - self.bbox_head = MLP(hidden_dim, - hidden_dim, - output_dim=4, - num_layers=num_mlp_layers) - if self.with_mask_head: - self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, - nhead) - self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, - hidden_dim) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.score_head) - - @classmethod - def from_config(cls, cfg, hidden_dim, nhead, input_shape): - - return { - 'hidden_dim': hidden_dim, - 'nhead': nhead, - 'fpn_dims': [i.channels for i in input_shape[::-1]][1:] - } - - @staticmethod - def get_gt_mask_from_polygons(gt_poly, pad_mask): - out_gt_mask = [] - for polygons, padding in zip(gt_poly, pad_mask): - height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) - masks = [] - for obj_poly in polygons: - rles = mask_util.frPyObjects(obj_poly, height, width) - rle = mask_util.merge(rles) - masks.append( - paddle.to_tensor(mask_util.decode(rle)).astype('float32')) - masks = paddle.stack(masks) - masks_pad = paddle.zeros( - [masks.shape[0], pad_mask.shape[1], 
pad_mask.shape[2]]) - masks_pad[:, :height, :width] = masks - out_gt_mask.append(masks_pad) - return out_gt_mask - - def forward(self, out_transformer, body_feats, inputs=None): - r""" - Args: - out_transformer (Tuple): (feats: [num_levels, batch_size, - num_queries, hidden_dim], - memory: [batch_size, hidden_dim, h, w], - src_proj: [batch_size, h*w, hidden_dim], - src_mask: [batch_size, 1, 1, h, w]) - body_feats (List(Tensor)): list[[B, C, H, W]] - inputs (dict): dict(inputs) - """ - feats, memory, src_proj, src_mask = out_transformer - outputs_logit = self.score_head(feats) - outputs_bbox = F.sigmoid(self.bbox_head(feats)) - outputs_seg = None - if self.with_mask_head: - bbox_attention_map = self.bbox_attention(feats[-1], memory, - src_mask) - fpn_feats = [a for a in body_feats[::-1]][1:] - outputs_seg = self.mask_head(src_proj, bbox_attention_map, - fpn_feats) - outputs_seg = outputs_seg.reshape([ - feats.shape[1], feats.shape[2], outputs_seg.shape[-2], - outputs_seg.shape[-1] - ]) - - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - gt_mask = self.get_gt_mask_from_polygons( - inputs['gt_poly'], - inputs['pad_mask']) if 'gt_poly' in inputs else None - return self.loss( - outputs_bbox, - outputs_logit, - inputs['gt_bbox'], - inputs['gt_class'], - masks=outputs_seg, - gt_mask=gt_mask) - else: - return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) - - -@register -class DeformableDETRHead(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - __inject__ = ['loss'] - - def __init__(self, - num_classes=80, - hidden_dim=512, - nhead=8, - num_mlp_layers=3, - loss='DETRLoss'): - super(DeformableDETRHead, self).__init__() - self.num_classes = num_classes - self.hidden_dim = hidden_dim - self.nhead = nhead - self.loss = loss - - self.score_head = nn.Linear(hidden_dim, self.num_classes) - self.bbox_head = MLP(hidden_dim, - hidden_dim, - output_dim=4, - num_layers=num_mlp_layers) - - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.score_head) - constant_(self.score_head.bias, -4.595) - constant_(self.bbox_head.layers[-1].weight) - - with paddle.no_grad(): - bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) - bias[2:] = -2.0 - self.bbox_head.layers[-1].bias.set_value(bias) - - @classmethod - def from_config(cls, cfg, hidden_dim, nhead, input_shape): - return {'hidden_dim': hidden_dim, 'nhead': nhead} - - def forward(self, out_transformer, body_feats, inputs=None): - r""" - Args: - out_transformer (Tuple): (feats: [num_levels, batch_size, - num_queries, hidden_dim], - memory: [batch_size, - \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], - reference_points: [batch_size, num_queries, 2]) - body_feats (List(Tensor)): list[[B, C, H, W]] - inputs (dict): dict(inputs) - """ - feats, memory, reference_points = out_transformer - reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) - outputs_bbox = self.bbox_head(feats) - - # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", - # but the gradient is wrong in paddle. 
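# i.e. instead of updating a slice in place, a fresh tensor is assembled
# out-of-place so autograd tracks the addition correctly. A minimal
# equivalent with assumed shapes (not part of the deleted files):
import paddle
import paddle.nn.functional as F

ref = paddle.rand([1, 2, 4, 2])  # stands in for inverse-sigmoid ref points
box = paddle.rand([1, 2, 4, 4])  # stands in for the raw bbox_head output
box = paddle.concat([box[..., :2] + ref, box[..., 2:]], axis=-1)
box = F.sigmoid(box)             # back to normalized box parameters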
- outputs_bbox = paddle.concat( - [ - outputs_bbox[:, :, :, :2] + reference_points, - outputs_bbox[:, :, :, 2:] - ], - axis=-1) - - outputs_bbox = F.sigmoid(outputs_bbox) - outputs_logit = self.score_head(feats) - - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - - return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'], - inputs['gt_class']) - else: - return (outputs_bbox[-1], outputs_logit[-1], None) - - -@register -class DINOHead(nn.Layer): - __inject__ = ['loss'] - - def __init__(self, loss='DINOLoss', eval_idx=-1): - super(DINOHead, self).__init__() - self.loss = loss - self.eval_idx = eval_idx - - def forward(self, out_transformer, body_feats, inputs=None): - (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) = out_transformer - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - - if dn_meta is not None: - if isinstance(dn_meta, list): - dual_groups = len(dn_meta) - 1 - dec_out_bboxes = paddle.split( - dec_out_bboxes, dual_groups + 1, axis=2) - dec_out_logits = paddle.split( - dec_out_logits, dual_groups + 1, axis=2) - enc_topk_bboxes = paddle.split( - enc_topk_bboxes, dual_groups + 1, axis=1) - enc_topk_logits = paddle.split( - enc_topk_logits, dual_groups + 1, axis=1) - - dec_out_bboxes_list = [] - dec_out_logits_list = [] - dn_out_bboxes_list = [] - dn_out_logits_list = [] - loss = {} - for g_id in range(dual_groups + 1): - if dn_meta[g_id] is not None: - dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( - dec_out_bboxes[g_id], - dn_meta[g_id]['dn_num_split'], - axis=2) - dn_out_logits_gid, dec_out_logits_gid = paddle.split( - dec_out_logits[g_id], - dn_meta[g_id]['dn_num_split'], - axis=2) - else: - dn_out_bboxes_gid, dn_out_logits_gid = None, None - dec_out_bboxes_gid = dec_out_bboxes[g_id] - dec_out_logits_gid = dec_out_logits[g_id] - out_bboxes_gid = paddle.concat([ - enc_topk_bboxes[g_id].unsqueeze(0), - dec_out_bboxes_gid - ]) - out_logits_gid = paddle.concat([ - enc_topk_logits[g_id].unsqueeze(0), - dec_out_logits_gid - ]) - loss_gid = self.loss( - out_bboxes_gid, - out_logits_gid, - inputs['gt_bbox'], - inputs['gt_class'], - dn_out_bboxes=dn_out_bboxes_gid, - dn_out_logits=dn_out_logits_gid, - dn_meta=dn_meta[g_id]) - # sum loss - for key, value in loss_gid.items(): - loss.update({ - key: loss.get(key, paddle.zeros([1])) + value - }) - - # average across (dual_groups + 1) - for key, value in loss.items(): - loss.update({key: value / (dual_groups + 1)}) - return loss - else: - dn_out_bboxes, dec_out_bboxes = paddle.split( - dec_out_bboxes, dn_meta['dn_num_split'], axis=2) - dn_out_logits, dec_out_logits = paddle.split( - dec_out_logits, dn_meta['dn_num_split'], axis=2) - else: - dn_out_bboxes, dn_out_logits = None, None - - out_bboxes = paddle.concat( - [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) - out_logits = paddle.concat( - [enc_topk_logits.unsqueeze(0), dec_out_logits]) - - return self.loss( - out_bboxes, - out_logits, - inputs['gt_bbox'], - inputs['gt_class'], - dn_out_bboxes=dn_out_bboxes, - dn_out_logits=dn_out_logits, - dn_meta=dn_meta, - gt_score=inputs.get('gt_score', None)) - else: - return (dec_out_bboxes[self.eval_idx], - dec_out_logits[self.eval_idx], None) - - -@register -class MaskDINOHead(nn.Layer): - __inject__ = ['loss'] - - def __init__(self, loss='DINOLoss'): - super(MaskDINOHead, self).__init__() - self.loss = loss - - def forward(self, out_transformer, body_feats, inputs=None): - (dec_out_logits, 
dec_out_bboxes, dec_out_masks, enc_out, init_out, - dn_meta) = out_transformer - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - assert 'gt_segm' in inputs - - if dn_meta is not None: - dn_out_logits, dec_out_logits = paddle.split( - dec_out_logits, dn_meta['dn_num_split'], axis=2) - dn_out_bboxes, dec_out_bboxes = paddle.split( - dec_out_bboxes, dn_meta['dn_num_split'], axis=2) - dn_out_masks, dec_out_masks = paddle.split( - dec_out_masks, dn_meta['dn_num_split'], axis=2) - if init_out is not None: - init_out_logits, init_out_bboxes, init_out_masks = init_out - init_out_logits_dn, init_out_logits = paddle.split( - init_out_logits, dn_meta['dn_num_split'], axis=1) - init_out_bboxes_dn, init_out_bboxes = paddle.split( - init_out_bboxes, dn_meta['dn_num_split'], axis=1) - init_out_masks_dn, init_out_masks = paddle.split( - init_out_masks, dn_meta['dn_num_split'], axis=1) - - dec_out_logits = paddle.concat( - [init_out_logits.unsqueeze(0), dec_out_logits]) - dec_out_bboxes = paddle.concat( - [init_out_bboxes.unsqueeze(0), dec_out_bboxes]) - dec_out_masks = paddle.concat( - [init_out_masks.unsqueeze(0), dec_out_masks]) - - dn_out_logits = paddle.concat( - [init_out_logits_dn.unsqueeze(0), dn_out_logits]) - dn_out_bboxes = paddle.concat( - [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes]) - dn_out_masks = paddle.concat( - [init_out_masks_dn.unsqueeze(0), dn_out_masks]) - else: - dn_out_bboxes, dn_out_logits = None, None - dn_out_masks = None - - enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out - out_logits = paddle.concat( - [enc_out_logits.unsqueeze(0), dec_out_logits]) - out_bboxes = paddle.concat( - [enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) - out_masks = paddle.concat( - [enc_out_masks.unsqueeze(0), dec_out_masks]) - - return self.loss( - out_bboxes, - out_logits, - inputs['gt_bbox'], - inputs['gt_class'], - masks=out_masks, - gt_mask=inputs['gt_segm'], - dn_out_logits=dn_out_logits, - dn_out_bboxes=dn_out_bboxes, - dn_out_masks=dn_out_masks, - dn_meta=dn_meta) - else: - return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/face_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/face_head.py deleted file mode 100644 index 360f909..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/face_head.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register -from ..layers import AnchorGeneratorSSD -from ..cls_utils import _get_class_default_kwargs - - -@register -class FaceHead(nn.Layer): - """ - Head block for Face detection network - - Args: - num_classes (int): Number of output classes. - in_channels (int): Number of input channels. - anchor_generator(object): instance of anchor genertor method. - kernel_size (int): kernel size of Conv2D in FaceHead. 
-        padding (int): padding of Conv2D in FaceHead.
-        conv_decay (float): weight decay for conv layer weights.
-        loss (object): loss of face detection model.
-    """
-    __shared__ = ['num_classes']
-    __inject__ = ['anchor_generator', 'loss']
-
-    def __init__(self,
-                 num_classes=80,
-                 in_channels=[96, 96],
-                 anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
-                 kernel_size=3,
-                 padding=1,
-                 conv_decay=0.,
-                 loss='SSDLoss'):
-        super(FaceHead, self).__init__()
-        # add background class
-        self.num_classes = num_classes + 1
-        self.in_channels = in_channels
-        self.anchor_generator = anchor_generator
-        self.loss = loss
-
-        if isinstance(anchor_generator, dict):
-            self.anchor_generator = AnchorGeneratorSSD(**anchor_generator)
-
-        self.num_priors = self.anchor_generator.num_priors
-        self.box_convs = []
-        self.score_convs = []
-        for i, num_prior in enumerate(self.num_priors):
-            box_conv_name = "boxes{}".format(i)
-            box_conv = self.add_sublayer(
-                box_conv_name,
-                nn.Conv2D(
-                    in_channels=self.in_channels[i],
-                    out_channels=num_prior * 4,
-                    kernel_size=kernel_size,
-                    padding=padding))
-            self.box_convs.append(box_conv)
-
-            score_conv_name = "scores{}".format(i)
-            score_conv = self.add_sublayer(
-                score_conv_name,
-                nn.Conv2D(
-                    in_channels=self.in_channels[i],
-                    out_channels=num_prior * self.num_classes,
-                    kernel_size=kernel_size,
-                    padding=padding))
-            self.score_convs.append(score_conv)
-
-    @classmethod
-    def from_config(cls, cfg, input_shape):
-        return {'in_channels': [i.channels for i in input_shape], }
-
-    def forward(self, feats, image, gt_bbox=None, gt_class=None):
-        box_preds = []
-        cls_scores = []
-        prior_boxes = []
-        for feat, box_conv, score_conv in zip(feats, self.box_convs,
-                                              self.score_convs):
-            box_pred = box_conv(feat)
-            box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])
-            box_pred = paddle.reshape(box_pred, [0, -1, 4])
-            box_preds.append(box_pred)
-
-            cls_score = score_conv(feat)
-            cls_score = paddle.transpose(cls_score, [0, 2, 3, 1])
-            cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes])
-            cls_scores.append(cls_score)
-
-        prior_boxes = self.anchor_generator(feats, image)
-
-        if self.training:
-            return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class,
-                                 prior_boxes)
-        else:
-            return (box_preds, cls_scores), prior_boxes
-
-    def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):
-        return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)
diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/fcos_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/fcos_head.py
deleted file mode 100644
index f975789..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/heads/fcos_head.py
+++ /dev/null
@@ -1,499 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
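# The FaceHead forward above flattens each level's conv output into
# per-anchor rows via a NCHW -> NHWC transpose plus reshape; a 0 in the
# paddle.reshape target shape keeps that dimension unchanged. Toy check with
# assumed sizes (not part of the deleted files):
import paddle

num_prior = 3
box_pred = paddle.randn([2, num_prior * 4, 10, 10])  # [N, C, H, W]
box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])  # [N, H, W, C]
box_pred = paddle.reshape(box_pred, [0, -1, 4])      # -> [2, 300, 4]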
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.core.workspace import register -from ppdet.modeling.layers import ConvNormLayer, MultiClassNMS - -__all__ = ['FCOSFeat', 'FCOSHead', 'FCOSHead_ARSL'] - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self): - super(ScaleReg, self).__init__() - self.scale_reg = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=1.)), - dtype="float32") - - def forward(self, inputs): - out = inputs * self.scale_reg - return out - - -@register -class FCOSFeat(nn.Layer): - """ - FCOSFeat of FCOS - - Args: - feat_in (int): The channel number of input Tensor. - feat_out (int): The channel number of output Tensor. - num_convs (int): The convolution number of the FCOSFeat. - norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. - use_dcn (bool): Whether to use dcn in tower or not. - """ - - def __init__(self, - feat_in=256, - feat_out=256, - num_convs=4, - norm_type='bn', - use_dcn=False): - super(FCOSFeat, self).__init__() - self.feat_in = feat_in - self.feat_out = feat_out - self.num_convs = num_convs - self.norm_type = norm_type - self.cls_subnet_convs = [] - self.reg_subnet_convs = [] - for i in range(self.num_convs): - in_c = feat_in if i == 0 else feat_out - - cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i) - cls_conv = self.add_sublayer( - cls_conv_name, - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=3, - stride=1, - norm_type=norm_type, - use_dcn=use_dcn, - bias_on=True, - lr_scale=2.)) - self.cls_subnet_convs.append(cls_conv) - - reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i) - reg_conv = self.add_sublayer( - reg_conv_name, - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=3, - stride=1, - norm_type=norm_type, - use_dcn=use_dcn, - bias_on=True, - lr_scale=2.)) - self.reg_subnet_convs.append(reg_conv) - - def forward(self, fpn_feat): - cls_feat = fpn_feat - reg_feat = fpn_feat - for i in range(self.num_convs): - cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat)) - reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat)) - return cls_feat, reg_feat - - -@register -class FCOSHead(nn.Layer): - """ - FCOSHead - Args: - num_classes (int): Number of classes - fcos_feat (object): Instance of 'FCOSFeat' - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - norm_reg_targets (bool): Normalization the regression target if true - centerness_on_reg (bool): The prediction of centerness on regression or clssification branch - num_shift (float): Relative offset between the center of the first shift and the top-left corner of img - fcos_loss (object): Instance of 'FCOSLoss' - nms (object): Instance of 'MultiClassNMS' - trt (bool): Whether to use trt in nms of deploy - """ - __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] - __shared__ = ['num_classes', 'trt'] - - def __init__(self, - num_classes=80, - fcos_feat='FCOSFeat', - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - multiply_strides_reg_targets=False, - norm_reg_targets=True, - centerness_on_reg=True, - num_shift=0.5, - sqrt_score=False, - fcos_loss='FCOSLoss', - nms='MultiClassNMS', - trt=False): - super(FCOSHead, self).__init__() - self.fcos_feat = 
fcos_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.fcos_loss = fcos_loss - self.norm_reg_targets = norm_reg_targets - self.centerness_on_reg = centerness_on_reg - self.multiply_strides_reg_targets = multiply_strides_reg_targets - self.num_shift = num_shift - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.sqrt_score = sqrt_score - self.is_teacher = False - - conv_cls_name = "fcos_head_cls" - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - self.fcos_head_cls = self.add_sublayer( - conv_cls_name, - nn.Conv2D( - in_channels=256, - out_channels=self.num_classes, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - - conv_reg_name = "fcos_head_reg" - self.fcos_head_reg = self.add_sublayer( - conv_reg_name, - nn.Conv2D( - in_channels=256, - out_channels=4, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - conv_centerness_name = "fcos_head_centerness" - self.fcos_head_centerness = self.add_sublayer( - conv_centerness_name, - nn.Conv2D( - in_channels=256, - out_channels=1, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.scales_regs = [] - for i in range(len(self.fpn_stride)): - lvl = int(math.log(int(self.fpn_stride[i]), 2)) - feat_name = 'p{}_feat'.format(lvl) - scale_reg = self.add_sublayer(feat_name, ScaleReg()) - self.scales_regs.append(scale_reg) - - def _compute_locations_by_level(self, fpn_stride, feature, num_shift=0.5): - """ - Compute locations of anchor points of each FPN layer - Args: - fpn_stride (int): The stride of current FPN feature map - feature (Tensor): Tensor of current FPN feature map - Return: - Anchor points locations of current FPN feature map - """ - h, w = feature.shape[2], feature.shape[3] - shift_x = paddle.arange(0, w * fpn_stride, fpn_stride) - shift_y = paddle.arange(0, h * fpn_stride, fpn_stride) - shift_x = paddle.unsqueeze(shift_x, axis=0) - shift_y = paddle.unsqueeze(shift_y, axis=1) - shift_x = paddle.expand(shift_x, shape=[h, w]) - shift_y = paddle.expand(shift_y, shape=[h, w]) - - shift_x = paddle.reshape(shift_x, shape=[-1]) - shift_y = paddle.reshape(shift_y, shape=[-1]) - location = paddle.stack( - [shift_x, shift_y], axis=-1) + float(fpn_stride * num_shift) - return location - - def forward(self, fpn_feats, targets=None): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - centerness_list = [] - for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, - self.fpn_stride, fpn_feats): - fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) - cls_logits = self.fcos_head_cls(fcos_cls_feat) - bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) - if self.centerness_on_reg: - centerness = self.fcos_head_centerness(fcos_reg_feat) - else: - centerness = self.fcos_head_centerness(fcos_cls_feat) - if self.norm_reg_targets: - bbox_reg = F.relu(bbox_reg) - if self.multiply_strides_reg_targets: - bbox_reg = bbox_reg * fpn_stride - else: - if not self.training or targets.get( - 'get_data', - False) or targets.get('is_teacher', False): - bbox_reg = bbox_reg * 
fpn_stride - else: - bbox_reg = paddle.exp(bbox_reg) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) - centerness_list.append(centerness) - - if targets is not None: - self.is_teacher = targets.get('is_teacher', False) - if self.is_teacher: - return [cls_logits_list, bboxes_reg_list, centerness_list] - - if self.training and targets is not None: - get_data = targets.get('get_data', False) - if get_data: - return [cls_logits_list, bboxes_reg_list, centerness_list] - - losses = {} - fcos_head_outs = [cls_logits_list, bboxes_reg_list, centerness_list] - losses_fcos = self.get_loss(fcos_head_outs, targets) - losses.update(losses_fcos) - - total_loss = paddle.add_n(list(losses.values())) - losses.update({'loss': total_loss}) - return losses - else: - # eval or infer - locations_list = [] - for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): - location = self._compute_locations_by_level(fpn_stride, feature, - self.num_shift) - locations_list.append(location) - - fcos_head_outs = [ - locations_list, cls_logits_list, bboxes_reg_list, - centerness_list - ] - return fcos_head_outs - - def get_loss(self, fcos_head_outs, targets): - cls_logits, bboxes_reg, centerness = fcos_head_outs - - # get labels,reg_target,centerness - tag_labels, tag_bboxes, tag_centerness = [], [], [] - for i in range(len(self.fpn_stride)): - k_lbl = 'labels{}'.format(i) - if k_lbl in targets: - tag_labels.append(targets[k_lbl]) - k_box = 'reg_target{}'.format(i) - if k_box in targets: - tag_bboxes.append(targets[k_box]) - k_ctn = 'centerness{}'.format(i) - if k_ctn in targets: - tag_centerness.append(targets[k_ctn]) - - losses_fcos = self.fcos_loss(cls_logits, bboxes_reg, centerness, - tag_labels, tag_bboxes, tag_centerness) - return losses_fcos - - def _post_process_by_level(self, - locations, - box_cls, - box_reg, - box_ctn, - sqrt_score=False): - box_scores = F.sigmoid(box_cls).flatten(2).transpose([0, 2, 1]) - box_centerness = F.sigmoid(box_ctn).flatten(2).transpose([0, 2, 1]) - pred_scores = box_scores * box_centerness - if sqrt_score: - pred_scores = paddle.sqrt(pred_scores) - - box_reg_ch_last = box_reg.flatten(2).transpose([0, 2, 1]) - box_reg_decoding = paddle.stack( - [ - locations[:, 0] - box_reg_ch_last[:, :, 0], - locations[:, 1] - box_reg_ch_last[:, :, 1], - locations[:, 0] + box_reg_ch_last[:, :, 2], - locations[:, 1] + box_reg_ch_last[:, :, 3] - ], - axis=1) - pred_boxes = box_reg_decoding.transpose([0, 2, 1]) - - return pred_scores, pred_boxes - - def post_process(self, fcos_head_outs, scale_factor): - locations, cls_logits, bboxes_reg, centerness = fcos_head_outs - pred_bboxes, pred_scores = [], [] - - for pts, cls, reg, ctn in zip(locations, cls_logits, bboxes_reg, - centerness): - scores, boxes = self._post_process_by_level(pts, cls, reg, ctn, - self.sqrt_score) - pred_scores.append(scores) - pred_bboxes.append(boxes) - pred_bboxes = paddle.concat(pred_bboxes, axis=1) - pred_scores = paddle.concat(pred_scores, axis=1) - - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) - pred_bboxes /= scale_factor - - pred_scores = pred_scores.transpose([0, 2, 1]) - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num - - -@register -class FCOSHead_ARSL(FCOSHead): - """ - FCOSHead of ARSL for semi-det(ssod) - Args: - fcos_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The 
stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - fcos_loss (object): Instance of 'FCOSLoss' - norm_reg_targets (bool): Normalize the regression target if true - centerness_on_reg (bool): Whether to predict centerness on the regression or classification branch - nms (object): Instance of 'MultiClassNMS' - trt (bool): Whether to use trt in nms of deploy - """ - __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] - __shared__ = ['num_classes', 'trt'] - - def __init__(self, - num_classes=80, - fcos_feat='FCOSFeat', - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - multiply_strides_reg_targets=False, - norm_reg_targets=True, - centerness_on_reg=True, - num_shift=0.5, - sqrt_score=False, - fcos_loss='FCOSLossMILC', - nms='MultiClassNMS', - trt=False): - super(FCOSHead_ARSL, self).__init__() - self.fcos_feat = fcos_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.fcos_loss = fcos_loss - self.norm_reg_targets = norm_reg_targets - self.centerness_on_reg = centerness_on_reg - self.multiply_strides_reg_targets = multiply_strides_reg_targets - self.num_shift = num_shift - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.sqrt_score = sqrt_score - - conv_cls_name = "fcos_head_cls" - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - self.fcos_head_cls = self.add_sublayer( - conv_cls_name, - nn.Conv2D( - in_channels=256, - out_channels=self.num_classes, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - - conv_reg_name = "fcos_head_reg" - self.fcos_head_reg = self.add_sublayer( - conv_reg_name, - nn.Conv2D( - in_channels=256, - out_channels=4, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - conv_centerness_name = "fcos_head_centerness" - self.fcos_head_centerness = self.add_sublayer( - conv_centerness_name, - nn.Conv2D( - in_channels=256, - out_channels=1, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.scales_regs = [] - for i in range(len(self.fpn_stride)): - lvl = int(math.log(int(self.fpn_stride[i]), 2)) - feat_name = 'p{}_feat'.format(lvl) - scale_reg = self.add_sublayer(feat_name, ScaleReg()) - self.scales_regs.append(scale_reg) - - def forward(self, fpn_feats, targets=None): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - centerness_list = [] - for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, - self.fpn_stride, fpn_feats): - fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) - cls_logits = self.fcos_head_cls(fcos_cls_feat) - bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) - if self.centerness_on_reg: - centerness = self.fcos_head_centerness(fcos_reg_feat) - else: - centerness = self.fcos_head_centerness(fcos_cls_feat) - if self.norm_reg_targets: - bbox_reg = F.relu(bbox_reg) - if not self.training: - bbox_reg = bbox_reg * fpn_stride - else: - bbox_reg = paddle.exp(bbox_reg) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) - centerness_list.append(centerness) - - if not self.training: -
locations_list = [] - for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): - location = self._compute_locations_by_level(fpn_stride, feature) - locations_list.append(location) - - return locations_list, cls_logits_list, bboxes_reg_list, centerness_list - else: - return cls_logits_list, bboxes_reg_list, centerness_list - - def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness): - cls_logits, bboxes_reg, centerness = fcos_head_outs - return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, - tag_bboxes, tag_centerness) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/fcosr_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/fcosr_head.py deleted file mode 100644 index df98883..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/fcosr_head.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from .fcos_head import ScaleReg -from ..initializer import bias_init_with_prob, constant_, normal_ -from ..ops import get_act_fn, anchor_generator -from ..rbox_utils import box2corners -from ..losses import ProbIoULoss -import numpy as np - -__all__ = ['FCOSRHead'] - - -def trunc_div(a, b): - ipt = paddle.divide(a, b) - sign_ipt = paddle.sign(ipt) - abs_ipt = paddle.abs(ipt) - abs_ipt = paddle.floor(abs_ipt) - out = paddle.multiply(sign_ipt, abs_ipt) - return out - - -def fmod(a, b): - return a - trunc_div(a, b) * b - - -def fmod_eval(a, b): - return a - a.divide(b).cast(paddle.int32).cast(paddle.float32) * b - - -class ConvBNLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=1, - groups=1, - padding=0, - norm_cfg={'name': 'gn', - 'num_groups': 32}, - act=None): - super(ConvBNLayer, self).__init__() - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - bias_attr=False) - - norm_type = norm_cfg['name'] - if norm_type in ['sync_bn', 'bn']: - self.norm = nn.BatchNorm2D( - ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - else: - groups = norm_cfg.get('num_groups', 1) - self.norm = nn.GroupNorm( - num_groups=groups, - num_channels=ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - x = self.conv(x) - x = self.norm(x) - x = self.act(x) - - return x - - -@register -class FCOSRHead(nn.Layer): - """ FCOSR Head, refer to https://arxiv.org/abs/2111.10780 for details """ - - __shared__ = ['num_classes', 'trt'] - __inject__ = ['assigner', 'nms'] - - def __init__(self, - num_classes=15, - in_channels=256, - feat_channels=256, - 
stacked_convs=4, - act='relu', - fpn_strides=[4, 8, 16, 32, 64], - trt=False, - loss_weight={'class': 1.0, - 'probiou': 1.0}, - norm_cfg={'name': 'gn', - 'num_groups': 32}, - assigner='FCOSRAssigner', - nms='MultiClassNMS'): - - super(FCOSRHead, self).__init__() - self.in_channels = in_channels - self.num_classes = num_classes - self.fpn_strides = fpn_strides - self.stacked_convs = stacked_convs - self.loss_weight = loss_weight - self.half_pi = paddle.to_tensor( - [1.5707963267948966], dtype=paddle.float32) - self.probiou_loss = ProbIoULoss(mode='l1') - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.trt = trt - self.loss_weight = loss_weight - self.assigner = assigner - self.nms = nms - # stem - self.stem_cls = nn.LayerList() - self.stem_reg = nn.LayerList() - for i in range(self.stacked_convs): - self.stem_cls.append( - ConvBNLayer( - self.in_channels[i], - feat_channels, - filter_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act=act)) - self.stem_reg.append( - ConvBNLayer( - self.in_channels[i], - feat_channels, - filter_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act=act)) - - self.scales = nn.LayerList( - [ScaleReg() for _ in range(len(fpn_strides))]) - - # prediction - self.pred_cls = nn.Conv2D(feat_channels, self.num_classes, 3, padding=1) - - self.pred_xy = nn.Conv2D(feat_channels, 2, 3, padding=1) - - self.pred_wh = nn.Conv2D(feat_channels, 2, 3, padding=1) - - self.pred_angle = nn.Conv2D(feat_channels, 1, 3, padding=1) - - self._init_weights() - - def _init_weights(self): - for cls_, reg_ in zip(self.stem_cls, self.stem_reg): - normal_(cls_.conv.weight, std=0.01) - normal_(reg_.conv.weight, std=0.01) - - bias_cls = bias_init_with_prob(0.01) - normal_(self.pred_cls.weight, std=0.01) - constant_(self.pred_cls.bias, bias_cls) - normal_(self.pred_xy.weight, std=0.01) - normal_(self.pred_wh.weight, std=0.01) - normal_(self.pred_angle.weight, std=0.01) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _generate_anchors(self, feats): - if self.trt: - anchor_points = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - anchor, _ = anchor_generator( - feat, - stride * 4, - 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], - offset=0.5) - x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) - xc = (x1 + x2 + 1) / 2 - yc = (y1 + y2 + 1) / 2 - anchor_point = paddle.concat( - [xc, yc], axis=-1).reshape((1, h * w, 2)) - anchor_points.append(anchor_point) - anchor_points = paddle.concat(anchor_points, axis=1) - return anchor_points, None, None - else: - anchor_points = [] - stride_tensor = [] - num_anchors_list = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - shift_x = (paddle.arange(end=w) + 0.5) * stride - shift_y = (paddle.arange(end=h) + 0.5) * stride - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([1, -1, 2])) - stride_tensor.append( - paddle.full( - [1, h * w, 1], stride, dtype='float32')) - num_anchors_list.append(h * w) - anchor_points = paddle.concat(anchor_points, axis=1) - stride_tensor = paddle.concat(stride_tensor, axis=1) - return anchor_points, stride_tensor, num_anchors_list - - def forward(self, feats, target=None): - if self.training: - return self.forward_train(feats, target) - else: - return 
self.forward_eval(feats, target) - - def forward_train(self, feats, target=None): - anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( - feats) - cls_pred_list, reg_pred_list = [], [] - for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): - # cls - cls_feat = feat - for cls_layer in self.stem_cls: - cls_feat = cls_layer(cls_feat) - cls_pred = F.sigmoid(self.pred_cls(cls_feat)) - cls_pred_list.append(cls_pred.flatten(2).transpose((0, 2, 1))) - # reg - reg_feat = feat - for reg_layer in self.stem_reg: - reg_feat = reg_layer(reg_feat) - - reg_xy = scale(self.pred_xy(reg_feat)) * stride - reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride - reg_angle = self.pred_angle(reg_feat) - reg_angle = fmod(reg_angle, self.half_pi) - reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) - reg_pred_list.append(reg_pred.flatten(2).transpose((0, 2, 1))) - - cls_pred_list = paddle.concat(cls_pred_list, axis=1) - reg_pred_list = paddle.concat(reg_pred_list, axis=1) - - return self.get_loss([ - cls_pred_list, reg_pred_list, anchor_points, stride_tensor, - num_anchors_list - ], target) - - def forward_eval(self, feats, target=None): - cls_pred_list, reg_pred_list = [], [] - anchor_points, _, _ = self._generate_anchors(feats) - for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): - b, _, h, w = paddle.shape(feat) - # cls - cls_feat = feat - for cls_layer in self.stem_cls: - cls_feat = cls_layer(cls_feat) - cls_pred = F.sigmoid(self.pred_cls(cls_feat)) - cls_pred_list.append(cls_pred.reshape([b, self.num_classes, h * w])) - # reg - reg_feat = feat - for reg_layer in self.stem_reg: - reg_feat = reg_layer(reg_feat) - - reg_xy = scale(self.pred_xy(reg_feat)) * stride - reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) 
* stride - reg_angle = self.pred_angle(reg_feat) - reg_angle = fmod_eval(reg_angle, self.half_pi) - reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) - reg_pred = reg_pred.reshape([b, 5, h * w]).transpose((0, 2, 1)) - reg_pred_list.append(reg_pred) - - cls_pred_list = paddle.concat(cls_pred_list, axis=2) - reg_pred_list = paddle.concat(reg_pred_list, axis=1) - reg_pred_list = self._bbox_decode(anchor_points, reg_pred_list) - return cls_pred_list, reg_pred_list - - def _bbox_decode(self, points, reg_pred_list): - xy, wha = paddle.split(reg_pred_list, [2, 3], axis=-1) - xy = xy + points - return paddle.concat([xy, wha], axis=-1) - - def _box2corners(self, pred_bboxes): - """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) - - Args: - pred_bboxes (Tensor): [B, N, 5] - - Returns: - polys (Tensor): [B, N, 8] - """ - x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) - cos_a_half = paddle.cos(angle) * 0.5 - sin_a_half = paddle.sin(angle) * 0.5 - w_x = cos_a_half * w - w_y = sin_a_half * w - h_x = -sin_a_half * h - h_y = cos_a_half * h - return paddle.concat( - [ - x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, - x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y - ], - axis=-1) - - def get_loss(self, head_outs, gt_meta): - cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list = head_outs - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - gt_rboxes = gt_meta['gt_rbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # decode - pred_rboxes = self._bbox_decode(anchor_points, reg_pred_list) - # label assignment - assigned_labels, assigned_rboxes, assigned_scores = \ - self.assigner( - anchor_points, - stride_tensor, - num_anchors_list, - gt_labels, - gt_bboxes, - gt_rboxes, - pad_gt_mask, - self.num_classes, - pred_rboxes - ) - - # reg_loss - mask_positive = (assigned_labels != self.num_classes) - num_pos = mask_positive.sum().item() - if num_pos > 0: - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) - pred_rboxes_pos = paddle.masked_select(pred_rboxes, - bbox_mask).reshape([-1, 5]) - assigned_rboxes_pos = paddle.masked_select( - assigned_rboxes, bbox_mask).reshape([-1, 5]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).reshape([-1]) - avg_factor = bbox_weight.sum() - loss_probiou = self.probiou_loss(pred_rboxes_pos, - assigned_rboxes_pos) - loss_probiou = paddle.sum(loss_probiou * bbox_weight) / avg_factor - else: - loss_probiou = pred_rboxes.sum() * 0. 
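        # A minimal, self-contained sketch of the quality-focal weighting that
        # `_qfocal_loss` (defined at the end of this class) applies just below;
        # the scores here are made up, but `gamma` and the BCE call mirror that
        # helper:
        #
        #     import paddle
        #     import paddle.nn.functional as F
        #
        #     score = paddle.to_tensor([[0.90, 0.10]])  # predicted quality
        #     label = paddle.to_tensor([[1.00, 0.00]])  # assigned IoU target
        #     weight = (score - label).pow(2.0)         # focal modulator
        #     loss = F.binary_cross_entropy(
        #         score, label, weight=weight, reduction='sum')
        #
        # Predictions already close to their targets get weights near zero, so
        # the gradient concentrates on poorly estimated points.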
- - avg_factor = max(num_pos, 1.0) - # cls_loss - loss_cls = self._qfocal_loss( - cls_pred_list, assigned_scores, reduction='sum') - loss_cls = loss_cls / avg_factor - - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['probiou'] * loss_probiou - out_dict = { - 'loss': loss, - 'loss_probiou': loss_probiou, - 'loss_cls': loss_cls - } - return out_dict - - @staticmethod - def _qfocal_loss(score, label, gamma=2.0, reduction='sum'): - weight = (score - label).pow(gamma) - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction=reduction) - return loss - - def post_process(self, head_outs, scale_factor): - pred_scores, pred_rboxes = head_outs - # [B, N, 5] -> [B, N, 4, 2] -> [B, N, 8] - pred_rboxes = self._box2corners(pred_rboxes) - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ], - axis=-1).reshape([-1, 1, 8]) - pred_rboxes /= scale_factor - bbox_pred, bbox_num, before_nms_indexes = self.nms(pred_rboxes, - pred_scores) - return bbox_pred, bbox_num, before_nms_indexes diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/gfl_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/gfl_head.py deleted file mode 100644 index 040a3f7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/gfl_head.py +++ /dev/null @@ -1,736 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/gfl_head.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.core.workspace import register -from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox -from ppdet.data.transform.atss_assigner import bbox_overlaps - -__all__ = ['GFLHead', 'LDGFLHead'] - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self): - super(ScaleReg, self).__init__() - self.scale_reg = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=1.)), - dtype="float32") - - def forward(self, inputs): - out = inputs * self.scale_reg - return out - - -class Integral(nn.Layer): - """A fixed layer for calculating integral result from distribution. - This layer calculates the target location by :math: `sum{P(y_i) * y_i}`, - P(y_i) denotes the softmax vector that represents the discrete distribution - y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} - Args: - reg_max (int): The maximal value of the discrete set. Default: 16. You - may want to reset it according to your new dataset or related - settings. 
- """ - - def __init__(self, reg_max=16): - super(Integral, self).__init__() - self.reg_max = reg_max - self.register_buffer('project', - paddle.linspace(0, self.reg_max, self.reg_max + 1)) - - def forward(self, x): - """Forward feature from the regression head to get integral result of - bounding box location. - Args: - x (Tensor): Features of the regression head, shape (N, 4*(n+1)), - n is self.reg_max. - Returns: - x (Tensor): Integral result of box locations, i.e., distance - offsets from the box center in four directions, shape (N, 4). - """ - x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1) - x = F.linear(x, self.project) - if self.training: - x = x.reshape([-1, 4]) - return x - - -@register -class DGQP(nn.Layer): - """Distribution-Guided Quality Predictor of GFocal head - Args: - reg_topk (int): top-k statistics of distribution to guide LQE - reg_channels (int): hidden layer unit to generate LQE - add_mean (bool): Whether to calculate the mean of top-k statistics - """ - - def __init__(self, reg_topk=4, reg_channels=64, add_mean=True): - super(DGQP, self).__init__() - self.reg_topk = reg_topk - self.reg_channels = reg_channels - self.add_mean = add_mean - self.total_dim = reg_topk - if add_mean: - self.total_dim += 1 - self.reg_conv1 = self.add_sublayer( - 'dgqp_reg_conv1', - nn.Conv2D( - in_channels=4 * self.total_dim, - out_channels=self.reg_channels, - kernel_size=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - self.reg_conv2 = self.add_sublayer( - 'dgqp_reg_conv2', - nn.Conv2D( - in_channels=self.reg_channels, - out_channels=1, - kernel_size=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - def forward(self, x): - """Forward feature from the regression head to get integral result of - bounding box location. - Args: - x (Tensor): Features of the regression head, shape (N, 4*(n+1)), - n is self.reg_max. - Returns: - x (Tensor): Integral result of box locations, i.e., distance - offsets from the box center in four directions, shape (N, 4). - """ - N, _, H, W = x.shape[:] - prob = F.softmax(x.reshape([N, 4, -1, H, W]), axis=2) - prob_topk, _ = prob.topk(self.reg_topk, axis=2) - if self.add_mean: - stat = paddle.concat( - [prob_topk, prob_topk.mean( - axis=2, keepdim=True)], axis=2) - else: - stat = prob_topk - y = F.relu(self.reg_conv1(stat.reshape([N, 4 * self.total_dim, H, W]))) - y = F.sigmoid(self.reg_conv2(y)) - return y - - -@register -class GFLHead(nn.Layer): - """ - GFLHead - Args: - conv_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of QualityFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - n QFL setting. Default: 16. 
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='QualityFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - super(GFLHead, self).__init__() - self.conv_feat = conv_feat - self.dgqp_module = dgqp_module - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_qfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.use_sigmoid = self.loss_qfl.use_sigmoid - if self.use_sigmoid: - self.cls_out_channels = self.num_classes - else: - self.cls_out_channels = self.num_classes + 1 - - conv_cls_name = "gfl_head_cls" - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - self.gfl_head_cls = self.add_sublayer( - conv_cls_name, - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=self.cls_out_channels, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - - conv_reg_name = "gfl_head_reg" - self.gfl_head_reg = self.add_sublayer( - conv_reg_name, - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=4 * (self.reg_max + 1), - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.scales_regs = [] - for i in range(len(self.fpn_stride)): - lvl = int(math.log(int(self.fpn_stride[i]), 2)) - feat_name = 'p{}_feat'.format(lvl) - scale_reg = self.add_sublayer(feat_name, ScaleReg()) - self.scales_regs.append(scale_reg) - - self.distribution_project = Integral(self.reg_max) - - def forward(self, fpn_feats): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - for stride, scale_reg, fpn_feat in zip(self.fpn_stride, - self.scales_regs, fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) - cls_score = self.gfl_head_cls(conv_cls_feat) - bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) - if self.dgqp_module: - quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - if not self.training: - cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) - b, cell_h, cell_w, _ = paddle.shape(cls_score) - y, x = self.get_single_level_center_point( - [cell_h, cell_w], stride, cell_offset=self.cell_offset) - center_points = paddle.stack([x, y], axis=-1) - cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4]) - - # NOTE: If keep_ratio=False and image shape value that - # multiples of 32, distance2bbox not set max_shapes parameter - # to speed up model prediction. If need to set max_shapes, - # please use inputs['im_shape']. 
- bbox_pred = batch_distance2bbox( - center_points, bbox_pred, max_shapes=None) - - cls_logits_list.append(cls_score) - bboxes_reg_list.append(bbox_pred) - - return (cls_logits_list, bboxes_reg_list) - - def _images_to_levels(self, target, num_level_anchors): - """ - Convert targets by image to targets by feature level. - """ - level_targets = [] - start = 0 - for n in num_level_anchors: - end = start + n - level_targets.append(target[:, start:end].squeeze(0)) - start = end - return level_targets - - def _grid_cells_to_center(self, grid_cells): - """ - Get center location of each gird cell - Args: - grid_cells: grid cells of a feature map - Returns: - center points - """ - cells_cx = (grid_cells[:, 2] + grid_cells[:, 0]) / 2 - cells_cy = (grid_cells[:, 3] + grid_cells[:, 1]) / 2 - return paddle.stack([cells_cx, cells_cy], axis=-1) - - def get_loss(self, gfl_head_outs, gt_meta): - cls_logits, bboxes_reg = gfl_head_outs - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits - ] - grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], - num_level_anchors) - labels_list = self._images_to_levels(gt_meta['labels'], - num_level_anchors) - label_weights_list = self._images_to_levels(gt_meta['label_weights'], - num_level_anchors) - bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], - num_level_anchors) - num_total_pos = sum(gt_meta['pos_num']) - try: - paddle.distributed.all_reduce(num_total_pos) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1) - except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] - for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride in zip( - cls_logits, bboxes_reg, grid_cells_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride): - grid_cells = grid_cells.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - label_weights = label_weights.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - score = np.zeros(labels.shape) - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) - pos_grid_cell_centers = self._grid_cells_to_center( - pos_grid_cells) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - score[pos_inds.numpy()] = bbox_iou - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_grid_cell_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * 
weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - weight_targets = paddle.to_tensor([0], dtype='float32') - - # qfl loss - score = paddle.to_tensor(score) - loss_qfl = self.loss_qfl( - cls_score, (labels, score), - weight=label_weights, - avg_factor=num_total_pos) - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_qfl_list.append(loss_qfl) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) - try: - paddle.distributed.all_reduce(avg_factor) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - if avg_factor <= 0: - loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_qfl = sum(loss_qfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - - loss_states = dict( - loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states - - def get_single_level_center_point(self, featmap_size, stride, - cell_offset=0): - """ - Generate pixel centers of a single stage feature map. - Args: - featmap_size: height and width of the feature map - stride: down sample stride of the feature map - Returns: - y and x of the center points - """ - h, w = featmap_size - x_range = (paddle.arange(w, dtype='float32') + cell_offset) * stride - y_range = (paddle.arange(h, dtype='float32') + cell_offset) * stride - y, x = paddle.meshgrid(y_range, x_range) - y = y.flatten() - x = x.flatten() - return y, x - - def post_process(self, gfl_head_outs, im_shape, scale_factor): - cls_scores, bboxes_reg = gfl_head_outs - bboxes = paddle.concat(bboxes_reg, axis=1) - # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] - im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) - bboxes /= im_scale - mlvl_scores = paddle.concat(cls_scores, axis=1) - mlvl_scores = mlvl_scores.transpose([0, 2, 1]) - bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores) - return bbox_pred, bbox_num - - -@register -class LDGFLHead(GFLHead): - """ - GFLHead for LD distill - Args: - conv_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of QualityFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - n QFL setting. Default: 16. 
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'loss_ld', 'loss_ld_vlr', 'loss_kd', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='QualityFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - loss_ld='KnowledgeDistillationKLDivLoss', - loss_ld_vlr='KnowledgeDistillationKLDivLoss', - loss_kd='KnowledgeDistillationKLDivLoss', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - - super(LDGFLHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.loss_ld = loss_ld - self.loss_kd = loss_kd - self.loss_ld_vlr = loss_ld_vlr - - def forward(self, fpn_feats): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - for stride, scale_reg, fpn_feat in zip(self.fpn_stride, - self.scales_regs, fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) - cls_score = self.gfl_head_cls(conv_cls_feat) - bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) - - if self.dgqp_module: - quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - if not self.training: - cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) - b, cell_h, cell_w, _ = paddle.shape(cls_score) - y, x = self.get_single_level_center_point( - [cell_h, cell_w], stride, cell_offset=self.cell_offset) - center_points = paddle.stack([x, y], axis=-1) - cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) - - # NOTE: If keep_ratio=False and image shape value that - # multiples of 32, distance2bbox not set max_shapes parameter - # to speed up model prediction. If need to set max_shapes, - # please use inputs['im_shape']. - bbox_pred = batch_distance2bbox( - center_points, bbox_pred, max_shapes=None) - - cls_logits_list.append(cls_score) - bboxes_reg_list.append(bbox_pred) - - return (cls_logits_list, bboxes_reg_list) - - def get_loss(self, gfl_head_outs, gt_meta, soft_label_list, - soft_targets_list): - cls_logits, bboxes_reg = gfl_head_outs - - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits - ] - - grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], - num_level_anchors) - - labels_list = self._images_to_levels(gt_meta['labels'], - num_level_anchors) - - label_weights_list = self._images_to_levels(gt_meta['label_weights'], - num_level_anchors) - bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], - num_level_anchors) - # vlr regions - vlr_regions_list = self._images_to_levels(gt_meta['vlr_regions'], - num_level_anchors) - - num_total_pos = sum(gt_meta['pos_num']) - try: - paddle.distributed.all_reduce(num_total_pos) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1.) 
- except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_qfl_list, loss_ld_list, avg_factor = [], [], [], [], [] - loss_ld_vlr_list, loss_kd_list = [], [] - - for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride, soft_targets,\ - soft_label, vlr_region in zip( - cls_logits, bboxes_reg, grid_cells_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride, soft_targets_list, - soft_label_list, vlr_regions_list): - - grid_cells = grid_cells.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - - soft_targets = soft_targets.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - - soft_label = soft_label.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - - # feture im - # teacher_x = teacher_x.transpose([0, 2, 3, 1]).reshape([-1, 256]) - # x = x.transpose([0, 2, 3, 1]).reshape([-1, 256]) - - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - label_weights = label_weights.reshape([-1]) - - vlr_region = vlr_region.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - score = np.zeros(labels.shape) - - remain_inds = (vlr_region > 0).nonzero() - - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) - - pos_grid_cell_centers = self._grid_cells_to_center( - pos_grid_cells) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - score[pos_inds.numpy()] = bbox_iou - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - - pos_soft_targets = paddle.gather(soft_targets, pos_inds, axis=0) - soft_corners = pos_soft_targets.reshape([-1, self.reg_max + 1]) - - target_corners = bbox2distance(pos_grid_cell_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - - # ld loss - loss_ld = self.loss_ld( - pred_corners, - soft_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - - loss_kd = self.loss_kd( - paddle.gather( - cls_score, pos_inds, axis=0), - paddle.gather( - soft_label, pos_inds, axis=0), - weight=paddle.gather( - label_weights, pos_inds, axis=0), - avg_factor=pos_inds.shape[0]) - - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - loss_ld = bbox_pred.sum() * 0 - loss_kd = bbox_pred.sum() * 0 - weight_targets = paddle.to_tensor([0], dtype='float32') - - if len(remain_inds) > 0: - neg_pred_corners = bbox_pred[remain_inds].reshape( - [-1, self.reg_max + 1]) - neg_soft_corners = 
soft_targets[remain_inds].reshape( - [-1, self.reg_max + 1]) - - remain_targets = vlr_region[remain_inds] - - loss_ld_vlr = self.loss_ld_vlr( - neg_pred_corners, - neg_soft_corners, - weight=remain_targets.expand([-1, 4]).reshape([-1]), - avg_factor=16.0) - else: - loss_ld_vlr = bbox_pred.sum() * 0 - - # qfl loss - score = paddle.to_tensor(score) - loss_qfl = self.loss_qfl( - cls_score, (labels, score), - weight=label_weights, - avg_factor=num_total_pos) - - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_qfl_list.append(loss_qfl) - loss_ld_list.append(loss_ld) - loss_ld_vlr_list.append(loss_ld_vlr) - loss_kd_list.append(loss_kd) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) # + 1e-6 - try: - paddle.distributed.all_reduce(avg_factor) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - - if avg_factor <= 0: - loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_ld = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_ld_vlr = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_kd = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_qfl = sum(loss_qfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - loss_ld = sum(loss_ld_list) - loss_ld_vlr = sum(loss_ld_vlr_list) - loss_kd = sum(loss_kd_list) - - loss_states = dict( - loss_qfl=loss_qfl, - loss_bbox=loss_bbox, - loss_dfl=loss_dfl, - loss_ld=loss_ld, - loss_ld_vlr=loss_ld_vlr, - loss_kd=loss_kd) - - return loss_states diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/keypoint_hrhrnet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/keypoint_hrhrnet_head.py deleted file mode 100644 index 869b181..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/keypoint_hrhrnet_head.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register -from .. 
import layers as L -from ..backbones.hrnet import BasicBlock - - -@register -class HrHRNetHead(nn.Layer): - __inject__ = ['loss'] - - def __init__(self, num_joints, loss='HrHRNetLoss', swahr=False, width=32): - """ - Head for HigherHRNet network - - Args: - num_joints (int): number of keypoints - hrloss (object): HrHRNetLoss instance - swahr (bool): whether to use swahr - width (int): hrnet channel width - """ - super(HrHRNetHead, self).__init__() - self.loss = loss - - self.num_joints = num_joints - num_featout1 = num_joints * 2 - num_featout2 = num_joints - self.swahr = swahr - self.conv1 = L.Conv2d(width, num_featout1, 1, 1, 0, bias=True) - self.conv2 = L.Conv2d(width, num_featout2, 1, 1, 0, bias=True) - self.deconv = nn.Sequential( - L.ConvTranspose2d( - num_featout1 + width, width, 4, 2, 1, 0, bias=False), - L.BatchNorm2d(width), - L.ReLU()) - self.blocks = nn.Sequential(*(BasicBlock( - num_channels=width, - num_filters=width, - has_se=False, - freeze_norm=False, - name='HrHRNetHead_{}'.format(i)) for i in range(4))) - - self.interpolate = L.Upsample(2, mode='bilinear') - self.concat = L.Concat(dim=1) - if swahr: - self.scalelayer0 = nn.Sequential( - L.Conv2d( - width, num_joints, 1, 1, 0, bias=True), - L.BatchNorm2d(num_joints), - L.ReLU(), - L.Conv2d( - num_joints, - num_joints, - 9, - 1, - 4, - groups=num_joints, - bias=True)) - self.scalelayer1 = nn.Sequential( - L.Conv2d( - width, num_joints, 1, 1, 0, bias=True), - L.BatchNorm2d(num_joints), - L.ReLU(), - L.Conv2d( - num_joints, - num_joints, - 9, - 1, - 4, - groups=num_joints, - bias=True)) - - def forward(self, feats, targets=None): - x1 = feats[0] - xo1 = self.conv1(x1) - x2 = self.blocks(self.deconv(self.concat((x1, xo1)))) - xo2 = self.conv2(x2) - num_joints = self.num_joints - if self.training: - heatmap1, tagmap = paddle.split(xo1, 2, axis=1) - if self.swahr: - so1 = self.scalelayer0(x1) - so2 = self.scalelayer1(x2) - hrhrnet_outputs = ([heatmap1, so1], [xo2, so2], tagmap) - return self.loss(hrhrnet_outputs, targets) - else: - hrhrnet_outputs = (heatmap1, xo2, tagmap) - return self.loss(hrhrnet_outputs, targets) - - # averaged heatmap, upsampled tagmap - upsampled = self.interpolate(xo1) - avg = (upsampled[:, :num_joints] + xo2[:, :num_joints]) / 2 - return avg, upsampled[:, num_joints:] diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/mask_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/mask_head.py deleted file mode 100644 index 403d4ce..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/mask_head.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
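A note on the deleted mask_head.py below: MaskHead predicts one mask channel per class and, in get_loss, selects each RoI's ground-truth-class channel with a one-hot/gather_nd trick. A minimal sketch of that selection, using hypothetical shapes (N RoIs, C classes, M×M masks) and random tensors purely for illustration:

    import paddle
    import paddle.nn.functional as F

    N, C, M = 4, 80, 28
    mask_logits = paddle.rand([N, C, M, M])   # per-class mask predictions
    mask_label = paddle.randint(0, C, [N])    # assigned class id per RoI

    # Broadcast a one-hot class indicator over the spatial dims, then gather
    # the matching logits; the flat result reshapes to one map per RoI.
    one_hot = F.one_hot(mask_label, C).unsqueeze([2, 3])
    one_hot = paddle.expand_as(one_hot, mask_logits)
    picked = paddle.gather_nd(mask_logits, paddle.nonzero(one_hot))
    picked = picked.reshape([N, M, M])        # class-specific mask per RoI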
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import KaimingNormal - -from ppdet.core.workspace import register, create -from ppdet.modeling.layers import ConvNormLayer -from .roi_extractor import RoIAlign -from ..cls_utils import _get_class_default_kwargs - - -@register -class MaskFeat(nn.Layer): - """ - Feature extraction in Mask head - - Args: - in_channel (int): Input channels - out_channel (int): Output channels - num_convs (int): The number of conv layers, default 4 - norm_type (string | None): Norm type, bn, gn, sync_bn are available, - default None - """ - - def __init__(self, - in_channel=256, - out_channel=256, - num_convs=4, - norm_type=None): - super(MaskFeat, self).__init__() - self.num_convs = num_convs - self.in_channel = in_channel - self.out_channel = out_channel - self.norm_type = norm_type - fan_conv = out_channel * 3 * 3 - fan_deconv = out_channel * 2 * 2 - - mask_conv = nn.Sequential() - if norm_type == 'gn': - for i in range(self.num_convs): - conv_name = 'mask_inter_feat_{}'.format(i + 1) - mask_conv.add_sublayer( - conv_name, - ConvNormLayer( - ch_in=in_channel if i == 0 else out_channel, - ch_out=out_channel, - filter_size=3, - stride=1, - norm_type=self.norm_type, - initializer=KaimingNormal(fan_in=fan_conv), - skip_quant=True)) - mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) - else: - for i in range(self.num_convs): - conv_name = 'mask_inter_feat_{}'.format(i + 1) - conv = nn.Conv2D( - in_channels=in_channel if i == 0 else out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=paddle.ParamAttr( - initializer=KaimingNormal(fan_in=fan_conv))) - conv.skip_quant = True - mask_conv.add_sublayer(conv_name, conv) - mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) - mask_conv.add_sublayer( - 'conv5_mask', - nn.Conv2DTranspose( - in_channels=self.out_channel if num_convs > 0 else self.in_channel, - out_channels=self.out_channel, - kernel_size=2, - stride=2, - weight_attr=paddle.ParamAttr( - initializer=KaimingNormal(fan_in=fan_deconv)))) - mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU()) - self.upsample = mask_conv - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channel': input_shape.channels, } - - def out_channels(self): - return self.out_channel - - def forward(self, feats): - return self.upsample(feats) - - -@register -class MaskHead(nn.Layer): - __shared__ = ['num_classes', 'export_onnx'] - __inject__ = ['mask_assigner'] - """ - RCNN mask head - - Args: - head (nn.Layer): Extract feature in mask head - roi_extractor (object): The module of RoI Extractor - mask_assigner (object): The module of Mask Assigner, - label and sample the mask - num_classes (int): The number of classes - share_bbox_feat (bool): Whether to share the feature from bbox head, - default false - """ - - def __init__(self, - head, - roi_extractor=_get_class_default_kwargs(RoIAlign), - mask_assigner='MaskAssigner', - num_classes=80, - share_bbox_feat=False, - export_onnx=False): - super(MaskHead, self).__init__() - self.num_classes = num_classes - self.export_onnx = export_onnx - - self.roi_extractor = roi_extractor - if isinstance(roi_extractor, dict): - self.roi_extractor = RoIAlign(**roi_extractor) - self.head = head - self.in_channels = head.out_channels() - self.mask_assigner = mask_assigner - self.share_bbox_feat = share_bbox_feat - self.bbox_head = None - - self.mask_fcn_logits = nn.Conv2D( 
- in_channels=self.in_channels, - out_channels=self.num_classes, - kernel_size=1, - weight_attr=paddle.ParamAttr(initializer=KaimingNormal( - fan_in=self.num_classes))) - self.mask_fcn_logits.skip_quant = True - - @classmethod - def from_config(cls, cfg, input_shape): - roi_pooler = cfg['roi_extractor'] - assert isinstance(roi_pooler, dict) - kwargs = RoIAlign.from_config(cfg, input_shape) - roi_pooler.update(kwargs) - kwargs = {'input_shape': input_shape} - head = create(cfg['head'], **kwargs) - return { - 'roi_extractor': roi_pooler, - 'head': head, - } - - def get_loss(self, mask_logits, mask_label, mask_target, mask_weight): - mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3]) - mask_label = paddle.expand_as(mask_label, mask_logits) - mask_label.stop_gradient = True - mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) - shape = mask_logits.shape - mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]]) - - mask_target = mask_target.cast('float32') - mask_weight = mask_weight.unsqueeze([1, 2]) - loss_mask = F.binary_cross_entropy_with_logits( - mask_pred, mask_target, weight=mask_weight, reduction="mean") - return loss_mask - - def forward_train(self, body_feats, rois, rois_num, inputs, targets, - bbox_feat): - """ - body_feats (list[Tensor]): Multi-level backbone features - rois (list[Tensor]): Proposals for each batch with shape [N, 4] - rois_num (Tensor): The number of proposals for each batch - inputs (dict): ground truth info - """ - tgt_labels, _, tgt_gt_inds = targets - rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner( - rois, tgt_labels, tgt_gt_inds, inputs) - - if self.share_bbox_feat: - rois_feat = paddle.gather(bbox_feat, mask_index) - else: - rois_feat = self.roi_extractor(body_feats, rois, rois_num) - mask_feat = self.head(rois_feat) - mask_logits = self.mask_fcn_logits(mask_feat) - - loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks, - tgt_weights) - return {'loss_mask': loss_mask} - - def forward_test(self, - body_feats, - rois, - rois_num, - scale_factor, - feat_func=None): - """ - body_feats (list[Tensor]): Multi-level backbone features - rois (Tensor): Prediction from bbox head with shape [N, 6] - rois_num (Tensor): The number of prediction for each batch - scale_factor (Tensor): The scale factor from origin size to input size - """ - if not self.export_onnx and rois.shape[0] == 0: - mask_out = paddle.full([1, 1, 1], -1) - else: - bbox = [rois[:, 2:]] - labels = rois[:, 0].cast('int32') - rois_feat = self.roi_extractor(body_feats, bbox, rois_num) - if self.share_bbox_feat: - assert feat_func is not None - rois_feat = feat_func(rois_feat) - - mask_feat = self.head(rois_feat) - mask_logit = self.mask_fcn_logits(mask_feat) - if self.num_classes == 1: - mask_out = F.sigmoid(mask_logit)[:, 0, :, :] - else: - num_masks = paddle.shape(mask_logit)[0] - index = paddle.arange(num_masks).cast('int32') - mask_out = mask_logit[index, labels] - mask_out_shape = paddle.shape(mask_out) - mask_out = paddle.reshape(mask_out, [ - paddle.shape(index), mask_out_shape[-2], mask_out_shape[-1] - ]) - mask_out = F.sigmoid(mask_out) - return mask_out - - def forward(self, - body_feats, - rois, - rois_num, - inputs, - targets=None, - bbox_feat=None, - feat_func=None): - if self.training: - return self.forward_train(body_feats, rois, rois_num, inputs, - targets, bbox_feat) - else: - im_scale = inputs['scale_factor'] - return self.forward_test(body_feats, rois, rois_num, im_scale, - feat_func) diff 
--git a/pdfdet/models/Paddle/ppdet/modeling/heads/petr_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/petr_head.py deleted file mode 100644 index 90760c6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/petr_head.py +++ /dev/null @@ -1,1161 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py -""" -import copy -import numpy as np - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -import paddle.distributed as dist - -from ..transformers.petr_transformer import inverse_sigmoid, masked_fill -from ..initializer import constant_, normal_ - -__all__ = ["PETRHead"] - -from functools import partial - - -def bias_init_with_prob(prior_prob: float) -> float: - """initialize conv/fc bias value according to a given probability value.""" - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - return bias_init - - -def multi_apply(func, *args, **kwargs): - """Apply function to a list of arguments. - - Note: - This function applies the ``func`` to multiple inputs and - map the multiple outputs of the ``func`` into different - list. Each list contains the same type of outputs corresponding - to different inputs. - - Args: - func (Function): A function that will be applied to a list of - arguments - - Returns: - tuple(list): A tuple containing multiple list, each list contains \ - a kind of returned results by the function - """ - pfunc = partial(func, **kwargs) if kwargs else func - map_results = map(pfunc, *args) - res = tuple(map(list, zip(*map_results))) - return res - - -def reduce_mean(tensor): - """"Obtain the mean of tensor on different GPUs.""" - if not (dist.get_world_size() and dist.is_initialized()): - return tensor - tensor = tensor.clone() - dist.all_reduce( - tensor.divide( - paddle.to_tensor( - dist.get_world_size(), dtype='float32')), - op=dist.ReduceOp.SUM) - return tensor - - -def gaussian_radius(det_size, min_overlap=0.7): - """calculate gaussian radius according to object size. - """ - height, width = det_size - - a1 = 1 - b1 = (height + width) - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = paddle.sqrt(b1**2 - 4 * a1 * c1) - r1 = (b1 + sq1) / 2 - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = paddle.sqrt(b2**2 - 4 * a2 * c2) - r2 = (b2 + sq2) / 2 - - a3 = 4 * min_overlap - b3 = -2 * min_overlap * (height + width) - c3 = (min_overlap - 1) * width * height - sq3 = paddle.sqrt(b3**2 - 4 * a3 * c3) - r3 = (b3 + sq3) / 2 - return min(r1, r2, r3) - - -def gaussian2D(shape, sigma=1): - m, n = [(ss - 1.) / 2. 
for ss in shape] - y = paddle.arange(-m, m + 1, dtype="float32")[:, None] - x = paddle.arange(-n, n + 1, dtype="float32")[None, :] - # y, x = np.ogrid[-m:m + 1, -n:n + 1] - - h = paddle.exp(-(x * x + y * y) / (2 * sigma * sigma)) - h[h < np.finfo(np.float32).eps * h.max()] = 0 - return h - - -def draw_umich_gaussian(heatmap, center, radius, k=1): - diameter = 2 * radius + 1 - gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) - gaussian = paddle.to_tensor(gaussian, dtype=heatmap.dtype) - - x, y = int(center[0]), int(center[1]) - radius = int(radius) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - # assert masked_gaussian.equal(1).float().sum() == 1 - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - heatmap[y - top:y + bottom, x - left:x + right] = paddle.maximum( - masked_heatmap, masked_gaussian * k) - return heatmap - - -@register -class PETRHead(nn.Layer): - """Head of `End-to-End Multi-Person Pose Estimation with Transformers`. - - Args: - num_classes (int): Number of categories excluding the background. - in_channels (int): Number of channels in the input feature map. - num_query (int): Number of query in Transformer. - num_kpt_fcs (int, optional): Number of fully-connected layers used in - `FFN`, which is then used for the keypoint regression head. - Default 2. - transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for - building the Encoder and Decoder. Default: None. - sync_cls_avg_factor (bool): Whether to sync the avg_factor of - all ranks. Default to False. - positional_encoding (obj:`mmcv.ConfigDict`|dict): - Config for position encoding. - loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the - classification loss. Default `CrossEntropyLoss`. - loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the - regression loss. Default `L1Loss`. - loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the - regression oks loss. Default `OKSLoss`. - loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the - regression heatmap loss. Default `NegLoss`. - as_two_stage (bool) : Whether to generate the proposal from - the outputs of encoder. - with_kpt_refine (bool): Whether to refine the reference points - in the decoder. Defaults to True. - test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of - transformer head. - init_cfg (dict or list[dict], optional): Initialization config dict. - Default: None. - """ - __inject__ = [ - "transformer", "positional_encoding", "assigner", "sampler", "loss_cls", - "loss_kpt", "loss_oks", "loss_hm", "loss_kpt_rpn", "loss_kpt_refine", - "loss_oks_refine" - ] - - def __init__(self, - num_classes, - in_channels, - num_query=100, - num_kpt_fcs=2, - num_keypoints=17, - transformer=None, - sync_cls_avg_factor=True, - positional_encoding='SinePositionalEncoding', - loss_cls='FocalLoss', - loss_kpt='L1Loss', - loss_oks='OKSLoss', - loss_hm='CenterFocalLoss', - with_kpt_refine=True, - assigner='PoseHungarianAssigner', - sampler='PseudoSampler', - loss_kpt_rpn='L1Loss', - loss_kpt_refine='L1Loss', - loss_oks_refine='opera.OKSLoss', - test_cfg=dict(max_per_img=100), - init_cfg=None, - **kwargs): - # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, - # since it brings inconvenience when the initialization of - # `AnchorFreeHead` is called. 
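        # For intuition: bias_init_with_prob (defined above) solves
        # sigmoid(b) = p for the bias b, i.e. b = -log((1 - p) / p); with
        # p = 0.01 that gives b ≈ -4.6, so freshly initialized classification
        # logits match the rare-positive prior instead of 0.5 and early
        # training is not dominated by the many easy negatives.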
- super().__init__() - self.bg_cls_weight = 0 - self.sync_cls_avg_factor = sync_cls_avg_factor - self.assigner = assigner - self.sampler = sampler - self.num_query = num_query - self.num_classes = num_classes - self.in_channels = in_channels - self.num_kpt_fcs = num_kpt_fcs - self.test_cfg = test_cfg - self.fp16_enabled = False - self.as_two_stage = transformer.as_two_stage - self.with_kpt_refine = with_kpt_refine - self.num_keypoints = num_keypoints - self.loss_cls = loss_cls - self.loss_kpt = loss_kpt - self.loss_kpt_rpn = loss_kpt_rpn - self.loss_kpt_refine = loss_kpt_refine - self.loss_oks = loss_oks - self.loss_oks_refine = loss_oks_refine - self.loss_hm = loss_hm - if self.loss_cls.use_sigmoid: - self.cls_out_channels = num_classes - else: - self.cls_out_channels = num_classes + 1 - self.positional_encoding = positional_encoding - self.transformer = transformer - self.embed_dims = self.transformer.embed_dims - # assert 'num_feats' in positional_encoding - num_feats = positional_encoding.num_pos_feats - assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ - f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ - f' and {num_feats}.' - self._init_layers() - self.init_weights() - - def _init_layers(self): - """Initialize classification branch and keypoint branch of head.""" - - fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels) - - kpt_branch = [] - kpt_branch.append(nn.Linear(self.embed_dims, 512)) - kpt_branch.append(nn.ReLU()) - for _ in range(self.num_kpt_fcs): - kpt_branch.append(nn.Linear(512, 512)) - kpt_branch.append(nn.ReLU()) - kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints)) - kpt_branch = nn.Sequential(*kpt_branch) - - def _get_clones(module, N): - return nn.LayerList([copy.deepcopy(module) for i in range(N)]) - - # last kpt_branch is used to generate proposal from - # encode feature map when as_two_stage is True. 
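# A worked example of the branch-count logic below (a reading aid only,
# assuming a typical 3-layer decoder): with as_two_stage=True,
# num_pred = 3 + 1 = 4, i.e. one cls/kpt branch per decoder layer plus one
# extra branch applied to the encoder feature map to score its proposals;
# with as_two_stage=False, num_pred stays at 3.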
- num_pred = (self.transformer.decoder.num_layers + 1) if \ - self.as_two_stage else self.transformer.decoder.num_layers - - if self.with_kpt_refine: - self.cls_branches = _get_clones(fc_cls, num_pred) - self.kpt_branches = _get_clones(kpt_branch, num_pred) - else: - self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)]) - self.kpt_branches = nn.LayerList( - [kpt_branch for _ in range(num_pred)]) - - self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2) - - refine_kpt_branch = [] - for _ in range(self.num_kpt_fcs): - refine_kpt_branch.append( - nn.Linear(self.embed_dims, self.embed_dims)) - refine_kpt_branch.append(nn.ReLU()) - refine_kpt_branch.append(nn.Linear(self.embed_dims, 2)) - refine_kpt_branch = nn.Sequential(*refine_kpt_branch) - if self.with_kpt_refine: - num_pred = self.transformer.refine_decoder.num_layers - self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred) - self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints) - - def init_weights(self): - """Initialize weights of the PETR head.""" - self.transformer.init_weights() - if self.loss_cls.use_sigmoid: - bias_init = bias_init_with_prob(0.01) - for m in self.cls_branches: - constant_(m.bias, bias_init) - for m in self.kpt_branches: - constant_(m[-1].bias, 0) - # initialization of keypoint refinement branch - if self.with_kpt_refine: - for m in self.refine_kpt_branches: - constant_(m[-1].bias, 0) - # initialize bias for heatmap prediction - bias_init = bias_init_with_prob(0.1) - normal_(self.fc_hm.weight, std=0.01) - constant_(self.fc_hm.bias, bias_init) - - def forward(self, mlvl_feats, img_metas): - """Forward function. - - Args: - mlvl_feats (tuple[Tensor]): Features from the upstream - network, each is a 4D-tensor with shape - (N, C, H, W). - img_metas (list[dict]): List of image information. - - Returns: - outputs_classes (Tensor): Outputs from the classification head, - shape [nb_dec, bs, num_query, cls_out_channels]. Note - cls_out_channels should include background. - outputs_kpts (Tensor): Sigmoid outputs from the regression - head with normalized coordinate format (cx, cy, w, h). - Shape [nb_dec, bs, num_query, K*2]. - enc_outputs_class (Tensor): The score of each point on encode - feature map, has shape (N, h*w, num_class). Only when - as_two_stage is True it would be returned, otherwise - `None` would be returned. - enc_outputs_kpt (Tensor): The proposal generated from the - encode feature map, has shape (N, h*w, K*2). Only when - as_two_stage is True it would be returned, otherwise - `None` would be returned.
- """ - - batch_size = mlvl_feats[0].shape[0] - input_img_h, input_img_w = img_metas[0]['batch_input_shape'] - img_masks = paddle.zeros( - (batch_size, input_img_h, input_img_w), dtype=mlvl_feats[0].dtype) - for img_id in range(batch_size): - img_h, img_w, _ = img_metas[img_id]['img_shape'] - img_masks[img_id, :img_h, :img_w] = 1 - - mlvl_masks = [] - mlvl_positional_encodings = [] - for feat in mlvl_feats: - mlvl_masks.append( - F.interpolate( - img_masks[None], size=feat.shape[-2:]).squeeze(0)) - mlvl_positional_encodings.append( - self.positional_encoding(mlvl_masks[-1]).transpose( - [0, 3, 1, 2])) - - query_embeds = self.query_embedding.weight - hs, init_reference, inter_references, \ - enc_outputs_class, enc_outputs_kpt, hm_proto, memory = \ - self.transformer( - mlvl_feats, - mlvl_masks, - query_embeds, - mlvl_positional_encodings, - kpt_branches=self.kpt_branches \ - if self.with_kpt_refine else None, # noqa:E501 - cls_branches=self.cls_branches \ - if self.as_two_stage else None # noqa:E501 - ) - - outputs_classes = [] - outputs_kpts = [] - - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp_kpt = self.kpt_branches[lvl](hs[lvl]) - assert reference.shape[-1] == self.num_keypoints * 2 - tmp_kpt += reference - outputs_kpt = F.sigmoid(tmp_kpt) - outputs_classes.append(outputs_class) - outputs_kpts.append(outputs_kpt) - - outputs_classes = paddle.stack(outputs_classes) - outputs_kpts = paddle.stack(outputs_kpts) - - if hm_proto is not None: - # get heatmap prediction (training phase) - hm_memory, hm_mask = hm_proto - hm_pred = self.fc_hm(hm_memory) - hm_proto = (hm_pred.transpose((0, 3, 1, 2)), hm_mask) - - if self.as_two_stage: - return outputs_classes, outputs_kpts, \ - enc_outputs_class, F.sigmoid(enc_outputs_kpt), \ - hm_proto, memory, mlvl_masks - else: - raise RuntimeError('only "as_two_stage=True" is supported.') - - def forward_refine(self, memory, mlvl_masks, refine_targets, losses, - img_metas): - """Forward function. - - Args: - mlvl_masks (tuple[Tensor]): The key_padding_mask from - different level used for encoder and decoder, - each is a 3D-tensor with shape (bs, H, W). - losses (dict[str, Tensor]): A dictionary of loss components. - img_metas (list[dict]): List of image information. - - Returns: - dict[str, Tensor]: A dictionary of loss components. 
- """ - kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets - pos_inds = kpt_weights.sum(-1) > 0 - if not pos_inds.any(): - pos_kpt_preds = paddle.zeros_like(kpt_preds[:1]) - pos_img_inds = paddle.zeros([1], dtype="int64") - else: - pos_kpt_preds = kpt_preds[pos_inds] - pos_img_inds = (pos_inds.nonzero() / - self.num_query).squeeze(1).astype("int64") - hs, init_reference, inter_references = self.transformer.forward_refine( - mlvl_masks, - memory, - pos_kpt_preds.detach(), - pos_img_inds, - kpt_branches=self.refine_kpt_branches - if self.with_kpt_refine else None, # noqa:E501 - ) - - outputs_kpts = [] - - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl]) - assert reference.shape[-1] == 2 - tmp_kpt += reference - outputs_kpt = F.sigmoid(tmp_kpt) - outputs_kpts.append(outputs_kpt) - outputs_kpts = paddle.stack(outputs_kpts) - - if not self.training: - return outputs_kpts - - num_valid_kpt = paddle.clip( - reduce_mean(kpt_weights.sum()), min=1).item() - num_total_pos = paddle.to_tensor( - [outputs_kpts.shape[1]], dtype=kpt_weights.dtype) - num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() - - if not pos_inds.any(): - for i, kpt_refine_preds in enumerate(outputs_kpts): - loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 - losses[f'd{i}.loss_kpt_refine'] = loss_kpt - losses[f'd{i}.loss_oks_refine'] = loss_oks - continue - return losses - - batch_size = mlvl_masks[0].shape[0] - factors = [] - for img_id in range(batch_size): - img_h, img_w, _ = img_metas[img_id]['img_shape'] - factor = paddle.to_tensor( - [img_w, img_h, img_w, img_h], - dtype="float32").squeeze(-1).unsqueeze(0).tile( - (self.num_query, 1)) - factors.append(factor) - factors = paddle.concat(factors, 0) - factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2)) - - pos_kpt_weights = kpt_weights[pos_inds] - pos_kpt_targets = kpt_targets[pos_inds] - pos_kpt_targets_scaled = pos_kpt_targets * factors - pos_areas = area_targets[pos_inds] - pos_valid = kpt_weights[pos_inds][:, 0::2] - for i, kpt_refine_preds in enumerate(outputs_kpts): - if not pos_inds.any(): - print("refine kpt and oks skip") - loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 - losses[f'd{i}.loss_kpt_refine'] = loss_kpt - losses[f'd{i}.loss_oks_refine'] = loss_oks - continue - - # kpt L1 Loss - pos_refine_preds = kpt_refine_preds.reshape( - (kpt_refine_preds.shape[0], -1)) - loss_kpt = self.loss_kpt_refine( - pos_refine_preds, - pos_kpt_targets, - pos_kpt_weights, - avg_factor=num_valid_kpt) - losses[f'd{i}.loss_kpt_refine'] = loss_kpt - # kpt oks loss - pos_refine_preds_scaled = pos_refine_preds * factors - assert (pos_areas > 0).all() - loss_oks = self.loss_oks_refine( - pos_refine_preds_scaled, - pos_kpt_targets_scaled, - pos_valid, - pos_areas, - avg_factor=num_total_pos) - losses[f'd{i}.loss_oks_refine'] = loss_oks - return losses - - # over-write because img_metas are needed as inputs for bbox_head. - def forward_train(self, - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_keypoints=None, - gt_areas=None, - gt_bboxes_ignore=None, - proposal_cfg=None, - **kwargs): - """Forward function for training mode. - - Args: - x (list[Tensor]): Features from backbone. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes (list[Tensor]): Ground truth bboxes of the image, - shape (num_gts, 4). 
- gt_labels (list[Tensor]): Ground truth labels of each box, - shape (num_gts,). - gt_keypoints (list[Tensor]): Ground truth keypoints of the image, - shape (num_gts, K*3). - gt_areas (list[Tensor]): Ground truth mask areas of each box, - shape (num_gts,). - gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be - ignored, shape (num_ignored_gts, 4). - proposal_cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert proposal_cfg is None, '"proposal_cfg" must be None' - outs = self(x, img_metas) - memory, mlvl_masks = outs[-2:] - outs = outs[:-2] - if gt_labels is None: - loss_inputs = outs + (gt_bboxes, gt_keypoints, gt_areas, img_metas) - else: - loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypoints, gt_areas, - img_metas) - losses_and_targets = self.loss( - *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - # losses = losses_and_targets - losses, refine_targets = losses_and_targets - # get pose refinement loss - losses = self.forward_refine(memory, mlvl_masks, refine_targets, losses, - img_metas) - return losses - - def loss(self, - all_cls_scores, - all_kpt_preds, - enc_cls_scores, - enc_kpt_preds, - enc_hm_proto, - gt_bboxes_list, - gt_labels_list, - gt_keypoints_list, - gt_areas_list, - img_metas, - gt_bboxes_ignore=None): - """Loss function. - - Args: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_kpt_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (x_{i}, y_{i}) and shape - [nb_dec, bs, num_query, K*2]. - enc_cls_scores (Tensor): Classification scores of - points on encode feature map, has shape - (N, h*w, num_classes). Only be passed when as_two_stage is - True, otherwise is None. - enc_kpt_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, K*2). Only be - passed when as_two_stage is True, otherwise is None. - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, - ..., p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - gt_bboxes_ignore (list[Tensor], optional): Bounding boxes - which can be ignored for each image. Default None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert gt_bboxes_ignore is None, \ - f'{self.__class__.__name__} only supports ' \ - f'for gt_bboxes_ignore setting to None.' 
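# A minimal sketch of how multi_apply (defined near the top of this file)
# fans out over the decoder layers below, assuming a toy two-output function:
#
#   def f(a, b):
#       return a + b, a * b
#
#   multi_apply(f, [1, 2], [3, 4])  # -> ([4, 6], [3, 8])
#
# Each per-layer call returns a tuple, and multi_apply transposes those
# tuples into one list per output, which is how losses_cls, losses_kpt and
# the other per-layer results are collected across all decoder layers.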
- - num_dec_layers = len(all_cls_scores) - all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] - all_gt_keypoints_list = [ - gt_keypoints_list for _ in range(num_dec_layers) - ] - all_gt_areas_list = [gt_areas_list for _ in range(num_dec_layers)] - img_metas_list = [img_metas for _ in range(num_dec_layers)] - - losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list, \ - area_targets_list, kpt_weights_list = multi_apply( - self.loss_single, all_cls_scores, all_kpt_preds, - all_gt_labels_list, all_gt_keypoints_list, - all_gt_areas_list, img_metas_list) - - loss_dict = dict() - # loss of proposal generated from encode feature map. - if enc_cls_scores is not None: - binary_labels_list = [ - paddle.zeros_like(gt_labels_list[i]) - for i in range(len(img_metas)) - ] - enc_loss_cls, enc_losses_kpt = \ - self.loss_single_rpn( - enc_cls_scores, enc_kpt_preds, binary_labels_list, - gt_keypoints_list, gt_areas_list, img_metas) - loss_dict['enc_loss_cls'] = enc_loss_cls - loss_dict['enc_loss_kpt'] = enc_losses_kpt - - # loss from the last decoder layer - loss_dict['loss_cls'] = losses_cls[-1] - loss_dict['loss_kpt'] = losses_kpt[-1] - loss_dict['loss_oks'] = losses_oks[-1] - # loss from other decoder layers - num_dec_layer = 0 - for loss_cls_i, loss_kpt_i, loss_oks_i in zip( - losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1]): - loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_kpt'] = loss_kpt_i - loss_dict[f'd{num_dec_layer}.loss_oks'] = loss_oks_i - num_dec_layer += 1 - - # losses of heatmap generated from P3 feature map - hm_pred, hm_mask = enc_hm_proto - loss_hm = self.loss_heatmap(hm_pred, hm_mask, gt_keypoints_list, - gt_labels_list, gt_bboxes_list) - loss_dict['loss_hm'] = loss_hm - - return loss_dict, (kpt_preds_list[-1], kpt_targets_list[-1], - area_targets_list[-1], kpt_weights_list[-1]) - - def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels, - gt_bboxes): - assert hm_pred.shape[-2:] == hm_mask.shape[-2:] - num_img, _, h, w = hm_pred.shape - # placeholder of heatmap target (Gaussian distribution) - hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype) - for i, (gt_label, gt_bbox, gt_keypoint - ) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)): - if gt_label.shape[0] == 0: - continue - gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1, - 3)).clone() - gt_keypoint[..., :2] /= 8 - - assert gt_keypoint[..., 0].max() <= w + 0.5 # new coordinate system - assert gt_keypoint[..., 1].max() <= h + 0.5 # new coordinate system - gt_bbox /= 8 - gt_w = gt_bbox[:, 2] - gt_bbox[:, 0] - gt_h = gt_bbox[:, 3] - gt_bbox[:, 1] - for j in range(gt_label.shape[0]): - # get heatmap radius - kp_radius = paddle.clip( - paddle.floor( - gaussian_radius( - (gt_h[j], gt_w[j]), min_overlap=0.9)), - min=0, - max=3) - for k in range(self.num_keypoints): - if gt_keypoint[j, k, 2] > 0: - gt_kp = gt_keypoint[j, k, :2] - gt_kp_int = paddle.floor(gt_kp) - hm_target[i, k] = draw_umich_gaussian( - hm_target[i, k], gt_kp_int, kp_radius) - # compute heatmap loss - hm_pred = paddle.clip( - F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4) # refer to CenterNet - loss_hm = self.loss_hm( - hm_pred, - hm_target.detach(), - mask=~hm_mask.astype("bool").unsqueeze(1)) - return loss_hm - - def loss_single(self, cls_scores, kpt_preds, gt_labels_list, - gt_keypoints_list, gt_areas_list, img_metas): - """Loss function for outputs from a single decoder layer of a single - feature level. 
- - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. - kpt_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (x_{i}, y_{i}) and - shape [bs, num_query, K*2]. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, - ..., p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. - """ - num_imgs = cls_scores.shape[0] - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, - gt_labels_list, gt_keypoints_list, - gt_areas_list, img_metas) - (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, - area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets - labels = paddle.concat(labels_list, 0) - label_weights = paddle.concat(label_weights_list, 0) - kpt_targets = paddle.concat(kpt_targets_list, 0) - kpt_weights = paddle.concat(kpt_weights_list, 0) - area_targets = paddle.concat(area_targets_list, 0) - - # classification loss - cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - paddle.to_tensor( - [cls_avg_factor], dtype=cls_scores.dtype)) - cls_avg_factor = max(cls_avg_factor, 1) - - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt keypoints across all GPUs, for - # normalization purposes - num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype) - num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() - - # construct factors used to rescale keypoints - factors = [] - for img_meta, kpt_pred in zip(img_metas, kpt_preds): - img_h, img_w, _ = img_meta['img_shape'] - factor = paddle.to_tensor( - [img_w, img_h, img_w, img_h], - dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile( - (kpt_pred.shape[0], 1)) - factors.append(factor) - factors = paddle.concat(factors, 0) - - # keypoint regression loss - kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) - num_valid_kpt = paddle.clip( - reduce_mean(kpt_weights.sum()), min=1).item() - # assert num_valid_kpt == (kpt_targets>0).sum().item() - loss_kpt = self.loss_kpt( - kpt_preds, - kpt_targets.detach(), - kpt_weights.detach(), - avg_factor=num_valid_kpt) - - # keypoint oks loss - pos_inds = kpt_weights.sum(-1) > 0 - if not pos_inds.any(): - loss_oks = kpt_preds.sum() * 0 - else: - factors = factors[pos_inds][:, :2].tile(( - (1, kpt_preds.shape[-1] // 2))) - pos_kpt_preds = kpt_preds[pos_inds] * factors - pos_kpt_targets = kpt_targets[pos_inds] * factors - pos_areas = area_targets[pos_inds] - pos_valid = kpt_weights[pos_inds][..., 0::2] - assert (pos_areas > 0).all() - loss_oks = self.loss_oks( - pos_kpt_preds, - pos_kpt_targets, - pos_valid, - pos_areas, - avg_factor=num_total_pos) - return loss_cls, loss_kpt,
loss_oks, kpt_preds, kpt_targets, \ - area_targets, kpt_weights - - def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list, - gt_keypoints_list, gt_areas_list, img_metas): - """Compute regression and classification targets for a batch image. - - Outputs from a single decoder layer of a single feature level are used. - - Args: - cls_scores_list (list[Tensor]): Box score logits from a single - decoder layer for each image with shape [num_query, - cls_out_channels]. - kpt_preds_list (list[Tensor]): Sigmoid outputs from a single - decoder layer for each image, with normalized coordinate - (x_{i}, y_{i}) and shape [num_query, K*2]. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3). - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - - Returns: - tuple: a tuple containing the following targets. - - - labels_list (list[Tensor]): Labels for all images. - - label_weights_list (list[Tensor]): Label weights for all - images. - - kpt_targets_list (list[Tensor]): Keypoint targets for all - images. - - kpt_weights_list (list[Tensor]): Keypoint weights for all - images. - - area_targets_list (list[Tensor]): area targets for all - images. - - num_total_pos (int): Number of positive samples in all - images. - - num_total_neg (int): Number of negative samples in all - images. - """ - (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, - area_targets_list, pos_inds_list, neg_inds_list) = multi_apply( - self._get_target_single, cls_scores_list, kpt_preds_list, - gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) - num_total_pos = sum((inds.numel() for inds in pos_inds_list)) - num_total_neg = sum((inds.numel() for inds in neg_inds_list)) - return (labels_list, label_weights_list, kpt_targets_list, - kpt_weights_list, area_targets_list, num_total_pos, - num_total_neg) - - def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints, - gt_areas, img_meta): - """Compute regression and classification targets for one image. - - Outputs from a single decoder layer of a single feature level are used. - - Args: - cls_score (Tensor): Box score logits from a single decoder layer - for one image. Shape [num_query, cls_out_channels]. - kpt_pred (Tensor): Sigmoid outputs from a single decoder layer - for one image, with normalized coordinate (x_{i}, y_{i}) and - shape [num_query, K*2]. - gt_labels (Tensor): Ground truth class indices for one image - with shape (num_gts, ). - gt_keypoints (Tensor): Ground truth keypoints for one image with - shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ - p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas (Tensor): Ground truth mask areas for one image - with shape (num_gts, ). - img_meta (dict): Meta information for one image. - - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - - labels (Tensor): Labels of each image. - - label_weights (Tensor): Label weights of each image. - - kpt_targets (Tensor): Keypoint targets of each image. - - kpt_weights (Tensor): Keypoint weights of each image. - - area_targets (Tensor): Area targets of each image. - - pos_inds (Tensor): Sampled positive indices for each image. - - neg_inds (Tensor): Sampled negative indices for each image. 
- """ - num_bboxes = kpt_pred.shape[0] - # assigner and sampler - assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels, - gt_keypoints, gt_areas, img_meta) - sampling_result = self.sampler.sample(assign_result, kpt_pred, - gt_keypoints) - - pos_inds = sampling_result.pos_inds - neg_inds = sampling_result.neg_inds - - # label targets - labels = paddle.full((num_bboxes, ), self.num_classes, dtype="int64") - label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype) - kpt_targets = paddle.zeros_like(kpt_pred) - kpt_weights = paddle.zeros_like(kpt_pred) - area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype) - - if pos_inds.size == 0: - return (labels, label_weights, kpt_targets, kpt_weights, - area_targets, pos_inds, neg_inds) - - labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][ - ..., 0].astype("int64") - - img_h, img_w, _ = img_meta['img_shape'] - # keypoint targets - pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds] - pos_gt_kpts = pos_gt_kpts.reshape( - (len(sampling_result.pos_assigned_gt_inds), -1, 3)) - valid_idx = pos_gt_kpts[:, :, 2] > 0 - pos_kpt_weights = kpt_weights[pos_inds].reshape( - (pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2)) - # pos_kpt_weights[valid_idx][...] = 1.0 - pos_kpt_weights = masked_fill(pos_kpt_weights, - valid_idx.unsqueeze(-1), 1.0) - kpt_weights[pos_inds] = pos_kpt_weights.reshape( - (pos_kpt_weights.shape[0], kpt_pred.shape[-1])) - - factor = paddle.to_tensor( - [img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0) - pos_gt_kpts_normalized = pos_gt_kpts[..., :2] - pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \ - factor[:, 0:1] - pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \ - factor[:, 1:2] - kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape( - (pos_gt_kpts.shape[0], kpt_pred.shape[-1])) - - pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0] - area_targets[pos_inds] = pos_gt_areas - - return (labels, label_weights, kpt_targets, kpt_weights, area_targets, - pos_inds, neg_inds) - - def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list, - gt_keypoints_list, gt_areas_list, img_metas): - """Loss function for outputs from a single decoder layer of a single - feature level. - - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. - kpt_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (x_{i}, y_{i}) and - shape [bs, num_query, K*2]. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, - ..., p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. 
- """ - num_imgs = cls_scores.shape[0] - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, - gt_labels_list, gt_keypoints_list, - gt_areas_list, img_metas) - (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, - area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets - labels = paddle.concat(labels_list, 0) - label_weights = paddle.concat(label_weights_list, 0) - kpt_targets = paddle.concat(kpt_targets_list, 0) - kpt_weights = paddle.concat(kpt_weights_list, 0) - - # classification loss - cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - paddle.to_tensor( - [cls_avg_factor], dtype=cls_scores.dtype)) - cls_avg_factor = max(cls_avg_factor, 1) - - cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt keypoints accross all gpus, for - # normalization purposes - # num_total_pos = loss_cls.to_tensor([num_total_pos]) - # num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() - - # keypoint regression loss - kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) - num_valid_kpt = paddle.clip( - reduce_mean(kpt_weights.sum()), min=1).item() - # assert num_valid_kpt == (kpt_targets>0).sum().item() - loss_kpt = self.loss_kpt_rpn( - kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt) - - return loss_cls, loss_kpt - - def get_bboxes(self, - all_cls_scores, - all_kpt_preds, - enc_cls_scores, - enc_kpt_preds, - hm_proto, - memory, - mlvl_masks, - img_metas, - rescale=False): - """Transform network outputs for a batch into bbox predictions. - - Args: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_kpt_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (x_{i}, y_{i}) and shape - [nb_dec, bs, num_query, K*2]. - enc_cls_scores (Tensor): Classification scores of points on - encode feature map, has shape (N, h*w, num_classes). - Only be passed when as_two_stage is True, otherwise is None. - enc_kpt_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, K*2). Only be - passed when as_two_stage is True, otherwise is None. - img_metas (list[dict]): Meta information of each image. - rescale (bool, optional): If True, return boxes in original - image space. Defalut False. - - Returns: - list[list[Tensor, Tensor]]: Each item in result_list is 3-tuple. - The first item is an (n, 5) tensor, where the first 4 columns - are bounding box positions (tl_x, tl_y, br_x, br_y) and the - 5-th column is a score between 0 and 1. The second item is a - (n,) tensor where each item is the predicted class label of - the corresponding box. The third item is an (n, K, 3) tensor - with [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, - p^{K}_v] format. 
- """ - cls_scores = all_cls_scores[-1] - kpt_preds = all_kpt_preds[-1] - - result_list = [] - for img_id in range(len(img_metas)): - cls_score = cls_scores[img_id] - kpt_pred = kpt_preds[img_id] - img_shape = img_metas[img_id]['img_shape'] - scale_factor = img_metas[img_id]['scale_factor'] - # TODO: only support single image test - # memory_i = memory[:, img_id, :] - # mlvl_mask = mlvl_masks[img_id] - proposals = self._get_bboxes_single(cls_score, kpt_pred, img_shape, - scale_factor, memory, - mlvl_masks, rescale) - result_list.append(proposals) - return result_list - - def _get_bboxes_single(self, - cls_score, - kpt_pred, - img_shape, - scale_factor, - memory, - mlvl_masks, - rescale=False): - """Transform outputs from the last decoder layer into bbox predictions - for each image. - - Args: - cls_score (Tensor): Box score logits from the last decoder layer - for each image. Shape [num_query, cls_out_channels]. - kpt_pred (Tensor): Sigmoid outputs from the last decoder layer - for each image, with coordinate format (x_{i}, y_{i}) and - shape [num_query, K*2]. - img_shape (tuple[int]): Shape of input image, (height, width, 3). - scale_factor (ndarray, optional): Scale factor of the image arange - as (w_scale, h_scale, w_scale, h_scale). - rescale (bool, optional): If True, return boxes in original image - space. Default False. - - Returns: - tuple[Tensor]: Results of detected bboxes and labels. - - - det_bboxes: Predicted bboxes with shape [num_query, 5], - where the first 4 columns are bounding box positions - (tl_x, tl_y, br_x, br_y) and the 5-th column are scores - between 0 and 1. - - det_labels: Predicted labels of the corresponding box with - shape [num_query]. - - det_kpts: Predicted keypoints with shape [num_query, K, 3]. - """ - assert len(cls_score) == len(kpt_pred) - max_per_img = self.test_cfg.get('max_per_img', self.num_query) - # exclude background - if self.loss_cls.use_sigmoid: - cls_score = F.sigmoid(cls_score) - scores, indexs = cls_score.reshape([-1]).topk(max_per_img) - det_labels = indexs % self.num_classes - bbox_index = indexs // self.num_classes - kpt_pred = kpt_pred[bbox_index] - else: - scores, det_labels = F.softmax(cls_score, axis=-1)[..., :-1].max(-1) - scores, bbox_index = scores.topk(max_per_img) - kpt_pred = kpt_pred[bbox_index] - det_labels = det_labels[bbox_index] - - # ----- results after pose decoder ----- - # det_kpts = kpt_pred.reshape((kpt_pred.shape[0], -1, 2)) - - # ----- results after joint decoder (default) ----- - # import time - # start = time.time() - refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred)) - refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets, - None, None) - # end = time.time() - # print(f'refine time: {end - start:.6f}') - det_kpts = refine_outputs[-1] - - det_kpts[..., 0] = det_kpts[..., 0] * img_shape[1] - det_kpts[..., 1] = det_kpts[..., 1] * img_shape[0] - det_kpts[..., 0].clip_(min=0, max=img_shape[1]) - det_kpts[..., 1].clip_(min=0, max=img_shape[0]) - if rescale: - det_kpts /= paddle.to_tensor( - scale_factor[:2], - dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0) - - # use circumscribed rectangle box of keypoints as det bboxes - x1 = det_kpts[..., 0].min(axis=1, keepdim=True) - y1 = det_kpts[..., 1].min(axis=1, keepdim=True) - x2 = det_kpts[..., 0].max(axis=1, keepdim=True) - y2 = det_kpts[..., 1].max(axis=1, keepdim=True) - det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1) - det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1) - - det_kpts = paddle.concat( - (det_kpts, 
paddle.ones( - det_kpts[..., :1].shape, dtype=det_kpts.dtype)), - axis=2) - - return det_bboxes, det_labels, det_kpts - - def simple_test(self, feats, img_metas, rescale=False): - """Test det bboxes without test-time augmentation. - - Args: - feats (tuple[paddle.Tensor]): Multi-level features from the - upstream network, each is a 4D-tensor. - img_metas (list[dict]): List of image information. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is - 3-tuple. The first item is ``bboxes`` with shape (n, 5), - where 5 represent (tl_x, tl_y, br_x, br_y, score). - The shape of the second tensor in the tuple is ``labels`` - with shape (n,). The third item is ``kpts`` with shape - (n, K, 3), in [p^{1}_x, p^{1}_y, p^{1}_v, p^{K}_x, p^{K}_y, - p^{K}_v] format. - """ - # forward of this head requires img_metas - outs = self.forward(feats, img_metas) - results_list = self.get_bboxes(*outs, img_metas, rescale=rescale) - return results_list - - def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): - return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/pico_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/pico_head.py deleted file mode 100644 index 6e04173..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/pico_head.py +++ /dev/null @@ -1,797 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.ops import get_static_shape -from ..initializer import normal_ -from ..assigners.utils import generate_anchors_for_grid_cell -from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance -from ppdet.core.workspace import register -from ppdet.modeling.layers import ConvNormLayer -from .simota_head import OTAVFLHead -from .gfl_head import Integral, GFLHead -from ppdet.modeling.necks.csp_pan import DPModule - -eps = 1e-9 - -__all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat'] - - -class PicoSE(nn.Layer): - def __init__(self, feat_channels): - super(PicoSE, self).__init__() - self.fc = nn.Conv2D(feat_channels, feat_channels, 1) - self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1) - - self._init_weights() - - def _init_weights(self): - normal_(self.fc.weight, std=0.001) - - def forward(self, feat, avg_feat): - weight = F.sigmoid(self.fc(avg_feat)) - out = self.conv(feat * weight) - return out - - -@register -class PicoFeat(nn.Layer): - """ - PicoFeat of PicoDet - - Args: - feat_in (int): The channel number of input Tensor. - feat_out (int): The channel number of output Tensor. 
- num_convs (int): The number of convolutions of the PicoFeat. - norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. - share_cls_reg (bool): Whether to share the cls and reg output. - act (str): The activation function of each layer. - use_se (bool): Whether to use the SE module. - """ - - def __init__(self, - feat_in=256, - feat_out=96, - num_fpn_stride=3, - num_convs=2, - norm_type='bn', - share_cls_reg=False, - act='hard_swish', - use_se=False): - super(PicoFeat, self).__init__() - self.num_convs = num_convs - self.norm_type = norm_type - self.share_cls_reg = share_cls_reg - self.act = act - self.use_se = use_se - self.cls_convs = [] - self.reg_convs = [] - if use_se: - assert share_cls_reg == True, \ - 'In the case of using se, share_cls_reg must be set to True' - self.se = nn.LayerList() - for stage_idx in range(num_fpn_stride): - cls_subnet_convs = [] - reg_subnet_convs = [] - for i in range(self.num_convs): - in_c = feat_in if i == 0 else feat_out - cls_conv_dw = self.add_sublayer( - 'cls_conv_dw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=5, - stride=1, - groups=feat_out, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - cls_subnet_convs.append(cls_conv_dw) - cls_conv_pw = self.add_sublayer( - 'cls_conv_pw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=1, - stride=1, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - cls_subnet_convs.append(cls_conv_pw) - - if not self.share_cls_reg: - reg_conv_dw = self.add_sublayer( - 'reg_conv_dw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=5, - stride=1, - groups=feat_out, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - reg_subnet_convs.append(reg_conv_dw) - reg_conv_pw = self.add_sublayer( - 'reg_conv_pw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=1, - stride=1, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - reg_subnet_convs.append(reg_conv_pw) - self.cls_convs.append(cls_subnet_convs) - self.reg_convs.append(reg_subnet_convs) - if use_se: - self.se.append(PicoSE(feat_out)) - - def act_func(self, x): - if self.act == "leaky_relu": - x = F.leaky_relu(x) - elif self.act == "hard_swish": - x = F.hardswish(x) - elif self.act == "relu6": - x = F.relu6(x) - return x - - def forward(self, fpn_feat, stage_idx): - assert stage_idx < len(self.cls_convs) - cls_feat = fpn_feat - reg_feat = fpn_feat - for i in range(len(self.cls_convs[stage_idx])): - cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat)) - reg_feat = cls_feat - if not self.share_cls_reg: - reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat)) - if self.use_se: - avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1)) - se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat)) - return cls_feat, se_feat - return cls_feat, reg_feat - - -@register -class PicoHead(OTAVFLHead): - """ - PicoHead - Args: - conv_feat (object): Instance of 'PicoFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of VariFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - assigner (object): Instance of label assigner. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - in QFL setting. Default: 7.
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'assigner', 'nms' - ] - __shared__ = ['num_classes', 'eval_size'] - - def __init__(self, - conv_feat='PicoFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32], - prior_prob=0.01, - loss_class='VariFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - assigner='SimOTAAssigner', - reg_max=16, - feat_in_chan=96, - nms=None, - nms_pre=1000, - cell_offset=0, - eval_size=None): - super(PicoHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - assigner=assigner, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.conv_feat = conv_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_vfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.assigner = assigner - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.eval_size = eval_size - self.device = paddle.device.get_device() - - self.use_sigmoid = self.loss_vfl.use_sigmoid - if self.use_sigmoid: - self.cls_out_channels = self.num_classes - else: - self.cls_out_channels = self.num_classes + 1 - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - # Clear the super class initialization - self.gfl_head_cls = None - self.gfl_head_reg = None - self.scales_regs = None - - self.head_cls_list = [] - self.head_reg_list = [] - for i in range(len(fpn_stride)): - head_cls = self.add_sublayer( - "head_cls" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=self.cls_out_channels + 4 * (self.reg_max + 1) - if self.conv_feat.share_cls_reg else self.cls_out_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - self.head_cls_list.append(head_cls) - if not self.conv_feat.share_cls_reg: - head_reg = self.add_sublayer( - "head_reg" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=4 * (self.reg_max + 1), - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - self.head_reg_list.append(head_reg) - - # initialize the anchor points - if self.eval_size: - self.anchor_points, self.stride_tensor = self._generate_anchors() - - def forward(self, fpn_feats, export_post_process=True): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - - if self.training: - return self.forward_train(fpn_feats) - else: - return self.forward_eval( - fpn_feats, export_post_process=export_post_process) - - def forward_train(self, fpn_feats): - cls_logits_list, bboxes_reg_list = [], [] - for i, fpn_feat in enumerate(fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) - if self.conv_feat.share_cls_reg: - cls_logits = self.head_cls_list[i](conv_cls_feat) - cls_score, bbox_pred = paddle.split( - cls_logits, - [self.cls_out_channels, 4 * (self.reg_max + 1)], - axis=1) - else: - cls_score = self.head_cls_list[i](conv_cls_feat) - bbox_pred = self.head_reg_list[i](conv_reg_feat) - - if self.dgqp_module: 
- quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - - cls_logits_list.append(cls_score) - bboxes_reg_list.append(bbox_pred) - - return (cls_logits_list, bboxes_reg_list) - - def forward_eval(self, fpn_feats, export_post_process=True): - if self.eval_size: - anchor_points, stride_tensor = self.anchor_points, self.stride_tensor - else: - anchor_points, stride_tensor = self._generate_anchors(fpn_feats) - cls_logits_list, bboxes_reg_list = [], [] - for i, fpn_feat in enumerate(fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) - if self.conv_feat.share_cls_reg: - cls_logits = self.head_cls_list[i](conv_cls_feat) - cls_score, bbox_pred = paddle.split( - cls_logits, - [self.cls_out_channels, 4 * (self.reg_max + 1)], - axis=1) - else: - cls_score = self.head_cls_list[i](conv_cls_feat) - bbox_pred = self.head_reg_list[i](conv_reg_feat) - - if self.dgqp_module: - quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - - if not export_post_process: - # Now only supports batch size = 1 in deploy - # TODO(ygh): support batch size > 1 - cls_score_out = F.sigmoid(cls_score).reshape( - [1, self.cls_out_channels, -1]).transpose([0, 2, 1]) - bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4, - -1]).transpose([0, 2, 1]) - else: - _, _, h, w = fpn_feat.shape - l = h * w - cls_score_out = F.sigmoid( - cls_score.reshape([-1, self.cls_out_channels, l])) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) - bbox_pred = self.distribution_project(bbox_pred) - bbox_pred = bbox_pred.reshape([-1, l, 4]) - - cls_logits_list.append(cls_score_out) - bboxes_reg_list.append(bbox_pred) - - if export_post_process: - cls_logits_list = paddle.concat(cls_logits_list, axis=-1) - bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1) - bboxes_reg_list = batch_distance2bbox(anchor_points, - bboxes_reg_list) - bboxes_reg_list *= stride_tensor - - return (cls_logits_list, bboxes_reg_list) - - def _generate_anchors(self, feats=None): - # just use in eval time - anchor_points = [] - stride_tensor = [] - for i, stride in enumerate(self.fpn_stride): - if feats is not None: - _, _, h, w = feats[i].shape - else: - h = math.ceil(self.eval_size[0] / stride) - w = math.ceil(self.eval_size[1] / stride) - shift_x = paddle.arange(end=w) + self.cell_offset - shift_y = paddle.arange(end=h) + self.cell_offset - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([-1, 2])) - stride_tensor.append( - paddle.full( - [h * w, 1], stride, dtype='float32')) - anchor_points = paddle.concat(anchor_points) - stride_tensor = paddle.concat(stride_tensor) - return anchor_points, stride_tensor - - def post_process(self, - head_outs, - scale_factor, - export_nms=True, - nms_cpu=False): - pred_scores, pred_bboxes = head_outs - if not export_nms: - return pred_bboxes, pred_scores - else: - # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], - axis=-1).reshape([-1, 1, 4]) - # scale bbox to origin image size. 
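# A concrete reading of the rescale step below, assuming a hypothetical
# 320x240 source image resized to 640x480 for inference: scale_factor holds
# [h_scale, w_scale] = [2.0, 2.0] per image, and the split/concat above
# reorders it to [w_scale, h_scale, w_scale, h_scale] so that dividing the
# (x1, y1, x2, y2) boxes elementwise maps them back to source coordinates.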
- pred_bboxes /= scale_factor - if nms_cpu: - paddle.set_device("cpu") - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - paddle.set_device(self.device) - else: - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num - - -@register -class PicoHeadV2(GFLHead): - """ - PicoHeadV2 - Args: - conv_feat (object): Instance of 'PicoFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of VariFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - assigner (object): Instance of label assigner. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - in QFL setting. Default: 7. - """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'static_assigner', 'assigner', 'nms' - ] - __shared__ = ['num_classes', 'eval_size'] - - def __init__(self, - conv_feat='PicoFeatV2', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32], - prior_prob=0.01, - use_align_head=True, - loss_class='VariFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - static_assigner_epoch=60, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - reg_max=16, - feat_in_chan=96, - nms=None, - nms_pre=1000, - cell_offset=0, - act='hard_swish', - grid_cell_scale=5.0, - eval_size=None): - super(PicoHeadV2, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset, ) - self.conv_feat = conv_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_vfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - - self.static_assigner_epoch = static_assigner_epoch - self.static_assigner = static_assigner - self.assigner = assigner - - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.act = act - self.grid_cell_scale = grid_cell_scale - self.use_align_head = use_align_head - self.cls_out_channels = self.num_classes - self.eval_size = eval_size - - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - # Clear the super class initialization - self.gfl_head_cls = None - self.gfl_head_reg = None - self.scales_regs = None - - self.head_cls_list = nn.LayerList() - self.head_reg_list = nn.LayerList() - self.cls_align = nn.LayerList() - - for i in range(len(fpn_stride)): - head_cls = self.add_sublayer( - "head_cls" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=self.cls_out_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - self.head_cls_list.append(head_cls) - head_reg = self.add_sublayer( - "head_reg" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=4 * (self.reg_max + 1), - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - self.head_reg_list.append(head_reg) - if self.use_align_head: -
self.cls_align.append( - DPModule( - self.feat_in_chan, - 1, - 5, - act=self.act, - use_act_in_out=False)) - - # initialize the anchor points - if self.eval_size: - self.anchor_points, self.stride_tensor = self._generate_anchors() - - def forward(self, fpn_feats, export_post_process=True): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - - if self.training: - return self.forward_train(fpn_feats) - else: - return self.forward_eval( - fpn_feats, export_post_process=export_post_process) - - def forward_train(self, fpn_feats): - cls_score_list, reg_list, box_list = [], [], [] - for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): - b, _, h, w = get_static_shape(fpn_feat) - # task decomposition - conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) - cls_logit = self.head_cls_list[i](se_feat) - reg_pred = self.head_reg_list[i](se_feat) - - # cls prediction and alignment - if self.use_align_head: - cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) - cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() - else: - cls_score = F.sigmoid(cls_logit) - - cls_score_out = cls_score.transpose([0, 2, 3, 1]) - bbox_pred = reg_pred.transpose([0, 2, 3, 1]) - b, cell_h, cell_w, _ = paddle.shape(cls_score_out) - y, x = self.get_single_level_center_point( - [cell_h, cell_w], stride, cell_offset=self.cell_offset) - center_points = paddle.stack([x, y], axis=-1) - cls_score_out = cls_score_out.reshape( - [b, -1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) - bbox_pred = batch_distance2bbox( - center_points, bbox_pred, max_shapes=None) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1])) - box_list.append(bbox_pred / stride) - - cls_score_list = paddle.concat(cls_score_list, axis=1) - box_list = paddle.concat(box_list, axis=1) - reg_list = paddle.concat(reg_list, axis=1) - return cls_score_list, reg_list, box_list, fpn_feats - - def forward_eval(self, fpn_feats, export_post_process=True): - if self.eval_size: - anchor_points, stride_tensor = self.anchor_points, self.stride_tensor - else: - anchor_points, stride_tensor = self._generate_anchors(fpn_feats) - cls_score_list, box_list = [], [] - for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): - _, _, h, w = fpn_feat.shape - # task decomposition - conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) - cls_logit = self.head_cls_list[i](se_feat) - reg_pred = self.head_reg_list[i](se_feat) - - # cls prediction and alignment - if self.use_align_head: - cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) - cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() - else: - cls_score = F.sigmoid(cls_logit) - - if not export_post_process: - # Now only supports batch size = 1 in deploy - cls_score_list.append( - cls_score.reshape([1, self.cls_out_channels, -1]).transpose( - [0, 2, 1])) - box_list.append( - reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose( - [0, 2, 1])) - else: - l = h * w - cls_score_out = cls_score.reshape( - [-1, self.cls_out_channels, l]) - bbox_pred = reg_pred.transpose([0, 2, 3, 1]) - bbox_pred = self.distribution_project(bbox_pred) - bbox_pred = bbox_pred.reshape([-1, l, 4]) - cls_score_list.append(cls_score_out) - box_list.append(bbox_pred) - - if export_post_process: - cls_score_list = paddle.concat(cls_score_list, axis=-1) - box_list = 
paddle.concat(box_list, axis=1) - box_list = batch_distance2bbox(anchor_points, box_list) - box_list *= stride_tensor - - return cls_score_list, box_list - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None - num_imgs = gt_meta['im_id'].shape[0] - pad_gt_mask = gt_meta['pad_gt_mask'] - - anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell( - fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset) - - centers = bbox_center(anchors) - - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - gt_scores=gt_scores, - pred_bboxes=pred_bboxes.detach() * stride_tensor_list) - - else: - assigned_labels, assigned_bboxes, assigned_scores = self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor_list, - centers, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - gt_scores=gt_scores) - - assigned_bboxes /= stride_tensor_list - - centers_shape = centers.shape - flatten_centers = centers.expand( - [num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2]) - flatten_strides = stride_tensor_list.expand( - [num_imgs, centers_shape[0], 1]).reshape([-1, 1]) - flatten_cls_preds = pred_scores.reshape([-1, self.num_classes]) - flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)]) - flatten_bboxes = pred_bboxes.reshape([-1, 4]) - flatten_bbox_targets = assigned_bboxes.reshape([-1, 4]) - flatten_labels = assigned_labels.reshape([-1]) - flatten_assigned_scores = assigned_scores.reshape( - [-1, self.num_classes]) - - pos_inds = paddle.nonzero( - paddle.logical_and((flatten_labels >= 0), - (flatten_labels < self.num_classes)), - as_tuple=False).squeeze(1) - - num_total_pos = len(pos_inds) - - if num_total_pos > 0: - pos_bbox_targets = paddle.gather( - flatten_bbox_targets, pos_inds, axis=0) - pos_decode_bbox_pred = paddle.gather( - flatten_bboxes, pos_inds, axis=0) - pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0) - pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0) - pos_centers = paddle.gather( - flatten_centers, pos_inds, axis=0) / pos_strides - - weight_targets = flatten_assigned_scores.detach() - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - - pred_corners = pos_reg.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_centers, pos_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = paddle.zeros([1]) - loss_dfl = paddle.zeros([1]) - - avg_factor = flatten_assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(avg_factor) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - loss_vfl = self.loss_vfl( - flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor) - - loss_bbox = loss_bbox / avg_factor - loss_dfl = loss_dfl / avg_factor - - loss_states = dict( - 
loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states - - def _generate_anchors(self, feats=None): - # just use in eval time - anchor_points = [] - stride_tensor = [] - for i, stride in enumerate(self.fpn_stride): - if feats is not None: - _, _, h, w = feats[i].shape - else: - h = math.ceil(self.eval_size[0] / stride) - w = math.ceil(self.eval_size[1] / stride) - shift_x = paddle.arange(end=w) + self.cell_offset - shift_y = paddle.arange(end=h) + self.cell_offset - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([-1, 2])) - stride_tensor.append( - paddle.full( - [h * w, 1], stride, dtype='float32')) - anchor_points = paddle.concat(anchor_points) - stride_tensor = paddle.concat(stride_tensor) - return anchor_points, stride_tensor - - def post_process(self, - head_outs, - scale_factor, - export_nms=True, - nms_cpu=False): - pred_scores, pred_bboxes = head_outs - if not export_nms: - return pred_bboxes, pred_scores - else: - # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], - axis=-1).reshape([-1, 1, 4]) - # scale bbox to origin image size. - pred_bboxes /= scale_factor - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_contrast_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_contrast_head.py deleted file mode 100644 index 8732c2c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_contrast_head.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
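For reference, the `_generate_anchors` helper deleted just above builds one (x, y) center per feature-map cell plus a matching per-point stride tensor. A minimal NumPy stand-in for the Paddle ops (the 640x640 eval size and the strides are illustrative values, not taken from the diff):

import math
import numpy as np

def generate_anchor_points(eval_size, fpn_strides, cell_offset=0.5):
    """One (x, y) grid center per cell for every FPN level, in grid units."""
    anchor_points, stride_tensor = [], []
    for stride in fpn_strides:
        h = math.ceil(eval_size[0] / stride)
        w = math.ceil(eval_size[1] / stride)
        shift_x = np.arange(w) + cell_offset
        shift_y = np.arange(h) + cell_offset
        # same layout as paddle.meshgrid(shift_y, shift_x)
        sy, sx = np.meshgrid(shift_y, shift_x, indexing="ij")
        anchor_points.append(
            np.stack([sx, sy], axis=-1).reshape(-1, 2).astype("float32"))
        stride_tensor.append(np.full((h * w, 1), stride, dtype="float32"))
    return np.concatenate(anchor_points), np.concatenate(stride_tensor)

points, strides = generate_anchor_points((640, 640), fpn_strides=(8, 16, 32))
print(points.shape, strides.shape)  # (8400, 2) (8400, 1)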
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ..initializer import bias_init_with_prob, constant_ -from ..assigners.utils import generate_anchors_for_grid_cell -from ppdet.modeling.heads.ppyoloe_head import PPYOLOEHead - -__all__ = ['PPYOLOEContrastHead'] - - -@register -class PPYOLOEContrastHead(PPYOLOEHead): - __shared__ = [ - 'num_classes', 'eval_size', 'trt', 'exclude_nms', - 'exclude_post_process', 'use_shared_conv', 'for_distill' - ] - __inject__ = ['static_assigner', 'assigner', 'nms', 'contrast_loss'] - - def __init__(self, - in_channels=[1024, 512, 256], - num_classes=80, - act='swish', - fpn_strides=(32, 16, 8), - grid_cell_scale=5.0, - grid_cell_offset=0.5, - reg_max=16, - reg_range=None, - static_assigner_epoch=4, - use_varifocal_loss=True, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - contrast_loss='SupContrast', - nms='MultiClassNMS', - eval_size=None, - loss_weight={ - 'class': 1.0, - 'iou': 2.5, - 'dfl': 0.5, - }, - trt=False, - attn_conv='convbn', - exclude_nms=False, - exclude_post_process=False, - use_shared_conv=True, - for_distill=False): - super().__init__(in_channels, num_classes, act, fpn_strides, - grid_cell_scale, grid_cell_offset, reg_max, reg_range, - static_assigner_epoch, use_varifocal_loss, - static_assigner, assigner, nms, eval_size, loss_weight, - trt, attn_conv, exclude_nms, exclude_post_process, - use_shared_conv, for_distill) - - assert len(in_channels) > 0, "len(in_channels) should > 0" - self.contrast_loss = contrast_loss - self.contrast_encoder = nn.LayerList() - for in_c in self.in_channels: - self.contrast_encoder.append(nn.Conv2D(in_c, 128, 3, padding=1)) - self._init_contrast_encoder() - - def _init_contrast_encoder(self): - bias_en = bias_init_with_prob(0.01) - for en_ in self.contrast_encoder: - constant_(en_.weight) - constant_(en_.bias, bias_en) - - def forward_train(self, feats, targets, aux_pred=None): - anchors, anchor_points, num_anchors_list, stride_tensor = \ - generate_anchors_for_grid_cell( - feats, self.fpn_strides, self.grid_cell_scale, - self.grid_cell_offset) - - cls_score_list, reg_distri_list = [], [] - contrast_encoder_list = [] - for i, feat in enumerate(feats): - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - contrast_logit = self.contrast_encoder[i](self.stem_cls[i]( - feat, avg_feat) + feat) - contrast_encoder_list.append( - contrast_logit.flatten(2).transpose([0, 2, 1])) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_distri_list = paddle.concat(reg_distri_list, axis=1) - contrast_encoder_list = paddle.concat(contrast_encoder_list, axis=1) - - return self.get_loss([ - cls_score_list, reg_distri_list, contrast_encoder_list, anchors, - anchor_points, num_anchors_list, stride_tensor - ], targets) - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_distri, pred_contrast_encoder, anchors,\ - anchor_points, num_anchors_list, stride_tensor = head_outs - - anchor_points_s = anchor_points / stride_tensor - pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) - - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # 
label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - pred_bboxes=pred_bboxes.detach() * stride_tensor) - alpha_l = 0.25 - else: - if self.sm_use: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - stride_tensor, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - else: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - # rescale bbox - assigned_bboxes /= stride_tensor - # cls loss - if self.use_varifocal_loss: - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - loss_cls = self._varifocal_loss(pred_scores, assigned_scores, - one_hot_label) - else: - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) - - assigned_scores_sum = assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(assigned_scores_sum) - assigned_scores_sum /= paddle.distributed.get_world_size() - assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) - loss_cls /= assigned_scores_sum - - loss_l1, loss_iou, loss_dfl = \ - self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, - assigned_scores_sum) - # contrast loss - loss_contrast = self.contrast_loss(pred_contrast_encoder.reshape([-1, pred_contrast_encoder.shape[-1]]), \ - assigned_labels.reshape([-1]), assigned_scores.max(-1).reshape([-1])) - - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou + \ - self.loss_weight['dfl'] * loss_dfl + \ - self.loss_weight['contrast'] * loss_contrast - - out_dict = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_iou': loss_iou, - 'loss_dfl': loss_dfl, - 'loss_l1': loss_l1, - 'loss_contrast': loss_contrast - } - return out_dict diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_head.py deleted file mode 100644 index 80f1bc4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_head.py +++ /dev/null @@ -1,700 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
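The `loss_cls` term above uses varifocal weighting (the `_varifocal_loss` shared with the base PPYOLOE head later in this diff). A NumPy rendering of that formula, assuming sigmoid scores in (0, 1), soft IoU-aware targets `gt_score`, and a 0/1 `one_hot_label`:

import numpy as np

def varifocal_loss(pred_score, gt_score, one_hot_label, alpha=0.75, gamma=2.0):
    """Negatives are down-weighted by alpha * p**gamma; positives are
    weighted by their (IoU-aware) soft target score."""
    weight = alpha * pred_score**gamma * (1 - one_hot_label) \
        + gt_score * one_hot_label
    eps = 1e-9  # numerical guard, not in the original
    bce = -(gt_score * np.log(pred_score + eps)
            + (1 - gt_score) * np.log(1 - pred_score + eps))
    return (weight * bce).sum()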
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from paddle import ParamAttr -from paddle.nn.initializer import KaimingNormal -from paddle.nn.initializer import Normal, Constant - -from ..bbox_utils import batch_distance2bbox -from ..losses import GIoULoss -from ..initializer import bias_init_with_prob, constant_, normal_ -from ..assigners.utils import generate_anchors_for_grid_cell -from ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock -from ppdet.modeling.ops import get_static_shape, get_act_fn -from ppdet.modeling.layers import MultiClassNMS - -__all__ = ['PPYOLOEHead', 'SimpleConvHead'] - - -class ESEAttn(nn.Layer): - def __init__(self, feat_channels, act='swish', attn_conv='convbn'): - super(ESEAttn, self).__init__() - self.fc = nn.Conv2D(feat_channels, feat_channels, 1) - if attn_conv == 'convbn': - self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) - elif attn_conv == 'repvgg': - self.conv = RepVggBlock(feat_channels, feat_channels, act=act) - else: - self.conv = None - self._init_weights() - - def _init_weights(self): - normal_(self.fc.weight, std=0.001) - - def forward(self, feat, avg_feat): - weight = F.sigmoid(self.fc(avg_feat)) - if self.conv: - return self.conv(feat * weight) - else: - return feat * weight - - -@register -class PPYOLOEHead(nn.Layer): - __shared__ = [ - 'num_classes', 'eval_size', 'trt', 'exclude_nms', - 'exclude_post_process', 'use_shared_conv', 'for_distill' - ] - __inject__ = ['static_assigner', 'assigner', 'nms'] - - def __init__(self, - in_channels=[1024, 512, 256], - num_classes=80, - act='swish', - fpn_strides=(32, 16, 8), - grid_cell_scale=5.0, - grid_cell_offset=0.5, - reg_max=16, - reg_range=None, - static_assigner_epoch=4, - use_varifocal_loss=True, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - nms='MultiClassNMS', - eval_size=None, - loss_weight={ - 'class': 1.0, - 'iou': 2.5, - 'dfl': 0.5, - }, - trt=False, - attn_conv='convbn', - exclude_nms=False, - exclude_post_process=False, - use_shared_conv=True, - for_distill=False): - super(PPYOLOEHead, self).__init__() - assert len(in_channels) > 0, "len(in_channels) should > 0" - self.in_channels = in_channels - self.num_classes = num_classes - self.fpn_strides = fpn_strides - self.grid_cell_scale = grid_cell_scale - self.grid_cell_offset = grid_cell_offset - if reg_range: - self.sm_use = True - self.reg_range = reg_range - else: - self.sm_use = False - self.reg_range = (0, reg_max + 1) - self.reg_channels = self.reg_range[1] - self.reg_range[0] - self.iou_loss = GIoULoss() - self.loss_weight = loss_weight - self.use_varifocal_loss = use_varifocal_loss - self.eval_size = eval_size - - self.static_assigner_epoch = static_assigner_epoch - self.static_assigner = static_assigner - self.assigner = assigner - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.exclude_nms = exclude_nms - self.exclude_post_process = exclude_post_process - self.use_shared_conv = use_shared_conv - self.for_distill = for_distill - self.is_teacher = False - - # stem - self.stem_cls = nn.LayerList() - self.stem_reg = nn.LayerList() - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - for in_c in self.in_channels: - self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) - self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) - # pred head - self.pred_cls = nn.LayerList() - self.pred_reg = nn.LayerList() 
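The ESEAttn block defined just above gates the full-resolution feature with a channel weight computed from its global average pool. A shape-level NumPy sketch (the 1x1 conv is written as a plain matrix; the trailing ConvBN/RepVgg stage is omitted):

import numpy as np

def ese_attn(feat, fc_weight, fc_bias):
    """feat: (C, H, W); fc_weight: (C, C) 1x1-conv weight as a matrix."""
    avg = feat.mean(axis=(1, 2))                                # global average pool -> (C,)
    gate = 1.0 / (1.0 + np.exp(-(fc_weight @ avg + fc_bias)))   # sigmoid(fc(avg))
    return feat * gate[:, None, None]                           # per-channel re-weighting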
- for in_c in self.in_channels: - self.pred_cls.append( - nn.Conv2D( - in_c, self.num_classes, 3, padding=1)) - self.pred_reg.append( - nn.Conv2D( - in_c, 4 * self.reg_channels, 3, padding=1)) - # projection conv - self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) - self.proj_conv.skip_quant = True - self._init_weights() - - if self.for_distill: - self.distill_pairs = {} - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - for cls_, reg_ in zip(self.pred_cls, self.pred_reg): - constant_(cls_.weight) - constant_(cls_.bias, bias_cls) - constant_(reg_.weight) - constant_(reg_.bias, 1.0) - - proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, - self.reg_channels).reshape( - [1, self.reg_channels, 1, 1]) - self.proj_conv.weight.set_value(proj) - self.proj_conv.weight.stop_gradient = True - if self.eval_size: - anchor_points, stride_tensor = self._generate_anchors() - self.anchor_points = anchor_points - self.stride_tensor = stride_tensor - - def forward_train(self, feats, targets, aux_pred=None): - anchors, anchor_points, num_anchors_list, stride_tensor = \ - generate_anchors_for_grid_cell( - feats, self.fpn_strides, self.grid_cell_scale, - self.grid_cell_offset) - - cls_score_list, reg_distri_list = [], [] - for i, feat in enumerate(feats): - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_distri_list = paddle.concat(reg_distri_list, axis=1) - - if targets.get('is_teacher', False): - pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) - return cls_score_list, pred_deltas * stride_tensor, pred_dfls - - if targets.get('get_data', False): - pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) - return cls_score_list, pred_deltas * stride_tensor, pred_dfls - - return self.get_loss([ - cls_score_list, reg_distri_list, anchors, anchor_points, - num_anchors_list, stride_tensor - ], targets, aux_pred) - - def _generate_anchors(self, feats=None, dtype='float32'): - # just use in eval time - anchor_points = [] - stride_tensor = [] - for i, stride in enumerate(self.fpn_strides): - if feats is not None: - _, _, h, w = feats[i].shape - else: - h = int(self.eval_size[0] / stride) - w = int(self.eval_size[1] / stride) - shift_x = paddle.arange(end=w) + self.grid_cell_offset - shift_y = paddle.arange(end=h) + self.grid_cell_offset - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype=dtype) - anchor_points.append(anchor_point.reshape([-1, 2])) - stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) - anchor_points = paddle.concat(anchor_points) - stride_tensor = paddle.concat(stride_tensor) - return anchor_points, stride_tensor - - def forward_eval(self, feats): - if self.eval_size: - anchor_points, stride_tensor = self.anchor_points, self.stride_tensor - else: - anchor_points, stride_tensor = self._generate_anchors(feats) - cls_score_list, reg_dist_list = [], [] - for i, feat in enumerate(feats): - _, _, h, w = feat.shape - l = h * w - 
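`proj_conv`, initialized above with a frozen linspace kernel, turns the discretized distance logits into their expected value (the DFL decode). A NumPy equivalent, using the default `reg_range=(0, 17)` implied by `reg_max=16`:

import numpy as np

def dfl_expectation(reg_logits, reg_range=(0, 17)):
    """Softmax over the distance bins, then the expected bin value.
    reg_logits: (..., n_bins); returns distances in grid units."""
    bins = np.linspace(reg_range[0], reg_range[1] - 1,
                       reg_range[1] - reg_range[0])
    p = np.exp(reg_logits - reg_logits.max(-1, keepdims=True))  # stable softmax
    p /= p.sum(-1, keepdims=True)
    return (p * bins).sum(-1)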
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - reg_dist = reg_dist.reshape( - [-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) - if self.use_shared_conv: - reg_dist = self.proj_conv(F.softmax( - reg_dist, axis=1)).squeeze(1) - else: - reg_dist = F.softmax(reg_dist, axis=1) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) - reg_dist_list.append(reg_dist) - - cls_score_list = paddle.concat(cls_score_list, axis=-1) - if self.use_shared_conv: - reg_dist_list = paddle.concat(reg_dist_list, axis=1) - else: - reg_dist_list = paddle.concat(reg_dist_list, axis=2) - reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) - - return cls_score_list, reg_dist_list, anchor_points, stride_tensor - - def forward(self, feats, targets=None, aux_pred=None): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - if self.training: - return self.forward_train(feats, targets, aux_pred) - else: - if targets is not None: - # only for semi-det - self.is_teacher = targets.get('is_teacher', False) - if self.is_teacher: - return self.forward_train(feats, targets, aux_pred=None) - else: - return self.forward_eval(feats) - - return self.forward_eval(feats) - - @staticmethod - def _focal_loss(score, label, alpha=0.25, gamma=2.0): - weight = (score - label).pow(gamma) - if alpha > 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - weight *= alpha_t - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction='sum') - return loss - - @staticmethod - def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): - weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label - loss = F.binary_cross_entropy( - pred_score, gt_score, weight=weight, reduction='sum') - return loss - - def _bbox_decode(self, anchor_points, pred_dist): - _, l, _ = get_static_shape(pred_dist) - pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) - pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) - return batch_distance2bbox(anchor_points, pred_dist) - - def _bbox_decode_fake(self, pred_dist): - _, l, _ = get_static_shape(pred_dist) - pred_dist_dfl = F.softmax( - pred_dist.reshape([-1, l, 4, self.reg_channels])) - pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2 - ])).squeeze(1) - return pred_dist, pred_dist_dfl - - def _bbox2distance(self, points, bbox): - x1y1, x2y2 = paddle.split(bbox, 2, -1) - lt = points - x1y1 - rb = x2y2 - points - return paddle.concat([lt, rb], -1).clip(self.reg_range[0], - self.reg_range[1] - 1 - 0.01) - - def _df_loss(self, pred_dist, target, lower_bound=0): - target_left = paddle.cast(target.floor(), 'int64') - target_right = target_left + 1 - weight_left = target_right.astype('float32') - target - weight_right = 1 - weight_left - loss_left = F.cross_entropy( - pred_dist, target_left - lower_bound, - reduction='none') * weight_left - loss_right = F.cross_entropy( - pred_dist, target_right - lower_bound, - reduction='none') * weight_right - return (loss_left + loss_right).mean(-1, keepdim=True) - - def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, - assigned_bboxes, assigned_scores, assigned_scores_sum): - # select positive samples mask - mask_positive = (assigned_labels != self.num_classes) - - if self.for_distill: - # only used 
for LD main_kd distill - self.distill_pairs['mask_positive_select'] = mask_positive - - num_pos = mask_positive.sum() - # pos/neg loss - if num_pos > 0: - # l1 + iou - bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( - [1, 1, 4]).astype('bool') - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 4]) - assigned_bboxes_pos = paddle.masked_select( - assigned_bboxes, bbox_mask).reshape([-1, 4]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).unsqueeze(-1) - - loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) - - loss_iou = self.iou_loss(pred_bboxes_pos, - assigned_bboxes_pos) * bbox_weight - loss_iou = loss_iou.sum() / assigned_scores_sum - - dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( - [1, 1, self.reg_channels * 4]).astype('bool') - pred_dist_pos = paddle.masked_select( - pred_dist, dist_mask).reshape([-1, 4, self.reg_channels]) - assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes) - assigned_ltrb_pos = paddle.masked_select( - assigned_ltrb, bbox_mask).reshape([-1, 4]) - loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, - self.reg_range[0]) * bbox_weight - loss_dfl = loss_dfl.sum() / assigned_scores_sum - if self.for_distill: - self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos - self.distill_pairs['pred_dist_pos'] = pred_dist_pos - self.distill_pairs['bbox_weight'] = bbox_weight - else: - loss_l1 = paddle.zeros([1]) - loss_iou = paddle.zeros([1]) - loss_dfl = pred_dist.sum() * 0. - return loss_l1, loss_iou, loss_dfl - - def get_loss(self, head_outs, gt_meta, aux_pred=None): - pred_scores, pred_distri, anchors,\ - anchor_points, num_anchors_list, stride_tensor = head_outs - - anchor_points_s = anchor_points / stride_tensor - pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) - - if aux_pred is not None: - pred_scores_aux = aux_pred[0] - pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1]) - - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - pred_bboxes=pred_bboxes.detach() * stride_tensor) - alpha_l = 0.25 - else: - if self.sm_use: - # only used in smalldet of PPYOLOE-SOD model - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - stride_tensor, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - else: - if aux_pred is None: - if not hasattr(self, "assigned_labels"): - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - if self.for_distill: - self.assigned_labels = assigned_labels - self.assigned_bboxes = assigned_bboxes - self.assigned_scores = assigned_scores - - else: - # only used in distill - assigned_labels = self.assigned_labels - assigned_bboxes = self.assigned_bboxes - assigned_scores = self.assigned_scores - - else: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores_aux.detach(), - pred_bboxes_aux.detach() * stride_tensor, - anchor_points, - num_anchors_list, - gt_labels, - 
gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - # rescale bbox - assigned_bboxes /= stride_tensor - - assign_out_dict = self.get_loss_from_assign( - pred_scores, pred_distri, pred_bboxes, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, alpha_l) - - if aux_pred is not None: - assign_out_dict_aux = self.get_loss_from_assign( - aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, alpha_l) - loss = {} - for key in assign_out_dict.keys(): - loss[key] = assign_out_dict[key] + assign_out_dict_aux[key] - else: - loss = assign_out_dict - - return loss - - def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes, - anchor_points_s, assigned_labels, assigned_bboxes, - assigned_scores, alpha_l): - # cls loss - if self.use_varifocal_loss: - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - loss_cls = self._varifocal_loss(pred_scores, assigned_scores, - one_hot_label) - else: - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) - - assigned_scores_sum = assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(assigned_scores_sum) - assigned_scores_sum /= paddle.distributed.get_world_size() - assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) - loss_cls /= assigned_scores_sum - - if self.for_distill: - self.distill_pairs['pred_cls_scores'] = pred_scores - self.distill_pairs['pos_num'] = assigned_scores_sum - self.distill_pairs['assigned_scores'] = assigned_scores - - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - self.distill_pairs['target_labels'] = one_hot_label - - loss_l1, loss_iou, loss_dfl = \ - self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, - assigned_scores_sum) - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou + \ - self.loss_weight['dfl'] * loss_dfl - out_dict = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_iou': loss_iou, - 'loss_dfl': loss_dfl, - 'loss_l1': loss_l1, - } - return out_dict - - def post_process(self, head_outs, scale_factor): - pred_scores, pred_dist, anchor_points, stride_tensor = head_outs - pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) - pred_bboxes *= stride_tensor - if self.exclude_post_process: - return paddle.concat( - [pred_bboxes, pred_scores.transpose([0, 2, 1])], - axis=-1), None, None - else: - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], - axis=-1).reshape([-1, 1, 4]) - pred_bboxes /= scale_factor - if self.exclude_nms: - # `exclude_nms=True` just use in benchmark - return pred_bboxes, pred_scores, None - else: - bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, - pred_scores) - return bbox_pred, bbox_num, nms_keep_idx - - -def get_activation(name="LeakyReLU"): - if name == "silu": - module = nn.Silu() - elif name == "relu": - module = nn.ReLU() - elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']: - module = nn.LeakyReLU(0.1) - elif name is None: - module = nn.Identity() - else: - raise AttributeError("Unsupported act type: {}".format(name)) - return module - - -class ConvNormLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - norm_type='gn', - activation="LeakyReLU"): - super(ConvNormLayer, 
self).__init__() - assert norm_type in ['bn', 'sync_bn', 'syncbn', 'gn', None] - self.conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias_attr=False, - weight_attr=ParamAttr(initializer=KaimingNormal())) - - if norm_type in ['bn', 'sync_bn', 'syncbn']: - self.norm = nn.BatchNorm2D(out_channels) - elif norm_type == 'gn': - self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels) - else: - self.norm = None - - self.act = get_activation(activation) - - def forward(self, x): - y = self.conv(x) - if self.norm is not None: - y = self.norm(y) - y = self.act(y) - return y - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self, scale=1.0): - super(ScaleReg, self).__init__() - scale = paddle.to_tensor(scale) - self.scale = self.create_parameter( - shape=[1], - dtype='float32', - default_initializer=nn.initializer.Assign(scale)) - - def forward(self, x): - return x * self.scale - - -@register -class SimpleConvHead(nn.Layer): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - feat_in=288, - feat_out=288, - num_convs=1, - fpn_strides=[32, 16, 8, 4], - norm_type='gn', - act='LeakyReLU', - prior_prob=0.01, - reg_max=16): - super(SimpleConvHead, self).__init__() - self.num_classes = num_classes - self.feat_in = feat_in - self.feat_out = feat_out - self.num_convs = num_convs - self.fpn_strides = fpn_strides - self.reg_max = reg_max - - self.cls_convs = nn.LayerList() - self.reg_convs = nn.LayerList() - for i in range(self.num_convs): - in_c = feat_in if i == 0 else feat_out - self.cls_convs.append( - ConvNormLayer( - in_c, - feat_out, - 3, - stride=1, - padding=1, - norm_type=norm_type, - activation=act)) - self.reg_convs.append( - ConvNormLayer( - in_c, - feat_out, - 3, - stride=1, - padding=1, - norm_type=norm_type, - activation=act)) - - bias_cls = bias_init_with_prob(prior_prob) - self.gfl_cls = nn.Conv2D( - feat_out, - self.num_classes, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=bias_cls))) - self.gfl_reg = nn.Conv2D( - feat_out, - 4 * (self.reg_max + 1), - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0))) - - self.scales = nn.LayerList() - for i in range(len(self.fpn_strides)): - self.scales.append(ScaleReg(1.0)) - - def forward(self, feats): - cls_scores = [] - bbox_preds = [] - for x, scale in zip(feats, self.scales): - cls_feat = x - reg_feat = x - for cls_conv in self.cls_convs: - cls_feat = cls_conv(cls_feat) - for reg_conv in self.reg_convs: - reg_feat = reg_conv(reg_feat) - - cls_score = self.gfl_cls(cls_feat) - cls_score = F.sigmoid(cls_score) - cls_score = cls_score.flatten(2).transpose([0, 2, 1]) - cls_scores.append(cls_score) - - bbox_pred = scale(self.gfl_reg(reg_feat)) - bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) - bbox_preds.append(bbox_pred) - - cls_scores = paddle.concat(cls_scores, axis=1) - bbox_preds = paddle.concat(bbox_preds, axis=1) - return cls_scores, bbox_preds diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_r_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_r_head.py deleted file mode 100644 index e7cf772..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_r_head.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ..losses import ProbIoULoss -from ..initializer import bias_init_with_prob, constant_, normal_, vector_ -from ppdet.modeling.backbones.cspresnet import ConvBNLayer -from ppdet.modeling.ops import get_static_shape, get_act_fn, anchor_generator -from ppdet.modeling.layers import MultiClassNMS - -__all__ = ['PPYOLOERHead'] - - -class ESEAttn(nn.Layer): - def __init__(self, feat_channels, act='swish'): - super(ESEAttn, self).__init__() - self.fc = nn.Conv2D(feat_channels, feat_channels, 1) - self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) - - self._init_weights() - - def _init_weights(self): - normal_(self.fc.weight, std=0.01) - - def forward(self, feat, avg_feat): - weight = F.sigmoid(self.fc(avg_feat)) - return self.conv(feat * weight) - - -@register -class PPYOLOERHead(nn.Layer): - __shared__ = ['num_classes', 'trt', 'export_onnx'] - __inject__ = ['static_assigner', 'assigner', 'nms'] - - def __init__(self, - in_channels=[1024, 512, 256], - num_classes=15, - act='swish', - fpn_strides=(32, 16, 8), - grid_cell_offset=0.5, - angle_max=90, - use_varifocal_loss=True, - static_assigner_epoch=4, - trt=False, - export_onnx=False, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - nms='MultiClassNMS', - loss_weight={'class': 1.0, - 'iou': 2.5, - 'dfl': 0.05}): - super(PPYOLOERHead, self).__init__() - assert len(in_channels) > 0, "len(in_channels) should > 0" - self.in_channels = in_channels - self.num_classes = num_classes - self.fpn_strides = fpn_strides - self.grid_cell_offset = grid_cell_offset - self.angle_max = angle_max - self.loss_weight = loss_weight - self.use_varifocal_loss = use_varifocal_loss - self.half_pi = paddle.to_tensor( - [1.5707963267948966], dtype=paddle.float32) - self.half_pi_bin = self.half_pi / angle_max - self.iou_loss = ProbIoULoss() - self.static_assigner_epoch = static_assigner_epoch - self.static_assigner = static_assigner - self.assigner = assigner - self.nms = nms - # stem - self.stem_cls = nn.LayerList() - self.stem_reg = nn.LayerList() - self.stem_angle = nn.LayerList() - trt = False if export_onnx else trt - self.export_onnx = export_onnx - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.trt = trt - for in_c in self.in_channels: - self.stem_cls.append(ESEAttn(in_c, act=act)) - self.stem_reg.append(ESEAttn(in_c, act=act)) - self.stem_angle.append(ESEAttn(in_c, act=act)) - # pred head - self.pred_cls = nn.LayerList() - self.pred_reg = nn.LayerList() - self.pred_angle = nn.LayerList() - for in_c in self.in_channels: - self.pred_cls.append( - nn.Conv2D( - in_c, self.num_classes, 3, padding=1)) - self.pred_reg.append(nn.Conv2D(in_c, 4, 3, padding=1)) - self.pred_angle.append( - nn.Conv2D( - in_c, self.angle_max + 1, 3, padding=1)) - self.angle_proj_conv = nn.Conv2D( - 
self.angle_max + 1, 1, 1, bias_attr=False) - self._init_weights() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - bias_angle = [10.] + [1.] * self.angle_max - for cls_, reg_, angle_ in zip(self.pred_cls, self.pred_reg, - self.pred_angle): - normal_(cls_.weight, std=0.01) - constant_(cls_.bias, bias_cls) - normal_(reg_.weight, std=0.01) - constant_(reg_.bias) - constant_(angle_.weight) - vector_(angle_.bias, bias_angle) - - angle_proj = paddle.linspace(0, self.angle_max, self.angle_max + 1) - self.angle_proj = angle_proj * self.half_pi_bin - self.angle_proj_conv.weight.set_value( - self.angle_proj.reshape([1, self.angle_max + 1, 1, 1])) - self.angle_proj_conv.weight.stop_gradient = True - - def _generate_anchors(self, feats): - if self.trt: - anchor_points = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - anchor, _ = anchor_generator( - feat, - stride * 4, - 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], - offset=0.5) - x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) - xc = (x1 + x2 + 1) / 2 - yc = (y1 + y2 + 1) / 2 - anchor_point = paddle.concat( - [xc, yc], axis=-1).reshape((1, h * w, 2)) - anchor_points.append(anchor_point) - anchor_points = paddle.concat(anchor_points, axis=1) - return anchor_points, None, None - else: - anchor_points = [] - stride_tensor = [] - num_anchors_list = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - shift_x = (paddle.arange(end=w) + 0.5) * stride - shift_y = (paddle.arange(end=h) + 0.5) * stride - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([1, -1, 2])) - stride_tensor.append( - paddle.full( - [1, h * w, 1], stride, dtype='float32')) - num_anchors_list.append(h * w) - anchor_points = paddle.concat(anchor_points, axis=1) - stride_tensor = paddle.concat(stride_tensor, axis=1) - return anchor_points, stride_tensor, num_anchors_list - - def forward(self, feats, targets=None): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - if self.training: - return self.forward_train(feats, targets) - else: - return self.forward_eval(feats) - - def forward_train(self, feats, targets): - anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( - feats) - - cls_score_list, reg_dist_list, reg_angle_list = [], [], [] - for i, feat in enumerate(feats): - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_dist_list.append(reg_dist.flatten(2).transpose([0, 2, 1])) - reg_angle_list.append(reg_angle.flatten(2).transpose([0, 2, 1])) - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_dist_list = paddle.concat(reg_dist_list, axis=1) - reg_angle_list = paddle.concat(reg_angle_list, axis=1) - - return self.get_loss([ - cls_score_list, reg_dist_list, reg_angle_list, anchor_points, - num_anchors_list, stride_tensor - ], targets) - - def forward_eval(self, feats): - cls_score_list, reg_box_list = [], [] - 
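The frozen `angle_proj_conv` set up above decodes the rotated head's angle the same way as the DFL decode: softmax over `angle_max + 1` bins, then the expected bin index scaled to radians. A NumPy sketch with the default `angle_max=90`:

import numpy as np

def angle_expectation(angle_logits, angle_max=90):
    """angle_logits: (..., angle_max + 1); returns angles in [0, pi/2]."""
    half_pi_bin = (np.pi / 2) / angle_max
    proj = np.arange(angle_max + 1) * half_pi_bin   # the frozen conv kernel
    p = np.exp(angle_logits - angle_logits.max(-1, keepdims=True))
    p /= p.sum(-1, keepdims=True)
    return (p * proj).sum(-1)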
anchor_points, _, _ = self._generate_anchors(feats) - for i, (feat, stride) in enumerate(zip(feats, self.fpn_strides)): - b, _, h, w = paddle.shape(feat) - l = h * w - # cls - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - # reg - reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - reg_xy, reg_wh = paddle.split(reg_dist, 2, axis=1) - reg_xy = reg_xy * stride - reg_wh = (F.elu(reg_wh) + 1.) * stride - reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) - reg_angle = self.angle_proj_conv(F.softmax(reg_angle, axis=1)) - reg_box = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.reshape([b, self.num_classes, l])) - reg_box_list.append(reg_box.reshape([b, 5, l])) - - cls_score_list = paddle.concat(cls_score_list, axis=-1) - reg_box_list = paddle.concat(reg_box_list, axis=-1).transpose([0, 2, 1]) - reg_xy, reg_wha = paddle.split(reg_box_list, [2, 3], axis=-1) - reg_xy = reg_xy + anchor_points - reg_box_list = paddle.concat([reg_xy, reg_wha], axis=-1) - return cls_score_list, reg_box_list - - def _bbox_decode(self, points, pred_dist, pred_angle, stride_tensor): - # predict vector to x, y, w, h, angle - b, l = pred_angle.shape[:2] - xy, wh = paddle.split(pred_dist, 2, axis=-1) - xy = xy * stride_tensor + points - wh = (F.elu(wh) + 1.) * stride_tensor - angle = F.softmax(pred_angle.reshape([b, l, 1, self.angle_max + 1 - ])).matmul(self.angle_proj) - return paddle.concat([xy, wh, angle], axis=-1) - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_dist, pred_angle, \ - anchor_points, num_anchors_list, stride_tensor = head_outs - # [B, N, 5] -> [B, N, 5] - pred_bboxes = self._bbox_decode(anchor_points, pred_dist, pred_angle, - stride_tensor) - gt_labels = gt_meta['gt_class'] - # [B, N, 5] - gt_bboxes = gt_meta['gt_rbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.static_assigner( - anchor_points, - stride_tensor, - num_anchors_list, - gt_labels, - gt_meta['gt_bbox'], - gt_bboxes, - pad_gt_mask, - self.num_classes, - pred_bboxes.detach() - ) - else: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach(), - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - # cls loss - if self.use_varifocal_loss: - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - loss_cls = self._varifocal_loss(pred_scores, assigned_scores, - one_hot_label) - else: - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) - - assigned_scores_sum = assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(assigned_scores_sum) - assigned_scores_sum = paddle.clip( - assigned_scores_sum / paddle.distributed.get_world_size(), - min=1.) - else: - assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) 
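`forward_eval` and `_bbox_decode` above recover rotated-box centers and sizes from stride-normalized offsets, passing sizes through elu+1 so they stay positive. A small NumPy stand-in (array shapes left implicit):

import numpy as np

def decode_rbox_xywh(xy_pred, wh_pred, points, stride):
    """Center offsets are scaled by stride and added to the grid point;
    widths/heights go through elu(x) + 1 to stay positive."""
    elu = np.where(wh_pred > 0, wh_pred, np.expm1(np.minimum(wh_pred, 0)))
    xy = xy_pred * stride + points
    wh = (elu + 1.0) * stride
    return xy, wh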
- loss_cls /= assigned_scores_sum - - loss_iou, loss_dfl = self._bbox_loss(pred_angle, pred_bboxes, - anchor_points, assigned_labels, - assigned_bboxes, assigned_scores, - assigned_scores_sum, stride_tensor) - - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou + \ - self.loss_weight['dfl'] * loss_dfl - out_dict = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_iou': loss_iou, - 'loss_dfl': loss_dfl - } - return out_dict - - @staticmethod - def _focal_loss(score, label, alpha=0.25, gamma=2.0): - weight = (score - label).pow(gamma) - if alpha > 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - weight *= alpha_t - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction='sum') - return loss - - @staticmethod - def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): - weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label - loss = F.binary_cross_entropy( - pred_score, gt_score, weight=weight, reduction='sum') - return loss - - @staticmethod - def _df_loss(pred_dist, target): - target_left = paddle.cast(target, 'int64') - target_right = target_left + 1 - weight_left = target_right.astype('float32') - target - weight_right = 1 - weight_left - loss_left = F.cross_entropy( - pred_dist, target_left, reduction='none') * weight_left - loss_right = F.cross_entropy( - pred_dist, target_right, reduction='none') * weight_right - return (loss_left + loss_right).mean(-1, keepdim=True) - - def _bbox_loss(self, pred_angle, pred_bboxes, anchor_points, - assigned_labels, assigned_bboxes, assigned_scores, - assigned_scores_sum, stride_tensor): - # select positive samples mask - mask_positive = (assigned_labels != self.num_classes) - num_pos = mask_positive.sum() - # pos/neg loss - if num_pos > 0: - # iou - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 5]) - assigned_bboxes_pos = paddle.masked_select( - assigned_bboxes, bbox_mask).reshape([-1, 5]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).reshape([-1]) - - loss_iou = self.iou_loss(pred_bboxes_pos, - assigned_bboxes_pos) * bbox_weight - loss_iou = loss_iou.sum() / assigned_scores_sum - - # dfl - angle_mask = mask_positive.unsqueeze(-1).tile( - [1, 1, self.angle_max + 1]) - pred_angle_pos = paddle.masked_select( - pred_angle, angle_mask).reshape([-1, self.angle_max + 1]) - assigned_angle_pos = ( - assigned_bboxes_pos[:, 4] / - self.half_pi_bin).clip(0, self.angle_max - 0.01) - loss_dfl = self._df_loss(pred_angle_pos, assigned_angle_pos) - else: - loss_iou = pred_bboxes.sum() * 0. 
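`_df_loss` above supervises each fractional distance target by splitting it across its two neighbouring integer bins with linear weights (targets are non-negative, so the integer cast and floor agree). An illustrative NumPy helper:

import numpy as np

def df_loss_weights(target):
    """e.g. a target of 3.3 puts weight 0.7 on bin 3 and 0.3 on bin 4."""
    left = np.floor(target).astype(int)
    right = left + 1
    w_left = right - target
    w_right = 1.0 - w_left
    return (left, w_left), (right, w_right)

(l, wl), (r, wr) = df_loss_weights(np.array([3.3]))
print(l, wl, r, wr)   # bins (3, 4) with weights (~0.7, ~0.3)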
- loss_dfl = paddle.zeros([1]) - - return loss_iou, loss_dfl - - def _box2corners(self, pred_bboxes): - """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) - - Args: - pred_bboxes (Tensor): [B, N, 5] - - Returns: - polys (Tensor): [B, N, 8] - """ - x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) - cos_a_half = paddle.cos(angle) * 0.5 - sin_a_half = paddle.sin(angle) * 0.5 - w_x = cos_a_half * w - w_y = sin_a_half * w - h_x = -sin_a_half * h - h_y = cos_a_half * h - return paddle.concat( - [ - x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, - x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y - ], - axis=-1) - - def post_process(self, head_outs, scale_factor): - pred_scores, pred_bboxes = head_outs - # [B, N, 5] -> [B, N, 8] - pred_bboxes = self._box2corners(pred_bboxes) - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ], - axis=-1).reshape([-1, 1, 8]) - pred_bboxes /= scale_factor - if self.export_onnx: - return pred_bboxes, pred_scores, None - bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, - pred_scores) - return bbox_pred, bbox_num, nms_keep_idx diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/retina_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/retina_head.py deleted file mode 100644 index 67a5126..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/retina_head.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant -from ppdet.modeling.bbox_utils import bbox2delta, delta2bbox -from ppdet.modeling.heads.fcos_head import FCOSFeat - -from ppdet.core.workspace import register - -__all__ = ['RetinaHead'] - - -@register -class RetinaFeat(FCOSFeat): - """We use FCOSFeat to construct conv layers in RetinaNet. - We rename FCOSFeat to RetinaFeat to avoid confusion. 
- """ - pass - - -@register -class RetinaHead(nn.Layer): - """Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf - """ - __shared__ = ['num_classes'] - __inject__ = [ - 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', - 'loss_bbox', 'nms' - ] - - def __init__(self, - num_classes=80, - conv_feat='RetinaFeat', - anchor_generator='RetinaAnchorGenerator', - bbox_assigner='MaxIoUAssigner', - loss_class='FocalLoss', - loss_bbox='SmoothL1Loss', - nms='MultiClassNMS', - prior_prob=0.01, - nms_pre=1000, - weights=[1., 1., 1., 1.]): - super(RetinaHead, self).__init__() - self.num_classes = num_classes - self.conv_feat = conv_feat - self.anchor_generator = anchor_generator - self.bbox_assigner = bbox_assigner - self.loss_class = loss_class - self.loss_bbox = loss_bbox - self.nms = nms - self.nms_pre = nms_pre - self.weights = weights - - bias_init_value = -math.log((1 - prior_prob) / prior_prob) - num_anchors = self.anchor_generator.num_anchors - self.retina_cls = nn.Conv2D( - in_channels=self.conv_feat.feat_out, - out_channels=self.num_classes * num_anchors, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=bias_init_value))) - self.retina_reg = nn.Conv2D( - in_channels=self.conv_feat.feat_out, - out_channels=4 * num_anchors, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0))) - - def forward(self, neck_feats, targets=None): - cls_logits_list = [] - bboxes_reg_list = [] - for neck_feat in neck_feats: - conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat) - cls_logits = self.retina_cls(conv_cls_feat) - bbox_reg = self.retina_reg(conv_reg_feat) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) - - if self.training: - return self.get_loss([cls_logits_list, bboxes_reg_list], targets) - else: - return [cls_logits_list, bboxes_reg_list] - - def get_loss(self, head_outputs, targets): - """Here we calculate loss for a batch of images. - We assign anchors to gts in each image and gather all the assigned - postive and negative samples. Then loss is calculated on the gathered - samples. 
- """ - cls_logits_list, bboxes_reg_list = head_outputs - anchors = self.anchor_generator(cls_logits_list) - anchors = paddle.concat(anchors) - - # matches: contain gt_inds - # match_labels: -1(ignore), 0(neg) or 1(pos) - matches_list, match_labels_list = [], [] - # assign anchors to gts, no sampling is involved - for gt_bbox in targets['gt_bbox']: - matches, match_labels = self.bbox_assigner(anchors, gt_bbox) - matches_list.append(matches) - match_labels_list.append(match_labels) - - # reshape network outputs - cls_logits = [ - _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes]) - for _ in cls_logits_list - ] - bboxes_reg = [ - _.transpose([0, 2, 3, 1]).reshape([0, -1, 4]) - for _ in bboxes_reg_list - ] - cls_logits = paddle.concat(cls_logits, axis=1) - bboxes_reg = paddle.concat(bboxes_reg, axis=1) - - cls_pred_list, cls_tar_list = [], [] - reg_pred_list, reg_tar_list = [], [] - # find and gather preds and targets in each image - for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \ - zip(matches_list, match_labels_list, cls_logits, bboxes_reg, - targets['gt_bbox'], targets['gt_class']): - pos_mask = (match_labels == 1) - neg_mask = (match_labels == 0) - chosen_mask = paddle.logical_or(pos_mask, neg_mask) - - gt_class = gt_class.reshape([-1]) - bg_class = paddle.to_tensor( - [self.num_classes], dtype=gt_class.dtype) - # a trick to assign num_classes to negative targets - gt_class = paddle.concat([gt_class, bg_class], axis=-1) - matches = paddle.where(neg_mask, - paddle.full_like(matches, gt_class.size - 1), - matches) - - cls_pred = cls_logit[chosen_mask] - cls_tar = gt_class[matches[chosen_mask]] - reg_pred = bbox_reg[pos_mask].reshape([-1, 4]) - reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4]) - reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights) - cls_pred_list.append(cls_pred) - cls_tar_list.append(cls_tar) - reg_pred_list.append(reg_pred) - reg_tar_list.append(reg_tar) - cls_pred = paddle.concat(cls_pred_list) - cls_tar = paddle.concat(cls_tar_list) - reg_pred = paddle.concat(reg_pred_list) - reg_tar = paddle.concat(reg_tar_list) - - avg_factor = max(1.0, reg_pred.shape[0]) - cls_loss = self.loss_class( - cls_pred, cls_tar, reduction='sum') / avg_factor - - if reg_pred.shape[0] == 0: - reg_loss = paddle.zeros([1]) - reg_loss.stop_gradient = False - else: - reg_loss = self.loss_bbox( - reg_pred, reg_tar, reduction='sum') / avg_factor - - loss = cls_loss + reg_loss - out_dict = { - 'loss_cls': cls_loss, - 'loss_reg': reg_loss, - 'loss': loss, - } - return out_dict - - def get_bboxes_single(self, - anchors, - cls_scores_list, - bbox_preds_list, - im_shape, - scale_factor, - rescale=True): - assert len(cls_scores_list) == len(bbox_preds_list) - mlvl_bboxes = [] - mlvl_scores = [] - for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list, - bbox_preds_list): - cls_score = cls_score.reshape([-1, self.num_classes]) - bbox_pred = bbox_pred.reshape([-1, 4]) - if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: - max_score = cls_score.max(axis=1) - _, topk_inds = max_score.topk(self.nms_pre) - bbox_pred = bbox_pred.gather(topk_inds) - anchor = anchor.gather(topk_inds) - cls_score = cls_score.gather(topk_inds) - bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze() - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(F.sigmoid(cls_score)) - mlvl_bboxes = paddle.concat(mlvl_bboxes) - mlvl_bboxes = paddle.squeeze(mlvl_bboxes) - if rescale: - mlvl_bboxes = mlvl_bboxes / paddle.concat( - [scale_factor[::-1], 
scale_factor[::-1]]) - mlvl_scores = paddle.concat(mlvl_scores) - mlvl_scores = mlvl_scores.transpose([1, 0]) - return mlvl_bboxes, mlvl_scores - - def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor): - batch_bboxes = [] - batch_scores = [] - for img_id in range(cls_logits[0].shape[0]): - num_lvls = len(cls_logits) - cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] - bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)] - bboxes, scores = self.get_bboxes_single( - anchors, cls_scores_list, bbox_preds_list, im_shape[img_id], - scale_factor[img_id]) - batch_bboxes.append(bboxes) - batch_scores.append(scores) - batch_bboxes = paddle.stack(batch_bboxes, axis=0) - batch_scores = paddle.stack(batch_scores, axis=0) - return batch_bboxes, batch_scores - - def post_process(self, head_outputs, im_shape, scale_factor): - cls_logits_list, bboxes_reg_list = head_outputs - anchors = self.anchor_generator(cls_logits_list) - cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] - bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list] - bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape, - scale_factor) - - bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, scores) - return bbox_pred, bbox_num, nms_keep_idx - - - def get_scores_single(self, cls_scores_list): - mlvl_logits = [] - for cls_score in cls_scores_list: - cls_score = cls_score.reshape([-1, self.num_classes]) - if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: - max_score = cls_score.max(axis=1) - _, topk_inds = max_score.topk(self.nms_pre) - cls_score = cls_score.gather(topk_inds) - - mlvl_logits.append(cls_score) - - mlvl_logits = paddle.concat(mlvl_logits) - mlvl_logits = mlvl_logits.transpose([1, 0]) - - return mlvl_logits - - def decode_cls_logits(self, cls_logits_list): - cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] - batch_logits = [] - for img_id in range(cls_logits[0].shape[0]): - num_lvls = len(cls_logits) - cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] - logits = self.get_scores_single(cls_scores_list) - batch_logits.append(logits) - batch_logits = paddle.stack(batch_logits, axis=0) - return batch_logits - diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/roi_extractor.py b/pdfdet/models/Paddle/ppdet/modeling/heads/roi_extractor.py deleted file mode 100644 index 6c2f5c8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/roi_extractor.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
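`get_bboxes_single` and `get_scores_single` above cap the per-level candidates at `nms_pre` by best-class score before NMS runs. A NumPy sketch of that pre-filter:

import numpy as np

def topk_prefilter(cls_score, bbox_pred, nms_pre=1000):
    """Keep only the nms_pre candidates with the highest best-class score.
    cls_score: (N, num_classes); bbox_pred: (N, 4)."""
    if cls_score.shape[0] <= nms_pre:
        return cls_score, bbox_pred
    max_score = cls_score.max(axis=1)
    keep = np.argsort(-max_score)[:nms_pre]
    return cls_score[keep], bbox_pred[keep]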
- -import paddle -from ppdet.core.workspace import register -from ppdet.modeling import ops -import paddle.nn as nn - - -def _to_list(v): - if not isinstance(v, (list, tuple)): - return [v] - return v - - -@register -class RoIAlign(nn.Layer): - """ - RoI Align module - - For more details, please refer to the document of roi_align in - in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/ops.py - - Args: - resolution (int): The output size, default 14 - spatial_scale (float): Multiplicative spatial scale factor to translate - ROI coords from their input scale to the scale used when pooling. - default 0.0625 - sampling_ratio (int): The number of sampling points in the interpolation - grid, default 0 - canconical_level (int): The referring level of FPN layer with - specified level. default 4 - canonical_size (int): The referring scale of FPN layer with - specified scale. default 224 - start_level (int): The start level of FPN layer to extract RoI feature, - default 0 - end_level (int): The end level of FPN layer to extract RoI feature, - default 3 - aligned (bool): Whether to add offset to rois' coord in roi_align. - default false - """ - - def __init__(self, - resolution=14, - spatial_scale=0.0625, - sampling_ratio=0, - canconical_level=4, - canonical_size=224, - start_level=0, - end_level=3, - aligned=False): - super(RoIAlign, self).__init__() - self.resolution = resolution - self.spatial_scale = _to_list(spatial_scale) - self.sampling_ratio = sampling_ratio - self.canconical_level = canconical_level - self.canonical_size = canonical_size - self.start_level = start_level - self.end_level = end_level - self.aligned = aligned - - @classmethod - def from_config(cls, cfg, input_shape): - return {'spatial_scale': [1. / i.stride for i in input_shape]} - - def forward(self, feats, roi, rois_num): - roi = paddle.concat(roi) if len(roi) > 1 else roi[0] - if len(feats) == 1: - rois_feat = paddle.vision.ops.roi_align( - x=feats[self.start_level], - boxes=roi, - boxes_num=rois_num, - output_size=self.resolution, - spatial_scale=self.spatial_scale[0], - aligned=self.aligned) - else: - offset = 2 - k_min = self.start_level + offset - k_max = self.end_level + offset - if hasattr(paddle.vision.ops, "distribute_fpn_proposals"): - distribute_fpn_proposals = getattr(paddle.vision.ops, - "distribute_fpn_proposals") - else: - distribute_fpn_proposals = ops.distribute_fpn_proposals - rois_dist, restore_index, rois_num_dist = distribute_fpn_proposals( - roi, - k_min, - k_max, - self.canconical_level, - self.canonical_size, - rois_num=rois_num) - - rois_feat_list = [] - for lvl in range(self.start_level, self.end_level + 1): - roi_feat = paddle.vision.ops.roi_align( - x=feats[lvl], - boxes=rois_dist[lvl], - boxes_num=rois_num_dist[lvl], - output_size=self.resolution, - spatial_scale=self.spatial_scale[lvl], - sampling_ratio=self.sampling_ratio, - aligned=self.aligned) - rois_feat_list.append(roi_feat) - rois_feat_shuffle = paddle.concat(rois_feat_list) - rois_feat = paddle.gather(rois_feat_shuffle, restore_index) - - return rois_feat diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/s2anet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/s2anet_head.py deleted file mode 100644 index 99fd13a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/s2anet_head.py +++ /dev/null @@ -1,745 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
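`RoIAlign.forward` above routes each RoI to an FPN level via `distribute_fpn_proposals` with `canconical_level=4` and `canonical_size=224`. A simplified single-RoI sketch of the usual FPN heuristic behind that routing (level offsets chosen to match the start/end levels above):

import numpy as np

def fpn_level_for_roi(roi, k0=4, canonical_size=224, k_min=2, k_max=5):
    """Route an (x1, y1, x2, y2) RoI to level k0 + log2(sqrt(area)/224),
    clamped to the available pyramid levels."""
    x1, y1, x2, y2 = roi
    scale = np.sqrt(max(x2 - x1, 0) * max(y2 - y1, 0))
    k = int(np.floor(k0 + np.log2(scale / canonical_size + 1e-6)))
    return min(max(k, k_min), k_max)

print(fpn_level_for_roi((0, 0, 224, 224)))  # 4: canonical-size boxes stay on level 4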
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/models/anchor_heads_rotated/s2anet_head.py - -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant -from ppdet.core.workspace import register -from ppdet.modeling.proposal_generator.target_layer import RBoxAssigner -from ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator -from ppdet.modeling.layers import AlignConv -from ..cls_utils import _get_class_default_kwargs -import numpy as np - - -@register -class S2ANetHead(nn.Layer): - """ - S2Anet head - Args: - stacked_convs (int): number of stacked_convs - feat_in (int): input channels of feat - feat_out (int): output channels of feat - num_classes (int): num_classes - anchor_strides (list): stride of anchors - anchor_scales (list): scale of anchors - anchor_ratios (list): ratios of anchors - target_means (list): target_means - target_stds (list): target_stds - align_conv_type (str): align_conv_type ['Conv', 'AlignConv'] - align_conv_size (int): kernel size of align_conv - use_sigmoid_cls (bool): use sigmoid_cls or not - reg_loss_weight (list): loss weight for regression - """ - __shared__ = ['num_classes'] - __inject__ = ['anchor_assign', 'nms'] - - def __init__(self, - stacked_convs=2, - feat_in=256, - feat_out=256, - num_classes=15, - anchor_strides=[8, 16, 32, 64, 128], - anchor_scales=[4], - anchor_ratios=[1.0], - target_means=0.0, - target_stds=1.0, - align_conv_type='AlignConv', - align_conv_size=3, - use_sigmoid_cls=True, - anchor_assign=_get_class_default_kwargs(RBoxAssigner), - reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1], - cls_loss_weight=[1.1, 1.05], - reg_loss_type='l1', - nms_pre=2000, - nms='MultiClassNMS'): - super(S2ANetHead, self).__init__() - self.stacked_convs = stacked_convs - self.feat_in = feat_in - self.feat_out = feat_out - self.anchor_list = None - self.anchor_scales = anchor_scales - self.anchor_ratios = anchor_ratios - self.anchor_strides = anchor_strides - self.anchor_strides = paddle.to_tensor(anchor_strides) - self.anchor_base_sizes = list(anchor_strides) - self.means = paddle.ones(shape=[5]) * target_means - self.stds = paddle.ones(shape=[5]) * target_stds - assert align_conv_type in ['AlignConv', 'Conv', 'DCN'] - self.align_conv_type = align_conv_type - self.align_conv_size = align_conv_size - - self.use_sigmoid_cls = use_sigmoid_cls - self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1 - self.sampling = False - self.anchor_assign = anchor_assign - self.reg_loss_weight = reg_loss_weight - self.cls_loss_weight = cls_loss_weight - self.alpha = 1.0 - self.beta = 1.0 - self.reg_loss_type = reg_loss_type - self.nms_pre = nms_pre - self.nms = nms - self.fake_bbox = paddle.to_tensor( - np.array( - [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], - dtype='float32')) - self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - - # 
anchor - self.anchor_generators = [] - for anchor_base in self.anchor_base_sizes: - self.anchor_generators.append( - S2ANetAnchorGenerator(anchor_base, anchor_scales, - anchor_ratios)) - - self.anchor_generators = nn.LayerList(self.anchor_generators) - self.fam_cls_convs = nn.Sequential() - self.fam_reg_convs = nn.Sequential() - - for i in range(self.stacked_convs): - chan_in = self.feat_in if i == 0 else self.feat_out - - self.fam_cls_convs.add_sublayer( - 'fam_cls_conv_{}'.format(i), - nn.Conv2D( - in_channels=chan_in, - out_channels=self.feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i), - nn.ReLU()) - - self.fam_reg_convs.add_sublayer( - 'fam_reg_conv_{}'.format(i), - nn.Conv2D( - in_channels=chan_in, - out_channels=self.feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i), - nn.ReLU()) - - self.fam_reg = nn.Conv2D( - self.feat_out, - 5, - 1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - prior_prob = 0.01 - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - self.fam_cls = nn.Conv2D( - self.feat_out, - self.cls_out_channels, - 1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(bias_init))) - - if self.align_conv_type == "AlignConv": - self.align_conv = AlignConv(self.feat_out, self.feat_out, - self.align_conv_size) - elif self.align_conv_type == "Conv": - self.align_conv = nn.Conv2D( - self.feat_out, - self.feat_out, - self.align_conv_size, - padding=(self.align_conv_size - 1) // 2, - bias_attr=ParamAttr(initializer=Constant(0))) - - elif self.align_conv_type == "DCN": - self.align_conv_offset = nn.Conv2D( - self.feat_out, - 2 * self.align_conv_size**2, - 1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - - self.align_conv = paddle.vision.ops.DeformConv2D( - self.feat_out, - self.feat_out, - self.align_conv_size, - padding=(self.align_conv_size - 1) // 2, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=False) - - self.or_conv = nn.Conv2D( - self.feat_out, - self.feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - - # ODM - self.odm_cls_convs = nn.Sequential() - self.odm_reg_convs = nn.Sequential() - - for i in range(self.stacked_convs): - ch_in = self.feat_out - # ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out - - self.odm_cls_convs.add_sublayer( - 'odm_cls_conv_{}'.format(i), - nn.Conv2D( - in_channels=ch_in, - out_channels=self.feat_out, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i), - nn.ReLU()) - - self.odm_reg_convs.add_sublayer( - 'odm_reg_conv_{}'.format(i), - nn.Conv2D( - in_channels=self.feat_out, - out_channels=self.feat_out, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i), - nn.ReLU()) - - self.odm_cls = nn.Conv2D( - 
self.feat_out, - self.cls_out_channels, - 3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(bias_init))) - self.odm_reg = nn.Conv2D( - self.feat_out, - 5, - 3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - - def forward(self, feats, targets=None): - fam_reg_list, fam_cls_list = [], [] - odm_reg_list, odm_cls_list = [], [] - num_anchors_list, base_anchors_list, refine_anchors_list = [], [], [] - - for i, feat in enumerate(feats): - # get shape - B = feat.shape[0] - H, W = paddle.shape(feat)[2], paddle.shape(feat)[3] - - NA = H * W - num_anchors_list.append(NA) - - fam_cls_feat = self.fam_cls_convs(feat) - fam_cls = self.fam_cls(fam_cls_feat) - # [N, CLS, H, W] --> [N, H, W, CLS] - fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape( - [B, NA, self.cls_out_channels]) - fam_cls_list.append(fam_cls) - - fam_reg_feat = self.fam_reg_convs(feat) - fam_reg = self.fam_reg(fam_reg_feat) - # [N, 5, H, W] --> [N, H, W, 5] - fam_reg = fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) - fam_reg_list.append(fam_reg) - - # prepare anchor - init_anchors = self.anchor_generators[i]((H, W), - self.anchor_strides[i]) - init_anchors = init_anchors.reshape([1, NA, 5]) - base_anchors_list.append(init_anchors.squeeze(0)) - - if self.training: - refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors) - else: - refine_anchor = self.bbox_decode(fam_reg, init_anchors) - - refine_anchors_list.append(refine_anchor) - - if self.align_conv_type == 'AlignConv': - align_feat = self.align_conv(feat, - refine_anchor.clone(), (H, W), - self.anchor_strides[i]) - elif self.align_conv_type == 'DCN': - align_offset = self.align_conv_offset(feat) - align_feat = self.align_conv(feat, align_offset) - elif self.align_conv_type == 'Conv': - align_feat = self.align_conv(feat) - - or_feat = self.or_conv(align_feat) - odm_reg_feat = or_feat - odm_cls_feat = or_feat - - odm_reg_feat = self.odm_reg_convs(odm_reg_feat) - odm_cls_feat = self.odm_cls_convs(odm_cls_feat) - - odm_cls = self.odm_cls(odm_cls_feat) - # [N, CLS, H, W] --> [N, H, W, CLS] - odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape( - [B, NA, self.cls_out_channels]) - odm_cls_list.append(odm_cls) - - odm_reg = self.odm_reg(odm_reg_feat) - # [N, 5, H, W] --> [N, H, W, 5] - odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) - odm_reg_list.append(odm_reg) - - if self.training: - return self.get_loss([ - fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, - num_anchors_list, base_anchors_list, refine_anchors_list - ], targets) - else: - odm_bboxes_list = [] - for odm_reg, refine_anchor in zip(odm_reg_list, - refine_anchors_list): - odm_bboxes = self.bbox_decode(odm_reg, refine_anchor) - odm_bboxes_list.append(odm_bboxes) - return [odm_bboxes_list, odm_cls_list] - - def get_bboxes(self, head_outs): - perd_bboxes_list, pred_scores_list = head_outs - batch = paddle.shape(pred_scores_list[0])[0] - bboxes, bbox_num = [], [] - for i in range(batch): - pred_scores_per_image = [t[i] for t in pred_scores_list] - pred_bboxes_per_image = [t[i] for t in perd_bboxes_list] - bbox_per_image, bbox_num_per_image = self.get_bboxes_single( - pred_scores_per_image, pred_bboxes_per_image) - bboxes.append(bbox_per_image) - bbox_num.append(bbox_num_per_image) - - bboxes = paddle.concat(bboxes) - bbox_num = paddle.concat(bbox_num) - return bboxes, bbox_num - - def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): - """ - Rescale, clip 
and filter the bbox from the output of NMS to - get final prediction. - Args: - bboxes(Tensor): bboxes [N, 10] - bbox_num(Tensor): bbox_num - im_shape(Tensor): [1 2] - scale_factor(Tensor): [1 2] - Returns: - bbox_pred(Tensor): The output is the prediction with shape [N, 8] - including labels, scores and bboxes. The size of - bboxes are corresponding to the original image. - """ - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - - origin_shape_list = [] - scale_factor_list = [] - # scale_factor: scale_y, scale_x - for i in range(bbox_num.shape[0]): - expand_shape = paddle.expand(origin_shape[i:i + 1, :], - [bbox_num[i], 2]) - scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2] - scale = paddle.concat([ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ]) - expand_scale = paddle.expand(scale, [bbox_num[i], 8]) - origin_shape_list.append(expand_shape) - scale_factor_list.append(expand_scale) - - origin_shape_list = paddle.concat(origin_shape_list) - scale_factor_list = paddle.concat(scale_factor_list) - - # bboxes: [N, 10], label, score, bbox - pred_label_score = bboxes[:, 0:2] - pred_bbox = bboxes[:, 2:] - - # rescale bbox to original image - pred_bbox = pred_bbox.reshape([-1, 8]) - scaled_bbox = pred_bbox / scale_factor_list - origin_h = origin_shape_list[:, 0] - origin_w = origin_shape_list[:, 1] - - bboxes = scaled_bbox - zeros = paddle.zeros_like(origin_h) - x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) - y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) - x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) - y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) - x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) - y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) - x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) - y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) - pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) - pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) - return pred_result - - def get_bboxes_single(self, cls_score_list, bbox_pred_list): - mlvl_bboxes = [] - mlvl_scores = [] - - for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list): - if self.use_sigmoid_cls: - scores = F.sigmoid(cls_score) - else: - scores = F.softmax(cls_score, axis=-1) - - if scores.shape[0] > self.nms_pre: - # Get maximum scores for foreground classes. 
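# [reviewer note] With sigmoid classification every output channel is a
# foreground class, so the per-anchor maximum is taken over all channels;
# with softmax the last channel is the implicit background, which is why it
# is excluded (scores[:, :-1]) before the top-`nms_pre` filtering below.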
- if self.use_sigmoid_cls: - max_scores = paddle.max(scores, axis=1) - else: - max_scores = paddle.max(scores[:, :-1], axis=1) - - topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre) - bbox_pred = paddle.gather(bbox_pred, topk_inds) - scores = paddle.gather(scores, topk_inds) - - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(scores) - - mlvl_bboxes = paddle.concat(mlvl_bboxes) - mlvl_scores = paddle.concat(mlvl_scores) - - mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0) - mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0) - - bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores) - if bbox.shape[0] <= 0: - bbox = self.fake_bbox - bbox_num = self.fake_bbox_num - - return bbox, bbox_num - - def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0): - """ - Args: - pred: pred score - label: label - delta: delta - Returns: loss - """ - assert pred.shape == label.shape and label.numel() > 0 - assert delta > 0 - diff = paddle.abs(pred - label) - loss = paddle.where(diff < delta, 0.5 * diff * diff / delta, - diff - 0.5 * delta) - return loss - - def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'): - (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, - pos_inds, neg_inds) = fam_target - fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out - - fam_cls_losses = [] - fam_bbox_losses = [] - st_idx = 0 - num_total_samples = len(pos_inds) + len( - neg_inds) if self.sampling else len(pos_inds) - num_total_samples = max(1, num_total_samples) - - for idx, feat_anchor_num in enumerate(num_anchors_list): - # step1: get data - feat_labels = labels[st_idx:st_idx + feat_anchor_num] - feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] - - feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] - feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] - - # step2: calc cls loss - feat_labels = feat_labels.reshape(-1) - feat_label_weights = feat_label_weights.reshape(-1) - - fam_cls_score = fam_cls_branch_list[idx] - fam_cls_score = paddle.squeeze(fam_cls_score, axis=0) - fam_cls_score1 = fam_cls_score - - feat_labels = paddle.to_tensor(feat_labels) - feat_labels_one_hot = paddle.nn.functional.one_hot( - feat_labels, self.cls_out_channels + 1) - feat_labels_one_hot = feat_labels_one_hot[:, 1:] - feat_labels_one_hot.stop_gradient = True - - num_total_samples = paddle.to_tensor( - num_total_samples, dtype='float32', stop_gradient=True) - - fam_cls = F.sigmoid_focal_loss( - fam_cls_score1, - feat_labels_one_hot, - normalizer=num_total_samples, - reduction='none') - - feat_label_weights = feat_label_weights.reshape( - feat_label_weights.shape[0], 1) - feat_label_weights = np.repeat( - feat_label_weights, self.cls_out_channels, axis=1) - feat_label_weights = paddle.to_tensor( - feat_label_weights, stop_gradient=True) - - fam_cls = fam_cls * feat_label_weights - fam_cls_total = paddle.sum(fam_cls) - fam_cls_losses.append(fam_cls_total) - - # step3: regression loss - feat_bbox_targets = paddle.to_tensor( - feat_bbox_targets, dtype='float32', stop_gradient=True) - feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) - - fam_bbox_pred = fam_reg_branch_list[idx] - fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0) - fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5]) - fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets) - loss_weight = paddle.to_tensor( - self.reg_loss_weight, dtype='float32', stop_gradient=True) - fam_bbox 
= paddle.multiply(fam_bbox, loss_weight) - feat_bbox_weights = paddle.to_tensor( - feat_bbox_weights, stop_gradient=True) - - fam_bbox = fam_bbox * feat_bbox_weights - fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples - fam_bbox_losses.append(fam_bbox_total) - st_idx += feat_anchor_num - - fam_cls_loss = paddle.add_n(fam_cls_losses) - fam_cls_loss_weight = paddle.to_tensor( - self.cls_loss_weight[0], dtype='float32', stop_gradient=True) - fam_cls_loss = fam_cls_loss * fam_cls_loss_weight - fam_reg_loss = paddle.add_n(fam_bbox_losses) - return fam_cls_loss, fam_reg_loss - - def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'): - (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, - pos_inds, neg_inds) = odm_target - fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out - - odm_cls_losses = [] - odm_bbox_losses = [] - st_idx = 0 - num_total_samples = len(pos_inds) + len( - neg_inds) if self.sampling else len(pos_inds) - num_total_samples = max(1, num_total_samples) - - for idx, feat_anchor_num in enumerate(num_anchors_list): - # step1: get data - feat_labels = labels[st_idx:st_idx + feat_anchor_num] - feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] - - feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] - feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] - - # step2: calc cls loss - feat_labels = feat_labels.reshape(-1) - feat_label_weights = feat_label_weights.reshape(-1) - - odm_cls_score = odm_cls_branch_list[idx] - odm_cls_score = paddle.squeeze(odm_cls_score, axis=0) - odm_cls_score1 = odm_cls_score - - feat_labels = paddle.to_tensor(feat_labels) - feat_labels_one_hot = paddle.nn.functional.one_hot( - feat_labels, self.cls_out_channels + 1) - feat_labels_one_hot = feat_labels_one_hot[:, 1:] - feat_labels_one_hot.stop_gradient = True - - num_total_samples = paddle.to_tensor( - num_total_samples, dtype='float32', stop_gradient=True) - odm_cls = F.sigmoid_focal_loss( - odm_cls_score1, - feat_labels_one_hot, - normalizer=num_total_samples, - reduction='none') - - feat_label_weights = feat_label_weights.reshape( - feat_label_weights.shape[0], 1) - feat_label_weights = np.repeat( - feat_label_weights, self.cls_out_channels, axis=1) - feat_label_weights = paddle.to_tensor(feat_label_weights) - feat_label_weights.stop_gradient = True - - odm_cls = odm_cls * feat_label_weights - odm_cls_total = paddle.sum(odm_cls) - odm_cls_losses.append(odm_cls_total) - - # # step3: regression loss - feat_bbox_targets = paddle.to_tensor( - feat_bbox_targets, dtype='float32') - feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) - feat_bbox_targets.stop_gradient = True - - odm_bbox_pred = odm_reg_branch_list[idx] - odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0) - odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5]) - odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets) - - loss_weight = paddle.to_tensor( - self.reg_loss_weight, dtype='float32', stop_gradient=True) - odm_bbox = paddle.multiply(odm_bbox, loss_weight) - feat_bbox_weights = paddle.to_tensor( - feat_bbox_weights, stop_gradient=True) - - odm_bbox = odm_bbox * feat_bbox_weights - odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples - - odm_bbox_losses.append(odm_bbox_total) - st_idx += feat_anchor_num - - odm_cls_loss = paddle.add_n(odm_cls_losses) - odm_cls_loss_weight = paddle.to_tensor( - self.cls_loss_weight[1], dtype='float32', stop_gradient=True) - 
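# [reviewer note] cls_loss_weight[1] (1.05 by default) scales the ODM
# classification term here, while the FAM branch above used
# cls_loss_weight[0] (1.1 by default), giving the coarse FAM stage slightly
# more weight in the combined classification loss.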
odm_cls_loss = odm_cls_loss * odm_cls_loss_weight - odm_reg_loss = paddle.add_n(odm_bbox_losses) - return odm_cls_loss, odm_reg_loss - - def get_loss(self, head_outs, inputs): - fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \ - num_anchors_list, base_anchors_list, refine_anchors_list = head_outs - - # compute loss - fam_cls_loss_lst = [] - fam_reg_loss_lst = [] - odm_cls_loss_lst = [] - odm_reg_loss_lst = [] - - batch = len(inputs['gt_rbox']) - for i in range(batch): - # data_format: (xc, yc, w, h, theta) - gt_mask = inputs['pad_gt_mask'][i, :, 0] - gt_idx = paddle.nonzero(gt_mask).squeeze(-1) - gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy() - gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy() - is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy() - gt_labels = gt_labels + 1 - - anchors_per_image = np.concatenate(base_anchors_list) - - fam_cls_per_image = [t[i] for t in fam_cls_list] - fam_reg_per_image = [t[i] for t in fam_reg_list] - odm_cls_per_image = [t[i] for t in odm_cls_list] - odm_reg_per_image = [t[i] for t in odm_reg_list] - im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image, - odm_cls_per_image, odm_reg_per_image, - num_anchors_list) - # FAM - im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes, - gt_labels, is_crowd) - if im_fam_target is not None: - im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss( - im_fam_target, im_s2anet_head_out, self.reg_loss_type) - fam_cls_loss_lst.append(im_fam_cls_loss) - fam_reg_loss_lst.append(im_fam_reg_loss) - - # ODM - refine_anchors_per_image = [t[i] for t in refine_anchors_list] - refine_anchors_per_image = paddle.concat( - refine_anchors_per_image).numpy() - im_odm_target = self.anchor_assign(refine_anchors_per_image, - gt_bboxes, gt_labels, is_crowd) - - if im_odm_target is not None: - im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss( - im_odm_target, im_s2anet_head_out, self.reg_loss_type) - odm_cls_loss_lst.append(im_odm_cls_loss) - odm_reg_loss_lst.append(im_odm_reg_loss) - - fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch - fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch - odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch - odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch - loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss - - return { - 'loss': loss, - 'fam_cls_loss': fam_cls_loss, - 'fam_reg_loss': fam_reg_loss, - 'odm_cls_loss': odm_cls_loss, - 'odm_reg_loss': odm_reg_loss - } - - def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6): - """decode bbox from deltas - Args: - preds: [B, L, 5] - anchors: [1, L, 5] - return: - bboxes: [B, L, 5] - """ - preds = paddle.add(paddle.multiply(preds, self.stds), self.means) - - dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1) - max_ratio = np.abs(np.log(wh_ratio_clip)) - dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) - dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) - - rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split( - anchors, 5, axis=-1) - - gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin( - rroi_angle) + rroi_x - gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos( - rroi_angle) + rroi_y - gw = rroi_w * dw.exp() - gh = rroi_h * dh.exp() - ga = np.pi * dangle + rroi_angle - ga = (ga + np.pi / 4) % np.pi - np.pi / 4 - bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1) - return bboxes - - def rbox2poly(self, rboxes): - """ - rboxes: [x_ctr,y_ctr,w,h,angle] - to - polys: [x0,y0,x1,y1,x2,y2,x3,y3] - """ - N = 
paddle.shape(rboxes)[0] - - x_ctr = rboxes[:, 0] - y_ctr = rboxes[:, 1] - width = rboxes[:, 2] - height = rboxes[:, 3] - angle = rboxes[:, 4] - - tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5 - - normal_rects = paddle.stack( - [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0) - normal_rects = paddle.reshape(normal_rects, [2, 4, N]) - normal_rects = paddle.transpose(normal_rects, [2, 0, 1]) - - sin, cos = paddle.sin(angle), paddle.cos(angle) - # M: [N,2,2] - M = paddle.stack([cos, -sin, sin, cos], axis=0) - M = paddle.reshape(M, [2, 2, N]) - M = paddle.transpose(M, [2, 0, 1]) - - # polys: [N,8] - polys = paddle.matmul(M, normal_rects) - polys = paddle.transpose(polys, [2, 1, 0]) - polys = paddle.reshape(polys, [-1, N]) - polys = paddle.transpose(polys, [1, 0]) - - tmp = paddle.stack( - [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1) - polys = polys + tmp - return polys diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/simota_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/simota_head.py deleted file mode 100644 index 037c395..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/simota_head.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/yolox_head.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -from functools import partial -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.core.workspace import register - -from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance -from ppdet.data.transform.atss_assigner import bbox_overlaps - -from .gfl_head import GFLHead - - -@register -class OTAHead(GFLHead): - """ - OTAHead - Args: - conv_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_qfl (object): Instance of QualityFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - assigner (object): Instance of label assigner. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - n QFL setting. Default: 16. 
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'assigner', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='QualityFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - assigner='SimOTAAssigner', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - super(OTAHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.conv_feat = conv_feat - self.dgqp_module = dgqp_module - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_qfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.use_sigmoid = self.loss_qfl.use_sigmoid - - self.assigner = assigner - - def _get_target_single(self, flatten_cls_pred, flatten_center_and_stride, - flatten_bbox, gt_bboxes, gt_labels): - """Compute targets for priors in a single image. - """ - pos_num, label, label_weight, bbox_target = self.assigner( - F.sigmoid(flatten_cls_pred), flatten_center_and_stride, - flatten_bbox, gt_bboxes, gt_labels) - - return (pos_num, label, label_weight, bbox_target) - - def get_loss(self, head_outs, gt_meta): - cls_scores, bbox_preds = head_outs - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores - ] - num_imgs = gt_meta['im_id'].shape[0] - featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] - for featmap in cls_scores] - - decode_bbox_preds = [] - center_and_strides = [] - for featmap_size, stride, bbox_pred in zip(featmap_sizes, - self.fpn_stride, bbox_preds): - - # center in origin image - yy, xx = self.get_single_level_center_point(featmap_size, stride, - self.cell_offset) - - center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile( - [num_imgs, 1, 1]) - center_and_strides.append(center_and_stride) - center_in_feature = center_and_stride.reshape( - [-1, 4])[:, :-2] / stride - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, 4 * (self.reg_max + 1)]) - pred_distances = self.distribution_project(bbox_pred) - decode_bbox_pred_wo_stride = distance2bbox( - center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) - decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) - - flatten_cls_preds = [ - cls_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, self.cls_out_channels]) - for cls_pred in cls_scores - ] - flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) - flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) - flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) - - gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class'] - pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] - for flatten_cls_pred,flatten_center_and_stride,flatten_bbox,gt_box, gt_label \ - in zip(flatten_cls_preds.detach(),flatten_center_and_strides.detach(), \ - flatten_bboxes.detach(),gt_boxes, gt_labels): - pos_num, label, label_weight, bbox_target = self._get_target_single( - flatten_cls_pred, 
flatten_center_and_stride, flatten_bbox, - gt_box, gt_label) - pos_num_l.append(pos_num) - label_l.append(label) - label_weight_l.append(label_weight) - bbox_target_l.append(bbox_target) - - labels = paddle.to_tensor(np.stack(label_l, axis=0)) - label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) - bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) - - center_and_strides_list = self._images_to_levels( - flatten_center_and_strides, num_level_anchors) - labels_list = self._images_to_levels(labels, num_level_anchors) - label_weights_list = self._images_to_levels(label_weights, - num_level_anchors) - bbox_targets_list = self._images_to_levels(bbox_targets, - num_level_anchors) - num_total_pos = sum(pos_num_l) - try: - paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1.) - except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] - for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( - cls_scores, bbox_preds, center_and_strides_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride): - center_and_strides = center_and_strides.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - label_weights = label_weights.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - score = np.zeros(labels.shape) - - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_centers = paddle.gather( - center_and_strides[:, :-2], pos_inds, axis=0) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - score[pos_inds.numpy()] = bbox_iou - - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - weight_targets = paddle.to_tensor([0], dtype='float32') - - # qfl loss - score = paddle.to_tensor(score) - loss_qfl = self.loss_qfl( - cls_score, (labels, score), - weight=label_weights, - avg_factor=num_total_pos) - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_qfl_list.append(loss_qfl) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) - try: - paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) - 
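# [reviewer note] avg_factor accumulates the summed sigmoid confidence of
# positive anchors and later normalizes the bbox/DFL losses. In multi-card
# training it is all-reduced and averaged over the world size; on a single
# card the collective call raises and the except branch below falls back to
# clamping the value to at least 1.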
avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - if avg_factor <= 0: - loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_qfl = sum(loss_qfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - - loss_states = dict( - loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states - - -@register -class OTAVFLHead(OTAHead): - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'assigner', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='VarifocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - assigner='SimOTAAssigner', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - super(OTAVFLHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.conv_feat = conv_feat - self.dgqp_module = dgqp_module - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_vfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.use_sigmoid = self.loss_vfl.use_sigmoid - - self.assigner = assigner - - def get_loss(self, head_outs, gt_meta): - cls_scores, bbox_preds = head_outs - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores - ] - num_imgs = gt_meta['im_id'].shape[0] - featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] - for featmap in cls_scores] - - decode_bbox_preds = [] - center_and_strides = [] - for featmap_size, stride, bbox_pred in zip(featmap_sizes, - self.fpn_stride, bbox_preds): - # center in origin image - yy, xx = self.get_single_level_center_point(featmap_size, stride, - self.cell_offset) - strides = paddle.full((len(xx), ), stride) - center_and_stride = paddle.stack([xx, yy, strides, strides], - -1).tile([num_imgs, 1, 1]) - center_and_strides.append(center_and_stride) - center_in_feature = center_and_stride.reshape( - [-1, 4])[:, :-2] / stride - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, 4 * (self.reg_max + 1)]) - pred_distances = self.distribution_project(bbox_pred) - decode_bbox_pred_wo_stride = distance2bbox( - center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) - decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) - - flatten_cls_preds = [ - cls_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, self.cls_out_channels]) - for cls_pred in cls_scores - ] - flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) - flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) - flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) - - gt_boxes, gt_labels = 
gt_meta['gt_bbox'], gt_meta['gt_class'] - pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] - for flatten_cls_pred, flatten_center_and_stride, flatten_bbox,gt_box,gt_label \ - in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(), \ - flatten_bboxes.detach(),gt_boxes,gt_labels): - pos_num, label, label_weight, bbox_target = self._get_target_single( - flatten_cls_pred, flatten_center_and_stride, flatten_bbox, - gt_box, gt_label) - pos_num_l.append(pos_num) - label_l.append(label) - label_weight_l.append(label_weight) - bbox_target_l.append(bbox_target) - - labels = paddle.to_tensor(np.stack(label_l, axis=0)) - label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) - bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) - - center_and_strides_list = self._images_to_levels( - flatten_center_and_strides, num_level_anchors) - labels_list = self._images_to_levels(labels, num_level_anchors) - label_weights_list = self._images_to_levels(label_weights, - num_level_anchors) - bbox_targets_list = self._images_to_levels(bbox_targets, - num_level_anchors) - num_total_pos = sum(pos_num_l) - try: - paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1.) - except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], [] - for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( - cls_scores, bbox_preds, center_and_strides_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride): - center_and_strides = center_and_strides.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - # vfl - vfl_score = np.zeros(cls_score.shape) - - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_centers = paddle.gather( - center_and_strides[:, :-2], pos_inds, axis=0) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - - # vfl - pos_labels = paddle.gather(labels, pos_inds, axis=0) - vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou - - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - 
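# [reviewer note] With no positive anchors in the image, the losses are
# built as `bbox_pred.sum() * 0` rather than a detached zero constant; this
# is a common trick to keep the loss connected to the graph so every head
# parameter still receives a (zero) gradient in data-parallel training.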
weight_targets = paddle.to_tensor([0], dtype='float32') - - # vfl loss - num_pos_avg_per_gpu = num_total_pos - vfl_score = paddle.to_tensor(vfl_score) - loss_vfl = self.loss_vfl( - cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu) - - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_vfl_list.append(loss_vfl) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) - try: - paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - if avg_factor <= 0: - loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_vfl = sum(loss_vfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - - loss_states = dict( - loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/solov2_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/solov2_head.py deleted file mode 100644 index 0fd0f61..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/solov2_head.py +++ /dev/null @@ -1,554 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS, DropBlock -from ppdet.core.workspace import register - -from six.moves import zip -import numpy as np - -__all__ = ['SOLOv2Head'] - - -@register -class SOLOv2MaskHead(nn.Layer): - """ - MaskHead of SOLOv2. - The code of this function is based on: - https://github.com/WXinlong/SOLO/blob/master/mmdet/models/mask_heads/mask_feat_head.py - - Args: - in_channels (int): The channel number of input Tensor. - out_channels (int): The channel number of output Tensor. - start_level (int): The position where the input starts. - end_level (int): The position where the input ends. - use_dcn_in_tower (bool): Whether to use dcn in tower or not. 
- """ - __shared__ = ['norm_type'] - - def __init__(self, - in_channels=256, - mid_channels=128, - out_channels=256, - start_level=0, - end_level=3, - use_dcn_in_tower=False, - norm_type='gn'): - super(SOLOv2MaskHead, self).__init__() - assert start_level >= 0 and end_level >= start_level - self.in_channels = in_channels - self.out_channels = out_channels - self.mid_channels = mid_channels - self.use_dcn_in_tower = use_dcn_in_tower - self.range_level = end_level - start_level + 1 - self.use_dcn = True if self.use_dcn_in_tower else False - self.convs_all_levels = [] - self.norm_type = norm_type - for i in range(start_level, end_level + 1): - conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i) - conv_pre_feat = nn.Sequential() - if i == start_level: - conv_pre_feat.add_sublayer( - conv_feat_name + '.conv' + str(i), - ConvNormLayer( - ch_in=self.in_channels, - ch_out=self.mid_channels, - filter_size=3, - stride=1, - use_dcn=self.use_dcn, - norm_type=self.norm_type)) - self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) - self.convs_all_levels.append(conv_pre_feat) - else: - for j in range(i): - ch_in = 0 - if j == 0: - ch_in = self.in_channels + 2 if i == end_level else self.in_channels - else: - ch_in = self.mid_channels - conv_pre_feat.add_sublayer( - conv_feat_name + '.conv' + str(j), - ConvNormLayer( - ch_in=ch_in, - ch_out=self.mid_channels, - filter_size=3, - stride=1, - use_dcn=self.use_dcn, - norm_type=self.norm_type)) - conv_pre_feat.add_sublayer( - conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU()) - conv_pre_feat.add_sublayer( - 'upsample' + str(i) + str(j), - nn.Upsample( - scale_factor=2, mode='bilinear')) - self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) - self.convs_all_levels.append(conv_pre_feat) - - conv_pred_name = 'mask_feat_head.conv_pred.0' - self.conv_pred = self.add_sublayer( - conv_pred_name, - ConvNormLayer( - ch_in=self.mid_channels, - ch_out=self.out_channels, - filter_size=1, - stride=1, - use_dcn=self.use_dcn, - norm_type=self.norm_type)) - - def forward(self, inputs): - """ - Get SOLOv2MaskHead output. - - Args: - inputs(list[Tensor]): feature map from each necks with shape of [N, C, H, W] - Returns: - ins_pred(Tensor): Output of SOLOv2MaskHead head - """ - feat_all_level = F.relu(self.convs_all_levels[0](inputs[0])) - for i in range(1, self.range_level): - input_p = inputs[i] - if i == (self.range_level - 1): - input_feat = input_p - x_range = paddle.linspace( - -1, 1, paddle.shape(input_feat)[-1], dtype='float32') - y_range = paddle.linspace( - -1, 1, paddle.shape(input_feat)[-2], dtype='float32') - y, x = paddle.meshgrid([y_range, x_range]) - x = paddle.unsqueeze(x, [0, 1]) - y = paddle.unsqueeze(y, [0, 1]) - y = paddle.expand( - y, shape=[paddle.shape(input_feat)[0], 1, -1, -1]) - x = paddle.expand( - x, shape=[paddle.shape(input_feat)[0], 1, -1, -1]) - coord_feat = paddle.concat([x, y], axis=1) - input_p = paddle.concat([input_p, coord_feat], axis=1) - feat_all_level = paddle.add(feat_all_level, - self.convs_all_levels[i](input_p)) - ins_pred = F.relu(self.conv_pred(feat_all_level)) - - return ins_pred - - -@register -class SOLOv2Head(nn.Layer): - """ - Head block for SOLOv2 network - - Args: - num_classes (int): Number of output classes. - in_channels (int): Number of input channels. - seg_feat_channels (int): Num_filters of kernel & categroy branch convolution operation. - stacked_convs (int): Times of convolution operation. - num_grids (list[int]): List of feature map grids size. 
- kernel_out_channels (int): Number of output channels in kernel branch. - dcn_v2_stages (list): Which stage use dcn v2 in tower. It is between [0, stacked_convs). - segm_strides (list[int]): List of segmentation area stride. - solov2_loss (object): SOLOv2Loss instance. - score_threshold (float): Threshold of categroy score. - mask_nms (object): MaskMatrixNMS instance. - """ - __inject__ = ['solov2_loss', 'mask_nms'] - __shared__ = ['norm_type', 'num_classes'] - - def __init__(self, - num_classes=80, - in_channels=256, - seg_feat_channels=256, - stacked_convs=4, - num_grids=[40, 36, 24, 16, 12], - kernel_out_channels=256, - dcn_v2_stages=[], - segm_strides=[8, 8, 16, 32, 32], - solov2_loss=None, - score_threshold=0.1, - mask_threshold=0.5, - mask_nms=None, - norm_type='gn', - drop_block=False): - super(SOLOv2Head, self).__init__() - self.num_classes = num_classes - self.in_channels = in_channels - self.seg_num_grids = num_grids - self.cate_out_channels = self.num_classes - self.seg_feat_channels = seg_feat_channels - self.stacked_convs = stacked_convs - self.kernel_out_channels = kernel_out_channels - self.dcn_v2_stages = dcn_v2_stages - self.segm_strides = segm_strides - self.solov2_loss = solov2_loss - self.mask_nms = mask_nms - self.score_threshold = score_threshold - self.mask_threshold = mask_threshold - self.norm_type = norm_type - self.drop_block = drop_block - - self.kernel_pred_convs = [] - self.cate_pred_convs = [] - for i in range(self.stacked_convs): - use_dcn = True if i in self.dcn_v2_stages else False - ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels - kernel_conv = self.add_sublayer( - 'bbox_head.kernel_convs.' + str(i), - ConvNormLayer( - ch_in=ch_in, - ch_out=self.seg_feat_channels, - filter_size=3, - stride=1, - use_dcn=use_dcn, - norm_type=self.norm_type)) - self.kernel_pred_convs.append(kernel_conv) - ch_in = self.in_channels if i == 0 else self.seg_feat_channels - cate_conv = self.add_sublayer( - 'bbox_head.cate_convs.' 
+ str(i), - ConvNormLayer( - ch_in=ch_in, - ch_out=self.seg_feat_channels, - filter_size=3, - stride=1, - use_dcn=use_dcn, - norm_type=self.norm_type)) - self.cate_pred_convs.append(cate_conv) - - self.solo_kernel = self.add_sublayer( - 'bbox_head.solo_kernel', - nn.Conv2D( - self.seg_feat_channels, - self.kernel_out_channels, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=True)) - self.solo_cate = self.add_sublayer( - 'bbox_head.solo_cate', - nn.Conv2D( - self.seg_feat_channels, - self.cate_out_channels, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant( - value=float(-np.log((1 - 0.01) / 0.01)))))) - - if self.drop_block and self.training: - self.drop_block_fun = DropBlock( - block_size=3, keep_prob=0.9, name='solo_cate.dropblock') - - def _points_nms(self, heat, kernel_size=2): - hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1) - keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32') - return heat * keep - - def _split_feats(self, feats): - return (F.interpolate( - feats[0], - scale_factor=0.5, - align_corners=False, - align_mode=0, - mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate( - feats[4], - size=paddle.shape(feats[3])[-2:], - mode='bilinear', - align_corners=False, - align_mode=0)) - - def forward(self, input): - """ - Get SOLOv2 head output - - Args: - input (list): List of Tensors, output of backbone or neck stages - Returns: - cate_pred_list (list): Tensors of each category branch layer - kernel_pred_list (list): Tensors of each kernel branch layer - """ - feats = self._split_feats(input) - cate_pred_list = [] - kernel_pred_list = [] - for idx in range(len(self.seg_num_grids)): - cate_pred, kernel_pred = self._get_output_single(feats[idx], idx) - cate_pred_list.append(cate_pred) - kernel_pred_list.append(kernel_pred) - - return cate_pred_list, kernel_pred_list - - def _get_output_single(self, input, idx): - ins_kernel_feat = input - # CoordConv - x_range = paddle.linspace( - -1, 1, paddle.shape(ins_kernel_feat)[-1], dtype='float32') - y_range = paddle.linspace( - -1, 1, paddle.shape(ins_kernel_feat)[-2], dtype='float32') - y, x = paddle.meshgrid([y_range, x_range]) - x = paddle.unsqueeze(x, [0, 1]) - y = paddle.unsqueeze(y, [0, 1]) - y = paddle.expand( - y, shape=[paddle.shape(ins_kernel_feat)[0], 1, -1, -1]) - x = paddle.expand( - x, shape=[paddle.shape(ins_kernel_feat)[0], 1, -1, -1]) - coord_feat = paddle.concat([x, y], axis=1) - ins_kernel_feat = paddle.concat([ins_kernel_feat, coord_feat], axis=1) - - # kernel branch - kernel_feat = ins_kernel_feat - seg_num_grid = self.seg_num_grids[idx] - kernel_feat = F.interpolate( - kernel_feat, - size=[seg_num_grid, seg_num_grid], - mode='bilinear', - align_corners=False, - align_mode=0) - cate_feat = kernel_feat[:, :-2, :, :] - - for kernel_layer in self.kernel_pred_convs: - kernel_feat = F.relu(kernel_layer(kernel_feat)) - if self.drop_block and self.training: - kernel_feat = self.drop_block_fun(kernel_feat) - kernel_pred = self.solo_kernel(kernel_feat) - # cate branch - for cate_layer in self.cate_pred_convs: - cate_feat = F.relu(cate_layer(cate_feat)) - if self.drop_block and self.training: - cate_feat = self.drop_block_fun(cate_feat) - cate_pred = self.solo_cate(cate_feat) - - if not self.training: - cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2) - cate_pred = paddle.transpose(cate_pred, [0, 2, 
3, 1]) - return cate_pred, kernel_pred - - def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels, - cate_labels, grid_order_list, fg_num): - """ - Get loss of network of SOLOv2. - - Args: - cate_preds (list): Tensor list of categroy branch output. - kernel_preds (list): Tensor list of kernel branch output. - ins_pred (list): Tensor list of instance branch output. - ins_labels (list): List of instance labels pre batch. - cate_labels (list): List of categroy labels pre batch. - grid_order_list (list): List of index in pre grid. - fg_num (int): Number of positive samples in a mini-batch. - Returns: - loss_ins (Tensor): The instance loss Tensor of SOLOv2 network. - loss_cate (Tensor): The category loss Tensor of SOLOv2 network. - """ - batch_size = paddle.shape(grid_order_list[0])[0] - ins_pred_list = [] - for kernel_preds_level, grid_orders_level in zip(kernel_preds, - grid_order_list): - if grid_orders_level.shape[1] == 0: - ins_pred_list.append(None) - continue - grid_orders_level = paddle.reshape(grid_orders_level, [-1]) - reshape_pred = paddle.reshape( - kernel_preds_level, - shape=(paddle.shape(kernel_preds_level)[0], - paddle.shape(kernel_preds_level)[1], -1)) - reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1]) - reshape_pred = paddle.reshape( - reshape_pred, shape=(-1, paddle.shape(reshape_pred)[2])) - gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level) - gathered_pred = paddle.reshape( - gathered_pred, - shape=[batch_size, -1, paddle.shape(gathered_pred)[1]]) - cur_ins_pred = ins_pred - cur_ins_pred = paddle.reshape( - cur_ins_pred, - shape=(paddle.shape(cur_ins_pred)[0], - paddle.shape(cur_ins_pred)[1], -1)) - ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred) - cur_ins_pred = paddle.reshape( - ins_pred_conv, - shape=(-1, paddle.shape(ins_pred)[-2], - paddle.shape(ins_pred)[-1])) - ins_pred_list.append(cur_ins_pred) - - num_ins = paddle.sum(fg_num) - cate_preds = [ - paddle.reshape( - paddle.transpose(cate_pred, [0, 2, 3, 1]), - shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds - ] - flatten_cate_preds = paddle.concat(cate_preds) - new_cate_labels = [] - for cate_label in cate_labels: - new_cate_labels.append(paddle.reshape(cate_label, shape=[-1])) - cate_labels = paddle.concat(new_cate_labels) - - loss_ins, loss_cate = self.solov2_loss( - ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins) - - return {'loss_ins': loss_ins, 'loss_cate': loss_cate} - - def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape, - scale_factor): - """ - Get prediction result of SOLOv2 network - - Args: - cate_preds (list): List of Variables, output of categroy branch. - kernel_preds (list): List of Variables, output of kernel branch. - seg_pred (list): List of Variables, output of mask head stages. - im_shape (Variables): [h, w] for input images. - scale_factor (Variables): [scale, scale] for input images. - Returns: - seg_masks (Tensor): The prediction segmentation. - cate_labels (Tensor): The prediction categroy label of each segmentation. - seg_masks (Tensor): The prediction score of each segmentation. 
- """ - num_levels = len(cate_preds) - featmap_size = paddle.shape(seg_pred)[-2:] - seg_masks_list = [] - cate_labels_list = [] - cate_scores_list = [] - cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds] - kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds] - # Currently only supports batch size == 1 - for idx in range(1): - cate_pred_list = [ - paddle.reshape( - cate_preds[i][idx], shape=(-1, self.cate_out_channels)) - for i in range(num_levels) - ] - seg_pred_list = seg_pred - kernel_pred_list = [ - paddle.reshape( - paddle.transpose(kernel_preds[i][idx], [1, 2, 0]), - shape=(-1, self.kernel_out_channels)) - for i in range(num_levels) - ] - cate_pred_list = paddle.concat(cate_pred_list, axis=0) - kernel_pred_list = paddle.concat(kernel_pred_list, axis=0) - - seg_masks, cate_labels, cate_scores = self.get_seg_single( - cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size, - im_shape[idx], scale_factor[idx][0]) - bbox_num = paddle.shape(cate_labels)[0:1] - return seg_masks, cate_labels, cate_scores, bbox_num - - def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size, - im_shape, scale_factor): - """ - The code of this function is based on: - https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L385 - """ - h = paddle.cast(im_shape[0], 'int32') - w = paddle.cast(im_shape[1], 'int32') - upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4] - - y = paddle.zeros(shape=paddle.shape(cate_preds), dtype='float32') - inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y) - inds = paddle.nonzero(inds) - cate_preds = paddle.reshape(cate_preds, shape=[-1]) - # Prevent empty and increase fake data - ind_a = paddle.cast(paddle.shape(kernel_preds)[0:1], 'int64') - ind_b = paddle.zeros(shape=[1], dtype='int64') - inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0) - inds = paddle.concat([inds, inds_end]) - kernel_preds_end = paddle.ones( - shape=[1, self.kernel_out_channels], dtype='float32') - kernel_preds = paddle.concat([kernel_preds, kernel_preds_end]) - cate_preds = paddle.concat( - [cate_preds, paddle.zeros( - shape=[1], dtype='float32')]) - - # cate_labels & kernel_preds - cate_labels = inds[:, 1] - kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0]) - cate_score_idx = paddle.add(inds[:, 0] * self.cate_out_channels, - cate_labels) - cate_scores = paddle.gather(cate_preds, index=cate_score_idx) - - size_trans = np.power(self.seg_num_grids, 2) - strides = [] - for _ind in range(len(self.segm_strides)): - strides.append( - paddle.full( - shape=[int(size_trans[_ind])], - fill_value=self.segm_strides[_ind], - dtype="int32")) - strides = paddle.concat(strides) - strides = paddle.concat( - [strides, paddle.zeros( - shape=[1], dtype='int32')]) - strides = paddle.gather(strides, index=inds[:, 0]) - - # mask encoding. 
- kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3]) - seg_preds = F.conv2d(seg_preds, kernel_preds) - seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0])) - seg_masks = seg_preds > self.mask_threshold - seg_masks = paddle.cast(seg_masks, 'float32') - sum_masks = paddle.sum(seg_masks, axis=[1, 2]) - - y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32') - keep = paddle.where(sum_masks > strides, sum_masks, y) - keep = paddle.nonzero(keep) - keep = paddle.squeeze(keep, axis=[1]) - # Prevent empty and increase fake data - keep_other = paddle.concat( - [keep, paddle.cast(paddle.shape(sum_masks)[0:1] - 1, 'int64')]) - keep_scores = paddle.concat( - [keep, paddle.cast(paddle.shape(sum_masks)[0:1], 'int64')]) - cate_scores_end = paddle.zeros(shape=[1], dtype='float32') - cate_scores = paddle.concat([cate_scores, cate_scores_end]) - - seg_masks = paddle.gather(seg_masks, index=keep_other) - seg_preds = paddle.gather(seg_preds, index=keep_other) - sum_masks = paddle.gather(sum_masks, index=keep_other) - cate_labels = paddle.gather(cate_labels, index=keep_other) - cate_scores = paddle.gather(cate_scores, index=keep_scores) - - # mask scoring. - seg_mul = paddle.cast(seg_preds * seg_masks, 'float32') - seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks - cate_scores *= seg_scores - # Matrix NMS - seg_preds, cate_scores, cate_labels = self.mask_nms( - seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks) - ori_shape = im_shape[:2] / scale_factor + 0.5 - ori_shape = paddle.cast(ori_shape, 'int32') - seg_preds = F.interpolate( - paddle.unsqueeze(seg_preds, 0), - size=upsampled_size_out, - mode='bilinear', - align_corners=False, - align_mode=0) - seg_preds = paddle.slice( - seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w]) - seg_masks = paddle.squeeze( - F.interpolate( - seg_preds, - size=ori_shape[:2], - mode='bilinear', - align_corners=False, - align_mode=0), - axis=[0]) - seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'uint8') - return seg_masks, cate_labels, cate_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/sparse_roi_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/sparse_roi_head.py deleted file mode 100644 index bdc76a9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/sparse_roi_head.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
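The core of SOLOv2's get_seg_single above is the dynamic-kernel step: every grid cell that survives score filtering contributes a kernel vector, which is reshaped into a 1x1 convolution filter (paddle.unsqueeze(kernel_preds, [2, 3])) and run over the shared mask feature map with F.conv2d. A minimal, self-contained sketch of just that step; the shapes and the 0.5 threshold are illustrative, not the exact tensors used above:

    import paddle
    import paddle.nn.functional as F

    E, H, W = 128, 56, 56                      # mask-feature channels and size
    num_inst = 5                               # grid cells kept after filtering
    mask_feat = paddle.rand([1, E, H, W])      # shared mask-branch output
    kernels = paddle.rand([num_inst, E])       # gathered kernel predictions

    weight = kernels.reshape([num_inst, E, 1, 1])    # [out, in, kH, kW]
    seg_logits = F.conv2d(mask_feat, weight)         # [1, num_inst, H, W]
    seg_probs = F.sigmoid(seg_logits).squeeze(0)     # one soft mask per instance
    seg_masks = paddle.cast(seg_probs > 0.5, 'float32')  # 0.5 stands in for mask_threshold

The fake-data concatenations in the real function exist only to keep tensors non-empty under static-graph export; the sketch omits them, along with Matrix NMS and the final resize back to the original image shape.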
- -# This code is referenced from: https://github.com/open-mmlab/mmdetection - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy - -import paddle -from paddle import nn - -from ppdet.core.workspace import register -from ppdet.modeling import initializer as init -from .roi_extractor import RoIAlign -from ..bbox_utils import delta2bbox_v2 -from ..cls_utils import _get_class_default_kwargs -from ..layers import MultiHeadAttention - -__all__ = ['SparseRoIHead', 'DIIHead', 'DynamicMaskHead'] - - -class DynamicConv(nn.Layer): - def __init__(self, - in_channels=256, - feature_channels=64, - out_channels=None, - roi_resolution=7, - with_proj=True): - super(DynamicConv, self).__init__() - - self.in_channels = in_channels - self.feature_channels = feature_channels - self.out_channels = out_channels if out_channels else in_channels - - self.num_params_in = self.in_channels * self.feature_channels - self.num_params_out = self.out_channels * self.feature_channels - self.dynamic_layer = nn.Linear(self.in_channels, - self.num_params_in + self.num_params_out) - - self.norm_in = nn.LayerNorm(self.feature_channels) - self.norm_out = nn.LayerNorm(self.out_channels) - - self.activation = nn.ReLU() - - self.with_proj = with_proj - if self.with_proj: - num_output = self.out_channels * roi_resolution**2 - self.fc_layer = nn.Linear(num_output, self.out_channels) - self.fc_norm = nn.LayerNorm(self.out_channels) - - def forward(self, param_feature, input_feature): - input_feature = input_feature.flatten(2).transpose([2, 0, 1]) - input_feature = input_feature.transpose([1, 0, 2]) - - parameters = self.dynamic_layer(param_feature) - - param_in = parameters[:, :self.num_params_in].reshape( - [-1, self.in_channels, self.feature_channels]) - param_out = parameters[:, -self.num_params_out:].reshape( - [-1, self.feature_channels, self.out_channels]) - - features = paddle.bmm(input_feature, param_in) - features = self.norm_in(features) - features = self.activation(features) - - features = paddle.bmm(features, param_out) - features = self.norm_out(features) - features = self.activation(features) - - if self.with_proj: - features = features.flatten(1) - features = self.fc_layer(features) - features = self.fc_norm(features) - features = self.activation(features) - - return features - - -class FFN(nn.Layer): - def __init__(self, - embed_dims=256, - feedforward_channels=2048, - num_fcs=2, - ffn_drop=0.0, - add_identity=True): - super(FFN, self).__init__() - - layers = [] - in_channels = embed_dims - for _ in range(num_fcs - 1): - layers.append( - nn.Sequential( - nn.Linear(in_channels, feedforward_channels), - nn.ReLU(), nn.Dropout(ffn_drop))) - in_channels = feedforward_channels - layers.append(nn.Linear(feedforward_channels, embed_dims)) - layers.append(nn.Dropout(ffn_drop)) - self.layers = nn.Sequential(*layers) - - self.add_identity = add_identity - - def forward(self, x): - identity = x - out = self.layers(x) - if not self.add_identity: - return out - else: - return out + identity - - -@register -class DynamicMaskHead(nn.Layer): - __shared__ = ['num_classes', 'proposal_embedding_dim', 'norm_type'] - - def __init__(self, - num_classes=80, - proposal_embedding_dim=256, - dynamic_feature_channels=64, - roi_resolution=14, - num_convs=4, - conv_kernel_size=3, - conv_channels=256, - upsample_method='deconv', - upsample_scale_factor=2, - norm_type='bn'): - super(DynamicMaskHead, self).__init__() - - self.d_model = proposal_embedding_dim - - 
self.instance_interactive_conv = DynamicConv( - self.d_model, - dynamic_feature_channels, - roi_resolution=roi_resolution, - with_proj=False) - - self.convs = nn.LayerList() - for i in range(num_convs): - self.convs.append( - nn.Sequential( - nn.Conv2D( - self.d_model if i == 0 else conv_channels, - conv_channels, - conv_kernel_size, - padding='same', - bias_attr=False), - nn.BatchNorm2D(conv_channels), - nn.ReLU())) - if norm_type == 'sync_bn': - self.convs = nn.SyncBatchNorm.convert_sync_batchnorm(self.convs) - - self.upsample_method = upsample_method - if upsample_method is None: - self.upsample = None - elif upsample_method == 'deconv': - self.upsample = nn.Conv2DTranspose( - conv_channels if num_convs > 0 else self.d_model, - conv_channels, - upsample_scale_factor, - stride=upsample_scale_factor) - self.relu = nn.ReLU() - else: - self.upsample = nn.Upsample(None, upsample_scale_factor) - - cls_in_channels = conv_channels if num_convs > 0 else self.d_model - cls_in_channels = conv_channels if upsample_method == 'deconv' else cls_in_channels - self.conv_cls = nn.Conv2D(cls_in_channels, num_classes, 1) - - self._init_weights() - - def _init_weights(self): - for p in self.parameters(): - if p.dim() > 1: - init.xavier_uniform_(p) - - init.constant_(self.conv_cls.bias, 0.) - - def forward(self, roi_features, attn_features): - attn_features = attn_features.reshape([-1, self.d_model]) - attn_features_iic = self.instance_interactive_conv(attn_features, - roi_features) - - x = attn_features_iic.transpose([0, 2, 1]).reshape(roi_features.shape) - - for conv in self.convs: - x = conv(x) - if self.upsample is not None: - x = self.upsample(x) - if self.upsample_method == 'deconv': - x = self.relu(x) - mask_pred = self.conv_cls(x) - return mask_pred - - -@register -class DIIHead(nn.Layer): - __shared__ = ['num_classes', 'proposal_embedding_dim'] - - def __init__(self, - num_classes=80, - proposal_embedding_dim=256, - feedforward_channels=2048, - dynamic_feature_channels=64, - roi_resolution=7, - num_attn_heads=8, - dropout=0.0, - num_ffn_fcs=2, - num_cls_fcs=1, - num_reg_fcs=3): - super(DIIHead, self).__init__() - - self.num_classes = num_classes - self.d_model = proposal_embedding_dim - - self.attention = MultiHeadAttention(self.d_model, num_attn_heads, - dropout) - self.attention_norm = nn.LayerNorm(self.d_model) - - self.instance_interactive_conv = DynamicConv( - self.d_model, - dynamic_feature_channels, - roi_resolution=roi_resolution, - with_proj=True) - self.instance_interactive_conv_dropout = nn.Dropout(dropout) - self.instance_interactive_conv_norm = nn.LayerNorm(self.d_model) - - self.ffn = FFN(self.d_model, feedforward_channels, num_ffn_fcs, dropout) - self.ffn_norm = nn.LayerNorm(self.d_model) - - self.cls_fcs = nn.LayerList() - for _ in range(num_cls_fcs): - self.cls_fcs.append( - nn.Linear( - self.d_model, self.d_model, bias_attr=False)) - self.cls_fcs.append(nn.LayerNorm(self.d_model)) - self.cls_fcs.append(nn.ReLU()) - self.fc_cls = nn.Linear(self.d_model, self.num_classes) - - self.reg_fcs = nn.LayerList() - for _ in range(num_reg_fcs): - self.reg_fcs.append( - nn.Linear( - self.d_model, self.d_model, bias_attr=False)) - self.reg_fcs.append(nn.LayerNorm(self.d_model)) - self.reg_fcs.append(nn.ReLU()) - self.fc_reg = nn.Linear(self.d_model, 4) - - self._init_weights() - - def _init_weights(self): - for p in self.parameters(): - if p.dim() > 1: - init.xavier_uniform_(p) - - bias_init = init.bias_init_with_prob(0.01) - init.constant_(self.fc_cls.bias, bias_init) - - def forward(self, 
roi_features, proposal_features): - N, num_proposals = proposal_features.shape[:2] - - proposal_features = proposal_features + self.attention( - proposal_features) - attn_features = self.attention_norm(proposal_features) - - proposal_features = attn_features.reshape([-1, self.d_model]) - proposal_features_iic = self.instance_interactive_conv( - proposal_features, roi_features) - proposal_features = proposal_features + self.instance_interactive_conv_dropout( - proposal_features_iic) - obj_features = self.instance_interactive_conv_norm(proposal_features) - - obj_features = self.ffn(obj_features) - obj_features = self.ffn_norm(obj_features) - - cls_feature = obj_features.clone() - reg_feature = obj_features.clone() - - for cls_layer in self.cls_fcs: - cls_feature = cls_layer(cls_feature) - class_logits = self.fc_cls(cls_feature) - for reg_layer in self.reg_fcs: - reg_feature = reg_layer(reg_feature) - bbox_deltas = self.fc_reg(reg_feature) - - class_logits = class_logits.reshape( - [N, num_proposals, self.num_classes]) - bbox_deltas = bbox_deltas.reshape([N, num_proposals, 4]) - obj_features = obj_features.reshape([N, num_proposals, self.d_model]) - - return class_logits, bbox_deltas, obj_features, attn_features - - @staticmethod - def refine_bboxes(proposal_bboxes, bbox_deltas): - pred_bboxes = delta2bbox_v2( - bbox_deltas.reshape([-1, 4]), - proposal_bboxes.reshape([-1, 4]), - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[0.5, 0.5, 1.0, 1.0], - ctr_clip=None) - return pred_bboxes.reshape(proposal_bboxes.shape) - - -@register -class SparseRoIHead(nn.Layer): - __inject__ = ['bbox_head', 'mask_head', 'loss_func'] - - def __init__(self, - num_stages=6, - bbox_roi_extractor=_get_class_default_kwargs(RoIAlign), - mask_roi_extractor=_get_class_default_kwargs(RoIAlign), - bbox_head='DIIHead', - mask_head='DynamicMaskHead', - loss_func='QueryInstLoss'): - super(SparseRoIHead, self).__init__() - - self.num_stages = num_stages - - self.bbox_roi_extractor = bbox_roi_extractor - self.mask_roi_extractor = mask_roi_extractor - if isinstance(bbox_roi_extractor, dict): - self.bbox_roi_extractor = RoIAlign(**bbox_roi_extractor) - if isinstance(mask_roi_extractor, dict): - self.mask_roi_extractor = RoIAlign(**mask_roi_extractor) - - self.bbox_heads = nn.LayerList( - [copy.deepcopy(bbox_head) for _ in range(num_stages)]) - self.mask_heads = nn.LayerList( - [copy.deepcopy(mask_head) for _ in range(num_stages)]) - - self.loss_helper = loss_func - - @classmethod - def from_config(cls, cfg, input_shape): - bbox_roi_extractor = cfg['bbox_roi_extractor'] - mask_roi_extractor = cfg['mask_roi_extractor'] - assert isinstance(bbox_roi_extractor, dict) - assert isinstance(mask_roi_extractor, dict) - - kwargs = RoIAlign.from_config(cfg, input_shape) - bbox_roi_extractor.update(kwargs) - mask_roi_extractor.update(kwargs) - - return { - 'bbox_roi_extractor': bbox_roi_extractor, - 'mask_roi_extractor': mask_roi_extractor - } - - @staticmethod - def get_roi_features(features, bboxes, roi_extractor): - rois_list = [ - bboxes[i] for i in range(len(bboxes)) if len(bboxes[i]) > 0 - ] - rois_num = paddle.to_tensor( - [len(bboxes[i]) for i in range(len(bboxes))], dtype='int32') - - pos_ids = paddle.cast(rois_num, dtype='bool') - if pos_ids.sum() != len(rois_num): - rois_num = rois_num[pos_ids] - features = [features[i][pos_ids] for i in range(len(features))] - - return roi_extractor(features, rois_list, rois_num) - - def _forward_train(self, body_feats, pro_bboxes, pro_feats, targets): - all_stage_losses = {} - for stage in 
range(self.num_stages): - bbox_head = self.bbox_heads[stage] - mask_head = self.mask_heads[stage] - - roi_feats = self.get_roi_features(body_feats, pro_bboxes, - self.bbox_roi_extractor) - class_logits, bbox_deltas, pro_feats, attn_feats = bbox_head( - roi_feats, pro_feats) - bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, - bbox_deltas) - - indices = self.loss_helper.matcher({ - 'pred_logits': class_logits.detach(), - 'pred_boxes': bbox_pred.detach() - }, targets) - avg_factor = paddle.to_tensor( - [sum(len(tgt['labels']) for tgt in targets)], dtype='float32') - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(avg_factor) - avg_factor /= paddle.distributed.get_world_size() - avg_factor = paddle.clip(avg_factor, min=1.) - - loss_classes = self.loss_helper.loss_classes(class_logits, targets, - indices, avg_factor) - if sum(len(v['labels']) for v in targets) == 0: - loss_bboxes = { - 'loss_bbox': paddle.to_tensor([0.]), - 'loss_giou': paddle.to_tensor([0.]) - } - loss_masks = {'loss_mask': paddle.to_tensor([0.])} - else: - loss_bboxes = self.loss_helper.loss_bboxes(bbox_pred, targets, - indices, avg_factor) - - pos_attn_feats = paddle.concat([ - paddle.gather( - src, src_idx, axis=0) - for src, (src_idx, _) in zip(attn_feats, indices) - ]) - pos_bbox_pred = [ - paddle.gather( - src, src_idx, axis=0) - for src, (src_idx, _) in zip(bbox_pred.detach(), indices) - ] - pos_roi_feats = self.get_roi_features(body_feats, pos_bbox_pred, - self.mask_roi_extractor) - mask_logits = mask_head(pos_roi_feats, pos_attn_feats) - loss_masks = self.loss_helper.loss_masks( - pos_bbox_pred, mask_logits, targets, indices, avg_factor) - - for loss in [loss_classes, loss_bboxes, loss_masks]: - for key in loss.keys(): - all_stage_losses[f'stage{stage}_{key}'] = loss[key] - - pro_bboxes = bbox_pred.detach() - - return all_stage_losses - - def _forward_test(self, body_feats, pro_bboxes, pro_feats): - for stage in range(self.num_stages): - roi_feats = self.get_roi_features(body_feats, pro_bboxes, - self.bbox_roi_extractor) - class_logits, bbox_deltas, pro_feats, attn_feats = self.bbox_heads[ - stage](roi_feats, pro_feats) - bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, - bbox_deltas) - - pro_bboxes = bbox_pred.detach() - - roi_feats = self.get_roi_features(body_feats, bbox_pred, - self.mask_roi_extractor) - mask_logits = self.mask_heads[stage](roi_feats, attn_feats) - - return { - 'class_logits': class_logits, - 'bbox_pred': bbox_pred, - 'mask_logits': mask_logits - } - - def forward(self, - body_features, - proposal_bboxes, - proposal_features, - targets=None): - if self.training: - return self._forward_train(body_features, proposal_bboxes, - proposal_features, targets) - else: - return self._forward_test(body_features, proposal_bboxes, - proposal_features) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/sparsercnn_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/sparsercnn_head.py deleted file mode 100644 index 0534cf4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/sparsercnn_head.py +++ /dev/null @@ -1,380 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py
-The copyright of PeizeSun/SparseR-CNN is as follows:
-MIT License [see LICENSE for details]
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import copy
-import paddle
-import paddle.nn as nn
-
-from ppdet.core.workspace import register
-from ppdet.modeling.heads.roi_extractor import RoIAlign
-from ppdet.modeling.bbox_utils import delta2bbox
-from .. import initializer as init
-
-_DEFAULT_SCALE_CLAMP = math.log(100000. / 16)
-
-
-class DynamicConv(nn.Layer):
-    def __init__(
-            self,
-            head_hidden_dim,
-            head_dim_dynamic,
-            head_num_dynamic, ):
-        super().__init__()
-
-        self.hidden_dim = head_hidden_dim
-        self.dim_dynamic = head_dim_dynamic
-        self.num_dynamic = head_num_dynamic
-        self.num_params = self.hidden_dim * self.dim_dynamic
-        self.dynamic_layer = nn.Linear(self.hidden_dim,
-                                       self.num_dynamic * self.num_params)
-
-        self.norm1 = nn.LayerNorm(self.dim_dynamic)
-        self.norm2 = nn.LayerNorm(self.hidden_dim)
-
-        self.activation = nn.ReLU()
-
-        pooler_resolution = 7
-        num_output = self.hidden_dim * pooler_resolution**2
-        self.out_layer = nn.Linear(num_output, self.hidden_dim)
-        self.norm3 = nn.LayerNorm(self.hidden_dim)
-
-    def forward(self, pro_features, roi_features):
-        '''
-        pro_features: (1, N * nr_boxes, self.d_model)
-        roi_features: (49, N * nr_boxes, self.d_model)
-        '''
-        features = roi_features.transpose(perm=[1, 0, 2])
-        parameters = self.dynamic_layer(pro_features).transpose(perm=[1, 0, 2])
-
-        param1 = parameters[:, :, :self.num_params].reshape(
-            [-1, self.hidden_dim, self.dim_dynamic])
-        param2 = parameters[:, :, self.num_params:].reshape(
-            [-1, self.dim_dynamic, self.hidden_dim])
-
-        features = paddle.bmm(features, param1)
-        features = self.norm1(features)
-        features = self.activation(features)
-
-        features = paddle.bmm(features, param2)
-        features = self.norm2(features)
-        features = self.activation(features)
-
-        features = features.flatten(1)
-        features = self.out_layer(features)
-        features = self.norm3(features)
-        features = self.activation(features)
-
-        return features
-
-
-class RCNNHead(nn.Layer):
-    def __init__(
-            self,
-            d_model,
-            num_classes,
-            dim_feedforward,
-            nhead,
-            dropout,
-            head_cls,
-            head_reg,
-            head_dim_dynamic,
-            head_num_dynamic,
-            scale_clamp: float=_DEFAULT_SCALE_CLAMP,
-            bbox_weights=(2.0, 2.0, 1.0, 1.0), ):
-        super().__init__()
-
-        self.d_model = d_model
-
-        # dynamic.
-        self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout)
-        self.inst_interact = DynamicConv(d_model, head_dim_dynamic,
-                                         head_num_dynamic)
-
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-        self.dropout3 = nn.Dropout(dropout)
-
-        self.activation = nn.ReLU()
-
-        # cls.
- num_cls = head_cls - cls_module = list() - for _ in range(num_cls): - cls_module.append(nn.Linear(d_model, d_model, bias_attr=False)) - cls_module.append(nn.LayerNorm(d_model)) - cls_module.append(nn.ReLU()) - self.cls_module = nn.LayerList(cls_module) - - # reg. - num_reg = head_reg - reg_module = list() - for _ in range(num_reg): - reg_module.append(nn.Linear(d_model, d_model, bias_attr=False)) - reg_module.append(nn.LayerNorm(d_model)) - reg_module.append(nn.ReLU()) - self.reg_module = nn.LayerList(reg_module) - - # pred. - self.class_logits = nn.Linear(d_model, num_classes) - self.bboxes_delta = nn.Linear(d_model, 4) - self.scale_clamp = scale_clamp - self.bbox_weights = bbox_weights - - def forward(self, features, bboxes, pro_features, pooler): - """ - :param bboxes: (N, nr_boxes, 4) - :param pro_features: (N, nr_boxes, d_model) - """ - - N, nr_boxes = bboxes.shape[:2] - - proposal_boxes = list() - for b in range(N): - proposal_boxes.append(bboxes[b]) - roi_num = paddle.full([N], nr_boxes).astype("int32") - - roi_features = pooler(features, proposal_boxes, roi_num) - roi_features = roi_features.reshape( - [N * nr_boxes, self.d_model, -1]).transpose(perm=[2, 0, 1]) - - # self_att. - pro_features = pro_features.reshape([N, nr_boxes, self.d_model]) - pro_features2 = self.self_attn( - pro_features, pro_features, value=pro_features) - pro_features = pro_features.transpose(perm=[1, 0, 2]) + self.dropout1( - pro_features2.transpose(perm=[1, 0, 2])) - pro_features = self.norm1(pro_features) - - # inst_interact. - pro_features = pro_features.reshape( - [nr_boxes, N, self.d_model]).transpose(perm=[1, 0, 2]).reshape( - [1, N * nr_boxes, self.d_model]) - pro_features2 = self.inst_interact(pro_features, roi_features) - pro_features = pro_features + self.dropout2(pro_features2) - obj_features = self.norm2(pro_features) - - # obj_feature. 
-        obj_features2 = self.linear2(
-            self.dropout(self.activation(self.linear1(obj_features))))
-        obj_features = obj_features + self.dropout3(obj_features2)
-        obj_features = self.norm3(obj_features)
-
-        fc_feature = obj_features.transpose(perm=[1, 0, 2]).reshape(
-            [N * nr_boxes, -1])
-        cls_feature = fc_feature.clone()
-        reg_feature = fc_feature.clone()
-        for cls_layer in self.cls_module:
-            cls_feature = cls_layer(cls_feature)
-        for reg_layer in self.reg_module:
-            reg_feature = reg_layer(reg_feature)
-        class_logits = self.class_logits(cls_feature)
-        bboxes_deltas = self.bboxes_delta(reg_feature)
-        pred_bboxes = delta2bbox(bboxes_deltas,
-                                 bboxes.reshape([-1, 4]), self.bbox_weights)
-
-        return class_logits.reshape([N, nr_boxes, -1]), pred_bboxes.reshape(
-            [N, nr_boxes, -1]), obj_features
-
-
-@register
-class SparseRCNNHead(nn.Layer):
-    '''
-    SparseRCNNHead
-    Args:
-        roi_input_shape (list[ShapeSpec]): The output shape of fpn
-        num_classes (int): Number of classes,
-        head_hidden_dim (int): The hidden dimension of MultiHeadAttention,
-        head_dim_feedforward (int): The feed-forward dimension in each RCNNHead,
-        nhead (int): The number of heads in MultiHeadAttention,
-        head_dropout (float): The dropout probability,
-        head_cls (int): The number of layers in the classification branch,
-        head_reg (int): The number of layers in the regression branch,
-        head_dim_dynamic (int): The inner channel dimension of DynamicConv,
-        head_num_dynamic (int): The number of DynamicConv parameter groups,
-        head_num_heads (int): The number of stacked RCNNHead stages,
-        deep_supervision (bool): whether to supervise the intermediate results,
-        num_proposals (int): the number of proposal boxes and features
-    '''
-    __inject__ = ['loss_func']
-    __shared__ = ['num_classes']
-
-    def __init__(
-            self,
-            head_hidden_dim,
-            head_dim_feedforward,
-            nhead,
-            head_dropout,
-            head_cls,
-            head_reg,
-            head_dim_dynamic,
-            head_num_dynamic,
-            head_num_heads,
-            deep_supervision,
-            num_proposals,
-            num_classes=80,
-            loss_func="SparseRCNNLoss",
-            roi_input_shape=None, ):
-        super().__init__()
-        assert head_num_heads > 0, \
-            f'At least one RoI Head is required, but got {head_num_heads}.'
-
-        # Build RoI.
-        box_pooler = self._init_box_pooler(roi_input_shape)
-        self.box_pooler = box_pooler
-
-        # Build heads.
-        rcnn_head = RCNNHead(
-            head_hidden_dim,
-            num_classes,
-            head_dim_feedforward,
-            nhead,
-            head_dropout,
-            head_cls,
-            head_reg,
-            head_dim_dynamic,
-            head_num_dynamic, )
-        self.head_series = nn.LayerList(
-            [copy.deepcopy(rcnn_head) for i in range(head_num_heads)])
-        self.return_intermediate = deep_supervision
-
-        self.num_classes = num_classes
-
-        # build init proposal
-        self.init_proposal_features = nn.Embedding(num_proposals,
-                                                   head_hidden_dim)
-        self.init_proposal_boxes = nn.Embedding(num_proposals, 4)
-
-        self.lossfunc = loss_func
-
-        # Init parameters.
-        init.reset_initialized_parameter(self)
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        # init all parameters.
- prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - - for m in self.sublayers(): - if isinstance(m, nn.Linear): - init.xavier_normal_(m.weight, reverse=True) - elif not isinstance(m, nn.Embedding) and hasattr( - m, "weight") and m.weight.dim() > 1: - init.xavier_normal_(m.weight, reverse=False) - - if hasattr(m, "bias") and m.bias is not None and m.bias.shape[ - -1] == self.num_classes: - init.constant_(m.bias, bias_value) - - init_bboxes = paddle.empty_like(self.init_proposal_boxes.weight) - init_bboxes[:, :2] = 0.5 - init_bboxes[:, 2:] = 1.0 - self.init_proposal_boxes.weight.set_value(init_bboxes) - - @staticmethod - def _init_box_pooler(input_shape): - - pooler_resolution = 7 - sampling_ratio = 2 - - if input_shape is not None: - pooler_scales = tuple(1.0 / input_shape[k].stride - for k in range(len(input_shape))) - in_channels = [ - input_shape[f].channels for f in range(len(input_shape)) - ] - end_level = len(input_shape) - 1 - # Check all channel counts are equal - assert len(set(in_channels)) == 1, in_channels - else: - pooler_scales = [1.0 / 4.0, 1.0 / 8.0, 1.0 / 16.0, 1.0 / 32.0] - end_level = 3 - - aligned = True - if paddle.device.is_compiled_with_custom_device('npu'): - aligned = False - box_pooler = RoIAlign( - resolution=pooler_resolution, - spatial_scale=pooler_scales, - sampling_ratio=sampling_ratio, - end_level=end_level, - aligned=aligned) - return box_pooler - - def forward(self, features, input_whwh): - - bs = len(features[0]) - bboxes = box_cxcywh_to_xyxy(self.init_proposal_boxes.weight.clone( - )).unsqueeze(0) - bboxes = bboxes * input_whwh.unsqueeze(-2) - - init_features = self.init_proposal_features.weight.unsqueeze(0).tile( - [1, bs, 1]) - proposal_features = init_features.clone() - - inter_class_logits = [] - inter_pred_bboxes = [] - - for stage, rcnn_head in enumerate(self.head_series): - class_logits, pred_bboxes, proposal_features = rcnn_head( - features, bboxes, proposal_features, self.box_pooler) - - if self.return_intermediate or stage == len(self.head_series) - 1: - inter_class_logits.append(class_logits) - inter_pred_bboxes.append(pred_bboxes) - bboxes = pred_bboxes.detach() - - output = { - 'pred_logits': inter_class_logits[-1], - 'pred_boxes': inter_pred_bboxes[-1] - } - if self.return_intermediate: - output['aux_outputs'] = [{ - 'pred_logits': a, - 'pred_boxes': b - } for a, b in zip(inter_class_logits[:-1], inter_pred_bboxes[:-1])] - - return output - - def get_loss(self, outputs, targets): - losses = self.lossfunc(outputs, targets) - weight_dict = self.lossfunc.weight_dict - - for k in losses.keys(): - if k in weight_dict: - losses[k] *= weight_dict[k] - - return losses - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] - return paddle.stack(b, axis=-1) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ssd_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ssd_head.py deleted file mode 100644 index a6df482..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ssd_head.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from ppdet.core.workspace import register
-from paddle.regularizer import L2Decay
-from paddle import ParamAttr
-
-from ..layers import AnchorGeneratorSSD
-from ..cls_utils import _get_class_default_kwargs
-
-
-class SepConvLayer(nn.Layer):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 padding=1,
-                 conv_decay=0.):
-        super(SepConvLayer, self).__init__()
-        self.dw_conv = nn.Conv2D(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            kernel_size=kernel_size,
-            stride=1,
-            padding=padding,
-            groups=in_channels,
-            weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),
-            bias_attr=False)
-
-        self.bn = nn.BatchNorm2D(
-            in_channels,
-            weight_attr=ParamAttr(regularizer=L2Decay(0.)),
-            bias_attr=ParamAttr(regularizer=L2Decay(0.)))
-
-        self.pw_conv = nn.Conv2D(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),
-            bias_attr=False)
-
-    def forward(self, x):
-        x = self.dw_conv(x)
-        x = F.relu6(self.bn(x))
-        x = self.pw_conv(x)
-        return x
-
-
-class SSDExtraHead(nn.Layer):
-    def __init__(self,
-                 in_channels=256,
-                 out_channels=([256, 512], [256, 512], [128, 256], [128, 256],
-                               [128, 256]),
-                 strides=(2, 2, 2, 1, 1),
-                 paddings=(1, 1, 1, 0, 0)):
-        super(SSDExtraHead, self).__init__()
-        self.convs = nn.LayerList()
-        for out_channel, stride, padding in zip(out_channels, strides,
-                                                paddings):
-            self.convs.append(
-                self._make_layers(in_channels, out_channel[0], out_channel[1],
-                                  stride, padding))
-            in_channels = out_channel[-1]
-
-    def _make_layers(self, c_in, c_hidden, c_out, stride_3x3, padding_3x3):
-        return nn.Sequential(
-            nn.Conv2D(c_in, c_hidden, 1),
-            nn.ReLU(),
-            nn.Conv2D(c_hidden, c_out, 3, stride_3x3, padding_3x3), nn.ReLU())
-
-    def forward(self, x):
-        out = [x]
-        for conv_layer in self.convs:
-            out.append(conv_layer(out[-1]))
-        return out
-
-
-@register
-class SSDHead(nn.Layer):
-    """
-    SSDHead
-
-    Args:
-        num_classes (int): Number of classes
-        in_channels (list): Number of channels per input feature
-        anchor_generator (dict): Configuration of 'AnchorGeneratorSSD' instance
-        kernel_size (int): Conv kernel size
-        padding (int): Conv padding
-        use_sepconv (bool): Use SepConvLayer if true
-        conv_decay (float): Conv regularization coeff
-        loss (object): 'SSDLoss' instance
-        use_extra_head (bool): If you use ResNet34 as the backbone, you should set `use_extra_head`=True
-    """
-
-    __shared__ = ['num_classes']
-    __inject__ = ['anchor_generator', 'loss']
-
-    def __init__(self,
-                 num_classes=80,
-                 in_channels=(512, 1024, 512, 256, 256, 256),
-                 anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
-                 kernel_size=3,
-                 padding=1,
-                 use_sepconv=False,
-                 conv_decay=0.,
-                 loss='SSDLoss',
-                 use_extra_head=False):
-        super(SSDHead, self).__init__()
-        # add background class
-        self.num_classes = num_classes + 1
-        self.in_channels = in_channels
-        self.anchor_generator = anchor_generator
-        self.loss = loss
-        self.use_extra_head = use_extra_head
-
-        if self.use_extra_head:
-            self.ssd_extra_head = 
SSDExtraHead() - self.in_channels = [256, 512, 512, 256, 256, 256] - - if isinstance(anchor_generator, dict): - self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) - - self.num_priors = self.anchor_generator.num_priors - self.box_convs = [] - self.score_convs = [] - for i, num_prior in enumerate(self.num_priors): - box_conv_name = "boxes{}".format(i) - if not use_sepconv: - box_conv = self.add_sublayer( - box_conv_name, - nn.Conv2D( - in_channels=self.in_channels[i], - out_channels=num_prior * 4, - kernel_size=kernel_size, - padding=padding)) - else: - box_conv = self.add_sublayer( - box_conv_name, - SepConvLayer( - in_channels=self.in_channels[i], - out_channels=num_prior * 4, - kernel_size=kernel_size, - padding=padding, - conv_decay=conv_decay)) - self.box_convs.append(box_conv) - - score_conv_name = "scores{}".format(i) - if not use_sepconv: - score_conv = self.add_sublayer( - score_conv_name, - nn.Conv2D( - in_channels=self.in_channels[i], - out_channels=num_prior * self.num_classes, - kernel_size=kernel_size, - padding=padding)) - else: - score_conv = self.add_sublayer( - score_conv_name, - SepConvLayer( - in_channels=self.in_channels[i], - out_channels=num_prior * self.num_classes, - kernel_size=kernel_size, - padding=padding, - conv_decay=conv_decay)) - self.score_convs.append(score_conv) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def forward(self, feats, image, gt_bbox=None, gt_class=None): - if self.use_extra_head: - assert len(feats) == 1, \ - ("If you set use_extra_head=True, backbone feature " - "list length should be 1.") - feats = self.ssd_extra_head(feats[0]) - box_preds = [] - cls_scores = [] - for feat, box_conv, score_conv in zip(feats, self.box_convs, - self.score_convs): - box_pred = box_conv(feat) - box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) - box_pred = paddle.reshape(box_pred, [0, -1, 4]) - box_preds.append(box_pred) - - cls_score = score_conv(feat) - cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) - cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) - cls_scores.append(cls_score) - - prior_boxes = self.anchor_generator(feats, image) - - if self.training: - return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, - prior_boxes) - else: - return (box_preds, cls_scores), prior_boxes - - def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): - return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/tood_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/tood_head.py deleted file mode 100644 index be84098..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/tood_head.py +++ /dev/null @@ -1,376 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
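A note on the reshape idiom in SSDHead.forward above: in paddle.reshape, a 0 in the target shape means "copy the corresponding dimension from the input", so [0, -1, 4] keeps the batch axis while flattening one level's grid of priors. A small sketch of the per-level layout change, with illustrative sizes:

    import paddle

    N, A, H, W = 2, 6, 38, 38                    # batch, priors per cell, map size
    box_pred = paddle.rand([N, A * 4, H, W])     # raw box-branch conv output

    box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])  # [N, H, W, A*4]
    box_pred = paddle.reshape(box_pred, [0, -1, 4])      # [N, H*W*A, 4]
    assert list(box_pred.shape) == [N, H * W * A, 4]

The score branch follows the same pattern with num_classes channels per prior, and the resulting per-level lists line up with the prior boxes produced by AnchorGeneratorSSD.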
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant - -from ppdet.core.workspace import register -from ..initializer import normal_, constant_, bias_init_with_prob -from ppdet.modeling.bbox_utils import bbox_center, batch_distance2bbox -from ..losses import GIoULoss -from ppdet.modeling.layers import ConvNormLayer -from ppdet.modeling.ops import get_static_shape -from ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self, init_scale=1.): - super(ScaleReg, self).__init__() - self.scale_reg = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=init_scale)), - dtype="float32") - - def forward(self, inputs): - out = inputs * self.scale_reg - return out - - -class TaskDecomposition(nn.Layer): - """This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py - """ - - def __init__( - self, - feat_channels, - stacked_convs, - la_down_rate=8, - norm_type='gn', - norm_groups=32, ): - super(TaskDecomposition, self).__init__() - self.feat_channels = feat_channels - self.stacked_convs = stacked_convs - self.norm_type = norm_type - self.norm_groups = norm_groups - self.in_channels = self.feat_channels * self.stacked_convs - self.la_conv1 = nn.Conv2D(self.in_channels, - self.in_channels // la_down_rate, 1) - self.la_conv2 = nn.Conv2D(self.in_channels // la_down_rate, - self.stacked_convs, 1) - - self.reduction_conv = ConvNormLayer( - self.in_channels, - self.feat_channels, - filter_size=1, - stride=1, - norm_type=self.norm_type, - norm_groups=self.norm_groups) - - self._init_weights() - - def _init_weights(self): - normal_(self.la_conv1.weight, std=0.001) - normal_(self.la_conv2.weight, std=0.001) - - def forward(self, feat, avg_feat): - feat_shape = get_static_shape(feat) - b = feat_shape[0:1] - h = feat_shape[2:3] - w = feat_shape[3:4] - weight = F.relu(self.la_conv1(avg_feat)) - weight = F.sigmoid(self.la_conv2(weight)).unsqueeze(-1) - feat = paddle.reshape( - feat, [b, self.stacked_convs, self.feat_channels, h, w]) * weight - feat = self.reduction_conv(feat.flatten(1, 2)) - feat = F.relu(feat) - return feat - - -@register -class TOODHead(nn.Layer): - """This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py - """ - __inject__ = ['nms', 'static_assigner', 'assigner'] - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - feat_channels=256, - stacked_convs=6, - fpn_strides=(8, 16, 32, 64, 128), - grid_cell_scale=8, - grid_cell_offset=0.5, - norm_type='gn', - norm_groups=32, - static_assigner_epoch=4, - use_align_head=True, - loss_weight={ - 'class': 1.0, - 'bbox': 1.0, - 'iou': 2.0, - }, - nms='MultiClassNMS', - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner'): - super(TOODHead, self).__init__() - self.num_classes = num_classes - self.feat_channels = feat_channels - self.stacked_convs = stacked_convs - self.fpn_strides = fpn_strides - self.grid_cell_scale = grid_cell_scale - self.grid_cell_offset = grid_cell_offset - self.static_assigner_epoch = static_assigner_epoch - self.use_align_head = use_align_head - self.nms = nms - self.static_assigner = static_assigner - self.assigner = assigner - self.loss_weight = 
loss_weight - self.giou_loss = GIoULoss() - - self.inter_convs = nn.LayerList() - for i in range(self.stacked_convs): - self.inter_convs.append( - ConvNormLayer( - self.feat_channels, - self.feat_channels, - filter_size=3, - stride=1, - norm_type=norm_type, - norm_groups=norm_groups)) - - self.cls_decomp = TaskDecomposition( - self.feat_channels, - self.stacked_convs, - self.stacked_convs * 8, - norm_type=norm_type, - norm_groups=norm_groups) - self.reg_decomp = TaskDecomposition( - self.feat_channels, - self.stacked_convs, - self.stacked_convs * 8, - norm_type=norm_type, - norm_groups=norm_groups) - - self.tood_cls = nn.Conv2D( - self.feat_channels, self.num_classes, 3, padding=1) - self.tood_reg = nn.Conv2D(self.feat_channels, 4, 3, padding=1) - - if self.use_align_head: - self.cls_prob_conv1 = nn.Conv2D(self.feat_channels * - self.stacked_convs, - self.feat_channels // 4, 1) - self.cls_prob_conv2 = nn.Conv2D( - self.feat_channels // 4, 1, 3, padding=1) - self.reg_offset_conv1 = nn.Conv2D(self.feat_channels * - self.stacked_convs, - self.feat_channels // 4, 1) - self.reg_offset_conv2 = nn.Conv2D( - self.feat_channels // 4, 4 * 2, 3, padding=1) - - self.scales_regs = nn.LayerList([ScaleReg() for _ in self.fpn_strides]) - - self._init_weights() - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'feat_channels': input_shape[0].channels, - 'fpn_strides': [i.stride for i in input_shape], - } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - normal_(self.tood_cls.weight, std=0.01) - constant_(self.tood_cls.bias, bias_cls) - normal_(self.tood_reg.weight, std=0.01) - - if self.use_align_head: - normal_(self.cls_prob_conv1.weight, std=0.01) - normal_(self.cls_prob_conv2.weight, std=0.01) - constant_(self.cls_prob_conv2.bias, bias_cls) - normal_(self.reg_offset_conv1.weight, std=0.001) - constant_(self.reg_offset_conv2.weight) - constant_(self.reg_offset_conv2.bias) - - def _reg_grid_sample(self, feat, offset, anchor_points): - feat_shape = get_static_shape(feat) - b = feat_shape[0:1] - h = feat_shape[2:3] - w = feat_shape[3:4] - feat = paddle.reshape(feat, [-1, 1, h, w]) - offset = paddle.reshape(offset, [-1, 2, h, w]).transpose([0, 2, 3, 1]) - grid_shape = paddle.concat([w, h]).astype('float32') - grid = (offset + anchor_points) / grid_shape - grid = 2 * grid.clip(0., 1.) 
- 1 - feat = F.grid_sample(feat, grid) - feat = paddle.reshape(feat, [b, -1, h, w]) - return feat - - def forward(self, feats): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - anchors, anchor_points, num_anchors_list, stride_tensor =\ - generate_anchors_for_grid_cell( - feats, self.fpn_strides, self.grid_cell_scale, - self.grid_cell_offset) - anchor_centers_split = paddle.split(anchor_points / stride_tensor, - num_anchors_list) - - cls_score_list, bbox_pred_list = [], [] - for feat, scale_reg, anchor_centers, stride in zip( - feats, self.scales_regs, anchor_centers_split, - self.fpn_strides): - b, _, h, w = get_static_shape(feat) - inter_feats = [] - for inter_conv in self.inter_convs: - feat = F.relu(inter_conv(feat)) - inter_feats.append(feat) - feat = paddle.concat(inter_feats, axis=1) - - # task decomposition - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_feat = self.cls_decomp(feat, avg_feat) - reg_feat = self.reg_decomp(feat, avg_feat) - - # cls prediction and alignment - cls_logits = self.tood_cls(cls_feat) - if self.use_align_head: - cls_prob = F.relu(self.cls_prob_conv1(feat)) - cls_prob = F.sigmoid(self.cls_prob_conv2(cls_prob)) - cls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt() - else: - cls_score = F.sigmoid(cls_logits) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - - # reg prediction and alignment - reg_dist = scale_reg(self.tood_reg(reg_feat).exp()) - reg_dist = reg_dist.flatten(2).transpose([0, 2, 1]) - reg_bbox = batch_distance2bbox( - anchor_centers.unsqueeze(0), reg_dist) - if self.use_align_head: - reg_offset = F.relu(self.reg_offset_conv1(feat)) - reg_offset = self.reg_offset_conv2(reg_offset) - reg_bbox = reg_bbox.transpose([0, 2, 1]).reshape([b, 4, h, w]) - anchor_centers = anchor_centers.reshape([1, h, w, 2]) - bbox_pred = self._reg_grid_sample(reg_bbox, reg_offset, - anchor_centers) - bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) - else: - bbox_pred = reg_bbox - - if not self.training: - bbox_pred *= stride - bbox_pred_list.append(bbox_pred) - cls_score_list = paddle.concat(cls_score_list, axis=1) - bbox_pred_list = paddle.concat(bbox_pred_list, axis=1) - - return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor - - @staticmethod - def _focal_loss(score, label, alpha=0.25, gamma=2.0): - weight = (score - label).pow(gamma) - if alpha > 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - weight *= alpha_t - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction='sum') - return loss - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_bboxes, anchors, \ - num_anchors_list, stride_tensor = head_outs - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = 0.25 - else: - assigned_labels, assigned_bboxes, assigned_scores = self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - bbox_center(anchors), - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - - # rescale bbox - assigned_bboxes /= stride_tensor - # classification loss - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l) - # select positive 
samples mask - mask_positive = (assigned_labels != self.num_classes) - num_pos = mask_positive.astype(paddle.float32).sum() - # bbox regression loss - if num_pos > 0: - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 4]) - assigned_bboxes_pos = paddle.masked_select( - assigned_bboxes, bbox_mask).reshape([-1, 4]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).unsqueeze(-1) - # iou loss - loss_iou = self.giou_loss(pred_bboxes_pos, - assigned_bboxes_pos) * bbox_weight - loss_iou = loss_iou.sum() / bbox_weight.sum() - # l1 loss - loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) - else: - loss_iou = paddle.zeros([1]) - loss_l1 = paddle.zeros([1]) - - loss_cls /= assigned_scores.sum().clip(min=1) - loss = self.loss_weight['class'] * loss_cls + self.loss_weight[ - 'iou'] * loss_iou - - return { - 'loss': loss, - 'loss_class': loss_cls, - 'loss_iou': loss_iou, - 'loss_l1': loss_l1 - } - - def post_process(self, head_outs, img_shape, scale_factor): - pred_scores, pred_bboxes, _, _, _ = head_outs - pred_scores = pred_scores.transpose([0, 2, 1]) - - for i in range(len(pred_bboxes)): - pred_bboxes[i, :, 0] = pred_bboxes[i, :, 0].clip( - min=0, max=img_shape[i, 1]) - pred_bboxes[i, :, 1] = pred_bboxes[i, :, 1].clip( - min=0, max=img_shape[i, 0]) - pred_bboxes[i, :, 2] = pred_bboxes[i, :, 2].clip( - min=0, max=img_shape[i, 1]) - pred_bboxes[i, :, 3] = pred_bboxes[i, :, 3].clip( - min=0, max=img_shape[i, 0]) - # scale bbox to origin - scale_factor = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) - pred_bboxes /= scale_factor - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ttf_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ttf_head.py deleted file mode 100644 index dfe97bd..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ttf_head.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant, Normal -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register -from ppdet.modeling.layers import DeformableConvV2, LiteConv -import numpy as np - - -@register -class HMHead(nn.Layer): - """ - Args: - ch_in (int): The channel number of input Tensor. - ch_out (int): The channel number of output Tensor. - num_classes (int): Number of classes. - conv_num (int): The convolution number of hm_feat. - dcn_head(bool): whether use dcn in head. False by default. - lite_head(bool): whether use lite version. False by default. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. 
- bn by default - - Return: - Heatmap head output - """ - __shared__ = ['num_classes', 'norm_type'] - - def __init__( - self, - ch_in, - ch_out=128, - num_classes=80, - conv_num=2, - dcn_head=False, - lite_head=False, - norm_type='bn', ): - super(HMHead, self).__init__() - head_conv = nn.Sequential() - for i in range(conv_num): - name = 'conv.{}'.format(i) - if lite_head: - lite_name = 'hm.' + name - head_conv.add_sublayer( - lite_name, - LiteConv( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - norm_type=norm_type)) - else: - if dcn_head: - head_conv.add_sublayer( - name, - DeformableConvV2( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) - else: - head_conv.add_sublayer( - name, - nn.Conv2D( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - head_conv.add_sublayer(name + '.act', nn.ReLU()) - self.feat = head_conv - bias_init = float(-np.log((1 - 0.01) / 0.01)) - weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, - 0.01)) - self.head = nn.Conv2D( - in_channels=ch_out, - out_channels=num_classes, - kernel_size=1, - weight_attr=weight_attr, - bias_attr=ParamAttr( - learning_rate=2., - regularizer=L2Decay(0.), - initializer=Constant(bias_init))) - - def forward(self, feat): - out = self.feat(feat) - out = self.head(out) - return out - - -@register -class WHHead(nn.Layer): - """ - Args: - ch_in (int): The channel number of input Tensor. - ch_out (int): The channel number of output Tensor. - conv_num (int): The convolution number of wh_feat. - dcn_head(bool): whether use dcn in head. False by default. - lite_head(bool): whether use lite version. False by default. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. - bn by default - Return: - Width & Height head output - """ - __shared__ = ['norm_type'] - - def __init__(self, - ch_in, - ch_out=64, - conv_num=2, - dcn_head=False, - lite_head=False, - norm_type='bn'): - super(WHHead, self).__init__() - head_conv = nn.Sequential() - for i in range(conv_num): - name = 'conv.{}'.format(i) - if lite_head: - lite_name = 'wh.' + name - head_conv.add_sublayer( - lite_name, - LiteConv( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - norm_type=norm_type)) - else: - if dcn_head: - head_conv.add_sublayer( - name, - DeformableConvV2( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) - else: - head_conv.add_sublayer( - name, - nn.Conv2D( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - head_conv.add_sublayer(name + '.act', nn.ReLU()) - - weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, - 0.01)) - self.feat = head_conv - self.head = nn.Conv2D( - in_channels=ch_out, - out_channels=4, - kernel_size=1, - weight_attr=weight_attr, - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.))) - - def forward(self, feat): - out = self.feat(feat) - out = self.head(out) - out = F.relu(out) - return out - - -@register -class TTFHead(nn.Layer): - """ - TTFHead - Args: - in_channels (int): the channel number of input to TTFHead. 
- num_classes (int): the number of classes, 80 by default. - hm_head_planes (int): the channel number in heatmap head, - 128 by default. - wh_head_planes (int): the channel number in width & height head, - 64 by default. - hm_head_conv_num (int): the number of convolution in heatmap head, - 2 by default. - wh_head_conv_num (int): the number of convolution in width & height - head, 2 by default. - hm_loss (object): Instance of 'CTFocalLoss'. - wh_loss (object): Instance of 'GIoULoss'. - wh_offset_base (float): the base offset of width and height, - 16.0 by default. - down_ratio (int): the actual down_ratio is calculated by base_down_ratio - (default 16) and the number of upsample layers. - lite_head(bool): whether use lite version. False by default. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. - bn by default - ags_module(bool): whether use AGS module to reweight location feature. - false by default. - - """ - - __shared__ = ['num_classes', 'down_ratio', 'norm_type'] - __inject__ = ['hm_loss', 'wh_loss'] - - def __init__(self, - in_channels, - num_classes=80, - hm_head_planes=128, - wh_head_planes=64, - hm_head_conv_num=2, - wh_head_conv_num=2, - hm_loss='CTFocalLoss', - wh_loss='GIoULoss', - wh_offset_base=16., - down_ratio=4, - dcn_head=False, - lite_head=False, - norm_type='bn', - ags_module=False): - super(TTFHead, self).__init__() - self.in_channels = in_channels - self.hm_head = HMHead(in_channels, hm_head_planes, num_classes, - hm_head_conv_num, dcn_head, lite_head, norm_type) - self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num, - dcn_head, lite_head, norm_type) - self.hm_loss = hm_loss - self.wh_loss = wh_loss - - self.wh_offset_base = wh_offset_base - self.down_ratio = down_ratio - self.ags_module = ags_module - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels, } - - def forward(self, feats): - hm = self.hm_head(feats) - wh = self.wh_head(feats) * self.wh_offset_base - return hm, wh - - def filter_box_by_weight(self, pred, target, weight): - """ - Filter out boxes where ttf_reg_weight is 0, only keep positive samples. 
- """ - index = paddle.nonzero(weight > 0) - index.stop_gradient = True - weight = paddle.gather_nd(weight, index) - pred = paddle.gather_nd(pred, index) - target = paddle.gather_nd(target, index) - return pred, target, weight - - def filter_loc_by_weight(self, score, weight): - index = paddle.nonzero(weight > 0) - index.stop_gradient = True - score = paddle.gather_nd(score, index) - return score - - def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight): - pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4) - hm_loss = self.hm_loss(pred_hm, target_hm) - H, W = target_hm.shape[2:] - mask = paddle.reshape(target_weight, [-1, H, W]) - avg_factor = paddle.sum(mask) + 1e-4 - - base_step = self.down_ratio - shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32') - shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32') - shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x]) - base_loc = paddle.stack([shift_x, shift_y], axis=0) - base_loc.stop_gradient = True - - pred_boxes = paddle.concat( - [0 - pred_wh[:, 0:2, :, :] + base_loc, pred_wh[:, 2:4] + base_loc], - axis=1) - pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1]) - boxes = paddle.transpose(box_target, [0, 2, 3, 1]) - boxes.stop_gradient = True - - if self.ags_module: - pred_hm_max = paddle.max(pred_hm, axis=1, keepdim=True) - pred_hm_max_softmax = F.softmax(pred_hm_max, axis=1) - pred_hm_max_softmax = paddle.transpose(pred_hm_max_softmax, - [0, 2, 3, 1]) - pred_hm_max_softmax = self.filter_loc_by_weight(pred_hm_max_softmax, - mask) - else: - pred_hm_max_softmax = None - - pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes, - mask) - mask.stop_gradient = True - wh_loss = self.wh_loss( - pred_boxes, - boxes, - iou_weight=mask.unsqueeze(1), - loc_reweight=pred_hm_max_softmax) - wh_loss = wh_loss / avg_factor - - ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss} - return ttf_loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/vitpose_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/vitpose_head.py deleted file mode 100644 index 43908ed..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/vitpose_head.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.keypoint_utils import resize, flip_back -from paddle.nn.initializer import TruncatedNormal, Constant, Normal -from ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d - -trunc_normal_ = TruncatedNormal(std=.02) -normal_ = Normal(std=0.001) -zeros_ = Constant(value=0.) -ones_ = Constant(value=1.) 
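Looking back at TTFHead.get_loss above (before the vitpose_head.py hunk): boxes are decoded densely by adding the predicted left/top/right/bottom distances to a stride-spaced grid of pixel coordinates, the base_loc tensor. A minimal sketch of that decode, using float coordinates and illustrative sizes rather than the int32 grid built above:

    import paddle

    H, W, stride = 4, 4, 4                     # tiny feature map, down_ratio of 4
    shifts_x = paddle.arange(0, W * stride, stride, dtype='float32')
    shifts_y = paddle.arange(0, H * stride, stride, dtype='float32')
    shift_y, shift_x = paddle.meshgrid([shifts_y, shifts_x])
    base_loc = paddle.stack([shift_x, shift_y], axis=0)   # [2, H, W] pixel coords

    pred_wh = paddle.rand([1, 4, H, W])                   # l, t, r, b per pixel
    pred_boxes = paddle.concat(
        [base_loc - pred_wh[:, 0:2], base_loc + pred_wh[:, 2:4]], axis=1)
    # pred_boxes: [1, 4, H, W] in xyxy form; the head transposes this to NHWC
    # before feeding the GIoU loss, masked to positive locations.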
- -__all__ = ['TopdownHeatmapSimpleHead'] - - -@register -class TopdownHeatmapSimpleHead(nn.Layer): - def __init__(self, - in_channels=768, - out_channels=17, - num_deconv_layers=3, - num_deconv_filters=(256, 256, 256), - num_deconv_kernels=(4, 4, 4), - extra=None, - in_index=0, - input_transform=None, - align_corners=False, - upsample=0, - flip_pairs=None, - shift_heatmap=False, - target_type='GaussianHeatmap'): - super(TopdownHeatmapSimpleHead, self).__init__() - - self.in_channels = in_channels - self.upsample = upsample - self.flip_pairs = flip_pairs - self.shift_heatmap = shift_heatmap - self.target_type = target_type - - self._init_inputs(in_channels, in_index, input_transform) - self.in_index = in_index - self.align_corners = align_corners - - if extra is not None and not isinstance(extra, dict): - raise TypeError('extra should be dict or None.') - - if num_deconv_layers > 0: - self.deconv_layers = self._make_deconv_layer( - num_deconv_layers, - num_deconv_filters, - num_deconv_kernels, ) - elif num_deconv_layers == 0: - self.deconv_layers = nn.Identity() - else: - raise ValueError( - f'num_deconv_layers ({num_deconv_layers}) should >= 0.') - - identity_final_layer = False - if extra is not None and 'final_conv_kernel' in extra: - assert extra['final_conv_kernel'] in [0, 1, 3] - if extra['final_conv_kernel'] == 3: - padding = 1 - elif extra['final_conv_kernel'] == 1: - padding = 0 - else: - # 0 for Identity mapping. - identity_final_layer = True - kernel_size = extra['final_conv_kernel'] - else: - kernel_size = 1 - padding = 0 - - if identity_final_layer: - self.final_layer = nn.Identity() - else: - conv_channels = num_deconv_filters[ - -1] if num_deconv_layers > 0 else self.in_channels - - layers = [] - if extra is not None: - num_conv_layers = extra.get('num_conv_layers', 0) - num_conv_kernels = extra.get('num_conv_kernels', - [1] * num_conv_layers) - - for i in range(num_conv_layers): - layers.append( - nn.Conv2D( - in_channels=conv_channels, - out_channels=conv_channels, - kernel_size=num_conv_kernels[i], - stride=1, - padding=(num_conv_kernels[i] - 1) // 2)) - layers.append(nn.BatchNorm2D(conv_channels)) - layers.append(nn.ReLU()) - - layers.append( - nn.Conv2D( - in_channels=conv_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=1, - padding=(padding, padding))) - - if len(layers) > 1: - self.final_layer = nn.Sequential(*layers) - else: - self.final_layer = layers[0] - - self.init_weights() - - @staticmethod - def _get_deconv_cfg(deconv_kernel): - """Get configurations for deconv layers.""" - if deconv_kernel == 4: - padding = 1 - output_padding = 0 - elif deconv_kernel == 3: - padding = 1 - output_padding = 1 - elif deconv_kernel == 2: - padding = 0 - output_padding = 0 - else: - raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') - - return deconv_kernel, padding, output_padding - - def _init_inputs(self, in_channels, in_index, input_transform): - """Check and initialize input transforms. 
- """ - - if input_transform is not None: - assert input_transform in ['resize_concat', 'multiple_select'] - self.input_transform = input_transform - self.in_index = in_index - if input_transform is not None: - assert isinstance(in_channels, (list, tuple)) - assert isinstance(in_index, (list, tuple)) - assert len(in_channels) == len(in_index) - if input_transform == 'resize_concat': - self.in_channels = sum(in_channels) - else: - self.in_channels = in_channels - else: - assert isinstance(in_channels, int) - assert isinstance(in_index, int) - self.in_channels = in_channels - - def _transform_inputs(self, inputs): - """Transform inputs for decoder. - """ - if not isinstance(inputs, list): - if not isinstance(inputs, list): - - if self.upsample > 0: - inputs = resize( - input=F.relu(inputs), - scale_factor=self.upsample, - mode='bilinear', - align_corners=self.align_corners) - return inputs - - if self.input_transform == 'resize_concat': - inputs = [inputs[i] for i in self.in_index] - upsampled_inputs = [ - resize( - input=x, - size=inputs[0].shape[2:], - mode='bilinear', - align_corners=self.align_corners) for x in inputs - ] - inputs = paddle.concat(upsampled_inputs, dim=1) - elif self.input_transform == 'multiple_select': - inputs = [inputs[i] for i in self.in_index] - else: - inputs = inputs[self.in_index] - - return inputs - - def forward(self, x): - """Forward function.""" - x = self._transform_inputs(x) - x = self.deconv_layers(x) - x = self.final_layer(x) - - return x - - def inference_model(self, x, flip_pairs=None): - """Inference function. - - Returns: - output_heatmap (np.ndarray): Output heatmaps. - - Args: - x (torch.Tensor[N,K,H,W]): Input features. - flip_pairs (None | list[tuple]): - Pairs of keypoints which are mirrored. - """ - output = self.forward(x) - - if flip_pairs is not None: - output_heatmap = flip_back( - output, self.flip_pairs, target_type=self.target_type) - # feature is not aligned, shift flipped heatmap for higher accuracy - if self.shift_heatmap: - output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] - else: - output_heatmap = output - return output_heatmap - - def _make_deconv_layer(self, num_layers, num_filters, num_kernels): - """Make deconv layers.""" - if num_layers != len(num_filters): - error_msg = f'num_layers({num_layers}) ' \ - f'!= length of num_filters({len(num_filters)})' - raise ValueError(error_msg) - if num_layers != len(num_kernels): - error_msg = f'num_layers({num_layers}) ' \ - f'!= length of num_kernels({len(num_kernels)})' - raise ValueError(error_msg) - - layers = [] - for i in range(num_layers): - kernel, padding, output_padding = \ - self._get_deconv_cfg(num_kernels[i]) - - planes = num_filters[i] - layers.append( - ConvTranspose2d( - in_channels=self.in_channels, - out_channels=planes, - kernel_size=kernel, - stride=2, - padding=padding, - output_padding=output_padding, - bias=False)) - layers.append(nn.BatchNorm2D(planes)) - layers.append(nn.ReLU()) - self.in_channels = planes - - return nn.Sequential(*layers) - - def init_weights(self): - """Initialize model weights.""" - if not isinstance(self.deconv_layers, nn.Identity): - - for m in self.deconv_layers: - if isinstance(m, nn.BatchNorm2D): - ones_(m.weight) - ones_(m.bias) - if not isinstance(self.final_layer, nn.Conv2D): - - for m in self.final_layer: - if isinstance(m, nn.Conv2D): - normal_(m.weight) - zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2D): - ones_(m.weight) - ones_(m.bias) - else: - normal_(self.final_layer.weight) - zeros_(self.final_layer.bias) diff --git 
a/pdfdet/models/Paddle/ppdet/modeling/heads/yolo_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/yolo_head.py deleted file mode 100644 index 0a63060..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/yolo_head.py +++ /dev/null @@ -1,416 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register - -import math -import numpy as np -from ..initializer import bias_init_with_prob, constant_ -from ..backbones.csp_darknet import BaseConv, DWConv -from ..losses import IouLoss -from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner -from ppdet.modeling.bbox_utils import bbox_overlaps -from ppdet.modeling.layers import MultiClassNMS - -__all__ = ['YOLOv3Head', 'YOLOXHead'] - - -def _de_sigmoid(x, eps=1e-7): - x = paddle.clip(x, eps, 1. / eps) - x = paddle.clip(1. / x - 1., eps, 1. / eps) - x = -paddle.log(x) - return x - - -@register -class YOLOv3Head(nn.Layer): - __shared__ = ['num_classes', 'data_format'] - __inject__ = ['loss'] - - def __init__(self, - in_channels=[1024, 512, 256], - anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], - [59, 119], [116, 90], [156, 198], [373, 326]], - anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], - num_classes=80, - loss='YOLOv3Loss', - iou_aware=False, - iou_aware_factor=0.4, - data_format='NCHW'): - """ - Head for YOLOv3 network - - Args: - num_classes (int): number of foreground classes - anchors (list): anchors - anchor_masks (list): anchor masks - loss (object): YOLOv3Loss instance - iou_aware (bool): whether to use iou_aware - iou_aware_factor (float): iou aware factor - data_format (str): data format, NCHW or NHWC - """ - super(YOLOv3Head, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_classes = num_classes - self.loss = loss - - self.iou_aware = iou_aware - self.iou_aware_factor = iou_aware_factor - - self.parse_anchor(anchors, anchor_masks) - self.num_outputs = len(self.anchors) - self.data_format = data_format - - self.yolo_outputs = [] - for i in range(len(self.anchors)): - - if self.iou_aware: - num_filters = len(self.anchors[i]) * (self.num_classes + 6) - else: - num_filters = len(self.anchors[i]) * (self.num_classes + 5) - name = 'yolo_output.{}'.format(i) - conv = nn.Conv2D( - in_channels=self.in_channels[i], - out_channels=num_filters, - kernel_size=1, - stride=1, - padding=0, - data_format=data_format, - bias_attr=ParamAttr(regularizer=L2Decay(0.))) - conv.skip_quant = True - yolo_output = self.add_sublayer(name, conv) - self.yolo_outputs.append(yolo_output) - - def parse_anchor(self, anchors, anchor_masks): - self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] - self.mask_anchors = [] - anchor_num = len(anchors) - for masks in anchor_masks: - self.mask_anchors.append([]) - for 
mask in masks: - assert mask < anchor_num, "anchor mask index overflow" - self.mask_anchors[-1].extend(anchors[mask]) - - def forward(self, feats, targets=None): - assert len(feats) == len(self.anchors) - yolo_outputs = [] - for i, feat in enumerate(feats): - yolo_output = self.yolo_outputs[i](feat) - if self.data_format == 'NHWC': - yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) - yolo_outputs.append(yolo_output) - - if self.training: - return self.loss(yolo_outputs, targets, self.anchors) - else: - if self.iou_aware: - y = [] - for i, out in enumerate(yolo_outputs): - na = len(self.anchors[i]) - ioup, x = out[:, 0:na, :, :], out[:, na:, :, :] - b, c, h, w = x.shape - no = c // na - x = x.reshape((b, na, no, h * w)) - ioup = ioup.reshape((b, na, 1, h * w)) - obj = x[:, :, 4:5, :] - ioup = F.sigmoid(ioup) - obj = F.sigmoid(obj) - obj_t = (obj**(1 - self.iou_aware_factor)) * ( - ioup**self.iou_aware_factor) - obj_t = _de_sigmoid(obj_t) - loc_t = x[:, :, :4, :] - cls_t = x[:, :, 5:, :] - y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2) - y_t = y_t.reshape((b, c, h, w)) - y.append(y_t) - return y - else: - return yolo_outputs - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - -@register -class YOLOXHead(nn.Layer): - __shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms'] - __inject__ = ['assigner', 'nms'] - - def __init__(self, - num_classes=80, - width_mult=1.0, - depthwise=False, - in_channels=[256, 512, 1024], - feat_channels=256, - fpn_strides=(8, 16, 32), - l1_epoch=285, - act='silu', - assigner=SimOTAAssigner(use_vfl=False), - nms='MultiClassNMS', - loss_weight={ - 'cls': 1.0, - 'obj': 1.0, - 'iou': 5.0, - 'l1': 1.0, - }, - trt=False, - exclude_nms=False): - super(YOLOXHead, self).__init__() - self._dtype = paddle.framework.get_default_dtype() - self.num_classes = num_classes - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - feat_channels = int(feat_channels * width_mult) - self.fpn_strides = fpn_strides - self.l1_epoch = l1_epoch - self.assigner = assigner - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.exclude_nms = exclude_nms - self.loss_weight = loss_weight - self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5 - - ConvBlock = DWConv if depthwise else BaseConv - - self.stem_conv = nn.LayerList() - self.conv_cls = nn.LayerList() - self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj - for in_c in self.in_channels: - self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act)) - - self.conv_cls.append( - nn.Sequential(* [ - ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), - nn.Conv2D( - feat_channels, - self.num_classes, - 1, - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - ])) - - self.conv_reg.append( - nn.Sequential(* [ - ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), - ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), - nn.Conv2D( - feat_channels, - 4 + 1, # reg [x,y,w,h] + obj - 1, - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - ])) - - self._init_weights() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype) - bias_reg[:2] = 0. 
- bias_reg[-1] = bias_cls - for cls_, reg_ in zip(self.conv_cls, self.conv_reg): - constant_(cls_[-1].weight) - constant_(cls_[-1].bias, bias_cls) - constant_(reg_[-1].weight) - reg_[-1].bias.set_value(bias_reg) - - def _generate_anchor_point(self, feat_sizes, strides, offset=0.): - anchor_points, stride_tensor = [], [] - num_anchors_list = [] - for feat_size, stride in zip(feat_sizes, strides): - h, w = feat_size - x = (paddle.arange(w) + offset) * stride - y = (paddle.arange(h) + offset) * stride - y, x = paddle.meshgrid(y, x) - anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) - stride_tensor.append( - paddle.full( - [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) - num_anchors_list.append(len(anchor_points[-1])) - anchor_points = paddle.concat(anchor_points).astype(self._dtype) - anchor_points.stop_gradient = True - stride_tensor = paddle.concat(stride_tensor) - stride_tensor.stop_gradient = True - return anchor_points, stride_tensor, num_anchors_list - - def forward(self, feats, targets=None): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] - cls_score_list, reg_pred_list = [], [] - obj_score_list = [] - for i, feat in enumerate(feats): - feat = self.stem_conv[i](feat) - cls_logit = self.conv_cls[i](feat) - reg_pred = self.conv_reg[i](feat) - # cls prediction - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - # reg prediction - reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1) - reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1]) - reg_pred_list.append(reg_xywh) - # obj prediction - obj_score = F.sigmoid(obj_logit) - obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1])) - - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_pred_list = paddle.concat(reg_pred_list, axis=1) - obj_score_list = paddle.concat(obj_score_list, axis=1) - - # bbox decode - anchor_points, stride_tensor, _ =\ - self._generate_anchor_point(feat_sizes, self.fpn_strides) - reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1) - reg_xy += (anchor_points / stride_tensor) - reg_wh = paddle.exp(reg_wh) * 0.5 - bbox_pred_list = paddle.concat( - [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1) - - if self.training: - anchor_points, stride_tensor, num_anchors_list =\ - self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5) - yolox_losses = self.get_loss([ - cls_score_list, bbox_pred_list, obj_score_list, anchor_points, - stride_tensor, num_anchors_list - ], targets) - return yolox_losses - else: - pred_scores = (cls_score_list * obj_score_list).sqrt() - return pred_scores, bbox_pred_list, stride_tensor - - def get_loss(self, head_outs, targets): - pred_cls, pred_bboxes, pred_obj,\ - anchor_points, stride_tensor, num_anchors_list = head_outs - gt_labels = targets['gt_class'] - gt_bboxes = targets['gt_bbox'] - pred_scores = (pred_cls * pred_obj).sqrt() - # label assignment - center_and_strides = paddle.concat( - [anchor_points, stride_tensor, stride_tensor], axis=-1) - pos_num_list, label_list, bbox_target_list = [], [], [] - for pred_score, pred_bbox, gt_box, gt_label in zip( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels): - pos_num, label, _, bbox_target = self.assigner( - pred_score, center_and_strides, pred_bbox, gt_box, gt_label) - pos_num_list.append(pos_num) - label_list.append(label) - bbox_target_list.append(bbox_target) - labels = 
paddle.to_tensor(np.stack(label_list, axis=0)) - bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0)) - bbox_targets /= stride_tensor # rescale bbox - - # 1. obj score loss - mask_positive = (labels != self.num_classes) - loss_obj = F.binary_cross_entropy( - pred_obj, - mask_positive.astype(pred_obj.dtype).unsqueeze(-1), - reduction='sum') - - num_pos = sum(pos_num_list) - - if num_pos > 0: - num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1) - loss_obj /= num_pos - - # 2. iou loss - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 4]) - assigned_bboxes_pos = paddle.masked_select( - bbox_targets, bbox_mask).reshape([-1, 4]) - bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos) - bbox_iou = paddle.diag(bbox_iou) - - loss_iou = self.iou_loss( - pred_bboxes_pos.split( - 4, axis=-1), - assigned_bboxes_pos.split( - 4, axis=-1)) - loss_iou = loss_iou.sum() / num_pos - - # 3. cls loss - cls_mask = mask_positive.unsqueeze(-1).tile( - [1, 1, self.num_classes]) - pred_cls_pos = paddle.masked_select( - pred_cls, cls_mask).reshape([-1, self.num_classes]) - assigned_cls_pos = paddle.masked_select(labels, mask_positive) - assigned_cls_pos = F.one_hot(assigned_cls_pos, - self.num_classes + 1)[..., :-1] - assigned_cls_pos *= bbox_iou.unsqueeze(-1) - loss_cls = F.binary_cross_entropy( - pred_cls_pos, assigned_cls_pos, reduction='sum') - loss_cls /= num_pos - - # 4. l1 loss - if targets['epoch_id'] >= self.l1_epoch: - loss_l1 = F.l1_loss( - pred_bboxes_pos, assigned_bboxes_pos, reduction='sum') - loss_l1 /= num_pos - else: - loss_l1 = paddle.zeros([1]) - loss_l1.stop_gradient = False - else: - loss_cls = paddle.zeros([1]) - loss_iou = paddle.zeros([1]) - loss_l1 = paddle.zeros([1]) - loss_cls.stop_gradient = False - loss_iou.stop_gradient = False - loss_l1.stop_gradient = False - - loss = self.loss_weight['obj'] * loss_obj + \ - self.loss_weight['cls'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou - - if targets['epoch_id'] >= self.l1_epoch: - loss += (self.loss_weight['l1'] * loss_l1) - - yolox_losses = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_obj': loss_obj, - 'loss_iou': loss_iou, - 'loss_l1': loss_l1, - } - return yolox_losses - - def post_process(self, head_outs, img_shape, scale_factor): - pred_scores, pred_bboxes, stride_tensor = head_outs - pred_scores = pred_scores.transpose([0, 2, 1]) - pred_bboxes *= stride_tensor - # scale bbox to origin image - scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) - pred_bboxes /= scale_factor - if self.exclude_nms: - # `exclude_nms=True` just use in benchmark - return pred_bboxes.sum(), pred_scores.sum() - else: - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/yolof_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/yolof_head.py deleted file mode 100644 index 4893337..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/yolof_head.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.layers import MultiClassNMS -from ppdet.core.workspace import register -from ppdet.modeling.bbox_utils import delta2bbox_v2 - -__all__ = ['YOLOFHead'] - -INF = 1e8 - - -def reduce_mean(tensor): - world_size = paddle.distributed.get_world_size() - if world_size == 1: - return tensor - paddle.distributed.all_reduce(tensor) - return tensor / world_size - - -def find_inside_anchor(feat_size, stride, num_anchors, im_shape): - feat_h, feat_w = feat_size[:2] - im_h, im_w = im_shape[:2] - inside_h = min(int(np.ceil(im_h / stride)), feat_h) - inside_w = min(int(np.ceil(im_w / stride)), feat_w) - inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool) - inside_mask[:inside_h, :inside_w] = True - inside_mask = inside_mask.unsqueeze(-1).expand( - [feat_h, feat_w, num_anchors]) - return inside_mask.reshape([-1]) - - -@register -class YOLOFFeat(nn.Layer): - def __init__(self, - feat_in=256, - feat_out=256, - num_cls_convs=2, - num_reg_convs=4, - norm_type='bn'): - super(YOLOFFeat, self).__init__() - assert norm_type == 'bn', "YOLOFFeat only support BN now." - self.feat_in = feat_in - self.feat_out = feat_out - self.num_cls_convs = num_cls_convs - self.num_reg_convs = num_reg_convs - self.norm_type = norm_type - - cls_subnet, reg_subnet = [], [] - for i in range(self.num_cls_convs): - feat_in = self.feat_in if i == 0 else self.feat_out - cls_subnet.append( - nn.Conv2D( - feat_in, - self.feat_out, - 3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0.0)))) - cls_subnet.append( - nn.BatchNorm2D( - self.feat_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - cls_subnet.append(nn.ReLU()) - - for i in range(self.num_reg_convs): - feat_in = self.feat_in if i == 0 else self.feat_out - reg_subnet.append( - nn.Conv2D( - feat_in, - self.feat_out, - 3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0.0)))) - reg_subnet.append( - nn.BatchNorm2D( - self.feat_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - reg_subnet.append(nn.ReLU()) - - self.cls_subnet = nn.Sequential(*cls_subnet) - self.reg_subnet = nn.Sequential(*reg_subnet) - - def forward(self, fpn_feat): - cls_feat = self.cls_subnet(fpn_feat) - reg_feat = self.reg_subnet(fpn_feat) - return cls_feat, reg_feat - - -@register -class YOLOFHead(nn.Layer): - __shared__ = ['num_classes', 'trt', 'exclude_nms'] - __inject__ = [ - 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', - 'loss_bbox', 'nms' - ] - - def __init__(self, - num_classes=80, - conv_feat='YOLOFFeat', - anchor_generator='AnchorGenerator', - bbox_assigner='UniformAssigner', - loss_class='FocalLoss', - loss_bbox='GIoULoss', - 
ctr_clip=32.0, - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[1.0, 1.0, 1.0, 1.0], - nms='MultiClassNMS', - prior_prob=0.01, - nms_pre=1000, - use_inside_anchor=False, - trt=False, - exclude_nms=False): - super(YOLOFHead, self).__init__() - self.num_classes = num_classes - self.conv_feat = conv_feat - self.anchor_generator = anchor_generator - self.na = self.anchor_generator.num_anchors - self.bbox_assigner = bbox_assigner - self.loss_class = loss_class - self.loss_bbox = loss_bbox - self.ctr_clip = ctr_clip - self.delta_mean = delta_mean - self.delta_std = delta_std - self.nms = nms - self.nms_pre = nms_pre - self.use_inside_anchor = use_inside_anchor - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.exclude_nms = exclude_nms - - bias_init_value = -math.log((1 - prior_prob) / prior_prob) - self.cls_score = self.add_sublayer( - 'cls_score', - nn.Conv2D( - in_channels=conv_feat.feat_out, - out_channels=self.num_classes * self.na, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant( - value=bias_init_value)))) - - self.bbox_pred = self.add_sublayer( - 'bbox_pred', - nn.Conv2D( - in_channels=conv_feat.feat_out, - out_channels=4 * self.na, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.object_pred = self.add_sublayer( - 'object_pred', - nn.Conv2D( - in_channels=conv_feat.feat_out, - out_channels=self.na, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - def forward(self, feats, targets=None): - assert len(feats) == 1, "YOLOF only has one level feature." 
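The assert reflects YOLOF's single-level design: one feature map from the dilated encoder replaces an FPN. The forward body that follows fuses the per-anchor classification logits with an objectness logit in log space, so that `sigmoid(fused)` equals `sigmoid(cls) * sigmoid(obj)` without either `exp()` overflowing. A small sketch with made-up logits (not repository code):

```python
import paddle
import paddle.nn.functional as F

INF = 1e8
cls_logit = paddle.to_tensor([2.0, -1.0, 0.5])  # toy values, assumed
obj_logit = paddle.to_tensor([1.0, 3.0, -0.5])

fused = cls_logit + obj_logit - paddle.log(
    1.0 + paddle.clip(cls_logit.exp(), max=INF)
    + paddle.clip(obj_logit.exp(), max=INF))

# Both lines print the same probabilities:
print(F.sigmoid(fused))
print(F.sigmoid(cls_logit) * F.sigmoid(obj_logit))
```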
- conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0]) - cls_logits = self.cls_score(conv_cls_feat) - objectness = self.object_pred(conv_reg_feat) - bboxes_reg = self.bbox_pred(conv_reg_feat) - - N, C, H, W = paddle.shape(cls_logits)[:] - cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W)) - objectness = objectness.reshape((N, self.na, 1, H, W)) - norm_cls_logits = cls_logits + objectness - paddle.log( - 1.0 + paddle.clip( - cls_logits.exp(), max=INF) + paddle.clip( - objectness.exp(), max=INF)) - norm_cls_logits = norm_cls_logits.reshape((N, C, H, W)) - - anchors = self.anchor_generator([norm_cls_logits]) - - if self.training: - yolof_losses = self.get_loss( - [anchors[0], norm_cls_logits, bboxes_reg], targets) - return yolof_losses - else: - return anchors[0], norm_cls_logits, bboxes_reg - - def get_loss(self, head_outs, targets): - anchors, cls_logits, bbox_preds = head_outs - - feat_size = cls_logits.shape[-2:] - cls_logits = cls_logits.transpose([0, 2, 3, 1]) - cls_logits = cls_logits.reshape([0, -1, self.num_classes]) - bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) - bbox_preds = bbox_preds.reshape([0, -1, 4]) - - num_pos_list = [] - cls_pred_list, cls_tar_list = [], [] - reg_pred_list, reg_tar_list = [], [] - # find and gather preds and targets in each image - for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip( - cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'], - targets['im_shape']): - if self.use_inside_anchor: - inside_mask = find_inside_anchor( - feat_size, self.anchor_generator.strides[0], self.na, - im_shape.tolist()) - cls_logit = cls_logit[inside_mask] - bbox_pred = bbox_pred[inside_mask] - anchors = anchors[inside_mask] - - bbox_pred = delta2bbox_v2( - bbox_pred, - anchors, - self.delta_mean, - self.delta_std, - ctr_clip=self.ctr_clip) - bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) - - # -2:ignore, -1:neg, >=0:pos - match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner( - bbox_pred, anchors, gt_bbox) - pos_mask = (match_labels >= 0) - neg_mask = (match_labels == -1) - chosen_mask = paddle.logical_or(pos_mask, neg_mask) - - gt_class = gt_class.reshape([-1]) - bg_class = paddle.to_tensor( - [self.num_classes], dtype=gt_class.dtype) - # a trick to assign num_classes to negative targets - gt_class = paddle.concat([gt_class, bg_class], axis=-1) - match_labels = paddle.where( - neg_mask, - paddle.full_like(match_labels, gt_class.size - 1), match_labels) - num_pos_list.append(max(1.0, pos_mask.sum().item())) - - cls_pred_list.append(cls_logit[chosen_mask]) - cls_tar_list.append(gt_class[match_labels[chosen_mask]]) - reg_pred_list.append(pos_bbox_pred) - reg_tar_list.append(pos_bbox_tar) - - num_tot_pos = paddle.to_tensor(sum(num_pos_list)) - num_tot_pos = reduce_mean(num_tot_pos).item() - num_tot_pos = max(1.0, num_tot_pos) - - cls_pred = paddle.concat(cls_pred_list) - cls_tar = paddle.concat(cls_tar_list) - cls_loss = self.loss_class( - cls_pred, cls_tar, reduction='sum') / num_tot_pos - - reg_pred_list = [_ for _ in reg_pred_list if _ is not None] - reg_tar_list = [_ for _ in reg_tar_list if _ is not None] - if len(reg_pred_list) == 0: - reg_loss = bbox_preds.sum() * 0.0 - else: - reg_pred = paddle.concat(reg_pred_list) - reg_tar = paddle.concat(reg_tar_list) - reg_loss = self.loss_bbox(reg_pred, reg_tar).sum() / num_tot_pos - - yolof_losses = { - 'loss': cls_loss + reg_loss, - 'loss_cls': cls_loss, - 'loss_reg': reg_loss, - } - return yolof_losses - - def get_bboxes_single(self, - anchors, - cls_scores, - 
bbox_preds, - im_shape, - scale_factor, - rescale=True): - assert len(cls_scores) == len(bbox_preds) - mlvl_bboxes = [] - mlvl_scores = [] - for anchor, cls_score, bbox_pred in zip(anchors, cls_scores, - bbox_preds): - cls_score = cls_score.reshape([-1, self.num_classes]) - bbox_pred = bbox_pred.reshape([-1, 4]) - if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: - max_score = cls_score.max(axis=1) - _, topk_inds = max_score.topk(self.nms_pre) - bbox_pred = bbox_pred.gather(topk_inds) - anchor = anchor.gather(topk_inds) - cls_score = cls_score.gather(topk_inds) - - bbox_pred = delta2bbox_v2( - bbox_pred, - anchor, - self.delta_mean, - self.delta_std, - max_shape=im_shape, - ctr_clip=self.ctr_clip).squeeze() - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(F.sigmoid(cls_score)) - mlvl_bboxes = paddle.concat(mlvl_bboxes) - mlvl_bboxes = paddle.squeeze(mlvl_bboxes) - if rescale: - mlvl_bboxes = mlvl_bboxes / paddle.concat( - [scale_factor[::-1], scale_factor[::-1]]) - mlvl_scores = paddle.concat(mlvl_scores) - mlvl_scores = mlvl_scores.transpose([1, 0]) - return mlvl_bboxes, mlvl_scores - - def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor): - batch_bboxes = [] - batch_scores = [] - for img_id in range(cls_scores[0].shape[0]): - num_lvls = len(cls_scores) - cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)] - bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)] - bboxes, scores = self.get_bboxes_single( - anchors, cls_score_list, bbox_pred_list, im_shape[img_id], - scale_factor[img_id]) - batch_bboxes.append(bboxes) - batch_scores.append(scores) - batch_bboxes = paddle.stack(batch_bboxes, 0) - batch_scores = paddle.stack(batch_scores, 0) - return batch_bboxes, batch_scores - - def post_process(self, head_outs, im_shape, scale_factor): - anchors, cls_scores, bbox_preds = head_outs - cls_scores = cls_scores.transpose([0, 2, 3, 1]) - bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) - pred_bboxes, pred_scores = self.decode( - [anchors], [cls_scores], [bbox_preds], im_shape, scale_factor) - - if self.exclude_nms: - # `exclude_nms=True` just use in benchmark - return pred_bboxes.sum(), pred_scores.sum() - else: - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/initializer.py b/pdfdet/models/Paddle/ppdet/modeling/initializer.py deleted file mode 100644 index 308c51b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/initializer.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py -Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
-""" - -import math -import numpy as np - -import paddle -import paddle.nn as nn - -__all__ = [ - 'uniform_', - 'normal_', - 'constant_', - 'ones_', - 'zeros_', - 'xavier_uniform_', - 'xavier_normal_', - 'kaiming_uniform_', - 'kaiming_normal_', - 'linear_init_', - 'conv_init_', - 'reset_initialized_parameter', -] - - -def _no_grad_uniform_(tensor, a, b): - with paddle.no_grad(): - tensor.set_value( - paddle.uniform( - shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) - return tensor - - -def _no_grad_normal_(tensor, mean=0., std=1.): - with paddle.no_grad(): - tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) - return tensor - - -def _no_grad_fill_(tensor, value=0.): - with paddle.no_grad(): - tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) - return tensor - - -def uniform_(tensor, a, b): - """ - Modified tensor inspace using uniform_ - Args: - tensor (paddle.Tensor): paddle Tensor - a (float|int): min value. - b (float|int): max value. - Return: - tensor - """ - return _no_grad_uniform_(tensor, a, b) - - -def normal_(tensor, mean=0., std=1.): - """ - Modified tensor inspace using normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mean (float|int): mean value. - std (float|int): std value. - Return: - tensor - """ - return _no_grad_normal_(tensor, mean, std) - - -def constant_(tensor, value=0.): - """ - Modified tensor inspace using constant_ - Args: - tensor (paddle.Tensor): paddle Tensor - value (float|int): value to fill tensor. - Return: - tensor - """ - return _no_grad_fill_(tensor, value) - - -def ones_(tensor): - """ - Modified tensor inspace using ones_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 1) - - -def zeros_(tensor): - """ - Modified tensor inspace using zeros_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 0) - - -def vector_(tensor, vector): - with paddle.no_grad(): - tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) - return tensor - - -def _calculate_fan_in_and_fan_out(tensor, reverse=False): - """ - Calculate (fan_in, _fan_out) for tensor - - Args: - tensor (Tensor): paddle.Tensor - reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True - - Return: - Tuple[fan_in, fan_out] - """ - if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) - - if reverse: - num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] - else: - num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] - - receptive_field_size = 1 - if tensor.ndim > 2: - receptive_field_size = np.prod(tensor.shape[2:]) - - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - -def xavier_uniform_(tensor, gain=1., reverse=False): - """ - Modified tensor inspace using xavier_uniform_ - Args: - tensor (paddle.Tensor): paddle Tensor - gain (float): super parameter, 1. default. - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
- Return: - tensor - """ - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - k = math.sqrt(3.0) * std - return _no_grad_uniform_(tensor, -k, k) - - -def xavier_normal_(tensor, gain=1., reverse=False): - """ - Modified tensor inspace using xavier_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - gain (float): super parameter, 1. default. - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - return _no_grad_normal_(tensor, 0, std) - - -# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html -def _calculate_correct_fan(tensor, mode, reverse=False): - mode = mode.lower() - valid_modes = ['fan_in', 'fan_out'] - if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) - - return fan_in if mode == 'fan_in' else fan_out - - -def _calculate_gain(nonlinearity, param=None): - linear_fns = [ - 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', - 'conv_transpose2d', 'conv_transpose3d' - ] - if nonlinearity in linear_fns or nonlinearity == 'sigmoid': - return 1 - elif nonlinearity == 'tanh': - return 5.0 / 3 - elif nonlinearity == 'relu': - return math.sqrt(2.0) - elif nonlinearity == 'leaky_relu': - if param is None: - negative_slope = 0.01 - elif not isinstance(param, bool) and isinstance( - param, int) or isinstance(param, float): - # True/False are instances of int, hence check above - negative_slope = param - else: - raise ValueError("negative_slope {} not a valid number".format( - param)) - return math.sqrt(2.0 / (1 + negative_slope**2)) - elif nonlinearity == 'selu': - return 3.0 / 4 - else: - raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - - -def kaiming_uniform_(tensor, - a=0, - mode='fan_in', - nonlinearity='leaky_relu', - reverse=False): - """ - Modified tensor inspace using kaiming_uniform method - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - k = math.sqrt(3.0) * std - return _no_grad_uniform_(tensor, -k, k) - - -def kaiming_normal_(tensor, - a=0, - mode='fan_in', - nonlinearity='leaky_relu', - reverse=False): - """ - Modified tensor inspace using kaiming_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
- Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - return _no_grad_normal_(tensor, 0, std) - - -def linear_init_(module): - bound = 1 / math.sqrt(module.weight.shape[0]) - uniform_(module.weight, -bound, bound) - if hasattr(module, "bias") and module.bias is not None: - uniform_(module.bias, -bound, bound) - - -def conv_init_(module): - bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) - uniform_(module.weight, -bound, bound) - if module.bias is not None: - uniform_(module.bias, -bound, bound) - - -def bias_init_with_prob(prior_prob=0.01): - """initialize conv/fc bias value according to a given probability value.""" - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - return bias_init - - -@paddle.no_grad() -def reset_initialized_parameter(model, include_self=True): - """ - Reset initialized parameter using following method for [conv, linear, embedding, bn] - - Args: - model (paddle.Layer): paddle Layer - include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself - Return: - None - """ - for _, m in model.named_sublayers(include_self=include_self): - if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * - m._kernel_size[1]) - k = math.sqrt(k) - _no_grad_uniform_(m.weight, -k, k) - if hasattr(m, 'bias') and getattr(m, 'bias') is not None: - _no_grad_uniform_(m.bias, -k, k) - - elif isinstance(m, nn.Linear): - k = math.sqrt(1. / m.weight.shape[0]) - _no_grad_uniform_(m.weight, -k, k) - if hasattr(m, 'bias') and getattr(m, 'bias') is not None: - _no_grad_uniform_(m.bias, -k, k) - - elif isinstance(m, nn.Embedding): - _no_grad_normal_(m.weight, mean=0., std=1.) - - elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): - _no_grad_fill_(m.weight, 1.) - if hasattr(m, 'bias') and getattr(m, 'bias') is not None: - _no_grad_fill_(m.bias, 0) diff --git a/pdfdet/models/Paddle/ppdet/modeling/keypoint_utils.py b/pdfdet/models/Paddle/ppdet/modeling/keypoint_utils.py deleted file mode 100644 index 382e373..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/keypoint_utils.py +++ /dev/null @@ -1,551 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is based on https://github.com/open-mmlab/mmpose -""" - -import cv2 -import numpy as np -import paddle.nn.functional as F - - -def get_affine_mat_kernel(h, w, s, inv=False): - if w < h: - w_ = s - h_ = int(np.ceil((s / w * h) / 64.) * 64) - scale_w = w - scale_h = h_ / w_ * w - - else: - h_ = s - w_ = int(np.ceil((s / h * w) / 64.) 
* 64) - scale_h = h - scale_w = w_ / h_ * h - - center = np.array([np.round(w / 2.), np.round(h / 2.)]) - - size_resized = (w_, h_) - trans = get_affine_transform( - center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) - - return trans, size_resized - - -def get_affine_transform(center, - input_size, - rot, - output_size, - shift=(0., 0.), - inv=False): - """Get the affine transform matrix, given the center/scale/rot/output_size. - - Args: - center (np.ndarray[2, ]): Center of the bounding box (x, y). - input_size (np.ndarray[2, ]): Size of input feature (width, height). - rot (float): Rotation angle (degree). - output_size (np.ndarray[2, ]): Size of the destination heatmaps. - shift (0-100%): Shift translation ratio wrt the width/height. - Default (0., 0.). - inv (bool): Option to inverse the affine transform direction. - (inv=False: src->dst or inv=True: dst->src) - - Returns: - np.ndarray: The transform matrix. - """ - assert len(center) == 2 - assert len(output_size) == 2 - assert len(shift) == 2 - - if not isinstance(input_size, (np.ndarray, list)): - input_size = np.array([input_size, input_size], dtype=np.float32) - scale_tmp = input_size - - shift = np.array(shift) - src_w = scale_tmp[0] - dst_w = output_size[0] - dst_h = output_size[1] - - rot_rad = np.pi * rot / 180 - src_dir = rotate_point([0., src_w * -0.5], rot_rad) - dst_dir = np.array([0., dst_w * -0.5]) - - src = np.zeros((3, 2), dtype=np.float32) - - src[0, :] = center + scale_tmp * shift - src[1, :] = center + src_dir + scale_tmp * shift - src[2, :] = _get_3rd_point(src[0, :], src[1, :]) - - dst = np.zeros((3, 2), dtype=np.float32) - dst[0, :] = [dst_w * 0.5, dst_h * 0.5] - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir - dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) - - if inv: - trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) - else: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - - return trans - - -def get_warp_matrix(theta, size_input, size_dst, size_target): - """This code is based on - https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py - - Calculate the transformation matrix under the constraint of unbiased. - Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased - Data Processing for Human Pose Estimation (CVPR 2020). - - Args: - theta (float): Rotation angle in degrees. - size_input (np.ndarray): Size of input image [w, h]. - size_dst (np.ndarray): Size of output image [w, h]. - size_target (np.ndarray): Size of ROI in input plane [w, h]. - - Returns: - matrix (np.ndarray): A matrix for transformation. - """ - theta = np.deg2rad(theta) - matrix = np.zeros((2, 3), dtype=np.float32) - scale_x = size_dst[0] / size_target[0] - scale_y = size_dst[1] / size_target[1] - matrix[0, 0] = np.cos(theta) * scale_x - matrix[0, 1] = -np.sin(theta) * scale_x - matrix[0, 2] = scale_x * ( - -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * - np.sin(theta) + 0.5 * size_target[0]) - matrix[1, 0] = np.sin(theta) * scale_y - matrix[1, 1] = np.cos(theta) * scale_y - matrix[1, 2] = scale_y * ( - -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * - np.cos(theta) + 0.5 * size_target[1]) - return matrix - - -def _get_3rd_point(a, b): - """To calculate the affine matrix, three pairs of points are required. This - function is used to get the 3rd point, given 2D points a & b. - - The 3rd point is defined by rotating vector `a - b` by 90 degrees - anticlockwise, using b as the rotation center. 
- - Args: - a (np.ndarray): point(x,y) - b (np.ndarray): point(x,y) - - Returns: - np.ndarray: The 3rd point. - """ - assert len( - a) == 2, 'input of _get_3rd_point should be point with length of 2' - assert len( - b) == 2, 'input of _get_3rd_point should be point with length of 2' - direction = a - b - third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) - - return third_pt - - -def rotate_point(pt, angle_rad): - """Rotate a point by an angle. - - Args: - pt (list[float]): 2 dimensional point to be rotated - angle_rad (float): rotation angle by radian - - Returns: - list[float]: Rotated point. - """ - assert len(pt) == 2 - sn, cs = np.sin(angle_rad), np.cos(angle_rad) - new_x = pt[0] * cs - pt[1] * sn - new_y = pt[0] * sn + pt[1] * cs - rotated_pt = [new_x, new_y] - - return rotated_pt - - -def transpred(kpts, h, w, s): - trans, _ = get_affine_mat_kernel(h, w, s, inv=True) - - return warp_affine_joints(kpts[..., :2].copy(), trans) - - -def warp_affine_joints(joints, mat): - """Apply affine transformation defined by the transform matrix on the - joints. - - Args: - joints (np.ndarray[..., 2]): Origin coordinate of joints. - mat (np.ndarray[3, 2]): The affine matrix. - - Returns: - matrix (np.ndarray[..., 2]): Result coordinate of joints. - """ - joints = np.array(joints) - shape = joints.shape - joints = joints.reshape(-1, 2) - return np.dot(np.concatenate( - (joints, joints[:, 0:1] * 0 + 1), axis=1), - mat.T).reshape(shape) - - -def affine_transform(pt, t): - new_pt = np.array([pt[0], pt[1], 1.]).T - new_pt = np.dot(t, new_pt) - return new_pt[:2] - - -def transform_preds(coords, center, scale, output_size): - target_coords = np.zeros(coords.shape) - trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) - for p in range(coords.shape[0]): - target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) - return target_coords - - -def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): - if not isinstance(sigmas, np.ndarray): - sigmas = np.array([ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, - .87, .87, .89, .89 - ]) / 10.0 - vars = (sigmas * 2)**2 - xg = g[0::3] - yg = g[1::3] - vg = g[2::3] - ious = np.zeros((d.shape[0])) - for n_d in range(0, d.shape[0]): - xd = d[n_d, 0::3] - yd = d[n_d, 1::3] - vd = d[n_d, 2::3] - dx = xd - xg - dy = yd - yg - e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 - if in_vis_thre is not None: - ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) - e = e[ind] - ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 - return ious - - -def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): - """greedily select boxes with high confidence and overlap with current maximum <= thresh - rule out overlap >= thresh - - Args: - kpts_db (list): The predicted keypoints within the image - thresh (float): The threshold to select the boxes - sigmas (np.array): The variance to calculate the oks iou - Default: None - in_vis_thre (float): The threshold to select the high confidence boxes - Default: None - - Return: - keep (list): indexes to keep - """ - - if len(kpts_db) == 0: - return [] - - scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) - kpts = np.array( - [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) - areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) - - order = scores.argsort()[::-1] - - keep = [] - while order.size > 0: - i = order[0] - keep.append(i) - - oks_ovr = oks_iou(kpts[i], 
kpts[order[1:]], areas[i], areas[order[1:]],
-                          sigmas, in_vis_thre)
-
-        inds = np.where(oks_ovr <= thresh)[0]
-        order = order[inds + 1]
-
-    return keep
-
-
-def rescore(overlap, scores, thresh, type='gaussian'):
-    assert overlap.shape[0] == scores.shape[0]
-    if type == 'linear':
-        inds = np.where(overlap >= thresh)[0]
-        scores[inds] = scores[inds] * (1 - overlap[inds])
-    else:
-        scores = scores * np.exp(-overlap**2 / thresh)
-
-    return scores
-
-
-def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
-    """Greedily select keypoint instances with high confidence whose OKS
-    overlap with the current maximum is <= thresh; rule out overlap >= thresh.
-
-    Args:
-        kpts_db (list): The predicted keypoints within the image
-        thresh (float): The threshold to select the boxes
-        sigmas (np.array): The variance to calculate the oks iou
-            Default: None
-        in_vis_thre (float): The threshold to select the high confidence boxes
-            Default: None
-
-    Return:
-        keep (list): indexes to keep
-    """
-
-    if len(kpts_db) == 0:
-        return []
-
-    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
-    kpts = np.array(
-        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
-    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
-
-    order = scores.argsort()[::-1]
-    scores = scores[order]
-
-    # max_dets = order.size
-    max_dets = 20
-    keep = np.zeros(max_dets, dtype=np.intp)
-    keep_cnt = 0
-    while order.size > 0 and keep_cnt < max_dets:
-        i = order[0]
-
-        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
-                          sigmas, in_vis_thre)
-
-        order = order[1:]
-        scores = rescore(oks_ovr, scores[1:], thresh)
-
-        tmp = scores.argsort()[::-1]
-        order = order[tmp]
-        scores = scores[tmp]
-
-        keep[keep_cnt] = i
-        keep_cnt += 1
-
-    keep = keep[:keep_cnt]
-
-    return keep
-
-
-def resize(input,
-           size=None,
-           scale_factor=None,
-           mode='nearest',
-           align_corners=None,
-           warning=True):
-    if warning:
-        if size is not None and align_corners:
-            import warnings  # local import: the module header does not import it
-            input_h, input_w = tuple(int(x) for x in input.shape[2:])
-            output_h, output_w = tuple(int(x) for x in size)
-            if output_h > input_h or output_w > input_w:
-                if ((output_h > 1 and output_w > 1 and input_h > 1 and
-                     input_w > 1) and (output_h - 1) % (input_h - 1) and
-                        (output_w - 1) % (input_w - 1)):
-                    warnings.warn(
-                        f'When align_corners={align_corners}, '
-                        'the output would be more aligned if '
-                        f'input size {(input_h, input_w)} is `x+1` and '
-                        f'out size {(output_h, output_w)} is `nx+1`')
-
-    return F.interpolate(input, size, scale_factor, mode, align_corners)
-
-
-def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
-    """Flip the flipped heatmaps back to the original form.
-    Note:
-        - batch_size: N
-        - num_keypoints: K
-        - heatmap height: H
-        - heatmap width: W
-    Args:
-        output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
-            from the flipped images.
-        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
-            (for example, left ear -- right ear).
-        target_type (str): GaussianHeatmap or CombinedTarget
-    Returns:
-        np.ndarray: heatmaps that flipped back to the original image
-    """
-    assert len(output_flipped.shape) == 4, \
-        'output_flipped should be [batch_size, num_keypoints, height, width]'
-    shape_ori = output_flipped.shape
-    channels = 1
-    if target_type.lower() == 'CombinedTarget'.lower():
-        channels = 3
-        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
- output_flipped = output_flipped.reshape((shape_ori[0], -1, channels, - shape_ori[2], shape_ori[3])) - output_flipped_back = output_flipped.clone() - - # Swap left-right parts - for left, right in flip_pairs: - output_flipped_back[:, left, ...] = output_flipped[:, right, ...] - output_flipped_back[:, right, ...] = output_flipped[:, left, ...] - output_flipped_back = output_flipped_back.reshape(shape_ori) - # Flip horizontally - output_flipped_back = output_flipped_back[..., ::-1] - return output_flipped_back - - -def _calc_distances(preds, targets, mask, normalize): - """Calculate the normalized distances between preds and target. - - Note: - batch_size: N - num_keypoints: K - dimension of keypoints: D (normally, D=2 or D=3) - - Args: - preds (np.ndarray[N, K, D]): Predicted keypoint location. - targets (np.ndarray[N, K, D]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - normalize (np.ndarray[N, D]): Typical value is heatmap_size - - Returns: - np.ndarray[K, N]: The normalized distances. \ - If target keypoints are missing, the distance is -1. - """ - N, K, _ = preds.shape - # set mask=0 when normalize==0 - _mask = mask.copy() - _mask[np.where((normalize == 0).sum(1))[0], :] = False - distances = np.full((N, K), -1, dtype=np.float32) - # handle invalid values - normalize[np.where(normalize <= 0)] = 1e6 - distances[_mask] = np.linalg.norm( - ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) - return distances.T - - -def _distance_acc(distances, thr=0.5): - """Return the percentage below the distance threshold, while ignoring - distances values with -1. - - Note: - batch_size: N - Args: - distances (np.ndarray[N, ]): The normalized distances. - thr (float): Threshold of the distances. - - Returns: - float: Percentage of distances below the threshold. \ - If all target keypoints are missing, return -1. - """ - distance_valid = distances != -1 - num_distance_valid = distance_valid.sum() - if num_distance_valid > 0: - return (distances[distance_valid] < thr).sum() / num_distance_valid - return -1 - - -def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): - """Calculate the pose accuracy of PCK for each individual keypoint and the - averaged accuracy across all keypoints for coordinates. - - Note: - PCK metric measures accuracy of the localization of the body joints. - The distances between predicted positions and the ground-truth ones - are typically normalized by the bounding box size. - The threshold (thr) of the normalized distance is commonly set - as 0.05, 0.1 or 0.2 etc. - - - batch_size: N - - num_keypoints: K - - Args: - pred (np.ndarray[N, K, 2]): Predicted keypoint location. - gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - thr (float): Threshold of PCK calculation. - normalize (np.ndarray[N, 2]): Normalization factor for H&W. - - Returns: - tuple: A tuple containing keypoint accuracy. - - - acc (np.ndarray[K]): Accuracy of each keypoint. - - avg_acc (float): Averaged accuracy across all keypoints. - - cnt (int): Number of valid keypoints. 
- """ - distances = _calc_distances(pred, gt, mask, normalize) - - acc = np.array([_distance_acc(d, thr) for d in distances]) - valid_acc = acc[acc >= 0] - cnt = len(valid_acc) - avg_acc = valid_acc.mean() if cnt > 0 else 0 - return acc, avg_acc, cnt - - -def keypoint_auc(pred, gt, mask, normalize, num_step=20): - """Calculate the pose accuracy of PCK for each individual keypoint and the - averaged accuracy across all keypoints for coordinates. - - Note: - - batch_size: N - - num_keypoints: K - - Args: - pred (np.ndarray[N, K, 2]): Predicted keypoint location. - gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - normalize (float): Normalization factor. - - Returns: - float: Area under curve. - """ - nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) - x = [1.0 * i / num_step for i in range(num_step)] - y = [] - for thr in x: - _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) - y.append(avg_acc) - - auc = 0 - for i in range(num_step): - auc += 1.0 / num_step * y[i] - return auc - - -def keypoint_epe(pred, gt, mask): - """Calculate the end-point error. - - Note: - - batch_size: N - - num_keypoints: K - - Args: - pred (np.ndarray[N, K, 2]): Predicted keypoint location. - gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - - Returns: - float: Average end-point error. - """ - - normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32) - distances = _calc_distances(pred, gt, mask, normalize) - distance_valid = distances[distances != -1] - return distance_valid.sum() / max(1, len(distance_valid)) diff --git a/pdfdet/models/Paddle/ppdet/modeling/lane_utils.py b/pdfdet/models/Paddle/ppdet/modeling/lane_utils.py deleted file mode 100644 index e3fb45c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/lane_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -import os -import cv2 -import numpy as np -from scipy.interpolate import InterpolatedUnivariateSpline - - -class Lane: - def __init__(self, points=None, invalid_value=-2., metadata=None): - super(Lane, self).__init__() - self.curr_iter = 0 - self.points = points - self.invalid_value = invalid_value - self.function = InterpolatedUnivariateSpline( - points[:, 1], points[:, 0], k=min(3, len(points) - 1)) - self.min_y = points[:, 1].min() - 0.01 - self.max_y = points[:, 1].max() + 0.01 - self.metadata = metadata or {} - - def __repr__(self): - return '[Lane]\n' + str(self.points) + '\n[/Lane]' - - def __call__(self, lane_ys): - lane_xs = self.function(lane_ys) - - lane_xs[(lane_ys < self.min_y) | (lane_ys > self.max_y - )] = self.invalid_value - return lane_xs - - def to_array(self, sample_y_range, img_w, img_h): - self.sample_y = range(sample_y_range[0], sample_y_range[1], - sample_y_range[2]) - sample_y = self.sample_y - img_w, img_h = img_w, img_h - ys = np.array(sample_y) / float(img_h) - xs = self(ys) - valid_mask = (xs >= 0) & (xs < 1) - lane_xs = xs[valid_mask] * img_w - lane_ys = ys[valid_mask] * img_h - lane = np.concatenate( - (lane_xs.reshape(-1, 1), lane_ys.reshape(-1, 1)), axis=1) - return lane - - def __iter__(self): - return self - - def __next__(self): - if self.curr_iter < len(self.points): - self.curr_iter += 1 - return self.points[self.curr_iter - 1] - 
self.curr_iter = 0 - raise StopIteration - - -COLORS = [ - (255, 0, 0), - (0, 255, 0), - (0, 0, 255), - (255, 255, 0), - (255, 0, 255), - (0, 255, 255), - (128, 255, 0), - (255, 128, 0), - (128, 0, 255), - (255, 0, 128), - (0, 128, 255), - (0, 255, 128), - (128, 255, 255), - (255, 128, 255), - (255, 255, 128), - (60, 180, 0), - (180, 60, 0), - (0, 60, 180), - (0, 180, 60), - (60, 0, 180), - (180, 0, 60), - (255, 0, 0), - (0, 255, 0), - (0, 0, 255), - (255, 255, 0), - (255, 0, 255), - (0, 255, 255), - (128, 255, 0), - (255, 128, 0), - (128, 0, 255), -] - - -def imshow_lanes(img, lanes, show=False, out_file=None, width=4): - lanes_xys = [] - for _, lane in enumerate(lanes): - xys = [] - for x, y in lane: - if x <= 0 or y <= 0: - continue - x, y = int(x), int(y) - xys.append((x, y)) - lanes_xys.append(xys) - lanes_xys.sort(key=lambda xys: xys[0][0] if len(xys) > 0 else 0) - - for idx, xys in enumerate(lanes_xys): - for i in range(1, len(xys)): - cv2.line(img, xys[i - 1], xys[i], COLORS[idx], thickness=width) - - if show: - cv2.imshow('view', img) - cv2.waitKey(0) - - if out_file: - if not os.path.exists(os.path.dirname(out_file)): - os.makedirs(os.path.dirname(out_file)) - cv2.imwrite(out_file, img) diff --git a/pdfdet/models/Paddle/ppdet/modeling/layers.py b/pdfdet/models/Paddle/ppdet/modeling/layers.py deleted file mode 100644 index f91b840..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/layers.py +++ /dev/null @@ -1,1348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import six -import numpy as np -from numbers import Integral - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle import to_tensor -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant, XavierUniform -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.bbox_utils import delta2bbox -from . 
import ops -from .initializer import xavier_uniform_, constant_ - -from paddle.vision.ops import DeformConv2D - - -def _to_list(l): - if isinstance(l, (list, tuple)): - return list(l) - return [l] - - -class AlignConv(nn.Layer): - def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): - super(AlignConv, self).__init__() - self.kernel_size = kernel_size - self.align_conv = paddle.vision.ops.DeformConv2D( - in_channels, - out_channels, - kernel_size=self.kernel_size, - padding=(self.kernel_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=None) - - @paddle.no_grad() - def get_offset(self, anchors, featmap_size, stride): - """ - Args: - anchors: [B, L, 5] xc,yc,w,h,angle - featmap_size: (feat_h, feat_w) - stride: 8 - Returns: - - """ - batch = anchors.shape[0] - dtype = anchors.dtype - feat_h, feat_w = featmap_size - pad = (self.kernel_size - 1) // 2 - idx = paddle.arange(-pad, pad + 1, dtype=dtype) - - yy, xx = paddle.meshgrid(idx, idx) - xx = paddle.reshape(xx, [-1]) - yy = paddle.reshape(yy, [-1]) - - # get sampling locations of default conv - xc = paddle.arange(0, feat_w, dtype=dtype) - yc = paddle.arange(0, feat_h, dtype=dtype) - yc, xc = paddle.meshgrid(yc, xc) - - xc = paddle.reshape(xc, [-1, 1]) - yc = paddle.reshape(yc, [-1, 1]) - x_conv = xc + xx - y_conv = yc + yy - - # get sampling locations of anchors - x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1) - x_ctr = x_ctr / stride - y_ctr = y_ctr / stride - w_s = w / stride - h_s = h / stride - cos, sin = paddle.cos(a), paddle.sin(a) - dw, dh = w_s / self.kernel_size, h_s / self.kernel_size - x, y = dw * xx, dh * yy - xr = cos * x - sin * y - yr = sin * x + cos * y - x_anchor, y_anchor = xr + x_ctr, yr + y_ctr - # get offset filed - offset_x = x_anchor - x_conv - offset_y = y_anchor - y_conv - offset = paddle.stack([offset_y, offset_x], axis=-1) - offset = offset.reshape( - [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2]) - offset = offset.transpose([0, 3, 1, 2]) - - return offset - - def forward(self, x, refine_anchors, featmap_size, stride): - batch = paddle.shape(x)[0].numpy() - offset = self.get_offset(refine_anchors, featmap_size, stride) - if self.training: - x = F.relu(self.align_conv(x, offset.detach())) - else: - x = F.relu(self.align_conv(x, offset)) - return x - - -class DeformableConvV2(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - weight_attr=None, - bias_attr=None, - lr_scale=1, - regularizer=None, - skip_quant=False, - dcn_bias_regularizer=L2Decay(0.), - dcn_bias_lr_scale=2.): - super(DeformableConvV2, self).__init__() - self.offset_channel = 2 * kernel_size**2 - self.mask_channel = kernel_size**2 - - if lr_scale == 1 and regularizer is None: - offset_bias_attr = ParamAttr(initializer=Constant(0.)) - else: - offset_bias_attr = ParamAttr( - initializer=Constant(0.), - learning_rate=lr_scale, - regularizer=regularizer) - self.conv_offset = nn.Conv2D( - in_channels, - 3 * kernel_size**2, - kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2, - weight_attr=ParamAttr(initializer=Constant(0.0)), - bias_attr=offset_bias_attr) - if skip_quant: - self.conv_offset.skip_quant = True - - if bias_attr: - # in FCOS-DCN head, specifically need learning_rate and regularizer - dcn_bias_attr = ParamAttr( - initializer=Constant(value=0), - regularizer=dcn_bias_regularizer, - learning_rate=dcn_bias_lr_scale) - else: - # in ResNet backbone, do not need 
bias - dcn_bias_attr = False - self.conv_dcn = DeformConv2D( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2 * dilation, - dilation=dilation, - groups=groups, - weight_attr=weight_attr, - bias_attr=dcn_bias_attr) - - def forward(self, x): - offset_mask = self.conv_offset(x) - offset, mask = paddle.split( - offset_mask, - num_or_sections=[self.offset_channel, self.mask_channel], - axis=1) - mask = F.sigmoid(mask) - y = self.conv_dcn(x, offset, mask=mask) - return y - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride, - groups=1, - norm_type='bn', - norm_decay=0., - norm_groups=32, - use_dcn=False, - bias_on=False, - lr_scale=1., - freeze_norm=False, - initializer=Normal( - mean=0., std=0.01), - skip_quant=False, - dcn_lr_scale=2., - dcn_regularizer=L2Decay(0.)): - super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn', None] - - if bias_on: - bias_attr = ParamAttr( - initializer=Constant(value=0.), learning_rate=lr_scale) - else: - bias_attr = False - - if not use_dcn: - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr( - initializer=initializer, learning_rate=1.), - bias_attr=bias_attr) - if skip_quant: - self.conv.skip_quant = True - else: - # in FCOS-DCN head, specifically need learning_rate and regularizer - self.conv = DeformableConvV2( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr( - initializer=initializer, learning_rate=1.), - bias_attr=True, - lr_scale=dcn_lr_scale, - regularizer=dcn_regularizer, - dcn_bias_regularizer=dcn_regularizer, - dcn_bias_lr_scale=dcn_lr_scale, - skip_quant=skip_quant) - - norm_lr = 0. if freeze_norm else 1. 
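-        # freeze_norm=True gives the norm weight/bias a learning rate of 0. below,
-        # so they are not updated during training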
- param_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay) if norm_decay is not None else None) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay) if norm_decay is not None else None) - if norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D( - ch_out, weight_attr=param_attr, bias_attr=bias_attr) - elif norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=norm_groups, - num_channels=ch_out, - weight_attr=param_attr, - bias_attr=bias_attr) - else: - self.norm = None - - def forward(self, inputs): - out = self.conv(inputs) - if self.norm is not None: - out = self.norm(out) - return out - - -class LiteConv(nn.Layer): - def __init__(self, - in_channels, - out_channels, - stride=1, - with_act=True, - norm_type='sync_bn', - name=None): - super(LiteConv, self).__init__() - self.lite_conv = nn.Sequential() - conv1 = ConvNormLayer( - in_channels, - in_channels, - filter_size=5, - stride=stride, - groups=in_channels, - norm_type=norm_type, - initializer=XavierUniform()) - conv2 = ConvNormLayer( - in_channels, - out_channels, - filter_size=1, - stride=stride, - norm_type=norm_type, - initializer=XavierUniform()) - conv3 = ConvNormLayer( - out_channels, - out_channels, - filter_size=1, - stride=stride, - norm_type=norm_type, - initializer=XavierUniform()) - conv4 = ConvNormLayer( - out_channels, - out_channels, - filter_size=5, - stride=stride, - groups=out_channels, - norm_type=norm_type, - initializer=XavierUniform()) - conv_list = [conv1, conv2, conv3, conv4] - self.lite_conv.add_sublayer('conv1', conv1) - self.lite_conv.add_sublayer('relu6_1', nn.ReLU6()) - self.lite_conv.add_sublayer('conv2', conv2) - if with_act: - self.lite_conv.add_sublayer('relu6_2', nn.ReLU6()) - self.lite_conv.add_sublayer('conv3', conv3) - self.lite_conv.add_sublayer('relu6_3', nn.ReLU6()) - self.lite_conv.add_sublayer('conv4', conv4) - if with_act: - self.lite_conv.add_sublayer('relu6_4', nn.ReLU6()) - - def forward(self, inputs): - out = self.lite_conv(inputs) - return out - - -class DropBlock(nn.Layer): - def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'): - """ - DropBlock layer, see https://arxiv.org/abs/1810.12890 - - Args: - block_size (int): block size - keep_prob (int): keep probability - name (str): layer name - data_format (str): data format, NCHW or NHWC - """ - super(DropBlock, self).__init__() - self.block_size = block_size - self.keep_prob = keep_prob - self.name = name - self.data_format = data_format - - def forward(self, x): - if not self.training or self.keep_prob == 1: - return x - else: - gamma = (1. - self.keep_prob) / (self.block_size**2) - if self.data_format == 'NCHW': - shape = x.shape[2:] - else: - shape = x.shape[1:3] - for s in shape: - gamma *= s / (s - self.block_size + 1) - - matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype) - mask_inv = F.max_pool2d( - matrix, - self.block_size, - stride=1, - padding=self.block_size // 2, - data_format=self.data_format) - mask = 1. 
- mask_inv - mask = mask.astype('float32') - x = x.astype('float32') - y = x * mask * (mask.numel() / mask.sum()) - return y - - -@register -@serializable -class AnchorGeneratorSSD(object): - def __init__(self, - steps=[8, 16, 32, 64, 100, 300], - aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]], - min_ratio=15, - max_ratio=90, - base_size=300, - min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0], - max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0], - offset=0.5, - flip=True, - clip=False, - min_max_aspect_ratios_order=False): - self.steps = steps - self.aspect_ratios = aspect_ratios - self.min_ratio = min_ratio - self.max_ratio = max_ratio - self.base_size = base_size - self.min_sizes = min_sizes - self.max_sizes = max_sizes - self.offset = offset - self.flip = flip - self.clip = clip - self.min_max_aspect_ratios_order = min_max_aspect_ratios_order - - if self.min_sizes == [] and self.max_sizes == []: - num_layer = len(aspect_ratios) - step = int( - math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2 - ))) - for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1, - step): - self.min_sizes.append(self.base_size * ratio / 100.) - self.max_sizes.append(self.base_size * (ratio + step) / 100.) - self.min_sizes = [self.base_size * .10] + self.min_sizes - self.max_sizes = [self.base_size * .20] + self.max_sizes - - self.num_priors = [] - for aspect_ratio, min_size, max_size in zip( - aspect_ratios, self.min_sizes, self.max_sizes): - if isinstance(min_size, (list, tuple)): - self.num_priors.append( - len(_to_list(min_size)) + len(_to_list(max_size))) - else: - self.num_priors.append((len(aspect_ratio) * 2 + 1) * len( - _to_list(min_size)) + len(_to_list(max_size))) - - def __call__(self, inputs, image): - boxes = [] - for input, min_size, max_size, aspect_ratio, step in zip( - inputs, self.min_sizes, self.max_sizes, self.aspect_ratios, - self.steps): - box, _ = ops.prior_box( - input=input, - image=image, - min_sizes=_to_list(min_size), - max_sizes=_to_list(max_size), - aspect_ratios=aspect_ratio, - flip=self.flip, - clip=self.clip, - steps=[step, step], - offset=self.offset, - min_max_aspect_ratios_order=self.min_max_aspect_ratios_order) - boxes.append(paddle.reshape(box, [-1, 4])) - return boxes - - -@register -@serializable -class RCNNBox(object): - __shared__ = ['num_classes', 'export_onnx'] - - def __init__(self, - prior_box_var=[10., 10., 5., 5.], - code_type="decode_center_size", - box_normalized=False, - num_classes=80, - export_onnx=False): - super(RCNNBox, self).__init__() - self.prior_box_var = prior_box_var - self.code_type = code_type - self.box_normalized = box_normalized - self.num_classes = num_classes - self.export_onnx = export_onnx - - def __call__(self, bbox_head_out, rois, im_shape, scale_factor): - bbox_pred = bbox_head_out[0] - cls_prob = bbox_head_out[1] - roi = rois[0] - rois_num = rois[1] - - if self.export_onnx: - onnx_rois_num_per_im = rois_num[0] - origin_shape = paddle.expand(im_shape[0, :], - [onnx_rois_num_per_im, 2]) - - else: - origin_shape_list = [] - if isinstance(roi, list): - batch_size = len(roi) - else: - batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) - - # bbox_pred.shape: [N, C*4] - for idx in range(batch_size): - rois_num_per_im = rois_num[idx] - expand_im_shape = paddle.expand(im_shape[idx, :], - [rois_num_per_im, 2]) - origin_shape_list.append(expand_im_shape) - - origin_shape = paddle.concat(origin_shape_list) - - # bbox_pred.shape: [N, C*4] - # C=num_classes in faster/mask rcnn(bbox_head), C=1 in 
cascade rcnn(cascade_head) - bbox = paddle.concat(roi) - bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) - scores = cls_prob[:, :-1] - - # bbox.shape: [N, C, 4] - # bbox.shape[1] must be equal to scores.shape[1] - total_num = bbox.shape[0] - bbox_dim = bbox.shape[-1] - bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim]) - - origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) - origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) - zeros = paddle.zeros_like(origin_h) - x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) - y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) - x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) - y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) - bbox = paddle.stack([x1, y1, x2, y2], axis=-1) - bboxes = (bbox, rois_num) - return bboxes, scores - - -@register -@serializable -class MultiClassNMS(object): - def __init__(self, - score_threshold=.05, - nms_top_k=-1, - keep_top_k=100, - nms_threshold=.5, - normalized=True, - nms_eta=1.0, - return_index=False, - return_rois_num=True, - trt=False): - super(MultiClassNMS, self).__init__() - self.score_threshold = score_threshold - self.nms_top_k = nms_top_k - self.keep_top_k = keep_top_k - self.nms_threshold = nms_threshold - self.normalized = normalized - self.nms_eta = nms_eta - self.return_index = return_index - self.return_rois_num = return_rois_num - self.trt = trt - - def __call__(self, bboxes, score, background_label=-1): - """ - bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape - [N, M, 4], N is the batch size and M - is the number of bboxes - 2. (List[Tensor]) bboxes and bbox_num, - bboxes have shape of [M, C, 4], C - is the class number and bbox_num means - the number of bboxes of each batch with - shape [N,] - score (Tensor): Predicted scores with shape [N, C, M] or [M, C] - background_label (int): Ignore the background label; For example, RCNN - is num_classes and YOLO is -1. 
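-            Returns (a sketch of the output, following ops.multiclass_nms,
-            which implements the operator):
-                bbox (Tensor): detections with shape [M, 6], each row being
-                    [label, confidence, xmin, ymin, xmax, ymax]
-                bbox_num (Tensor): number of detections kept per image, shape [N]
-                index (Tensor, optional): indices of the kept boxes, returned
-                    only when return_index is True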
- """ - kwargs = self.__dict__.copy() - if isinstance(bboxes, tuple): - bboxes, bbox_num = bboxes - kwargs.update({'rois_num': bbox_num}) - if background_label > -1: - kwargs.update({'background_label': background_label}) - kwargs.pop('trt') - # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt - if self.trt and (int(paddle.version.major) == 0 or - (int(paddle.version.major) >= 2 and - int(paddle.version.minor) >= 3)): - # TODO(wangxinxin08): tricky switch to run nms on tensorrt - kwargs.update({'nms_eta': 1.1}) - bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) - bbox = bbox.reshape([1, -1, 6]) - idx = paddle.nonzero(bbox[..., 0] != -1) - bbox = paddle.gather_nd(bbox, idx) - return bbox, bbox_num, None - else: - return ops.multiclass_nms(bboxes, score, **kwargs) - - -@register -@serializable -class MatrixNMS(object): - __append_doc__ = True - - def __init__(self, - score_threshold=.05, - post_threshold=.05, - nms_top_k=-1, - keep_top_k=100, - use_gaussian=False, - gaussian_sigma=2., - normalized=False, - background_label=0): - super(MatrixNMS, self).__init__() - self.score_threshold = score_threshold - self.post_threshold = post_threshold - self.nms_top_k = nms_top_k - self.keep_top_k = keep_top_k - self.normalized = normalized - self.use_gaussian = use_gaussian - self.gaussian_sigma = gaussian_sigma - self.background_label = background_label - - def __call__(self, bbox, score, *args): - return ops.matrix_nms( - bboxes=bbox, - scores=score, - score_threshold=self.score_threshold, - post_threshold=self.post_threshold, - nms_top_k=self.nms_top_k, - keep_top_k=self.keep_top_k, - use_gaussian=self.use_gaussian, - gaussian_sigma=self.gaussian_sigma, - background_label=self.background_label, - normalized=self.normalized) - - -@register -@serializable -class YOLOBox(object): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - conf_thresh=0.005, - downsample_ratio=32, - clip_bbox=True, - scale_x_y=1.): - self.num_classes = num_classes - self.conf_thresh = conf_thresh - self.downsample_ratio = downsample_ratio - self.clip_bbox = clip_bbox - self.scale_x_y = scale_x_y - - def __call__(self, - yolo_head_out, - anchors, - im_shape, - scale_factor, - var_weight=None): - boxes_list = [] - scores_list = [] - origin_shape = im_shape / scale_factor - origin_shape = paddle.cast(origin_shape, 'int32') - for i, head_out in enumerate(yolo_head_out): - boxes, scores = paddle.vision.ops.yolo_box( - head_out, - origin_shape, - anchors[i], - self.num_classes, - self.conf_thresh, - self.downsample_ratio // 2**i, - self.clip_bbox, - scale_x_y=self.scale_x_y) - boxes_list.append(boxes) - scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) - yolo_boxes = paddle.concat(boxes_list, axis=1) - yolo_scores = paddle.concat(scores_list, axis=2) - return yolo_boxes, yolo_scores - - -@register -@serializable -class SSDBox(object): - def __init__(self, - is_normalized=True, - prior_box_var=[0.1, 0.1, 0.2, 0.2], - use_fuse_decode=False): - self.is_normalized = is_normalized - self.norm_delta = float(not self.is_normalized) - self.prior_box_var = prior_box_var - self.use_fuse_decode = use_fuse_decode - - def __call__(self, - preds, - prior_boxes, - im_shape, - scale_factor, - var_weight=None): - boxes, scores = preds - boxes = paddle.concat(boxes, axis=1) - prior_boxes = paddle.concat(prior_boxes) - if self.use_fuse_decode: - output_boxes = ops.box_coder( - prior_boxes, - self.prior_box_var, - boxes, - code_type="decode_center_size", - 
box_normalized=self.is_normalized) - else: - pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta - pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta - pb_x = prior_boxes[:, 0] + pb_w * 0.5 - pb_y = prior_boxes[:, 1] + pb_h * 0.5 - out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0] - out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1] - out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w - out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h - output_boxes = paddle.stack( - [ - out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2., - out_y + out_h / 2. - ], - axis=-1) - - if self.is_normalized: - h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1) - w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1) - im_shape = paddle.stack([w, h, w, h], axis=-1) - output_boxes *= im_shape - else: - output_boxes[..., -2:] -= 1.0 - output_scores = F.softmax(paddle.concat( - scores, axis=1)).transpose([0, 2, 1]) - - return output_boxes, output_scores - - -@register -class TTFBox(object): - __shared__ = ['down_ratio'] - - def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4): - super(TTFBox, self).__init__() - self.max_per_img = max_per_img - self.score_thresh = score_thresh - self.down_ratio = down_ratio - - def _simple_nms(self, heat, kernel=3): - """ - Use maxpool to filter the max score, get local peaks. - """ - pad = (kernel - 1) // 2 - hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) - keep = paddle.cast(hmax == heat, 'float32') - return heat * keep - - def _topk(self, scores): - """ - Select top k scores and decode to get xy coordinates. - """ - k = self.max_per_img - shape_fm = paddle.shape(scores) - shape_fm.stop_gradient = True - cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] - # batch size is 1 - scores_r = paddle.reshape(scores, [cat, -1]) - topk_scores, topk_inds = paddle.topk(scores_r, k) - topk_ys = topk_inds // width - topk_xs = topk_inds % width - - topk_score_r = paddle.reshape(topk_scores, [-1]) - topk_score, topk_ind = paddle.topk(topk_score_r, k) - k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64') - topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') - - topk_inds = paddle.reshape(topk_inds, [-1]) - topk_ys = paddle.reshape(topk_ys, [-1, 1]) - topk_xs = paddle.reshape(topk_xs, [-1, 1]) - topk_inds = paddle.gather(topk_inds, topk_ind) - topk_ys = paddle.gather(topk_ys, topk_ind) - topk_xs = paddle.gather(topk_xs, topk_ind) - - return topk_score, topk_inds, topk_clses, topk_ys, topk_xs - - def _decode(self, hm, wh, im_shape, scale_factor): - heatmap = F.sigmoid(hm) - heat = self._simple_nms(heatmap) - scores, inds, clses, ys, xs = self._topk(heat) - ys = paddle.cast(ys, 'float32') * self.down_ratio - xs = paddle.cast(xs, 'float32') * self.down_ratio - scores = paddle.tensor.unsqueeze(scores, [1]) - clses = paddle.tensor.unsqueeze(clses, [1]) - - wh_t = paddle.transpose(wh, [0, 2, 3, 1]) - wh = paddle.reshape(wh_t, [-1, paddle.shape(wh_t)[-1]]) - wh = paddle.gather(wh, inds) - - x1 = xs - wh[:, 0:1] - y1 = ys - wh[:, 1:2] - x2 = xs + wh[:, 2:3] - y2 = ys + wh[:, 3:4] - - bboxes = paddle.concat([x1, y1, x2, y2], axis=1) - - scale_y = scale_factor[:, 0:1] - scale_x = scale_factor[:, 1:2] - scale_expand = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], axis=1) - boxes_shape = paddle.shape(bboxes) - boxes_shape.stop_gradient = True - scale_expand = paddle.expand(scale_expand, shape=boxes_shape) - bboxes = paddle.divide(bboxes, scale_expand) - 
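-        # dividing by the tiled scale factors maps the decoded boxes back to
-        # the original image scale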
results = paddle.concat([clses, scores, bboxes], axis=1) - # hack: append result with cls=-1 and score=1. to avoid all scores - # are less than score_thresh which may cause error in gather. - fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]])) - fill_r = paddle.cast(fill_r, results.dtype) - results = paddle.concat([results, fill_r]) - scores = results[:, 1] - valid_ind = paddle.nonzero(scores > self.score_thresh) - results = paddle.gather(results, valid_ind) - return results, paddle.shape(results)[0:1] - - def __call__(self, hm, wh, im_shape, scale_factor): - results = [] - results_num = [] - for i in range(scale_factor.shape[0]): - result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ], - im_shape[i:i + 1, ], - scale_factor[i:i + 1, ]) - results.append(result) - results_num.append(num) - results = paddle.concat(results, axis=0) - results_num = paddle.concat(results_num, axis=0) - return results, results_num - - -@register -@serializable -class JDEBox(object): - __shared__ = ['num_classes'] - - def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32): - self.num_classes = num_classes - self.conf_thresh = conf_thresh - self.downsample_ratio = downsample_ratio - - def generate_anchor(self, nGh, nGw, anchor_wh): - nA = len(anchor_wh) - yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)]) - mesh = paddle.stack( - (xv, yv), axis=0).cast(dtype='float32') # 2 x nGh x nGw - meshs = paddle.tile(mesh, [nA, 1, 1, 1]) - - anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat( - int(nGh), axis=-2).repeat( - int(nGw), axis=-1) - anchor_offset_mesh = paddle.to_tensor( - anchor_offset_mesh.astype(np.float32)) - # nA x 2 x nGh x nGw - - anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1) - anchor_mesh = paddle.transpose(anchor_mesh, - [0, 2, 3, 1]) # (nA x nGh x nGw) x 4 - return anchor_mesh - - def decode_delta(self, delta, fg_anchor_list): - px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ - fg_anchor_list[:, 2], fg_anchor_list[:,3] - dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] - gx = pw * dx + px - gy = ph * dy + py - gw = pw * paddle.exp(dw) - gh = ph * paddle.exp(dh) - gx1 = gx - gw * 0.5 - gy1 = gy - gh * 0.5 - gx2 = gx + gw * 0.5 - gy2 = gy + gh * 0.5 - return paddle.stack([gx1, gy1, gx2, gy2], axis=1) - - def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec): - anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec) - anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) - pred_list = self.decode_delta( - paddle.reshape( - delta_map, shape=[-1, 4]), - paddle.reshape( - anchor_mesh, shape=[-1, 4])) - pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4]) - return pred_map - - def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec): - boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw] - nGh, nGw = boxes_shape[-2], boxes_shape[-1] - nB = 1 # TODO: only support bs=1 now - boxes_list, scores_list = [], [] - for idx in range(nB): - p = paddle.reshape( - head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw]) - p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6] - delta_map = p[:, :, :, :4] - boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec) - # [nA * nGh * nGw, 4] - boxes_list.append(boxes * stride) - - p_conf = paddle.transpose( - p[:, :, :, 4:6], perm=[3, 0, 1, 2]) # [2, nA, nGh, nGw] - p_conf = F.softmax( - p_conf, axis=0)[1, :, :, :].unsqueeze(-1) # [nA, nGh, nGw, 1] - scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1]) - 
scores_list.append(scores) - - boxes_results = paddle.stack(boxes_list) - scores_results = paddle.stack(scores_list) - return boxes_results, scores_results - - def __call__(self, yolo_head_out, anchors): - bbox_pred_list = [] - for i, head_out in enumerate(yolo_head_out): - stride = self.downsample_ratio // 2**i - anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] - anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride - nA = len(anc_w) - boxes, scores = self._postprocessing_by_level(nA, stride, head_out, - anchor_vec) - bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1)) - - yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1) - boxes_idx_over_conf_thr = paddle.nonzero( - yolo_boxes_scores[:, :, -1] > self.conf_thresh) - boxes_idx_over_conf_thr.stop_gradient = True - - return boxes_idx_over_conf_thr, yolo_boxes_scores - - -@register -@serializable -class MaskMatrixNMS(object): - """ - Matrix NMS for multi-class masks. - Args: - update_threshold (float): Updated threshold of categroy score in second time. - pre_nms_top_n (int): Number of total instance to be kept per image before NMS - post_nms_top_n (int): Number of total instance to be kept per image after NMS. - kernel (str): 'linear' or 'gaussian'. - sigma (float): std in gaussian method. - Input: - seg_preds (Variable): shape (n, h, w), segmentation feature maps - seg_masks (Variable): shape (n, h, w), segmentation feature maps - cate_labels (Variable): shape (n), mask labels in descending order - cate_scores (Variable): shape (n), mask scores in descending order - sum_masks (Variable): a float tensor of the sum of seg_masks - Returns: - Variable: cate_scores, tensors of shape (n) - """ - - def __init__(self, - update_threshold=0.05, - pre_nms_top_n=500, - post_nms_top_n=100, - kernel='gaussian', - sigma=2.0): - super(MaskMatrixNMS, self).__init__() - self.update_threshold = update_threshold - self.pre_nms_top_n = pre_nms_top_n - self.post_nms_top_n = post_nms_top_n - self.kernel = kernel - self.sigma = sigma - - def _sort_score(self, scores, top_num): - if paddle.shape(scores)[0] > top_num: - return paddle.topk(scores, top_num)[1] - else: - return paddle.argsort(scores, descending=True) - - def __call__(self, - seg_preds, - seg_masks, - cate_labels, - cate_scores, - sum_masks=None): - # sort and keep top nms_pre - sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n) - seg_masks = paddle.gather(seg_masks, index=sort_inds) - seg_preds = paddle.gather(seg_preds, index=sort_inds) - sum_masks = paddle.gather(sum_masks, index=sort_inds) - cate_scores = paddle.gather(cate_scores, index=sort_inds) - cate_labels = paddle.gather(cate_labels, index=sort_inds) - - seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1) - # inter. - inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0])) - n_samples = paddle.shape(cate_labels) - # union. - sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples]) - # iou. - iou_matrix = (inter_matrix / ( - sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix)) - iou_matrix = paddle.triu(iou_matrix, diagonal=1) - # label_specific matrix. 
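-        # (the decay computed below only acts between masks of the same category)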
- cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples]) - label_matrix = paddle.cast( - (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])), - 'float32') - label_matrix = paddle.triu(label_matrix, diagonal=1) - - # IoU compensation - compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0) - compensate_iou = paddle.expand( - compensate_iou, shape=[n_samples, n_samples]) - compensate_iou = paddle.transpose(compensate_iou, [1, 0]) - - # IoU decay - decay_iou = iou_matrix * label_matrix - - # matrix nms - if self.kernel == 'gaussian': - decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2)) - compensate_matrix = paddle.exp(-1 * self.sigma * - (compensate_iou**2)) - decay_coefficient = paddle.min(decay_matrix / compensate_matrix, - axis=0) - elif self.kernel == 'linear': - decay_matrix = (1 - decay_iou) / (1 - compensate_iou) - decay_coefficient = paddle.min(decay_matrix, axis=0) - else: - raise NotImplementedError - - # update the score. - cate_scores = cate_scores * decay_coefficient - y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32') - keep = paddle.where(cate_scores >= self.update_threshold, cate_scores, - y) - keep = paddle.nonzero(keep) - keep = paddle.squeeze(keep, axis=[1]) - # Prevent empty and increase fake data - keep = paddle.concat( - [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')]) - - seg_preds = paddle.gather(seg_preds, index=keep) - cate_scores = paddle.gather(cate_scores, index=keep) - cate_labels = paddle.gather(cate_labels, index=keep) - - # sort and keep top_k - sort_inds = self._sort_score(cate_scores, self.post_nms_top_n) - seg_preds = paddle.gather(seg_preds, index=sort_inds) - cate_scores = paddle.gather(cate_scores, index=sort_inds) - cate_labels = paddle.gather(cate_labels, index=sort_inds) - return seg_preds, cate_scores, cate_labels - - -def Conv2d(in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - weight_init=Normal(std=0.001), - bias_init=Constant(0.)): - weight_attr = paddle.framework.ParamAttr(initializer=weight_init) - if bias: - bias_attr = paddle.framework.ParamAttr(initializer=bias_init) - else: - bias_attr = False - conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - weight_attr=weight_attr, - bias_attr=bias_attr) - return conv - - -def ConvTranspose2d(in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - output_padding=0, - groups=1, - bias=True, - dilation=1, - weight_init=Normal(std=0.001), - bias_init=Constant(0.)): - weight_attr = paddle.framework.ParamAttr(initializer=weight_init) - if bias: - bias_attr = paddle.framework.ParamAttr(initializer=bias_init) - else: - bias_attr = False - conv = nn.Conv2DTranspose( - in_channels, - out_channels, - kernel_size, - stride, - padding, - output_padding, - dilation, - groups, - weight_attr=weight_attr, - bias_attr=bias_attr) - return conv - - -def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True): - if not affine: - weight_attr = False - bias_attr = False - else: - weight_attr = None - bias_attr = None - batchnorm = nn.BatchNorm2D( - num_features, - momentum, - eps, - weight_attr=weight_attr, - bias_attr=bias_attr) - return batchnorm - - -def ReLU(): - return nn.ReLU() - - -def Upsample(scale_factor=None, mode='nearest', align_corners=False): - return nn.Upsample(None, scale_factor, mode, align_corners) - - -def MaxPool(kernel_size, stride, padding, ceil_mode=False): - return 
nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode) - - -class Concat(nn.Layer): - def __init__(self, dim=0): - super(Concat, self).__init__() - self.dim = dim - - def forward(self, inputs): - return paddle.concat(inputs, axis=self.dim) - - def extra_repr(self): - return 'dim={}'.format(self.dim) - - -def _convert_attention_mask(attn_mask, dtype): - """ - Convert the attention mask to the target dtype we expect. - Parameters: - attn_mask (Tensor, optional): A tensor used in multi-head attention - to prevents attention to some unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when - nothing wanted or needed to be prevented attention to. Default None. - dtype (VarType): The target type of `attn_mask` we expect. - Returns: - Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. - """ - return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) - - -@register -class MultiHeadAttention(nn.Layer): - """ - Attention mapps queries and a set of key-value pairs to outputs, and - Multi-Head Attention performs multiple parallel attention to jointly attending - to information from different representation subspaces. - - Please refer to `Attention Is All You Need `_ - for more details. - - Parameters: - embed_dim (int): The expected feature size in the input and output. - num_heads (int): The number of heads in multi-head attention. - dropout (float, optional): The dropout probability used on attention - weights to drop some attention targets. 0 for no dropout. Default 0 - kdim (int, optional): The feature size in key. If None, assumed equal to - `embed_dim`. Default None. - vdim (int, optional): The feature size in value. If None, assumed equal to - `embed_dim`. Default None. - need_weights (bool, optional): Indicate whether to return the attention - weights. Default False. - - Examples: - - .. 
code-block:: python - - import paddle - - # encoder input: [batch_size, sequence_length, d_model] - query = paddle.rand((2, 4, 128)) - # self attention mask: [batch_size, num_heads, query_len, query_len] - attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) - output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] - """ - - def __init__(self, - embed_dim, - num_heads, - dropout=0., - kdim=None, - vdim=None, - need_weights=False): - super(MultiHeadAttention, self).__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim - - self.num_heads = num_heads - self.dropout = dropout - self.need_weights = need_weights - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - if self._qkv_same_embed_dim: - self.in_proj_weight = self.create_parameter( - shape=[embed_dim, 3 * embed_dim], - attr=None, - dtype=self._dtype, - is_bias=False) - self.in_proj_bias = self.create_parameter( - shape=[3 * embed_dim], - attr=None, - dtype=self._dtype, - is_bias=True) - else: - self.q_proj = nn.Linear(embed_dim, embed_dim) - self.k_proj = nn.Linear(self.kdim, embed_dim) - self.v_proj = nn.Linear(self.vdim, embed_dim) - - self.out_proj = nn.Linear(embed_dim, embed_dim) - self._type_list = ('q_proj', 'k_proj', 'v_proj') - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - else: - constant_(p) - - def compute_qkv(self, tensor, index): - if self._qkv_same_embed_dim: - tensor = F.linear( - x=tensor, - weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) - * self.embed_dim], - bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * - self.embed_dim] - if self.in_proj_bias is not None else None) - else: - tensor = getattr(self, self._type_list[index])(tensor) - tensor = tensor.reshape( - [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) - return tensor - - def forward(self, query, key=None, value=None, attn_mask=None): - r""" - Applies multi-head attention to map queries and a set of key-value pairs - to outputs. - - Parameters: - query (Tensor): The queries for multi-head attention. It is a - tensor with shape `[batch_size, query_length, embed_dim]`. The - data type should be float32 or float64. - key (Tensor, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, key_length, kdim]`. The - data type should be float32 or float64. If None, use `query` as - `key`. Default None. - value (Tensor, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, value_length, vdim]`. - The data type should be float32 or float64. If None, use `query` as - `value`. Default None. - attn_mask (Tensor, optional): A tensor used in multi-head attention - to prevents attention to some unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. 
When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when - nothing wanted or needed to be prevented attention to. Default None. - - Returns: - Tensor|tuple: It is a tensor that has the same shape and data type \ - as `query`, representing attention output. Or a tuple if \ - `need_weights` is True or `cache` is not None. If `need_weights` \ - is True, except for attention output, the tuple also includes \ - the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ - If `cache` is not None, the tuple then includes the new cache \ - having the same type as `cache`, and if it is `StaticCache`, it \ - is same as the input `cache`, if it is `Cache`, the new cache \ - reserves tensors concatanating raw tensors with intermediate \ - results of current query. - """ - key = query if key is None else key - value = query if value is None else value - # compute q ,k ,v - q, k, v = (self.compute_qkv(t, i) - for i, t in enumerate([query, key, value])) - - # scale dot product attention - product = paddle.matmul(x=q, y=k, transpose_y=True) - scaling = float(self.head_dim)**-0.5 - product = product * scaling - - if attn_mask is not None: - # Support bool or int mask - attn_mask = _convert_attention_mask(attn_mask, product.dtype) - product = product + attn_mask - weights = F.softmax(product) - if self.dropout: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") - out = paddle.matmul(weights, v) - - # combine heads - out = paddle.transpose(out, perm=[0, 2, 1, 3]) - out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.out_proj(out) - - outs = [out] - if self.need_weights: - outs.append(weights) - return out if len(outs) == 1 else tuple(outs) - - -@register -class ConvMixer(nn.Layer): - def __init__( - self, - dim, - depth, - kernel_size=3, ): - super().__init__() - self.dim = dim - self.depth = depth - self.kernel_size = kernel_size - - self.mixer = self.conv_mixer(dim, depth, kernel_size) - - def forward(self, x): - return self.mixer(x) - - @staticmethod - def conv_mixer( - dim, - depth, - kernel_size, ): - Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim)) - Residual = type('Residual', (Seq, ), - {'forward': lambda self, x: self[0](x) + x}) - return Seq(* [ - Seq(Residual( - ActBn( - nn.Conv2D( - dim, dim, kernel_size, groups=dim, padding="same"))), - ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth) - ]) diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/losses/__init__.py deleted file mode 100644 index 41b3ae0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/__init__.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import yolo_loss -from . import iou_aware_loss -from . import iou_loss -from . 
import ssd_loss -from . import fcos_loss -from . import solov2_loss -from . import ctfocal_loss -from . import keypoint_loss -from . import jde_loss -from . import fairmot_loss -from . import gfocal_loss -from . import detr_loss -from . import sparsercnn_loss -from . import focal_loss -from . import smooth_l1_loss -from . import probiou_loss -from . import cot_loss -from . import supcontrast -from . import queryinst_loss -from . import clrnet_loss -from . import clrnet_line_iou_loss - -from .yolo_loss import * -from .iou_aware_loss import * -from .iou_loss import * -from .ssd_loss import * -from .fcos_loss import * -from .solov2_loss import * -from .ctfocal_loss import * -from .keypoint_loss import * -from .jde_loss import * -from .fairmot_loss import * -from .gfocal_loss import * -from .detr_loss import * -from .sparsercnn_loss import * -from .focal_loss import * -from .smooth_l1_loss import * -from .pose3d_loss import * -from .probiou_loss import * -from .cot_loss import * -from .supcontrast import * -from .queryinst_loss import * -from .clrnet_loss import * -from .clrnet_line_iou_loss import * \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_line_iou_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_line_iou_loss.py deleted file mode 100644 index 2a1973d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_line_iou_loss.py +++ /dev/null @@ -1,41 +0,0 @@ -import paddle - - -def line_iou(pred, target, img_w, length=15, aligned=True): - ''' - Calculate the line iou value between predictions and targets - Args: - pred: lane predictions, shape: (num_pred, 72) - target: ground truth, shape: (num_target, 72) - img_w: image width - length: extended radius - aligned: True for iou loss calculation, False for pair-wise ious in assign - ''' - px1 = pred - length - px2 = pred + length - tx1 = target - length - tx2 = target + length - - if aligned: - invalid_mask = target - ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1) - union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1) - else: - num_pred = pred.shape[0] - invalid_mask = target.tile([num_pred, 1, 1]) - - ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum( - px1[:, None, :], tx1[None, ...])) - union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) - - paddle.minimum(px1[:, None, :], tx1[None, ...])) - - invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w) - - ovr[invalid_masks] = 0. - union[invalid_masks] = 0. 
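-    # points outside [0, img_w) are invalid: they add nothing to overlap or union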
-    iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)
-    return iou
-
-
-def liou_loss(pred, target, img_w, length=15):
-    return (1 - line_iou(pred, target, img_w, length)).mean()
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_loss.py
deleted file mode 100644
index b4ad39e..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_loss.py
+++ /dev/null
@@ -1,283 +0,0 @@
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from ppdet.core.workspace import register
-from ppdet.modeling.clrnet_utils import accuracy
-from ppdet.modeling.assigners.clrnet_assigner import assign
-from ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss
-
-__all__ = ['CLRNetLoss']
-
-
-class SoftmaxFocalLoss(nn.Layer):
-    def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
-        super(SoftmaxFocalLoss, self).__init__()
-        self.gamma = gamma
-        self.nll = nn.NLLLoss(ignore_index=ignore_lb)
-
-    def forward(self, logits, labels):
-        scores = F.softmax(logits, axis=1)
-        factor = paddle.pow(1. - scores, self.gamma)
-        log_score = F.log_softmax(logits, axis=1)
-        log_score = factor * log_score
-        loss = self.nll(log_score, labels)
-        return loss
-
-
-def focal_loss(input: paddle.Tensor,
-               target: paddle.Tensor,
-               alpha: float,
-               gamma: float=2.0,
-               reduction: str='none',
-               eps: float=1e-8) -> paddle.Tensor:
-    r"""Function that computes Focal loss.
-
-    See :class:`~kornia.losses.FocalLoss` for details.
-    """
-    if not paddle.is_tensor(input):
-        raise TypeError("Input type is not a paddle.Tensor. Got {}".format(
-            type(input)))
-
-    if not len(input.shape) >= 2:
-        raise ValueError("Invalid input shape, we expect BxCx*. Got: {}".format(
-            input.shape))
-
-    if input.shape[0] != target.shape[0]:
-        raise ValueError(
-            'Expected input batch_size ({}) to match target batch_size ({}).'.
-            format(input.shape[0], target.shape[0]))
-
-    n = input.shape[0]
-    out_size = (n, ) + tuple(input.shape[2:])
-    if target.shape[1:] != input.shape[2:]:
-        raise ValueError('Expected target size {}, got {}'.format(out_size,
-                                                                  target.shape))
-    if (isinstance(input.place, paddle.CUDAPlace) and
-            isinstance(target.place, paddle.CPUPlace)) | (isinstance(
-                input.place, paddle.CPUPlace) and isinstance(
-                    target.place, paddle.CUDAPlace)):
-        raise ValueError(
-            "input and target must be in the same device. Got: {} and {}".
-            format(input.place, target.place))
-
-    # compute softmax over the classes axis
-    input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps
-
-    # create the labels one hot tensor
-    target_one_hot: paddle.Tensor = paddle.to_tensor(
-        F.one_hot(
-            target, num_classes=input.shape[1]).cast(input.dtype),
-        place=input.place)
-
-    # compute the actual focal loss
-    weight = paddle.pow(-input_soft + 1., gamma)
-
-    focal = -alpha * weight * paddle.log(input_soft)
-    loss_tmp = paddle.sum(target_one_hot * focal, axis=1)
-
-    if reduction == 'none':
-        loss = loss_tmp
-    elif reduction == 'mean':
-        loss = paddle.mean(loss_tmp)
-    elif reduction == 'sum':
-        loss = paddle.sum(loss_tmp)
-    else:
-        raise NotImplementedError("Invalid reduction mode: {}".format(
-            reduction))
-    return loss
-
-
-class FocalLoss(nn.Layer):
-    r"""Criterion that computes Focal loss.
-
-    According to [1], the Focal loss is computed as follows:
-
-    .. math::
-
-        \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t)
-
-    where:
-       - :math:`p_t` is the model's estimated probability for each class.
-
-
-    Arguments:
-        alpha (float): Weighting factor :math:`\alpha \in [0, 1]`.
- gamma (float): Focusing parameter :math:`\gamma >= 0`. - reduction (str, optional): Specifies the reduction to apply to the - output: ‘none’ | ‘mean’ | ‘sum’. ‘none’: no reduction will be applied, - ‘mean’: the sum of the output will be divided by the number of elements - in the output, ‘sum’: the output will be summed. Default: ‘none’. - - Shape: - - Input: :math:`(N, C, *)` where C = number of classes. - - Target: :math:`(N, *)` where each value is - :math:`0 ≤ targets[i] ≤ C−1`. - - Examples: - >>> N = 5 # num_classes - >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'} - >>> loss = kornia.losses.FocalLoss(**kwargs) - >>> input = torch.randn(1, N, 3, 5, requires_grad=True) - >>> target = torch.empty(1, 3, 5, dtype=torch.long).random_(N) - >>> output = loss(input, target) - >>> output.backward() - - References: - [1] https://arxiv.org/abs/1708.02002 - """ - - def __init__(self, alpha: float, gamma: float=2.0, - reduction: str='none') -> None: - super(FocalLoss, self).__init__() - self.alpha: float = alpha - self.gamma: float = gamma - self.reduction: str = reduction - self.eps: float = 1e-6 - - def forward( # type: ignore - self, input: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor: - return focal_loss(input, target, self.alpha, self.gamma, self.reduction, - self.eps) - - -@register -class CLRNetLoss(nn.Layer): - __shared__ = ['img_w', 'img_h', 'num_classes', 'num_points'] - - def __init__(self, - cls_loss_weight=2.0, - xyt_loss_weight=0.2, - iou_loss_weight=2.0, - seg_loss_weight=1.0, - refine_layers=3, - num_points=72, - img_w=800, - img_h=320, - num_classes=5, - ignore_label=255, - bg_weight=0.4): - super(CLRNetLoss, self).__init__() - self.cls_loss_weight = cls_loss_weight - self.xyt_loss_weight = xyt_loss_weight - self.iou_loss_weight = iou_loss_weight - self.seg_loss_weight = seg_loss_weight - self.refine_layers = refine_layers - self.img_w = img_w - self.img_h = img_h - self.n_strips = num_points - 1 - self.num_classes = num_classes - self.ignore_label = ignore_label - weights = paddle.ones(shape=[self.num_classes]) - weights[0] = bg_weight - self.criterion = nn.NLLLoss( - ignore_index=self.ignore_label, weight=weights) - - def forward(self, output, batch): - predictions_lists = output['predictions_lists'] - targets = batch['lane_line'].clone() - cls_criterion = FocalLoss(alpha=0.25, gamma=2.0) - cls_loss = paddle.to_tensor(0.0) - reg_xytl_loss = paddle.to_tensor(0.0) - iou_loss = paddle.to_tensor(0.0) - cls_acc = [] - cls_acc_stage = [] - for stage in range(self.refine_layers): - predictions_list = predictions_lists[stage] - for predictions, target in zip(predictions_list, targets): - target = target[target[:, 1] == 1] - - if len(target) == 0: - # If there are no targets, all predictions have to be negatives (i.e., 0 confidence) - cls_target = paddle.zeros( - [predictions.shape[0]], dtype='int64') - cls_pred = predictions[:, :2] - cls_loss = cls_loss + cls_criterion(cls_pred, - cls_target).sum() - continue - - with paddle.no_grad(): - matched_row_inds, matched_col_inds = assign( - predictions, target, self.img_w, self.img_h) - - # classification targets - cls_target = paddle.zeros([predictions.shape[0]], dtype='int64') - cls_target[matched_row_inds] = 1 - cls_pred = predictions[:, :2] - - # regression targets -> [start_y, start_x, theta] (all transformed to absolute values), only on matched pairs - reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6] - - reg_yxtl[:, 0] *= self.n_strips - reg_yxtl[:, 1] *= (self.img_w - 1) - reg_yxtl[:, 2] *= 180 
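-                # theta is rescaled from the normalized [0, 1] range to degrees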
- reg_yxtl[:, 3] *= self.n_strips - - target_yxtl = target.index_select(matched_col_inds)[..., 2: - 6].clone() - - # regression targets -> S coordinates (all transformed to absolute values) - reg_pred = predictions.index_select(matched_row_inds)[..., 6:] - reg_pred *= (self.img_w - 1) - reg_targets = target.index_select(matched_col_inds)[..., - 6:].clone() - - with paddle.no_grad(): - predictions_starts = paddle.clip( - (predictions.index_select(matched_row_inds)[..., 2] * - self.n_strips).round().cast("int64"), - min=0, - max=self. - n_strips) # ensure the predictions starts is valid - - target_starts = ( - target.index_select(matched_col_inds)[..., 2] * - self.n_strips).round().cast("int64") - target_yxtl[:, -1] -= ( - predictions_starts - target_starts) # reg length - - # Loss calculation - cls_loss = cls_loss + cls_criterion( - cls_pred, cls_target).sum() / target.shape[0] - - target_yxtl[:, 0] *= self.n_strips - target_yxtl[:, 2] *= 180 - - reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss( - input=reg_yxtl, label=target_yxtl, reduction='none').mean() - - iou_loss = iou_loss + liou_loss( - reg_pred, reg_targets, self.img_w, length=15) - - cls_accuracy = accuracy(cls_pred, cls_target) - cls_acc_stage.append(cls_accuracy) - - cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5)) - - # extra segmentation loss - seg_loss = self.criterion( - F.log_softmax( - output['seg'], axis=1), batch['seg'].cast('int64')) - - cls_loss /= (len(targets) * self.refine_layers) - reg_xytl_loss /= (len(targets) * self.refine_layers) - iou_loss /= (len(targets) * self.refine_layers) - - loss = cls_loss * self.cls_loss_weight \ - + reg_xytl_loss * self.xyt_loss_weight \ - + seg_loss * self.seg_loss_weight \ - + iou_loss * self.iou_loss_weight - - return_value = { - 'loss': loss, - 'cls_loss': cls_loss * self.cls_loss_weight, - 'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight, - 'seg_loss': seg_loss * self.seg_loss_weight, - 'iou_loss': iou_loss * self.iou_loss_weight - } - - for i in range(self.refine_layers): - if not isinstance(cls_acc[i], paddle.Tensor): - cls_acc[i] = paddle.to_tensor(cls_acc[i]) - return_value['stage_{}_acc'.format(i)] = cls_acc[i] - - return return_value diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/cot_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/cot_loss.py deleted file mode 100644 index 40f8f9a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/cot_loss.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import numpy as np -from ppdet.core.workspace import register - -__all__ = ['COTLoss'] - -@register -class COTLoss(nn.Layer): - __shared__ = ['num_classes'] - def __init__(self, - num_classes=80, - cot_scale=1, - cot_lambda=1): - super(COTLoss, self).__init__() - self.cot_scale = cot_scale - self.cot_lambda = cot_lambda - self.num_classes = num_classes - - def forward(self, scores, targets, cot_relation): - cls_name = 'loss_bbox_cls_cot' - loss_bbox = {} - - tgt_labels, tgt_bboxes, tgt_gt_inds = targets - tgt_labels = paddle.concat(tgt_labels) if len( - tgt_labels) > 1 else tgt_labels[0] - mask = (tgt_labels < self.num_classes) - valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() - if valid_inds.shape[0] == 0: - loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') - else: - tgt_labels = tgt_labels.cast('int64') - valid_cot_targets = [] - for i in range(tgt_labels.shape[0]): - train_label = tgt_labels[i] - if train_label < self.num_classes: - valid_cot_targets.append(cot_relation[train_label]) - coco_targets = paddle.to_tensor(valid_cot_targets) - coco_targets.stop_gradient = True - coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale) - loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1)) - return loss_bbox diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/ctfocal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/ctfocal_loss.py deleted file mode 100644 index dd00eb8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/ctfocal_loss.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle - -from ppdet.core.workspace import register, serializable - -__all__ = ['CTFocalLoss'] - - -@register -@serializable -class CTFocalLoss(object): - """ - CTFocalLoss: CornerNet & CenterNet Focal Loss - Args: - loss_weight (float): loss weight - gamma (float): gamma parameter for Focal Loss - """ - - def __init__(self, loss_weight=1., gamma=2.0): - self.loss_weight = loss_weight - self.gamma = gamma - - def __call__(self, pred, target): - """ - Calculate the loss - Args: - pred (Tensor): heatmap prediction - target (Tensor): target for positive samples - Return: - ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet. - Note that the values in target are in [0, 1] since gaussian is - used to reduce the punishment and we treat [0, 1) as neg example. 
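-                Concretely, positives (target == 1) contribute
-                -log(pred) * (1 - pred)^gamma, while the remaining locations
-                contribute -log(1 - pred) * pred^gamma * (1 - target)^4; the
-                sum is normalized by the number of positive samples.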
- """ - fg_map = paddle.cast(target == 1, 'float32') - fg_map.stop_gradient = True - bg_map = paddle.cast(target < 1, 'float32') - bg_map.stop_gradient = True - - neg_weights = paddle.pow(1 - target, 4) - pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred, - self.gamma) * fg_map - - neg_loss = 0 - paddle.log(1 - pred) * paddle.pow( - pred, self.gamma) * neg_weights * bg_map - pos_loss = paddle.sum(pos_loss) - neg_loss = paddle.sum(neg_loss) - - fg_num = paddle.sum(fg_map) - ct_focal_loss = (pos_loss + neg_loss) / ( - fg_num + paddle.cast(fg_num == 0, 'float32')) - return ct_focal_loss * self.loss_weight diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/detr_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/detr_loss.py deleted file mode 100644 index d635337..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/detr_loss.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from .iou_loss import GIoULoss -from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits -from ..bbox_utils import bbox_iou - -__all__ = ['DETRLoss', 'DINOLoss'] - - -@register -class DETRLoss(nn.Layer): - __shared__ = ['num_classes', 'use_focal_loss'] - __inject__ = ['matcher'] - - def __init__(self, - num_classes=80, - matcher='HungarianMatcher', - loss_coeff={ - 'class': 1, - 'bbox': 5, - 'giou': 2, - 'no_object': 0.1, - 'mask': 1, - 'dice': 1 - }, - aux_loss=True, - use_focal_loss=False, - use_vfl=False, - use_uni_match=False, - uni_match_ind=0): - r""" - Args: - num_classes (int): The number of classes. - matcher (HungarianMatcher): It computes an assignment between the targets - and the predictions of the network. - loss_coeff (dict): The coefficient of loss. - aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. - use_focal_loss (bool): Use focal loss or not. 
- """ - super(DETRLoss, self).__init__() - - self.num_classes = num_classes - self.matcher = matcher - self.loss_coeff = loss_coeff - self.aux_loss = aux_loss - self.use_focal_loss = use_focal_loss - self.use_vfl = use_vfl - self.use_uni_match = use_uni_match - self.uni_match_ind = uni_match_ind - - if not self.use_focal_loss: - self.loss_coeff['class'] = paddle.full([num_classes + 1], - loss_coeff['class']) - self.loss_coeff['class'][-1] = loss_coeff['no_object'] - self.giou_loss = GIoULoss() - - def _get_loss_class(self, - logits, - gt_class, - match_indices, - bg_index, - num_gts, - postfix="", - iou_score=None, - gt_score=None): - # logits: [b, query, num_classes], gt_class: list[[n, 1]] - name_class = "loss_class" + postfix - - target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64') - bs, num_query_objects = target_label.shape - num_gt = sum(len(a) for a in gt_class) - if num_gt > 0: - index, updates = self._get_index_updates(num_query_objects, - gt_class, match_indices) - target_label = paddle.scatter( - target_label.reshape([-1, 1]), index, updates.astype('int64')) - target_label = target_label.reshape([bs, num_query_objects]) - if self.use_focal_loss: - target_label = F.one_hot(target_label, - self.num_classes + 1)[..., :-1] - if iou_score is not None and self.use_vfl: - if gt_score is not None: - target_score = paddle.zeros([bs, num_query_objects]) - target_score = paddle.scatter( - target_score.reshape([-1, 1]), index, gt_score) - target_score = target_score.reshape( - [bs, num_query_objects, 1]) * target_label - - target_score_iou = paddle.zeros([bs, num_query_objects]) - target_score_iou = paddle.scatter( - target_score_iou.reshape([-1, 1]), index, iou_score) - target_score_iou = target_score_iou.reshape( - [bs, num_query_objects, 1]) * target_label - target_score = paddle.multiply(target_score, - target_score_iou) - loss_ = self.loss_coeff[ - 'class'] * varifocal_loss_with_logits( - logits, target_score, target_label, - num_gts / num_query_objects) - else: - target_score = paddle.zeros([bs, num_query_objects]) - if num_gt > 0: - target_score = paddle.scatter( - target_score.reshape([-1, 1]), index, iou_score) - target_score = target_score.reshape( - [bs, num_query_objects, 1]) * target_label - loss_ = self.loss_coeff[ - 'class'] * varifocal_loss_with_logits( - logits, target_score, target_label, - num_gts / num_query_objects) - else: - loss_ = self.loss_coeff['class'] * sigmoid_focal_loss( - logits, target_label, num_gts / num_query_objects) - else: - loss_ = F.cross_entropy( - logits, target_label, weight=self.loss_coeff['class']) - return {name_class: loss_} - - def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, - postfix=""): - # boxes: [b, query, 4], gt_bbox: list[[n, 4]] - name_bbox = "loss_bbox" + postfix - name_giou = "loss_giou" + postfix - - loss = dict() - if sum(len(a) for a in gt_bbox) == 0: - loss[name_bbox] = paddle.to_tensor([0.]) - loss[name_giou] = paddle.to_tensor([0.]) - return loss - - src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox, - match_indices) - loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss( - src_bbox, target_bbox, reduction='sum') / num_gts - loss[name_giou] = self.giou_loss( - bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox)) - loss[name_giou] = loss[name_giou].sum() / num_gts - loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou] - return loss - - def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, - postfix=""): - # masks: [b, query, h, w], gt_mask: 
list[[n, H, W]] - name_mask = "loss_mask" + postfix - name_dice = "loss_dice" + postfix - - loss = dict() - if sum(len(a) for a in gt_mask) == 0: - loss[name_mask] = paddle.to_tensor([0.]) - loss[name_dice] = paddle.to_tensor([0.]) - return loss - - src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, - match_indices) - src_masks = F.interpolate( - src_masks.unsqueeze(0), - size=target_masks.shape[-2:], - mode="bilinear")[0] - loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss( - src_masks, - target_masks, - paddle.to_tensor( - [num_gts], dtype='float32')) - loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( - src_masks, target_masks, num_gts) - return loss - - def _dice_loss(self, inputs, targets, num_gts): - inputs = F.sigmoid(inputs) - inputs = inputs.flatten(1) - targets = targets.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_gts - - def _get_loss_aux(self, - boxes, - logits, - gt_bbox, - gt_class, - bg_index, - num_gts, - dn_match_indices=None, - postfix="", - masks=None, - gt_mask=None, - gt_score=None): - loss_class = [] - loss_bbox, loss_giou = [], [] - loss_mask, loss_dice = [], [] - if dn_match_indices is not None: - match_indices = dn_match_indices - elif self.use_uni_match: - match_indices = self.matcher( - boxes[self.uni_match_ind], - logits[self.uni_match_ind], - gt_bbox, - gt_class, - masks=masks[self.uni_match_ind] if masks is not None else None, - gt_mask=gt_mask) - for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): - aux_masks = masks[i] if masks is not None else None - if not self.use_uni_match and dn_match_indices is None: - match_indices = self.matcher( - aux_boxes, - aux_logits, - gt_bbox, - gt_class, - masks=aux_masks, - gt_mask=gt_mask) - if self.use_vfl: - if sum(len(a) for a in gt_bbox) > 0: - src_bbox, target_bbox = self._get_src_target_assign( - aux_boxes.detach(), gt_bbox, match_indices) - iou_score = bbox_iou( - bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), - bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) - else: - iou_score = None - if gt_score is not None: - _, target_score = self._get_src_target_assign( - logits[-1].detach(), gt_score, match_indices) - else: - iou_score = None - loss_class.append( - self._get_loss_class( - aux_logits, - gt_class, - match_indices, - bg_index, - num_gts, - postfix, - iou_score, - gt_score=target_score - if gt_score is not None else None)['loss_class' + postfix]) - loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices, - num_gts, postfix) - loss_bbox.append(loss_['loss_bbox' + postfix]) - loss_giou.append(loss_['loss_giou' + postfix]) - if masks is not None and gt_mask is not None: - loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, - num_gts, postfix) - loss_mask.append(loss_['loss_mask' + postfix]) - loss_dice.append(loss_['loss_dice' + postfix]) - loss = { - "loss_class_aux" + postfix: paddle.add_n(loss_class), - "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), - "loss_giou_aux" + postfix: paddle.add_n(loss_giou) - } - if masks is not None and gt_mask is not None: - loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) - loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice) - return loss - - def _get_index_updates(self, num_query_objects, target, match_indices): - batch_idx = paddle.concat([ - paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices) - ]) - src_idx = paddle.concat([src for (src, _) 
in match_indices]) - src_idx += (batch_idx * num_query_objects) - target_assign = paddle.concat([ - paddle.gather( - t, dst, axis=0) for t, (_, dst) in zip(target, match_indices) - ]) - return src_idx, target_assign - - def _get_src_target_assign(self, src, target, match_indices): - src_assign = paddle.concat([ - paddle.gather( - t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]]) - for t, (I, _) in zip(src, match_indices) - ]) - target_assign = paddle.concat([ - paddle.gather( - t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]]) - for t, (_, J) in zip(target, match_indices) - ]) - return src_assign, target_assign - - def _get_num_gts(self, targets, dtype="float32"): - num_gts = sum(len(a) for a in targets) - num_gts = paddle.to_tensor([num_gts], dtype=dtype) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(num_gts) - num_gts /= paddle.distributed.get_world_size() - num_gts = paddle.clip(num_gts, min=1.) - return num_gts - - def _get_prediction_loss(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_match_indices=None, - num_gts=1, - gt_score=None): - if dn_match_indices is None: - match_indices = self.matcher( - boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask) - else: - match_indices = dn_match_indices - - if self.use_vfl: - if gt_score is not None: #ssod - _, target_score = self._get_src_target_assign( - logits[-1].detach(), gt_score, match_indices) - elif sum(len(a) for a in gt_bbox) > 0: - src_bbox, target_bbox = self._get_src_target_assign( - boxes.detach(), gt_bbox, match_indices) - iou_score = bbox_iou( - bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), - bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) - else: - iou_score = None - else: - iou_score = None - - loss = dict() - loss.update( - self._get_loss_class( - logits, - gt_class, - match_indices, - self.num_classes, - num_gts, - postfix, - iou_score, - gt_score=target_score if gt_score is not None else None)) - loss.update( - self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts, - postfix)) - if masks is not None and gt_mask is not None: - loss.update( - self._get_loss_mask(masks, gt_mask, match_indices, num_gts, - postfix)) - return loss - - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - gt_score=None, - **kwargs): - r""" - Args: - boxes (Tensor): [l, b, query, 4] - logits (Tensor): [l, b, query, num_classes] - gt_bbox (List(Tensor)): list[[n, 4]] - gt_class (List(Tensor)): list[[n, 1]] - masks (Tensor, optional): [l, b, query, h, w] - gt_mask (List(Tensor), optional): list[[n, H, W]] - postfix (str): postfix of loss name - """ - - dn_match_indices = kwargs.get("dn_match_indices", None) - num_gts = kwargs.get("num_gts", None) - if num_gts is None: - num_gts = self._get_num_gts(gt_class) - - total_loss = self._get_prediction_loss( - boxes[-1], - logits[-1], - gt_bbox, - gt_class, - masks=masks[-1] if masks is not None else None, - gt_mask=gt_mask, - postfix=postfix, - dn_match_indices=dn_match_indices, - num_gts=num_gts, - gt_score=gt_score if gt_score is not None else None) - - if self.aux_loss: - total_loss.update( - self._get_loss_aux( - boxes[:-1], - logits[:-1], - gt_bbox, - gt_class, - self.num_classes, - num_gts, - dn_match_indices, - postfix, - masks=masks[:-1] if masks is not None else None, - gt_mask=gt_mask, - gt_score=gt_score if gt_score is not None else None)) - - return total_loss - - -@register -class DINOLoss(DETRLoss): - def forward(self, - 
boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_out_bboxes=None, - dn_out_logits=None, - dn_meta=None, - gt_score=None, - **kwargs): - num_gts = self._get_num_gts(gt_class) - total_loss = super(DINOLoss, self).forward( - boxes, - logits, - gt_bbox, - gt_class, - num_gts=num_gts, - gt_score=gt_score) - - if dn_meta is not None: - dn_positive_idx, dn_num_group = \ - dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] - assert len(gt_class) == len(dn_positive_idx) - - # denoising match indices - dn_match_indices = self.get_dn_match_indices( - gt_class, dn_positive_idx, dn_num_group) - - # compute denoising training loss - num_gts *= dn_num_group - dn_loss = super(DINOLoss, self).forward( - dn_out_bboxes, - dn_out_logits, - gt_bbox, - gt_class, - postfix="_dn", - dn_match_indices=dn_match_indices, - num_gts=num_gts, - gt_score=gt_score) - total_loss.update(dn_loss) - else: - total_loss.update( - {k + '_dn': paddle.to_tensor([0.]) - for k in total_loss.keys()}) - - return total_loss - - @staticmethod - def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): - dn_match_indices = [] - for i in range(len(labels)): - num_gt = len(labels[i]) - if num_gt > 0: - gt_idx = paddle.arange(end=num_gt, dtype="int64") - gt_idx = gt_idx.tile([dn_num_group]) - assert len(dn_positive_idx[i]) == len(gt_idx) - dn_match_indices.append((dn_positive_idx[i], gt_idx)) - else: - dn_match_indices.append((paddle.zeros( - [0], dtype="int64"), paddle.zeros( - [0], dtype="int64"))) - return dn_match_indices - - -@register -class MaskDINOLoss(DETRLoss): - __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points'] - __inject__ = ['matcher'] - - def __init__(self, - num_classes=80, - matcher='HungarianMatcher', - loss_coeff={ - 'class': 4, - 'bbox': 5, - 'giou': 2, - 'mask': 5, - 'dice': 5 - }, - aux_loss=True, - use_focal_loss=False, - num_sample_points=12544, - oversample_ratio=3.0, - important_sample_ratio=0.75): - super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff, - aux_loss, use_focal_loss) - assert oversample_ratio >= 1 - assert important_sample_ratio <= 1 and important_sample_ratio >= 0 - - self.num_sample_points = num_sample_points - self.oversample_ratio = oversample_ratio - self.important_sample_ratio = important_sample_ratio - self.num_oversample_points = int(num_sample_points * oversample_ratio) - self.num_important_points = int(num_sample_points * - important_sample_ratio) - self.num_random_points = num_sample_points - self.num_important_points - - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_out_bboxes=None, - dn_out_logits=None, - dn_out_masks=None, - dn_meta=None, - **kwargs): - num_gts = self._get_num_gts(gt_class) - total_loss = super(MaskDINOLoss, self).forward( - boxes, - logits, - gt_bbox, - gt_class, - masks=masks, - gt_mask=gt_mask, - num_gts=num_gts) - - if dn_meta is not None: - dn_positive_idx, dn_num_group = \ - dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] - assert len(gt_class) == len(dn_positive_idx) - - # denoising match indices - dn_match_indices = DINOLoss.get_dn_match_indices( - gt_class, dn_positive_idx, dn_num_group) - - # compute denoising training loss - num_gts *= dn_num_group - dn_loss = super(MaskDINOLoss, self).forward( - dn_out_bboxes, - dn_out_logits, - gt_bbox, - gt_class, - masks=dn_out_masks, - gt_mask=gt_mask, - postfix="_dn", - dn_match_indices=dn_match_indices, - num_gts=num_gts) - total_loss.update(dn_loss) - else: - 
total_loss.update(
-                {k + '_dn': paddle.to_tensor([0.])
-                 for k in total_loss.keys()})
-
-        return total_loss
-
-    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
-                       postfix=""):
-        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
-        name_mask = "loss_mask" + postfix
-        name_dice = "loss_dice" + postfix
-
-        loss = dict()
-        if sum(len(a) for a in gt_mask) == 0:
-            loss[name_mask] = paddle.to_tensor([0.])
-            loss[name_dice] = paddle.to_tensor([0.])
-            return loss
-
-        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
-                                                              match_indices)
-        # sample points
-        sample_points = self._get_point_coords_by_uncertainty(src_masks)
-        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0
-
-        src_masks = F.grid_sample(
-            src_masks.unsqueeze(1), sample_points,
-            align_corners=False).squeeze([1, 2])
-
-        target_masks = F.grid_sample(
-            target_masks.unsqueeze(1), sample_points,
-            align_corners=False).squeeze([1, 2]).detach()
-
-        loss[name_mask] = self.loss_coeff[
-            'mask'] * F.binary_cross_entropy_with_logits(
-                src_masks, target_masks,
-                reduction='none').mean(1).sum() / num_gts
-        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
-            src_masks, target_masks, num_gts)
-        return loss
-
-    def _get_point_coords_by_uncertainty(self, masks):
-        # Sample points based on their uncertainty.
-        masks = masks.detach()
-        num_masks = masks.shape[0]
-        sample_points = paddle.rand(
-            [num_masks, 1, self.num_oversample_points, 2])
-
-        out_mask = F.grid_sample(
-            masks.unsqueeze(1), 2.0 * sample_points - 1.0,
-            align_corners=False).squeeze([1, 2])
-        out_mask = -paddle.abs(out_mask)
-
-        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
-        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
-        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
-        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
-
-        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
-        if self.num_random_points > 0:
-            sample_points = paddle.concat(
-                [
-                    sample_points,
-                    paddle.rand([num_masks, self.num_random_points, 2])
-                ],
-                axis=1)
-        return sample_points
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/fairmot_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/fairmot_loss.py
deleted file mode 100644
index e24ff33..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/fairmot_loss.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
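The `_get_point_coords_by_uncertainty` routine deleted above implements importance sampling over mask logits: oversample random points, score uncertainty as the negative absolute logit, and keep the top-k. A standalone sketch of that idea; the function name and defaults are illustrative, not the deleted API:

import paddle
import paddle.nn.functional as F

def sample_uncertain_points(mask_logits, num_points=16, oversample=3.0):
    # mask_logits: [n, H, W]; points with small |logit| are least certain
    n = mask_logits.shape[0]
    num_over = int(num_points * oversample)
    pts = paddle.rand([n, 1, num_over, 2])            # coords in [0, 1)
    vals = F.grid_sample(
        mask_logits.unsqueeze(1), 2.0 * pts - 1.0,    # grid expects [-1, 1]
        align_corners=False).squeeze([1, 2])          # [n, num_over]
    uncertainty = -paddle.abs(vals)                   # larger = less certain
    _, topk = paddle.topk(uncertainty, num_points, axis=1)
    batch = paddle.arange(n).unsqueeze(-1).tile([1, num_points])
    ind = paddle.stack([batch, topk], axis=-1)
    return paddle.gather_nd(pts.squeeze(1), ind)      # [n, num_points, 2]

print(sample_uncertain_points(paddle.randn([2, 32, 32])).shape)

Supervising only these sampled points, instead of every mask pixel, is what keeps the mask loss cheap at high resolution.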
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -from paddle.nn.initializer import Constant -from ppdet.core.workspace import register - -__all__ = ['FairMOTLoss'] - - -@register -class FairMOTLoss(nn.Layer): - def __init__(self): - super(FairMOTLoss, self).__init__() - self.det_weight = self.create_parameter( - shape=[1], default_initializer=Constant(-1.85)) - self.reid_weight = self.create_parameter( - shape=[1], default_initializer=Constant(-1.05)) - - def forward(self, det_loss, reid_loss): - loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp( - -self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight - ) - loss *= 0.5 - return {'loss': loss} diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/fcos_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/fcos_loss.py deleted file mode 100644 index e9bbc27..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/fcos_loss.py +++ /dev/null @@ -1,1020 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from ppdet.modeling import ops -from functools import partial - -__all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR'] - - -def flatten_tensor(inputs, channel_first=False): - """ - Flatten a Tensor - Args: - inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C] - channel_first (bool): If true the dimension order of Tensor is - [N, C, H, W], otherwise is [N, H, W, C] - Return: - output_channel_last (Tensor): The flattened Tensor in channel_last style - """ - if channel_first: - input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1]) - else: - input_channel_last = inputs - output_channel_last = paddle.flatten( - input_channel_last, start_axis=0, stop_axis=2) - return output_channel_last - - -@register -class FCOSLoss(nn.Layer): - """ - FCOSLoss - Args: - loss_alpha (float): alpha in focal loss - loss_gamma (float): gamma in focal loss - iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU - reg_weights (float): weight for location loss - quality (str): quality branch, centerness/iou - """ - - def __init__(self, - loss_alpha=0.25, - loss_gamma=2.0, - iou_loss_type="giou", - reg_weights=1.0, - quality='centerness'): - super(FCOSLoss, self).__init__() - self.loss_alpha = loss_alpha - self.loss_gamma = loss_gamma - self.iou_loss_type = iou_loss_type - self.reg_weights = reg_weights - self.quality = quality - - def _iou_loss(self, - pred, - targets, - positive_mask, - weights=None, - return_iou=False): - """ - Calculate the loss for location prediction - Args: - pred (Tensor): bounding boxes prediction - targets (Tensor): targets for positive samples - positive_mask (Tensor): mask of positive 
samples - weights (Tensor): weights for each positive samples - Return: - loss (Tensor): location loss - """ - plw = pred[:, 0] * positive_mask - pth = pred[:, 1] * positive_mask - prw = pred[:, 2] * positive_mask - pbh = pred[:, 3] * positive_mask - - tlw = targets[:, 0] * positive_mask - tth = targets[:, 1] * positive_mask - trw = targets[:, 2] * positive_mask - tbh = targets[:, 3] * positive_mask - tlw.stop_gradient = True - trw.stop_gradient = True - tth.stop_gradient = True - tbh.stop_gradient = True - - ilw = paddle.minimum(plw, tlw) - irw = paddle.minimum(prw, trw) - ith = paddle.minimum(pth, tth) - ibh = paddle.minimum(pbh, tbh) - - clw = paddle.maximum(plw, tlw) - crw = paddle.maximum(prw, trw) - cth = paddle.maximum(pth, tth) - cbh = paddle.maximum(pbh, tbh) - - area_predict = (plw + prw) * (pth + pbh) - area_target = (tlw + trw) * (tth + tbh) - area_inter = (ilw + irw) * (ith + ibh) - ious = (area_inter + 1.0) / ( - area_predict + area_target - area_inter + 1.0) - ious = ious * positive_mask - - if return_iou: - return ious - - if self.iou_loss_type.lower() == "linear_iou": - loss = 1.0 - ious - elif self.iou_loss_type.lower() == "giou": - area_uniou = area_predict + area_target - area_inter - area_circum = (clw + crw) * (cth + cbh) + 1e-7 - giou = ious - (area_circum - area_uniou) / area_circum - loss = 1.0 - giou - elif self.iou_loss_type.lower() == "iou": - loss = 0.0 - paddle.log(ious) - else: - raise KeyError - if weights is not None: - loss = loss * weights - return loss - - def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, - tag_bboxes, tag_center): - """ - Calculate the loss for classification, location and centerness - Args: - cls_logits (list): list of Tensor, which is predicted - score for all anchor points with shape [N, M, C] - bboxes_reg (list): list of Tensor, which is predicted - offsets for all anchor points with shape [N, M, 4] - centerness (list): list of Tensor, which is predicted - centerness for all anchor points with shape [N, M, 1] - tag_labels (list): list of Tensor, which is category - targets for each anchor point - tag_bboxes (list): list of Tensor, which is bounding - boxes targets for positive samples - tag_center (list): list of Tensor, which is centerness - targets for positive samples - Return: - loss (dict): loss composed by classification loss, bounding box - """ - cls_logits_flatten_list = [] - bboxes_reg_flatten_list = [] - centerness_flatten_list = [] - tag_labels_flatten_list = [] - tag_bboxes_flatten_list = [] - tag_center_flatten_list = [] - num_lvl = len(cls_logits) - for lvl in range(num_lvl): - cls_logits_flatten_list.append( - flatten_tensor(cls_logits[lvl], True)) - bboxes_reg_flatten_list.append( - flatten_tensor(bboxes_reg[lvl], True)) - centerness_flatten_list.append( - flatten_tensor(centerness[lvl], True)) - - tag_labels_flatten_list.append( - flatten_tensor(tag_labels[lvl], False)) - tag_bboxes_flatten_list.append( - flatten_tensor(tag_bboxes[lvl], False)) - tag_center_flatten_list.append( - flatten_tensor(tag_center[lvl], False)) - - cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) - bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) - centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) - - tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) - tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) - tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) - tag_labels_flatten.stop_gradient = True - 
tag_bboxes_flatten.stop_gradient = True - tag_center_flatten.stop_gradient = True - - mask_positive_bool = tag_labels_flatten > 0 - mask_positive_bool.stop_gradient = True - mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") - mask_positive_float.stop_gradient = True - - num_positive_fp32 = paddle.sum(mask_positive_float) - num_positive_fp32.stop_gradient = True - num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") - num_positive_int32 = num_positive_int32 * 0 + 1 - num_positive_int32.stop_gradient = True - - normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) - normalize_sum.stop_gradient = True - - # 1. cls_logits: sigmoid_focal_loss - # expand onehot labels - num_classes = cls_logits_flatten.shape[-1] - tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1) - tag_labels_flatten_bin = F.one_hot( - tag_labels_flatten, num_classes=1 + num_classes) - tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:] - # sigmoid_focal_loss - cls_loss = F.sigmoid_focal_loss( - cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32 - - if self.quality == 'centerness': - # 2. bboxes_reg: giou_loss - mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) - tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) - reg_loss = self._iou_loss( - bboxes_reg_flatten, - tag_bboxes_flatten, - mask_positive_float, - weights=tag_center_flatten) - reg_loss = reg_loss * mask_positive_float / normalize_sum - - # 3. centerness: sigmoid_cross_entropy_with_logits_loss - centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1) - quality_loss = ops.sigmoid_cross_entropy_with_logits( - centerness_flatten, tag_center_flatten) - quality_loss = quality_loss * mask_positive_float / num_positive_fp32 - - elif self.quality == 'iou': - # 2. bboxes_reg: giou_loss - mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) - tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) - reg_loss = self._iou_loss( - bboxes_reg_flatten, - tag_bboxes_flatten, - mask_positive_float, - weights=None) - reg_loss = reg_loss * mask_positive_float / num_positive_fp32 - # num_positive_fp32 is num_foreground - - # 3. 
centerness: sigmoid_cross_entropy_with_logits_loss
-            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
-            gt_ious = self._iou_loss(
-                bboxes_reg_flatten,
-                tag_bboxes_flatten,
-                mask_positive_float,
-                weights=None,
-                return_iou=True)
-            quality_loss = ops.sigmoid_cross_entropy_with_logits(
-                centerness_flatten, gt_ious)
-            quality_loss = quality_loss * mask_positive_float / num_positive_fp32
-        else:
-            raise Exception(f'Unknown quality type: {self.quality}')
-
-        loss_all = {
-            "loss_cls": paddle.sum(cls_loss),
-            "loss_box": paddle.sum(reg_loss),
-            "loss_quality": paddle.sum(quality_loss),
-        }
-        return loss_all
-
-
-@register
-class FCOSLossMILC(FCOSLoss):
-    """
-    FCOSLossMILC for ARSL in semi-det(ssod)
-    Args:
-        loss_alpha (float): alpha in focal loss
-        loss_gamma (float): gamma in focal loss
-        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
-        reg_weights (float): weight for location loss
-    """
-
-    def __init__(self,
-                 loss_alpha=0.25,
-                 loss_gamma=2.0,
-                 iou_loss_type="giou",
-                 reg_weights=1.0):
-        super(FCOSLossMILC, self).__init__()
-        self.loss_alpha = loss_alpha
-        self.loss_gamma = loss_gamma
-        self.iou_loss_type = iou_loss_type
-        self.reg_weights = reg_weights
-
-    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
-        """
-        Calculate the loss for location prediction
-        Args:
-            pred (Tensor): bounding boxes prediction
-            targets (Tensor): targets for positive samples
-            weights (Tensor): weights for each positive samples
-        Return:
-            loss (Tensor): location loss
-        """
-        plw = pred[:, 0]
-        pth = pred[:, 1]
-        prw = pred[:, 2]
-        pbh = pred[:, 3]
-
-        tlw = targets[:, 0]
-        tth = targets[:, 1]
-        trw = targets[:, 2]
-        tbh = targets[:, 3]
-        tlw.stop_gradient = True
-        trw.stop_gradient = True
-        tth.stop_gradient = True
-        tbh.stop_gradient = True
-
-        ilw = paddle.minimum(plw, tlw)
-        irw = paddle.minimum(prw, trw)
-        ith = paddle.minimum(pth, tth)
-        ibh = paddle.minimum(pbh, tbh)
-
-        clw = paddle.maximum(plw, tlw)
-        crw = paddle.maximum(prw, trw)
-        cth = paddle.maximum(pth, tth)
-        cbh = paddle.maximum(pbh, tbh)
-
-        area_predict = (plw + prw) * (pth + pbh)
-        area_target = (tlw + trw) * (tth + tbh)
-        area_inter = (ilw + irw) * (ith + ibh)
-        ious = (area_inter + 1.0) / (
-            area_predict + area_target - area_inter + 1.0)
-        ious = ious
-
-        if self.iou_loss_type.lower() == "linear_iou":
-            loss = 1.0 - ious
-        elif self.iou_loss_type.lower() == "giou":
-            area_uniou = area_predict + area_target - area_inter
-            area_circum = (clw + crw) * (cth + cbh) + 1e-7
-            giou = ious - (area_circum - area_uniou) / area_circum
-            loss = 1.0 - giou
-        elif self.iou_loss_type.lower() == "iou":
-            loss = 0.0 - paddle.log(ious)
-        else:
-            raise KeyError
-        if weights is not None:
-            loss = loss * weights
-        loss = paddle.sum(loss)
-        if avg_factor is not None:
-            loss = loss / avg_factor
-        return loss
-
-    # temp function: calculate iou between bbox and target
-    def _bbox_overlap_align(self, pred, targets):
-        assert pred.shape[0] == targets.shape[0], \
-            'the pred should be aligned with target.'
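The `iou_loss` above works directly on (l, t, r, b) distances from a shared anchor point rather than on corner coordinates. A compact sketch of the same GIoU computation, hedged as an illustration: the helper name is invented, and it uses a small epsilon instead of the +1.0 smoothing in the deleted code.

import paddle

def ltrb_giou(pred, target, eps=1e-7):
    # pred/target: [n, 4] offsets (l, t, r, b) from the same anchor point
    inter_w = paddle.minimum(pred[:, 0], target[:, 0]) + paddle.minimum(pred[:, 2], target[:, 2])
    inter_h = paddle.minimum(pred[:, 1], target[:, 1]) + paddle.minimum(pred[:, 3], target[:, 3])
    enc_w = paddle.maximum(pred[:, 0], target[:, 0]) + paddle.maximum(pred[:, 2], target[:, 2])
    enc_h = paddle.maximum(pred[:, 1], target[:, 1]) + paddle.maximum(pred[:, 3], target[:, 3])
    area_p = (pred[:, 0] + pred[:, 2]) * (pred[:, 1] + pred[:, 3])
    area_t = (target[:, 0] + target[:, 2]) * (target[:, 1] + target[:, 3])
    inter = inter_w * inter_h
    union = area_p + area_t - inter
    iou = inter / (union + eps)
    enclose = enc_w * enc_h + eps
    return iou - (enclose - union) / enclose  # GIoU in [-1, 1]

p = paddle.to_tensor([[1., 1., 1., 1.]])   # 2x2 box around the anchor
t = paddle.to_tensor([[1., 1., 2., 2.]])   # 3x3 box around the anchor
print(float(ltrb_giou(p, t)))              # IoU 4/9, enclosing box adds no penalty

Because both boxes contain the anchor point, the min of paired offsets gives the intersection extents and the max gives the enclosing box, with no corner conversion needed.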
- - plw = pred[:, 0] - pth = pred[:, 1] - prw = pred[:, 2] - pbh = pred[:, 3] - - tlw = targets[:, 0] - tth = targets[:, 1] - trw = targets[:, 2] - tbh = targets[:, 3] - - ilw = paddle.minimum(plw, tlw) - irw = paddle.minimum(prw, trw) - ith = paddle.minimum(pth, tth) - ibh = paddle.minimum(pbh, tbh) - - area_predict = (plw + prw) * (pth + pbh) - area_target = (tlw + trw) * (tth + tbh) - area_inter = (ilw + irw) * (ith + ibh) - ious = (area_inter + 1.0) / ( - area_predict + area_target - area_inter + 1.0) - - return ious - - def iou_based_soft_label_loss(self, - pred, - target, - alpha=0.75, - gamma=2.0, - iou_weighted=False, - implicit_iou=None, - avg_factor=None): - assert pred.shape == target.shape - pred = F.sigmoid(pred) - target = target.cast(pred.dtype) - - if implicit_iou is not None: - pred = pred * implicit_iou - - if iou_weighted: - focal_weight = (pred - target).abs().pow(gamma) * target * (target > 0.0).cast('float32') + \ - alpha * (pred - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - else: - focal_weight = (pred - target).abs().pow(gamma) * (target > 0.0).cast('float32') + \ - alpha * (pred - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - - # focal loss - loss = F.binary_cross_entropy( - pred, target, reduction='none') * focal_weight - if avg_factor is not None: - loss = loss / avg_factor - return loss - - def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, - tag_bboxes, tag_center): - """ - Calculate the loss for classification, location and centerness - Args: - cls_logits (list): list of Tensor, which is predicted - score for all anchor points with shape [N, M, C] - bboxes_reg (list): list of Tensor, which is predicted - offsets for all anchor points with shape [N, M, 4] - centerness (list): list of Tensor, which is predicted - centerness for all anchor points with shape [N, M, 1] - tag_labels (list): list of Tensor, which is category - targets for each anchor point - tag_bboxes (list): list of Tensor, which is bounding - boxes targets for positive samples - tag_center (list): list of Tensor, which is centerness - targets for positive samples - Return: - loss (dict): loss composed by classification loss, bounding box - """ - cls_logits_flatten_list = [] - bboxes_reg_flatten_list = [] - centerness_flatten_list = [] - tag_labels_flatten_list = [] - tag_bboxes_flatten_list = [] - tag_center_flatten_list = [] - num_lvl = len(cls_logits) - for lvl in range(num_lvl): - cls_logits_flatten_list.append( - flatten_tensor(cls_logits[lvl], True)) - bboxes_reg_flatten_list.append( - flatten_tensor(bboxes_reg[lvl], True)) - centerness_flatten_list.append( - flatten_tensor(centerness[lvl], True)) - - tag_labels_flatten_list.append( - flatten_tensor(tag_labels[lvl], False)) - tag_bboxes_flatten_list.append( - flatten_tensor(tag_bboxes[lvl], False)) - tag_center_flatten_list.append( - flatten_tensor(tag_center[lvl], False)) - - cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) - bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) - centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) - - tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) - tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) - tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) - tag_labels_flatten.stop_gradient = True - tag_bboxes_flatten.stop_gradient = True - tag_center_flatten.stop_gradient = True - - # find positive index - mask_positive_bool = tag_labels_flatten > 0 - 
mask_positive_bool.stop_gradient = True - mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") - mask_positive_float.stop_gradient = True - - num_positive_fp32 = paddle.sum(mask_positive_float) - num_positive_fp32.stop_gradient = True - num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") - num_positive_int32 = num_positive_int32 * 0 + 1 - num_positive_int32.stop_gradient = True - - # centerness target is used as reg weight - normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) - normalize_sum.stop_gradient = True - - # 1. IoU-Based soft label loss - # calculate iou - with paddle.no_grad(): - pos_ind = paddle.nonzero( - tag_labels_flatten.reshape([-1]) > 0).reshape([-1]) - pos_pred = bboxes_reg_flatten[pos_ind] - pos_target = tag_bboxes_flatten[pos_ind] - bbox_iou = self._bbox_overlap_align(pos_pred, pos_target) - # pos labels - pos_labels = tag_labels_flatten[pos_ind].squeeze(1) - cls_target = paddle.zeros(cls_logits_flatten.shape) - cls_target[pos_ind, pos_labels - 1] = bbox_iou - cls_loss = self.iou_based_soft_label_loss( - cls_logits_flatten, - cls_target, - implicit_iou=F.sigmoid(centerness_flatten), - avg_factor=num_positive_fp32) - - # 2. bboxes_reg: giou_loss - mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) - tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) - reg_loss = self._iou_loss( - bboxes_reg_flatten, - tag_bboxes_flatten, - mask_positive_float, - weights=tag_center_flatten) - reg_loss = reg_loss * mask_positive_float / normalize_sum - - # 3. iou loss - pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind] - loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred, bbox_iou) - loss_iou = loss_iou / num_positive_fp32 * 0.5 - - loss_all = { - "loss_cls": paddle.sum(cls_loss), - "loss_box": paddle.sum(reg_loss), - 'loss_iou': paddle.sum(loss_iou), - } - - return loss_all - - -# Concat multi-level feature maps by image -def levels_to_images(mlvl_tensor): - batch_size = mlvl_tensor[0].shape[0] - batch_list = [[] for _ in range(batch_size)] - channels = mlvl_tensor[0].shape[1] - for t in mlvl_tensor: - t = t.transpose([0, 2, 3, 1]) - t = t.reshape([batch_size, -1, channels]) - for img in range(batch_size): - batch_list[img].append(t[img]) - return [paddle.concat(item, axis=0) for item in batch_list] - - -def multi_apply(func, *args, **kwargs): - """Apply function to a list of arguments. - - Note: - This function applies the ``func`` to multiple inputs and - map the multiple outputs of the ``func`` into different - list. Each list contains the same type of outputs corresponding - to different inputs. 
-
-    Args:
-        func (Function): A function that will be applied to a list of
-            arguments
-
-    Returns:
-        tuple(list): A tuple containing multiple lists, where each list \
-            contains one kind of result returned by the function
-    """
-    pfunc = partial(func, **kwargs) if kwargs else func
-    map_results = map(pfunc, *args)
-    return tuple(map(list, zip(*map_results)))
-
-
-@register
-class FCOSLossCR(FCOSLossMILC):
-    """
-    FCOSLoss of Consistency Regularization
-    """
-
-    def __init__(self,
-                 iou_loss_type="giou",
-                 cls_weight=2.0,
-                 reg_weight=2.0,
-                 iou_weight=0.5,
-                 hard_neg_mining_flag=True):
-        super(FCOSLossCR, self).__init__()
-        self.iou_loss_type = iou_loss_type
-        self.cls_weight = cls_weight
-        self.reg_weight = reg_weight
-        self.iou_weight = iou_weight
-        self.hard_neg_mining_flag = hard_neg_mining_flag
-
-    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
-        """
-        Calculate the loss for location prediction
-        Args:
-            pred (Tensor): bounding boxes prediction
-            targets (Tensor): targets for positive samples
-            weights (Tensor): weights for each positive samples
-        Return:
-            loss (Tensor): location loss
-        """
-        plw = pred[:, 0]
-        pth = pred[:, 1]
-        prw = pred[:, 2]
-        pbh = pred[:, 3]
-
-        tlw = targets[:, 0]
-        tth = targets[:, 1]
-        trw = targets[:, 2]
-        tbh = targets[:, 3]
-        tlw.stop_gradient = True
-        trw.stop_gradient = True
-        tth.stop_gradient = True
-        tbh.stop_gradient = True
-
-        ilw = paddle.minimum(plw, tlw)
-        irw = paddle.minimum(prw, trw)
-        ith = paddle.minimum(pth, tth)
-        ibh = paddle.minimum(pbh, tbh)
-
-        clw = paddle.maximum(plw, tlw)
-        crw = paddle.maximum(prw, trw)
-        cth = paddle.maximum(pth, tth)
-        cbh = paddle.maximum(pbh, tbh)
-
-        area_predict = (plw + prw) * (pth + pbh)
-        area_target = (tlw + trw) * (tth + tbh)
-        area_inter = (ilw + irw) * (ith + ibh)
-        ious = (area_inter + 1.0) / (
-            area_predict + area_target - area_inter + 1.0)
-        ious = ious
-
-        if self.iou_loss_type.lower() == "linear_iou":
-            loss = 1.0 - ious
-        elif self.iou_loss_type.lower() == "giou":
-            area_uniou = area_predict + area_target - area_inter
-            area_circum = (clw + crw) * (cth + cbh) + 1e-7
-            giou = ious - (area_circum - area_uniou) / area_circum
-            loss = 1.0 - giou
-        elif self.iou_loss_type.lower() == "iou":
-            loss = 0.0 - paddle.log(ious)
-        else:
-            raise KeyError
-        if weights is not None:
-            loss = loss * weights
-        loss = paddle.sum(loss)
-        if avg_factor is not None:
-            loss = loss / avg_factor
-        return loss
-
-    # calculate iou between bbox and target
-    def bbox_overlap_align(self, pred, targets):
-        assert pred.shape[0] == targets.shape[0], \
-            'the pred should be aligned with target.'
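The `multi_apply` helper defined above fans a function out over per-image argument lists and regroups each of its outputs into its own list. A toy usage sketch, self-contained so it restates the helper; the `stats` function is invented for illustration:

from functools import partial

def multi_apply(func, *args, **kwargs):
    # same shape as the deleted helper: map, then unzip the per-call tuples
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))

def stats(xs, scale=1.0):
    return min(xs) * scale, max(xs) * scale

mins, maxs = multi_apply(stats, [[1, 2], [3, 9]], scale=10.0)
print(mins, maxs)  # [10.0, 30.0] [20.0, 90.0]

This is exactly how `get_targets_per_img` below is applied image by image while each target kind (masks, cls, loc, iou) comes back as a batched list.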
-
-        plw = pred[:, 0]
-        pth = pred[:, 1]
-        prw = pred[:, 2]
-        pbh = pred[:, 3]
-
-        tlw = targets[:, 0]
-        tth = targets[:, 1]
-        trw = targets[:, 2]
-        tbh = targets[:, 3]
-
-        ilw = paddle.minimum(plw, tlw)
-        irw = paddle.minimum(prw, trw)
-        ith = paddle.minimum(pth, tth)
-        ibh = paddle.minimum(pbh, tbh)
-
-        area_predict = (plw + prw) * (pth + pbh)
-        area_target = (tlw + trw) * (tth + tbh)
-        area_inter = (ilw + irw) * (ith + ibh)
-        ious = (area_inter + 1.0) / (
-            area_predict + area_target - area_inter + 1.0)
-        return ious
-
-    # cls loss: iou-based soft label with joint iou
-    def quality_focal_loss(self,
-                           stu_cls,
-                           targets,
-                           quality=None,
-                           weights=None,
-                           alpha=0.75,
-                           gamma=2.0,
-                           avg_factor='sum'):
-        stu_cls = F.sigmoid(stu_cls)
-        if quality is not None:
-            stu_cls = stu_cls * F.sigmoid(quality)
-
-        focal_weight = (stu_cls - targets).abs().pow(gamma) * (targets > 0.0).cast('float32') + \
-            alpha * (stu_cls - targets).abs().pow(gamma) * \
-            (targets <= 0.0).cast('float32')
-
-        loss = F.binary_cross_entropy(
-            stu_cls, targets, reduction='none') * focal_weight
-
-        if weights is not None:
-            loss = loss * weights.reshape([-1, 1])
-        loss = paddle.sum(loss)
-        if avg_factor is not None:
-            loss = loss / avg_factor
-        return loss
-
-    # generate points according to feature maps
-    def compute_locations_by_level(self, fpn_stride, h, w):
-        """
-        Compute locations of anchor points of each FPN layer
-        Return:
-            Anchor points locations of current FPN feature map
-        """
-        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
-        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
-        shift_x = paddle.unsqueeze(shift_x, axis=0)
-        shift_y = paddle.unsqueeze(shift_y, axis=1)
-        shift_x = paddle.expand(shift_x, shape=[h, w])
-        shift_y = paddle.expand(shift_y, shape=[h, w])
-        shift_x = paddle.reshape(shift_x, shape=[-1])
-        shift_y = paddle.reshape(shift_y, shape=[-1])
-        location = paddle.stack(
-            [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2
-        return location
-
-    # decode bbox from ltrb to x1y1x2y2
-    def decode_bbox(self, ltrb, points):
-        assert ltrb.shape[0] == points.shape[0], \
-            "When decoding bbox in one image, the num of loc should be same with points."
-        bbox_decoding = paddle.stack(
-            [
-                points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
-                points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]
-            ],
-            axis=1)
-        return bbox_decoding
-
-    # encode bbox from x1y1x2y2 to ltrb
-    def encode_bbox(self, bbox, points):
-        assert bbox.shape[0] == points.shape[0], \
-            "When encoding bbox in one image, the num of bbox should be same with points."
-        bbox_encoding = paddle.stack(
-            [
-                points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],
-                bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]
-            ],
-            axis=1)
-        return bbox_encoding
-
-    def calculate_iou(self, gt_bbox, predict_bbox):
-        # bbox area
-        gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \
-            (gt_bbox[:, 3] - gt_bbox[:, 1])
-        predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \
-            (predict_bbox[:, 3] - predict_bbox[:, 1])
-        # overlap area
-        lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2])
-        rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:])
-        wh = paddle.clip(rb - lt, min=0)
-        overlap = wh[..., 0] * wh[..., 1]
-        # iou
-        iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap)
-        return iou
-
-    # select potential positives from hard negatives
-    def hard_neg_mining(self,
-                        cls_score,
-                        loc_ltrb,
-                        quality,
-                        pos_ind,
-                        hard_neg_ind,
-                        loc_mask,
-                        loc_targets,
-                        iou_thresh=0.6):
-        # get points locations and strides
-        points_list = []
-        strides_list = []
-        scale_list = []
-        scale = [0, 1, 2, 3, 4]
-        for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride,
-                                             self.lvl_hw):
-            h, w = HW
-            lvl_points = self.compute_locations_by_level(fpn_stride, h, w)
-            points_list.append(lvl_points)
-            lvl_strides = paddle.full([h * w, 1], fpn_stride)
-            strides_list.append(lvl_strides)
-            lvl_scales = paddle.full([h * w, 1], fpn_scale)
-            scale_list.append(lvl_scales)
-        points = paddle.concat(points_list, axis=0)
-        strides = paddle.concat(strides_list, axis=0)
-        scales = paddle.concat(scale_list, axis=0)
-
-        # cls scores
-        cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality)
-        max_vals = paddle.max(cls_vals, axis=-1)
-        class_ind = paddle.argmax(cls_vals, axis=-1)
-
-        ### calculate iou between positive and hard negative
-        # decode pos bbox
-        pos_cls = max_vals[pos_ind]
-        pos_loc = loc_ltrb[pos_ind].reshape([-1, 4])
-        pos_strides = strides[pos_ind]
-        pos_points = points[pos_ind].reshape([-1, 2])
-        pos_loc = pos_loc * pos_strides
-        pos_bbox = self.decode_bbox(pos_loc, pos_points)
-        pos_scales = scales[pos_ind]
-        # decode hard negative bbox
-        hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4])
-        hard_neg_strides = strides[hard_neg_ind]
-        hard_neg_points = points[hard_neg_ind].reshape([-1, 2])
-        hard_neg_loc = hard_neg_loc * hard_neg_strides
-        hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points)
-        hard_neg_scales = scales[hard_neg_ind]
-        # iou between pos bbox and hard negative bbox
-        hard_neg_pos_iou = self.calculate_iou(hard_neg_bbox, pos_bbox)
-
-        ### select potential positives from hard negatives
-        # scale flag
-        scale_temp = paddle.abs(
-            pos_scales.reshape([-1])[None, :] - hard_neg_scales.reshape([-1])
-            [:, None])
-        scale_flag = (scale_temp <= 1.)
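The `decode_bbox`/`encode_bbox` pair above is an exact inverse for boxes paired with their own anchor points. A quick self-contained round-trip check; these are standalone restatements for illustration, not the deleted methods themselves:

import paddle

def decode_bbox(ltrb, points):
    # (l, t, r, b) offsets plus anchor points -> (x1, y1, x2, y2)
    return paddle.stack([points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
                         points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]], axis=1)

def encode_bbox(bbox, points):
    # (x1, y1, x2, y2) -> (l, t, r, b) offsets relative to anchor points
    return paddle.stack([points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],
                         bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]], axis=1)

pts = paddle.to_tensor([[10., 10.]])
ltrb = paddle.to_tensor([[2., 3., 4., 5.]])
assert bool((encode_bbox(decode_bbox(ltrb, pts), pts) == ltrb).all())

The mining code relies on this invertibility: it decodes to absolute corners to compare boxes across points, then re-encodes the aggregated box as the offset target of each potential positive.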
-        # iou flag
-        iou_flag = (hard_neg_pos_iou >= iou_thresh)
-        # same class flag
-        pos_class = class_ind[pos_ind]
-        hard_neg_class = class_ind[hard_neg_ind]
-        class_flag = pos_class[None, :] - hard_neg_class[:, None]
-        class_flag = (class_flag == 0)
-        # hard negative point inside positive bbox flag
-        ltrb_temp = paddle.stack(
-            [
-                hard_neg_points[:, None, 0] - pos_bbox[None, :, 0],
-                hard_neg_points[:, None, 1] - pos_bbox[None, :, 1],
-                pos_bbox[None, :, 2] - hard_neg_points[:, None, 0],
-                pos_bbox[None, :, 3] - hard_neg_points[:, None, 1]
-            ],
-            axis=-1)
-        inside_flag = ltrb_temp.min(axis=-1) > 0
-        # reset iou
-        valid_flag = (iou_flag & class_flag & inside_flag & scale_flag)
-        invalid_iou = paddle.zeros_like(hard_neg_pos_iou)
-        hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou,
-                                        invalid_iou)
-        pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1)
-        # select potential pos
-        potential_pos_ind = (pos_hard_neg_max_iou > 0.)
-        num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0]
-        if num_potential_pos == 0:
-            return None
-
-        ### calculate loc target: aggregate all matching bboxes as the bbox targets of potential pos
-        # prepare data
-        potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2])
-        potential_strides = hard_neg_strides[potential_pos_ind]
-        potential_valid_flag = valid_flag[potential_pos_ind]
-        potential_pos_ind = hard_neg_ind[potential_pos_ind]
-
-        # get cls and box of matching positives
-        pos_cls = max_vals[pos_ind]
-        expand_pos_bbox = paddle.expand(
-            pos_bbox,
-            shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]])
-        expand_pos_cls = paddle.expand(
-            pos_cls, shape=[num_potential_pos, pos_cls.shape[0]])
-        invalid_cls = paddle.zeros_like(expand_pos_cls)
-        expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls,
-                                      invalid_cls)
-        expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1)
-        # aggregate box based on cls_score
-        agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \
-            / expand_pos_cls.sum(axis=1)
-        agg_ltrb = self.encode_bbox(agg_bbox, potential_points)
-        agg_ltrb = agg_ltrb / potential_strides
-
-        # loc target for all pos
-        loc_targets[potential_pos_ind] = agg_ltrb
-        loc_mask[potential_pos_ind] = 1.
-
-        return loc_mask, loc_targets
-
-    # get training targets
-    def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc,
-                            stu_iou):
-
-        ### sample selection
-        # prepare data
-        tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou)
-        class_ind = paddle.argmax(tea_cls_scores, axis=-1)
-        max_vals = paddle.max(tea_cls_scores, axis=-1)
-        cls_mask = paddle.zeros_like(
-            max_vals
-        )  # set cls valid mask: pos is 1, hard_negative and negative are 0.
-        num_pos, num_hard_neg = 0, 0
-
-        # mean-std selection
-        # use nonzero to turn the index from bool to int, because the index
-        # will be used to compose a two-dim index below.
-        # use squeeze rather than reshape to avoid errors when no score is
-        # larger than the threshold.
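A minimal sketch of the mean-plus-std selection described in the comments just above, with toy scores; the 0.1 floor and 0.4 cap mirror the constants used in the code that follows, while the score values are invented:

import paddle

scores = paddle.to_tensor([0.05, 0.12, 0.2, 0.35, 0.9])
cand = scores[scores >= 0.1]                       # candidates above the floor
pos_thresh = (cand.mean() + cand.std()).clip(max=0.4)
pos = scores >= pos_thresh                         # confident positives
hard_neg = (scores >= 0.1) & (scores < pos_thresh)  # kept as hard negatives
print(float(pos_thresh), pos.numpy(), hard_neg.numpy())

The threshold adapts per image to the teacher's score distribution, so easy images keep more positives while noisy ones fall back to the 0.4 cap.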
-        candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1)
-        num_candidate = candidate_ind.shape[0]
-        if num_candidate > 0:
-            # pos thresh = mean + std to select pos samples
-            candidate_score = max_vals[candidate_ind]
-            candidate_score_mean = candidate_score.mean()
-            candidate_score_std = candidate_score.std()
-            pos_thresh = (candidate_score_mean + candidate_score_std).clip(
-                max=0.4)
-            # select pos
-            pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1)
-            num_pos = pos_ind.shape[0]
-            # select hard negatives as potential pos
-            hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh)
-            hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1)
-            num_hard_neg = hard_neg_ind.shape[0]
-        # if there are no positives, directly select the top-10 as pos.
-        if (num_pos == 0):
-            num_pos = 10
-            _, pos_ind = paddle.topk(max_vals, k=num_pos)
-        cls_mask[pos_ind] = 1.
-
-        ### Consistency Regularization Training targets
-        # cls targets
-        pos_class_ind = class_ind[pos_ind]
-        cls_targets = paddle.zeros_like(tea_cls)
-        cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind,
-                                                             pos_class_ind]
-        # hard negative cls target
-        if num_hard_neg != 0:
-            cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind]
-        # loc targets
-        loc_targets = paddle.zeros_like(tea_loc)
-        loc_targets[pos_ind] = tea_loc[pos_ind]
-        # iou targets
-        iou_targets = paddle.zeros(
-            shape=[tea_iou.shape[0]], dtype=tea_iou.dtype)
-        iou_targets[pos_ind] = F.sigmoid(
-            paddle.squeeze(
-                tea_iou, axis=-1)[pos_ind])
-
-        loc_mask = cls_mask.clone()
-        # select potential positives from hard negatives for loc_task training
-        if (num_hard_neg > 0) and self.hard_neg_mining_flag:
-            results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind,
-                                           hard_neg_ind, loc_mask, loc_targets)
-            if results is not None:
-                loc_mask, loc_targets = results
-                loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
-                iou_targets[loc_pos_ind] = F.sigmoid(
-                    paddle.squeeze(
-                        tea_iou, axis=-1)[loc_pos_ind])
-
-        return cls_mask, loc_mask, \
-            cls_targets, loc_targets, iou_targets
-
-    def forward(self, student_prediction, teacher_prediction):
-        stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction
-        tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction
-
-        # H and W of level (used for aggregating targets)
-        self.lvl_hw = []
-        for t in tea_cls_lvl:
-            _, _, H, W = t.shape
-            self.lvl_hw.append([H, W])
-
-        # levels to images
-        stu_cls_img = levels_to_images(stu_cls_lvl)
-        stu_loc_img = levels_to_images(stu_loc_lvl)
-        stu_iou_img = levels_to_images(stu_iou_lvl)
-        tea_cls_img = levels_to_images(tea_cls_lvl)
-        tea_loc_img = levels_to_images(tea_loc_lvl)
-        tea_iou_img = levels_to_images(tea_iou_lvl)
-
-        with paddle.no_grad():
-            cls_mask, loc_mask, \
-                cls_targets, loc_targets, iou_targets = multi_apply(
-                    self.get_targets_per_img,
-                    tea_cls_img,
-                    tea_loc_img,
-                    tea_iou_img,
-                    stu_cls_img,
-                    stu_loc_img,
-                    stu_iou_img
-                )
-
-        # flatten predictions
-        stu_cls = paddle.concat(stu_cls_img, axis=0)
-        stu_loc = paddle.concat(stu_loc_img, axis=0)
-        stu_iou = paddle.concat(stu_iou_img, axis=0)
-        # flatten targets
-        cls_mask = paddle.concat(cls_mask, axis=0)
-        loc_mask = paddle.concat(loc_mask, axis=0)
-        cls_targets = paddle.concat(cls_targets, axis=0)
-        loc_targets = paddle.concat(loc_targets, axis=0)
-        iou_targets = paddle.concat(iou_targets, axis=0)
-
-        ### Training Weights and avg factor
-        # find positives
-        cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1)
-        loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
-        # cls
weight - cls_sample_weights = paddle.ones([cls_targets.shape[0]]) - cls_avg_factor = paddle.max(cls_targets[cls_pos_ind], - axis=-1).sum().item() - # loc weight - loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1) - loc_avg_factor = loc_sample_weights.sum().item() - # iou weight - iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]]) - iou_avg_factor = loc_pos_ind.shape[0] - - ### unsupervised loss - # cls loss - loss_cls = self.quality_focal_loss( - stu_cls, - cls_targets, - quality=stu_iou, - weights=cls_sample_weights, - avg_factor=cls_avg_factor) * self.cls_weight - # iou loss - pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind] - pos_iou_targets = iou_targets[loc_pos_ind] - loss_iou = F.binary_cross_entropy( - F.sigmoid(pos_stu_iou), pos_iou_targets, - reduction='none') * iou_sample_weights - loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight - # box loss - pos_stu_loc = stu_loc[loc_pos_ind] - pos_loc_targets = loc_targets[loc_pos_ind] - - loss_box = self.iou_loss( - pos_stu_loc, - pos_loc_targets, - weights=loc_sample_weights, - avg_factor=loc_avg_factor) - loss_box = loss_box * self.reg_weight - - loss_all = { - "loss_cls": loss_cls, - "loss_box": loss_box, - "loss_iou": loss_iou, - } - return loss_all diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/focal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/focal_loss.py deleted file mode 100644 index b9a64e1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/focal_loss.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn.functional as F -import paddle.nn as nn -from ppdet.core.workspace import register - -__all__ = ['FocalLoss', 'Weighted_FocalLoss'] - -@register -class FocalLoss(nn.Layer): - """A wrapper around paddle.nn.functional.sigmoid_focal_loss. - Args: - use_sigmoid (bool): currently only support use_sigmoid=True - alpha (float): parameter alpha in Focal Loss - gamma (float): parameter gamma in Focal Loss - loss_weight (float): final loss will be multiplied by this - """ - def __init__(self, - use_sigmoid=True, - alpha=0.25, - gamma=2.0, - loss_weight=1.0): - super(FocalLoss, self).__init__() - assert use_sigmoid == True, \ - 'Focal Loss only supports sigmoid at the moment' - self.use_sigmoid = use_sigmoid - self.alpha = alpha - self.gamma = gamma - self.loss_weight = loss_weight - - def forward(self, pred, target, reduction='none'): - """forward function. 
- Args: - pred (Tensor): logits of class prediction, of shape (N, num_classes) - target (Tensor): target class label, of shape (N, ) - reduction (str): the way to reduce loss, one of (none, sum, mean) - """ - num_classes = pred.shape[1] - target = F.one_hot(target, num_classes+1).cast(pred.dtype) - target = target[:, :-1].detach() - loss = F.sigmoid_focal_loss( - pred, target, alpha=self.alpha, gamma=self.gamma, - reduction=reduction) - return loss * self.loss_weight - - -@register -class Weighted_FocalLoss(FocalLoss): - """A wrapper around paddle.nn.functional.sigmoid_focal_loss. - Args: - use_sigmoid (bool): currently only support use_sigmoid=True - alpha (float): parameter alpha in Focal Loss - gamma (float): parameter gamma in Focal Loss - loss_weight (float): final loss will be multiplied by this - """ - def __init__(self, - use_sigmoid=True, - alpha=0.25, - gamma=2.0, - loss_weight=1.0, - reduction="mean"): - super(FocalLoss, self).__init__() - assert use_sigmoid == True, \ - 'Focal Loss only supports sigmoid at the moment' - self.use_sigmoid = use_sigmoid - self.alpha = alpha - self.gamma = gamma - self.loss_weight = loss_weight - self.reduction = reduction - - def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): - """forward function. - Args: - pred (Tensor): logits of class prediction, of shape (N, num_classes) - target (Tensor): target class label, of shape (N, ) - reduction (str): the way to reduce loss, one of (none, sum, mean) - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - num_classes = pred.shape[1] - target = F.one_hot(target, num_classes + 1).astype(pred.dtype) - target = target[:, :-1].detach() - loss = F.sigmoid_focal_loss( - pred, target, alpha=self.alpha, gamma=self.gamma, - reduction='none') - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - return loss * self.loss_weight diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/gfocal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/gfocal_loss.py deleted file mode 100644 index 37e27f0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/gfocal_loss.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The code is based on:
-# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import numpy as np
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from ppdet.core.workspace import register, serializable
-from ppdet.modeling import ops
-
-__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
-
-
-def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
-    """
-    Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
-    Qualified and Distributed Bounding Boxes for Dense Object Detection
-    <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        pred (Tensor): Predicted joint representation of classification
-            and quality (IoU) estimation with shape (N, C), C is the number of
-            classes.
-        target (tuple([Tensor])): Target category label with shape (N,)
-            and target quality label with shape (N,).
-        beta (float): The beta parameter for calculating the modulating factor.
-            Defaults to 2.0.
-    Returns:
-        Tensor: Loss tensor with shape (N,).
-    """
-    assert len(target) == 2, """target for QFL must be a tuple of two elements,
-        including category label and quality label, respectively"""
-    # label denotes the category id, score denotes the quality score
-    label, score = target
-    if use_sigmoid:
-        func = F.binary_cross_entropy_with_logits
-    else:
-        func = F.binary_cross_entropy
-
-    # negatives are supervised by 0 quality score
-    pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred
-    scale_factor = pred_sigmoid
-    zerolabel = paddle.zeros(pred.shape, dtype='float32')
-    loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta)
-
-    # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
-    bg_class_ind = pred.shape[1]
-    pos = paddle.logical_and((label >= 0),
-                             (label < bg_class_ind)).nonzero().squeeze(1)
-    if pos.shape[0] == 0:
-        return loss.sum(axis=1)
-    pos_label = paddle.gather(label, pos, axis=0)
-    pos_mask = np.zeros(pred.shape, dtype=np.int32)
-    pos_mask[pos.numpy(), pos_label.numpy()] = 1
-    pos_mask = paddle.to_tensor(pos_mask, dtype='bool')
-    score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32')
-    # positives are supervised by bbox quality (IoU) score
-    scale_factor_new = score - pred_sigmoid
-
-    loss_pos = func(
-        pred, score, reduction='none') * scale_factor_new.abs().pow(beta)
-    loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask
-    loss = loss.sum(axis=1)
-    return loss
-
-
-def distribution_focal_loss(pred, label):
-    """Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning
-    Qualified and Distributed Bounding Boxes for Dense Object Detection
-    <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        pred (Tensor): Predicted general distribution of bounding boxes
-            (before softmax) with shape (N, n+1), n is the max value of the
-            integral set `{0, ..., n}` in paper.
-        label (Tensor): Target distance label for bounding boxes with
-            shape (N,).
-    Returns:
-        Tensor: Loss tensor with shape (N,).
-    """
-    dis_left = label.cast('int64')
-    dis_right = dis_left + 1
-    weight_left = dis_right.cast('float32') - label
-    weight_right = label - dis_left.cast('float32')
-    loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \
-        + F.cross_entropy(pred, dis_right, reduction='none') * weight_right
-    return loss
-
-
-@register
-@serializable
-class QualityFocalLoss(nn.Layer):
-    r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss:
-    Learning Qualified and Distributed Bounding Boxes for Dense Object
-    Detection <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
-            Defaults to True.
-        beta (float): The beta parameter for calculating the modulating factor.
-            Defaults to 2.0.
-        reduction (str): Options are "none", "mean" and "sum".
-        loss_weight (float): Loss weight of current loss.
-    """
-
-    def __init__(self,
-                 use_sigmoid=True,
-                 beta=2.0,
-                 reduction='mean',
-                 loss_weight=1.0):
-        super(QualityFocalLoss, self).__init__()
-        self.use_sigmoid = use_sigmoid
-        self.beta = beta
-        assert reduction in ('none', 'mean', 'sum')
-        self.reduction = reduction
-        self.loss_weight = loss_weight
-
-    def forward(self, pred, target, weight=None, avg_factor=None):
-        """Forward function.
-        Args:
-            pred (Tensor): Predicted joint representation of
-                classification and quality (IoU) estimation with shape (N, C),
-                C is the number of classes.
-            target (tuple([Tensor])): Target category label with shape
-                (N,) and target quality label with shape (N,).
-            weight (Tensor, optional): The weight of loss for each
-                prediction. Defaults to None.
-            avg_factor (int, optional): Average factor that is used to average
-                the loss. Defaults to None.
-        """
-
-        loss = self.loss_weight * quality_focal_loss(
-            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)
-
-        if weight is not None:
-            loss = loss * weight
-        if avg_factor is None:
-            if self.reduction == 'none':
-                return loss
-            elif self.reduction == 'mean':
-                return loss.mean()
-            elif self.reduction == 'sum':
-                return loss.sum()
-        else:
-            # if reduction is mean, then average the loss by avg_factor
-            if self.reduction == 'mean':
-                loss = loss.sum() / avg_factor
-            # if reduction is 'none', then do nothing, otherwise raise an error
-            elif self.reduction != 'none':
-                raise ValueError(
-                    'avg_factor can not be used with reduction="sum"')
-        return loss
-
-
-@register
-@serializable
-class DistributionFocalLoss(nn.Layer):
-    """Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss:
-    Learning Qualified and Distributed Bounding Boxes for Dense Object
-    Detection <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
-        loss_weight (float): Loss weight of current loss.
-    """
-
-    def __init__(self, reduction='mean', loss_weight=1.0):
-        super(DistributionFocalLoss, self).__init__()
-        assert reduction in ('none', 'mean', 'sum')
-        self.reduction = reduction
-        self.loss_weight = loss_weight
-
-    def forward(self, pred, target, weight=None, avg_factor=None):
-        """Forward function.
-        Args:
-            pred (Tensor): Predicted general distribution of bounding
-                boxes (before softmax) with shape (N, n+1), n is the max value
-                of the integral set `{0, ..., n}` in paper.
-            target (Tensor): Target distance label for bounding boxes
-                with shape (N,).
-            weight (Tensor, optional): The weight of loss for each
-                prediction. Defaults to None.
-            avg_factor (int, optional): Average factor that is used to average
-                the loss. Defaults to None.
- """ - loss = self.loss_weight * distribution_focal_loss(pred, target) - if weight is not None: - loss = loss * weight - if avg_factor is None: - if self.reduction == 'none': - return loss - elif self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if self.reduction == 'mean': - loss = loss.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif self.reduction != 'none': - raise ValueError( - 'avg_factor can not be used with reduction="sum"') - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_aware_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/iou_aware_loss.py deleted file mode 100644 index 4a9e904..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_aware_loss.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from .iou_loss import IouLoss -from ..bbox_utils import bbox_iou - - -@register -@serializable -class IouAwareLoss(IouLoss): - """ - iou aware loss, see https://arxiv.org/abs/1912.05992 - Args: - loss_weight (float): iou aware loss weight, default is 1.0 - max_height (int): max height of input to support random shape input - max_width (int): max width of input to support random shape input - """ - - def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False): - super(IouAwareLoss, self).__init__( - loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou) - - def __call__(self, ioup, pbox, gbox): - iou = bbox_iou( - pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) - iou.stop_gradient = True - loss_iou_aware = F.binary_cross_entropy_with_logits( - ioup, iou, reduction='none') - loss_iou_aware = loss_iou_aware * self.loss_weight - return loss_iou_aware diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/iou_loss.py deleted file mode 100644 index b5cac22..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_loss.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
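Note on the IouAwareLoss removed above: it trains an IoU-prediction logit with binary cross-entropy against the detached IoU of each predicted/ground-truth box pair. A minimal sketch of the same idea in plain Paddle; the toy_bbox_iou helper and the example boxes below are illustrative stand-ins, not ppdet API:

import paddle
import paddle.nn.functional as F

def toy_bbox_iou(pbox, gbox, eps=1e-10):
    # IoU of axis-aligned (x1, y1, x2, y2) boxes; stand-in for ppdet's bbox_iou.
    x1 = paddle.maximum(pbox[:, 0], gbox[:, 0])
    y1 = paddle.maximum(pbox[:, 1], gbox[:, 1])
    x2 = paddle.minimum(pbox[:, 2], gbox[:, 2])
    y2 = paddle.minimum(pbox[:, 3], gbox[:, 3])
    inter = (x2 - x1).clip(0) * (y2 - y1).clip(0)
    area_p = (pbox[:, 2] - pbox[:, 0]) * (pbox[:, 3] - pbox[:, 1])
    area_g = (gbox[:, 2] - gbox[:, 0]) * (gbox[:, 3] - gbox[:, 1])
    return inter / (area_p + area_g - inter + eps)

ioup = paddle.randn([4])                          # predicted IoU logits
pbox = paddle.to_tensor([[0., 0., 2., 2.]] * 4)   # predicted boxes
gbox = paddle.to_tensor([[1., 1., 3., 3.]] * 4)   # ground-truth boxes
iou = toy_bbox_iou(pbox, gbox)
iou.stop_gradient = True                          # the IoU target carries no gradient
loss_iou_aware = F.binary_cross_entropy_with_logits(ioup, iou, reduction='none')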
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import math -import paddle - -from ppdet.core.workspace import register, serializable -from ..bbox_utils import bbox_iou - -__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss'] - - -@register -@serializable -class IouLoss(object): - """ - iou loss, see https://arxiv.org/abs/1908.03851 - loss = 1.0 - iou * iou - Args: - loss_weight (float): iou loss weight, default is 2.5 - max_height (int): max height of input to support random shape input - max_width (int): max width of input to support random shape input - ciou_term (bool): whether to add ciou_term - loss_square (bool): whether to square the iou term - """ - - def __init__(self, - loss_weight=2.5, - giou=False, - diou=False, - ciou=False, - loss_square=True): - self.loss_weight = loss_weight - self.giou = giou - self.diou = diou - self.ciou = ciou - self.loss_square = loss_square - - def __call__(self, pbox, gbox): - iou = bbox_iou( - pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) - if self.loss_square: - loss_iou = 1 - iou * iou - else: - loss_iou = 1 - iou - - loss_iou = loss_iou * self.loss_weight - return loss_iou - - -@register -@serializable -class GIoULoss(object): - """ - Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 - Args: - loss_weight (float): giou loss weight, default as 1 - eps (float): epsilon to avoid divide by zero, default as 1e-10 - reduction (string): Options are "none", "mean" and "sum". default as none - """ - - def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): - self.loss_weight = loss_weight - self.eps = eps - assert reduction in ('none', 'mean', 'sum') - self.reduction = reduction - - def bbox_overlap(self, box1, box2, eps=1e-10): - """calculate the iou of box1 and box2 - Args: - box1 (Tensor): box1 with the shape (..., 4) - box2 (Tensor): box1 with the shape (..., 4) - eps (float): epsilon to avoid divide by zero - Return: - iou (Tensor): iou of box1 and box2 - overlap (Tensor): overlap of box1 and box2 - union (Tensor): union of box1 and box2 - """ - x1, y1, x2, y2 = box1 - x1g, y1g, x2g, y2g = box2 - - xkis1 = paddle.maximum(x1, x1g) - ykis1 = paddle.maximum(y1, y1g) - xkis2 = paddle.minimum(x2, x2g) - ykis2 = paddle.minimum(y2, y2g) - w_inter = (xkis2 - xkis1).clip(0) - h_inter = (ykis2 - ykis1).clip(0) - overlap = w_inter * h_inter - - area1 = (x2 - x1) * (y2 - y1) - area2 = (x2g - x1g) * (y2g - y1g) - union = area1 + area2 - overlap + eps - iou = overlap / union - - return iou, overlap, union - - def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None): - x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) - x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) - box1 = [x1, y1, x2, y2] - box2 = [x1g, y1g, x2g, y2g] - iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) - xc1 = paddle.minimum(x1, x1g) - yc1 = paddle.minimum(y1, y1g) - xc2 = paddle.maximum(x2, x2g) - yc2 = paddle.maximum(y2, y2g) - - area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps - miou = iou - ((area_c - union) / area_c) - if loc_reweight is not None: - loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) - loc_thresh = 0.9 - giou = 1 - (1 - loc_thresh - ) * miou - loc_thresh * miou * loc_reweight - else: - giou = 1 - miou - if self.reduction == 'none': - loss = giou - elif self.reduction == 'sum': - loss = paddle.sum(giou * iou_weight) - else: - loss = paddle.mean(giou * iou_weight) - return loss * 
self.loss_weight - - -@register -@serializable -class DIouLoss(GIoULoss): - """ - Distance-IoU Loss, see https://arxiv.org/abs/1911.08287 - Args: - loss_weight (float): giou loss weight, default as 1 - eps (float): epsilon to avoid divide by zero, default as 1e-10 - use_complete_iou_loss (bool): whether to use complete iou loss - """ - - def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True): - super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps) - self.use_complete_iou_loss = use_complete_iou_loss - - def __call__(self, pbox, gbox, iou_weight=1.): - x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) - x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) - cx = (x1 + x2) / 2 - cy = (y1 + y2) / 2 - w = x2 - x1 - h = y2 - y1 - - cxg = (x1g + x2g) / 2 - cyg = (y1g + y2g) / 2 - wg = x2g - x1g - hg = y2g - y1g - - x2 = paddle.maximum(x1, x2) - y2 = paddle.maximum(y1, y2) - - # A and B - xkis1 = paddle.maximum(x1, x1g) - ykis1 = paddle.maximum(y1, y1g) - xkis2 = paddle.minimum(x2, x2g) - ykis2 = paddle.minimum(y2, y2g) - - # A or B - xc1 = paddle.minimum(x1, x1g) - yc1 = paddle.minimum(y1, y1g) - xc2 = paddle.maximum(x2, x2g) - yc2 = paddle.maximum(y2, y2g) - - intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) - intsctk = intsctk * paddle.greater_than( - xkis2, xkis1) * paddle.greater_than(ykis2, ykis1) - unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g - ) - intsctk + self.eps - iouk = intsctk / unionk - - # DIOU term - dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) - dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) - diou_term = (dist_intersection + self.eps) / (dist_union + self.eps) - - # CIOU term - ciou_term = 0 - if self.use_complete_iou_loss: - ar_gt = wg / hg - ar_pred = w / h - arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred) - ar_loss = 4. / np.pi / np.pi * arctan * arctan - alpha = ar_loss / (1 - iouk + ar_loss + self.eps) - alpha.stop_gradient = True - ciou_term = alpha * ar_loss - - diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) - - return diou * self.loss_weight - - -@register -@serializable -class SIoULoss(GIoULoss): - """ - see https://arxiv.org/pdf/2205.12740.pdf - Args: - loss_weight (float): siou loss weight, default as 1 - eps (float): epsilon to avoid divide by zero, default as 1e-10 - theta (float): default as 4 - reduction (str): Options are "none", "mean" and "sum". 
default as none - """ - - def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'): - super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps) - self.loss_weight = loss_weight - self.eps = eps - self.theta = theta - self.reduction = reduction - - def __call__(self, pbox, gbox): - x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) - x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) - - box1 = [x1, y1, x2, y2] - box2 = [x1g, y1g, x2g, y2g] - iou = bbox_iou(box1, box2) - - cx = (x1 + x2) / 2 - cy = (y1 + y2) / 2 - w = x2 - x1 + self.eps - h = y2 - y1 + self.eps - - cxg = (x1g + x2g) / 2 - cyg = (y1g + y2g) / 2 - wg = x2g - x1g + self.eps - hg = y2g - y1g + self.eps - - x2 = paddle.maximum(x1, x2) - y2 = paddle.maximum(y1, y2) - - # A or B - xc1 = paddle.minimum(x1, x1g) - yc1 = paddle.minimum(y1, y1g) - xc2 = paddle.maximum(x2, x2g) - yc2 = paddle.maximum(y2, y2g) - - cw_out = xc2 - xc1 - ch_out = yc2 - yc1 - - ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg) - cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg) - - # angle cost - dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2) - sin_angle_alpha = ch / dist_intersection - sin_angle_beta = cw / dist_intersection - thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2 - thred.stop_gradient = True - sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta, - sin_angle_alpha) - angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2) - - # distance cost - gamma = 2 - angle_cost - # gamma.stop_gradient = True - beta_x = ((cxg - cx) / cw_out)**2 - beta_y = ((cyg - cy) / ch_out)**2 - dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma * - beta_y) - - # shape cost - omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg) - omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg) - omega = (1 - paddle.exp(-omega_w))**self.theta + ( - 1 - paddle.exp(-omega_h))**self.theta - siou_loss = 1 - iou + (omega + dist_cost) / 2 - - if self.reduction == 'mean': - siou_loss = paddle.mean(siou_loss) - elif self.reduction == 'sum': - siou_loss = paddle.sum(siou_loss) - - return siou_loss * self.loss_weight diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/jde_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/jde_loss.py deleted file mode 100644 index 5c3b5a6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/jde_loss.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
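Note on the SIoULoss removed above: it augments the IoU term with angle, distance, and shape costs. A quick standalone check of the shape-cost term alone, mirroring the omega_w/omega_h computation in the removed code (the box sizes here are made up for illustration):

import paddle

theta = 4.0                                                # same default as the removed SIoULoss
w, h = paddle.to_tensor([4.0]), paddle.to_tensor([2.0])    # predicted box size
wg, hg = paddle.to_tensor([5.0]), paddle.to_tensor([2.5])  # ground-truth box size

omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg)
omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg)
shape_cost = (1 - paddle.exp(-omega_w)) ** theta + (1 - paddle.exp(-omega_h)) ** theta
# shape_cost shrinks toward 0 as the predicted width/height approach the targets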
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -__all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss'] - - -@register -class JDEDetectionLoss(nn.Layer): - __shared__ = ['num_classes'] - - def __init__(self, num_classes=1, for_mot=True): - super(JDEDetectionLoss, self).__init__() - self.num_classes = num_classes - self.for_mot = for_mot - - def det_loss(self, p_det, anchor, t_conf, t_box): - pshape = paddle.shape(p_det) - pshape.stop_gradient = True - nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1] - nA = len(anchor) - p_det = paddle.reshape( - p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose( - (0, 1, 3, 4, 2)) - - # 1. loss_conf: cross_entropy - p_conf = p_det[:, :, :, :, 4:6] - p_conf_flatten = paddle.reshape(p_conf, [-1, 2]) - t_conf_flatten = t_conf.flatten() - t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64") - t_conf_flatten.stop_gradient = True - loss_conf = F.cross_entropy( - p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean') - loss_conf.stop_gradient = False - - # 2. loss_box: smooth_l1_loss - p_box = p_det[:, :, :, :, :4] - p_box_flatten = paddle.reshape(p_box, [-1, 4]) - t_box_flatten = paddle.reshape(t_box, [-1, 4]) - fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten() - if fg_inds.numel() > 0: - reg_delta = paddle.gather(p_box_flatten, fg_inds) - reg_target = paddle.gather(t_box_flatten, fg_inds) - else: - reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32') - reg_delta.stop_gradient = False - reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32') - reg_target.stop_gradient = True - loss_box = F.smooth_l1_loss( - reg_delta, reg_target, reduction='mean', delta=1.0) - loss_box.stop_gradient = False - - return loss_conf, loss_box - - def forward(self, det_outs, targets, anchors): - """ - Args: - det_outs (list[Tensor]): output from detection head, each one - is a 4-D Tensor with shape [N, C, H, W]. - targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image', - 'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of - each FPN level. - anchors (list[list]): anchor setting of JDE model, N row M col, N is - the anchor levels(FPN levels), M is the anchor scales each - level. 
-        """
-        assert len(det_outs) == len(anchors)
-        loss_confs = []
-        loss_boxes = []
-        for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)):
-            t_conf = targets['tconf{}'.format(i)]
-            t_box = targets['tbox{}'.format(i)]
-
-            loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box)
-            loss_confs.append(loss_conf)
-            loss_boxes.append(loss_box)
-        if self.for_mot:
-            return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes}
-        else:
-            jde_conf_losses = sum(loss_confs)
-            jde_box_losses = sum(loss_boxes)
-            jde_det_losses = {
-                "loss_conf": jde_conf_losses,
-                "loss_box": jde_box_losses,
-                "loss": jde_conf_losses + jde_box_losses,
-            }
-            return jde_det_losses
-
-
-@register
-class JDEEmbeddingLoss(nn.Layer):
-    def __init__(self, ):
-        super(JDEEmbeddingLoss, self).__init__()
-        self.phony = self.create_parameter(shape=[1], dtype="float32")
-
-    def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier):
-        emb_dim = p_ide.shape[1]
-        p_ide = p_ide.transpose((0, 2, 3, 1))
-        p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim])
-        mask = t_conf > 0
-        mask = paddle.cast(mask, dtype="int64")
-        mask.stop_gradient = True
-        emb_mask = mask.max(1).flatten()
-        emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()
-        emb_mask_inds.stop_gradient = True
-        # use max(1) to decide the id; TODO: find a more reasonable strategy
-        t_ide_flatten = t_ide.max(1).flatten()
-        t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64")
-        valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten()
-
-        if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0:
-            # loss_ide = paddle.to_tensor([0])  # would raise an error in gradient backward
-            loss_ide = self.phony * 0  # TODO
-        else:
-            embedding = paddle.gather(p_ide_flatten, emb_mask_inds)
-            embedding = emb_scale * F.normalize(embedding)
-            logits = classifier(embedding)
-
-            ide_target = paddle.gather(t_ide_flatten, emb_mask_inds)
-
-            loss_ide = F.cross_entropy(
-                logits, ide_target, ignore_index=-1, reduction='mean')
-            loss_ide.stop_gradient = False
-
-        return loss_ide
-
-    def forward(self, ide_outs, targets, emb_scale, classifier):
-        loss_ides = []
-        for i, p_ide in enumerate(ide_outs):
-            t_conf = targets['tconf{}'.format(i)]
-            t_ide = targets['tide{}'.format(i)]
-
-            loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale,
-                                     classifier)
-            loss_ides.append(loss_ide)
-        return loss_ides
-
-
-@register
-class JDELoss(nn.Layer):
-    def __init__(self):
-        super(JDELoss, self).__init__()
-
-    def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls,
-                loss_params_reg, loss_params_ide, targets):
-        assert len(loss_confs) == len(loss_boxes) == len(loss_ides)
-        assert len(loss_params_cls) == len(loss_params_reg) == len(
-            loss_params_ide)
-        assert len(loss_confs) == len(loss_params_cls)
-
-        batchsize = targets['gt_bbox'].shape[0]
-        nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[
-            0] / batchsize
-        nTargets = paddle.to_tensor(nTargets, dtype='float32')
-        nTargets.stop_gradient = True
-
-        jde_losses = []
-        for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p,
-                l_ide_p) in enumerate(
-                    zip(loss_confs, loss_boxes, loss_ides, loss_params_cls,
                        loss_params_reg, loss_params_ide)):
-
-            jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p(
-                loss_ide)
-            jde_losses.append(jde_loss)
-
-        loss_all = {
-            "loss_conf": sum(loss_confs),
-            "loss_box": sum(loss_boxes),
-            "loss_ide": sum(loss_ides),
-            "loss": sum(jde_losses),
-            "nTargets": nTargets,
-        }
-        return loss_all
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/keypoint_loss.py
b/pdfdet/models/Paddle/ppdet/modeling/losses/keypoint_loss.py deleted file mode 100644 index 37a2410..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/keypoint_loss.py +++ /dev/null @@ -1,632 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from itertools import cycle, islice -from collections import abc -import numpy as np -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register, serializable - -__all__ = ['HrHRNetLoss', 'KeyPointMSELoss', 'OKSLoss', 'CenterFocalLoss', 'L1Loss'] - - -@register -@serializable -class KeyPointMSELoss(nn.Layer): - def __init__(self, use_target_weight=True, loss_scale=0.5): - """ - KeyPointMSELoss layer - - Args: - use_target_weight (bool): whether to use target weight - """ - super(KeyPointMSELoss, self).__init__() - self.criterion = nn.MSELoss(reduction='mean') - self.use_target_weight = use_target_weight - self.loss_scale = loss_scale - - def forward(self, output, records): - target = records['target'] - target_weight = records['target_weight'] - batch_size = output.shape[0] - num_joints = output.shape[1] - heatmaps_pred = output.reshape( - (batch_size, num_joints, -1)).split(num_joints, 1) - heatmaps_gt = target.reshape( - (batch_size, num_joints, -1)).split(num_joints, 1) - loss = 0 - for idx in range(num_joints): - heatmap_pred = heatmaps_pred[idx].squeeze() - heatmap_gt = heatmaps_gt[idx].squeeze() - if self.use_target_weight: - loss += self.loss_scale * self.criterion( - heatmap_pred.multiply(target_weight[:, idx]), - heatmap_gt.multiply(target_weight[:, idx])) - else: - loss += self.loss_scale * self.criterion(heatmap_pred, - heatmap_gt) - keypoint_losses = dict() - keypoint_losses['loss'] = loss / num_joints - return keypoint_losses - - -@register -@serializable -class HrHRNetLoss(nn.Layer): - def __init__(self, num_joints, swahr): - """ - HrHRNetLoss layer - - Args: - num_joints (int): number of keypoints - """ - super(HrHRNetLoss, self).__init__() - if swahr: - self.heatmaploss = HeatMapSWAHRLoss(num_joints) - else: - self.heatmaploss = HeatMapLoss() - self.aeloss = AELoss() - self.ziploss = ZipLoss( - [self.heatmaploss, self.heatmaploss, self.aeloss]) - - def forward(self, inputs, records): - targets = [] - targets.append([records['heatmap_gt1x'], records['mask_1x']]) - targets.append([records['heatmap_gt2x'], records['mask_2x']]) - targets.append(records['tagmap']) - keypoint_losses = dict() - loss = self.ziploss(inputs, targets) - keypoint_losses['heatmap_loss'] = loss[0] + loss[1] - keypoint_losses['pull_loss'] = loss[2][0] - keypoint_losses['push_loss'] = loss[2][1] - keypoint_losses['loss'] = recursive_sum(loss) - return keypoint_losses - - -class HeatMapLoss(object): - def __init__(self, loss_factor=1.0): - super(HeatMapLoss, self).__init__() - self.loss_factor = loss_factor - - def __call__(self, preds, targets): - heatmap, 
mask = targets - loss = ((preds - heatmap)**2 * mask.cast('float').unsqueeze(1)) - loss = paddle.clip(loss, min=0, max=2).mean() - loss *= self.loss_factor - return loss - - -class HeatMapSWAHRLoss(object): - def __init__(self, num_joints, loss_factor=1.0): - super(HeatMapSWAHRLoss, self).__init__() - self.loss_factor = loss_factor - self.num_joints = num_joints - - def __call__(self, preds, targets): - heatmaps_gt, mask = targets - heatmaps_pred = preds[0] - scalemaps_pred = preds[1] - - heatmaps_scaled_gt = paddle.where(heatmaps_gt > 0, 0.5 * heatmaps_gt * ( - 1 + (1 + - (scalemaps_pred - 1.) * paddle.log(heatmaps_gt + 1e-10))**2), - heatmaps_gt) - - regularizer_loss = paddle.mean( - paddle.pow((scalemaps_pred - 1.) * (heatmaps_gt > 0).astype(float), - 2)) - omiga = 0.01 - # thres = 2**(-1/omiga), threshold for positive weight - hm_weight = heatmaps_scaled_gt**( - omiga - ) * paddle.abs(1 - heatmaps_pred) + paddle.abs(heatmaps_pred) * ( - 1 - heatmaps_scaled_gt**(omiga)) - - loss = (((heatmaps_pred - heatmaps_scaled_gt)**2) * - mask.cast('float').unsqueeze(1)) * hm_weight - loss = loss.mean() - loss = self.loss_factor * (loss + 1.0 * regularizer_loss) - return loss - - -class AELoss(object): - def __init__(self, pull_factor=0.001, push_factor=0.001): - super(AELoss, self).__init__() - self.pull_factor = pull_factor - self.push_factor = push_factor - - def apply_single(self, pred, tagmap): - if tagmap.numpy()[:, :, 3].sum() == 0: - return (paddle.zeros([1]), paddle.zeros([1])) - nonzero = paddle.nonzero(tagmap[:, :, 3] > 0) - if nonzero.shape[0] == 0: - return (paddle.zeros([1]), paddle.zeros([1])) - p_inds = paddle.unique(nonzero[:, 0]) - num_person = p_inds.shape[0] - if num_person == 0: - return (paddle.zeros([1]), paddle.zeros([1])) - - pull = 0 - tagpull_num = 0 - embs_all = [] - person_unvalid = 0 - for person_idx in p_inds.numpy(): - valid_single = tagmap[person_idx.item()] - validkpts = paddle.nonzero(valid_single[:, 3] > 0) - valid_single = paddle.index_select(valid_single, validkpts) - emb = paddle.gather_nd(pred, valid_single[:, :3]) - if emb.shape[0] == 1: - person_unvalid += 1 - mean = paddle.mean(emb, axis=0) - embs_all.append(mean) - pull += paddle.mean(paddle.pow(emb - mean, 2), axis=0) - tagpull_num += emb.shape[0] - pull /= max(num_person - person_unvalid, 1) - if num_person < 2: - return pull, paddle.zeros([1]) - - embs_all = paddle.stack(embs_all) - A = embs_all.expand([num_person, num_person]) - B = A.transpose([1, 0]) - diff = A - B - - diff = paddle.pow(diff, 2) - push = paddle.exp(-diff) - push = paddle.sum(push) - num_person - - push /= 2 * num_person * (num_person - 1) - return pull, push - - def __call__(self, preds, tagmaps): - bs = preds.shape[0] - losses = [ - self.apply_single(preds[i:i + 1].squeeze(), - tagmaps[i:i + 1].squeeze()) for i in range(bs) - ] - pull = self.pull_factor * sum(loss[0] for loss in losses) / len(losses) - push = self.push_factor * sum(loss[1] for loss in losses) / len(losses) - return pull, push - - -class ZipLoss(object): - def __init__(self, loss_funcs): - super(ZipLoss, self).__init__() - self.loss_funcs = loss_funcs - - def __call__(self, inputs, targets): - assert len(self.loss_funcs) == len(targets) >= len(inputs) - - def zip_repeat(*args): - longest = max(map(len, args)) - filled = [islice(cycle(x), longest) for x in args] - return zip(*filled) - - return tuple( - fn(x, y) - for x, y, fn in zip_repeat(inputs, targets, self.loss_funcs)) - - -def recursive_sum(inputs): - if isinstance(inputs, abc.Sequence): - return 
sum([recursive_sum(x) for x in inputs]) - return inputs - - -def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas): - if not kpt_gts.astype('bool').any(): - return kpt_preds.sum()*0 - - sigmas = paddle.to_tensor(sigmas, dtype=kpt_preds.dtype) - variances = (sigmas * 2)**2 - - assert kpt_preds.shape[0] == kpt_gts.shape[0] - kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1] // 2, 2)) - kpt_gts = kpt_gts.reshape((-1, kpt_gts.shape[-1] // 2, 2)) - - squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \ - (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2 - assert (kpt_valids.sum(-1) > 0).all() - squared_distance0 = squared_distance / ( - kpt_areas[:, None] * variances[None, :] * 2) - squared_distance1 = paddle.exp(-squared_distance0) - squared_distance1 = squared_distance1 * kpt_valids - oks = squared_distance1.sum(axis=1) / kpt_valids.sum(axis=1) - - return oks - - -def oks_loss(pred, - target, - weight, - valid=None, - area=None, - linear=False, - sigmas=None, - eps=1e-6, - avg_factor=None, - reduction=None): - """Oks loss. - - Computing the oks loss between a set of predicted poses and target poses. - The loss is calculated as negative log of oks. - - Args: - pred (Tensor): Predicted poses of format (x1, y1, x2, y2, ...), - shape (n, K*2). - target (Tensor): Corresponding gt poses, shape (n, K*2). - linear (bool, optional): If True, use linear scale of loss instead of - log scale. Default: False. - eps (float): Eps to avoid log(0). - - Returns: - Tensor: Loss tensor. - """ - oks = oks_overlaps(pred, target, valid, area, sigmas).clip(min=eps) - if linear: - loss = 1 - oks - else: - loss = -oks.log() - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - - return loss - -@register -@serializable -class OKSLoss(nn.Layer): - """OKSLoss. - - Computing the oks loss between a set of predicted poses and target poses. - - Args: - linear (bool): If True, use linear scale of loss instead of log scale. - Default: False. - eps (float): Eps to avoid log(0). - reduction (str): Options are "none", "mean" and "sum". - loss_weight (float): Weight of loss. 
- """ - - def __init__(self, - linear=False, - num_keypoints=17, - eps=1e-6, - reduction='mean', - loss_weight=1.0): - super(OKSLoss, self).__init__() - self.linear = linear - self.eps = eps - self.reduction = reduction - self.loss_weight = loss_weight - if num_keypoints == 17: - self.sigmas = np.array([ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, - 1.07, .87, .87, .89, .89 - ], dtype=np.float32) / 10.0 - elif num_keypoints == 14: - self.sigmas = np.array([ - .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, - .79, .79 - ]) / 10.0 - else: - raise ValueError(f'Unsupported keypoints number {num_keypoints}') - - def forward(self, - pred, - target, - valid, - area, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - valid (Tensor): The visible flag of the target pose. - area (Tensor): The area of the target pose. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. Options are "none", "mean" and "sum". - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - if (weight is not None) and (not paddle.any(weight > 0)) and ( - reduction != 'none'): - if pred.dim() == weight.dim() + 1: - weight = weight.unsqueeze(1) - return (pred * weight).sum() # 0 - if weight is not None and weight.dim() > 1: - # TODO: remove this in the future - # reduce the weight of shape (n, 4) to (n,) to match the - # iou_loss of shape (n,) - assert weight.shape == pred.shape - weight = weight.mean(-1) - loss = self.loss_weight * oks_loss( - pred, - target, - weight, - valid=valid, - area=area, - linear=self.linear, - sigmas=self.sigmas, - eps=self.eps, - reduction=reduction, - avg_factor=avg_factor, - **kwargs) - return loss - - -def center_focal_loss(pred, gt, weight=None, mask=None, avg_factor=None, reduction=None): - """Modified focal loss. Exactly the same as CornerNet. - Runs faster and costs a little bit more memory. - - Args: - pred (Tensor): The prediction with shape [bs, c, h, w]. - gt (Tensor): The learning target of the prediction in gaussian - distribution, with shape [bs, c, h, w]. - mask (Tensor): The valid mask. Defaults to None. 
- """ - if not gt.astype('bool').any(): - return pred.sum()*0 - pos_inds = gt.equal(1).astype('float32') - if mask is None: - neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') - else: - neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') * mask.equal(0).astype('float32') - - neg_weights = paddle.pow(1 - gt, 4) - - loss = 0 - - pos_loss = paddle.log(pred) * paddle.pow(1 - pred, 2) * pos_inds - neg_loss = paddle.log(1 - pred) * paddle.pow(pred, 2) * neg_weights * \ - neg_inds - - num_pos = pos_inds.astype('float32').sum() - pos_loss = pos_loss.sum() - neg_loss = neg_loss.sum() - - if num_pos == 0: - loss = loss - neg_loss - else: - loss = loss - (pos_loss + neg_loss) / num_pos - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - return loss - -@register -@serializable -class CenterFocalLoss(nn.Layer): - """CenterFocalLoss is a variant of focal loss. - - More details can be found in the `paper - `_ - - Args: - reduction (str): Options are "none", "mean" and "sum". - loss_weight (float): Loss weight of current loss. - """ - - def __init__(self, - reduction='none', - loss_weight=1.0): - super(CenterFocalLoss, self).__init__() - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - mask=None, - avg_factor=None, - reduction_override=None): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction in gaussian - distribution. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - mask (Tensor): The valid mask. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_reg = self.loss_weight * center_focal_loss( - pred, - target, - weight, - mask=mask, - reduction=reduction, - avg_factor=avg_factor) - return loss_reg - -def l1_loss(pred, target, weight=None, reduction='mean', avg_factor=None): - """L1 loss. - - Args: - pred (Tensor): The prediction. 
- target (Tensor): The learning target of the prediction. - - Returns: - Tensor: Calculated loss - """ - if not target.astype('bool').any(): - return pred.sum() * 0 - - assert pred.shape == target.shape - loss = paddle.abs(pred - target) - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - - return loss - -@register -@serializable -class L1Loss(nn.Layer): - """L1 loss. - - Args: - reduction (str, optional): The method to reduce the loss. - Options are "none", "mean" and "sum". - loss_weight (float, optional): The weight of loss. - """ - - def __init__(self, reduction='mean', loss_weight=1.0): - super(L1Loss, self).__init__() - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - avg_factor=None, - reduction_override=None): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_bbox = self.loss_weight * l1_loss( - pred, target, weight, reduction=reduction, avg_factor=avg_factor) - return loss_bbox - diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/pose3d_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/pose3d_loss.py deleted file mode 100644 index 4781d6e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/pose3d_loss.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from itertools import cycle, islice -from collections import abc -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -__all__ = ['Pose3DLoss'] - - -@register -@serializable -class Pose3DLoss(nn.Layer): - def __init__(self, weight_3d=1.0, weight_2d=0.0, reduction='none'): - """ - KeyPointMSELoss layer - - Args: - weight_3d (float): weight of 3d loss - weight_2d (float): weight of 2d loss - reduction (bool): whether use reduction to loss - """ - super(Pose3DLoss, self).__init__() - self.weight_3d = weight_3d - self.weight_2d = weight_2d - self.criterion_2dpose = nn.MSELoss(reduction=reduction) - self.criterion_3dpose = nn.L1Loss(reduction=reduction) - self.criterion_smoothl1 = nn.SmoothL1Loss( - reduction=reduction, delta=1.0) - self.criterion_vertices = nn.L1Loss() - - def forward(self, pred3d, pred2d, inputs): - """ - mpjpe: mpjpe loss between 3d joints - keypoint_2d_loss: 2d joints loss compute by criterion_2dpose - """ - gt_3d_joints = inputs['joints_3d'] - gt_2d_joints = inputs['joints_2d'] - has_3d_joints = inputs['has_3d_joints'] - has_2d_joints = inputs['has_2d_joints'] - - loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints) - loss = self.weight_3d * loss_3d - epoch = inputs['epoch_id'] - if self.weight_2d > 0: - weight = self.weight_2d * pow(0.1, (epoch // 8)) - if epoch > 8: - weight = 0 - loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d, - gt_2d_joints, has_2d_joints) - loss += weight * loss_2d - return loss - - -def filter_3d_joints(pred, gt, has_3d_joints): - """ - filter 3d joints - """ - gt = gt[has_3d_joints == 1] - gt = gt[:, :, :3] - pred = pred[has_3d_joints == 1] - - gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2 - gt = gt - gt_pelvis[:, None, :] - pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2 - pred = pred - pred_pelvis[:, None, :] - return pred, gt - - -def mpjpe(pred, gt, has_3d_joints): - """ - mPJPE loss - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2 - ).sum(axis=-1)).mean() - return error - - -def mpjpe_focal(pred, gt, has_3d_joints): - """ - mPJPE loss - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - mse_error = ((pred - gt)**2).sum(axis=-1) - mpjpe_error = paddle.sqrt(mse_error) - mean = mpjpe_error.mean() - std = mpjpe_error.std() - atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std) - mse_error *= atte - return mse_error.mean() - - -def mpjpe_mse(pred, gt, has_3d_joints, weight=1.): - """ - mPJPE loss - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = (((pred - gt)**2).sum(axis=-1)).mean() - return error - - -def mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d): - """ - mPJPE loss of self define criterion - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = paddle.sqrt(criterion_pose3d(pred, gt)).mean() - return error - - -@register -@serializable -def weighted_mpjpe(pred, gt, has_3d_joints): - """ - Weighted_mPJPE - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - weight = paddle.linalg.norm(pred, p=2, axis=-1) - weight = paddle.to_tensor( - [1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 
1., 1.]) - error = (weight * paddle.linalg.norm(pred - gt, p=2, axis=-1)).mean() - return error - - -@register -@serializable -def normed_mpjpe(pred, gt, has_3d_joints): - """ - Normalized MPJPE (scale only), adapted from: - https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py - """ - assert pred.shape == gt.shape - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - - norm_predicted = paddle.mean( - paddle.sum(pred**2, axis=3, keepdim=True), axis=2, keepdim=True) - norm_target = paddle.mean( - paddle.sum(gt * pred, axis=3, keepdim=True), axis=2, keepdim=True) - scale = norm_target / norm_predicted - return mpjpe(scale * pred, gt) - - -@register -@serializable -def mpjpe_np(pred, gt, has_3d_joints): - """ - mPJPE_NP - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = np.sqrt(((pred - gt)**2).sum(axis=-1)).mean() - return error - - -@register -@serializable -def mean_per_vertex_error(pred, gt, has_smpl): - """ - Compute mPVE - """ - pred = pred[has_smpl == 1] - gt = gt[has_smpl == 1] - with paddle.no_grad(): - error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean() - return error - - -@register -@serializable -def keypoint_2d_loss(criterion_keypoints, pred_keypoints_2d, gt_keypoints_2d, - has_pose_2d): - """ - Compute 2D reprojection loss if 2D keypoint annotations are available. - The confidence (conf) is binary and indicates whether the keypoints exist or not. - """ - conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone() - loss = (conf * criterion_keypoints( - pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean() - return loss - - -@register -@serializable -def keypoint_3d_loss(criterion_keypoints, pred_keypoints_3d, gt_keypoints_3d, - has_pose_3d): - """ - Compute 3D keypoint loss if 3D keypoint annotations are available. - """ - conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone() - gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone() - gt_keypoints_3d = gt_keypoints_3d[has_pose_3d == 1] - conf = conf[has_pose_3d == 1] - pred_keypoints_3d = pred_keypoints_3d[has_pose_3d == 1] - if len(gt_keypoints_3d) > 0: - gt_pelvis = (gt_keypoints_3d[:, 2, :] + gt_keypoints_3d[:, 3, :]) / 2 - gt_keypoints_3d = gt_keypoints_3d - gt_pelvis[:, None, :] - pred_pelvis = ( - pred_keypoints_3d[:, 2, :] + pred_keypoints_3d[:, 3, :]) / 2 - pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :] - return (conf * criterion_keypoints(pred_keypoints_3d, - gt_keypoints_3d)).mean() - else: - return paddle.to_tensor([1.]).fill_(0.) - - -@register -@serializable -def vertices_loss(criterion_vertices, pred_vertices, gt_vertices, has_smpl): - """ - Compute per-vertex loss if vertex annotations are available. - """ - pred_vertices_with_shape = pred_vertices[has_smpl == 1] - gt_vertices_with_shape = gt_vertices[has_smpl == 1] - if len(gt_vertices_with_shape) > 0: - return criterion_vertices(pred_vertices_with_shape, - gt_vertices_with_shape) - else: - return paddle.to_tensor([1.]).fill_(0.) 
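Note on the 3D pose losses above: every MPJPE variant first pelvis-centers both skeletons via filter_3d_joints (joints 2 and 3 are averaged as the pelvis) before measuring joint distances. A minimal sketch of that step plus plain MPJPE, with a fabricated batch:

import paddle

pred = paddle.rand([2, 14, 3])   # fabricated batch: 2 poses x 14 joints x (x, y, z)
gt = paddle.rand([2, 14, 3])

# Pelvis-center both skeletons, mirroring filter_3d_joints.
pred = pred - ((pred[:, 2, :] + pred[:, 3, :]) / 2)[:, None, :]
gt = gt - ((gt[:, 2, :] + gt[:, 3, :]) / 2)[:, None, :]

# Plain MPJPE: mean Euclidean distance over joints and batch.
error = paddle.sqrt(((pred - gt) ** 2).sum(axis=-1)).mean()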
- - -@register -@serializable -def rectify_pose(pose): - pose = pose.copy() - R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0] - R_root = cv2.Rodrigues(pose[:3])[0] - new_root = R_root.dot(R_mod) - pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3) - return pose diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/probiou_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/probiou_loss.py deleted file mode 100644 index c2a1c75..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/probiou_loss.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import paddle -import paddle.nn.functional as F - -from ppdet.core.workspace import register, serializable - -__all__ = ['ProbIoULoss'] - - -def gbb_form(boxes): - xy, wh, angle = paddle.split(boxes, [2, 2, 1], axis=-1) - return paddle.concat([xy, wh.pow(2) / 12., angle], axis=-1) - - -def rotated_form(a_, b_, angles): - cos_a = paddle.cos(angles) - sin_a = paddle.sin(angles) - a = a_ * paddle.pow(cos_a, 2) + b_ * paddle.pow(sin_a, 2) - b = a_ * paddle.pow(sin_a, 2) + b_ * paddle.pow(cos_a, 2) - c = (a_ - b_) * cos_a * sin_a - return a, b, c - - -def probiou_loss(pred, target, eps=1e-3, mode='l1'): - """ - pred -> a matrix [N,5](x,y,w,h,angle - in radians) containing ours predicted box ;in case of HBB angle == 0 - target -> a matrix [N,5](x,y,w,h,angle - in radians) containing ours target box ;in case of HBB angle == 0 - eps -> threshold to avoid infinite values - mode -> ('l1' in [0,1] or 'l2' in [0,inf]) metrics according our paper - - """ - - gbboxes1 = gbb_form(pred) - gbboxes2 = gbb_form(target) - - x1, y1, a1_, b1_, c1_ = gbboxes1[:, - 0], gbboxes1[:, - 1], gbboxes1[:, - 2], gbboxes1[:, - 3], gbboxes1[:, - 4] - x2, y2, a2_, b2_, c2_ = gbboxes2[:, - 0], gbboxes2[:, - 1], gbboxes2[:, - 2], gbboxes2[:, - 3], gbboxes2[:, - 4] - - a1, b1, c1 = rotated_form(a1_, b1_, c1_) - a2, b2, c2 = rotated_form(a2_, b2_, c2_) - - t1 = 0.25 * ((a1 + a2) * (paddle.pow(y1 - y2, 2)) + (b1 + b2) * (paddle.pow(x1 - x2, 2))) + \ - 0.5 * ((c1+c2)*(x2-x1)*(y1-y2)) - t2 = (a1 + a2) * (b1 + b2) - paddle.pow(c1 + c2, 2) - t3_ = (a1 * b1 - c1 * c1) * (a2 * b2 - c2 * c2) - t3 = 0.5 * paddle.log(t2 / (4 * paddle.sqrt(F.relu(t3_)) + eps)) - - B_d = (t1 / t2) + t3 - # B_d = t1 + t2 + t3 - - B_d = paddle.clip(B_d, min=eps, max=100.0) - l1 = paddle.sqrt(1.0 - paddle.exp(-B_d) + eps) - l_i = paddle.pow(l1, 2.0) - l2 = -paddle.log(1.0 - l_i + eps) - - if mode == 'l1': - probiou = l1 - if mode == 'l2': - probiou = l2 - - return probiou - - -@serializable -@register -class ProbIoULoss(object): - """ ProbIoU Loss, refer to https://arxiv.org/abs/2106.06072 for details """ - - def __init__(self, mode='l1', eps=1e-3): - super(ProbIoULoss, self).__init__() - self.mode = mode - self.eps = eps - - def __call__(self, pred_rboxes, assigned_rboxes): - return 
probiou_loss(pred_rboxes, assigned_rboxes, self.eps, self.mode) diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/queryinst_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/queryinst_loss.py deleted file mode 100644 index 640b9b4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/queryinst_loss.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.losses.iou_loss import GIoULoss -from .sparsercnn_loss import HungarianMatcher - -__all__ = ['QueryInstLoss'] - - -@register -class QueryInstLoss(object): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - focal_loss_alpha=0.25, - focal_loss_gamma=2.0, - class_weight=2.0, - l1_weight=5.0, - giou_weight=2.0, - mask_weight=8.0): - super(QueryInstLoss, self).__init__() - - self.num_classes = num_classes - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.loss_weights = { - "loss_cls": class_weight, - "loss_bbox": l1_weight, - "loss_giou": giou_weight, - "loss_mask": mask_weight - } - self.giou_loss = GIoULoss(eps=1e-6, reduction='sum') - - self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, - class_weight, l1_weight, giou_weight) - - def loss_classes(self, class_logits, targets, indices, avg_factor): - tgt_labels = paddle.full( - class_logits.shape[:2], self.num_classes, dtype='int32') - - if sum(len(v['labels']) for v in targets) > 0: - tgt_classes = paddle.concat([ - paddle.gather( - tgt['labels'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ]) - batch_idx, src_idx = self._get_src_permutation_idx(indices) - for i, (batch_i, src_i) in enumerate(zip(batch_idx, src_idx)): - tgt_labels[int(batch_i), int(src_i)] = tgt_classes[i] - - tgt_labels = tgt_labels.flatten(0, 1).unsqueeze(-1) - - tgt_labels_onehot = paddle.cast( - tgt_labels == paddle.arange(0, self.num_classes), dtype='float32') - tgt_labels_onehot.stop_gradient = True - - src_logits = class_logits.flatten(0, 1) - - loss_cls = F.sigmoid_focal_loss( - src_logits, - tgt_labels_onehot, - alpha=self.focal_loss_alpha, - gamma=self.focal_loss_gamma, - reduction='sum') / avg_factor - losses = {'loss_cls': loss_cls * self.loss_weights['loss_cls']} - return losses - - def loss_bboxes(self, bbox_pred, targets, indices, avg_factor): - bboxes = paddle.concat([ - paddle.gather( - src, src_idx, axis=0) - for src, (src_idx, _) in zip(bbox_pred, indices) - ]) - - tgt_bboxes = paddle.concat([ - paddle.gather( - tgt['boxes'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ]) - tgt_bboxes.stop_gradient = True - - im_shapes = paddle.concat([tgt['img_whwh_tgt'] for tgt in targets]) - bboxes_norm = bboxes / im_shapes - tgt_bboxes_norm = tgt_bboxes / im_shapes - 
- loss_giou = self.giou_loss(bboxes, tgt_bboxes) / avg_factor - loss_bbox = F.l1_loss( - bboxes_norm, tgt_bboxes_norm, reduction='sum') / avg_factor - losses = { - 'loss_bbox': loss_bbox * self.loss_weights['loss_bbox'], - 'loss_giou': loss_giou * self.loss_weights['loss_giou'] - } - return losses - - def loss_masks(self, pos_bbox_pred, mask_logits, targets, indices, - avg_factor): - tgt_segm = [ - paddle.gather( - tgt['gt_segm'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ] - - tgt_masks = [] - for i in range(len(indices)): - gt_segm = tgt_segm[i].unsqueeze(1) - if len(gt_segm) == 0: - continue - boxes = pos_bbox_pred[i] - boxes[:, 0::2] = paddle.clip( - boxes[:, 0::2], min=0, max=gt_segm.shape[3]) - boxes[:, 1::2] = paddle.clip( - boxes[:, 1::2], min=0, max=gt_segm.shape[2]) - boxes_num = paddle.to_tensor([1] * len(boxes), dtype='int32') - gt_mask = paddle.vision.ops.roi_align( - gt_segm, - boxes, - boxes_num, - output_size=mask_logits.shape[-2:], - aligned=True) - tgt_masks.append(gt_mask) - tgt_masks = paddle.concat(tgt_masks).squeeze(1) - tgt_masks = paddle.cast(tgt_masks >= 0.5, dtype='float32') - tgt_masks.stop_gradient = True - - tgt_labels = paddle.concat([ - paddle.gather( - tgt['labels'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ]) - - mask_label = F.one_hot(tgt_labels, self.num_classes).unsqueeze([2, 3]) - mask_label = paddle.expand_as(mask_label, mask_logits) - mask_label.stop_gradient = True - - src_masks = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) - shape = mask_logits.shape - src_masks = paddle.reshape(src_masks, [shape[0], shape[2], shape[3]]) - src_masks = F.sigmoid(src_masks) - - X = src_masks.flatten(1) - Y = tgt_masks.flatten(1) - inter = paddle.sum(X * Y, 1) - union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1) - dice = (2 * inter) / (union + 2e-5) - - loss_mask = (1 - dice).sum() / avg_factor - losses = {'loss_mask': loss_mask * self.loss_weights['loss_mask']} - return losses - - @staticmethod - def _get_src_permutation_idx(indices): - batch_idx = paddle.concat( - [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = paddle.concat([src for (src, _) in indices]) - return batch_idx, src_idx diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/smooth_l1_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/smooth_l1_loss.py deleted file mode 100644 index f89c28f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/smooth_l1_loss.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -__all__ = ['SmoothL1Loss'] - -@register -class SmoothL1Loss(nn.Layer): - """Smooth L1 Loss. 
-    Args:
-        beta (float): controls smooth region, it becomes L1 Loss when beta=0.0
-        loss_weight (float): the final loss will be multiplied by this
-    """
-    def __init__(self,
-                 beta=1.0,
-                 loss_weight=1.0):
-        super(SmoothL1Loss, self).__init__()
-        assert beta >= 0
-        self.beta = beta
-        self.loss_weight = loss_weight
-
-    def forward(self, pred, target, reduction='none'):
-        """forward function, based on fvcore.
-        Args:
-            pred (Tensor): prediction tensor
-            target (Tensor): target tensor, pred.shape must be the same as target.shape
-            reduction (str): the way to reduce loss, one of (none, sum, mean)
-        """
-        assert reduction in ('none', 'sum', 'mean')
-        target = target.detach()
-        if self.beta < 1e-5:
-            loss = paddle.abs(pred - target)
-        else:
-            n = paddle.abs(pred - target)
-            cond = n < self.beta
-            loss = paddle.where(cond, 0.5 * n ** 2 / self.beta, n - 0.5 * self.beta)
-        if reduction == 'mean':
-            loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum()
-        elif reduction == 'sum':
-            loss = loss.sum()
-        return loss * self.loss_weight
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/solov2_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/solov2_loss.py
deleted file mode 100644
index ef97a77..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/solov2_loss.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import paddle
-import paddle.nn.functional as F
-from ppdet.core.workspace import register, serializable
-
-__all__ = ['SOLOv2Loss']
-
-
-@register
-@serializable
-class SOLOv2Loss(object):
-    """
-    SOLOv2Loss
-    Args:
-        ins_loss_weight (float): Weight of instance loss.
-        focal_loss_gamma (float): Gamma parameter for focal loss.
-        focal_loss_alpha (float): Alpha parameter for focal loss.
-    """
-
-    def __init__(self,
-                 ins_loss_weight=3.0,
-                 focal_loss_gamma=2.0,
-                 focal_loss_alpha=0.25):
-        self.ins_loss_weight = ins_loss_weight
-        self.focal_loss_gamma = focal_loss_gamma
-        self.focal_loss_alpha = focal_loss_alpha
-
-    def _dice_loss(self, input, target):
-        input = paddle.reshape(input, shape=(paddle.shape(input)[0], -1))
-        target = paddle.reshape(target, shape=(paddle.shape(target)[0], -1))
-        a = paddle.sum(input * target, axis=1)
-        b = paddle.sum(input * input, axis=1) + 0.001
-        c = paddle.sum(target * target, axis=1) + 0.001
-        d = (2 * a) / (b + c)
-        return 1 - d
-
-    def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels,
-                 num_ins):
-        """
-        Get loss of network of SOLOv2.
-        Args:
-            ins_pred_list (list): Variable list of instance branch output.
-            ins_label_list (list): List of instance labels per batch.
-            cate_preds (list): Concat Variable list of category branch output.
-            cate_labels (list): Concat list of category labels per batch.
-            num_ins (int): Number of positive samples in a mini-batch.
-        Returns:
-            loss_ins (Variable): The instance loss Variable of SOLOv2 network.
-            loss_cate (Variable): The category loss Variable of SOLOv2 network.
-        """
-
-        #1. Use dice_loss to calculate instance loss
-        loss_ins = []
-        total_weights = paddle.zeros(shape=[1], dtype='float32')
-        for input, target in zip(ins_pred_list, ins_label_list):
-            if input is None:
-                continue
-            target = paddle.cast(target, 'float32')
-            target = paddle.reshape(
-                target,
-                shape=[-1, paddle.shape(input)[-2], paddle.shape(input)[-1]])
-            weights = paddle.cast(
-                paddle.sum(target, axis=[1, 2]) > 0, 'float32')
-            input = F.sigmoid(input)
-            dice_out = paddle.multiply(self._dice_loss(input, target), weights)
-            total_weights += paddle.sum(weights)
-            loss_ins.append(dice_out)
-        loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights
-        loss_ins = loss_ins * self.ins_loss_weight
-
-        #2. Use sigmoid_focal_loss to calculate category loss
-        # expand onehot labels
-        num_classes = cate_preds.shape[-1]
-        cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1)
-        cate_labels_bin = cate_labels_bin[:, 1:]
-
-        loss_cate = F.sigmoid_focal_loss(
-            cate_preds,
-            label=cate_labels_bin,
-            normalizer=num_ins + 1.,
-            gamma=self.focal_loss_gamma,
-            alpha=self.focal_loss_alpha)
-
-        return loss_ins, loss_cate
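(Aside: a minimal NumPy sketch of the dice loss computed by `_dice_loss` above, with `smooth` playing the role of the 0.001 terms; illustrative only, not part of the diff.)

import numpy as np

def dice_loss(pred, target, smooth=1e-3):
    """Per-instance dice loss over flattened masks, mirroring _dice_loss above."""
    pred = pred.reshape(pred.shape[0], -1)      # [N, H*W], sigmoid already applied
    target = target.reshape(target.shape[0], -1)
    a = (pred * target).sum(axis=1)             # soft intersection
    b = (pred * pred).sum(axis=1) + smooth
    c = (target * target).sum(axis=1) + smooth
    return 1.0 - (2.0 * a) / (b + c)            # [N]; 0 = perfect overlap

# e.g. a near-perfect prediction yields a loss near 0:
pred = np.array([[0.9, 0.1, 0.8, 0.2]])
target = np.array([[1.0, 0.0, 1.0, 0.0]])
print(dice_loss(pred, target))  # ~[0.03]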
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/sparsercnn_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/sparsercnn_loss.py
deleted file mode 100644
index ac9eba6..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/sparsercnn_loss.py
+++ /dev/null
@@ -1,430 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/loss.py
-The copyright of PeizeSun/SparseR-CNN is as follows:
-MIT License [see LICENSE for details]
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from scipy.optimize import linear_sum_assignment
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from paddle.metric import accuracy
-from ppdet.core.workspace import register
-from ppdet.modeling.losses.iou_loss import GIoULoss
-
-__all__ = ["SparseRCNNLoss"]
-
-
-@register
-class SparseRCNNLoss(nn.Layer):
-    """ This class computes the loss for SparseRCNN.
-    The process happens in two steps:
-    1) we compute Hungarian assignment between ground truth boxes and the outputs of the model
-    2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
-    """
-    __shared__ = ['num_classes']
-
-    def __init__(self,
-                 losses,
-                 focal_loss_alpha,
-                 focal_loss_gamma,
-                 num_classes=80,
-                 class_weight=2.,
-                 l1_weight=5.,
-                 giou_weight=2.):
-        """ Create the criterion.
-        Parameters:
-            num_classes: number of object categories, omitting the special no-object category
-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
- losses: list of all the losses to be applied. See get_loss for list of available losses. - matcher: module able to compute a matching between targets and proposals - """ - super().__init__() - self.num_classes = num_classes - weight_dict = { - "loss_ce": class_weight, - "loss_bbox": l1_weight, - "loss_giou": giou_weight - } - self.weight_dict = weight_dict - self.losses = losses - self.giou_loss = GIoULoss(reduction="sum") - - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - - self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, - class_weight, l1_weight, giou_weight) - - def loss_labels(self, outputs, targets, indices, num_boxes, log=True): - """Classification loss (NLL) - targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] - """ - assert 'pred_logits' in outputs - src_logits = outputs['pred_logits'] - - idx = self._get_src_permutation_idx(indices) - target_classes_o = paddle.concat([ - paddle.gather( - t["labels"], J, axis=0) for t, (_, J) in zip(targets, indices) - ]) - target_classes = paddle.full( - src_logits.shape[:2], self.num_classes, dtype="int32") - for i, ind in enumerate(zip(idx[0], idx[1])): - target_classes[int(ind[0]), int(ind[1])] = target_classes_o[i] - target_classes.stop_gradient = True - - src_logits = src_logits.flatten(start_axis=0, stop_axis=1) - - # prepare one_hot target. - target_classes = target_classes.flatten(start_axis=0, stop_axis=1) - class_ids = paddle.arange(0, self.num_classes) - labels = (target_classes.unsqueeze(-1) == class_ids).astype("float32") - labels.stop_gradient = True - - # comp focal loss. - class_loss = sigmoid_focal_loss( - src_logits, - labels, - alpha=self.focal_loss_alpha, - gamma=self.focal_loss_gamma, - reduction="sum", ) / num_boxes - losses = {'loss_ce': class_loss} - - if log: - label_acc = target_classes_o.unsqueeze(-1) - src_idx = [src for (src, _) in indices] - - pred_list = [] - for i in range(outputs["pred_logits"].shape[0]): - pred_list.append( - paddle.gather( - outputs["pred_logits"][i], src_idx[i], axis=0)) - - pred = F.sigmoid(paddle.concat(pred_list, axis=0)) - acc = accuracy(pred, label_acc.astype("int64")) - losses["acc"] = acc - - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss - targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] - The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
- """ - assert 'pred_boxes' in outputs # [batch_size, num_proposals, 4] - src_idx = [src for (src, _) in indices] - src_boxes_list = [] - - for i in range(outputs["pred_boxes"].shape[0]): - src_boxes_list.append( - paddle.gather( - outputs["pred_boxes"][i], src_idx[i], axis=0)) - - src_boxes = paddle.concat(src_boxes_list, axis=0) - - target_boxes = paddle.concat( - [ - paddle.gather( - t['boxes'], I, axis=0) - for t, (_, I) in zip(targets, indices) - ], - axis=0) - target_boxes.stop_gradient = True - losses = {} - - losses['loss_giou'] = self.giou_loss(src_boxes, - target_boxes) / num_boxes - - image_size = paddle.concat([v["img_whwh_tgt"] for v in targets]) - src_boxes_ = src_boxes / image_size - target_boxes_ = target_boxes / image_size - - loss_bbox = F.l1_loss(src_boxes_, target_boxes_, reduction='sum') - losses['loss_bbox'] = loss_bbox / num_boxes - - return losses - - def _get_src_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = paddle.concat( - [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = paddle.concat([src for (src, _) in indices]) - return batch_idx, src_idx - - def _get_tgt_permutation_idx(self, indices): - # permute targets following indices - batch_idx = paddle.concat( - [paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) - tgt_idx = paddle.concat([tgt for (_, tgt) in indices]) - return batch_idx, tgt_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): - loss_map = { - 'labels': self.loss_labels, - 'boxes': self.loss_boxes, - } - assert loss in loss_map, f'do you really want to compute {loss} loss?' - return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) - - def forward(self, outputs, targets): - """ This performs the loss computation. - Parameters: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc - """ - outputs_without_aux = { - k: v - for k, v in outputs.items() if k != 'aux_outputs' - } - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["labels"]) for t in targets) - num_boxes = paddle.to_tensor( - [num_boxes], - dtype="float32", - place=next(iter(outputs.values())).place) - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update( - self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
- if 'aux_outputs' in outputs: - for i, aux_outputs in enumerate(outputs['aux_outputs']): - indices = self.matcher(aux_outputs, targets) - for loss in self.losses: - kwargs = {} - if loss == 'labels': - # Logging is enabled only for the last layer - kwargs = {'log': False} - l_dict = self.get_loss(loss, aux_outputs, targets, indices, - num_boxes, **kwargs) - - w_dict = {} - for k in l_dict.keys(): - if k in self.weight_dict: - w_dict[k + f'_{i}'] = l_dict[k] * self.weight_dict[ - k] - else: - w_dict[k + f'_{i}'] = l_dict[k] - losses.update(w_dict) - - return losses - - -class HungarianMatcher(nn.Layer): - """This class computes an assignment between the targets and the predictions of the network - For efficiency reasons, the targets don't include the no_object. Because of this, in general, - there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, - while the others are un-matched (and thus treated as non-objects). - """ - - def __init__(self, - focal_loss_alpha, - focal_loss_gamma, - cost_class: float=1, - cost_bbox: float=1, - cost_giou: float=1): - """Creates the matcher - Params: - cost_class: This is the relative weight of the classification error in the matching cost - cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost - cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost - """ - super().__init__() - self.cost_class = cost_class - self.cost_bbox = cost_bbox - self.cost_giou = cost_giou - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" - - @paddle.no_grad() - def forward(self, outputs, targets): - """ Performs the matching - Args: - outputs: This is a dict that contains at least these entries: - "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates - eg. outputs = {"pred_logits": pred_logits, "pred_boxes": pred_boxes} - targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: - "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth - objects in the target) containing the class labels - "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates - eg. 
targets = [{"labels":labels, "boxes": boxes}, ...,{"labels":labels, "boxes": boxes}] - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - bs, num_queries = outputs["pred_logits"].shape[:2] - - if sum(len(v["labels"]) for v in targets) == 0: - return [(paddle.to_tensor( - [], dtype=paddle.int64), paddle.to_tensor( - [], dtype=paddle.int64)) for _ in range(bs)] - - # We flatten to compute the cost matrices in a batch - out_prob = F.sigmoid(outputs["pred_logits"].flatten( - start_axis=0, stop_axis=1)) - out_bbox = outputs["pred_boxes"].flatten(start_axis=0, stop_axis=1) - - # Also concat the target labels and boxes - tgt_ids = paddle.concat([v["labels"] for v in targets]) - assert (tgt_ids > -1).all() - tgt_bbox = paddle.concat([v["boxes"] for v in targets]) - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - - # Compute the classification cost. - alpha = self.focal_loss_alpha - gamma = self.focal_loss_gamma - - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-( - 1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) - **gamma) * (-(out_prob + 1e-8).log()) - - cost_class = paddle.gather( - pos_cost_class, tgt_ids, axis=1) - paddle.gather( - neg_cost_class, tgt_ids, axis=1) - - # Compute the L1 cost between boxes - image_size_out = paddle.concat( - [v["img_whwh"].unsqueeze(0) for v in targets]) - image_size_out = image_size_out.unsqueeze(1).tile( - [1, num_queries, 1]).flatten( - start_axis=0, stop_axis=1) - image_size_tgt = paddle.concat([v["img_whwh_tgt"] for v in targets]) - - out_bbox_ = out_bbox / image_size_out - tgt_bbox_ = tgt_bbox / image_size_tgt - cost_bbox = F.l1_loss( - out_bbox_.unsqueeze(-2), tgt_bbox_, - reduction='none').sum(-1) # [batch_size * num_queries, num_tgts] - - # Compute the giou cost betwen boxes - cost_giou = -get_bboxes_giou(out_bbox, tgt_bbox) - - # Final cost matrix - C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou - C = C.reshape([bs, num_queries, -1]) - - sizes = [len(v["boxes"]) for v in targets] - - indices = [ - linear_sum_assignment(c[i].numpy()) - for i, c in enumerate(C.split(sizes, -1)) - ] - return [(paddle.to_tensor( - i, dtype="int32"), paddle.to_tensor( - j, dtype="int32")) for i, j in indices] - - -def box_area(boxes): - assert (boxes[:, 2:] >= boxes[:, :2]).all() - wh = boxes[:, 2:] - boxes[:, :2] - return wh[:, 0] * wh[:, 1] - - -def boxes_iou(boxes1, boxes2): - ''' - Compute iou - - Args: - boxes1 (paddle.tensor) shape (N, 4) - boxes2 (paddle.tensor) shape (M, 4) - - Return: - (paddle.tensor) shape (N, M) - ''' - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - lt = paddle.maximum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) - rb = paddle.minimum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) - - wh = (rb - lt).astype("float32").clip(min=1e-9) - inter = wh[:, :, 0] * wh[:, :, 1] - - union = area1.unsqueeze(-1) + area2 - inter + 1e-9 - - iou = inter / union - return iou, union - - -def get_bboxes_giou(boxes1, boxes2, eps=1e-9): - """calculate the ious of boxes1 and boxes2 - - Args: - boxes1 (Tensor): shape [N, 4] - boxes2 (Tensor): 
shape [M, 4] - eps (float): epsilon to avoid divide by zero - - Return: - ious (Tensor): ious of boxes1 and boxes2, with the shape [N, M] - """ - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() - - iou, union = boxes_iou(boxes1, boxes2) - - lt = paddle.minimum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) - rb = paddle.maximum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) - - wh = (rb - lt).astype("float32").clip(min=eps) - enclose_area = wh[:, :, 0] * wh[:, :, 1] - - giou = iou - (enclose_area - union) / enclose_area - - return giou - - -def sigmoid_focal_loss(inputs, targets, alpha, gamma, reduction="sum"): - - assert reduction in ["sum", "mean" - ], f'do not support this {reduction} reduction?' - - p = F.sigmoid(inputs) - ce_loss = F.binary_cross_entropy_with_logits( - inputs, targets, reduction="none") - p_t = p * targets + (1 - p) * (1 - targets) - loss = ce_loss * ((1 - p_t)**gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - if reduction == "mean": - loss = loss.mean() - elif reduction == "sum": - loss = loss.sum() - - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/ssd_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/ssd_loss.py deleted file mode 100644 index 2ab94f2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/ssd_loss.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from ..bbox_utils import iou_similarity, bbox2delta - -__all__ = ['SSDLoss'] - - -@register -class SSDLoss(nn.Layer): - """ - SSDLoss - - Args: - overlap_threshold (float32, optional): IoU threshold for negative bboxes - and positive bboxes, 0.5 by default. - neg_pos_ratio (float): The ratio of negative samples / positive samples. - loc_loss_weight (float): The weight of loc_loss. - conf_loss_weight (float): The weight of conf_loss. - prior_box_var (list): Variances corresponding to prior box coord, [0.1, - 0.1, 0.2, 0.2] by default. - """ - - def __init__(self, - overlap_threshold=0.5, - neg_pos_ratio=3.0, - loc_loss_weight=1.0, - conf_loss_weight=1.0, - prior_box_var=[0.1, 0.1, 0.2, 0.2]): - super(SSDLoss, self).__init__() - self.overlap_threshold = overlap_threshold - self.neg_pos_ratio = neg_pos_ratio - self.loc_loss_weight = loc_loss_weight - self.conf_loss_weight = conf_loss_weight - self.prior_box_var = [1. 
/ a for a in prior_box_var] - - def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes, - bg_index): - """ - Args: - gt_bbox (Tensor): [B, N, 4] - gt_label (Tensor): [B, N, 1] - prior_boxes (Tensor): [A, 4] - bg_index (int): Background class index - """ - batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0] - ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape( - (batch_size, -1, num_priors)) - - # For each prior box, get the max IoU of all GTs. - prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1) - # For each GT, get the max IoU of all prior boxes. - gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2) - - # Gather target bbox and label according to 'prior_argmax_iou' index. - batch_ind = paddle.arange(end=batch_size, dtype='int64').unsqueeze(-1) - prior_argmax_iou = paddle.stack( - [batch_ind.tile([1, num_priors]), prior_argmax_iou], axis=-1) - targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou) - targets_label = paddle.gather_nd(gt_label, prior_argmax_iou) - # Assign negative - bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index, - 'int64') - targets_label = paddle.where( - prior_max_iou.unsqueeze(-1) < self.overlap_threshold, - bg_index_tensor, targets_label) - - # Ensure each GT can match the max IoU prior box. - batch_ind = (batch_ind * num_priors + gt_argmax_iou).flatten() - targets_bbox = paddle.scatter( - targets_bbox.reshape([-1, 4]), batch_ind, - gt_bbox.reshape([-1, 4])).reshape([batch_size, -1, 4]) - targets_label = paddle.scatter( - targets_label.reshape([-1, 1]), batch_ind, - gt_label.reshape([-1, 1])).reshape([batch_size, -1, 1]) - targets_label[:, :1] = bg_index - - # Encode box - prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1]) - targets_bbox = bbox2delta( - prior_boxes.reshape([-1, 4]), - targets_bbox.reshape([-1, 4]), self.prior_box_var) - targets_bbox = targets_bbox.reshape([batch_size, -1, 4]) - - return targets_bbox, targets_label - - def _mine_hard_example(self, - conf_loss, - targets_label, - bg_index, - mine_neg_ratio=0.01): - pos = (targets_label != bg_index).astype(conf_loss.dtype) - num_pos = pos.sum(axis=1, keepdim=True) - neg = (targets_label == bg_index).astype(conf_loss.dtype) - - conf_loss = conf_loss.detach() * neg - loss_idx = conf_loss.argsort(axis=1, descending=True) - idx_rank = loss_idx.argsort(axis=1) - num_negs = [] - for i in range(conf_loss.shape[0]): - cur_num_pos = num_pos[i] - num_neg = paddle.clip( - cur_num_pos * self.neg_pos_ratio, max=pos.shape[1]) - num_neg = num_neg if num_neg > 0 else paddle.to_tensor( - [pos.shape[1] * mine_neg_ratio]) - num_negs.append(num_neg) - num_negs = paddle.stack(num_negs).expand_as(idx_rank) - neg_mask = (idx_rank < num_negs).astype(conf_loss.dtype) - - return (neg_mask + pos).astype('bool') - - def forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes): - boxes = paddle.concat(boxes, axis=1) - scores = paddle.concat(scores, axis=1) - gt_label = gt_label.unsqueeze(-1).astype('int64') - prior_boxes = paddle.concat(prior_boxes, axis=0) - bg_index = scores.shape[-1] - 1 - - # Match bbox and get targets. - targets_bbox, targets_label = \ - self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index) - targets_bbox.stop_gradient = True - targets_label.stop_gradient = True - - # Compute regression loss. - # Select positive samples. 
- bbox_mask = paddle.tile(targets_label != bg_index, [1, 1, 4]) - if bbox_mask.astype(boxes.dtype).sum() > 0: - location = paddle.masked_select(boxes, bbox_mask) - targets_bbox = paddle.masked_select(targets_bbox, bbox_mask) - loc_loss = F.smooth_l1_loss(location, targets_bbox, reduction='sum') - loc_loss = loc_loss * self.loc_loss_weight - else: - loc_loss = paddle.zeros([1]) - - # Compute confidence loss. - conf_loss = F.cross_entropy(scores, targets_label, reduction="none") - # Mining hard examples. - label_mask = self._mine_hard_example( - conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index) - conf_loss = paddle.masked_select(conf_loss, label_mask.unsqueeze(-1)) - conf_loss = conf_loss.sum() * self.conf_loss_weight - - # Compute overall weighted loss. - normalizer = (targets_label != bg_index).astype('float32').sum().clip( - min=1) - loss = (conf_loss + loc_loss) / normalizer - - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/supcontrast.py b/pdfdet/models/Paddle/ppdet/modeling/losses/supcontrast.py deleted file mode 100644 index 3e59f08..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/supcontrast.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
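(Aside on `_mine_hard_example` in the ssd_loss.py hunk above: negatives are ranked by their detached confidence loss and only the top `neg_pos_ratio * num_pos` are kept. A minimal NumPy sketch of that selection, illustrative only and not part of the diff.)

import numpy as np

def mine_hard_negatives(conf_loss, is_pos, neg_pos_ratio=3.0):
    """Keep all positives plus the hardest negatives for one image.
    conf_loss: [A] per-anchor confidence loss; is_pos: [A] bool mask."""
    neg_loss = np.where(is_pos, 0.0, conf_loss)  # positives never compete
    order = np.argsort(-neg_loss)                # hardest negatives first
    rank = np.argsort(order)                     # each anchor's hardness rank
    num_neg = int(neg_pos_ratio * is_pos.sum())
    return is_pos | (rank < num_neg)

conf_loss = np.array([2.0, 0.1, 1.5, 0.3, 0.8])
is_pos = np.array([True, False, False, False, False])
print(mine_hard_negatives(conf_loss, is_pos))  # the positive + the 3 hardest negatives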
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -import random -from ppdet.core.workspace import register - - -__all__ = ['SupContrast'] - - -@register -class SupContrast(nn.Layer): - __shared__ = [ - 'num_classes' - ] - def __init__(self, num_classes=80, temperature=2.5, sample_num=4096, thresh=0.75): - super(SupContrast, self).__init__() - self.num_classes = num_classes - self.temperature = temperature - self.sample_num = sample_num - self.thresh = thresh - def forward(self, features, labels, scores): - - assert features.shape[0] == labels.shape[0] == scores.shape[0] - positive_mask = (labels < self.num_classes) - positive_features, positive_labels, positive_scores = features[positive_mask], labels[positive_mask], \ - scores[positive_mask] - - negative_mask = (labels == self.num_classes) - negative_features, negative_labels, negative_scores = features[negative_mask], labels[negative_mask], \ - scores[negative_mask] - - N = negative_features.shape[0] - S = self.sample_num - positive_mask.sum() - index = paddle.to_tensor(random.sample(range(N), int(S)), dtype='int32') - - negative_features = paddle.index_select(x=negative_features, index=index, axis=0) - negative_labels = paddle.index_select(x=negative_labels, index=index, axis=0) - negative_scores = paddle.index_select(x=negative_scores, index=index, axis=0) - - features = paddle.concat([positive_features, negative_features], 0) - labels = paddle.concat([positive_labels, negative_labels], 0) - scores = paddle.concat([positive_scores, negative_scores], 0) - - if len(labels.shape) == 1: - labels = labels.reshape([-1, 1]) - label_mask = paddle.equal(labels, labels.T).detach() - similarity = (paddle.matmul(features, features.T) / self.temperature) - - sim_row_max = paddle.max(similarity, axis=1, keepdim=True) - similarity = similarity - sim_row_max - - logits_mask = paddle.ones_like(similarity).detach() - logits_mask.fill_diagonal_(0) - - exp_sim = paddle.exp(similarity) * logits_mask - log_prob = similarity - paddle.log(exp_sim.sum(axis=1, keepdim=True)) - - per_label_log_prob = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1) - keep = scores > self.thresh - per_label_log_prob = per_label_log_prob[keep] - loss = -per_label_log_prob - - return loss.mean() \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/varifocal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/varifocal_loss.py deleted file mode 100644 index 42d18a6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/varifocal_loss.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
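(Aside on the SupContrast loss in the supcontrast.py hunk above: after sampling negatives it computes an InfoNCE-style log-probability in which anchors sharing a label act as positives. A stripped-down NumPy sketch under those assumptions, with no negative sampling and no score threshold; illustrative only, not part of the diff.)

import numpy as np

def sup_contrast(features, labels, temperature=2.5):
    """Mean supervised-contrastive loss; features: [N, D] L2-normalized, labels: [N]."""
    sim = features @ features.T / temperature          # pairwise similarity logits
    sim -= sim.max(axis=1, keepdims=True)              # row-max subtraction, as above
    logits_mask = 1.0 - np.eye(len(labels))            # exclude self-pairs
    label_mask = (labels[:, None] == labels[None, :]).astype(float)
    exp_sim = np.exp(sim) * logits_mask
    log_prob = sim - np.log(exp_sim.sum(axis=1, keepdims=True))
    per_label = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1)
    return -per_label.mean()

feats = np.random.rand(6, 16)
feats /= np.linalg.norm(feats, axis=1, keepdims=True)
print(sup_contrast(feats, np.array([0, 0, 1, 1, 2, 2])))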
- -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling import ops - -__all__ = ['VarifocalLoss'] - - -def varifocal_loss(pred, - target, - alpha=0.75, - gamma=2.0, - iou_weighted=True, - use_sigmoid=True): - """`Varifocal Loss `_ - - Args: - pred (Tensor): The prediction with shape (N, C), C is the - number of classes - target (Tensor): The learning target of the iou-aware - classification score with shape (N, C), C is the number of classes. - alpha (float, optional): A balance factor for the negative part of - Varifocal Loss, which is different from the alpha of Focal Loss. - Defaults to 0.75. - gamma (float, optional): The gamma for calculating the modulating - factor. Defaults to 2.0. - iou_weighted (bool, optional): Whether to weight the loss of the - positive example with the iou target. Defaults to True. - """ - # pred and target should be of the same size - assert pred.shape == target.shape - if use_sigmoid: - pred_new = F.sigmoid(pred) - else: - pred_new = pred - target = target.cast(pred.dtype) - if iou_weighted: - focal_weight = target * (target > 0.0).cast('float32') + \ - alpha * (pred_new - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - else: - focal_weight = (target > 0.0).cast('float32') + \ - alpha * (pred_new - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - - if use_sigmoid: - loss = F.binary_cross_entropy_with_logits( - pred, target, reduction='none') * focal_weight - else: - loss = F.binary_cross_entropy( - pred, target, reduction='none') * focal_weight - loss = loss.sum(axis=1) - return loss - - -@register -@serializable -class VarifocalLoss(nn.Layer): - def __init__(self, - use_sigmoid=True, - alpha=0.75, - gamma=2.0, - iou_weighted=True, - reduction='mean', - loss_weight=1.0): - """`Varifocal Loss `_ - - Args: - use_sigmoid (bool, optional): Whether the prediction is - used for sigmoid or softmax. Defaults to True. - alpha (float, optional): A balance factor for the negative part of - Varifocal Loss, which is different from the alpha of Focal - Loss. Defaults to 0.75. - gamma (float, optional): The gamma for calculating the modulating - factor. Defaults to 2.0. - iou_weighted (bool, optional): Whether to weight the loss of the - positive examples with the iou target. Defaults to True. - reduction (str, optional): The method used to reduce the loss into - a scalar. Defaults to 'mean'. Options are "none", "mean" and - "sum". - loss_weight (float, optional): Weight of loss. Defaults to 1.0. - """ - super(VarifocalLoss, self).__init__() - assert alpha >= 0.0 - self.use_sigmoid = use_sigmoid - self.alpha = alpha - self.gamma = gamma - self.iou_weighted = iou_weighted - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, pred, target, weight=None, avg_factor=None): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. 
- Returns: - Tensor: The calculated loss - """ - loss = self.loss_weight * varifocal_loss( - pred, - target, - alpha=self.alpha, - gamma=self.gamma, - iou_weighted=self.iou_weighted, - use_sigmoid=self.use_sigmoid) - - if weight is not None: - loss = loss * weight - if avg_factor is None: - if self.reduction == 'none': - return loss - elif self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if self.reduction == 'mean': - loss = loss.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif self.reduction != 'none': - raise ValueError( - 'avg_factor can not be used with reduction="sum"') - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/yolo_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/yolo_loss.py deleted file mode 100644 index fecef9a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/yolo_loss.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity - -__all__ = ['YOLOv3Loss'] - - -def bbox_transform(pbox, anchor, downsample): - pbox = decode_yolo(pbox, anchor, downsample) - pbox = xywh2xyxy(pbox) - return pbox - - -@register -class YOLOv3Loss(nn.Layer): - - __inject__ = ['iou_loss', 'iou_aware_loss'] - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - ignore_thresh=0.7, - label_smooth=False, - downsample=[32, 16, 8], - scale_x_y=1., - iou_loss=None, - iou_aware_loss=None): - """ - YOLOv3Loss layer - - Args: - num_calsses (int): number of foreground classes - ignore_thresh (float): threshold to ignore confidence loss - label_smooth (bool): whether to use label smoothing - downsample (list): downsample ratio for each detection block - scale_x_y (float): scale_x_y factor - iou_loss (object): IoULoss instance - iou_aware_loss (object): IouAwareLoss instance - """ - super(YOLOv3Loss, self).__init__() - self.num_classes = num_classes - self.ignore_thresh = ignore_thresh - self.label_smooth = label_smooth - self.downsample = downsample - self.scale_x_y = scale_x_y - self.iou_loss = iou_loss - self.iou_aware_loss = iou_aware_loss - self.distill_pairs = [] - - def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample): - # pbox - pbox = decode_yolo(pbox, anchor, downsample) - pbox = xywh2xyxy(pbox) - pbox = paddle.concat(pbox, axis=-1) - b = pbox.shape[0] - pbox = pbox.reshape((b, -1, 4)) - # gbox - gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5 - gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 - gbox = paddle.concat([gxy, gwh], axis=-1) - - iou = batch_iou_similarity(pbox, gbox) - iou.stop_gradient 
= True - iou_max = iou.max(2) # [N, M1] - iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype) - iou_mask.stop_gradient = True - - pobj = pobj.reshape((b, -1)) - tobj = tobj.reshape((b, -1)) - obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype) - obj_mask.stop_gradient = True - - loss_obj = F.binary_cross_entropy_with_logits( - pobj, obj_mask, reduction='none') - loss_obj_pos = (loss_obj * tobj) - loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask) - return loss_obj_pos + loss_obj_neg - - def cls_loss(self, pcls, tcls): - if self.label_smooth: - delta = min(1. / self.num_classes, 1. / 40) - pos, neg = 1 - delta, delta - # 1 for positive, 0 for negative - tcls = pos * paddle.cast( - tcls > 0., dtype=tcls.dtype) + neg * paddle.cast( - tcls <= 0., dtype=tcls.dtype) - - loss_cls = F.binary_cross_entropy_with_logits( - pcls, tcls, reduction='none') - return loss_cls - - def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1., - eps=1e-10): - na = len(anchor) - b, c, h, w = p.shape - if self.iou_aware_loss: - ioup, p = p[:, 0:na, :, :], p[:, na:, :, :] - ioup = ioup.unsqueeze(-1) - p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2)) - x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2] - w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4] - obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:] - self.distill_pairs.append([x, y, w, h, obj, pcls]) - - t = t.transpose((0, 1, 3, 4, 2)) - tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2] - tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4] - tscale = t[:, :, :, :, 4:5] - tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:] - - tscale_obj = tscale * tobj - loss = dict() - - x = scale * F.sigmoid(x) - 0.5 * (scale - 1.) - y = scale * F.sigmoid(y) - 0.5 * (scale - 1.) - - if abs(scale - 1.) < eps: - loss_x = F.binary_cross_entropy(x, tx, reduction='none') - loss_y = F.binary_cross_entropy(y, ty, reduction='none') - loss_xy = tscale_obj * (loss_x + loss_y) - else: - loss_x = paddle.abs(x - tx) - loss_y = paddle.abs(y - ty) - loss_xy = tscale_obj * (loss_x + loss_y) - - loss_xy = loss_xy.sum([1, 2, 3, 4]).mean() - - loss_w = paddle.abs(w - tw) - loss_h = paddle.abs(h - th) - loss_wh = tscale_obj * (loss_w + loss_h) - loss_wh = loss_wh.sum([1, 2, 3, 4]).mean() - - loss['loss_xy'] = loss_xy - loss['loss_wh'] = loss_wh - - if self.iou_loss is not None: - # warn: do not modify x, y, w, h in place - box, tbox = [x, y, w, h], [tx, ty, tw, th] - pbox = bbox_transform(box, anchor, downsample) - gbox = bbox_transform(tbox, anchor, downsample) - loss_iou = self.iou_loss(pbox, gbox) - loss_iou = loss_iou * tscale_obj - loss_iou = loss_iou.sum([1, 2, 3, 4]).mean() - loss['loss_iou'] = loss_iou - - if self.iou_aware_loss is not None: - box, tbox = [x, y, w, h], [tx, ty, tw, th] - pbox = bbox_transform(box, anchor, downsample) - gbox = bbox_transform(tbox, anchor, downsample) - loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox) - loss_iou_aware = loss_iou_aware * tobj - loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean() - loss['loss_iou_aware'] = loss_iou_aware - - box = [x, y, w, h] - loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample) - loss_obj = loss_obj.sum(-1).mean() - loss['loss_obj'] = loss_obj - loss_cls = self.cls_loss(pcls, tcls) * tobj - loss_cls = loss_cls.sum([1, 2, 3, 4]).mean() - loss['loss_cls'] = loss_cls - return loss - - def forward(self, inputs, targets, anchors): - np = len(inputs) - gt_targets = [targets['target{}'.format(i)] for i in range(np)] - gt_box = targets['gt_bbox'] - yolo_losses = dict() - 
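# NOTE (illustrative, not part of the diff): the zip below walks one FPN level
# at a time; yolov3_loss returns a dict of partial losses per level and the
# loop sums them key-by-key, e.g. {'loss_xy': 1.2} + {'loss_xy': 0.9}
# -> {'loss_xy': 2.1}, with 'loss' set afterwards to the total over all keys.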
self.distill_pairs.clear() - for x, t, anchor, downsample in zip(inputs, gt_targets, anchors, - self.downsample): - yolo_loss = self.yolov3_loss( - x.astype('float32'), t, gt_box, anchor, downsample, - self.scale_x_y) - for k, v in yolo_loss.items(): - if k in yolo_losses: - yolo_losses[k] += v - else: - yolo_losses[k] = v - - loss = 0 - for k, v in yolo_losses.items(): - loss += v - - yolo_losses['loss'] = loss - return yolo_losses diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/__init__.py deleted file mode 100644 index 258e4c9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import matching -from . import tracker -from . import motion -from . import visualization -from . import utils - -from .matching import * -from .tracker import * -from .motion import * -from .visualization import * -from .utils import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/__init__.py deleted file mode 100644 index f6a88c5..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import jde_matching -from . import deepsort_matching -from . import ocsort_matching - -from .jde_matching import * -from .deepsort_matching import * -from .ocsort_matching import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/deepsort_matching.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/deepsort_matching.py deleted file mode 100644 index 3859ccf..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/deepsort_matching.py +++ /dev/null @@ -1,379 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is based on https://github.com/nwojke/deep_sort/tree/master/deep_sort
-"""
-
-import numpy as np
-from scipy.optimize import linear_sum_assignment
-from ..motion import kalman_filter
-
-INFTY_COST = 1e+5
-
-__all__ = [
-    'iou_1toN',
-    'iou_cost',
-    '_nn_euclidean_distance',
-    '_nn_cosine_distance',
-    'NearestNeighborDistanceMetric',
-    'min_cost_matching',
-    'matching_cascade',
-    'gate_cost_matrix',
-]
-
-
-def iou_1toN(bbox, candidates):
-    """
-    Compute intersection over union (IoU) by one box to N candidates.
-
-    Args:
-        bbox (ndarray): A bounding box in format `(top left x, top left y, width, height)`.
-        candidates (ndarray): A matrix of candidate bounding boxes (one per row) in the
-            same format as `bbox`.
-
-    Returns:
-        ious (ndarray): The intersection over union in [0, 1] between the `bbox`
-            and each candidate. A higher score means a larger fraction of the
-            `bbox` is occluded by the candidate.
-    """
-    bbox_tl = bbox[:2]
-    bbox_br = bbox[:2] + bbox[2:]
-    candidates_tl = candidates[:, :2]
-    candidates_br = candidates[:, :2] + candidates[:, 2:]
-
-    tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis],
-               np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]]
-    br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis],
-               np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]]
-    wh = np.maximum(0., br - tl)
-
-    area_intersection = wh.prod(axis=1)
-    area_bbox = bbox[2:].prod()
-    area_candidates = candidates[:, 2:].prod(axis=1)
-    ious = area_intersection / (area_bbox + area_candidates - area_intersection)
-    return ious
-
-
-def iou_cost(tracks, detections, track_indices=None, detection_indices=None):
-    """
-    IoU distance metric.
-
-    Args:
-        tracks (list[Track]): A list of tracks.
-        detections (list[Detection]): A list of detections.
-        track_indices (Optional[list[int]]): A list of indices to tracks that
-            should be matched. Defaults to all `tracks`.
-        detection_indices (Optional[list[int]]): A list of indices to detections
-            that should be matched. Defaults to all `detections`.
-
-    Returns:
-        cost_matrix (ndarray): A cost matrix of shape len(track_indices),
-            len(detection_indices) where entry (i, j) is
-            `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`.
-    """
-    if track_indices is None:
-        track_indices = np.arange(len(tracks))
-    if detection_indices is None:
-        detection_indices = np.arange(len(detections))
-
-    cost_matrix = np.zeros((len(track_indices), len(detection_indices)))
-    for row, track_idx in enumerate(track_indices):
-        if tracks[track_idx].time_since_update > 1:
-            cost_matrix[row, :] = 1e+5
-            continue
-
-        bbox = tracks[track_idx].to_tlwh()
-        candidates = np.asarray([detections[i].tlwh for i in detection_indices])
-        cost_matrix[row, :] = 1. - iou_1toN(bbox, candidates)
-    return cost_matrix
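(To make the cost convention above concrete, a self-contained hand computation of one entry of the matrix that iou_cost builds, on hypothetical boxes; not part of the diff.)

import numpy as np

def iou_tlwh(a, b):
    """IoU of two (x, y, w, h) boxes; the single-pair analogue of iou_1toN above."""
    tl = np.maximum(a[:2], b[:2])
    br = np.minimum(a[:2] + a[2:], b[:2] + b[2:])
    wh = np.maximum(0., br - tl)
    inter = wh.prod()
    return inter / (a[2:].prod() + b[2:].prod() - inter)

track_box = np.array([0., 0., 10., 10.])
det_box = np.array([5., 5., 10., 10.])   # overlaps in a 5x5 patch -> IoU ~0.143
print(1. - iou_tlwh(track_box, det_box))  # cost entry ~0.857, as in iou_cost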
- """ - s, q = np.asarray(s), np.asarray(q) - if len(s) == 0 or len(q) == 0: - return np.zeros((len(s), len(q))) - s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1) - distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :] - distances = np.clip(distances, 0., float(np.inf)) - - return np.maximum(0.0, distances.min(axis=0)) - - -def _nn_cosine_distance(s, q): - """ - Compute pair-wise cosine distance between points in `s` and `q`. - - Args: - s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. - q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. - - Returns: - distances (ndarray): A vector of length M that contains for each entry in `q` the - smallest Euclidean distance to a sample in `s`. - """ - s = np.asarray(s) / np.linalg.norm(s, axis=1, keepdims=True) - q = np.asarray(q) / np.linalg.norm(q, axis=1, keepdims=True) - distances = 1. - np.dot(s, q.T) - - return distances.min(axis=0) - - -class NearestNeighborDistanceMetric(object): - """ - A nearest neighbor distance metric that, for each target, returns - the closest distance to any sample that has been observed so far. - - Args: - metric (str): Either "euclidean" or "cosine". - matching_threshold (float): The matching threshold. Samples with larger - distance are considered an invalid match. - budget (Optional[int]): If not None, fix samples per class to at most - this number. Removes the oldest samples when the budget is reached. - - Attributes: - samples (Dict[int -> List[ndarray]]): A dictionary that maps from target - identities to the list of samples that have been observed so far. - """ - - def __init__(self, metric, matching_threshold, budget=None): - if metric == "euclidean": - self._metric = _nn_euclidean_distance - elif metric == "cosine": - self._metric = _nn_cosine_distance - else: - raise ValueError( - "Invalid metric; must be either 'euclidean' or 'cosine'") - self.matching_threshold = matching_threshold - self.budget = budget - self.samples = {} - - def partial_fit(self, features, targets, active_targets): - """ - Update the distance metric with new data. - - Args: - features (ndarray): An NxM matrix of N features of dimensionality M. - targets (ndarray): An integer array of associated target identities. - active_targets (List[int]): A list of targets that are currently - present in the scene. - """ - for feature, target in zip(features, targets): - self.samples.setdefault(target, []).append(feature) - if self.budget is not None: - self.samples[target] = self.samples[target][-self.budget:] - self.samples = {k: self.samples[k] for k in active_targets} - - def distance(self, features, targets): - """ - Compute distance between features and targets. - - Args: - features (ndarray): An NxM matrix of N features of dimensionality M. - targets (list[int]): A list of targets to match the given `features` against. - - Returns: - cost_matrix (ndarray): a cost matrix of shape len(targets), len(features), - where element (i, j) contains the closest squared distance between - `targets[i]` and `features[j]`. - """ - cost_matrix = np.zeros((len(targets), len(features))) - for i, target in enumerate(targets): - cost_matrix[i, :] = self._metric(self.samples[target], features) - return cost_matrix - - -def min_cost_matching(distance_metric, - max_distance, - tracks, - detections, - track_indices=None, - detection_indices=None): - """ - Solve linear assignment problem. 
-
-
-def min_cost_matching(distance_metric,
-                      max_distance,
-                      tracks,
-                      detections,
-                      track_indices=None,
-                      detection_indices=None):
-    """
-    Solve linear assignment problem.
-
-    Args:
-        distance_metric :
-            Callable[List[Track], List[Detection], List[int], List[int]] -> ndarray
-            The distance metric is given a list of tracks and detections as
-            well as a list of N track indices and M detection indices. The
-            metric should return the NxM dimensional cost matrix, where element
-            (i, j) is the association cost between the i-th track in the given
-            track indices and the j-th detection in the given detection_indices.
-        max_distance (float): Gating threshold. Associations with cost larger
-            than this value are disregarded.
-        tracks (list[Track]): A list of predicted tracks at the current time
-            step.
-        detections (list[Detection]): A list of detections at the current time
-            step.
-        track_indices (list[int]): List of track indices that maps rows in
-            `cost_matrix` to tracks in `tracks`.
-        detection_indices (List[int]): List of detection indices that maps
-            columns in `cost_matrix` to detections in `detections`.
-
-    Returns:
-        A tuple (List[(int, int)], List[int], List[int]) with the following
-        three entries:
-        * A list of matched track and detection indices.
-        * A list of unmatched track indices.
-        * A list of unmatched detection indices.
-    """
-    if track_indices is None:
-        track_indices = np.arange(len(tracks))
-    if detection_indices is None:
-        detection_indices = np.arange(len(detections))
-
-    if len(detection_indices) == 0 or len(track_indices) == 0:
-        return [], track_indices, detection_indices  # Nothing to match.
-
-    cost_matrix = distance_metric(tracks, detections, track_indices,
-                                  detection_indices)
-
-    cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
-    indices = linear_sum_assignment(cost_matrix)
-
-    matches, unmatched_tracks, unmatched_detections = [], [], []
-    for col, detection_idx in enumerate(detection_indices):
-        if col not in indices[1]:
-            unmatched_detections.append(detection_idx)
-    for row, track_idx in enumerate(track_indices):
-        if row not in indices[0]:
-            unmatched_tracks.append(track_idx)
-    for row, col in zip(indices[0], indices[1]):
-        track_idx = track_indices[row]
-        detection_idx = detection_indices[col]
-        if cost_matrix[row, col] > max_distance:
-            unmatched_tracks.append(track_idx)
-            unmatched_detections.append(detection_idx)
-        else:
-            matches.append((track_idx, detection_idx))
-    return matches, unmatched_tracks, unmatched_detections
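(A toy demonstration of the max_distance gate in min_cost_matching above: clamped entries can still be assigned by the Hungarian solver, but the final loop rejects them. Numbers are hypothetical; not part of the diff.)

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.1, 0.9],
                 [0.8, 0.95]])   # entry (i, j): track i vs. detection j
max_distance = 0.7
# same clamp as above: entries over the gate collapse to max_distance + 1e-5
gated = np.where(cost > max_distance, max_distance + 1e-5, cost)
rows, cols = linear_sum_assignment(gated)
for r, c in zip(rows, cols):
    # (0, 0) survives (0.1 <= 0.7); (1, 1) costs 0.95 > 0.7, so track 1 and
    # detection 1 end up in the unmatched lists, exactly as in the loop above
    print((r, c), "matched" if cost[r, c] <= max_distance else "unmatched")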
- - Returns: - A tuple (List[(int, int)], List[int], List[int]) with the following - three entries: - * A list of matched track and detection indices. - * A list of unmatched track indices. - * A list of unmatched detection indices. - """ - if track_indices is None: - track_indices = list(range(len(tracks))) - if detection_indices is None: - detection_indices = list(range(len(detections))) - - unmatched_detections = detection_indices - matches = [] - for level in range(cascade_depth): - if len(unmatched_detections) == 0: # No detections left - break - - track_indices_l = [ - k for k in track_indices if tracks[k].time_since_update == 1 + level - ] - if len(track_indices_l) == 0: # Nothing to match at this level - continue - - matches_l, _, unmatched_detections = \ - min_cost_matching( - distance_metric, max_distance, tracks, detections, - track_indices_l, unmatched_detections) - matches += matches_l - unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) - return matches, unmatched_tracks, unmatched_detections - - -def gate_cost_matrix(kf, - cost_matrix, - tracks, - detections, - track_indices, - detection_indices, - gated_cost=INFTY_COST, - only_position=False): - """ - Invalidate infeasible entries in cost matrix based on the state - distributions obtained by Kalman filtering. - - Args: - kf (object): The Kalman filter. - cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the - number of track indices and M is the number of detection indices, - such that entry (i, j) is the association cost between - `tracks[track_indices[i]]` and `detections[detection_indices[j]]`. - tracks (list[Track]): A list of predicted tracks at the current time - step. - detections (list[Detection]): A list of detections at the current time - step. - track_indices (List[int]): List of track indices that maps rows in - `cost_matrix` to tracks in `tracks`. - detection_indices (List[int]): List of detection indices that maps - columns in `cost_matrix` to detections in `detections`. - gated_cost (Optional[float]): Entries in the cost matrix corresponding - to infeasible associations are set this value. Defaults to a very - large value. - only_position (Optional[bool]): If True, only the x, y position of the - state distribution is considered during gating. Default False. - """ - gating_dim = 2 if only_position else 4 - gating_threshold = kalman_filter.chi2inv95[gating_dim] - measurements = np.asarray( - [detections[i].to_xyah() for i in detection_indices]) - for row, track_idx in enumerate(track_indices): - track = tracks[track_idx] - gating_distance = kf.gating_distance(track.mean, track.covariance, - measurements, only_position) - cost_matrix[row, gating_distance > gating_threshold] = gated_cost - return cost_matrix diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/jde_matching.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/jde_matching.py deleted file mode 100644 index ac28f90..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/jde_matching.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py -""" - -try: - import lap -except: - print( - 'Warning: Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' - ) - pass - -import scipy -import numpy as np -from scipy.spatial.distance import cdist -from ..motion import kalman_filter -import warnings -warnings.filterwarnings("ignore") - -__all__ = [ - 'merge_matches', - 'linear_assignment', - 'bbox_ious', - 'iou_distance', - 'embedding_distance', - 'fuse_motion', -] - - -def merge_matches(m1, m2, shape): - O, P, Q = shape - m1 = np.asarray(m1) - m2 = np.asarray(m2) - - M1 = scipy.sparse.coo_matrix( - (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) - M2 = scipy.sparse.coo_matrix( - (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) - - mask = M1 * M2 - match = mask.nonzero() - match = list(zip(match[0], match[1])) - unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) - unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) - - return match, unmatched_O, unmatched_Q - - -def linear_assignment(cost_matrix, thresh): - try: - import lap - except Exception as e: - raise RuntimeError( - 'Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' - ) - if cost_matrix.size == 0: - return np.empty( - (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple( - range(cost_matrix.shape[1])) - matches, unmatched_a, unmatched_b = [], [], [] - cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) - for ix, mx in enumerate(x): - if mx >= 0: - matches.append([ix, mx]) - unmatched_a = np.where(x < 0)[0] - unmatched_b = np.where(y < 0)[0] - matches = np.asarray(matches) - return matches, unmatched_a, unmatched_b - - -def bbox_ious(atlbrs, btlbrs): - boxes = np.ascontiguousarray(atlbrs, dtype=np.float32) - query_boxes = np.ascontiguousarray(btlbrs, dtype=np.float32) - N = boxes.shape[0] - K = query_boxes.shape[0] - ious = np.zeros((N, K), dtype=boxes.dtype) - if N * K == 0: - return ious - - for k in range(K): - box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + 1) * - (query_boxes[k, 3] - query_boxes[k, 1] + 1)) - for n in range(N): - iw = (min(boxes[n, 2], query_boxes[k, 2]) - max( - boxes[n, 0], query_boxes[k, 0]) + 1) - if iw > 0: - ih = (min(boxes[n, 3], query_boxes[k, 3]) - max( - boxes[n, 1], query_boxes[k, 1]) + 1) - if ih > 0: - ua = float((boxes[n, 2] - boxes[n, 0] + 1) * (boxes[ - n, 3] - boxes[n, 1] + 1) + box_area - iw * ih) - ious[n, k] = iw * ih / ua - return ious - - -def iou_distance(atracks, btracks): - """ - Compute cost based on IoU between two list[STrack]. 
- """ - if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or ( - len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): - atlbrs = atracks - btlbrs = btracks - else: - atlbrs = [track.tlbr for track in atracks] - btlbrs = [track.tlbr for track in btracks] - _ious = bbox_ious(atlbrs, btlbrs) - cost_matrix = 1 - _ious - - return cost_matrix - - -def embedding_distance(tracks, detections, metric='euclidean'): - """ - Compute cost based on features between two list[STrack]. - """ - cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32) - if cost_matrix.size == 0: - return cost_matrix - det_features = np.asarray( - [track.curr_feat for track in detections], dtype=np.float32) - track_features = np.asarray( - [track.smooth_feat for track in tracks], dtype=np.float32) - cost_matrix = np.maximum(0.0, cdist(track_features, det_features, - metric)) # Nomalized features - return cost_matrix - - -def fuse_motion(kf, - cost_matrix, - tracks, - detections, - only_position=False, - lambda_=0.98): - if cost_matrix.size == 0: - return cost_matrix - gating_dim = 2 if only_position else 4 - gating_threshold = kalman_filter.chi2inv95[gating_dim] - measurements = np.asarray([det.to_xyah() for det in detections]) - for row, track in enumerate(tracks): - gating_distance = kf.gating_distance( - track.mean, - track.covariance, - measurements, - only_position, - metric='maha') - cost_matrix[row, gating_distance > gating_threshold] = np.inf - cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_ - ) * gating_distance - return cost_matrix diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/ocsort_matching.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/ocsort_matching.py deleted file mode 100644 index 58f79a5..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/ocsort_matching.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/association.py -""" - -import os -import numpy as np - - -def iou_batch(bboxes1, bboxes2): - bboxes2 = np.expand_dims(bboxes2, 0) - bboxes1 = np.expand_dims(bboxes1, 1) - - xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) - yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) - xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) - yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) - w = np.maximum(0., xx2 - xx1) - h = np.maximum(0., yy2 - yy1) - area = w * h - iou_matrix = area / ((bboxes1[..., 2] - bboxes1[..., 0]) * - (bboxes1[..., 3] - bboxes1[..., 1]) + - (bboxes2[..., 2] - bboxes2[..., 0]) * - (bboxes2[..., 3] - bboxes2[..., 1]) - area) - return iou_matrix - - -def speed_direction_batch(dets, tracks): - tracks = tracks[..., np.newaxis] - CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0 - CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, ( - tracks[:, 1] + tracks[:, 3]) / 2.0 - dx = CX1 - CX2 - dy = CY1 - CY2 - norm = np.sqrt(dx**2 + dy**2) + 1e-6 - dx = dx / norm - dy = dy / norm - return dy, dx - - -def linear_assignment(cost_matrix): - try: - import lap - _, x, y = lap.lapjv(cost_matrix, extend_cost=True) - return np.array([[y[i], i] for i in x if i >= 0]) - except ImportError: - from scipy.optimize import linear_sum_assignment - x, y = linear_sum_assignment(cost_matrix) - return np.array(list(zip(x, y))) - - -def associate(detections, trackers, iou_threshold, velocities, previous_obs, - vdc_weight): - if (len(trackers) == 0): - return np.empty( - (0, 2), dtype=int), np.arange(len(detections)), np.empty( - (0, 5), dtype=int) - - Y, X = speed_direction_batch(detections, previous_obs) - inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] - inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) - inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) - diff_angle_cos = inertia_X * X + inertia_Y * Y - diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) - diff_angle = np.arccos(diff_angle_cos) - diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi - - valid_mask = np.ones(previous_obs.shape[0]) - valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 - - iou_matrix = iou_batch(detections, trackers) - scores = np.repeat( - detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) - # iou_matrix = iou_matrix * scores # a trick sometiems works, we don't encourage this - valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) - - angle_diff_cost = (valid_mask * diff_angle) * vdc_weight - angle_diff_cost = angle_diff_cost.T - angle_diff_cost = angle_diff_cost * scores - - if min(iou_matrix.shape) > 0: - a = (iou_matrix > iou_threshold).astype(np.int32) - if a.sum(1).max() == 1 and a.sum(0).max() == 1: - matched_indices = np.stack(np.where(a), axis=1) - else: - matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost)) - else: - matched_indices = np.empty(shape=(0, 2)) - - unmatched_detections = [] - for d, det in enumerate(detections): - if (d not in matched_indices[:, 0]): - unmatched_detections.append(d) - unmatched_trackers = [] - for t, trk in enumerate(trackers): - if (t not in matched_indices[:, 1]): - unmatched_trackers.append(t) - - # filter out matched with low IOU - matches = [] - for m in matched_indices: - if (iou_matrix[m[0], m[1]] < iou_threshold): - unmatched_detections.append(m[0]) - unmatched_trackers.append(m[1]) - else: - matches.append(m.reshape(1, 2)) - if (len(matches) == 0): - matches = 
np.empty((0, 2), dtype=int) - else: - matches = np.concatenate(matches, axis=0) - - return matches, np.array(unmatched_detections), np.array(unmatched_trackers) - - -def associate_only_iou(detections, trackers, iou_threshold): - if (len(trackers) == 0): - return np.empty( - (0, 2), dtype=int), np.arange(len(detections)), np.empty( - (0, 5), dtype=int) - - iou_matrix = iou_batch(detections, trackers) - - if min(iou_matrix.shape) > 0: - a = (iou_matrix > iou_threshold).astype(np.int32) - if a.sum(1).max() == 1 and a.sum(0).max() == 1: - matched_indices = np.stack(np.where(a), axis=1) - else: - matched_indices = linear_assignment(-iou_matrix) - else: - matched_indices = np.empty(shape=(0, 2)) - - unmatched_detections = [] - for d, det in enumerate(detections): - if (d not in matched_indices[:, 0]): - unmatched_detections.append(d) - unmatched_trackers = [] - for t, trk in enumerate(trackers): - if (t not in matched_indices[:, 1]): - unmatched_trackers.append(t) - - # filter out matched with low IOU - matches = [] - for m in matched_indices: - if (iou_matrix[m[0], m[1]] < iou_threshold): - unmatched_detections.append(m[0]) - unmatched_trackers.append(m[1]) - else: - matches.append(m.reshape(1, 2)) - if (len(matches) == 0): - matches = np.empty((0, 2), dtype=int) - else: - matches = np.concatenate(matches, axis=0) - return matches, np.array(unmatched_detections), np.array(unmatched_trackers) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/__init__.py deleted file mode 100644 index 6d20612..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import kalman_filter - -from .kalman_filter import * -from .gmc import * \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/gmc.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/gmc.py deleted file mode 100644 index 43ec42e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/gmc.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
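The GMC class removed below estimates global camera motion between consecutive frames as a 2x3 affine matrix H (via ORB/SIFT features, ECC, or sparse optical flow); STrack.multi_gmc further down applies that matrix to predicted track states so boxes follow the camera. A small sketch of the warp itself, with an assumed toy H (illustrative, not the deleted class):

import numpy as np

def warp_tlbr_boxes(tlbr_boxes, H):
    # H is a 2x3 affine, x' = R @ x + t, in the shape GMC.apply() returns.
    boxes = np.asarray(tlbr_boxes, dtype=np.float64)
    R, t = H[:2, :2], H[:2, 2]
    tl = boxes[:, :2] @ R.T + t  # warp top-left corners
    br = boxes[:, 2:] @ R.T + t  # warp bottom-right corners
    return np.hstack([tl, br])

H = np.array([[1.0, 0.0, 5.0],   # pure translation: +5 px in x,
              [0.0, 1.0, -3.0]]) # -3 px in y
print(warp_tlbr_boxes([[10, 10, 50, 50]], H))  # [[15.  7. 55. 47.]]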
-""" -This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/gmc.py -""" - -import cv2 -import matplotlib.pyplot as plt -import numpy as np -import copy -import time -from ppdet.core.workspace import register, serializable - - -@register -@serializable -class GMC: - def __init__(self, method='sparseOptFlow', downscale=2, verbose=None): - super(GMC, self).__init__() - - self.method = method - self.downscale = max(1, int(downscale)) - - if self.method == 'orb': - self.detector = cv2.FastFeatureDetector_create(20) - self.extractor = cv2.ORB_create() - self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) - - elif self.method == 'sift': - self.detector = cv2.SIFT_create( - nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) - self.extractor = cv2.SIFT_create( - nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) - self.matcher = cv2.BFMatcher(cv2.NORM_L2) - - elif self.method == 'ecc': - number_of_iterations = 5000 - termination_eps = 1e-6 - self.warp_mode = cv2.MOTION_EUCLIDEAN - self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, - number_of_iterations, termination_eps) - - elif self.method == 'sparseOptFlow': - self.feature_params = dict( - maxCorners=1000, - qualityLevel=0.01, - minDistance=1, - blockSize=3, - useHarrisDetector=False, - k=0.04) - # self.gmc_file = open('GMC_results.txt', 'w') - - elif self.method == 'file' or self.method == 'files': - seqName = verbose[0] - ablation = verbose[1] - if ablation: - filePath = r'tracker/GMC_files/MOT17_ablation' - else: - filePath = r'tracker/GMC_files/MOTChallenge' - - if '-FRCNN' in seqName: - seqName = seqName[:-6] - elif '-DPM' in seqName: - seqName = seqName[:-4] - elif '-SDP' in seqName: - seqName = seqName[:-4] - - self.gmcFile = open(filePath + "/GMC-" + seqName + ".txt", 'r') - - if self.gmcFile is None: - raise ValueError("Error: Unable to open GMC file in directory:" - + filePath) - elif self.method == 'none' or self.method == 'None': - self.method = 'none' - else: - raise ValueError("Error: Unknown CMC method:" + method) - - self.prevFrame = None - self.prevKeyPoints = None - self.prevDescriptors = None - - self.initializedFirstFrame = False - - def apply(self, raw_frame, detections=None): - if self.method == 'orb' or self.method == 'sift': - return self.applyFeaures(raw_frame, detections) - elif self.method == 'ecc': - return self.applyEcc(raw_frame, detections) - elif self.method == 'sparseOptFlow': - return self.applySparseOptFlow(raw_frame, detections) - elif self.method == 'file': - return self.applyFile(raw_frame, detections) - elif self.method == 'none': - return np.eye(2, 3) - else: - return np.eye(2, 3) - - def applyEcc(self, raw_frame, detections=None): - - # Initialize - height, width, _ = raw_frame.shape - frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) - H = np.eye(2, 3, dtype=np.float32) - - # Downscale image (TODO: consider using pyramids) - if self.downscale > 1.0: - frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (width // self.downscale, - height // self.downscale)) - width = width // self.downscale - height = height // self.downscale - - # Handle first frame - if not self.initializedFirstFrame: - # Initialize data - self.prevFrame = frame.copy() - - # Initialization done - self.initializedFirstFrame = True - - return H - - # Run the ECC algorithm. The results are stored in warp_matrix. 
- # (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria) - try: - (cc, - H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, - self.criteria, None, 1) - except: - print('Warning: find transform failed. Set warp as identity') - - return H - - def applyFeaures(self, raw_frame, detections=None): - - # Initialize - height, width, _ = raw_frame.shape - frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) - H = np.eye(2, 3) - - # Downscale image (TODO: consider using pyramids) - if self.downscale > 1.0: - # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (width // self.downscale, - height // self.downscale)) - width = width // self.downscale - height = height // self.downscale - - # find the keypoints - mask = np.zeros_like(frame) - # mask[int(0.05 * height): int(0.95 * height), int(0.05 * width): int(0.95 * width)] = 255 - mask[int(0.02 * height):int(0.98 * height), int(0.02 * width):int( - 0.98 * width)] = 255 - if detections is not None: - for det in detections: - tlbr = (det[:4] / self.downscale).astype(np.int_) - mask[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2]] = 0 - - keypoints = self.detector.detect(frame, mask) - - # compute the descriptors - keypoints, descriptors = self.extractor.compute(frame, keypoints) - - # Handle first frame - if not self.initializedFirstFrame: - # Initialize data - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - self.prevDescriptors = copy.copy(descriptors) - - # Initialization done - self.initializedFirstFrame = True - - return H - - # Match descriptors. - knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2) - - # Filtered matches based on smallest spatial distance - matches = [] - spatialDistances = [] - - maxSpatialDistance = 0.25 * np.array([width, height]) - - # Handle empty matches case - if len(knnMatches) == 0: - # Store to next iteration - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - self.prevDescriptors = copy.copy(descriptors) - - return H - - for m, n in knnMatches: - if m.distance < 0.9 * n.distance: - prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt - currKeyPointLocation = keypoints[m.trainIdx].pt - - spatialDistance = ( - prevKeyPointLocation[0] - currKeyPointLocation[0], - prevKeyPointLocation[1] - currKeyPointLocation[1]) - - if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and \ - (np.abs(spatialDistance[1]) < maxSpatialDistance[1]): - spatialDistances.append(spatialDistance) - matches.append(m) - - meanSpatialDistances = np.mean(spatialDistances, 0) - stdSpatialDistances = np.std(spatialDistances, 0) - - inliesrs = (spatialDistances - meanSpatialDistances - ) < 2.5 * stdSpatialDistances - - goodMatches = [] - prevPoints = [] - currPoints = [] - for i in range(len(matches)): - if inliesrs[i, 0] and inliesrs[i, 1]: - goodMatches.append(matches[i]) - prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt) - currPoints.append(keypoints[matches[i].trainIdx].pt) - - prevPoints = np.array(prevPoints) - currPoints = np.array(currPoints) - - # Draw the keypoint matches on the output image - if 0: - matches_img = np.hstack((self.prevFrame, frame)) - matches_img = cv2.cvtColor(matches_img, cv2.COLOR_GRAY2BGR) - W = np.size(self.prevFrame, 1) - for m in goodMatches: - prev_pt = np.array( - self.prevKeyPoints[m.queryIdx].pt, dtype=np.int_) - curr_pt = np.array(keypoints[m.trainIdx].pt, dtype=np.int_) - curr_pt[0] += W - color = np.random.randint(0, 255, (3, )) - color = (int(color[0]), 
int(color[1]), int(color[2])) - - matches_img = cv2.line(matches_img, prev_pt, curr_pt, - tuple(color), 1, cv2.LINE_AA) - matches_img = cv2.circle(matches_img, prev_pt, 2, - tuple(color), -1) - matches_img = cv2.circle(matches_img, curr_pt, 2, - tuple(color), -1) - - plt.figure() - plt.imshow(matches_img) - plt.show() - - # Find rigid matrix - if (np.size(prevPoints, 0) > 4) and ( - np.size(prevPoints, 0) == np.size(prevPoints, 0)): - H, inliesrs = cv2.estimateAffinePartial2D(prevPoints, currPoints, - cv2.RANSAC) - - # Handle downscale - if self.downscale > 1.0: - H[0, 2] *= self.downscale - H[1, 2] *= self.downscale - else: - print('Warning: not enough matching points') - - # Store to next iteration - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - self.prevDescriptors = copy.copy(descriptors) - - return H - - def applySparseOptFlow(self, raw_frame, detections=None): - - t0 = time.time() - - # Initialize - height, width, _ = raw_frame.shape - frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) - H = np.eye(2, 3) - - # Downscale image - if self.downscale > 1.0: - # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (width // self.downscale, - height // self.downscale)) - - # find the keypoints - keypoints = cv2.goodFeaturesToTrack( - frame, mask=None, **self.feature_params) - - # Handle first frame - if not self.initializedFirstFrame: - # Initialize data - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - - # Initialization done - self.initializedFirstFrame = True - - return H - - if self.prevFrame.shape != frame.shape: - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - return H - - # find correspondences - matchedKeypoints, status, err = cv2.calcOpticalFlowPyrLK( - self.prevFrame, frame, self.prevKeyPoints, None) - - # leave good correspondences only - prevPoints = [] - currPoints = [] - - for i in range(len(status)): - if status[i]: - prevPoints.append(self.prevKeyPoints[i]) - currPoints.append(matchedKeypoints[i]) - - prevPoints = np.array(prevPoints) - currPoints = np.array(currPoints) - - # Find rigid matrix - if (np.size(prevPoints, 0) > 4) and ( - np.size(prevPoints, 0) == np.size(prevPoints, 0)): - H, inliesrs = cv2.estimateAffinePartial2D(prevPoints, currPoints, - cv2.RANSAC) - - # Handle downscale - if self.downscale > 1.0: - H[0, 2] *= self.downscale - H[1, 2] *= self.downscale - else: - print('Warning: not enough matching points') - - # Store to next iteration - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - - t1 = time.time() - - # gmc_line = str(1000 * (t1 - t0)) + "\t" + str(H[0, 0]) + "\t" + str(H[0, 1]) + "\t" + str( - # H[0, 2]) + "\t" + str(H[1, 0]) + "\t" + str(H[1, 1]) + "\t" + str(H[1, 2]) + "\n" - # self.gmc_file.write(gmc_line) - - return H - - def applyFile(self, raw_frame, detections=None): - line = self.gmcFile.readline() - tokens = line.split("\t") - H = np.eye(2, 3, dtype=np.float_) - H[0, 0] = float(tokens[1]) - H[0, 1] = float(tokens[2]) - H[0, 2] = float(tokens[3]) - H[1, 0] = float(tokens[4]) - H[1, 1] = float(tokens[5]) - H[1, 2] = float(tokens[6]) - - return H diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/kalman_filter.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/kalman_filter.py deleted file mode 100644 index b4e3c93..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/kalman_filter.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py -""" - -import numpy as np -import scipy.linalg - -use_numba = True -try: - import numba as nb - - @nb.njit(fastmath=True, cache=True) - def nb_project(mean, covariance, std, _update_mat): - innovation_cov = np.diag(np.square(std)) - mean = np.dot(_update_mat, mean) - covariance = np.dot(np.dot(_update_mat, covariance), _update_mat.T) - return mean, covariance + innovation_cov - - @nb.njit(fastmath=True, cache=True) - def nb_multi_predict(mean, covariance, motion_cov, motion_mat): - mean = np.dot(mean, motion_mat.T) - left = np.dot(motion_mat, covariance) - covariance = np.dot(left, motion_mat.T) + motion_cov - return mean, covariance - - @nb.njit(fastmath=True, cache=True) - def nb_update(mean, covariance, proj_mean, proj_cov, measurement, meas_mat): - kalman_gain = np.linalg.solve(proj_cov, (covariance @meas_mat.T).T).T - innovation = measurement - proj_mean - mean = mean + innovation @kalman_gain.T - covariance = covariance - kalman_gain @proj_cov @kalman_gain.T - return mean, covariance - -except: - use_numba = False - pass - -__all__ = ['KalmanFilter'] -""" -Table for the 0.95 quantile of the chi-square distribution with N degrees of -freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv -function and used as Mahalanobis gating threshold. -""" - -chi2inv95 = { - 1: 3.8415, - 2: 5.9915, - 3: 7.8147, - 4: 9.4877, - 5: 11.070, - 6: 12.592, - 7: 14.067, - 8: 15.507, - 9: 16.919 -} - - -class KalmanFilter(object): - """ - A simple Kalman filter for tracking bounding boxes in image space. - - The 8-dimensional state space - - x, y, a, h, vx, vy, va, vh - - contains the bounding box center position (x, y), aspect ratio a, height h, - and their respective velocities. - - Object motion follows a constant velocity model. The bounding box location - (x, y, a, h) is taken as direct observation of the state space (linear - observation model). - - """ - - def __init__(self): - ndim, dt = 4, 1. - - # Create Kalman filter model matrices. - self._motion_mat = np.eye(2 * ndim, 2 * ndim, dtype=np.float32) - for i in range(ndim): - self._motion_mat[i, ndim + i] = dt - self._update_mat = np.eye(ndim, 2 * ndim, dtype=np.float32) - - # Motion and observation uncertainty are chosen relative to the current - # state estimate. These weights control the amount of uncertainty in - # the model. This is a bit hacky. - self._std_weight_position = 1. / 20 - self._std_weight_velocity = 1. / 160 - - def initiate(self, measurement): - """ - Create track from unassociated measurement. - - Args: - measurement (ndarray): Bounding box coordinates (x, y, a, h) with - center position (x, y), aspect ratio a, and height h. - - Returns: - The mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are - initialized to 0 mean. 
- """ - mean_pos = measurement - mean_vel = np.zeros_like(mean_pos) - mean = np.r_[mean_pos, mean_vel] - - std = [ - 2 * self._std_weight_position * measurement[3], - 2 * self._std_weight_position * measurement[3], 1e-2, - 2 * self._std_weight_position * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 10 * self._std_weight_velocity * measurement[3], 1e-5, - 10 * self._std_weight_velocity * measurement[3] - ] - covariance = np.diag(np.square(std)) - return mean, np.float32(covariance) - - def predict(self, mean, covariance): - """ - Run Kalman filter prediction step. - - Args: - mean (ndarray): The 8 dimensional mean vector of the object state - at the previous time step. - covariance (ndarray): The 8x8 dimensional covariance matrix of the - object state at the previous time step. - - Returns: - The mean vector and covariance matrix of the predicted state. - Unobserved velocities are initialized to 0 mean. - """ - std_pos = [ - self._std_weight_position * mean[3], self._std_weight_position * - mean[3], 1e-2, self._std_weight_position * mean[3] - ] - std_vel = [ - self._std_weight_velocity * mean[3], self._std_weight_velocity * - mean[3], 1e-5, self._std_weight_velocity * mean[3] - ] - motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - - #mean = np.dot(self._motion_mat, mean) - mean = np.dot(mean, self._motion_mat.T) - covariance = np.linalg.multi_dot( - (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov - - return mean, covariance - - def project(self, mean, covariance): - """ - Project state distribution to measurement space. - - Args - mean (ndarray): The state's mean vector (8 dimensional array). - covariance (ndarray): The state's covariance matrix (8x8 dimensional). - - Returns: - The projected mean and covariance matrix of the given state estimate. - """ - std = np.array( - [ - self._std_weight_position * mean[3], self._std_weight_position * - mean[3], 1e-1, self._std_weight_position * mean[3] - ], - dtype=np.float32) - - if use_numba: - return nb_project(mean, covariance, std, self._update_mat) - - innovation_cov = np.diag(np.square(std)) - - mean = np.dot(self._update_mat, mean) - covariance = np.linalg.multi_dot((self._update_mat, covariance, - self._update_mat.T)) - return mean, covariance + innovation_cov - - def multi_predict(self, mean, covariance): - """ - Run Kalman filter prediction step (Vectorized version). - - Args: - mean (ndarray): The Nx8 dimensional mean matrix of the object states - at the previous time step. - covariance (ndarray): The Nx8x8 dimensional covariance matrics of the - object states at the previous time step. - - Returns: - The mean vector and covariance matrix of the predicted state. - Unobserved velocities are initialized to 0 mean. 
- """ - std_pos = np.array([ - self._std_weight_position * mean[:, 3], self._std_weight_position * - mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]), - self._std_weight_position * mean[:, 3] - ]) - std_vel = np.array([ - self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * - mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]), - self._std_weight_velocity * mean[:, 3] - ]) - sqr = np.square(np.r_[std_pos, std_vel]).T - - if use_numba: - - means = [] - covariances = [] - for i in range(len(mean)): - a, b = nb_multi_predict(mean[i], covariance[i], - np.diag(sqr[i]), self._motion_mat) - means.append(a) - covariances.append(b) - return np.asarray(means), np.asarray(covariances) - - motion_cov = [] - for i in range(len(mean)): - motion_cov.append(np.diag(sqr[i])) - motion_cov = np.asarray(motion_cov) - - mean = np.dot(mean, self._motion_mat.T) - left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) - covariance = np.dot(left, self._motion_mat.T) + motion_cov - - return mean, covariance - - def update(self, mean, covariance, measurement): - """ - Run Kalman filter correction step. - - Args: - mean (ndarray): The predicted state's mean vector (8 dimensional). - covariance (ndarray): The state's covariance matrix (8x8 dimensional). - measurement (ndarray): The 4 dimensional measurement vector - (x, y, a, h), where (x, y) is the center position, a the aspect - ratio, and h the height of the bounding box. - - Returns: - The measurement-corrected state distribution. - """ - projected_mean, projected_cov = self.project(mean, covariance) - - if use_numba: - - return nb_update(mean, covariance, projected_mean, projected_cov, - measurement, self._update_mat) - - kalman_gain = np.linalg.solve(projected_cov, - (covariance @self._update_mat.T).T).T - innovation = measurement - projected_mean - mean = mean + innovation @kalman_gain.T - covariance = covariance - kalman_gain @projected_cov @kalman_gain.T - return mean, covariance - - def gating_distance(self, - mean, - covariance, - measurements, - only_position=False, - metric='maha'): - """ - Compute gating distance between state distribution and measurements. - A suitable distance threshold can be obtained from `chi2inv95`. If - `only_position` is False, the chi-square distribution has 4 degrees of - freedom, otherwise 2. - - Args: - mean (ndarray): Mean vector over the state distribution (8 - dimensional). - covariance (ndarray): Covariance of the state distribution (8x8 - dimensional). - measurements (ndarray): An Nx4 dimensional matrix of N measurements, - each in format (x, y, a, h) where (x, y) is the bounding box center - position, a the aspect ratio, and h the height. - only_position (Optional[bool]): If True, distance computation is - done with respect to the bounding box center position only. - metric (str): Metric type, 'gaussian' or 'maha'. - - Returns - An array of length N, where the i-th element contains the squared - Mahalanobis distance between (mean, covariance) and `measurements[i]`. 
- """ - mean, covariance = self.project(mean, covariance) - if only_position: - mean, covariance = mean[:2], covariance[:2, :2] - measurements = measurements[:, :2] - - d = measurements - mean - if metric == 'gaussian': - return np.sum(d * d, axis=1) - elif metric == 'maha': - cholesky_factor = np.linalg.cholesky(covariance) - z = scipy.linalg.solve_triangular( - cholesky_factor, - d.T, - lower=True, - check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) - return squared_maha - else: - raise ValueError('invalid distance metric') diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/ocsort_kalman_filter.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/ocsort_kalman_filter.py deleted file mode 100644 index 303426f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/ocsort_kalman_filter.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/danbochman/SORT/blob/danny_opencv/kalman_filter.py -""" - -import numpy as np -from numpy import dot, zeros, eye -from numpy.linalg import inv - -use_numba = True -try: - import numba as nb - - @nb.njit(fastmath=True, cache=True) - def nb_predict(x, F, P, Q): - x = dot(F, x) - P = dot(dot(F, P), F.T) + Q - return x, P - - @nb.njit(fastmath=True, cache=True) - def nb_update(x, z, H, P, R, _I): - - y = z - np.dot(H, x) - PHT = dot(P, H.T) - - S = dot(H, PHT) + R - K = dot(PHT, inv(S)) - - x = x + dot(K, y) - - I_KH = _I - dot(K, H) - P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T) - return x, P -except: - use_numba = False - pass - - -class OCSORTKalmanFilter: - def __init__(self, dim_x, dim_z): - self.dim_x = dim_x - self.dim_z = dim_z - self.x = zeros((dim_x, 1)) - self.P = eye(dim_x) - self.Q = eye(dim_x) - self.F = eye(dim_x) - self.H = zeros((dim_z, dim_x)) - self.R = eye(dim_z) - self.M = zeros((dim_z, dim_z)) - - self._I = eye(dim_x) - - def predict(self): - if use_numba: - self.x, self.P = nb_predict(self.x, self.F, self.P, self.Q) - else: - self.x = dot(self.F, self.x) - self.P = dot(dot(self.F, self.P), self.F.T) + self.Q - - def update(self, z): - - if z is None: - return - - if use_numba: - self.x, self.P = nb_update(self.x, z, self.H, self.P, self.R, - self._I) - else: - y = z - np.dot(self.H, self.x) - PHT = dot(self.P, self.H.T) - - S = dot(self.H, PHT) + self.R - K = dot(PHT, inv(S)) - - self.x = self.x + dot(K, y) - - I_KH = self._I - dot(K, self.H) - self.P = dot(dot(I_KH, self.P), I_KH.T) + dot(dot(K, self.R), K.T) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/__init__.py deleted file mode 100644 index a3c4229..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import base_jde_tracker -from . import base_sde_tracker - -from .base_jde_tracker import * -from .base_sde_tracker import * - -from . import jde_tracker -from . import deepsort_tracker -from . import ocsort_tracker -from . import center_tracker - -from .jde_tracker import * -from .deepsort_tracker import * -from .ocsort_tracker import * -from .botsort_tracker import * -from .center_tracker import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_jde_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_jde_tracker.py deleted file mode 100644 index e78fe00..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_jde_tracker.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
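The STrack class removed below moves between three box encodings: tlwh (top-left x/y, width, height), tlbr (corner coordinates), and xyah (center x/y, aspect ratio w/h, height), the last being the Kalman filter's observation. A standalone restatement of the two conversions it uses most, mirroring the deleted static methods:

import numpy as np

def tlwh_to_tlbr(tlwh):
    ret = np.asarray(tlwh, dtype=np.float64).copy()
    ret[2:] += ret[:2]      # (w, h) -> (x_max, y_max)
    return ret

def tlwh_to_xyah(tlwh):
    ret = np.asarray(tlwh, dtype=np.float64).copy()
    ret[:2] += ret[2:] / 2  # top-left -> box center
    ret[2] /= ret[3]        # width -> aspect ratio w/h
    return ret

box = [100.0, 200.0, 50.0, 100.0]
print(tlwh_to_tlbr(box))  # [100. 200. 150. 300.]
print(tlwh_to_xyah(box))  # [125. 250.   0.5 100.]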
-""" -This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py -""" - -import numpy as np -from collections import defaultdict -from collections import deque, OrderedDict -from ..matching import jde_matching as matching -from ppdet.core.workspace import register, serializable -import warnings -warnings.filterwarnings("ignore") - -__all__ = [ - 'TrackState', - 'BaseTrack', - 'STrack', - 'joint_stracks', - 'sub_stracks', - 'remove_duplicate_stracks', -] - - -class TrackState(object): - New = 0 - Tracked = 1 - Lost = 2 - Removed = 3 - - -@register -@serializable -class BaseTrack(object): - _count_dict = defaultdict(int) # support single class and multi classes - - track_id = 0 - is_activated = False - state = TrackState.New - - history = OrderedDict() - features = [] - curr_feat = None - score = 0 - start_frame = 0 - frame_id = 0 - time_since_update = 0 - - # multi-camera - location = (np.inf, np.inf) - - @property - def end_frame(self): - return self.frame_id - - @staticmethod - def next_id(cls_id): - BaseTrack._count_dict[cls_id] += 1 - return BaseTrack._count_dict[cls_id] - - # @even: reset track id - @staticmethod - def init_count(num_classes): - """ - Initiate _count for all object classes - :param num_classes: - """ - for cls_id in range(num_classes): - BaseTrack._count_dict[cls_id] = 0 - - @staticmethod - def reset_track_count(cls_id): - BaseTrack._count_dict[cls_id] = 0 - - def activate(self, *args): - raise NotImplementedError - - def predict(self): - raise NotImplementedError - - def update(self, *args, **kwargs): - raise NotImplementedError - - def mark_lost(self): - self.state = TrackState.Lost - - def mark_removed(self): - self.state = TrackState.Removed - - -@register -@serializable -class STrack(BaseTrack): - def __init__(self, tlwh, score, cls_id, buff_size=30, temp_feat=None): - # wait activate - self._tlwh = np.asarray(tlwh, dtype=np.float32) - self.score = score - self.cls_id = cls_id - self.track_len = 0 - - self.kalman_filter = None - self.mean, self.covariance = None, None - self.is_activated = False - - self.use_reid = True if temp_feat is not None else False - if self.use_reid: - self.smooth_feat = None - self.update_features(temp_feat) - self.features = deque([], maxlen=buff_size) - self.alpha = 0.9 - - def update_features(self, feat): - # L2 normalizing, this function has no use for BYTETracker - feat /= np.linalg.norm(feat) - self.curr_feat = feat - if self.smooth_feat is None: - self.smooth_feat = feat - else: - self.smooth_feat = self.alpha * self.smooth_feat + (1.0 - self.alpha - ) * feat - self.features.append(feat) - self.smooth_feat /= np.linalg.norm(self.smooth_feat) - - def predict(self): - mean_state = self.mean.copy() - if self.state != TrackState.Tracked: - mean_state[7] = 0 - self.mean, self.covariance = self.kalman_filter.predict(mean_state, - self.covariance) - - @staticmethod - def multi_predict(tracks, kalman_filter): - if len(tracks) > 0: - multi_mean = np.asarray([track.mean.copy() for track in tracks]) - multi_covariance = np.asarray( - [track.covariance for track in tracks]) - for i, st in enumerate(tracks): - if st.state != TrackState.Tracked: - multi_mean[i][7] = 0 - multi_mean, multi_covariance = kalman_filter.multi_predict( - multi_mean, multi_covariance) - for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): - tracks[i].mean = mean - tracks[i].covariance = cov - - @staticmethod - def multi_gmc(stracks, H=np.eye(2, 3)): - if len(stracks) > 0: - multi_mean = 
np.asarray([st.mean.copy() for st in stracks]) - multi_covariance = np.asarray([st.covariance for st in stracks]) - - R = H[:2, :2] - R8x8 = np.kron(np.eye(4, dtype=float), R) - t = H[:2, 2] - - for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): - mean = R8x8.dot(mean) - mean[:2] += t - cov = R8x8.dot(cov).dot(R8x8.transpose()) - - stracks[i].mean = mean - stracks[i].covariance = cov - - def reset_track_id(self): - self.reset_track_count(self.cls_id) - - def activate(self, kalman_filter, frame_id): - """Start a new track""" - self.kalman_filter = kalman_filter - # update track id for the object class - self.track_id = self.next_id(self.cls_id) - self.mean, self.covariance = self.kalman_filter.initiate( - self.tlwh_to_xyah(self._tlwh)) - - self.track_len = 0 - self.state = TrackState.Tracked # set flag 'tracked' - - if frame_id == 1: # to record the first frame's detection result - self.is_activated = True - - self.frame_id = frame_id - self.start_frame = frame_id - - def re_activate(self, new_track, frame_id, new_id=False): - self.mean, self.covariance = self.kalman_filter.update( - self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh)) - if self.use_reid: - self.update_features(new_track.curr_feat) - self.track_len = 0 - self.state = TrackState.Tracked - self.is_activated = True - self.frame_id = frame_id - if new_id: # update track id for the object class - self.track_id = self.next_id(self.cls_id) - - def update(self, new_track, frame_id, update_feature=True): - self.frame_id = frame_id - self.track_len += 1 - - new_tlwh = new_track.tlwh - self.mean, self.covariance = self.kalman_filter.update( - self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) - self.state = TrackState.Tracked # set flag 'tracked' - self.is_activated = True # set flag 'activated' - - self.score = new_track.score - if update_feature and self.use_reid: - self.update_features(new_track.curr_feat) - - @property - def tlwh(self): - """Get current position in bounding box format `(top left x, top left y, - width, height)`. - """ - if self.mean is None: - return self._tlwh.copy() - - ret = self.mean[:4].copy() - ret[2] *= ret[3] - ret[:2] -= ret[2:] / 2 - return ret - - @property - def tlbr(self): - """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., - `(top left, bottom right)`. - """ - ret = self.tlwh.copy() - ret[2:] += ret[:2] - return ret - - @staticmethod - def tlwh_to_xyah(tlwh): - """Convert bounding box to format `(center x, center y, aspect ratio, - height)`, where the aspect ratio is `width / height`. 
- """ - ret = np.asarray(tlwh).copy() - ret[:2] += ret[2:] / 2 - ret[2] /= ret[3] - return ret - - def to_xyah(self): - return self.tlwh_to_xyah(self.tlwh) - - @staticmethod - def tlbr_to_tlwh(tlbr): - ret = np.asarray(tlbr).copy() - ret[2:] -= ret[:2] - return ret - - @staticmethod - def tlwh_to_tlbr(tlwh): - ret = np.asarray(tlwh).copy() - ret[2:] += ret[:2] - return ret - - def __repr__(self): - return 'OT_({}-{})_({}-{})'.format(self.cls_id, self.track_id, - self.start_frame, self.end_frame) - - -def joint_stracks(tlista, tlistb): - exists = {} - res = [] - for t in tlista: - exists[t.track_id] = 1 - res.append(t) - for t in tlistb: - tid = t.track_id - if not exists.get(tid, 0): - exists[tid] = 1 - res.append(t) - return res - - -def sub_stracks(tlista, tlistb): - stracks = {} - for t in tlista: - stracks[t.track_id] = t - for t in tlistb: - tid = t.track_id - if stracks.get(tid, 0): - del stracks[tid] - return list(stracks.values()) - - -def remove_duplicate_stracks(stracksa, stracksb): - pdist = matching.iou_distance(stracksa, stracksb) - pairs = np.where(pdist < 0.15) - dupa, dupb = list(), list() - for p, q in zip(*pairs): - timep = stracksa[p].frame_id - stracksa[p].start_frame - timeq = stracksb[q].frame_id - stracksb[q].start_frame - if timep > timeq: - dupb.append(q) - else: - dupa.append(p) - resa = [t for i, t in enumerate(stracksa) if not i in dupa] - resb = [t for i, t in enumerate(stracksb) if not i in dupb] - return resa, resb diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_sde_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_sde_tracker.py deleted file mode 100644 index accc201..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_sde_tracker.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py -""" - -import datetime -from ppdet.core.workspace import register, serializable - -__all__ = ['TrackState', 'Track'] - - -class TrackState(object): - """ - Enumeration type for the single target track state. Newly created tracks are - classified as `tentative` until enough evidence has been collected. Then, - the track state is changed to `confirmed`. Tracks that are no longer alive - are classified as `deleted` to mark them for removal from the set of active - tracks. - """ - Tentative = 1 - Confirmed = 2 - Deleted = 3 - - -@register -@serializable -class Track(object): - """ - A single target track with state space `(x, y, a, h)` and associated - velocities, where `(x, y)` is the center of the bounding box, `a` is the - aspect ratio and `h` is the height. - - Args: - mean (ndarray): Mean vector of the initial state distribution. - covariance (ndarray): Covariance matrix of the initial state distribution. - track_id (int): A unique track identifier. - n_init (int): Number of consecutive detections before the track is confirmed. 
- The track state is set to `Deleted` if a miss occurs within the first - `n_init` frames. - max_age (int): The maximum number of consecutive misses before the track - state is set to `Deleted`. - cls_id (int): The category id of the tracked box. - score (float): The confidence score of the tracked box. - feature (Optional[ndarray]): Feature vector of the detection this track - originates from. If not None, this feature is added to the `features` cache. - - Attributes: - hits (int): Total number of measurement updates. - age (int): Total number of frames since first occurrence. - time_since_update (int): Total number of frames since last measurement - update. - state (TrackState): The current track state. - features (List[ndarray]): A cache of features. On each measurement update, - the associated feature vector is added to this list. - """ - - def __init__(self, - mean, - covariance, - track_id, - n_init, - max_age, - cls_id, - score, - feature=None): - self.mean = mean - self.covariance = covariance - self.track_id = track_id - self.hits = 1 - self.age = 1 - self.time_since_update = 0 - self.cls_id = cls_id - self.score = score - self.start_time = datetime.datetime.now() - - self.state = TrackState.Tentative - self.features = [] - self.feat = feature - if feature is not None: - self.features.append(feature) - - self._n_init = n_init - self._max_age = max_age - - def to_tlwh(self): - """Get position in format `(top left x, top left y, width, height)`.""" - ret = self.mean[:4].copy() - ret[2] *= ret[3] - ret[:2] -= ret[2:] / 2 - return ret - - def to_tlbr(self): - """Get position in bounding box format `(min x, min y, max x, max y)`.""" - ret = self.to_tlwh() - ret[2:] = ret[:2] + ret[2:] - return ret - - def predict(self, kalman_filter): - """ - Propagate the state distribution to the current time step using a Kalman - filter prediction step. - """ - self.mean, self.covariance = kalman_filter.predict(self.mean, - self.covariance) - self.age += 1 - self.time_since_update += 1 - - def update(self, kalman_filter, detection): - """ - Perform Kalman filter measurement update step and update the associated - detection feature cache. - """ - self.mean, self.covariance = kalman_filter.update(self.mean, - self.covariance, - detection.to_xyah()) - self.features.append(detection.feature) - self.feat = detection.feature - self.cls_id = detection.cls_id - self.score = detection.score - - self.hits += 1 - self.time_since_update = 0 - if self.state == TrackState.Tentative and self.hits >= self._n_init: - self.state = TrackState.Confirmed - - def mark_missed(self): - """Mark this track as missed (no association at the current time step).
- """ - if self.state == TrackState.Tentative: - self.state = TrackState.Deleted - elif self.time_since_update > self._max_age: - self.state = TrackState.Deleted - - def is_tentative(self): - """Returns True if this track is tentative (unconfirmed).""" - return self.state == TrackState.Tentative - - def is_confirmed(self): - """Returns True if this track is confirmed.""" - return self.state == TrackState.Confirmed - - def is_deleted(self): - """Returns True if this track is dead and should be deleted.""" - return self.state == TrackState.Deleted diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/botsort_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/botsort_tracker.py deleted file mode 100644 index 4f412a7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/botsort_tracker.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/bot_sort.py -""" - -import cv2 -import matplotlib.pyplot as plt -import numpy as np -from collections import deque - -from ..matching import jde_matching as matching -from ..motion import GMC -from .base_jde_tracker import TrackState, STrack -from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks -from ..motion import KalmanFilter - -from ppdet.core.workspace import register, serializable - - -@register -@serializable -class BOTSORTTracker(object): - """ - BOTSORT tracker, support single class - - Args: - track_high_thresh (float): threshold of detection high score - track_low_thresh (float): threshold of remove detection score - new_track_thresh (float): threshold of new track score - match_thresh (float): iou threshold for associate - track_buffer (int): tracking reserved frames,default 30 - min_box_area (float): reserved min box - camera_motion (bool): Whether use camera motion, default False - cmc_method (str): camera motion method,defalut sparseOptFlow - frame_rate (int): fps buffer_size=int(frame_rate / 30.0 * track_buffer) - """ - - def __init__(self, - track_high_thresh=0.3, - track_low_thresh=0.2, - new_track_thresh=0.4, - match_thresh=0.7, - track_buffer=30, - min_box_area=0, - camera_motion=False, - cmc_method='sparseOptFlow', - frame_rate=30): - - self.tracked_stracks = [] # type: list[STrack] - self.lost_stracks = [] # type: list[STrack] - self.removed_stracks = [] # type: list[STrack] - - self.frame_id = 0 - - self.track_high_thresh = track_high_thresh - self.track_low_thresh = track_low_thresh - self.new_track_thresh = new_track_thresh - self.match_thresh = match_thresh - self.buffer_size = int(frame_rate / 30.0 * track_buffer) - self.max_time_lost = self.buffer_size - self.kalman_filter = KalmanFilter() - self.min_box_area = min_box_area - - self.camera_motion = camera_motion - self.gmc = GMC(method=cmc_method) - - def update(self, output_results, img=None): - self.frame_id += 1 - activated_starcks = [] - 
refind_stracks = [] - lost_stracks = [] - removed_stracks = [] - - if len(output_results): - bboxes = output_results[:, 2:6] - scores = output_results[:, 1] - classes = output_results[:, 0] - - # Remove bad detections - lowest_inds = scores > self.track_low_thresh - bboxes = bboxes[lowest_inds] - scores = scores[lowest_inds] - classes = classes[lowest_inds] - - # Find high threshold detections - remain_inds = scores > self.track_high_thresh - dets = bboxes[remain_inds] - scores_keep = scores[remain_inds] - classes_keep = classes[remain_inds] - - else: - bboxes = [] - scores = [] - classes = [] - dets = [] - scores_keep = [] - classes_keep = [] - - if len(dets) > 0: - '''Detections''' - detections = [ - STrack(STrack.tlbr_to_tlwh(tlbr), s, c) - for (tlbr, s, c) in zip(dets, scores_keep, classes_keep) - ] - else: - detections = [] - ''' Add newly detected tracklets to tracked_stracks''' - unconfirmed = [] - tracked_stracks = [] # type: list[STrack] - for track in self.tracked_stracks: - if not track.is_activated: - unconfirmed.append(track) - else: - tracked_stracks.append(track) - ''' Step 2: First association, with high score detection boxes''' - strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) - - # Predict the current location with KF - STrack.multi_predict(strack_pool, self.kalman_filter) - - # Fix camera motion - if self.camera_motion: - warp = self.gmc.apply(img[0], dets) - STrack.multi_gmc(strack_pool, warp) - STrack.multi_gmc(unconfirmed, warp) - - # Associate with high score detection boxes - ious_dists = matching.iou_distance(strack_pool, detections) - matches, u_track, u_detection = matching.linear_assignment( - ious_dists, thresh=self.match_thresh) - - for itracked, idet in matches: - track = strack_pool[itracked] - det = detections[idet] - if track.state == TrackState.Tracked: - track.update(detections[idet], self.frame_id) - activated_starcks.append(track) - else: - track.re_activate(det, self.frame_id, new_id=False) - refind_stracks.append(track) - ''' Step 3: Second association, with low score detection boxes''' - if len(scores): - inds_high = scores < self.track_high_thresh - inds_low = scores > self.track_low_thresh - inds_second = np.logical_and(inds_low, inds_high) - dets_second = bboxes[inds_second] - scores_second = scores[inds_second] - classes_second = classes[inds_second] - else: - dets_second = [] - scores_second = [] - classes_second = [] - - # association the untrack to the low score detections - if len(dets_second) > 0: - '''Detections''' - detections_second = [ - STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in - zip(dets_second, scores_second, classes_second) - ] - else: - detections_second = [] - - r_tracked_stracks = [ - strack_pool[i] for i in u_track - if strack_pool[i].state == TrackState.Tracked - ] - dists = matching.iou_distance(r_tracked_stracks, detections_second) - matches, u_track, u_detection_second = matching.linear_assignment( - dists, thresh=0.5) - for itracked, idet in matches: - track = r_tracked_stracks[itracked] - det = detections_second[idet] - if track.state == TrackState.Tracked: - track.update(det, self.frame_id) - activated_starcks.append(track) - else: - track.re_activate(det, self.frame_id, new_id=False) - refind_stracks.append(track) - - for it in u_track: - track = r_tracked_stracks[it] - if not track.state == TrackState.Lost: - track.mark_lost() - lost_stracks.append(track) - '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' - detections = [detections[i] for i in 
u_detection] - dists = matching.iou_distance(unconfirmed, detections) - - matches, u_unconfirmed, u_detection = matching.linear_assignment( - dists, thresh=0.7) - for itracked, idet in matches: - unconfirmed[itracked].update(detections[idet], self.frame_id) - activated_starcks.append(unconfirmed[itracked]) - for it in u_unconfirmed: - track = unconfirmed[it] - track.mark_removed() - removed_stracks.append(track) - """ Step 4: Init new stracks""" - for inew in u_detection: - track = detections[inew] - if track.score < self.new_track_thresh: - continue - - track.activate(self.kalman_filter, self.frame_id) - activated_starcks.append(track) - """ Step 5: Update state""" - for track in self.lost_stracks: - if self.frame_id - track.end_frame > self.max_time_lost: - track.mark_removed() - removed_stracks.append(track) - """ Merge """ - self.tracked_stracks = [ - t for t in self.tracked_stracks if t.state == TrackState.Tracked - ] - self.tracked_stracks = joint_stracks(self.tracked_stracks, - activated_starcks) - self.tracked_stracks = joint_stracks(self.tracked_stracks, - refind_stracks) - self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) - self.lost_stracks.extend(lost_stracks) - self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) - self.removed_stracks.extend(removed_stracks) - self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( - self.tracked_stracks, self.lost_stracks) - - # output_stracks = [track for track in self.tracked_stracks if track.is_activated] - output_stracks = [track for track in self.tracked_stracks] - - return output_stracks diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/center_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/center_tracker.py deleted file mode 100644 index 8005ddc..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/center_tracker.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
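Every association round in the deleted tracker reduces to the same primitive: build a cost matrix of 1 - IoU, solve a linear assignment, and reject matches whose cost exceeds a threshold. A self-contained sketch of that primitive, with scipy.optimize.linear_sum_assignment standing in for ppdet's jde_matching.linear_assignment (the real helper may use a different solver internally):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def iou_matrix(a, b):
        # pairwise IoU between [N, 4] and [M, 4] boxes in x0, y0, x1, y1
        tl = np.maximum(a[:, None, :2], b[None, :, :2])
        br = np.minimum(a[:, None, 2:], b[None, :, 2:])
        wh = np.clip(br - tl, 0, None)
        inter = wh[..., 0] * wh[..., 1]
        area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
        area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
        return inter / (area_a[:, None] + area_b[None, :] - inter + 1e-9)

    def assign(cost, thresh):
        rows, cols = linear_sum_assignment(cost)
        matches = [(int(r), int(c)) for r, c in zip(rows, cols)
                   if cost[r, c] <= thresh]
        u_rows = [r for r in range(cost.shape[0]) if r not in {m[0] for m in matches}]
        u_cols = [c for c in range(cost.shape[1]) if c not in {m[1] for m in matches}]
        return matches, u_rows, u_cols

    tracks = np.array([[10., 10, 50, 80]])
    dets = np.array([[12., 14, 48, 76], [200., 200, 240, 260]])
    cost = 1.0 - iou_matrix(tracks, dets)  # iou_distance = 1 - IoU
    print(assign(cost, thresh=0.7))        # ([(0, 0)], [], [1])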
-""" -This code is based on https://github.com/xingyizhou/CenterTrack/blob/master/src/lib/utils/tracker.py -""" - -import copy -import numpy as np -import sklearn - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['CenterTracker'] - - -@register -@serializable -class CenterTracker(object): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=1, - min_box_area=0, - vertical_ratio=-1, - track_thresh=0.4, - pre_thresh=0.5, - new_thresh=0.4, - out_thresh=0.4, - hungarian=False): - self.num_classes = num_classes - self.min_box_area = min_box_area - self.vertical_ratio = vertical_ratio - - self.track_thresh = track_thresh - self.pre_thresh = max(track_thresh, pre_thresh) - self.new_thresh = max(track_thresh, new_thresh) - self.out_thresh = max(track_thresh, out_thresh) - self.hungarian = hungarian - - self.reset() - - def init_track(self, results): - print('Initialize tracking!') - for item in results: - if item['score'] > self.new_thresh: - self.id_count += 1 - item['tracking_id'] = self.id_count - if not ('ct' in item): - bbox = item['bbox'] - item['ct'] = [(bbox[0] + bbox[2]) / 2, - (bbox[1] + bbox[3]) / 2] - self.tracks.append(item) - - def reset(self): - self.id_count = 0 - self.tracks = [] - - def update(self, results, public_det=None): - N = len(results) - M = len(self.tracks) - - dets = np.array([det['ct'] + det['tracking'] for det in results], - np.float32) # N x 2 - track_size = np.array([((track['bbox'][2] - track['bbox'][0]) * \ - (track['bbox'][3] - track['bbox'][1])) \ - for track in self.tracks], np.float32) # M - track_cat = np.array([track['class'] for track in self.tracks], - np.int32) # M - item_size = np.array([((item['bbox'][2] - item['bbox'][0]) * \ - (item['bbox'][3] - item['bbox'][1])) \ - for item in results], np.float32) # N - item_cat = np.array([item['class'] for item in results], np.int32) # N - tracks = np.array([pre_det['ct'] for pre_det in self.tracks], - np.float32) # M x 2 - dist = (((tracks.reshape(1, -1, 2) - \ - dets.reshape(-1, 1, 2)) ** 2).sum(axis=2)) # N x M - - invalid = ((dist > track_size.reshape(1, M)) + \ - (dist > item_size.reshape(N, 1)) + \ - (item_cat.reshape(N, 1) != track_cat.reshape(1, M))) > 0 - dist = dist + invalid * 1e18 - - if self.hungarian: - item_score = np.array([item['score'] for item in results], - np.float32) - dist[dist > 1e18] = 1e18 - from sklearn.utils.linear_assignment_ import linear_assignment - matched_indices = linear_assignment(dist) - else: - matched_indices = greedy_assignment(copy.deepcopy(dist)) - - unmatched_dets = [d for d in range(dets.shape[0]) \ - if not (d in matched_indices[:, 0])] - unmatched_tracks = [d for d in range(tracks.shape[0]) \ - if not (d in matched_indices[:, 1])] - - if self.hungarian: - matches = [] - for m in matched_indices: - if dist[m[0], m[1]] > 1e16: - unmatched_dets.append(m[0]) - unmatched_tracks.append(m[1]) - else: - matches.append(m) - matches = np.array(matches).reshape(-1, 2) - else: - matches = matched_indices - - ret = [] - for m in matches: - track = results[m[0]] - track['tracking_id'] = self.tracks[m[1]]['tracking_id'] - ret.append(track) - - # Private detection: create tracks for all un-matched detections - for i in unmatched_dets: - track = results[i] - if track['score'] > self.new_thresh: - self.id_count += 1 - track['tracking_id'] = self.id_count - ret.append(track) - - self.tracks = ret - return ret - - -def greedy_assignment(dist): - matched_indices = [] - if 
dist.shape[1] == 0: - return np.array(matched_indices, np.int32).reshape(-1, 2) - for i in range(dist.shape[0]): - j = dist[i].argmin() - if dist[i][j] < 1e16: - dist[:, j] = 1e18 - matched_indices.append([i, j]) - return np.array(matched_indices, np.int32).reshape(-1, 2) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/deepsort_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/deepsort_tracker.py deleted file mode 100644 index 9065dfe..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/deepsort_tracker.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py -""" - -import numpy as np - -from ..motion import KalmanFilter -from ..matching.deepsort_matching import NearestNeighborDistanceMetric -from ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix -from .base_sde_tracker import Track -from ..utils import Detection - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['DeepSORTTracker'] - - -@register -@serializable -class DeepSORTTracker(object): - """ - DeepSORT tracker - - Args: - input_size (list): input feature map size to reid model, [h, w] format, - [64, 192] as default. - min_box_area (int): min box area to filter out low quality boxes - vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results, set 1.6 default for pedestrian tracking. If set <=0 - means no need to filter bboxes. - budget (int): If not None, fix samples per class to at most this number. - Removes the oldest samples when the budget is reached. - max_age (int): maximum number of missed misses before a track is deleted - n_init (float): Number of frames that a track remains in initialization - phase. Number of consecutive detections before the track is confirmed. - The track state is set to `Deleted` if a miss occurs within the first - `n_init` frames. - metric_type (str): either "euclidean" or "cosine", the distance metric - used for measurement to track association. - matching_threshold (float): samples with larger distance are - considered an invalid match. 
- max_iou_distance (float): max iou distance threshold - motion (object): KalmanFilter instance - """ - - def __init__(self, - input_size=[64, 192], - min_box_area=0, - vertical_ratio=-1, - budget=100, - max_age=70, - n_init=3, - metric_type='cosine', - matching_threshold=0.2, - max_iou_distance=0.9, - motion='KalmanFilter'): - self.input_size = input_size - self.min_box_area = min_box_area - self.vertical_ratio = vertical_ratio - self.max_age = max_age - self.n_init = n_init - self.metric = NearestNeighborDistanceMetric(metric_type, - matching_threshold, budget) - self.max_iou_distance = max_iou_distance - if motion == 'KalmanFilter': - self.motion = KalmanFilter() - - self.tracks = [] - self._next_id = 1 - - def predict(self): - """ - Propagate track state distributions one time step forward. - This function should be called once every time step, before `update`. - """ - for track in self.tracks: - track.predict(self.motion) - - def update(self, pred_dets, pred_embs): - """ - Perform measurement update and track management. - Args: - pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'cls_id, score, x0, y0, x1, y1'. - pred_embs (np.array): Embedding results of the image, the shape is - [N, 128], usually pred_embs.shape[1] is a multiple of 128. - """ - pred_cls_ids = pred_dets[:, 0:1] - pred_scores = pred_dets[:, 1:2] - pred_xyxys = pred_dets[:, 2:6] - pred_tlwhs = np.concatenate((pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), axis=1) - - detections = [ - Detection(tlwh, score, feat, cls_id) - for tlwh, score, feat, cls_id in zip(pred_tlwhs, pred_scores, - pred_embs, pred_cls_ids) - ] - - # Run matching cascade. - matches, unmatched_tracks, unmatched_detections = \ - self._match(detections) - - # Update track set. - for track_idx, detection_idx in matches: - self.tracks[track_idx].update(self.motion, - detections[detection_idx]) - for track_idx in unmatched_tracks: - self.tracks[track_idx].mark_missed() - for detection_idx in unmatched_detections: - self._initiate_track(detections[detection_idx]) - self.tracks = [t for t in self.tracks if not t.is_deleted()] - - # Update distance metric. - active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] - features, targets = [], [] - for track in self.tracks: - if not track.is_confirmed(): - continue - features += track.features - targets += [track.track_id for _ in track.features] - track.features = [] - self.metric.partial_fit( - np.asarray(features), np.asarray(targets), active_targets) - output_stracks = self.tracks - return output_stracks - - def _match(self, detections): - def gated_metric(tracks, dets, track_indices, detection_indices): - features = np.array([dets[i].feature for i in detection_indices]) - targets = np.array([tracks[i].track_id for i in track_indices]) - cost_matrix = self.metric.distance(features, targets) - cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks, - dets, track_indices, - detection_indices) - return cost_matrix - - # Split track set into confirmed and unconfirmed tracks. - confirmed_tracks = [ - i for i, t in enumerate(self.tracks) if t.is_confirmed() - ] - unconfirmed_tracks = [ - i for i, t in enumerate(self.tracks) if not t.is_confirmed() - ] - - # Associate confirmed tracks using appearance features. 
- matches_a, unmatched_tracks_a, unmatched_detections = \ - matching_cascade( - gated_metric, self.metric.matching_threshold, self.max_age, - self.tracks, detections, confirmed_tracks) - - # Associate remaining tracks together with unconfirmed tracks using IOU. - iou_track_candidates = unconfirmed_tracks + [ - k for k in unmatched_tracks_a - if self.tracks[k].time_since_update == 1 - ] - unmatched_tracks_a = [ - k for k in unmatched_tracks_a - if self.tracks[k].time_since_update != 1 - ] - matches_b, unmatched_tracks_b, unmatched_detections = \ - min_cost_matching( - iou_cost, self.max_iou_distance, self.tracks, - detections, iou_track_candidates, unmatched_detections) - - matches = matches_a + matches_b - unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) - return matches, unmatched_tracks, unmatched_detections - - def _initiate_track(self, detection): - mean, covariance = self.motion.initiate(detection.to_xyah()) - self.tracks.append( - Track(mean, covariance, self._next_id, self.n_init, self.max_age, - detection.cls_id, detection.score, detection.feature)) - self._next_id += 1 diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/jde_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/jde_tracker.py deleted file mode 100644 index 9571a6b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/jde_tracker.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py -""" - -import numpy as np -from collections import defaultdict - -from ..matching import jde_matching as matching -from ..motion import KalmanFilter -from .base_jde_tracker import TrackState, STrack -from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['JDETracker'] - - -@register -@serializable -class JDETracker(object): - __shared__ = ['num_classes'] - """ - JDE tracker, support single class and multi classes - - Args: - use_byte (bool): Whether use ByteTracker, default False - num_classes (int): the number of classes - det_thresh (float): threshold of detection score - track_buffer (int): buffer for tracker - min_box_area (int): min box area to filter out low quality boxes - vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results. If set <= 0 means no need to filter bboxes,usually set - 1.6 for pedestrian tracking. 
- tracked_thresh (float): linear assignment threshold of tracked - stracks and detections - r_tracked_thresh (float): linear assignment threshold of - tracked stracks and unmatched detections - unconfirmed_thresh (float): linear assignment threshold of - unconfirmed stracks and unmatched detections - conf_thres (float): confidence threshold for tracking, also used in - ByteTracker as higher confidence threshold - match_thres (float): linear assignment threshold of tracked - stracks and detections in ByteTracker - low_conf_thres (float): lower confidence threshold for tracking in - ByteTracker - input_size (list): input feature map size to reid model, [h, w] format, - [64, 192] as default. - motion (str): motion model, KalmanFilter as default - metric_type (str): either "euclidean" or "cosine", the distance metric - used for measurement to track association. - """ - - def __init__(self, - use_byte=False, - num_classes=1, - det_thresh=0.3, - track_buffer=30, - min_box_area=0, - vertical_ratio=0, - tracked_thresh=0.7, - r_tracked_thresh=0.5, - unconfirmed_thresh=0.7, - conf_thres=0, - match_thres=0.8, - low_conf_thres=0.2, - input_size=[64, 192], - motion='KalmanFilter', - metric_type='euclidean'): - self.use_byte = use_byte - self.num_classes = num_classes - self.det_thresh = det_thresh if not use_byte else conf_thres + 0.1 - self.track_buffer = track_buffer - self.min_box_area = min_box_area - self.vertical_ratio = vertical_ratio - - self.tracked_thresh = tracked_thresh - self.r_tracked_thresh = r_tracked_thresh - self.unconfirmed_thresh = unconfirmed_thresh - self.conf_thres = conf_thres - self.match_thres = match_thres - self.low_conf_thres = low_conf_thres - - self.input_size = input_size - if motion == 'KalmanFilter': - self.motion = KalmanFilter() - self.metric_type = metric_type - - self.frame_id = 0 - self.tracked_tracks_dict = defaultdict(list) # dict(list[STrack]) - self.lost_tracks_dict = defaultdict(list) # dict(list[STrack]) - self.removed_tracks_dict = defaultdict(list) # dict(list[STrack]) - - self.max_time_lost = 0 - # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer) - - def update(self, pred_dets, pred_embs=None): - """ - Processes the image frame and finds bounding box(detections). - Associates the detection with corresponding tracklets and also handles - lost, removed, refound and active tracklets. - - Args: - pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'cls_id, score, x0, y0, x1, y1'. - pred_embs (np.array): Embedding results of the image, the shape is - [N, 128] or [N, 512]. - - Return: - output_stracks_dict (dict(list)): The list contains information - regarding the online_tracklets for the received image tensor. 
- """ - self.frame_id += 1 - if self.frame_id == 1: - STrack.init_count(self.num_classes) - activated_tracks_dict = defaultdict(list) - refined_tracks_dict = defaultdict(list) - lost_tracks_dict = defaultdict(list) - removed_tracks_dict = defaultdict(list) - output_tracks_dict = defaultdict(list) - - pred_dets_dict = defaultdict(list) - pred_embs_dict = defaultdict(list) - - # unify single and multi classes detection and embedding results - for cls_id in range(self.num_classes): - cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1) - pred_dets_dict[cls_id] = pred_dets[cls_idx] - if pred_embs is not None: - pred_embs_dict[cls_id] = pred_embs[cls_idx] - else: - pred_embs_dict[cls_id] = None - - for cls_id in range(self.num_classes): - """ Step 1: Get detections by class""" - pred_dets_cls = pred_dets_dict[cls_id] - pred_embs_cls = pred_embs_dict[cls_id] - remain_inds = (pred_dets_cls[:, 1:2] > self.conf_thres).squeeze(-1) - if remain_inds.sum() > 0: - pred_dets_cls = pred_dets_cls[remain_inds] - if pred_embs_cls is None: - # in original ByteTrack - detections = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), - tlbrs[1], - cls_id, - 30, - temp_feat=None) for tlbrs in pred_dets_cls - ] - else: - pred_embs_cls = pred_embs_cls[remain_inds] - detections = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, - 30, temp_feat) for (tlbrs, temp_feat) in - zip(pred_dets_cls, pred_embs_cls) - ] - else: - detections = [] - ''' Add newly detected tracklets to tracked_stracks''' - unconfirmed_dict = defaultdict(list) - tracked_tracks_dict = defaultdict(list) - for track in self.tracked_tracks_dict[cls_id]: - if not track.is_activated: - # previous tracks which are not active in the current frame are added in unconfirmed list - unconfirmed_dict[cls_id].append(track) - else: - # Active tracks are added to the local list 'tracked_stracks' - tracked_tracks_dict[cls_id].append(track) - """ Step 2: First association, with embedding""" - # building tracking pool for the current frame - track_pool_dict = defaultdict(list) - track_pool_dict[cls_id] = joint_stracks( - tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id]) - - # Predict the current location with KalmanFilter - STrack.multi_predict(track_pool_dict[cls_id], self.motion) - - if pred_embs_cls is None: - # in original ByteTrack - dists = matching.iou_distance(track_pool_dict[cls_id], - detections) - matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.match_thres) # not self.tracked_thresh - else: - dists = matching.embedding_distance( - track_pool_dict[cls_id], - detections, - metric=self.metric_type) - dists = matching.fuse_motion( - self.motion, dists, track_pool_dict[cls_id], detections) - matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.tracked_thresh) - - for i_tracked, idet in matches: - # i_tracked is the id of the track and idet is the detection - track = track_pool_dict[cls_id][i_tracked] - det = detections[idet] - if track.state == TrackState.Tracked: - # If the track is active, add the detection to the track - track.update(detections[idet], self.frame_id) - activated_tracks_dict[cls_id].append(track) - else: - # We have obtained a detection from a track which is not active, - # hence put the track in refind_stracks list - track.re_activate(det, self.frame_id, new_id=False) - refined_tracks_dict[cls_id].append(track) - - # None of the steps below happen if there are no undetected tracks. 
- """ Step 3: Second association, with IOU""" - if self.use_byte: - inds_low = pred_dets_dict[cls_id][:, 1:2] > self.low_conf_thres - inds_high = pred_dets_dict[cls_id][:, 1:2] < self.conf_thres - inds_second = np.logical_and(inds_low, inds_high).squeeze(-1) - pred_dets_cls_second = pred_dets_dict[cls_id][inds_second] - - # association the untrack to the low score detections - if len(pred_dets_cls_second) > 0: - if pred_embs_dict[cls_id] is None: - # in original ByteTrack - detections_second = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), - tlbrs[1], - cls_id, - 30, - temp_feat=None) - for tlbrs in pred_dets_cls_second - ] - else: - pred_embs_cls_second = pred_embs_dict[cls_id][ - inds_second] - detections_second = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], - cls_id, 30, temp_feat) for (tlbrs, temp_feat) in - zip(pred_dets_cls_second, pred_embs_cls_second) - ] - else: - detections_second = [] - r_tracked_stracks = [ - track_pool_dict[cls_id][i] for i in u_track - if track_pool_dict[cls_id][i].state == TrackState.Tracked - ] - dists = matching.iou_distance(r_tracked_stracks, - detections_second) - matches, u_track, u_detection_second = matching.linear_assignment( - dists, thresh=0.4) # not r_tracked_thresh - else: - detections = [detections[i] for i in u_detection] - r_tracked_stracks = [] - for i in u_track: - if track_pool_dict[cls_id][i].state == TrackState.Tracked: - r_tracked_stracks.append(track_pool_dict[cls_id][i]) - dists = matching.iou_distance(r_tracked_stracks, detections) - - matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.r_tracked_thresh) - - for i_tracked, idet in matches: - track = r_tracked_stracks[i_tracked] - det = detections[ - idet] if not self.use_byte else detections_second[idet] - if track.state == TrackState.Tracked: - track.update(det, self.frame_id) - activated_tracks_dict[cls_id].append(track) - else: - track.re_activate(det, self.frame_id, new_id=False) - refined_tracks_dict[cls_id].append(track) - - for it in u_track: - track = r_tracked_stracks[it] - if not track.state == TrackState.Lost: - track.mark_lost() - lost_tracks_dict[cls_id].append(track) - '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' - detections = [detections[i] for i in u_detection] - dists = matching.iou_distance(unconfirmed_dict[cls_id], detections) - matches, u_unconfirmed, u_detection = matching.linear_assignment( - dists, thresh=self.unconfirmed_thresh) - for i_tracked, idet in matches: - unconfirmed_dict[cls_id][i_tracked].update(detections[idet], - self.frame_id) - activated_tracks_dict[cls_id].append(unconfirmed_dict[cls_id][ - i_tracked]) - for it in u_unconfirmed: - track = unconfirmed_dict[cls_id][it] - track.mark_removed() - removed_tracks_dict[cls_id].append(track) - """ Step 4: Init new stracks""" - for inew in u_detection: - track = detections[inew] - if track.score < self.det_thresh: - continue - track.activate(self.motion, self.frame_id) - activated_tracks_dict[cls_id].append(track) - """ Step 5: Update state""" - for track in self.lost_tracks_dict[cls_id]: - if self.frame_id - track.end_frame > self.max_time_lost: - track.mark_removed() - removed_tracks_dict[cls_id].append(track) - - self.tracked_tracks_dict[cls_id] = [ - t for t in self.tracked_tracks_dict[cls_id] - if t.state == TrackState.Tracked - ] - self.tracked_tracks_dict[cls_id] = joint_stracks( - self.tracked_tracks_dict[cls_id], activated_tracks_dict[cls_id]) - self.tracked_tracks_dict[cls_id] = joint_stracks( - 
self.tracked_tracks_dict[cls_id], refined_tracks_dict[cls_id]) - self.lost_tracks_dict[cls_id] = sub_stracks( - self.lost_tracks_dict[cls_id], self.tracked_tracks_dict[cls_id]) - self.lost_tracks_dict[cls_id].extend(lost_tracks_dict[cls_id]) - self.lost_tracks_dict[cls_id] = sub_stracks( - self.lost_tracks_dict[cls_id], self.removed_tracks_dict[cls_id]) - self.removed_tracks_dict[cls_id].extend(removed_tracks_dict[cls_id]) - self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[ - cls_id] = remove_duplicate_stracks( - self.tracked_tracks_dict[cls_id], - self.lost_tracks_dict[cls_id]) - - # get scores of lost tracks - output_tracks_dict[cls_id] = [ - track for track in self.tracked_tracks_dict[cls_id] - if track.is_activated - ] - - logger.debug('===========Frame {}=========='.format(self.frame_id)) - logger.debug('Activated: {}'.format( - [track.track_id for track in activated_tracks_dict[cls_id]])) - logger.debug('Refind: {}'.format( - [track.track_id for track in refined_tracks_dict[cls_id]])) - logger.debug('Lost: {}'.format( - [track.track_id for track in lost_tracks_dict[cls_id]])) - logger.debug('Removed: {}'.format( - [track.track_id for track in removed_tracks_dict[cls_id]])) - - return output_tracks_dict diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/ocsort_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/ocsort_tracker.py deleted file mode 100644 index 49b44e3..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/ocsort_tracker.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/ocsort.py -""" - -import numpy as np -from ..matching.ocsort_matching import associate, linear_assignment, iou_batch, associate_only_iou -from ..motion.ocsort_kalman_filter import OCSORTKalmanFilter -from ppdet.core.workspace import register, serializable - - -def k_previous_obs(observations, cur_age, k): - if len(observations) == 0: - return [-1, -1, -1, -1, -1] - for i in range(k): - dt = k - i - if cur_age - dt in observations: - return observations[cur_age - dt] - max_age = max(observations.keys()) - return observations[max_age] - - -def convert_bbox_to_z(bbox): - """ - Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form - [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is - the aspect ratio - """ - w = bbox[2] - bbox[0] - h = bbox[3] - bbox[1] - x = bbox[0] + w / 2. - y = bbox[1] + h / 2. 
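k_previous_obs, defined above, walks back dt = k, k-1, ..., 1 frames and returns the first stored observation it finds, i.e. the one closest to k frames old; only when none of those ages exist does it fall back to the newest observation. A small trace with placeholder values:

    observations = {3: 'obs@age3', 5: 'obs@age5'}   # age -> observation (placeholders)
    cur_age, k = 6, 3

    result = None
    for i in range(k):
        dt = k - i                                  # dt = 3, 2, 1
        if cur_age - dt in observations:
            result = observations[cur_age - dt]
            break
    if result is None:                              # nothing in the window: take newest
        result = observations[max(observations)]
    print(result)                                   # obs@age3 (dt = 3 is checked first)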
- s = w * h # scale is just area - r = w / float(h + 1e-6) - return np.array([x, y, s, r]).reshape((4, 1)) - - -def convert_x_to_bbox(x, score=None): - """ - Takes a bounding box in the centre form [x,y,s,r] and returns it in the form - [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right - """ - w = np.sqrt(x[2] * x[3]) - h = x[2] / w - if (score == None): - return np.array( - [x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., - x[1] + h / 2.]).reshape((1, 4)) - else: - score = np.array([score]) - return np.array([ - x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score - ]).reshape((1, 5)) - - -def speed_direction(bbox1, bbox2): - cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 - cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 - speed = np.array([cy2 - cy1, cx2 - cx1]) - norm = np.sqrt((cy2 - cy1)**2 + (cx2 - cx1)**2) + 1e-6 - return speed / norm - - -class KalmanBoxTracker(object): - """ - This class represents the internal state of individual tracked objects observed as bbox. - - Args: - bbox (np.array): bbox in [x1,y1,x2,y2,score] format. - delta_t (int): delta_t of previous observation - """ - count = 0 - - def __init__(self, bbox, delta_t=3): - - self.kf = OCSORTKalmanFilter(dim_x=7, dim_z=4) - self.kf.F = np.array([[1., 0, 0, 0, 1., 0, 0], [0, 1., 0, 0, 0, 1., 0], - [0, 0, 1., 0, 0, 0, 1], [0, 0, 0, 1., 0, 0, 0], - [0, 0, 0, 0, 1., 0, 0], [0, 0, 0, 0, 0, 1., 0], - [0, 0, 0, 0, 0, 0, 1.]]) - self.kf.H = np.array([[1., 0, 0, 0, 0, 0, 0], [0, 1., 0, 0, 0, 0, 0], - [0, 0, 1., 0, 0, 0, 0], [0, 0, 0, 1., 0, 0, 0]]) - self.kf.R[2:, 2:] *= 10. - self.kf.P[4:, 4:] *= 1000. - # give high uncertainty to the unobservable initial velocities - self.kf.P *= 10. - self.kf.Q[-1, -1] *= 0.01 - self.kf.Q[4:, 4:] *= 0.01 - - self.score = bbox[4] - self.kf.x[:4] = convert_bbox_to_z(bbox) - self.time_since_update = 0 - self.id = KalmanBoxTracker.count - KalmanBoxTracker.count += 1 - self.history = [] - self.hits = 0 - self.hit_streak = 0 - self.age = 0 - """ - NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same for the return of - function k_previous_obs. It is ugly and I do not like it. But to support generate observation array in a - fast and unified way, which you would see below k_observations = np.array([k_previous_obs(...]]), let's bear it for now. - """ - self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder - self.observations = dict() - self.history_observations = [] - self.velocity = None - self.delta_t = delta_t - - def update(self, bbox, angle_cost=False): - """ - Updates the state vector with observed bbox. - """ - if bbox is not None: - if angle_cost and self.last_observation.sum( - ) >= 0: # no previous observation - previous_box = None - for i in range(self.delta_t): - dt = self.delta_t - i - if self.age - dt in self.observations: - previous_box = self.observations[self.age - dt] - break - if previous_box is None: - previous_box = self.last_observation - """ - Estimate the track speed direction with observations \Delta t steps away - """ - self.velocity = speed_direction(previous_box, bbox) - """ - Insert new observations. This is a ugly way to maintain both self.observations - and self.history_observations. Bear it for the moment. 
- """ - self.last_observation = bbox - self.observations[self.age] = bbox - self.history_observations.append(bbox) - - self.time_since_update = 0 - self.history = [] - self.hits += 1 - self.hit_streak += 1 - self.kf.update(convert_bbox_to_z(bbox)) - else: - self.kf.update(bbox) - - def predict(self): - """ - Advances the state vector and returns the predicted bounding box estimate. - """ - if ((self.kf.x[6] + self.kf.x[2]) <= 0): - self.kf.x[6] *= 0.0 - - self.kf.predict() - self.age += 1 - if (self.time_since_update > 0): - self.hit_streak = 0 - self.time_since_update += 1 - self.history.append(convert_x_to_bbox(self.kf.x, score=self.score)) - return self.history[-1] - - def get_state(self): - return convert_x_to_bbox(self.kf.x, score=self.score) - - -@register -@serializable -class OCSORTTracker(object): - """ - OCSORT tracker, support single class - - Args: - det_thresh (float): threshold of detection score - max_age (int): maximum number of missed misses before a track is deleted - min_hits (int): minimum hits for associate - iou_threshold (float): iou threshold for associate - delta_t (int): delta_t of previous observation - inertia (float): vdc_weight of angle_diff_cost for associate - vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results. If set <= 0 means no need to filter bboxes,usually set - 1.6 for pedestrian tracking. - min_box_area (int): min box area to filter out low quality boxes - use_byte (bool): Whether use ByteTracker, default False - """ - - def __init__(self, - det_thresh=0.6, - max_age=30, - min_hits=3, - iou_threshold=0.3, - delta_t=3, - inertia=0.2, - vertical_ratio=-1, - min_box_area=0, - use_byte=False, - use_angle_cost=False): - self.det_thresh = det_thresh - self.max_age = max_age - self.min_hits = min_hits - self.iou_threshold = iou_threshold - self.delta_t = delta_t - self.inertia = inertia - self.vertical_ratio = vertical_ratio - self.min_box_area = min_box_area - self.use_byte = use_byte - self.use_angle_cost = use_angle_cost - - self.trackers = [] - self.frame_count = 0 - KalmanBoxTracker.count = 0 - - def update(self, pred_dets, pred_embs=None): - """ - Args: - pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'cls_id, score, x0, y0, x1, y1'. - pred_embs (np.array): Embedding results of the image, the shape is - [N, 128] or [N, 512], default as None. - - Return: - tracking boxes (np.array): [M, 6], means 'x0, y0, x1, y1, score, id'. - """ - if pred_dets is None: - return np.empty((0, 6)) - - self.frame_count += 1 - - bboxes = pred_dets[:, 2:] - scores = pred_dets[:, 1:2] - dets = np.concatenate((bboxes, scores), axis=1) - scores = scores.squeeze(-1) - - inds_low = scores > 0.1 - inds_high = scores < self.det_thresh - inds_second = np.logical_and(inds_low, inds_high) - # self.det_thresh > score > 0.1, for second matching - dets_second = dets[inds_second] # detections for second matching - remain_inds = scores > self.det_thresh - dets = dets[remain_inds] - - # get predicted locations from existing trackers. 
- trks = np.zeros((len(self.trackers), 5)) - to_del = [] - ret = [] - for t, trk in enumerate(trks): - pos = self.trackers[t].predict()[0] - trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] - if np.any(np.isnan(pos)): - to_del.append(t) - trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) - for t in reversed(to_del): - self.trackers.pop(t) - - if self.use_angle_cost: - velocities = np.array([ - trk.velocity if trk.velocity is not None else np.array((0, 0)) - for trk in self.trackers - ]) - - k_observations = np.array([ - k_previous_obs(trk.observations, trk.age, self.delta_t) - for trk in self.trackers - ]) - last_boxes = np.array([trk.last_observation for trk in self.trackers]) - """ - First round of association - """ - if self.use_angle_cost: - matched, unmatched_dets, unmatched_trks = associate( - dets, trks, self.iou_threshold, velocities, k_observations, - self.inertia) - else: - matched, unmatched_dets, unmatched_trks = associate_only_iou( - dets, trks, self.iou_threshold) - - for m in matched: - self.trackers[m[1]].update( - dets[m[0], :], angle_cost=self.use_angle_cost) - """ - Second round of associaton by OCR - """ - # BYTE association - if self.use_byte and len(dets_second) > 0 and unmatched_trks.shape[ - 0] > 0: - u_trks = trks[unmatched_trks] - iou_left = iou_batch( - dets_second, - u_trks) # iou between low score detections and unmatched tracks - iou_left = np.array(iou_left) - if iou_left.max() > self.iou_threshold: - """ - NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may - get a higher performance especially on MOT17/MOT20 datasets. But we keep it - uniform here for simplicity - """ - matched_indices = linear_assignment(-iou_left) - to_remove_trk_indices = [] - for m in matched_indices: - det_ind, trk_ind = m[0], unmatched_trks[m[1]] - if iou_left[m[0], m[1]] < self.iou_threshold: - continue - self.trackers[trk_ind].update( - dets_second[det_ind, :], angle_cost=self.use_angle_cost) - to_remove_trk_indices.append(trk_ind) - unmatched_trks = np.setdiff1d(unmatched_trks, - np.array(to_remove_trk_indices)) - - if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0: - left_dets = dets[unmatched_dets] - left_trks = last_boxes[unmatched_trks] - iou_left = iou_batch(left_dets, left_trks) - iou_left = np.array(iou_left) - if iou_left.max() > self.iou_threshold: - """ - NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may - get a higher performance especially on MOT17/MOT20 datasets. 
But we keep it - uniform here for simplicity - """ - rematched_indices = linear_assignment(-iou_left) - to_remove_det_indices = [] - to_remove_trk_indices = [] - for m in rematched_indices: - det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[ - 1]] - if iou_left[m[0], m[1]] < self.iou_threshold: - continue - self.trackers[trk_ind].update( - dets[det_ind, :], angle_cost=self.use_angle_cost) - to_remove_det_indices.append(det_ind) - to_remove_trk_indices.append(trk_ind) - unmatched_dets = np.setdiff1d(unmatched_dets, - np.array(to_remove_det_indices)) - unmatched_trks = np.setdiff1d(unmatched_trks, - np.array(to_remove_trk_indices)) - - for m in unmatched_trks: - self.trackers[m].update(None) - - # create and initialise new trackers for unmatched detections - for i in unmatched_dets: - trk = KalmanBoxTracker(dets[i, :], delta_t=self.delta_t) - self.trackers.append(trk) - - i = len(self.trackers) - for trk in reversed(self.trackers): - if trk.last_observation.sum() < 0: - d = trk.get_state()[0] - else: - d = trk.last_observation # tlbr + score - if (trk.time_since_update < 1) and ( - trk.hit_streak >= self.min_hits or - self.frame_count <= self.min_hits): - # +1 as MOT benchmark requires positive - ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) - i -= 1 - # remove dead tracklet - if (trk.time_since_update > self.max_age): - self.trackers.pop(i) - if (len(ret) > 0): - return np.concatenate(ret) - return np.empty((0, 6)) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/utils.py b/pdfdet/models/Paddle/ppdet/modeling/mot/utils.py deleted file mode 100644 index f19b0d9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/utils.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import cv2 -import time -import numpy as np -from .visualization import plot_tracking_dict, plot_tracking - -__all__ = [ - 'MOTTimer', - 'Detection', - 'write_mot_results', - 'save_vis_results', - 'load_det_results', - 'preprocess_reid', - 'get_crops', - 'clip_box', - 'scale_coords', -] - - -class MOTTimer(object): - """ - This class used to compute and print the current FPS while evaling. - """ - - def __init__(self): - self.total_time = 0. - self.calls = 0 - self.start_time = 0. - self.diff = 0. - self.average_time = 0. - self.duration = 0. - - def tic(self): - # using time.time instead of time.clock because time time.clock - # does not normalize for multithreading - self.start_time = time.time() - - def toc(self, average=True): - self.diff = time.time() - self.start_time - self.total_time += self.diff - self.calls += 1 - self.average_time = self.total_time / self.calls - if average: - self.duration = self.average_time - else: - self.duration = self.diff - return self.duration - - def clear(self): - self.total_time = 0. - self.calls = 0 - self.start_time = 0. - self.diff = 0. - self.average_time = 0. - self.duration = 0. 
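MOTTimer above reports FPS from a running average rather than the last frame, which smooths per-frame jitter during evaluation. A trimmed, runnable copy to show the tic/toc pattern:

    import time

    class MOTTimer:
        # trimmed copy of the helper above, just enough for a demo
        def __init__(self):
            self.total_time, self.calls, self.average_time = 0., 0, 0.

        def tic(self):
            self.start_time = time.time()

        def toc(self):
            self.total_time += time.time() - self.start_time
            self.calls += 1
            self.average_time = self.total_time / self.calls

    timer = MOTTimer()
    for _ in range(3):
        timer.tic()
        time.sleep(0.01)    # stand-in for one detect + track step
        timer.toc()
    print('fps ~= %.1f' % (1.0 / timer.average_time))   # close to 100 here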
- - -class Detection(object): - """ - This class represents a bounding box detection in a single image. - - Args: - tlwh (Tensor): Bounding box in format `(top left x, top left y, - width, height)`. - score (Tensor): Bounding box confidence score. - feature (Tensor): A feature vector that describes the object - contained in this image. - cls_id (Tensor): Bounding box category id. - """ - - def __init__(self, tlwh, score, feature, cls_id): - self.tlwh = np.asarray(tlwh, dtype=np.float32) - self.score = float(score) - self.feature = np.asarray(feature, dtype=np.float32) - self.cls_id = int(cls_id) - - def to_tlbr(self): - """ - Convert bounding box to format `(min x, min y, max x, max y)`, i.e., - `(top left, bottom right)`. - """ - ret = self.tlwh.copy() - ret[2:] += ret[:2] - return ret - - def to_xyah(self): - """ - Convert bounding box to format `(center x, center y, aspect ratio, - height)`, where the aspect ratio is `width / height`. - """ - ret = self.tlwh.copy() - ret[:2] += ret[2:] / 2 - ret[2] /= ret[3] - return ret - - -def write_mot_results(filename, results, data_type='mot', num_classes=1): - # support single and multi classes - if data_type in ['mot', 'mcmot']: - save_format = '{frame},{id},{x1},{y1},{w},{h},{score},{cls_id},-1,-1\n' - elif data_type == 'kitti': - save_format = '{frame} {id} car 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' - else: - raise ValueError(data_type) - - f = open(filename, 'w') - for cls_id in range(num_classes): - for frame_id, tlwhs, tscores, track_ids in results[cls_id]: - if data_type == 'kitti': - frame_id -= 1 - for tlwh, score, track_id in zip(tlwhs, tscores, track_ids): - if track_id < 0: continue - if data_type == 'mot': - cls_id = -1 - - x1, y1, w, h = tlwh - x2, y2 = x1 + w, y1 + h - line = save_format.format( - frame=frame_id, - id=track_id, - x1=x1, - y1=y1, - x2=x2, - y2=y2, - w=w, - h=h, - score=score, - cls_id=cls_id) - f.write(line) - print('MOT results save in {}'.format(filename)) - - -def save_vis_results(data, - frame_id, - online_ids, - online_tlwhs, - online_scores, - average_time, - show_image, - save_dir, - num_classes=1, - ids2names=[]): - if show_image or save_dir is not None: - assert 'ori_image' in data - img0 = data['ori_image'].numpy()[0] - if online_ids is None: - online_im = img0 - else: - if isinstance(online_tlwhs, dict): - online_im = plot_tracking_dict( - img0, - num_classes, - online_tlwhs, - online_ids, - online_scores, - frame_id=frame_id, - fps=1. / average_time, - ids2names=ids2names) - else: - online_im = plot_tracking( - img0, - online_tlwhs, - online_ids, - online_scores, - frame_id=frame_id, - fps=1. 
/ average_time, - ids2names=ids2names) - if show_image: - cv2.imshow('online_im', online_im) - if save_dir is not None: - cv2.imwrite( - os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im) - - -def load_det_results(det_file, num_frames): - assert os.path.exists(det_file) and os.path.isfile(det_file), \ - '{} is not exist or not a file.'.format(det_file) - labels = np.loadtxt(det_file, dtype='float32', delimiter=',') - assert labels.shape[1] == 7, \ - "Each line of {} should have 7 items: '[frame_id],[x0],[y0],[w],[h],[score],[class_id]'.".format(det_file) - results_list = [] - for frame_i in range(num_frames): - results = {'bbox': [], 'score': [], 'cls_id': []} - lables_with_frame = labels[labels[:, 0] == frame_i + 1] - # each line of lables_with_frame: - # [frame_id],[x0],[y0],[w],[h],[score],[class_id] - for l in lables_with_frame: - results['bbox'].append(l[1:5]) - results['score'].append(l[5:6]) - results['cls_id'].append(l[6:7]) - results_list.append(results) - return results_list - - -def scale_coords(coords, input_shape, im_shape, scale_factor): - # Note: ratio has only one value, scale_factor[0] == scale_factor[1] - # - # This function only used for JDE YOLOv3 or other detectors with - # LetterBoxResize and JDEBBoxPostProcess, coords output from detector had - # not scaled back to the origin image. - - ratio = scale_factor[0] - pad_w = (input_shape[1] - int(im_shape[1])) / 2 - pad_h = (input_shape[0] - int(im_shape[0])) / 2 - coords[:, 0::2] -= pad_w - coords[:, 1::2] -= pad_h - coords[:, 0:4] /= ratio - coords[:, :4] = np.clip(coords[:, :4], a_min=0, a_max=coords[:, :4].max()) - return coords.round() - - -def clip_box(xyxy, ori_image_shape): - H, W = ori_image_shape - xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], a_min=0, a_max=W) - xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], a_min=0, a_max=H) - w = xyxy[:, 2:3] - xyxy[:, 0:1] - h = xyxy[:, 3:4] - xyxy[:, 1:2] - mask = np.logical_and(h > 0, w > 0) - keep_idx = np.nonzero(mask) - return xyxy[keep_idx[0]], keep_idx - - -def get_crops(xyxy, ori_img, w, h): - crops = [] - xyxy = xyxy.astype(np.int64) - ori_img = ori_img.numpy() - ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2) # [h,w,3]->[w,h,3] - for i, bbox in enumerate(xyxy): - crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :] - crops.append(crop) - crops = preprocess_reid(crops, w, h) - return crops - - -def preprocess_reid(imgs, - w=64, - h=192, - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]): - im_batch = [] - for img in imgs: - img = cv2.resize(img, (w, h)) - img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 - img_mean = np.array(mean).reshape((3, 1, 1)) - img_std = np.array(std).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - img = np.expand_dims(img, axis=0) - im_batch.append(img) - im_batch = np.concatenate(im_batch, 0) - return im_batch diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/visualization.py b/pdfdet/models/Paddle/ppdet/modeling/mot/visualization.py deleted file mode 100644 index 6d13a28..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/visualization.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cv2 -import numpy as np - - -def get_color(idx): - idx = idx * 3 - color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) - return color - - -def plot_tracking(image, - tlwhs, - obj_ids, - scores=None, - frame_id=0, - fps=0., - ids2names=[]): - im = np.ascontiguousarray(np.copy(image)) - im_h, im_w = im.shape[:2] - - top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 - - text_scale = max(1, image.shape[1] / 1600.) - text_thickness = 2 - line_thickness = max(1, int(image.shape[1] / 500.)) - - radius = max(5, int(im_w / 140.)) - cv2.putText( - im, - 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), - (0, int(15 * text_scale)), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=2) - - for i, tlwh in enumerate(tlwhs): - x1, y1, w, h = tlwh - intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) - obj_id = int(obj_ids[i]) - id_text = '{}'.format(int(obj_id)) - if ids2names != []: - assert len( - ids2names) == 1, "plot_tracking only supports single classes." - id_text = '{}_'.format(ids2names[0]) + id_text - _line_thickness = 1 if obj_id <= 0 else line_thickness - color = get_color(abs(obj_id)) - cv2.rectangle( - im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) - cv2.putText( - im, - id_text, (intbox[0], intbox[1] - 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=text_thickness) - - if scores is not None: - text = '{:.2f}'.format(float(scores[i])) - cv2.putText( - im, - text, (intbox[0], intbox[1] + 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 255, 255), - thickness=text_thickness) - return im - - -def plot_tracking_dict(image, - num_classes, - tlwhs_dict, - obj_ids_dict, - scores_dict, - frame_id=0, - fps=0., - ids2names=[]): - im = np.ascontiguousarray(np.copy(image)) - im_h, im_w = im.shape[:2] - - top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 - - text_scale = max(1, image.shape[1] / 1600.) 
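get_color above hashes a track id into a fixed BGR tuple, so a given track keeps the same color in every frame without any stored state:

    def get_color(idx):
        idx = idx * 3
        return ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)

    # the same id always maps to the same BGR tuple
    print(get_color(1), get_color(2), get_color(1))
    # (111, 51, 87) (222, 102, 174) (111, 51, 87)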
- text_thickness = 2 - line_thickness = max(1, int(image.shape[1] / 500.)) - - radius = max(5, int(im_w / 140.)) - - for cls_id in range(num_classes): - tlwhs = tlwhs_dict[cls_id] - obj_ids = obj_ids_dict[cls_id] - scores = scores_dict[cls_id] - cv2.putText( - im, - 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), - (0, int(15 * text_scale)), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=2) - - for i, tlwh in enumerate(tlwhs): - x1, y1, w, h = tlwh - intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) - obj_id = int(obj_ids[i]) - - id_text = '{}'.format(int(obj_id)) - if ids2names != []: - id_text = '{}_{}'.format(ids2names[cls_id], id_text) - else: - id_text = 'class{}_{}'.format(cls_id, id_text) - - _line_thickness = 1 if obj_id <= 0 else line_thickness - color = get_color(abs(obj_id)) - cv2.rectangle( - im, - intbox[0:2], - intbox[2:4], - color=color, - thickness=line_thickness) - cv2.putText( - im, - id_text, (intbox[0], intbox[1] - 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=text_thickness) - - if scores is not None: - text = '{:.2f}'.format(float(scores[i])) - cv2.putText( - im, - text, (intbox[0], intbox[1] + 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 255, 255), - thickness=text_thickness) - return im diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/necks/__init__.py deleted file mode 100644 index afd2a95..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import fpn -from . import yolo_fpn -from . import hrfpn -from . import ttf_fpn -from . import centernet_fpn -from . import bifpn -from . import csp_pan -from . import es_pan -from . import lc_pan -from . import custom_pan -from . import dilated_encoder -from . import clrnet_fpn - -from .fpn import * -from .yolo_fpn import * -from .hrfpn import * -from .ttf_fpn import * -from .centernet_fpn import * -from .blazeface_fpn import * -from .bifpn import * -from .csp_pan import * -from .es_pan import * -from .lc_pan import * -from .custom_pan import * -from .dilated_encoder import * -from .channel_mapper import * -from .clrnet_fpn import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/bifpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/bifpn.py deleted file mode 100644 index 9e794b8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/bifpn.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ..shape_spec import ShapeSpec - -__all__ = ['BiFPN'] - - -class SeparableConvLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels=None, - kernel_size=3, - norm_type='bn', - norm_groups=32, - act='swish'): - super(SeparableConvLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn', None] - assert act in ['swish', 'relu', None] - - self.in_channels = in_channels - if out_channels is None: - self.out_channels = self.in_channels - self.norm_type = norm_type - self.norm_groups = norm_groups - self.depthwise_conv = nn.Conv2D( - in_channels, - in_channels, - kernel_size, - padding=kernel_size // 2, - groups=in_channels, - bias_attr=False) - self.pointwise_conv = nn.Conv2D(in_channels, self.out_channels, 1) - - # norm type - if self.norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D(self.out_channels) - elif self.norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=self.norm_groups, num_channels=self.out_channels) - - # activation - if act == 'swish': - self.act = nn.Swish() - elif act == 'relu': - self.act = nn.ReLU() - - def forward(self, x): - if self.act is not None: - x = self.act(x) - out = self.depthwise_conv(x) - out = self.pointwise_conv(out) - if self.norm_type is not None: - out = self.norm(out) - return out - - -class BiFPNCell(nn.Layer): - def __init__(self, - channels=256, - num_levels=5, - eps=1e-5, - use_weighted_fusion=True, - kernel_size=3, - norm_type='bn', - norm_groups=32, - act='swish'): - super(BiFPNCell, self).__init__() - self.channels = channels - self.num_levels = num_levels - self.eps = eps - self.use_weighted_fusion = use_weighted_fusion - - # up - self.conv_up = nn.LayerList([ - SeparableConvLayer( - self.channels, - kernel_size=kernel_size, - norm_type=norm_type, - norm_groups=norm_groups, - act=act) for _ in range(self.num_levels - 1) - ]) - # down - self.conv_down = nn.LayerList([ - SeparableConvLayer( - self.channels, - kernel_size=kernel_size, - norm_type=norm_type, - norm_groups=norm_groups, - act=act) for _ in range(self.num_levels - 1) - ]) - - if self.use_weighted_fusion: - self.up_weights = self.create_parameter( - shape=[self.num_levels - 1, 2], - attr=ParamAttr(initializer=Constant(1.))) - self.down_weights = self.create_parameter( - shape=[self.num_levels - 1, 3], - attr=ParamAttr(initializer=Constant(1.))) - - def _feature_fusion_cell(self, - conv_layer, - lateral_feat, - sampling_feat, - route_feat=None, - weights=None): - if self.use_weighted_fusion: - weights = F.relu(weights) - weights = weights / (weights.sum() + self.eps) - if route_feat is not None: - out_feat = weights[0] * lateral_feat + \ - weights[1] * sampling_feat + \ - weights[2] * route_feat - else: - out_feat = weights[0] * lateral_feat + \ - weights[1] * sampling_feat - else: - if route_feat is not None: - out_feat = lateral_feat + sampling_feat + route_feat - else: - out_feat 
= lateral_feat + sampling_feat - - out_feat = conv_layer(out_feat) - return out_feat - - def forward(self, feats): - # feats: [P3 - P7] - lateral_feats = [] - - # up - up_feature = feats[-1] - for i, feature in enumerate(feats[::-1]): - if i == 0: - lateral_feats.append(feature) - else: - shape = paddle.shape(feature) - up_feature = F.interpolate( - up_feature, size=[shape[2], shape[3]]) - lateral_feature = self._feature_fusion_cell( - self.conv_up[i - 1], - feature, - up_feature, - weights=self.up_weights[i - 1] - if self.use_weighted_fusion else None) - lateral_feats.append(lateral_feature) - up_feature = lateral_feature - - out_feats = [] - # down - down_feature = lateral_feats[-1] - for i, (lateral_feature, - route_feature) in enumerate(zip(lateral_feats[::-1], feats)): - if i == 0: - out_feats.append(lateral_feature) - else: - down_feature = F.max_pool2d(down_feature, 3, 2, 1) - if i == len(feats) - 1: - route_feature = None - weights = self.down_weights[ - i - 1][:2] if self.use_weighted_fusion else None - else: - weights = self.down_weights[ - i - 1] if self.use_weighted_fusion else None - out_feature = self._feature_fusion_cell( - self.conv_down[i - 1], - lateral_feature, - down_feature, - route_feature, - weights=weights) - out_feats.append(out_feature) - down_feature = out_feature - - return out_feats - - -@register -@serializable -class BiFPN(nn.Layer): - """ - Bidirectional Feature Pyramid Network, see https://arxiv.org/abs/1911.09070 - - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config. - out_channel (int): output channel of each level. - num_extra_levels (int): the number of extra stages added to the last level. - default: 2 - fpn_strides (List): The stride of each level. - num_stacks (int): the number of stacks for BiFPN, default: 1. - use_weighted_fusion (bool): use weighted feature fusion in BiFPN, default: True. - norm_type (string|None): the normalization type in BiFPN module. If - norm_type is None, norm will not be used after conv and if - norm_type is string, bn, gn, sync_bn are available. default: bn. - norm_groups (int): if you use gn, set this param. - act (string|None): the activation function of BiFPN. - """ - - def __init__(self, - in_channels=(512, 1024, 2048), - out_channel=256, - num_extra_levels=2, - fpn_strides=[8, 16, 32, 64, 128], - num_stacks=1, - use_weighted_fusion=True, - norm_type='bn', - norm_groups=32, - act='swish'): - super(BiFPN, self).__init__() - assert num_stacks > 0, "The number of stacks of BiFPN is at least 1." - assert norm_type in ['bn', 'sync_bn', 'gn', None] - assert act in ['swish', 'relu', None] - assert num_extra_levels >= 0, \ - "The `num_extra_levels` must be non negative(>=0)." 
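The _feature_fusion_cell above is EfficientDet-style "fast normalized fusion": the learnable per-input weights go through ReLU and are normalized to sum to roughly one (eps guards against division by zero), and the down path carries three weights because it also takes a route feature. A numpy sketch of the two-input case:

    import numpy as np

    def fuse(lateral, sampled, weights, eps=1e-5):
        w = np.maximum(weights, 0.0)      # ReLU keeps the weights non-negative
        w = w / (w.sum() + eps)           # normalize to roughly sum to 1
        return w[0] * lateral + w[1] * sampled

    lateral = np.ones((1, 4, 8, 8))       # output of the lateral 1x1 conv
    sampled = np.zeros((1, 4, 8, 8))      # resized neighbor-level feature
    out = fuse(lateral, sampled, np.array([2.0, 1.0]))
    print(out[0, 0, 0, 0])                # ~0.667, i.e. 2 / (2 + 1)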
- - self.in_channels = in_channels - self.out_channel = out_channel - self.num_extra_levels = num_extra_levels - self.num_stacks = num_stacks - self.use_weighted_fusion = use_weighted_fusion - self.norm_type = norm_type - self.norm_groups = norm_groups - self.act = act - self.num_levels = len(self.in_channels) + self.num_extra_levels - if len(fpn_strides) != self.num_levels: - for i in range(self.num_extra_levels): - fpn_strides += [fpn_strides[-1] * 2] - self.fpn_strides = fpn_strides - - self.lateral_convs = nn.LayerList() - for in_c in in_channels: - self.lateral_convs.append( - ConvNormLayer(in_c, self.out_channel, 1, 1)) - if self.num_extra_levels > 0: - self.extra_convs = nn.LayerList() - for i in range(self.num_extra_levels): - if i == 0: - self.extra_convs.append( - ConvNormLayer(self.in_channels[-1], self.out_channel, 3, - 2)) - else: - self.extra_convs.append(nn.MaxPool2D(3, 2, 1)) - - self.bifpn_cells = nn.LayerList() - for i in range(self.num_stacks): - self.bifpn_cells.append( - BiFPNCell( - self.out_channel, - self.num_levels, - use_weighted_fusion=self.use_weighted_fusion, - norm_type=self.norm_type, - norm_groups=self.norm_groups, - act=self.act)) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'fpn_strides': [i.stride for i in input_shape] - } - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=s) for s in self.fpn_strides - ] - - def forward(self, feats): - assert len(feats) == len(self.in_channels) - fpn_feats = [] - for conv_layer, feature in zip(self.lateral_convs, feats): - fpn_feats.append(conv_layer(feature)) - if self.num_extra_levels > 0: - feat = feats[-1] - for conv_layer in self.extra_convs: - feat = conv_layer(feat) - fpn_feats.append(feat) - - for bifpn_cell in self.bifpn_cells: - fpn_feats = bifpn_cell(fpn_feats) - return fpn_feats diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/blazeface_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/blazeface_fpn.py deleted file mode 100644 index b903c97..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/blazeface_fpn.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn.functional as F -from paddle import ParamAttr -import paddle.nn as nn -from paddle.nn.initializer import KaimingNormal -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['BlazeNeck'] - - -def hard_swish(x): - return x * F.relu6(x + 3) / 6. 
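`hard_swish` above is the standard hard-swish activation, x * ReLU6(x + 3) / 6. Assuming a Paddle 2.x runtime, it should agree with the built-in `F.hardswish`; a quick numerical check:

```python
import paddle
import paddle.nn.functional as F

def hard_swish(x):
    return x * F.relu6(x + 3) / 6.

x = paddle.linspace(-6., 6., 25)
print(paddle.allclose(hard_swish(x), F.hardswish(x)))  # expected: True
```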
- - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - num_groups=1, - act='relu', - conv_lr=0.1, - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=conv_lr, initializer=KaimingNormal()), - bias_attr=False) - - if norm_type in ['sync_bn', 'bn']: - self._batch_norm = nn.BatchNorm2D(out_channels) - - def forward(self, x): - x = self._conv(x) - x = self._batch_norm(x) - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - elif self.act == 'leaky': - x = F.leaky_relu(x) - elif self.act == 'hard_swish': - x = hard_swish(x) - return x - - -class FPN(nn.Layer): - def __init__(self, in_channels, out_channels, name=None): - super(FPN, self).__init__() - self.conv1_fpn = ConvBNLayer( - in_channels, - out_channels // 2, - kernel_size=1, - padding=0, - stride=1, - act='leaky', - name=name + '_output1') - self.conv2_fpn = ConvBNLayer( - in_channels, - out_channels // 2, - kernel_size=1, - padding=0, - stride=1, - act='leaky', - name=name + '_output2') - self.conv3_fpn = ConvBNLayer( - out_channels // 2, - out_channels // 2, - kernel_size=3, - padding=1, - stride=1, - act='leaky', - name=name + '_merge') - - def forward(self, input): - output1 = self.conv1_fpn(input[0]) - output2 = self.conv2_fpn(input[1]) - up2 = F.upsample( - output2, size=paddle.shape(output1)[-2:], mode='nearest') - output1 = paddle.add(output1, up2) - output1 = self.conv3_fpn(output1) - return output1, output2 - - -class SSH(nn.Layer): - def __init__(self, in_channels, out_channels, name=None): - super(SSH, self).__init__() - assert out_channels % 4 == 0 - self.conv0_ssh = ConvBNLayer( - in_channels, - out_channels // 2, - kernel_size=3, - padding=1, - stride=1, - act=None, - name=name + 'ssh_conv3') - self.conv1_ssh = ConvBNLayer( - out_channels // 2, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act='leaky', - name=name + 'ssh_conv5_1') - self.conv2_ssh = ConvBNLayer( - out_channels // 4, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act=None, - name=name + 'ssh_conv5_2') - self.conv3_ssh = ConvBNLayer( - out_channels // 4, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act='leaky', - name=name + 'ssh_conv7_1') - self.conv4_ssh = ConvBNLayer( - out_channels // 4, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act=None, - name=name + 'ssh_conv7_2') - - def forward(self, x): - conv0 = self.conv0_ssh(x) - conv1 = self.conv1_ssh(conv0) - conv2 = self.conv2_ssh(conv1) - conv3 = self.conv3_ssh(conv2) - conv4 = self.conv4_ssh(conv3) - concat = paddle.concat([conv0, conv2, conv4], axis=1) - return F.relu(concat) - - -@register -@serializable -class BlazeNeck(nn.Layer): - def __init__(self, in_channel, neck_type="None", data_format='NCHW'): - super(BlazeNeck, self).__init__() - self.neck_type = neck_type - self.reture_input = False - self._out_channels = in_channel - if self.neck_type == 'None': - self.reture_input = True - if "fpn" in self.neck_type: - self.fpn = FPN(self._out_channels[0], - self._out_channels[1], - name='fpn') - self._out_channels = [ - self._out_channels[0] // 2, self._out_channels[1] // 2 - ] - if "ssh" in self.neck_type: - self.ssh1 = SSH(self._out_channels[0], - self._out_channels[0], - 
name='ssh1') - self.ssh2 = SSH(self._out_channels[1], - self._out_channels[1], - name='ssh2') - self._out_channels = [self._out_channels[0], self._out_channels[1]] - - def forward(self, inputs): - if self.reture_input: - return inputs - output1, output2 = None, None - if "fpn" in self.neck_type: - backout_4, backout_1 = inputs - output1, output2 = self.fpn([backout_4, backout_1]) - if self.neck_type == "only_fpn": - return [output1, output2] - if self.neck_type == "only_ssh": - output1, output2 = inputs - feature1 = self.ssh1(output1) - feature2 = self.ssh2(output2) - return [feature1, feature2] - - @property - def out_shape(self): - return [ - ShapeSpec(channels=c) - for c in [self._out_channels[0], self._out_channels[1]] - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/centernet_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/centernet_fpn.py deleted file mode 100644 index d4dded8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/centernet_fpn.py +++ /dev/null @@ -1,426 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import math -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.nn.initializer import Uniform -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ppdet.modeling.backbones.hardnet import ConvLayer, HarDBlock -from ..shape_spec import ShapeSpec - -__all__ = ['CenterNetDLAFPN', 'CenterNetHarDNetFPN'] - - -# SGE attention -class BasicConv(nn.Layer): - def __init__(self, - in_planes, - out_planes, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - relu=True, - bn=True, - bias_attr=False): - super(BasicConv, self).__init__() - self.out_channels = out_planes - self.conv = nn.Conv2D( - in_planes, - out_planes, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias_attr=bias_attr) - self.bn = nn.BatchNorm2D( - out_planes, - epsilon=1e-5, - momentum=0.01, - weight_attr=False, - bias_attr=False) if bn else None - self.relu = nn.ReLU() if relu else None - - def forward(self, x): - x = self.conv(x) - if self.bn is not None: - x = self.bn(x) - if self.relu is not None: - x = self.relu(x) - return x - - -class ChannelPool(nn.Layer): - def forward(self, x): - return paddle.concat( - (paddle.max(x, 1).unsqueeze(1), paddle.mean(x, 1).unsqueeze(1)), - axis=1) - - -class SpatialGate(nn.Layer): - def __init__(self): - super(SpatialGate, self).__init__() - kernel_size = 7 - self.compress = ChannelPool() - self.spatial = BasicConv( - 2, - 1, - kernel_size, - stride=1, - padding=(kernel_size - 1) // 2, - relu=False) - - def forward(self, x): - x_compress = self.compress(x) - x_out = self.spatial(x_compress) - scale = F.sigmoid(x_out) # broadcasting - return x * scale - - -def fill_up_weights(up): - weight = up.weight.numpy() - f = math.ceil(weight.shape[2] / 2) - c = (2 * f - 1 - f % 2) / (2. 
* f) - for i in range(weight.shape[2]): - for j in range(weight.shape[3]): - weight[0, 0, i, j] = \ - (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) - for c in range(1, weight.shape[0]): - weight[c, 0, :, :] = weight[0, 0, :, :] - up.weight.set_value(weight) - - -class IDAUp(nn.Layer): - def __init__(self, ch_ins, ch_out, up_strides, dcn_v2=True): - super(IDAUp, self).__init__() - for i in range(1, len(ch_ins)): - ch_in = ch_ins[i] - up_s = int(up_strides[i]) - fan_in = ch_in * 3 * 3 - stdv = 1. / math.sqrt(fan_in) - proj = nn.Sequential( - ConvNormLayer( - ch_in, - ch_out, - filter_size=3, - stride=1, - use_dcn=dcn_v2, - bias_on=dcn_v2, - norm_decay=None, - dcn_lr_scale=1., - dcn_regularizer=None, - initializer=Uniform(-stdv, stdv)), - nn.ReLU()) - node = nn.Sequential( - ConvNormLayer( - ch_out, - ch_out, - filter_size=3, - stride=1, - use_dcn=dcn_v2, - bias_on=dcn_v2, - norm_decay=None, - dcn_lr_scale=1., - dcn_regularizer=None, - initializer=Uniform(-stdv, stdv)), - nn.ReLU()) - - kernel_size = up_s * 2 - fan_in = ch_out * kernel_size * kernel_size - stdv = 1. / math.sqrt(fan_in) - up = nn.Conv2DTranspose( - ch_out, - ch_out, - kernel_size=up_s * 2, - stride=up_s, - padding=up_s // 2, - groups=ch_out, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=False) - fill_up_weights(up) - setattr(self, 'proj_' + str(i), proj) - setattr(self, 'up_' + str(i), up) - setattr(self, 'node_' + str(i), node) - - def forward(self, inputs, start_level, end_level): - for i in range(start_level + 1, end_level): - upsample = getattr(self, 'up_' + str(i - start_level)) - project = getattr(self, 'proj_' + str(i - start_level)) - inputs[i] = project(inputs[i]) - inputs[i] = upsample(inputs[i]) - node = getattr(self, 'node_' + str(i - start_level)) - inputs[i] = node(paddle.add(inputs[i], inputs[i - 1])) - return inputs - - -class DLAUp(nn.Layer): - def __init__(self, start_level, channels, scales, ch_in=None, dcn_v2=True): - super(DLAUp, self).__init__() - self.start_level = start_level - if ch_in is None: - ch_in = channels - self.channels = channels - channels = list(channels) - scales = np.array(scales, dtype=int) - for i in range(len(channels) - 1): - j = -i - 2 - setattr( - self, - 'ida_{}'.format(i), - IDAUp( - ch_in[j:], - channels[j], - scales[j:] // scales[j], - dcn_v2=dcn_v2)) - scales[j + 1:] = scales[j] - ch_in[j + 1:] = [channels[j] for _ in channels[j + 1:]] - - def forward(self, inputs): - out = [inputs[-1]] # start with 32 - for i in range(len(inputs) - self.start_level - 1): - ida = getattr(self, 'ida_{}'.format(i)) - outputs = ida(inputs, len(inputs) - i - 2, len(inputs)) - out.insert(0, outputs[-1]) - return out - - -@register -@serializable -class CenterNetDLAFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone. - [16, 32, 64, 128, 256, 512] by default, means the channels of DLA-34 - down_ratio (int): the down ratio from images to heatmap, 4 by default - last_level (int): the last level of input feature fed into the upsamplng block - out_channel (int): the channel of the output feature, 0 by default means - the channel of the input feature whose down ratio is `down_ratio` - first_level (None): the first level of input feature fed into the upsamplng block. 
- if None, the first level stands for logs(down_ratio) - dcn_v2 (bool): whether use the DCNv2, True by default - with_sge (bool): whether use SGE attention, False by default - """ - - def __init__(self, - in_channels, - down_ratio=4, - last_level=5, - out_channel=0, - first_level=None, - dcn_v2=True, - with_sge=False): - super(CenterNetDLAFPN, self).__init__() - self.first_level = int(np.log2( - down_ratio)) if first_level is None else first_level - assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( - self.first_level) - self.down_ratio = down_ratio - self.last_level = last_level - scales = [2**i for i in range(len(in_channels[self.first_level:]))] - self.dla_up = DLAUp( - self.first_level, - in_channels[self.first_level:], - scales, - dcn_v2=dcn_v2) - self.out_channel = out_channel - if out_channel == 0: - self.out_channel = in_channels[self.first_level] - self.ida_up = IDAUp( - in_channels[self.first_level:self.last_level], - self.out_channel, - [2**i for i in range(self.last_level - self.first_level)], - dcn_v2=dcn_v2) - - self.with_sge = with_sge - if self.with_sge: - self.sge_attention = SpatialGate() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape]} - - def forward(self, body_feats): - - inputs = [body_feats[i] for i in range(len(body_feats))] - - dla_up_feats = self.dla_up(inputs) - - ida_up_feats = [] - for i in range(self.last_level - self.first_level): - ida_up_feats.append(dla_up_feats[i].clone()) - - self.ida_up(ida_up_feats, 0, len(ida_up_feats)) - - feat = ida_up_feats[-1] - if self.with_sge: - feat = self.sge_attention(feat) - if self.down_ratio != 4: - feat = F.interpolate( - feat, - scale_factor=self.down_ratio // 4, - mode="bilinear", - align_corners=True) - return feat - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] - - -class TransitionUp(nn.Layer): - def __init__(self, in_channels, out_channels): - super().__init__() - - def forward(self, x, skip): - w, h = skip.shape[2], skip.shape[3] - out = F.interpolate(x, size=(w, h), mode="bilinear", align_corners=True) - out = paddle.concat([out, skip], 1) - return out - - -@register -@serializable -class CenterNetHarDNetFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone. - [96, 214, 458, 784] by default, means the channels of HarDNet85 - num_layers (int): HarDNet laters, 85 by default - down_ratio (int): the down ratio from images to heatmap, 4 by default - first_level (int|None): the first level of input feature fed into the upsamplng block. 
- if None, the first level stands for logs(down_ratio) - 1 - - last_level (int): the last level of input feature fed into the upsamplng block - out_channel (int): the channel of the output feature, 0 by default means - the channel of the input feature whose down ratio is `down_ratio` - """ - - def __init__(self, - in_channels, - num_layers=85, - down_ratio=4, - first_level=None, - last_level=4, - out_channel=0): - super(CenterNetHarDNetFPN, self).__init__() - self.first_level = int(np.log2( - down_ratio)) - 1 if first_level is None else first_level - assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( - self.first_level) - self.down_ratio = down_ratio - self.last_level = last_level - self.last_pool = nn.AvgPool2D(kernel_size=2, stride=2) - - assert num_layers in [68, 85], "HarDNet-{} not support.".format( - num_layers) - if num_layers == 85: - self.last_proj = ConvLayer(784, 256, kernel_size=1) - self.last_blk = HarDBlock(768, 80, 1.7, 8) - self.skip_nodes = [1, 3, 8, 13] - self.SC = [32, 32, 0] - gr = [64, 48, 28] - layers = [8, 8, 4] - ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]] - channels = [96, 214, 458, 784] - self.skip_lv = 3 - - elif num_layers == 68: - self.last_proj = ConvLayer(654, 192, kernel_size=1) - self.last_blk = HarDBlock(576, 72, 1.7, 8) - self.skip_nodes = [1, 3, 8, 11] - self.SC = [32, 32, 0] - gr = [48, 32, 20] - layers = [8, 8, 4] - ch_list2 = [224 + self.SC[0], 96 + self.SC[1], 64 + self.SC[2]] - channels = [64, 124, 328, 654] - self.skip_lv = 2 - - self.transUpBlocks = nn.LayerList([]) - self.denseBlocksUp = nn.LayerList([]) - self.conv1x1_up = nn.LayerList([]) - self.avg9x9 = nn.AvgPool2D(kernel_size=(9, 9), stride=1, padding=(4, 4)) - prev_ch = self.last_blk.get_out_ch() - - for i in range(3): - skip_ch = channels[3 - i] - self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch)) - if i < self.skip_lv: - cur_ch = prev_ch + skip_ch - else: - cur_ch = prev_ch - self.conv1x1_up.append( - ConvLayer( - cur_ch, ch_list2[i], kernel_size=1)) - cur_ch = ch_list2[i] - cur_ch -= self.SC[i] - cur_ch *= 3 - - blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i]) - self.denseBlocksUp.append(blk) - prev_ch = blk.get_out_ch() - - prev_ch += self.SC[0] + self.SC[1] + self.SC[2] - self.out_channel = prev_ch - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape]} - - def forward(self, body_feats): - x = body_feats[-1] - x_sc = [] - x = self.last_proj(x) - x = self.last_pool(x) - x2 = self.avg9x9(x) - x3 = x / (x.sum((2, 3), keepdim=True) + 0.1) - x = paddle.concat([x, x2, x3], 1) - x = self.last_blk(x) - - for i in range(3): - skip_x = body_feats[3 - i] - x_up = self.transUpBlocks[i](x, skip_x) - x_ch = self.conv1x1_up[i](x_up) - if self.SC[i] > 0: - end = x_ch.shape[1] - new_st = end - self.SC[i] - x_sc.append(x_ch[:, new_st:, :, :]) - x_ch = x_ch[:, :new_st, :, :] - x2 = self.avg9x9(x_ch) - x3 = x_ch / (x_ch.sum((2, 3), keepdim=True) + 0.1) - x_new = paddle.concat([x_ch, x2, x3], 1) - x = self.denseBlocksUp[i](x_new) - - scs = [x] - for i in range(3): - if self.SC[i] > 0: - scs.insert( - 0, - F.interpolate( - x_sc[i], - size=(x.shape[2], x.shape[3]), - mode="bilinear", - align_corners=True)) - neck_feat = paddle.concat(scs, 1) - return neck_feat - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/channel_mapper.py 
b/pdfdet/models/Paddle/ppdet/modeling/necks/channel_mapper.py deleted file mode 100644 index 6eff3f8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/channel_mapper.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is base on mmdet: git@github.com:open-mmlab/mmdetection.git -""" -import paddle.nn as nn - -from ppdet.core.workspace import register, serializable -from ..backbones.hrnet import ConvNormLayer -from ..shape_spec import ShapeSpec -from ..initializer import xavier_uniform_, constant_ - -__all__ = ['ChannelMapper'] - - -@register -@serializable -class ChannelMapper(nn.Layer): - """Channel Mapper to reduce/increase channels of backbone features. - - This is used to reduce/increase channels of backbone features. - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale). - kernel_size (int, optional): kernel_size for reducing channels (used - at each scale). Default: 3. - conv_cfg (dict, optional): Config dict for convolution layer. - Default: None. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: None. - act_cfg (dict, optional): Config dict for activation layer in - ConvModule. Default: dict(type='ReLU'). - num_outs (int, optional): Number of output feature maps. There - would be extra_convs when num_outs larger than the length - of in_channels. - init_cfg (dict or list[dict], optional): Initialization config dict. - - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - norm_type="gn", - norm_groups=32, - act='relu', - num_outs=None, - init_cfg=dict( - type='Xavier', layer='Conv2d', distribution='uniform')): - super(ChannelMapper, self).__init__() - assert isinstance(in_channels, list) - self.extra_convs = None - if num_outs is None: - num_outs = len(in_channels) - self.convs = nn.LayerList() - for in_channel in in_channels: - self.convs.append( - ConvNormLayer( - ch_in=in_channel, - ch_out=out_channels, - filter_size=kernel_size, - norm_type='gn', - norm_groups=32, - act=act)) - - if num_outs > len(in_channels): - self.extra_convs = nn.LayerList() - for i in range(len(in_channels), num_outs): - if i == len(in_channels): - in_channel = in_channels[-1] - else: - in_channel = out_channels - self.extra_convs.append( - ConvNormLayer( - ch_in=in_channel, - ch_out=out_channels, - filter_size=3, - stride=2, - norm_type='gn', - norm_groups=32, - act=act)) - self.init_weights() - - def forward(self, inputs): - """Forward function.""" - assert len(inputs) == len(self.convs) - outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] - if self.extra_convs: - for i in range(len(self.extra_convs)): - if i == 0: - outs.append(self.extra_convs[0](inputs[-1])) - else: - outs.append(self.extra_convs[i](outs[-1])) - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. 
/ s) - for s in self.spatial_scales - ] - - def init_weights(self): - """Initialize the transformer weights.""" - for p in self.parameters(): - if p.rank() > 1: - xavier_uniform_(p) - if hasattr(p, 'bias') and p.bias is not None: - constant_(p.bais) diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/clrnet_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/clrnet_fpn.py deleted file mode 100644 index 936c7e7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/clrnet_fpn.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import XavierUniform -from ppdet.modeling.initializer import kaiming_normal_, constant_ -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ppdet.modeling.shape_spec import ShapeSpec - -__all__ = ['CLRFPN'] - - -@register -@serializable -class CLRFPN(nn.Layer): - """ - Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config - out_channel (int): output channel of each level - spatial_scales (list[float]): the spatial scales between input feature - maps and original input image which can be derived from the output - shape of backbone by from_config - has_extra_convs (bool): whether to add extra conv to the last level. - default False - extra_stage (int): the number of extra stages added to the last level. - default 1 - use_c5 (bool): Whether to use c5 as the input of extra stage, - otherwise p5 is used. default True - norm_type (string|None): The normalization type in FPN module. If - norm_type is None, norm will not be used after conv and if - norm_type is string, bn, gn, sync_bn are available. default None - norm_decay (float): weight decay for normalization layer weights. - default 0. - freeze_norm (bool): whether to freeze normalization layer. - default False - relu_before_extra_convs (bool): whether to add relu before extra convs. - default False - - """ - - def __init__(self, - in_channels, - out_channel, - spatial_scales=[0.25, 0.125, 0.0625, 0.03125], - has_extra_convs=False, - extra_stage=1, - use_c5=True, - norm_type=None, - norm_decay=0., - freeze_norm=False, - relu_before_extra_convs=True): - super(CLRFPN, self).__init__() - self.out_channel = out_channel - for s in range(extra_stage): - spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
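The deleted `ChannelMapper` carries two latent bugs visible above: `init_weights` spells `p.bais` (and since plain parameters have no `.bias` attribute, the `hasattr` guard means the misspelled call is simply never reached), and `out_shape` reads `self.out_channel` and `self.spatial_scales`, neither of which `__init__` ever sets, so accessing that property would raise `AttributeError`. A corrected initialization sketch in the same Xavier style, iterating over sublayers rather than raw parameters (an assumption about intent, not the original code):

```python
import paddle.nn as nn
from ppdet.modeling.initializer import xavier_uniform_, constant_

def init_weights(self):
    # Xavier-uniform for conv kernels, zeros for conv biases
    for m in self.sublayers():
        if isinstance(m, nn.Conv2D):
            xavier_uniform_(m.weight)
            if m.bias is not None:
                constant_(m.bias, value=0.)
```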
- self.spatial_scales = spatial_scales - self.has_extra_convs = has_extra_convs - self.extra_stage = extra_stage - self.use_c5 = use_c5 - self.relu_before_extra_convs = relu_before_extra_convs - self.norm_type = norm_type - self.norm_decay = norm_decay - self.freeze_norm = freeze_norm - self.in_channels = in_channels - self.lateral_convs = [] - self.fpn_convs = [] - fan = out_channel * 3 * 3 - - # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone - # 0 <= st_stage < ed_stage <= 3 - st_stage = 4 - len(in_channels) - ed_stage = st_stage + len(in_channels) - 1 - - for i in range(st_stage, ed_stage + 1): - # if i == 3: - # lateral_name = 'fpn_inner_res5_sum' - # else: - # lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) - lateral_name = "lateral_convs.{}.conv".format(i - 1) - in_c = in_channels[i - st_stage] - if self.norm_type is not None: - lateral = self.add_sublayer( - lateral_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=1, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=in_c))) - else: - lateral = self.add_sublayer( - lateral_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=in_c)))) - self.lateral_convs.append(lateral) - - fpn_name = "fpn_convs.{}.conv".format(i - 1) - if self.norm_type is not None: - fpn_conv = self.add_sublayer( - fpn_name, - ConvNormLayer( - ch_in=out_channel, - ch_out=out_channel, - filter_size=3, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - fpn_conv = self.add_sublayer( - fpn_name, - nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(fpn_conv) - - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - if self.has_extra_convs: - for i in range(self.extra_stage): - lvl = ed_stage + 1 + i - if i == 0 and self.use_c5: - in_c = in_channels[-1] - else: - in_c = out_channel - extra_fpn_name = 'fpn_{}'.format(lvl + 2) - if self.norm_type is not None: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=3, - stride=2, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=3, - stride=2, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(extra_fpn_conv) - self.init_weights() - - def init_weights(self): - for m in self.lateral_convs: - if isinstance(m, (nn.Conv1D, nn.Conv2D)): - kaiming_normal_( - m.weight, a=0, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - constant_(m.bias, value=0.) - elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): - constant_(m.weight, value=1) - constant_(m.bias, value=0) - for m in self.fpn_convs: - if isinstance(m, (nn.Conv1D, nn.Conv2D)): - kaiming_normal_( - m.weight, a=0, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - constant_(m.bias, value=0.) 
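`CLRFPN` can grow the pyramid beyond the backbone levels in two ways that both appear in this file: learned stride-2 convs when `has_extra_convs=True`, or a parameter-free max-pool in `forward` otherwise. Side by side, with illustrative shapes:

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

p5 = paddle.rand([1, 256, 16, 16])

# parameter-free extra level (Faster/Mask R-CNN convention)
p6_pool = F.max_pool2d(p5, 1, stride=2)                    # [1, 256, 8, 8]

# learned extra level (RetinaNet/FCOS convention)
p6_conv = nn.Conv2D(256, 256, 3, stride=2, padding=1)(p5)  # [1, 256, 8, 8]
```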
- elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): - constant_(m.weight, value=1) - constant_(m.bias, value=0) - - @classmethod - def from_config(cls, cfg, input_shape): - return {} - - def forward(self, body_feats): - laterals = [] - if len(body_feats) > len(self.in_channels): - for _ in range(len(body_feats) - len(self.in_channels)): - del body_feats[0] - num_levels = len(body_feats) - # print("body_feats",num_levels) - for i in range(num_levels): - laterals.append(self.lateral_convs[i](body_feats[i])) - - for i in range(1, num_levels): - lvl = num_levels - i - upsample = F.interpolate( - laterals[lvl], - scale_factor=2., - mode='nearest', ) - laterals[lvl - 1] += upsample - - fpn_output = [] - for lvl in range(num_levels): - fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) - - if self.extra_stage > 0: - # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) - if not self.has_extra_convs: - assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' - fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - else: - if self.use_c5: - extra_source = body_feats[-1] - else: - extra_source = fpn_output[-1] - fpn_output.append(self.fpn_convs[num_levels](extra_source)) - - for i in range(1, self.extra_stage): - if self.relu_before_extra_convs: - fpn_output.append(self.fpn_convs[num_levels + i](F.relu( - fpn_output[-1]))) - else: - fpn_output.append(self.fpn_convs[num_levels + i]( - fpn_output[-1])) - return fpn_output - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. / s) - for s in self.spatial_scales - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/csp_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/csp_pan.py deleted file mode 100644 index 5c3539a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/csp_pan.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
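The top-down pass in `CLRFPN.forward` above is the classic FPN recurrence: upsample the coarser lateral by 2x, add it to the next finer one, and smooth each sum with a 3x3 conv. Reduced to its core, assuming all laterals already share one channel count and with the smoothing convs elided:

```python
import paddle
import paddle.nn.functional as F

def top_down_merge(laterals):
    # laterals ordered fine -> coarse, same channel count everywhere
    for lvl in range(len(laterals) - 1, 0, -1):
        up = F.interpolate(laterals[lvl], scale_factor=2., mode='nearest')
        laterals[lvl - 1] = laterals[lvl - 1] + up
    return laterals

feats = [paddle.rand([1, 64, 32 // 2**i, 32 // 2**i]) for i in range(3)]
merged = top_down_merge(feats)  # each level now mixes in every coarser level
```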
- -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/yolox_pafpn.py - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['CSPPAN'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channel=96, - out_channel=96, - kernel_size=3, - stride=1, - groups=1, - act='leaky_relu'): - super(ConvBNLayer, self).__init__() - initializer = nn.initializer.KaimingUniform() - self.conv = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=kernel_size, - groups=groups, - padding=(kernel_size - 1) // 2, - stride=stride, - weight_attr=ParamAttr(initializer=initializer), - bias_attr=False) - self.bn = nn.BatchNorm2D(out_channel) - if act == "hard_swish": - act = 'hardswish' - self.act = act - - def forward(self, x): - x = self.bn(self.conv(x)) - if self.act: - x = getattr(F, self.act)(x) - return x - - -class DPModule(nn.Layer): - """ - Depth-wise and point-wise module. - Args: - in_channel (int): The input channels of this Module. - out_channel (int): The output channels of this Module. - kernel_size (int): The conv2d kernel size of this Module. - stride (int): The conv2d's stride of this Module. - act (str): The activation function of this Module, - Now support `leaky_relu` and `hard_swish`. - """ - - def __init__(self, - in_channel=96, - out_channel=96, - kernel_size=3, - stride=1, - act='leaky_relu', - use_act_in_out=True): - super(DPModule, self).__init__() - initializer = nn.initializer.KaimingUniform() - self.use_act_in_out = use_act_in_out - self.dwconv = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=kernel_size, - groups=out_channel, - padding=(kernel_size - 1) // 2, - stride=stride, - weight_attr=ParamAttr(initializer=initializer), - bias_attr=False) - self.bn1 = nn.BatchNorm2D(out_channel) - self.pwconv = nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=1, - groups=1, - padding=0, - weight_attr=ParamAttr(initializer=initializer), - bias_attr=False) - self.bn2 = nn.BatchNorm2D(out_channel) - if act == "hard_swish": - act = 'hardswish' - self.act = act - - def forward(self, x): - x = self.bn1(self.dwconv(x)) - if self.act: - x = getattr(F, self.act)(x) - x = self.bn2(self.pwconv(x)) - if self.use_act_in_out and self.act: - x = getattr(F, self.act)(x) - return x - - -class DarknetBottleneck(nn.Layer): - """The basic bottleneck block used in Darknet. - - Each Block consists of two ConvModules and the input is added to the - final output. Each ConvModule is composed of Conv, BN, and act. - The first convLayer has filter size of 1x1 and the second one has the - filter size of 3x3. - - Args: - in_channels (int): The input channels of this Module. - out_channels (int): The output channels of this Module. - expansion (int): The kernel size of the convolution. Default: 0.5 - add_identity (bool): Whether to add identity to the out. - Default: True - use_depthwise (bool): Whether to use depthwise separable convolution. 
- Default: False - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - expansion=0.5, - add_identity=True, - use_depthwise=False, - act="leaky_relu"): - super(DarknetBottleneck, self).__init__() - hidden_channels = int(out_channels * expansion) - conv_func = DPModule if use_depthwise else ConvBNLayer - self.conv1 = ConvBNLayer( - in_channel=in_channels, - out_channel=hidden_channels, - kernel_size=1, - act=act) - self.conv2 = conv_func( - in_channel=hidden_channels, - out_channel=out_channels, - kernel_size=kernel_size, - stride=1, - act=act) - self.add_identity = \ - add_identity and in_channels == out_channels - - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.conv2(out) - - if self.add_identity: - return out + identity - else: - return out - - -class CSPLayer(nn.Layer): - """Cross Stage Partial Layer. - - Args: - in_channels (int): The input channels of the CSP layer. - out_channels (int): The output channels of the CSP layer. - expand_ratio (float): Ratio to adjust the number of channels of the - hidden layer. Default: 0.5 - num_blocks (int): Number of blocks. Default: 1 - add_identity (bool): Whether to add identity in blocks. - Default: True - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. Default: False - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - expand_ratio=0.5, - num_blocks=1, - add_identity=True, - use_depthwise=False, - act="leaky_relu"): - super().__init__() - mid_channels = int(out_channels * expand_ratio) - self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) - self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) - self.final_conv = ConvBNLayer( - 2 * mid_channels, out_channels, 1, act=act) - - self.blocks = nn.Sequential(* [ - DarknetBottleneck( - mid_channels, - mid_channels, - kernel_size, - 1.0, - add_identity, - use_depthwise, - act=act) for _ in range(num_blocks) - ]) - - def forward(self, x): - x_short = self.short_conv(x) - - x_main = self.main_conv(x) - x_main = self.blocks(x_main) - - x_final = paddle.concat((x_main, x_short), axis=1) - return self.final_conv(x_final) - - -class Channel_T(nn.Layer): - def __init__(self, - in_channels=[116, 232, 464], - out_channels=96, - act="leaky_relu"): - super(Channel_T, self).__init__() - self.convs = nn.LayerList() - for i in range(len(in_channels)): - self.convs.append( - ConvBNLayer( - in_channels[i], out_channels, 1, act=act)) - - def forward(self, x): - outs = [self.convs[i](x[i]) for i in range(len(x))] - return outs - - -@register -@serializable -class CSPPAN(nn.Layer): - """Path Aggregation Network with CSP module. - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - kernel_size (int): The conv2d kernel size of this Module. - num_features (int): Number of output features of CSPPAN module. - num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. 
Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=5, - num_features=3, - num_csp_blocks=1, - use_depthwise=True, - act='hard_swish', - spatial_scales=[0.125, 0.0625, 0.03125]): - super(CSPPAN, self).__init__() - self.conv_t = Channel_T(in_channels, out_channels, act=act) - in_channels = [out_channels] * len(spatial_scales) - self.in_channels = in_channels - self.out_channels = out_channels - self.spatial_scales = spatial_scales - self.num_features = num_features - conv_func = DPModule if use_depthwise else ConvBNLayer - - if self.num_features == 4: - self.first_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.second_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.spatial_scales.append(self.spatial_scales[-1] / 2) - - # build top-down blocks - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.top_down_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.top_down_blocks.append( - CSPLayer( - in_channels[idx - 1] * 2, - in_channels[idx - 1], - kernel_size=kernel_size, - num_blocks=num_csp_blocks, - add_identity=False, - use_depthwise=use_depthwise, - act=act)) - - # build bottom-up blocks - self.downsamples = nn.LayerList() - self.bottom_up_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsamples.append( - conv_func( - in_channels[idx], - in_channels[idx], - kernel_size=kernel_size, - stride=2, - act=act)) - self.bottom_up_blocks.append( - CSPLayer( - in_channels[idx] * 2, - in_channels[idx + 1], - kernel_size=kernel_size, - num_blocks=num_csp_blocks, - add_identity=False, - use_depthwise=use_depthwise, - act=act)) - - def forward(self, inputs): - """ - Args: - inputs (tuple[Tensor]): input features. - - Returns: - tuple[Tensor]: CSPPAN features. - """ - assert len(inputs) == len(self.in_channels) - inputs = self.conv_t(inputs) - - # top-down path - inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] - - upsample_feat = self.upsample(feat_heigh) - - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat([upsample_feat, feat_low], 1)) - inner_outs.insert(0, inner_out) - - # bottom-up path - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx](paddle.concat( - [downsample_feat, feat_height], 1)) - outs.append(out) - - top_features = None - if self.num_features == 4: - top_features = self.first_top_conv(inputs[-1]) - top_features = top_features + self.second_top_conv(outs[-1]) - outs.append(top_features) - - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channels, stride=1. / s) - for s in self.spatial_scales - ] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/custom_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/custom_pan.py deleted file mode 100644 index cf7ec84..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/custom_pan.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import copy -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import DropBlock, MultiHeadAttention -from ppdet.modeling.ops import get_act_fn -from ..backbones.cspresnet import ConvBNLayer, BasicBlock -from ..shape_spec import ShapeSpec -from ..initializer import linear_init_ - -__all__ = ['CustomCSPPAN'] - - -def _get_clones(module, N): - return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) - - -class SPP(nn.Layer): - def __init__(self, - ch_in, - ch_out, - k, - pool_size, - act='swish', - data_format='NCHW'): - super(SPP, self).__init__() - self.pool = [] - self.data_format = data_format - for i, size in enumerate(pool_size): - pool = self.add_sublayer( - 'pool{}'.format(i), - nn.MaxPool2D( - kernel_size=size, - stride=1, - padding=size // 2, - data_format=data_format, - ceil_mode=False)) - self.pool.append(pool) - self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) - - def forward(self, x): - outs = [x] - for pool in self.pool: - outs.append(pool(x)) - if self.data_format == 'NCHW': - y = paddle.concat(outs, axis=1) - else: - y = paddle.concat(outs, axis=-1) - - y = self.conv(y) - return y - - -class CSPStage(nn.Layer): - def __init__(self, - block_fn, - ch_in, - ch_out, - n, - act='swish', - spp=False, - use_alpha=False): - super(CSPStage, self).__init__() - - ch_mid = int(ch_out // 2) - self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) - self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) - self.convs = nn.Sequential() - next_ch_in = ch_mid - for i in range(n): - self.convs.add_sublayer( - str(i), - eval(block_fn)(next_ch_in, - ch_mid, - act=act, - shortcut=False, - use_alpha=use_alpha)) - if i == (n - 1) // 2 and spp: - self.convs.add_sublayer( - 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) - next_ch_in = ch_mid - self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act) - - def forward(self, x): - y1 = self.conv1(x) - y2 = self.conv2(x) - y2 = self.convs(y2) - y = paddle.concat([y1, y2], axis=1) - y = self.conv3(y) - return y - - -class TransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerEncoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, 
mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -class TransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward(self, src, src_mask=None, pos_embed=None): - output = src - for layer in self.layers: - output = layer(output, src_mask=src_mask, pos_embed=pos_embed) - - if self.norm is not None: - output = self.norm(output) - - return output - - -@register -@serializable -class CustomCSPPAN(nn.Layer): - __shared__ = [ - 'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt', - 'eval_size' - ] - - def __init__(self, - in_channels=[256, 512, 1024], - out_channels=[1024, 512, 256], - norm_type='bn', - act='leaky', - stage_fn='CSPStage', - block_fn='BasicBlock', - stage_num=1, - block_num=3, - drop_block=False, - block_size=3, - keep_prob=0.9, - spp=False, - data_format='NCHW', - width_mult=1.0, - depth_mult=1.0, - use_alpha=False, - trt=False, - dim_feedforward=2048, - dropout=0.1, - activation='gelu', - nhead=4, - num_layers=4, - attn_dropout=None, - act_dropout=None, - normalize_before=False, - use_trans=False, - eval_size=None): - - super(CustomCSPPAN, self).__init__() - out_channels = [max(round(c * width_mult), 1) for c in out_channels] - block_num = max(round(block_num * depth_mult), 1) - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.num_blocks = len(in_channels) - self.data_format = data_format - self._out_channels = out_channels - - self.hidden_dim = in_channels[-1] - in_channels = in_channels[::-1] - - self.use_trans = use_trans - self.eval_size = eval_size - if use_trans: - if eval_size is not None: - self.pos_embed = self.build_2d_sincos_position_embedding( - eval_size[1] // 32, - eval_size[0] // 32, - embed_dim=self.hidden_dim) - else: - self.pos_embed = None - - encoder_layer = TransformerEncoderLayer( - self.hidden_dim, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - encoder_norm = nn.LayerNorm( - self.hidden_dim) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_layers, - encoder_norm) - - fpn_stages = [] - fpn_routes = [] - for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)): - if i > 0: - ch_in += ch_pre // 2 - - stage = nn.Sequential() - for j in range(stage_num): - stage.add_sublayer( - str(j), - eval(stage_fn)(block_fn, - ch_in if j == 0 else ch_out, - ch_out, - block_num, - act=act, - spp=(spp and i == 0), - use_alpha=use_alpha)) - - if 
drop_block: - stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) - - fpn_stages.append(stage) - - if i < self.num_blocks - 1: - fpn_routes.append( - ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out // 2, - filter_size=1, - stride=1, - padding=0, - act=act)) - - ch_pre = ch_out - - self.fpn_stages = nn.LayerList(fpn_stages) - self.fpn_routes = nn.LayerList(fpn_routes) - - pan_stages = [] - pan_routes = [] - for i in reversed(range(self.num_blocks - 1)): - pan_routes.append( - ConvBNLayer( - ch_in=out_channels[i + 1], - ch_out=out_channels[i + 1], - filter_size=3, - stride=2, - padding=1, - act=act)) - - ch_in = out_channels[i] + out_channels[i + 1] - ch_out = out_channels[i] - stage = nn.Sequential() - for j in range(stage_num): - stage.add_sublayer( - str(j), - eval(stage_fn)(block_fn, - ch_in if j == 0 else ch_out, - ch_out, - block_num, - act=act, - spp=False, - use_alpha=use_alpha)) - if drop_block: - stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) - - pan_stages.append(stage) - - self.pan_stages = nn.LayerList(pan_stages[::-1]) - self.pan_routes = nn.LayerList(pan_routes[::-1]) - - def build_2d_sincos_position_embedding( - self, - w, - h, - embed_dim=1024, - temperature=10000., ): - grid_w = paddle.arange(int(w), dtype=paddle.float32) - grid_h = paddle.arange(int(h), dtype=paddle.float32) - grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) - assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = 1. / (temperature**omega) - - out_w = grid_w.flatten()[..., None] @omega[None] - out_h = grid_h.flatten()[..., None] @omega[None] - - pos_emb = paddle.concat( - [ - paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), - paddle.cos(out_h) - ], - axis=1)[None, :, :] - - return pos_emb - - def forward(self, blocks, for_mot=False): - if self.use_trans: - last_feat = blocks[-1] - n, c, h, w = last_feat.shape - - # flatten [B, C, H, W] to [B, HxW, C] - src_flatten = last_feat.flatten(2).transpose([0, 2, 1]) - if self.eval_size is not None and not self.training: - pos_embed = self.pos_embed - else: - pos_embed = self.build_2d_sincos_position_embedding( - w=w, h=h, embed_dim=self.hidden_dim) - - memory = self.encoder(src_flatten, pos_embed=pos_embed) - last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w]) - blocks[-1] = last_feat_encode - - blocks = blocks[::-1] - fpn_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - block = paddle.concat([route, block], axis=1) - route = self.fpn_stages[i](block) - fpn_feats.append(route) - - if i < self.num_blocks - 1: - route = self.fpn_routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - pan_feats = [fpn_feats[-1], ] - route = fpn_feats[-1] - for i in reversed(range(self.num_blocks - 1)): - block = fpn_feats[i] - route = self.pan_routes[i](route) - block = paddle.concat([route, block], axis=1) - route = self.pan_stages[i](block) - pan_feats.append(route) - - return pan_feats[::-1] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/dilated_encoder.py b/pdfdet/models/Paddle/ppdet/modeling/necks/dilated_encoder.py deleted file mode 100644 index 0bbc7fd..0000000 --- 
a/pdfdet/models/Paddle/ppdet/modeling/necks/dilated_encoder.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingUniform, Constant, Normal -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['DilatedEncoder'] - - -class Bottleneck(nn.Layer): - def __init__(self, in_channels, mid_channels, dilation): - super(Bottleneck, self).__init__() - self.conv1 = nn.Sequential(* [ - nn.Conv2D( - in_channels, - mid_channels, - 1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(0.0))), - nn.BatchNorm2D( - mid_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))), - nn.ReLU(), - ]) - self.conv2 = nn.Sequential(* [ - nn.Conv2D( - mid_channels, - mid_channels, - 3, - padding=dilation, - dilation=dilation, - weight_attr=ParamAttr(initializer=Normal( - mean=0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(0.0))), - nn.BatchNorm2D( - mid_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))), - nn.ReLU(), - ]) - self.conv3 = nn.Sequential(* [ - nn.Conv2D( - mid_channels, - in_channels, - 1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(0.0))), - nn.BatchNorm2D( - in_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))), - nn.ReLU(), - ]) - - def forward(self, x): - identity = x - y = self.conv3(self.conv2(self.conv1(x))) - return y + identity - - -@register -class DilatedEncoder(nn.Layer): - """ - DilatedEncoder used in YOLOF - """ - - def __init__(self, - in_channels=[2048], - out_channels=[512], - block_mid_channels=128, - num_residual_blocks=4, - block_dilations=[2, 4, 6, 8]): - super(DilatedEncoder, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - assert len(self.in_channels) == 1, "YOLOF only has one level feature." - assert len(self.out_channels) == 1, "YOLOF only has one level feature." 
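`build_2d_sincos_position_embedding` in `CustomCSPPAN` above is the usual 2-D extension of the transformer sinusoidal embedding: a quarter of the channels each go to sin and cos of the x and y coordinates at geometrically spaced frequencies. A standalone re-derivation under the same conventions (the grid size and `embed_dim` here are illustrative):

```python
import paddle

def sincos_pos_embed_2d(w, h, embed_dim=256, temperature=10000.):
    assert embed_dim % 4 == 0, 'embed_dim must be divisible by 4'
    gw = paddle.arange(w, dtype='float32')
    gh = paddle.arange(h, dtype='float32')
    gw, gh = paddle.meshgrid(gw, gh)
    pos_dim = embed_dim // 4
    omega = paddle.arange(pos_dim, dtype='float32') / pos_dim
    omega = 1. / (temperature ** omega)   # geometric frequency ladder
    out_w = gw.flatten()[:, None] * omega[None, :]
    out_h = gh.flatten()[:, None] * omega[None, :]
    return paddle.concat(
        [paddle.sin(out_w), paddle.cos(out_w),
         paddle.sin(out_h), paddle.cos(out_h)], axis=1)[None]  # [1, w*h, dim]

emb = sincos_pos_embed_2d(20, 20, 256)  # one embedding row per spatial position
```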
- - self.block_mid_channels = block_mid_channels - self.num_residual_blocks = num_residual_blocks - self.block_dilations = block_dilations - - out_ch = self.out_channels[0] - self.lateral_conv = nn.Conv2D( - self.in_channels[0], - out_ch, - 1, - weight_attr=ParamAttr(initializer=KaimingUniform( - negative_slope=1, nonlinearity='leaky_relu')), - bias_attr=ParamAttr(initializer=Constant(value=0.0))) - self.lateral_norm = nn.BatchNorm2D( - out_ch, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - self.fpn_conv = nn.Conv2D( - out_ch, - out_ch, - 3, - padding=1, - weight_attr=ParamAttr(initializer=KaimingUniform( - negative_slope=1, nonlinearity='leaky_relu'))) - self.fpn_norm = nn.BatchNorm2D( - out_ch, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - encoder_blocks = [] - for i in range(self.num_residual_blocks): - encoder_blocks.append( - Bottleneck( - out_ch, - self.block_mid_channels, - dilation=block_dilations[i])) - self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) - - def forward(self, inputs, for_mot=False): - out = self.lateral_norm(self.lateral_conv(inputs[0])) - out = self.fpn_norm(self.fpn_conv(out)) - out = self.dilated_encoder_blocks(out) - return [out] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self.out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/es_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/es_pan.py deleted file mode 100644 index bc24877..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/es_pan.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
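The deleted `DilatedEncoder` (YOLOF's single-level neck) follows a fixed recipe: a 1x1 lateral conv shrinks C5's channels, a 3x3 conv refines it, and four residual bottlenecks with dilations 2, 4, 6 and 8 widen the receptive field without adding pyramid levels. A shape-level walk-through with the defaults above (BatchNorm layers omitted for brevity):

```python
import paddle
import paddle.nn as nn

def bottleneck(ch, mid, dilation):
    # 1x1 reduce -> dilated 3x3 -> 1x1 restore, as in the Bottleneck above
    return nn.Sequential(
        nn.Conv2D(ch, mid, 1), nn.ReLU(),
        nn.Conv2D(mid, mid, 3, padding=dilation, dilation=dilation), nn.ReLU(),
        nn.Conv2D(mid, ch, 1), nn.ReLU())

c5 = paddle.rand([1, 2048, 20, 20])
x = nn.Conv2D(2048, 512, 1)(c5)           # lateral projection
x = nn.Conv2D(512, 512, 3, padding=1)(x)  # fpn conv
for d in (2, 4, 6, 8):                    # block_dilations
    x = x + bottleneck(512, 128, d)(x)    # residual keeps the spatial size
print(x.shape)                            # [1, 512, 20, 20]
```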
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable - -from ..shape_spec import ShapeSpec -from ..backbones.esnet import SEModule -from .csp_pan import ConvBNLayer, Channel_T, DPModule - -__all__ = ['ESPAN'] - - -class ES_Block(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - kernel_size=5, - stride=1, - act='leaky_relu'): - super(ES_Block, self).__init__() - self._residual = ConvBNLayer( - in_channel=in_channels, - out_channel=out_channels, - kernel_size=1, - stride=1, - groups=1, - act=act) - self._conv_pw = ConvBNLayer( - in_channel=in_channels, - out_channel=mid_channels // 2, - kernel_size=1, - stride=1, - groups=1, - act=act) - self._conv_dw = ConvBNLayer( - in_channel=mid_channels // 2, - out_channel=mid_channels // 2, - kernel_size=kernel_size, - stride=stride, - groups=mid_channels // 2, - act=None) - self._se = SEModule(mid_channels) - - self._conv_linear = ConvBNLayer( - in_channel=mid_channels, - out_channel=out_channels, - kernel_size=1, - stride=1, - groups=1, - act=act) - - self._out_conv = ConvBNLayer( - in_channel=out_channels * 2, - out_channel=out_channels, - kernel_size=1, - stride=1, - groups=1, - act=act) - - def forward(self, inputs): - x1 = self._residual(inputs) - x2 = self._conv_pw(inputs) - x3 = self._conv_dw(x2) - x3 = paddle.concat([x2, x3], axis=1) - x3 = self._se(x3) - x3 = self._conv_linear(x3) - out = paddle.concat([x1, x3], axis=1) - out = self._out_conv(out) - return out - - -@register -@serializable -class ESPAN(nn.Layer): - """Path Aggregation Network with ES module. - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - kernel_size (int): The conv2d kernel size of this Module. - num_features (int): Number of output features of CSPPAN module. - num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. 
Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=5, - num_features=3, - use_depthwise=True, - act='hard_swish', - spatial_scales=[0.125, 0.0625, 0.03125]): - super(ESPAN, self).__init__() - self.conv_t = Channel_T(in_channels, out_channels, act=act) - in_channels = [out_channels] * len(spatial_scales) - self.in_channels = in_channels - self.out_channels = out_channels - self.spatial_scales = spatial_scales - self.num_features = num_features - conv_func = DPModule if use_depthwise else ConvBNLayer - - if self.num_features == 4: - self.first_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.second_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.spatial_scales.append(self.spatial_scales[-1] / 2) - - # build top-down blocks - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.top_down_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.top_down_blocks.append( - ES_Block( - in_channels[idx - 1] * 2, - in_channels[idx - 1], - in_channels[idx - 1], - kernel_size=kernel_size, - stride=1, - act=act)) - - # build bottom-up blocks - self.downsamples = nn.LayerList() - self.bottom_up_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsamples.append( - conv_func( - in_channels[idx], - in_channels[idx], - kernel_size=kernel_size, - stride=2, - act=act)) - self.bottom_up_blocks.append( - ES_Block( - in_channels[idx] * 2, - in_channels[idx + 1], - in_channels[idx + 1], - kernel_size=kernel_size, - stride=1, - act=act)) - - def forward(self, inputs): - """ - Args: - inputs (tuple[Tensor]): input features. - - Returns: - tuple[Tensor]: CSPPAN features. - """ - assert len(inputs) == len(self.in_channels) - inputs = self.conv_t(inputs) - - # top-down path - inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] - - upsample_feat = self.upsample(feat_heigh) - - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat([upsample_feat, feat_low], 1)) - inner_outs.insert(0, inner_out) - - # bottom-up path - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx](paddle.concat( - [downsample_feat, feat_height], 1)) - outs.append(out) - - top_features = None - if self.num_features == 4: - top_features = self.first_top_conv(inputs[-1]) - top_features = top_features + self.second_top_conv(outs[-1]) - outs.append(top_features) - - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channels, stride=1. / s) - for s in self.spatial_scales - ] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/fpn.py deleted file mode 100644 index d08ca41..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/fpn.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import XavierUniform - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ..shape_spec import ShapeSpec - -__all__ = ['FPN'] - - -@register -@serializable -class FPN(nn.Layer): - """ - Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 - - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config - out_channel (int): output channel of each level - spatial_scales (list[float]): the spatial scales between input feature - maps and original input image which can be derived from the output - shape of backbone by from_config - has_extra_convs (bool): whether to add extra conv to the last level. - default False - extra_stage (int): the number of extra stages added to the last level. - default 1 - use_c5 (bool): Whether to use c5 as the input of extra stage, - otherwise p5 is used. default True - norm_type (string|None): The normalization type in FPN module. If - norm_type is None, norm will not be used after conv and if - norm_type is string, bn, gn, sync_bn are available. default None - norm_decay (float): weight decay for normalization layer weights. - default 0. - freeze_norm (bool): whether to freeze normalization layer. - default False - relu_before_extra_convs (bool): whether to add relu before extra convs. - default False - - """ - - def __init__(self, - in_channels, - out_channel, - spatial_scales=[0.25, 0.125, 0.0625, 0.03125], - has_extra_convs=False, - extra_stage=1, - use_c5=True, - norm_type=None, - norm_decay=0., - freeze_norm=False, - relu_before_extra_convs=True): - super(FPN, self).__init__() - self.out_channel = out_channel - for s in range(extra_stage): - spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
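[editor's note] The core of the FPN being deleted here is small enough to sketch without the ppdet plumbing: a 1x1 lateral conv per level, nearest-neighbor upsampling of the coarser map, elementwise addition, then a 3x3 conv to smooth aliasing. A minimal Paddle sketch of that fusion (TinyFPN and its toy channel counts are illustrative, not the class in this diff):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class TinyFPN(nn.Layer):
    # Toy two-level FPN: lateral + top-down fusion only.
    def __init__(self, c_low=512, c_high=1024, c_out=256):
        super().__init__()
        self.lat_low = nn.Conv2D(c_low, c_out, 1)    # lateral 1x1 on C4
        self.lat_high = nn.Conv2D(c_high, c_out, 1)  # lateral 1x1 on C5
        self.smooth = nn.Conv2D(c_out, c_out, 3, padding=1)

    def forward(self, c4, c5):
        p5 = self.lat_high(c5)
        p4 = self.lat_low(c4) + F.interpolate(p5, scale_factor=2., mode='nearest')
        return self.smooth(p4), p5

fpn = TinyFPN()
c4, c5 = paddle.randn([1, 512, 40, 40]), paddle.randn([1, 1024, 20, 20])
p4, p5 = fpn(c4, c5)
print(p4.shape, p5.shape)  # [1, 256, 40, 40] [1, 256, 20, 20]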
- self.spatial_scales = spatial_scales - self.has_extra_convs = has_extra_convs - self.extra_stage = extra_stage - self.use_c5 = use_c5 - self.relu_before_extra_convs = relu_before_extra_convs - self.norm_type = norm_type - self.norm_decay = norm_decay - self.freeze_norm = freeze_norm - - self.lateral_convs = [] - self.fpn_convs = [] - fan = out_channel * 3 * 3 - - # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone - # 0 <= st_stage < ed_stage <= 3 - st_stage = 4 - len(in_channels) - ed_stage = st_stage + len(in_channels) - 1 - for i in range(st_stage, ed_stage + 1): - if i == 3: - lateral_name = 'fpn_inner_res5_sum' - else: - lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) - in_c = in_channels[i - st_stage] - if self.norm_type is not None: - lateral = self.add_sublayer( - lateral_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=1, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=in_c))) - else: - lateral = self.add_sublayer( - lateral_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=in_c)))) - self.lateral_convs.append(lateral) - - fpn_name = 'fpn_res{}_sum'.format(i + 2) - if self.norm_type is not None: - fpn_conv = self.add_sublayer( - fpn_name, - ConvNormLayer( - ch_in=out_channel, - ch_out=out_channel, - filter_size=3, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - fpn_conv = self.add_sublayer( - fpn_name, - nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(fpn_conv) - - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - if self.has_extra_convs: - for i in range(self.extra_stage): - lvl = ed_stage + 1 + i - if i == 0 and self.use_c5: - in_c = in_channels[-1] - else: - in_c = out_channel - extra_fpn_name = 'fpn_{}'.format(lvl + 2) - if self.norm_type is not None: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=3, - stride=2, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=3, - stride=2, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(extra_fpn_conv) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'spatial_scales': [1.0 / i.stride for i in input_shape], - } - - def forward(self, body_feats): - laterals = [] - num_levels = len(body_feats) - for i in range(num_levels): - laterals.append(self.lateral_convs[i](body_feats[i])) - - for i in range(1, num_levels): - lvl = num_levels - i - upsample = F.interpolate( - laterals[lvl], - scale_factor=2., - mode='nearest', ) - laterals[lvl - 1] += upsample - - fpn_output = [] - for lvl in range(num_levels): - fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) - - if self.extra_stage > 0: - # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) - if not self.has_extra_convs: - assert self.extra_stage == 1, 
'extra_stage should be 1 if FPN has not extra convs' - fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - else: - if self.use_c5: - extra_source = body_feats[-1] - else: - extra_source = fpn_output[-1] - fpn_output.append(self.fpn_convs[num_levels](extra_source)) - - for i in range(1, self.extra_stage): - if self.relu_before_extra_convs: - fpn_output.append(self.fpn_convs[num_levels + i](F.relu( - fpn_output[-1]))) - else: - fpn_output.append(self.fpn_convs[num_levels + i]( - fpn_output[-1])) - return fpn_output - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. / s) - for s in self.spatial_scales - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/hrfpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/hrfpn.py deleted file mode 100644 index 5c45c99..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/hrfpn.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn.functional as F -import paddle.nn as nn -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['HRFPN'] - - -@register -class HRFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone - out_channel (int): number of output feature channels - share_conv (bool): whether to share conv for different layers' reduction - extra_stage (int): add extra stage for returning HRFPN fpn_feats - spatial_scales (list): feature map scaling factor - """ - - def __init__(self, - in_channels=[18, 36, 72, 144], - out_channel=256, - share_conv=False, - extra_stage=1, - spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32], - use_bias=False): - super(HRFPN, self).__init__() - in_channel = sum(in_channels) - self.in_channel = in_channel - self.out_channel = out_channel - self.share_conv = share_conv - for i in range(extra_stage): - spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
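[editor's note] HRFPN inverts the usual FPN recipe: instead of lateral top-down sums, it bilinearly upsamples every HRNet branch to the finest resolution, concatenates them, reduces channels with a single 1x1 conv, then rebuilds the coarser levels by average pooling. The upsample-concat-pool skeleton can be sanity-checked in isolation (a sketch with toy shapes; the 1x1 reduction conv is elided):

import paddle
import paddle.nn.functional as F

# Four HRNet branches at strides 4/8/16/32 (toy channel counts).
feats = [paddle.randn([1, c, 64 // 2**i, 64 // 2**i])
         for i, c in enumerate([18, 36, 72, 144])]

# 1) upsample everything to the finest branch and concatenate
up = [feats[0]] + [F.interpolate(f, scale_factor=2**i, mode='bilinear')
                   for i, f in enumerate(feats) if i > 0]
x = paddle.concat(up, axis=1)  # channels = 18 + 36 + 72 + 144 = 270

# 2) HRFPN would apply a 1x1 conv here to reduce 270 -> out_channel (omitted)
# 3) regenerate a pyramid by average pooling the fused map
pyramid = [x] + [F.avg_pool2d(x, kernel_size=2**i, stride=2**i)
                 for i in range(1, 4)]
print([p.shape for p in pyramid])  # strides 4/8/16/32 again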
- self.spatial_scales = spatial_scales - self.num_out = len(self.spatial_scales) - self.use_bias = use_bias - bias_attr = False if use_bias is False else None - - self.reduction = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=1, - bias_attr=bias_attr) - - if share_conv: - self.fpn_conv = nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - bias_attr=bias_attr) - else: - self.fpn_conv = [] - for i in range(self.num_out): - conv_name = "fpn_conv_" + str(i) - conv = self.add_sublayer( - conv_name, - nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - bias_attr=bias_attr)) - self.fpn_conv.append(conv) - - def forward(self, body_feats): - num_backbone_stages = len(body_feats) - - outs = [] - outs.append(body_feats[0]) - - # resize - for i in range(1, num_backbone_stages): - resized = F.interpolate( - body_feats[i], scale_factor=2**i, mode='bilinear') - outs.append(resized) - - # concat - out = paddle.concat(outs, axis=1) - assert out.shape[ - 1] == self.in_channel, 'in_channel should be {}, be received {}'.format( - out.shape[1], self.in_channel) - - # reduction - out = self.reduction(out) - - # conv - outs = [out] - for i in range(1, self.num_out): - outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i)) - outputs = [] - - for i in range(self.num_out): - conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i] - conv = conv_func(outs[i]) - outputs.append(conv) - - fpn_feats = [outputs[k] for k in range(self.num_out)] - return fpn_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'spatial_scales': [1.0 / i.stride for i in input_shape], - } - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. / s) - for s in self.spatial_scales - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/lc_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/lc_pan.py deleted file mode 100644 index 0c59c8a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/lc_pan.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable - -from ..shape_spec import ShapeSpec -from ..backbones.lcnet import DepthwiseSeparable -from .csp_pan import ConvBNLayer, Channel_T, DPModule - -__all__ = ['LCPAN'] - - -@register -@serializable -class LCPAN(nn.Layer): - """Path Aggregation Network with LCNet module. - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - kernel_size (int): The conv2d kernel size of this Module. - num_features (int): Number of output features of CSPPAN module. 
- num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=5, - num_features=3, - use_depthwise=True, - act='hard_swish', - spatial_scales=[0.125, 0.0625, 0.03125]): - super(LCPAN, self).__init__() - self.conv_t = Channel_T(in_channels, out_channels, act=act) - in_channels = [out_channels] * len(spatial_scales) - self.in_channels = in_channels - self.out_channels = out_channels - self.spatial_scales = spatial_scales - self.num_features = num_features - conv_func = DPModule if use_depthwise else ConvBNLayer - - NET_CONFIG = { - #k, in_c, out_c, stride, use_se - "block1": [ - [kernel_size, out_channels * 2, out_channels * 2, 1, False], - [kernel_size, out_channels * 2, out_channels, 1, False], - ], - "block2": [ - [kernel_size, out_channels * 2, out_channels * 2, 1, False], - [kernel_size, out_channels * 2, out_channels, 1, False], - ] - } - - if self.num_features == 4: - self.first_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.second_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.spatial_scales.append(self.spatial_scales[-1] / 2) - - # build top-down blocks - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.top_down_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.top_down_blocks.append( - nn.Sequential(* [ - DepthwiseSeparable( - num_channels=in_c, - num_filters=out_c, - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ - "block1"]) - ])) - - # build bottom-up blocks - self.downsamples = nn.LayerList() - self.bottom_up_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsamples.append( - conv_func( - in_channels[idx], - in_channels[idx], - kernel_size=kernel_size, - stride=2, - act=act)) - self.bottom_up_blocks.append( - nn.Sequential(* [ - DepthwiseSeparable( - num_channels=in_c, - num_filters=out_c, - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ - "block2"]) - ])) - - def forward(self, inputs): - """ - Args: - inputs (tuple[Tensor]): input features. - Returns: - tuple[Tensor]: CSPPAN features. - """ - assert len(inputs) == len(self.in_channels) - inputs = self.conv_t(inputs) - - # top-down path - inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] - - upsample_feat = self.upsample(feat_heigh) - - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat([upsample_feat, feat_low], 1)) - inner_outs.insert(0, inner_out) - - # bottom-up path - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx](paddle.concat( - [downsample_feat, feat_height], 1)) - outs.append(out) - - top_features = None - if self.num_features == 4: - top_features = self.first_top_conv(inputs[-1]) - top_features = top_features + self.second_top_conv(outs[-1]) - outs.append(top_features) - - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channels, stride=1. 
/ s) - for s in self.spatial_scales - ] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/ttf_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/ttf_fpn.py deleted file mode 100644 index 60cc69f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/ttf_fpn.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform -from ppdet.core.workspace import register, serializable -from paddle.regularizer import L2Decay -from ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv -import math -from ppdet.modeling.ops import batch_norm -from ..shape_spec import ShapeSpec - -__all__ = ['TTFFPN'] - - -class Upsample(nn.Layer): - def __init__(self, ch_in, ch_out, norm_type='bn'): - super(Upsample, self).__init__() - fan_in = ch_in * 3 * 3 - stdv = 1. / math.sqrt(fan_in) - self.dcn = DeformableConvV2( - ch_in, - ch_out, - kernel_size=3, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=ParamAttr( - initializer=Constant(0), - regularizer=L2Decay(0.), - learning_rate=2.), - lr_scale=2., - regularizer=L2Decay(0.)) - - self.bn = batch_norm( - ch_out, norm_type=norm_type, initializer=Constant(1.)) - - def forward(self, feat): - dcn = self.dcn(feat) - bn = self.bn(dcn) - relu = F.relu(bn) - out = F.interpolate(relu, scale_factor=2., mode='bilinear') - return out - - -class DeConv(nn.Layer): - def __init__(self, ch_in, ch_out, norm_type='bn'): - super(DeConv, self).__init__() - self.deconv = nn.Sequential() - conv1 = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - stride=1, - filter_size=1, - norm_type=norm_type, - initializer=XavierUniform()) - conv2 = nn.Conv2DTranspose( - in_channels=ch_out, - out_channels=ch_out, - kernel_size=4, - padding=1, - stride=2, - groups=ch_out, - weight_attr=ParamAttr(initializer=XavierUniform()), - bias_attr=False) - bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.) 
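[editor's note] ESPAN and LCPAN share the same two-pass wiring; only the fusion block differs (ES_Block vs. stacked DepthwiseSeparable). The top-down pass prepends fused maps to inner_outs, then the bottom-up pass appends downsampled fusions to outs. The index arithmetic is the fiddly part, so here it is with stand-in fusion functions (pure Python; fuse/up/down are placeholders for the real blocks):

# Stand-ins: in ppdet, fuse(a, b) is concat + ES_Block / DepthwiseSeparable,
# up is nn.Upsample(scale_factor=2), down is a stride-2 conv.
fuse = lambda a, b: f'fuse({a},{b})'
up = lambda x: f'up({x})'
down = lambda x: f'down({x})'

levels = ['P3', 'P4', 'P5']  # after Channel_T, all levels share one width

# top-down: coarse to fine, results inserted at the front
inner = [levels[-1]]
for idx in range(len(levels) - 1, 0, -1):
    inner.insert(0, fuse(up(inner[0]), levels[idx - 1]))

# bottom-up: fine to coarse, results appended at the back
outs = [inner[0]]
for idx in range(len(levels) - 1):
    outs.append(fuse(down(outs[-1]), inner[idx + 1]))

print(outs)  # three fused maps, finest first — mirrors ESPAN/LCPAN forward()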
- conv3 = ConvNormLayer( - ch_in=ch_out, - ch_out=ch_out, - stride=1, - filter_size=1, - norm_type=norm_type, - initializer=XavierUniform()) - - self.deconv.add_sublayer('conv1', conv1) - self.deconv.add_sublayer('relu6_1', nn.ReLU6()) - self.deconv.add_sublayer('conv2', conv2) - self.deconv.add_sublayer('bn', bn) - self.deconv.add_sublayer('relu6_2', nn.ReLU6()) - self.deconv.add_sublayer('conv3', conv3) - self.deconv.add_sublayer('relu6_3', nn.ReLU6()) - - def forward(self, inputs): - return self.deconv(inputs) - - -class LiteUpsample(nn.Layer): - def __init__(self, ch_in, ch_out, norm_type='bn'): - super(LiteUpsample, self).__init__() - self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type) - self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type) - - def forward(self, inputs): - deconv_up = self.deconv(inputs) - conv = self.conv(inputs) - interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear') - return deconv_up + interp_up - - -class ShortCut(nn.Layer): - def __init__(self, - layer_num, - ch_in, - ch_out, - norm_type='bn', - lite_neck=False, - name=None): - super(ShortCut, self).__init__() - shortcut_conv = nn.Sequential() - for i in range(layer_num): - fan_out = 3 * 3 * ch_out - std = math.sqrt(2. / fan_out) - in_channels = ch_in if i == 0 else ch_out - shortcut_name = name + '.conv.{}'.format(i) - if lite_neck: - shortcut_conv.add_sublayer( - shortcut_name, - LiteConv( - in_channels=in_channels, - out_channels=ch_out, - with_act=i < layer_num - 1, - norm_type=norm_type)) - else: - shortcut_conv.add_sublayer( - shortcut_name, - nn.Conv2D( - in_channels=in_channels, - out_channels=ch_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0, std)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - if i < layer_num - 1: - shortcut_conv.add_sublayer(shortcut_name + '.act', - nn.ReLU()) - self.shortcut = self.add_sublayer('shortcut', shortcut_conv) - - def forward(self, feat): - out = self.shortcut(feat) - return out - - -@register -@serializable -class TTFFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone. - [128,256,512,1024] by default, means the channels of DarkNet53 - backbone return_idx [1,2,3,4]. - planes (list): the number of output feature channels of FPN. - [256, 128, 64] by default - shortcut_num (list): the number of convolution layers in each shortcut. - [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs - in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. - bn by default - lite_neck (bool): whether to use lite conv in TTFNet FPN, - False by default - fusion_method (string): the method to fusion upsample and lateral layer. - 'add' and 'concat' are optional, add by default - """ - - __shared__ = ['norm_type'] - - def __init__(self, - in_channels, - planes=[256, 128, 64], - shortcut_num=[3, 2, 1], - norm_type='bn', - lite_neck=False, - fusion_method='add'): - super(TTFFPN, self).__init__() - self.planes = planes - self.shortcut_num = shortcut_num[::-1] - self.shortcut_len = len(shortcut_num) - self.ch_in = in_channels[::-1] - self.fusion_method = fusion_method - - self.upsample_list = [] - self.shortcut_list = [] - self.upper_list = [] - for i, out_c in enumerate(self.planes): - in_c = self.ch_in[i] if i == 0 else self.upper_list[-1] - upsample_module = LiteUpsample if lite_neck else Upsample - upsample = self.add_sublayer( - 'upsample.' 
+ str(i), - upsample_module( - in_c, out_c, norm_type=norm_type)) - self.upsample_list.append(upsample) - if i < self.shortcut_len: - shortcut = self.add_sublayer( - 'shortcut.' + str(i), - ShortCut( - self.shortcut_num[i], - self.ch_in[i + 1], - out_c, - norm_type=norm_type, - lite_neck=lite_neck, - name='shortcut.' + str(i))) - self.shortcut_list.append(shortcut) - if self.fusion_method == 'add': - upper_c = out_c - elif self.fusion_method == 'concat': - upper_c = out_c * 2 - else: - raise ValueError('Illegal fusion method. Expected add or\ - concat, but received {}'.format(self.fusion_method)) - self.upper_list.append(upper_c) - - def forward(self, inputs): - feat = inputs[-1] - for i, out_c in enumerate(self.planes): - feat = self.upsample_list[i](feat) - if i < self.shortcut_len: - shortcut = self.shortcut_list[i](inputs[-i - 2]) - if self.fusion_method == 'add': - feat = feat + shortcut - else: - feat = paddle.concat([feat, shortcut], axis=1) - return feat - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=self.upper_list[-1], )] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/yolo_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/yolo_fpn.py deleted file mode 100644 index 79f4cea..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/yolo_fpn.py +++ /dev/null @@ -1,1099 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import DropBlock -from ppdet.modeling.ops import get_act_fn -from ..backbones.darknet import ConvBNLayer -from ..shape_spec import ShapeSpec -from ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer - -__all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN', 'YOLOCSPPAN'] - - -def add_coord(x, data_format): - b = paddle.shape(x)[0] - if data_format == 'NCHW': - h, w = x.shape[2], x.shape[3] - else: - h, w = x.shape[1], x.shape[2] - - gx = paddle.cast(paddle.arange(w) / ((w - 1.) * 2.0) - 1., x.dtype) - gy = paddle.cast(paddle.arange(h) / ((h - 1.) 
* 2.0) - 1., x.dtype) - - if data_format == 'NCHW': - gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w]) - gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w]) - else: - gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1]) - gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1]) - - gx.stop_gradient = True - gy.stop_gradient = True - return gx, gy - - -class YoloDetBlock(nn.Layer): - def __init__(self, - ch_in, - channel, - norm_type, - freeze_norm=False, - name='', - data_format='NCHW'): - """ - YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 - - Args: - ch_in (int): input channel - channel (int): base channel - norm_type (str): batch norm type - freeze_norm (bool): whether to freeze norm, default False - name (str): layer name - data_format (str): data format, NCHW or NHWC - """ - super(YoloDetBlock, self).__init__() - self.ch_in = ch_in - self.channel = channel - assert channel % 2 == 0, \ - "channel {} cannot be divided by 2".format(channel) - conv_def = [ - ['conv0', ch_in, channel, 1, '.0.0'], - ['conv1', channel, channel * 2, 3, '.0.1'], - ['conv2', channel * 2, channel, 1, '.1.0'], - ['conv3', channel, channel * 2, 3, '.1.1'], - ['route', channel * 2, channel, 1, '.2'], - ] - - self.conv_module = nn.Sequential() - for idx, (conv_name, ch_in, ch_out, filter_size, - post_name) in enumerate(conv_def): - self.conv_module.add_sublayer( - conv_name, - ConvBNLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=filter_size, - padding=(filter_size - 1) // 2, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name + post_name)) - - self.tip = ConvBNLayer( - ch_in=channel, - ch_out=channel * 2, - filter_size=3, - padding=1, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name + '.tip') - - def forward(self, inputs): - route = self.conv_module(inputs) - tip = self.tip(route) - return route, tip - - -class SPP(nn.Layer): - def __init__(self, - ch_in, - ch_out, - k, - pool_size, - norm_type='bn', - freeze_norm=False, - name='', - act='leaky', - data_format='NCHW'): - """ - SPP layer, which consist of four pooling layer follwed by conv layer - - Args: - ch_in (int): input channel of conv layer - ch_out (int): output channel of conv layer - k (int): kernel size of conv layer - norm_type (str): batch norm type - freeze_norm (bool): whether to freeze norm, default False - name (str): layer name - act (str): activation function - data_format (str): data format, NCHW or NHWC - """ - super(SPP, self).__init__() - self.pool = [] - self.data_format = data_format - for size in pool_size: - pool = self.add_sublayer( - '{}.pool1'.format(name), - nn.MaxPool2D( - kernel_size=size, - stride=1, - padding=size // 2, - data_format=data_format, - ceil_mode=False)) - self.pool.append(pool) - self.conv = ConvBNLayer( - ch_in, - ch_out, - k, - padding=k // 2, - norm_type=norm_type, - freeze_norm=freeze_norm, - name=name, - act=act, - data_format=data_format) - - def forward(self, x): - outs = [x] - for pool in self.pool: - outs.append(pool(x)) - if self.data_format == "NCHW": - y = paddle.concat(outs, axis=1) - else: - y = paddle.concat(outs, axis=-1) - - y = self.conv(y) - return y - - -class CoordConv(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - padding, - norm_type, - freeze_norm=False, - name='', - data_format='NCHW'): - """ - CoordConv layer, see https://arxiv.org/abs/1807.03247 - - Args: - ch_in (int): input channel - ch_out (int): output channel - filter_size (int): filter size, default 3 - padding (int): 
padding size, default 0 - norm_type (str): batch norm type, default bn - name (str): layer name - data_format (str): data format, NCHW or NHWC - - """ - super(CoordConv, self).__init__() - self.conv = ConvBNLayer( - ch_in + 2, - ch_out, - filter_size=filter_size, - padding=padding, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name) - self.data_format = data_format - - def forward(self, x): - gx, gy = add_coord(x, self.data_format) - if self.data_format == 'NCHW': - y = paddle.concat([x, gx, gy], axis=1) - else: - y = paddle.concat([x, gx, gy], axis=-1) - y = self.conv(y) - return y - - -class PPYOLODetBlock(nn.Layer): - def __init__(self, cfg, name, data_format='NCHW'): - """ - PPYOLODetBlock layer - - Args: - cfg (list): layer configs for this block - name (str): block name - data_format (str): data format, NCHW or NHWC - """ - super(PPYOLODetBlock, self).__init__() - self.conv_module = nn.Sequential() - for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]): - kwargs.update( - name='{}.{}'.format(name, conv_name), data_format=data_format) - self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs)) - - conv_name, layer, args, kwargs = cfg[-1] - kwargs.update( - name='{}.{}'.format(name, conv_name), data_format=data_format) - self.tip = layer(*args, **kwargs) - - def forward(self, inputs): - route = self.conv_module(inputs) - tip = self.tip(route) - return route, tip - - -class PPYOLOTinyDetBlock(nn.Layer): - def __init__(self, - ch_in, - ch_out, - name, - drop_block=False, - block_size=3, - keep_prob=0.9, - data_format='NCHW'): - """ - PPYOLO Tiny DetBlock layer - Args: - ch_in (list): input channel number - ch_out (list): output channel number - name (str): block name - drop_block: whether user DropBlock - block_size: drop block size - keep_prob: probability to keep block in DropBlock - data_format (str): data format, NCHW or NHWC - """ - super(PPYOLOTinyDetBlock, self).__init__() - self.drop_block_ = drop_block - self.conv_module = nn.Sequential() - - cfgs = [ - # name, in channels, out channels, filter_size, - # stride, padding, groups - ['.0', ch_in, ch_out, 1, 1, 0, 1], - ['.1', ch_out, ch_out, 5, 1, 2, ch_out], - ['.2', ch_out, ch_out, 1, 1, 0, 1], - ['.route', ch_out, ch_out, 5, 1, 2, ch_out], - ] - for cfg in cfgs: - conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \ - groups = cfg - self.conv_module.add_sublayer( - name + conv_name, - ConvBNLayer( - ch_in=conv_ch_in, - ch_out=conv_ch_out, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - name=name + conv_name)) - - self.tip = ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out, - filter_size=1, - stride=1, - padding=0, - groups=1, - name=name + conv_name) - - if self.drop_block_: - self.drop_block = DropBlock( - block_size=block_size, - keep_prob=keep_prob, - data_format=data_format, - name=name + '.dropblock') - - def forward(self, inputs): - if self.drop_block_: - inputs = self.drop_block(inputs) - route = self.conv_module(inputs) - tip = self.tip(route) - return route, tip - - -class PPYOLODetBlockCSP(nn.Layer): - def __init__(self, - cfg, - ch_in, - ch_out, - act, - norm_type, - name, - data_format='NCHW'): - """ - PPYOLODetBlockCSP layer - - Args: - cfg (list): layer configs for this block - ch_in (int): input channel - ch_out (int): output channel - act (str): default mish - name (str): block name - data_format (str): data format, NCHW or NHWC - """ - super(PPYOLODetBlockCSP, self).__init__() - self.data_format = data_format - 
self.conv1 = ConvBNLayer( - ch_in, - ch_out, - 1, - padding=0, - act=act, - norm_type=norm_type, - name=name + '.left', - data_format=data_format) - self.conv2 = ConvBNLayer( - ch_in, - ch_out, - 1, - padding=0, - act=act, - norm_type=norm_type, - name=name + '.right', - data_format=data_format) - self.conv3 = ConvBNLayer( - ch_out * 2, - ch_out * 2, - 1, - padding=0, - act=act, - norm_type=norm_type, - name=name, - data_format=data_format) - self.conv_module = nn.Sequential() - for idx, (layer_name, layer, args, kwargs) in enumerate(cfg): - kwargs.update(name=name + layer_name, data_format=data_format) - self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs)) - - def forward(self, inputs): - conv_left = self.conv1(inputs) - conv_right = self.conv2(inputs) - conv_left = self.conv_module(conv_left) - if self.data_format == 'NCHW': - conv = paddle.concat([conv_left, conv_right], axis=1) - else: - conv = paddle.concat([conv_left, conv_right], axis=-1) - - conv = self.conv3(conv) - return conv, conv - - -@register -@serializable -class YOLOv3FPN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[256, 512, 1024], - norm_type='bn', - freeze_norm=False, - data_format='NCHW'): - """ - YOLOv3FPN layer - - Args: - in_channels (list): input channels for fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - - """ - super(YOLOv3FPN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_blocks = len(in_channels) - - self._out_channels = [] - self.yolo_blocks = [] - self.routes = [] - self.data_format = data_format - for i in range(self.num_blocks): - name = 'yolo_block.{}'.format(i) - in_channel = in_channels[-i - 1] - if i > 0: - in_channel += 512 // (2**i) - yolo_block = self.add_sublayer( - name, - YoloDetBlock( - in_channel, - channel=512 // (2**i), - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.yolo_blocks.append(yolo_block) - # tip layer output channel doubled - self._out_channels.append(1024 // (2**i)) - - if i < self.num_blocks - 1: - name = 'yolo_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=512 // (2**i), - ch_out=256 // (2**i), - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.routes.append(route) - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - yolo_feats = [] - - # add embedding features output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.yolo_blocks[i](block) - yolo_feats.append(tip) - - if for_mot: - # add embedding features output - emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - if for_mot: - return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} - else: - return yolo_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class 
PPYOLOFPN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[512, 1024, 2048], - norm_type='bn', - freeze_norm=False, - data_format='NCHW', - coord_conv=False, - conv_block_num=2, - drop_block=False, - block_size=3, - keep_prob=0.9, - spp=False): - """ - PPYOLOFPN layer - - Args: - in_channels (list): input channels for fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - coord_conv (bool): whether use CoordConv or not - conv_block_num (int): conv block num of each pan block - drop_block (bool): whether use DropBlock or not - block_size (int): block size of DropBlock - keep_prob (float): keep probability of DropBlock - spp (bool): whether use spp or not - - """ - super(PPYOLOFPN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_blocks = len(in_channels) - # parse kwargs - self.coord_conv = coord_conv - self.drop_block = drop_block - self.block_size = block_size - self.keep_prob = keep_prob - self.spp = spp - self.conv_block_num = conv_block_num - self.data_format = data_format - if self.coord_conv: - ConvLayer = CoordConv - else: - ConvLayer = ConvBNLayer - - if self.drop_block: - dropblock_cfg = [[ - 'dropblock', DropBlock, [self.block_size, self.keep_prob], - dict() - ]] - else: - dropblock_cfg = [] - - self._out_channels = [] - self.yolo_blocks = [] - self.routes = [] - for i, ch_in in enumerate(self.in_channels[::-1]): - if i > 0: - ch_in += 512 // (2**i) - channel = 64 * (2**self.num_blocks) // (2**i) - base_cfg = [] - c_in, c_out = ch_in, channel - for j in range(self.conv_block_num): - base_cfg += [ - [ - 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], - dict( - padding=0, - norm_type=norm_type, - freeze_norm=freeze_norm) - ], - [ - 'conv{}'.format(2 * j + 1), ConvBNLayer, - [c_out, c_out * 2, 3], dict( - padding=1, - norm_type=norm_type, - freeze_norm=freeze_norm) - ], - ] - c_in, c_out = c_out * 2, c_out - - base_cfg += [[ - 'route', ConvLayer, [c_in, c_out, 1], dict( - padding=0, norm_type=norm_type, freeze_norm=freeze_norm) - ], [ - 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( - padding=1, norm_type=norm_type, freeze_norm=freeze_norm) - ]] - - if self.conv_block_num == 2: - if i == 0: - if self.spp: - spp_cfg = [[ - 'spp', SPP, [channel * 4, channel, 1], dict( - pool_size=[5, 9, 13], - norm_type=norm_type, - freeze_norm=freeze_norm) - ]] - else: - spp_cfg = [] - cfg = base_cfg[0:3] + spp_cfg + base_cfg[ - 3:4] + dropblock_cfg + base_cfg[4:6] - else: - cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6] - elif self.conv_block_num == 0: - if self.spp and i == 0: - spp_cfg = [[ - 'spp', SPP, [c_in * 4, c_in, 1], dict( - pool_size=[5, 9, 13], - norm_type=norm_type, - freeze_norm=freeze_norm) - ]] - else: - spp_cfg = [] - cfg = spp_cfg + dropblock_cfg + base_cfg - name = 'yolo_block.{}'.format(i) - yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name)) - self.yolo_blocks.append(yolo_block) - self._out_channels.append(channel * 2) - if i < self.num_blocks - 1: - name = 'yolo_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=channel, - ch_out=256 // (2**i), - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.routes.append(route) - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - yolo_feats = [] - - # add embedding features 
output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.yolo_blocks[i](block) - yolo_feats.append(tip) - - if for_mot: - # add embedding features output - emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - if for_mot: - return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} - else: - return yolo_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class PPYOLOTinyFPN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[80, 56, 34], - detection_block_channels=[160, 128, 96], - norm_type='bn', - data_format='NCHW', - **kwargs): - """ - PPYOLO Tiny FPN layer - Args: - in_channels (list): input channels for fpn - detection_block_channels (list): channels in fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - kwargs: extra key-value pairs, such as parameter of DropBlock and spp - """ - super(PPYOLOTinyFPN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels[::-1] - assert len(detection_block_channels - ) > 0, "detection_block_channelslength should > 0" - self.detection_block_channels = detection_block_channels - self.data_format = data_format - self.num_blocks = len(in_channels) - # parse kwargs - self.drop_block = kwargs.get('drop_block', False) - self.block_size = kwargs.get('block_size', 3) - self.keep_prob = kwargs.get('keep_prob', 0.9) - - self.spp_ = kwargs.get('spp', False) - if self.spp_: - self.spp = SPP(self.in_channels[0] * 4, - self.in_channels[0], - k=1, - pool_size=[5, 9, 13], - norm_type=norm_type, - name='spp') - - self._out_channels = [] - self.yolo_blocks = [] - self.routes = [] - for i, ( - ch_in, ch_out - ) in enumerate(zip(self.in_channels, self.detection_block_channels)): - name = 'yolo_block.{}'.format(i) - if i > 0: - ch_in += self.detection_block_channels[i - 1] - yolo_block = self.add_sublayer( - name, - PPYOLOTinyDetBlock( - ch_in, - ch_out, - name, - drop_block=self.drop_block, - block_size=self.block_size, - keep_prob=self.keep_prob)) - self.yolo_blocks.append(yolo_block) - self._out_channels.append(ch_out) - - if i < self.num_blocks - 1: - name = 'yolo_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out, - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - data_format=data_format, - name=name)) - self.routes.append(route) - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - yolo_feats = [] - - # add embedding features output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i == 0 and self.spp_: - block = self.spp(block) - - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.yolo_blocks[i](block) - yolo_feats.append(tip) - - if for_mot: - # add embedding features output - 
emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - if for_mot: - return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} - else: - return yolo_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class PPYOLOPAN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[512, 1024, 2048], - norm_type='bn', - data_format='NCHW', - act='mish', - conv_block_num=3, - drop_block=False, - block_size=3, - keep_prob=0.9, - spp=False): - """ - PPYOLOPAN layer with SPP, DropBlock and CSP connection. - - Args: - in_channels (list): input channels for fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - act (str): activation function, default mish - conv_block_num (int): conv block num of each pan block - drop_block (bool): whether use DropBlock or not - block_size (int): block size of DropBlock - keep_prob (float): keep probability of DropBlock - spp (bool): whether use spp or not - - """ - super(PPYOLOPAN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_blocks = len(in_channels) - # parse kwargs - self.drop_block = drop_block - self.block_size = block_size - self.keep_prob = keep_prob - self.spp = spp - self.conv_block_num = conv_block_num - self.data_format = data_format - if self.drop_block: - dropblock_cfg = [[ - 'dropblock', DropBlock, [self.block_size, self.keep_prob], - dict() - ]] - else: - dropblock_cfg = [] - - # fpn - self.fpn_blocks = [] - self.fpn_routes = [] - fpn_channels = [] - for i, ch_in in enumerate(self.in_channels[::-1]): - if i > 0: - ch_in += 512 // (2**(i - 1)) - channel = 512 // (2**i) - base_cfg = [] - for j in range(self.conv_block_num): - base_cfg += [ - # name, layer, args - [ - '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], - dict( - padding=0, act=act, norm_type=norm_type) - ], - [ - '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], - dict( - padding=1, act=act, norm_type=norm_type) - ] - ] - - if i == 0 and self.spp: - base_cfg[3] = [ - 'spp', SPP, [channel * 4, channel, 1], dict( - pool_size=[5, 9, 13], act=act, norm_type=norm_type) - ] - - cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] - name = 'fpn.{}'.format(i) - fpn_block = self.add_sublayer( - name, - PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, - data_format)) - self.fpn_blocks.append(fpn_block) - fpn_channels.append(channel * 2) - if i < self.num_blocks - 1: - name = 'fpn_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=channel * 2, - ch_out=channel, - filter_size=1, - stride=1, - padding=0, - act=act, - norm_type=norm_type, - data_format=data_format, - name=name)) - self.fpn_routes.append(route) - # pan - self.pan_blocks = [] - self.pan_routes = [] - self._out_channels = [512 // (2**(self.num_blocks - 2)), ] - for i in reversed(range(self.num_blocks - 1)): - name = 'pan_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=fpn_channels[i + 1], - ch_out=fpn_channels[i + 1], - filter_size=3, - stride=2, - padding=1, - act=act, - norm_type=norm_type, - data_format=data_format, - name=name)) - self.pan_routes = [route, ] + self.pan_routes - 
base_cfg = [] - ch_in = fpn_channels[i] + fpn_channels[i + 1] - channel = 512 // (2**i) - for j in range(self.conv_block_num): - base_cfg += [ - # name, layer, args - [ - '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], - dict( - padding=0, act=act, norm_type=norm_type) - ], - [ - '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], - dict( - padding=1, act=act, norm_type=norm_type) - ] - ] - - cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] - name = 'pan.{}'.format(i) - pan_block = self.add_sublayer( - name, - PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, - data_format)) - - self.pan_blocks = [pan_block, ] + self.pan_blocks - self._out_channels.append(channel * 2) - - self._out_channels = self._out_channels[::-1] - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - fpn_feats = [] - - # add embedding features output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.fpn_blocks[i](block) - fpn_feats.append(tip) - - if for_mot: - # add embedding features output - emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.fpn_routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - pan_feats = [fpn_feats[-1], ] - route = fpn_feats[self.num_blocks - 1] - for i in reversed(range(self.num_blocks - 1)): - block = fpn_feats[i] - route = self.pan_routes[i](route) - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - - route, tip = self.pan_blocks[i](block) - pan_feats.append(tip) - - if for_mot: - return {'yolo_feats': pan_feats[::-1], 'emb_feats': emb_feats} - else: - return pan_feats[::-1] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class YOLOCSPPAN(nn.Layer): - """ - YOLO CSP-PAN, used in YOLOv5 and YOLOX. 
- """ - __shared__ = ['depth_mult', 'data_format', 'act', 'trt'] - - def __init__(self, - depth_mult=1.0, - in_channels=[256, 512, 1024], - depthwise=False, - data_format='NCHW', - act='silu', - trt=False): - super(YOLOCSPPAN, self).__init__() - self.in_channels = in_channels - self._out_channels = in_channels - Conv = DWConv if depthwise else BaseConv - - self.data_format = data_format - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.upsample = nn.Upsample(scale_factor=2, mode="nearest") - - # top-down fpn - self.lateral_convs = nn.LayerList() - self.fpn_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append( - BaseConv( - int(in_channels[idx]), - int(in_channels[idx - 1]), - 1, - 1, - act=act)) - self.fpn_blocks.append( - CSPLayer( - int(in_channels[idx - 1] * 2), - int(in_channels[idx - 1]), - round(3 * depth_mult), - shortcut=False, - depthwise=depthwise, - act=act)) - - # bottom-up pan - self.downsample_convs = nn.LayerList() - self.pan_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsample_convs.append( - Conv( - int(in_channels[idx]), - int(in_channels[idx]), - 3, - stride=2, - act=act)) - self.pan_blocks.append( - CSPLayer( - int(in_channels[idx] * 2), - int(in_channels[idx + 1]), - round(3 * depth_mult), - shortcut=False, - depthwise=depthwise, - act=act)) - - def forward(self, feats, for_mot=False): - assert len(feats) == len(self.in_channels) - - # top-down fpn - inner_outs = [feats[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = feats[idx - 1] - feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( - feat_heigh) - inner_outs[0] = feat_heigh - - upsample_feat = F.interpolate( - feat_heigh, - scale_factor=2., - mode="nearest", - data_format=self.data_format) - inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat( - [upsample_feat, feat_low], axis=1)) - inner_outs.insert(0, inner_out) - - # bottom-up pan - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsample_convs[idx](feat_low) - out = self.pan_blocks[idx](paddle.concat( - [downsample_feat, feat_height], axis=1)) - outs.append(out) - - return outs - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/ops.py b/pdfdet/models/Paddle/ppdet/modeling/ops.py deleted file mode 100644 index d9a1192..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ops.py +++ /dev/null @@ -1,1114 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -import paddle.nn.functional as F -import paddle.nn as nn -from paddle import ParamAttr -from paddle.regularizer import L2Decay -try: - import paddle._legacy_C_ops as C_ops -except ImportError: - import paddle._C_ops as C_ops - -from paddle import in_dynamic_mode -from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype - -__all__ = [ - 'prior_box', 'generate_proposals', 'box_coder', 'multiclass_nms', - 'distribute_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', 'silu', - 'swish', 'identity', 'anchor_generator' ] - - -def identity(x): - return x - - -def mish(x): - return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x)) - - -def silu(x): - return F.silu(x) - - -def swish(x): - return x * F.sigmoid(x) - - -TRT_ACT_SPEC = {'swish': swish, 'silu': swish} - -ACT_SPEC = {'mish': mish, 'silu': silu} - - -def get_act_fn(act=None, trt=False): - assert act is None or isinstance(act, ( - str, dict)), 'name of activation should be str, dict or None' - if not act: - return identity - - if isinstance(act, dict): - name = act['name'] - act.pop('name') - kwargs = act - else: - name = act - kwargs = dict() - - if trt and name in TRT_ACT_SPEC: - fn = TRT_ACT_SPEC[name] - elif name in ACT_SPEC: - fn = ACT_SPEC[name] - else: - fn = getattr(F, name) - - return lambda x: fn(x, **kwargs) - - -def batch_norm(ch, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - initializer=None, - data_format='NCHW'): - - norm_lr = 0. if freeze_norm else 1. - weight_attr = ParamAttr( - initializer=initializer, - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - - if norm_type in ['sync_bn', 'bn']: - norm_layer = nn.BatchNorm2D( - ch, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) - - norm_params = norm_layer.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - return norm_layer - - -@paddle.jit.not_to_static -def anchor_generator(input, - anchor_sizes=None, - aspect_ratios=None, - variance=[0.1, 0.1, 0.2, 0.2], - stride=None, - offset=0.5): - """ - **Anchor generator operator** - Generate anchors for the Faster RCNN algorithm. - Each position of the input produces N anchors, N = - size(anchor_sizes) * size(aspect_ratios). The order of generated anchors - is first the aspect_ratios loop, then the anchor_sizes loop. - Args: - input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map. - anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated - anchors, given in absolute pixels e.g. [64., 128., 256., 512.]. - For instance, the anchor size of 64 means the area of this anchor - equals 64**2. None by default. - aspect_ratios(float32|list|tuple, optional): The height / width ratios - of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default. - variance(list|tuple, optional): The variances to be used in box - regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by - default. - stride(list|tuple, optional): The anchors stride across width and height. - The data type is float32. e.g. [16.0, 16.0]. None by default. - offset(float32, optional): Prior boxes center offset. 0.5 by default. - Returns: - Tuple: - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
- H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. - - Variances(Variable): The expanded variances of anchors - with a layout of [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each variance is in (xcenter, ycenter, w, h) format. - Examples: - .. code-block:: python - import paddle.fluid as fluid - conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32') - anchor, var = fluid.layers.anchor_generator( - input=conv1, - anchor_sizes=[64, 128, 256, 512], - aspect_ratios=[0.5, 1.0, 2.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - """ - - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(anchor_sizes): - anchor_sizes = [anchor_sizes] - if not _is_list_or_tuple_(aspect_ratios): - aspect_ratios = [aspect_ratios] - if not (_is_list_or_tuple_(stride) and len(stride) == 2): - raise ValueError('stride should be a list or tuple ', - 'with length 2, (stride_width, stride_height).') - - anchor_sizes = list(map(float, anchor_sizes)) - aspect_ratios = list(map(float, aspect_ratios)) - stride = list(map(float, stride)) - - if in_dynamic_mode(): - attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios, - 'variances', variance, 'stride', stride, 'offset', offset) - anchor, var = C_ops.anchor_generator(input, *attrs) - return anchor, var - - helper = LayerHelper("anchor_generator", **locals()) - dtype = helper.input_dtype() - attrs = { - 'anchor_sizes': anchor_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'stride': stride, - 'offset': offset - } - - anchor = helper.create_variable_for_type_inference(dtype) - var = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="anchor_generator", - inputs={"Input": input}, - outputs={"Anchors": anchor, - "Variances": var}, - attrs=attrs, ) - anchor.stop_gradient = True - var.stop_gradient = True - return anchor, var - - -@paddle.jit.not_to_static -def distribute_fpn_proposals(fpn_rois, - min_level, - max_level, - refer_level, - refer_scale, - pixel_offset=False, - rois_num=None, - name=None): - r""" - - **This op only takes LoDTensor as input.** In Feature Pyramid Networks - (FPN) models, it is needed to distribute all proposals into different FPN - levels, with respect to the scale of the proposals, the referring scale and the - referring level. Besides, to restore the order of proposals, we return an - array which indicates the original index of rois in current proposals. - To compute the FPN level for each roi, the formula is given as follows: - - .. math:: - - roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} - - level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level) - - where BBoxArea is a function to compute the area of each roi. - Args: - - fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is - float32 or float64. The input fpn_rois. - min_level(int32): The lowest level of FPN layer where the proposals come - from. - max_level(int32): The highest level of FPN layer where the proposals - come from. - refer_level(int32): The referring level of FPN layer with specified scale. - refer_scale(int32): The referring scale of FPN layer with specified level. - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. - The shape is [B] and data type is int32.
B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element - is the output RoIs' number of each image on the corresponding level - and the shape is [B]. None by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tuple: - - multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] - and data type of float32 and float64. The length is - max_level-min_level+1. The proposals in each FPN level. - - restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is - the number of total rois. The data type is int32. It is - used to restore the order of fpn_rois. - - rois_num_per_level(List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape - is [B] and data type of int32. B is the number of images - - - Examples: - .. code-block:: python - - import paddle - from ppdet.modeling import ops - paddle.enable_static() - fpn_rois = paddle.static.data( - name='data', shape=[None, 4], dtype='float32', lod_level=1) - multi_rois, restore_ind = ops.distribute_fpn_proposals( - fpn_rois=fpn_rois, - min_level=2, - max_level=5, - refer_level=4, - refer_scale=224) - """ - num_lvl = max_level - min_level + 1 - - if in_dynamic_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', - refer_level, 'refer_scale', refer_scale, 'pixel_offset', - pixel_offset) - multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( - fpn_rois, rois_num, num_lvl, num_lvl, *attrs) - - return multi_rois, restore_ind, rois_num_per_level - - else: - check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], - 'distribute_fpn_proposals') - helper = LayerHelper('distribute_fpn_proposals', **locals()) - dtype = helper.input_dtype('fpn_rois') - multi_rois = [ - helper.create_variable_for_type_inference(dtype) - for i in range(num_lvl) - ] - - restore_ind = helper.create_variable_for_type_inference(dtype='int32') - - inputs = {'FpnRois': fpn_rois} - outputs = { - 'MultiFpnRois': multi_rois, - 'RestoreIndex': restore_ind, - } - - if rois_num is not None: - inputs['RoisNum'] = rois_num - rois_num_per_level = [ - helper.create_variable_for_type_inference(dtype='int32') - for i in range(num_lvl) - ] - outputs['MultiLevelRoIsNum'] = rois_num_per_level - else: - rois_num_per_level = None - - helper.append_op( - type='distribute_fpn_proposals', - inputs=inputs, - outputs=outputs, - attrs={ - 'min_level': min_level, - 'max_level': max_level, - 'refer_level': refer_level, - 'refer_scale': refer_scale, - 'pixel_offset': pixel_offset - }) - return multi_rois, restore_ind, rois_num_per_level - - -@paddle.jit.not_to_static -def prior_box(input, - image, - min_sizes, - max_sizes=None, - aspect_ratios=[1.], - variance=[0.1, 0.1, 0.2, 0.2], - flip=False, - clip=False, - steps=[0.0, 0.0], - offset=0.5, - min_max_aspect_ratios_order=False, - name=None): - """ - - This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. - Each position of the input produce N prior boxes, N is determined by - the count of min_sizes, max_sizes and aspect_ratios, The size of the - box is in range(min_size, max_size) interval, which is generated in - sequence according to the aspect_ratios. - - Parameters: - input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. 
- image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, - the data type should be float32 or float64. - min_sizes(list|tuple|float): the min sizes of generated prior boxes. - max_sizes(list|tuple|None): the max sizes of generated prior boxes. - Default: None. - aspect_ratios(list|tuple|float): the aspect ratios of generated - prior boxes. Default: [1.]. - variance(list|tuple): the variances to be encoded in prior boxes. - Default:[0.1, 0.1, 0.2, 0.2]. - flip(bool): Whether to flip aspect ratios. Default:False. - clip(bool): Whether to clip out-of-boundary boxes. Default: False. - steps(list|tuple): Prior boxes step across width and height. If - steps[0] equals 0.0 or steps[1] equals 0.0, the prior boxes step across - height or width of the input will be automatically calculated. - Default: [0., 0.] - offset(float): Prior boxes center offset. Default: 0.5 - min_max_aspect_ratios_order(bool): If set to True, the output prior box is - in order of [min, max, aspect_ratios], which is consistent with - Caffe. Please note, this order affects the weights order of - the following convolution layer and does not affect the final - detection results. Default: False. - name(str, optional): The default value is None. Normally there is no need for - the user to set this property. For more information, please refer to :ref:`api_guide_Name` - - Returns: - Tuple: A tuple with two Variables (boxes, variances) - - boxes(Tensor): the output prior boxes of PriorBox. - 4-D tensor, the layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total box count of each position of input. - - variances(Tensor): the expanded variances of PriorBox. - 4-D tensor, the layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total box count of each position of input - - Examples: - ..
code-block:: python - - import paddle - from ppdet.modeling import ops - - paddle.enable_static() - input = paddle.static.data(name="input", shape=[None,3,6,9]) - image = paddle.static.data(name="image", shape=[None,3,9,12]) - box, var = ops.prior_box( - input=input, - image=image, - min_sizes=[100.], - clip=True, - flip=True) - """ - helper = LayerHelper("prior_box", **locals()) - dtype = helper.input_dtype() - check_variable_and_dtype( - input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box') - - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(min_sizes): - min_sizes = [min_sizes] - if not _is_list_or_tuple_(aspect_ratios): - aspect_ratios = [aspect_ratios] - if not (_is_list_or_tuple_(steps) and len(steps) == 2): - raise ValueError('steps should be a list or tuple ', - 'with length 2, (step_width, step_height).') - - min_sizes = list(map(float, min_sizes)) - aspect_ratios = list(map(float, aspect_ratios)) - steps = list(map(float, steps)) - - cur_max_sizes = None - if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: - if not _is_list_or_tuple_(max_sizes): - max_sizes = [max_sizes] - cur_max_sizes = max_sizes - - if in_dynamic_mode(): - attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios, - 'variances', variance, 'flip', flip, 'clip', clip, 'step_w', - steps[0], 'step_h', steps[1], 'offset', offset, - 'min_max_aspect_ratios_order', min_max_aspect_ratios_order) - if cur_max_sizes is not None: - attrs += ('max_sizes', cur_max_sizes) - box, var = C_ops.prior_box(input, image, *attrs) - return box, var - else: - attrs = { - 'min_sizes': min_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': steps[0], - 'step_h': steps[1], - 'offset': offset, - 'min_max_aspect_ratios_order': min_max_aspect_ratios_order - } - - if cur_max_sizes is not None: - attrs['max_sizes'] = cur_max_sizes - - box = helper.create_variable_for_type_inference(dtype) - var = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="prior_box", - inputs={"Input": input, - "Image": image}, - outputs={"Boxes": box, - "Variances": var}, - attrs=attrs, ) - box.stop_gradient = True - var.stop_gradient = True - return box, var - - -@paddle.jit.not_to_static -def multiclass_nms(bboxes, - scores, - score_threshold, - nms_top_k, - keep_top_k, - nms_threshold=0.3, - normalized=True, - nms_eta=1., - background_label=-1, - return_index=False, - return_rois_num=True, - rois_num=None, - name=None): - """ - This operator performs multi-class non-maximum suppression (NMS) on - boxes and scores. - In the NMS step, this operator greedily selects a subset of detection bounding - boxes that have high scores larger than score_threshold, if this - threshold is provided, then selects the largest nms_top_k confidence scores if nms_top_k - is larger than -1. Then this operator prunes away boxes that have high IOU - (intersection over union) overlap with already selected boxes by adaptive - threshold NMS based on parameters of nms_threshold and nms_eta. - After the NMS step, at most keep_top_k number of total bboxes are to be kept - per image if keep_top_k is larger than -1. - Args: - bboxes (Tensor): Two types of bboxes are supported: - 1. (Tensor) A 3-D Tensor with shape - [N, M, 4 or 8, 16, 24, 32] represents the - predicted locations of M bounding bboxes, - N is the batch size.
Each bounding box has four - coordinate values and the layout is - [xmin, ymin, xmax, ymax], when the box size equals 4. - 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] - M is the number of bounding boxes, C is the - class number - scores (Tensor): Two types of scores are supported: - 1. (Tensor) A 3-D Tensor with shape [N, C, M] - represents the predicted confidence predictions. - N is the batch size, C is the class number, M is - the number of bounding boxes. For each category there - are in total M scores corresponding to the M bounding - boxes. Please note, M is equal to the 2nd dimension - of BBoxes. - 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. - M is the number of bboxes, C is the class number. - In this case, input BBoxes should be the second - case with shape [M, C, 4]. - background_label (int): The index of background label, the background - label will be ignored. If set to -1, then all - categories will be considered. Default: 0 - score_threshold (float): Threshold to filter out bounding boxes with - low confidence score. If not provided, - consider all boxes. - nms_top_k (int): Maximum number of detections to be kept according to - the confidences after filtering detections based - on score_threshold. - nms_threshold (float): The threshold to be used in NMS. Default: 0.3 - nms_eta (float): The parameter for adaptive NMS. Default: 1.0 - keep_top_k (int): Number of total bboxes to be kept per image after the NMS - step. -1 means keeping all bboxes after the NMS step. - normalized (bool): Whether detections are normalized. Default: True - return_index(bool): Whether to return the selected index. Default: False - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. - The shape is [B] and data type is int32. B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element - is the output RoIs' number of each image on the corresponding level - and the shape is [B]. None by default. - name(str): Name of the multiclass nms op. Default: None. - Returns: - A tuple with two Variables: (Out, Index) if return_index is True, - otherwise, a tuple with one Variable(Out) is returned. - Out: A 2-D LoDTensor with shape [No, 6] represents the detections. - Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] - or A 2-D LoDTensor with shape [No, 10] represents the detections. - Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, - x4, y4]. No is the total number of detections. - If no results are detected for all images, all elements in LoD will be - 0, and the output tensor is empty (None). - Index: Only returned when return_index is True. A 2-D LoDTensor with - shape [No, 1] represents the selected index, whose type is integer. - The index is the absolute index across batches. No is the same number - as Out. If the index is used to gather other attributes such as age, - one needs to reshape the input(N, M, 1) to (N * M, 1) first, where - N is the batch size and M is the number of boxes. - Examples: - ..
code-block:: python - - import paddle - from ppdet.modeling import ops - boxes = paddle.static.data(name='bboxes', shape=[81, 4], - dtype='float32', lod_level=1) - scores = paddle.static.data(name='scores', shape=[81], - dtype='float32', lod_level=1) - out, index = ops.multiclass_nms(bboxes=boxes, - scores=scores, - background_label=0, - score_threshold=0.5, - nms_top_k=400, - nms_threshold=0.3, - keep_top_k=200, - normalized=False, - return_index=True) - """ - helper = LayerHelper('multiclass_nms3', **locals()) - - if in_dynamic_mode(): - attrs = ('background_label', background_label, 'score_threshold', - score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', - nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, - 'normalized', normalized) - output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, - rois_num, *attrs) - if not return_index: - index = None - return output, nms_rois_num, index - - else: - output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) - index = helper.create_variable_for_type_inference(dtype='int32') - - inputs = {'BBoxes': bboxes, 'Scores': scores} - outputs = {'Out': output, 'Index': index} - - if rois_num is not None: - inputs['RoisNum'] = rois_num - - if return_rois_num: - nms_rois_num = helper.create_variable_for_type_inference( - dtype='int32') - outputs['NmsRoisNum'] = nms_rois_num - - helper.append_op( - type="multiclass_nms3", - inputs=inputs, - attrs={ - 'background_label': background_label, - 'score_threshold': score_threshold, - 'nms_top_k': nms_top_k, - 'nms_threshold': nms_threshold, - 'keep_top_k': keep_top_k, - 'nms_eta': nms_eta, - 'normalized': normalized - }, - outputs=outputs) - output.stop_gradient = True - index.stop_gradient = True - if not return_index: - index = None - if not return_rois_num: - nms_rois_num = None - - return output, nms_rois_num, index - - -@paddle.jit.not_to_static -def matrix_nms(bboxes, - scores, - score_threshold, - post_threshold, - nms_top_k, - keep_top_k, - use_gaussian=False, - gaussian_sigma=2., - background_label=0, - normalized=True, - return_index=False, - return_rois_num=True, - name=None): - """ - **Matrix NMS** - This operator performs matrix non-maximum suppression (Matrix NMS). - It first selects a subset of candidate bounding boxes that have higher scores - than score_threshold (if provided), then the top k candidates are selected if - nms_top_k is larger than -1. Scores of the remaining candidates are then - decayed according to the Matrix NMS scheme. - After the NMS step, at most keep_top_k number of total bboxes are to be kept - per image if keep_top_k is larger than -1. - Args: - bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the - predicted locations of M bounding bboxes, - N is the batch size. Each bounding box has four - coordinate values and the layout is - [xmin, ymin, xmax, ymax], when the box size equals 4. - The data type is float32 or float64. - scores (Tensor): A 3-D Tensor with shape [N, C, M] - represents the predicted confidence predictions. - N is the batch size, C is the class number, M is - the number of bounding boxes. For each category there - are in total M scores corresponding to the M bounding - boxes. Please note, M is equal to the 2nd dimension - of BBoxes. The data type is float32 or float64. - score_threshold (float): Threshold to filter out bounding boxes with - low confidence score. - post_threshold (float): Threshold to filter out bounding boxes with - low confidence score AFTER decaying.
- nms_top_k (int): Maximum number of detections to be kept according to - the confidences after filtering detections based - on score_threshold. - keep_top_k (int): Number of total bboxes to be kept per image after the NMS - step. -1 means keeping all bboxes after the NMS step. - use_gaussian (bool): Use Gaussian as the decay function. Default: False - gaussian_sigma (float): Sigma for the Gaussian decay function. Default: 2.0 - background_label (int): The index of background label, the background - label will be ignored. If set to -1, then all - categories will be considered. Default: 0 - normalized (bool): Whether detections are normalized. Default: True - return_index(bool): Whether to return the selected index. Default: False - return_rois_num(bool): whether to return rois_num. Default: True - name(str): Name of the matrix nms op. Default: None. - Returns: - A tuple with three Tensors: (Out, Index, RoisNum) if return_index is True, - otherwise, a tuple with two Tensors (Out, RoisNum) is returned. - Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - detection results. - Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] - (After version 1.3, when no boxes are detected, the lod is changed - from {0} to {1}) - Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - selected indices, which are absolute values across batches. - rois_num (Tensor): A 1-D Tensor with shape [N] containing - the number of detected boxes in each image. - Examples: - .. code-block:: python - import paddle - from ppdet.modeling import ops - boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], - dtype='float32', lod_level=1) - scores = paddle.static.data(name='scores', shape=[None,81], - dtype='float32', lod_level=1) - out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, - score_threshold=0.5, post_threshold=0.1, - nms_top_k=400, keep_top_k=200, normalized=False) - """ - check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], - 'matrix_nms') - check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], - 'matrix_nms') - check_type(score_threshold, 'score_threshold', float, 'matrix_nms') - check_type(post_threshold, 'post_threshold', float, 'matrix_nms') - check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms') - check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') - check_type(normalized, 'normalized', bool, 'matrix_nms') - check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') - check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') - check_type(background_label, 'background_label', int, 'matrix_nms') - - if in_dynamic_mode(): - attrs = ('background_label', background_label, 'score_threshold', - score_threshold, 'post_threshold', post_threshold, 'nms_top_k', - nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', - use_gaussian, 'keep_top_k', keep_top_k, 'normalized', - normalized) - out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) - if not return_index: - index = None - if not return_rois_num: - rois_num = None - return out, rois_num, index - else: - helper = LayerHelper('matrix_nms', **locals()) - output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) - index = helper.create_variable_for_type_inference(dtype='int32') - outputs = {'Out': output, 'Index': index} - if return_rois_num: - rois_num = helper.create_variable_for_type_inference(dtype='int32') - outputs['RoisNum'] = rois_num - - helper.append_op( - type="matrix_nms", - inputs={'BBoxes': bboxes, - 'Scores': scores}, - attrs={
'background_label': background_label, - 'score_threshold': score_threshold, - 'post_threshold': post_threshold, - 'nms_top_k': nms_top_k, - 'gaussian_sigma': gaussian_sigma, - 'use_gaussian': use_gaussian, - 'keep_top_k': keep_top_k, - 'normalized': normalized - }, - outputs=outputs) - output.stop_gradient = True - - if not return_index: - index = None - if not return_rois_num: - rois_num = None - return output, rois_num, index - - -@paddle.jit.not_to_static -def box_coder(prior_box, - prior_box_var, - target_box, - code_type="encode_center_size", - box_normalized=True, - axis=0, - name=None): - r""" - **Box Coder Layer** - Encode/Decode the target bounding box with the priorbox information. - - The encoding schema is described below: - .. math:: - ox = (tx - px) / pw / pxv - oy = (ty - py) / ph / pyv - ow = \log(|tw / pw|) / pwv - oh = \log(|th / ph|) / phv - The decoding schema is described below: - - .. math:: - - ox = (pw * pxv * tx + px) - tw / 2 - oy = (ph * pyv * ty + py) - th / 2 - ow = \exp(pwv * tw) * pw + tw / 2 - oh = \exp(phv * th) * ph + th / 2 - where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, - width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote - the priorbox's (anchor) center coordinates, width and height. `pxv`, - `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, - `ow`, `oh` denote the encoded/decoded coordinates, width and height. - During Box Decoding, two modes for broadcast are supported. Say the target - box has shape [N, M, 4], and the shape of the prior box can be [N, 4] or - [M, 4]. Then the prior box will broadcast to the target box along the - assigned axis. - - Args: - prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape - [M, 4] which holds M boxes and data type is float32 or float64. Each box - is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the - left top coordinate of the anchor box, if the input is image feature - map, they are close to the origin of the coordinate system. - [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var(List|Tensor|None): prior_box_var supports three types - of input. One is a Tensor with shape [M, 4] which holds M groups and - data type is float32 or float64. The second is a list consisting of - 4 elements shared by all boxes and data type is float32 or float64. - The third is None, which is not involved in the calculation. - target_box(Tensor): This input can be a 2-D LoDTensor with shape - [N, 4] when code_type is 'encode_center_size'. This input also can - be a 3-D Tensor with shape [N, M, 4] when code_type is - 'decode_center_size'. Each box is represented as - [xmin, ymin, xmax, ymax]. The data type is float32 or float64. - code_type(str): The code type used with the target box. It can be - `encode_center_size` or `decode_center_size`. `encode_center_size` - by default. - box_normalized(bool): Whether to treat the priorbox as a normalized box. - Set true by default. - axis(int): Which axis in PriorBox to broadcast for box decode, - for example, if axis is 0 and TargetBox has shape [N, M, 4] and - PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] - for decoding. It is only valid when code type is - `decode_center_size`. Set 0 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually there is no need to set name and it is - None by default.
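A worked instance of the encoding schema above (pure NumPy; the numeric values are illustrative):

.. code-block:: python

    import numpy as np
    # prior (anchor) box: center (px, py), size (pw, ph), and variances
    px, py, pw, ph = 50.0, 50.0, 20.0, 20.0
    pxv, pyv, pwv, phv = 0.1, 0.1, 0.2, 0.2
    # target box: center (tx, ty), size (tw, th)
    tx, ty, tw, th = 55.0, 45.0, 24.0, 18.0
    ox = (tx - px) / pw / pxv         # 2.5
    oy = (ty - py) / ph / pyv         # -2.5
    ow = np.log(abs(tw / pw)) / pwv   # ~0.91
    oh = np.log(abs(th / ph)) / phv   # ~-0.53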
- - Returns: - Tensor: - output_box(Tensor): When code_type is 'encode_center_size', the - output tensor of box_coder_op with shape [N, M, 4] representing the - result of N target boxes encoded with M Prior boxes and variances. - When code_type is 'decode_center_size', N represents the batch size - and M represents the number of decoded boxes. - - Examples: - - .. code-block:: python - - import paddle - from ppdet.modeling import ops - paddle.enable_static() - # For encode - prior_box_encode = paddle.static.data(name='prior_box_encode', - shape=[512, 4], - dtype='float32') - target_box_encode = paddle.static.data(name='target_box_encode', - shape=[81, 4], - dtype='float32') - output_encode = ops.box_coder(prior_box=prior_box_encode, - prior_box_var=[0.1,0.1,0.2,0.2], - target_box=target_box_encode, - code_type="encode_center_size") - # For decode - prior_box_decode = paddle.static.data(name='prior_box_decode', - shape=[512, 4], - dtype='float32') - target_box_decode = paddle.static.data(name='target_box_decode', - shape=[512, 81, 4], - dtype='float32') - output_decode = ops.box_coder(prior_box=prior_box_decode, - prior_box_var=[0.1,0.1,0.2,0.2], - target_box=target_box_decode, - code_type="decode_center_size", - box_normalized=False, - axis=1) - """ - check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], - 'box_coder') - check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], - 'box_coder') - - if in_dynamic_mode(): - if isinstance(prior_box_var, Variable): - output_box = C_ops.box_coder( - prior_box, prior_box_var, target_box, "code_type", code_type, - "box_normalized", box_normalized, "axis", axis) - - elif isinstance(prior_box_var, list): - output_box = C_ops.box_coder( - prior_box, None, target_box, "code_type", code_type, - "box_normalized", box_normalized, "axis", axis, "variance", - prior_box_var) - else: - raise TypeError( - "Input variance of box_coder must be Variable or list") - return output_box - else: - helper = LayerHelper("box_coder", **locals()) - - output_box = helper.create_variable_for_type_inference( - dtype=prior_box.dtype) - - inputs = {"PriorBox": prior_box, "TargetBox": target_box} - attrs = { - "code_type": code_type, - "box_normalized": box_normalized, - "axis": axis - } - if isinstance(prior_box_var, Variable): - inputs['PriorBoxVar'] = prior_box_var - elif isinstance(prior_box_var, list): - attrs['variance'] = prior_box_var - else: - raise TypeError( - "Input variance of box_coder must be Variable or list") - helper.append_op( - type="box_coder", - inputs=inputs, - attrs=attrs, - outputs={"OutputBox": output_box}) - return output_box - - -@paddle.jit.not_to_static -def generate_proposals(scores, - bbox_deltas, - im_shape, - anchors, - variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0, - pixel_offset=False, - return_rois_num=False, - name=None): - """ - **Generate proposals for Faster-RCNN** - This operation proposes RoIs according to each box's probability - of being a foreground object; - the boxes can be calculated from anchors. The bbox_deltas and objectness - scores are the outputs of the RPN. Final proposals - can be used to train the detection net. - For generating proposals, this operation performs the following steps - (see the schematic sketch after the list): - 1. Transposes and resizes scores and bbox_deltas to the sizes - (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposal candidates. - 3. Clip boxes to the image. - 4. Remove predicted boxes with small area. - 5. Apply NMS to get final proposals as output.
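A schematic single-image sketch of these steps (pure NumPy; the helper name and the simplified offset-style decoding in step 2 are illustrative, not the op's real box transform, and the NMS call itself is elided):

.. code-block:: python

    import numpy as np

    def sketch_generate_proposals(scores, deltas, anchors, im_h, im_w,
                                  pre_nms_top_n=6000, min_size=0.1):
        # 1. flatten per-anchor scores/deltas to (H*W*A,) and (H*W*A, 4)
        scores = scores.reshape(-1)
        deltas = deltas.reshape(-1, 4)
        order = scores.argsort()[::-1][:pre_nms_top_n]
        # 2. decode proposal candidates from anchors (simplified)
        boxes = anchors.reshape(-1, 4)[order] + deltas[order]
        # 3. clip to the image
        boxes[:, 0::2] = boxes[:, 0::2].clip(0, im_w)
        boxes[:, 1::2] = boxes[:, 1::2].clip(0, im_h)
        # 4. drop boxes that became too small
        keep = ((boxes[:, 2] - boxes[:, 0]) >= min_size) & \
               ((boxes[:, 3] - boxes[:, 1]) >= min_size)
        # 5. NMS would run here to produce the final proposals
        return boxes[keep], scores[order][keep]

    rois, probs = sketch_generate_proposals(
        np.random.rand(4, 5, 5), np.random.rand(4 * 5 * 5, 4),
        np.random.rand(4 * 5 * 5, 4) * 50, im_h=64, im_w=64)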
- Args: - scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents - the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and - width of the feature map. The data type must be float32. - bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] - represents the difference between predicted box location and - anchor location. The data type must be float32. - im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the - origin image size or input size. The data type can be float32 or - float64. - anchors(Tensor): A 4-D Tensor represents the anchors with a layout - of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is - in (xmin, ymin, xmax, ymax) format and unnormalized. The data type must be float32. - variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of - [H, W, num_priors, 4]. Each variance is in - (xcenter, ycenter, w, h) format. The data type must be float32. - pre_nms_top_n(float): Number of total bboxes to be kept per - image before NMS. The data type must be float32. `6000` by default. - post_nms_top_n(float): Number of total bboxes to be kept per - image after NMS. The data type must be float32. `1000` by default. - nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. - min_size(float): Remove predicted boxes with either height or - width < min_size. The data type must be float32. `0.1` by default. - eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, - `adaptive_threshold = adaptive_threshold * eta` in each iteration. - return_rois_num(bool): When set to True, it will return a 1D Tensor with shape [N, ] that contains the RoI - number of each image in the batch, where N is the number of images. For example, the tensor values [4, 5] mean that - the first image has 4 RoIs and the second image has 5 RoIs. It is only used in RCNN models. - 'False' by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually there is no need to set name and it is - None by default. - - Returns: - tuple: - A tuple with format ``(rpn_rois, rpn_roi_probs)``. - - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``. - - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``. - - Examples: - .. code-block:: python - - import paddle - from ppdet.modeling import ops - paddle.enable_static() - scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') - bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') - im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') - anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') - variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') - rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, - im_shape, anchors, variances) - """ - if in_dynamic_mode(): - assert return_rois_num, "return_rois_num should be True in dygraph mode."
- attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, - 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, - 'pixel_offset', pixel_offset) - rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( - scores, bbox_deltas, im_shape, anchors, variances, *attrs) - if not return_rois_num: - rpn_rois_num = None - return rpn_rois, rpn_roi_probs, rpn_rois_num - - else: - helper = LayerHelper('generate_proposals_v2', **locals()) - - check_variable_and_dtype(scores, 'scores', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'], - 'generate_proposals_v2') - check_variable_and_dtype(anchors, 'anchors', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(variances, 'variances', ['float32'], - 'generate_proposals_v2') - - rpn_rois = helper.create_variable_for_type_inference( - dtype=bbox_deltas.dtype) - rpn_roi_probs = helper.create_variable_for_type_inference( - dtype=scores.dtype) - outputs = { - 'RpnRois': rpn_rois, - 'RpnRoiProbs': rpn_roi_probs, - } - if return_rois_num: - rpn_rois_num = helper.create_variable_for_type_inference( - dtype='int32') - rpn_rois_num.stop_gradient = True - outputs['RpnRoisNum'] = rpn_rois_num - - helper.append_op( - type="generate_proposals_v2", - inputs={ - 'Scores': scores, - 'BboxDeltas': bbox_deltas, - 'ImShape': im_shape, - 'Anchors': anchors, - 'Variances': variances - }, - attrs={ - 'pre_nms_topN': pre_nms_top_n, - 'post_nms_topN': post_nms_top_n, - 'nms_thresh': nms_thresh, - 'min_size': min_size, - 'eta': eta, - 'pixel_offset': pixel_offset - }, - outputs=outputs) - rpn_rois.stop_gradient = True - rpn_roi_probs.stop_gradient = True - if not return_rois_num: - rpn_rois_num = None - - return rpn_rois, rpn_roi_probs, rpn_rois_num - - -def sigmoid_cross_entropy_with_logits(input, - label, - ignore_index=-100, - normalize=False): - output = F.binary_cross_entropy_with_logits(input, label, reduction='none') - mask_tensor = paddle.cast(label != ignore_index, 'float32') - output = paddle.multiply(output, mask_tensor) - if normalize: - sum_valid_mask = paddle.sum(mask_tensor) - output = output / sum_valid_mask - return output - - -def smooth_l1(input, label, inside_weight=None, outside_weight=None, - sigma=None): - input_new = paddle.multiply(input, inside_weight) - label_new = paddle.multiply(label, inside_weight) - delta = 1 / (sigma * sigma) - out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) - out = paddle.multiply(out, outside_weight) - out = out / delta - out = paddle.reshape(out, shape=[out.shape[0], -1]) - out = paddle.sum(out, axis=1) - return out - - -def channel_shuffle(x, groups): - batch_size, num_channels, height, width = x.shape[0:4] - assert num_channels % groups == 0, 'num_channels should be divisible by groups' - channels_per_group = num_channels // groups - x = paddle.reshape( - x=x, shape=[batch_size, groups, channels_per_group, height, width]) - x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) - x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) - return x - - -def get_static_shape(tensor): - shape = paddle.shape(tensor) - shape.stop_gradient = True - return shape diff --git a/pdfdet/models/Paddle/ppdet/modeling/post_process.py b/pdfdet/models/Paddle/ppdet/modeling/post_process.py deleted file mode 100644 index efde830..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/post_process.py +++ 
/dev/null @@ -1,801 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from ppdet.modeling.bbox_utils import nonempty_bbox -from .transformers import bbox_cxcywh_to_xyxy -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -__all__ = [ - 'BBoxPostProcess', 'MaskPostProcess', 'JDEBBoxPostProcess', - 'CenterNetPostProcess', 'DETRPostProcess', 'SparsePostProcess', - 'DETRBBoxSemiPostProcess' -] - - -@register -class BBoxPostProcess(object): - __shared__ = ['num_classes', 'export_onnx', 'export_eb'] - __inject__ = ['decode', 'nms'] - - def __init__(self, - num_classes=80, - decode=None, - nms=None, - export_onnx=False, - export_eb=False): - super(BBoxPostProcess, self).__init__() - self.num_classes = num_classes - self.decode = decode - self.nms = nms - self.export_onnx = export_onnx - self.export_eb = export_eb - - def __call__(self, head_out, rois, im_shape, scale_factor): - """ - Decode the bbox and do NMS if needed. - - Args: - head_out (tuple): bbox_pred and cls_prob of bbox_head output. - rois (tuple): roi and rois_num of rpn_head output. - im_shape (Tensor): The shape of the input image. - scale_factor (Tensor): The scale factor of the input image. - export_onnx (bool): whether export model to onnx - Returns: - bbox_pred (Tensor): The output prediction with shape [N, 6], including - labels, scores and bboxes. The size of bboxes are corresponding - to the input image, the bboxes may be used in other branch. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [1], and is N. - """ - if self.nms is not None: - bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) - bbox_pred, bbox_num, before_nms_indexes = self.nms(bboxes, score, - self.num_classes) - - else: - bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, - scale_factor) - - if self.export_onnx: - # add fake box after postprocess when exporting onnx - fake_bboxes = paddle.to_tensor( - np.array( - [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) - - bbox_pred = paddle.concat([bbox_pred, fake_bboxes]) - bbox_num = bbox_num + 1 - - if self.nms is not None: - return bbox_pred, bbox_num, before_nms_indexes - else: - return bbox_pred, bbox_num - - def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): - """ - Rescale, clip and filter the bbox from the output of NMS to - get final prediction. - - Notes: - Currently only support bs = 1. - - Args: - bboxes (Tensor): The output bboxes with shape [N, 6] after decode - and NMS, including labels, scores and bboxes. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [1], and is N. - im_shape (Tensor): The shape of the input image. - scale_factor (Tensor): The scale factor of the input image. 
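The rescale-and-clip core of this method, sketched in NumPy (values are illustrative; each bboxes row holds [label, score, x1, y1, x2, y2] and scale_factor holds [scale_y, scale_x]):

.. code-block:: python

    import numpy as np
    bboxes = np.array([[0., 0.9, 20., 40., 220., 340.]], dtype='float32')
    im_shape, scale_factor = np.array([400., 600.]), np.array([2.0, 2.0])
    origin_hw = np.floor(im_shape / scale_factor + 0.5)   # [200., 300.]
    xyxy = bboxes[:, 2:] / scale_factor[[1, 0, 1, 0]]     # back to origin scale
    xyxy[:, 0::2] = xyxy[:, 0::2].clip(0, origin_hw[1])   # clip x to width
    xyxy[:, 1::2] = xyxy[:, 1::2].clip(0, origin_hw[0])   # clip y to height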
- Returns: - pred_result (Tensor): The final prediction results with shape [N, 6] - including labels, scores and bboxes. - """ - if self.export_eb: - # enable rcnn models for edgeboard hw to skip the following postprocess. - return bboxes, bboxes, bbox_num - - if not self.export_onnx: - bboxes_list = [] - bbox_num_list = [] - id_start = 0 - fake_bboxes = paddle.to_tensor( - np.array( - [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) - fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - - # add fake bbox when output is empty for each batch - for i in range(bbox_num.shape[0]): - if bbox_num[i] == 0: - bboxes_i = fake_bboxes - bbox_num_i = fake_bbox_num - else: - bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] - bbox_num_i = bbox_num[i:i + 1] - id_start += bbox_num[i:i + 1] - bboxes_list.append(bboxes_i) - bbox_num_list.append(bbox_num_i) - bboxes = paddle.concat(bboxes_list) - bbox_num = paddle.concat(bbox_num_list) - - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - - if not self.export_onnx: - origin_shape_list = [] - scale_factor_list = [] - # scale_factor: scale_y, scale_x - for i in range(bbox_num.shape[0]): - expand_shape = paddle.expand(origin_shape[i:i + 1, :], - [bbox_num[i:i + 1], 2]) - scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2] - scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) - expand_scale = paddle.expand(scale, [bbox_num[i:i + 1], 4]) - origin_shape_list.append(expand_shape) - scale_factor_list.append(expand_scale) - - self.origin_shape_list = paddle.concat(origin_shape_list) - scale_factor_list = paddle.concat(scale_factor_list) - - else: - # simplify the computation for bs=1 when exporting onnx - scale_y, scale_x = scale_factor[0][0], scale_factor[0][1] - scale = paddle.concat( - [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0) - self.origin_shape_list = paddle.expand(origin_shape, - [bbox_num[0:1], 2]) - scale_factor_list = paddle.expand(scale, [bbox_num[0:1], 4]) - - # bboxes: [N, 6], label, score, bbox - pred_label = bboxes[:, 0:1] - pred_score = bboxes[:, 1:2] - pred_bbox = bboxes[:, 2:] - # rescale bbox to original image - scaled_bbox = pred_bbox / scale_factor_list - origin_h = self.origin_shape_list[:, 0] - origin_w = self.origin_shape_list[:, 1] - zeros = paddle.zeros_like(origin_h) - # clip bbox to [0, original_size] - x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros) - y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros) - x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros) - y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros) - pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1) - # filter empty bbox - keep_mask = nonempty_bbox(pred_bbox, return_mask=True) - keep_mask = paddle.unsqueeze(keep_mask, [1]) - pred_label = paddle.where(keep_mask, pred_label, - paddle.ones_like(pred_label) * -1) - pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) - return bboxes, pred_result, bbox_num - - def get_origin_shape(self, ): - return self.origin_shape_list - - -@register -class MaskPostProcess(object): - __shared__ = ['export_onnx', 'assign_on_cpu'] - """ - refer to: - https://github.com/facebookresearch/detectron2/layers/mask_ops.py - - Get Mask output according to the output from model - """ - - def __init__(self, - binary_thresh=0.5, - export_onnx=False, - assign_on_cpu=False): - super(MaskPostProcess, self).__init__() - self.binary_thresh = binary_thresh - self.export_onnx = export_onnx - 
self.assign_on_cpu = assign_on_cpu - - def __call__(self, mask_out, bboxes, bbox_num, origin_shape): - """ - Decode the mask_out and paste the mask to the origin image. - - Args: - mask_out (Tensor): mask_head output with shape [N, 28, 28]. - bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode - and NMS, including labels, scores and bboxes. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [1], and is N. - origin_shape (Tensor): The origin shape of the input image, the tensor - shape is [N, 2], and each row is [h, w]. - Returns: - pred_result (Tensor): The final prediction mask results with shape - [N, h, w] in binary mask style. - """ - num_mask = mask_out.shape[0] - origin_shape = paddle.cast(origin_shape, 'int32') - device = paddle.device.get_device() - - if self.export_onnx: - h, w = origin_shape[0][0], origin_shape[0][1] - mask_onnx = paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], h, w, - self.assign_on_cpu) - mask_onnx = mask_onnx >= self.binary_thresh - pred_result = paddle.cast(mask_onnx, 'int32') - - else: - max_h = paddle.max(origin_shape[:, 0]) - max_w = paddle.max(origin_shape[:, 1]) - pred_result = paddle.zeros( - [num_mask, max_h, max_w], dtype='int32') - 1 - - id_start = 0 - for i in range(paddle.shape(bbox_num)[0]): - bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] - mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :] - im_h = origin_shape[i, 0] - im_w = origin_shape[i, 1] - pred_mask = paste_mask(mask_out_i[:, None, :, :], - bboxes_i[:, 2:], im_h, im_w, - self.assign_on_cpu) - pred_mask = paddle.cast(pred_mask >= self.binary_thresh, - 'int32') - pred_result[id_start:id_start + bbox_num[i], :im_h, : - im_w] = pred_mask - id_start += bbox_num[i] - if self.assign_on_cpu: - paddle.set_device(device) - - return pred_result - - -@register -class JDEBBoxPostProcess(nn.Layer): - __shared__ = ['num_classes'] - __inject__ = ['decode', 'nms'] - - def __init__(self, num_classes=1, decode=None, nms=None, return_idx=True): - super(JDEBBoxPostProcess, self).__init__() - self.num_classes = num_classes - self.decode = decode - self.nms = nms - self.return_idx = return_idx - - self.fake_bbox_pred = paddle.to_tensor( - np.array( - [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) - self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - self.fake_nms_keep_idx = paddle.to_tensor( - np.array( - [[0]], dtype='int32')) - - self.fake_yolo_boxes_out = paddle.to_tensor( - np.array( - [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32')) - self.fake_yolo_scores_out = paddle.to_tensor( - np.array( - [[[0.0]]], dtype='float32')) - self.fake_boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64')) - - def forward(self, head_out, anchors): - """ - Decode the bbox and do NMS for JDE model. - - Args: - head_out (list): Bbox_pred and cls_prob of bbox_head output. - anchors (list): Anchors of JDE model. - - Returns: - boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. - bbox_pred (Tensor): The output is the prediction with shape [N, 6] - including labels, scores and bboxes. - bbox_num (Tensor): The number of prediction of each batch with shape [N]. - nms_keep_idx (Tensor): The index of kept bboxes after NMS. 
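Note: when the decoder finds no boxes, the method below substitutes fixed fake outputs so every returned tensor keeps a stable shape. A minimal sketch of that padding pattern (the helper name is illustrative, NumPy stands in for paddle tensors):

.. code-block:: python

    import numpy as np

    def pad_empty(bbox_pred):
        # fall back to one fake row [label, score, x1, y1, x2, y2]
        fake_bbox = np.array([[-1., 0., 0., 0., 0., 0.]], dtype='float32')
        if bbox_pred.shape[0] == 0:
            return fake_bbox, np.array([1], dtype='int32')
        return bbox_pred, np.array([bbox_pred.shape[0]], dtype='int32')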
- """ - boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors) - - if len(boxes_idx) == 0: - boxes_idx = self.fake_boxes_idx - yolo_boxes_out = self.fake_yolo_boxes_out - yolo_scores_out = self.fake_yolo_scores_out - else: - yolo_boxes = paddle.gather_nd(yolo_boxes_scores, boxes_idx) - # TODO: only support bs=1 now - yolo_boxes_out = paddle.reshape( - yolo_boxes[:, :4], shape=[1, len(boxes_idx), 4]) - yolo_scores_out = paddle.reshape( - yolo_boxes[:, 4:5], shape=[1, 1, len(boxes_idx)]) - boxes_idx = boxes_idx[:, 1:] - - if self.return_idx: - bbox_pred, bbox_num, nms_keep_idx = self.nms( - yolo_boxes_out, yolo_scores_out, self.num_classes) - if bbox_pred.shape[0] == 0: - bbox_pred = self.fake_bbox_pred - bbox_num = self.fake_bbox_num - nms_keep_idx = self.fake_nms_keep_idx - return boxes_idx, bbox_pred, bbox_num, nms_keep_idx - else: - bbox_pred, bbox_num, _ = self.nms(yolo_boxes_out, yolo_scores_out, - self.num_classes) - if bbox_pred.shape[0] == 0: - bbox_pred = self.fake_bbox_pred - bbox_num = self.fake_bbox_num - return _, bbox_pred, bbox_num, _ - - -@register -class CenterNetPostProcess(object): - """ - Postprocess the model outputs to get final prediction: - 1. Do NMS for heatmap to get top `max_per_img` bboxes. - 2. Decode bboxes using center offset and box size. - 3. Rescale decoded bboxes reference to the origin image shape. - Args: - max_per_img(int): the maximum number of predicted objects in a image, - 500 by default. - down_ratio(int): the down ratio from images to heatmap, 4 by default. - regress_ltrb (bool): whether to regress left/top/right/bottom or - width/height for a box, true by default. - """ - __shared__ = ['down_ratio'] - - def __init__(self, max_per_img=500, down_ratio=4, regress_ltrb=True): - super(CenterNetPostProcess, self).__init__() - self.max_per_img = max_per_img - self.down_ratio = down_ratio - self.regress_ltrb = regress_ltrb - # _simple_nms() _topk() are same as TTFBox in ppdet/modeling/layers.py - - def _simple_nms(self, heat, kernel=3): - """ Use maxpool to filter the max score, get local peaks. """ - pad = (kernel - 1) // 2 - hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) - keep = paddle.cast(hmax == heat, 'float32') - return heat * keep - - def _topk(self, scores): - """ Select top k scores and decode to get xy coordinates. 
""" - k = self.max_per_img - shape_fm = paddle.shape(scores) - shape_fm.stop_gradient = True - cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] - # batch size is 1 - scores_r = paddle.reshape(scores, [cat, -1]) - topk_scores, topk_inds = paddle.topk(scores_r, k) - topk_ys = topk_inds // width - topk_xs = topk_inds % width - - topk_score_r = paddle.reshape(topk_scores, [-1]) - topk_score, topk_ind = paddle.topk(topk_score_r, k) - k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64') - topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') - - topk_inds = paddle.reshape(topk_inds, [-1]) - topk_ys = paddle.reshape(topk_ys, [-1, 1]) - topk_xs = paddle.reshape(topk_xs, [-1, 1]) - topk_inds = paddle.gather(topk_inds, topk_ind) - topk_ys = paddle.gather(topk_ys, topk_ind) - topk_xs = paddle.gather(topk_xs, topk_ind) - return topk_score, topk_inds, topk_clses, topk_ys, topk_xs - - def __call__(self, hm, wh, reg, im_shape, scale_factor): - # 1.get clses and scores, note that hm had been done sigmoid - heat = self._simple_nms(hm) - scores, inds, topk_clses, ys, xs = self._topk(heat) - clses = topk_clses.unsqueeze(1) - scores = scores.unsqueeze(1) - - # 2.get bboxes, note only support batch_size=1 now - reg_t = paddle.transpose(reg, [0, 2, 3, 1]) - reg = paddle.reshape(reg_t, [-1, reg_t.shape[-1]]) - reg = paddle.gather(reg, inds) - xs = paddle.cast(xs, 'float32') - ys = paddle.cast(ys, 'float32') - xs = xs + reg[:, 0:1] - ys = ys + reg[:, 1:2] - wh_t = paddle.transpose(wh, [0, 2, 3, 1]) - wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) - wh = paddle.gather(wh, inds) - if self.regress_ltrb: - x1 = xs - wh[:, 0:1] - y1 = ys - wh[:, 1:2] - x2 = xs + wh[:, 2:3] - y2 = ys + wh[:, 3:4] - else: - x1 = xs - wh[:, 0:1] / 2 - y1 = ys - wh[:, 1:2] / 2 - x2 = xs + wh[:, 0:1] / 2 - y2 = ys + wh[:, 1:2] / 2 - n, c, feat_h, feat_w = paddle.shape(hm) - padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2 - padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2 - x1 = x1 * self.down_ratio - y1 = y1 * self.down_ratio - x2 = x2 * self.down_ratio - y2 = y2 * self.down_ratio - x1 = x1 - padw - y1 = y1 - padh - x2 = x2 - padw - y2 = y2 - padh - bboxes = paddle.concat([x1, y1, x2, y2], axis=1) - scale_y = scale_factor[:, 0:1] - scale_x = scale_factor[:, 1:2] - scale_expand = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], axis=1) - boxes_shape = bboxes.shape[:] - scale_expand = paddle.expand(scale_expand, shape=boxes_shape) - bboxes = paddle.divide(bboxes, scale_expand) - - results = paddle.concat([clses, scores, bboxes], axis=1) - return results, paddle.shape(results)[0:1], inds, topk_clses, ys, xs - - -@register -class DETRPostProcess(object): - __shared__ = ['num_classes', 'use_focal_loss', 'with_mask'] - __inject__ = [] - - def __init__(self, - num_classes=80, - num_top_queries=100, - dual_queries=False, - dual_groups=0, - use_focal_loss=False, - with_mask=False, - mask_threshold=0.5, - use_avg_mask_score=False, - bbox_decode_type='origin'): - super(DETRPostProcess, self).__init__() - assert bbox_decode_type in ['origin', 'pad'] - - self.num_classes = num_classes - self.num_top_queries = num_top_queries - self.dual_queries = dual_queries - self.dual_groups = dual_groups - self.use_focal_loss = use_focal_loss - self.with_mask = with_mask - self.mask_threshold = mask_threshold - self.use_avg_mask_score = use_avg_mask_score - self.bbox_decode_type = bbox_decode_type - - def _mask_postprocess(self, mask_pred, score_pred, index): - mask_score = F.sigmoid(paddle.gather_nd(mask_pred, 
index)) - mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) - if self.use_avg_mask_score: - avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( - mask_pred.sum([-2, -1]) + 1e-6) - score_pred *= avg_mask_score - - return mask_pred[0].astype('int32'), score_pred - - def __call__(self, head_out, im_shape, scale_factor, pad_shape): - """ - Decode the bbox and mask. - - Args: - head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. - im_shape (Tensor): The shape of the input image without padding. - scale_factor (Tensor): The scale factor of the input image. - pad_shape (Tensor): The shape of the input image with padding. - Returns: - bbox_pred (Tensor): The output prediction with shape [N, 6], including - labels, scores and bboxes. The size of bboxes are corresponding - to the input image, the bboxes may be used in other branch. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [bs], and is N. - """ - bboxes, logits, masks = head_out - if self.dual_queries: - num_queries = logits.shape[1] - logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \ - bboxes[:, :int(num_queries // (self.dual_groups + 1)), :] - - bbox_pred = bbox_cxcywh_to_xyxy(bboxes) - # calculate the original shape of the image - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - img_h, img_w = paddle.split(origin_shape, 2, axis=-1) - if self.bbox_decode_type == 'pad': - # calculate the shape of the image with padding - out_shape = pad_shape / im_shape * origin_shape - out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) - elif self.bbox_decode_type == 'origin': - out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) - else: - raise Exception( - f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.') - bbox_pred *= out_shape - - scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax( - logits)[:, :, :-1] - - if not self.use_focal_loss: - scores, labels = scores.max(-1), scores.argmax(-1) - if scores.shape[1] > self.num_top_queries: - scores, index = paddle.topk( - scores, self.num_top_queries, axis=-1) - batch_ind = paddle.arange( - end=scores.shape[0]).unsqueeze(-1).tile( - [1, self.num_top_queries]) - index = paddle.stack([batch_ind, index], axis=-1) - labels = paddle.gather_nd(labels, index) - bbox_pred = paddle.gather_nd(bbox_pred, index) - else: - scores, index = paddle.topk( - scores.flatten(1), self.num_top_queries, axis=-1) - labels = index % self.num_classes - index = index // self.num_classes - batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( - [1, self.num_top_queries]) - index = paddle.stack([batch_ind, index], axis=-1) - bbox_pred = paddle.gather_nd(bbox_pred, index) - - mask_pred = None - if self.with_mask: - assert masks is not None - masks = F.interpolate( - masks, scale_factor=4, mode="bilinear", align_corners=False) - # TODO: Support prediction with bs>1. - # remove padding for input image - h, w = im_shape.astype('int32')[0] - masks = masks[..., :h, :w] - # get pred_mask in the original resolution. 
- img_h = img_h[0].astype('int32') - img_w = img_w[0].astype('int32') - masks = F.interpolate( - masks, - size=(img_h, img_w), - mode="bilinear", - align_corners=False) - mask_pred, scores = self._mask_postprocess(masks, scores, index) - - bbox_pred = paddle.concat( - [ - labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1), - bbox_pred - ], - axis=-1) - bbox_num = paddle.to_tensor( - self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]]) - bbox_pred = bbox_pred.reshape([-1, 6]) - return bbox_pred, bbox_num, mask_pred - - -@register -class SparsePostProcess(object): - __shared__ = ['num_classes', 'assign_on_cpu'] - - def __init__(self, - num_proposals, - num_classes=80, - binary_thresh=0.5, - assign_on_cpu=False): - super(SparsePostProcess, self).__init__() - self.num_classes = num_classes - self.num_proposals = num_proposals - self.binary_thresh = binary_thresh - self.assign_on_cpu = assign_on_cpu - - def __call__(self, scores, bboxes, scale_factor, ori_shape, masks=None): - assert len(scores) == len(bboxes) == \ - len(ori_shape) == len(scale_factor) - device = paddle.device.get_device() - batch_size = len(ori_shape) - - scores = F.sigmoid(scores) - has_mask = masks is not None - if has_mask: - masks = F.sigmoid(masks) - masks = masks.reshape([batch_size, -1, *masks.shape[1:]]) - - bbox_pred = [] - mask_pred = [] if has_mask else None - bbox_num = paddle.zeros([batch_size], dtype='int32') - for i in range(batch_size): - score = scores[i] - bbox = bboxes[i] - score, indices = score.flatten(0, 1).topk( - self.num_proposals, sorted=False) - label = indices % self.num_classes - if has_mask: - mask = masks[i] - mask = mask.flatten(0, 1)[indices] - - H, W = ori_shape[i][0], ori_shape[i][1] - bbox = bbox[paddle.cast(indices / self.num_classes, indices.dtype)] - bbox /= scale_factor[i] - bbox[:, 0::2] = paddle.clip(bbox[:, 0::2], 0, W) - bbox[:, 1::2] = paddle.clip(bbox[:, 1::2], 0, H) - - keep = ((bbox[:, 2] - bbox[:, 0]).numpy() > 1.) & \ - ((bbox[:, 3] - bbox[:, 1]).numpy() > 1.) - if keep.sum() == 0: - bbox = paddle.zeros([1, 6], dtype='float32') - if has_mask: - mask = paddle.zeros([1, H, W], dtype='uint8') - else: - label = paddle.to_tensor(label.numpy()[keep]).astype( - 'float32').unsqueeze(-1) - score = paddle.to_tensor(score.numpy()[keep]).astype( - 'float32').unsqueeze(-1) - bbox = paddle.to_tensor(bbox.numpy()[keep]).astype('float32') - if has_mask: - mask = paddle.to_tensor(mask.numpy()[keep]).astype( - 'float32').unsqueeze(1) - mask = paste_mask(mask, bbox, H, W, self.assign_on_cpu) - mask = paddle.cast(mask >= self.binary_thresh, 'uint8') - bbox = paddle.concat([label, score, bbox], axis=-1) - - bbox_num[i] = bbox.shape[0] - bbox_pred.append(bbox) - if has_mask: - mask_pred.append(mask) - - bbox_pred = paddle.concat(bbox_pred) - mask_pred = paddle.concat(mask_pred) if has_mask else None - - if self.assign_on_cpu: - paddle.set_device(device) - - if has_mask: - return bbox_pred, bbox_num, mask_pred - else: - return bbox_pred, bbox_num - - -def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False): - """ - Paste the mask prediction to the original image. 
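The per-image selection in `SparsePostProcess` above follows the same flatten-and-topk idea, with the extra step of clipping the gathered boxes to the original image. A hedged NumPy sketch: `keep`, the image size, and the random inputs are invented for the demo, and integer division stands in for the `indices / num_classes` cast in the original.

```python
import numpy as np

# Toy per-image selection in the style of SparsePostProcess.__call__.
num_proposals, num_classes, keep = 5, 4, 3   # 'keep' is invented for the demo
rng = np.random.default_rng(1)
score = rng.random((num_proposals, num_classes))
bbox = rng.random((num_proposals, 4)) * 100  # xyxy boxes, arbitrary scale

flat = score.reshape(-1)
indices = np.argsort(flat)[::-1][:keep]
label = indices % num_classes
bbox_sel = bbox[indices // num_classes]      # one proposal can serve several classes
H, W = 80, 120                               # hypothetical original image size
bbox_sel[:, 0::2] = bbox_sel[:, 0::2].clip(0, W)  # clip x coordinates
bbox_sel[:, 1::2] = bbox_sel[:, 1::2].clip(0, H)  # clip y coordinates
print(label, flat[indices], bbox_sel)
```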
- """ - x0_int, y0_int = 0, 0 - x1_int, y1_int = im_w, im_h - x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) - N = masks.shape[0] - img_y = paddle.arange(y0_int, y1_int) + 0.5 - img_x = paddle.arange(x0_int, x1_int) + 0.5 - - img_y = (img_y - y0) / (y1 - y0) * 2 - 1 - img_x = (img_x - x0) / (x1 - x0) * 2 - 1 - # img_x, img_y have shapes (N, w), (N, h) - - if assign_on_cpu: - paddle.set_device('cpu') - gx = img_x[:, None, :].expand( - [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) - gy = img_y[:, :, None].expand( - [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) - grid = paddle.stack([gx, gy], axis=3) - img_masks = F.grid_sample(masks, grid, align_corners=False) - return img_masks[:, 0] - - -def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): - final_boxes = [] - for c in range(num_classes): - idxs = bboxs[:, 0] == c - if np.count_nonzero(idxs) == 0: continue - r = nms(bboxs[idxs, 1:], match_threshold, match_metric) - final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) - return final_boxes - - -def nms(dets, match_threshold=0.6, match_metric='iou'): - """ Apply NMS to avoid detecting too many overlapping bounding boxes. - Args: - dets: shape [N, 5], [score, x1, y1, x2, y2] - match_metric: 'iou' or 'ios' - match_threshold: overlap thresh for match metric. - """ - if dets.shape[0] == 0: - return dets[[], :] - scores = dets[:, 0] - x1 = dets[:, 1] - y1 = dets[:, 2] - x2 = dets[:, 3] - y2 = dets[:, 4] - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - if match_metric == 'iou': - union = iarea + areas[j] - inter - match_value = inter / union - elif match_metric == 'ios': - smaller = min(iarea, areas[j]) - match_value = inter / smaller - else: - raise ValueError() - if match_value >= match_threshold: - suppressed[j] = 1 - keep = np.where(suppressed == 0)[0] - dets = dets[keep, :] - return dets - - -@register -class DETRBBoxSemiPostProcess(object): - __shared__ = ['num_classes', 'use_focal_loss'] - __inject__ = [] - - def __init__(self, - num_classes=80, - num_top_queries=100, - use_focal_loss=False): - super(DETRBBoxSemiPostProcess, self).__init__() - self.num_classes = num_classes - self.num_top_queries = num_top_queries - self.use_focal_loss = use_focal_loss - - def __call__(self, head_out): - """ - Decode the bbox. - Args: - head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. - im_shape (Tensor): The shape of the input image. - scale_factor (Tensor): The scale factor of the input image. - Returns: - bbox_pred (Tensor): The output prediction with shape [N, 6], including - labels, scores and bboxes. The size of bboxes are corresponding - to the input image, the bboxes may be used in other branch. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [bs], and is N. 
- """ - bboxes, logits, masks = head_out - bbox_pred = bboxes - - scores = F.softmax(logits, axis=2) - - import copy - soft_scores = copy.deepcopy(scores) - scores, index = paddle.topk(scores.max(-1), 300, axis=-1) - - batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( - [1, 300]) - index = paddle.stack([batch_ind, index], axis=-1) - labels = paddle.gather_nd(soft_scores.argmax(-1), index).astype('int32') - score_class = paddle.gather_nd(soft_scores, index) - bbox_pred = paddle.gather_nd(bbox_pred, index) - bbox_pred = paddle.concat( - [ - labels.unsqueeze(-1).astype('float32'), score_class, - scores.unsqueeze(-1), bbox_pred - ], - axis=-1) - bbox_num = paddle.to_tensor( - bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]]) - bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/__init__.py deleted file mode 100644 index f3ad199..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from . import rpn_head -from . import embedding_rpn_head - -from .rpn_head import * -from .embedding_rpn_head import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/anchor_generator.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/anchor_generator.py deleted file mode 100644 index 9a8e24e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/anchor_generator.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on -# https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py - -import math - -import paddle -import paddle.nn as nn -import numpy as np - -from ppdet.core.workspace import register - -__all__ = ['AnchorGenerator', 'RetinaAnchorGenerator', 'S2ANetAnchorGenerator'] - - -@register -class AnchorGenerator(nn.Layer): - """ - Generate anchors according to the feature maps - - Args: - anchor_sizes (list[float] | list[list[float]]): The anchor sizes at - each feature point. list[float] means all feature levels share the - same sizes. list[list[float]] means the anchor sizes for - each level. The sizes stand for the scale of input size. - aspect_ratios (list[float] | list[list[float]]): The aspect ratios at - each feature point. list[float] means all feature levels share the - same ratios. list[list[float]] means the aspect ratios for - each level. - strides (list[float]): The strides of feature maps which generate - anchors - offset (float): The offset of the coordinate of anchors, default 0. 
- - """ - - def __init__(self, - anchor_sizes=[32, 64, 128, 256, 512], - aspect_ratios=[0.5, 1.0, 2.0], - strides=[16.0], - variance=[1.0, 1.0, 1.0, 1.0], - offset=0.): - super(AnchorGenerator, self).__init__() - self.anchor_sizes = anchor_sizes - self.aspect_ratios = aspect_ratios - self.strides = strides - self.variance = variance - self.cell_anchors = self._calculate_anchors(len(strides)) - self.offset = offset - - def _broadcast_params(self, params, num_features): - if not isinstance(params[0], (list, tuple)): # list[float] - return [params] * num_features - if len(params) == 1: - return list(params) * num_features - return params - - def generate_cell_anchors(self, sizes, aspect_ratios): - anchors = [] - for size in sizes: - area = size**2.0 - for aspect_ratio in aspect_ratios: - w = math.sqrt(area / aspect_ratio) - h = aspect_ratio * w - x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 - anchors.append([x0, y0, x1, y1]) - return paddle.to_tensor(anchors, dtype='float32') - - def _calculate_anchors(self, num_features): - sizes = self._broadcast_params(self.anchor_sizes, num_features) - aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features) - cell_anchors = [ - self.generate_cell_anchors(s, a) - for s, a in zip(sizes, aspect_ratios) - ] - [ - self.register_buffer( - t.name, t, persistable=False) for t in cell_anchors - ] - return cell_anchors - - def _create_grid_offsets(self, size, stride, offset): - grid_height, grid_width = size[0], size[1] - shifts_x = paddle.arange( - offset * stride, grid_width * stride, step=stride, dtype='float32') - shifts_y = paddle.arange( - offset * stride, grid_height * stride, step=stride, dtype='float32') - shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x) - shift_x = paddle.reshape(shift_x, [-1]) - shift_y = paddle.reshape(shift_y, [-1]) - return shift_x, shift_y - - def _grid_anchors(self, grid_sizes): - anchors = [] - for size, stride, base_anchors in zip(grid_sizes, self.strides, - self.cell_anchors): - shift_x, shift_y = self._create_grid_offsets(size, stride, - self.offset) - shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1) - shifts = paddle.reshape(shifts, [-1, 1, 4]) - base_anchors = paddle.reshape(base_anchors, [1, -1, 4]) - - anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4])) - - return anchors - - def forward(self, input): - grid_sizes = [paddle.shape(feature_map)[-2:] for feature_map in input] - anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) - return anchors_over_all_feature_maps - - @property - def num_anchors(self): - """ - Returns: - int: number of anchors at every pixel - location, on that feature map. - For example, if at every pixel we use anchors of 3 aspect - ratios and 5 sizes, the number of anchors is 15. - For FPN models, `num_anchors` on every feature map is the same. 
- """ - return len(self.cell_anchors[0]) - - -@register -class RetinaAnchorGenerator(AnchorGenerator): - def __init__(self, - octave_base_scale=4, - scales_per_octave=3, - aspect_ratios=[0.5, 1.0, 2.0], - strides=[8.0, 16.0, 32.0, 64.0, 128.0], - variance=[1.0, 1.0, 1.0, 1.0], - offset=0.0): - anchor_sizes = [] - for s in strides: - anchor_sizes.append([ - s * octave_base_scale * 2**(i/scales_per_octave) \ - for i in range(scales_per_octave)]) - super(RetinaAnchorGenerator, self).__init__( - anchor_sizes=anchor_sizes, - aspect_ratios=aspect_ratios, - strides=strides, - variance=variance, - offset=offset) - - -@register -class S2ANetAnchorGenerator(nn.Layer): - """ - AnchorGenerator by paddle - """ - - def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): - super(S2ANetAnchorGenerator, self).__init__() - self.base_size = base_size - self.scales = paddle.to_tensor(scales) - self.ratios = paddle.to_tensor(ratios) - self.scale_major = scale_major - self.ctr = ctr - self.base_anchors = self.gen_base_anchors() - - @property - def num_base_anchors(self): - return self.base_anchors.shape[0] - - def gen_base_anchors(self): - w = self.base_size - h = self.base_size - if self.ctr is None: - x_ctr = 0.5 * (w - 1) - y_ctr = 0.5 * (h - 1) - else: - x_ctr, y_ctr = self.ctr - - h_ratios = paddle.sqrt(self.ratios) - w_ratios = 1 / h_ratios - if self.scale_major: - ws = (w * w_ratios[:] * self.scales[:]).reshape([-1]) - hs = (h * h_ratios[:] * self.scales[:]).reshape([-1]) - else: - ws = (w * self.scales[:] * w_ratios[:]).reshape([-1]) - hs = (h * self.scales[:] * h_ratios[:]).reshape([-1]) - - base_anchors = paddle.stack( - [ - x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), - x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) - ], - axis=-1) - base_anchors = paddle.round(base_anchors) - return base_anchors - - def _meshgrid(self, x, y, row_major=True): - yy, xx = paddle.meshgrid(y, x) - yy = yy.reshape([-1]) - xx = xx.reshape([-1]) - if row_major: - return xx, yy - else: - return yy, xx - - def forward(self, featmap_size, stride=16): - # featmap_size*stride project it to original area - - feat_h = featmap_size[0] - feat_w = featmap_size[1] - shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride - shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride - shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) - shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) - - all_anchors = self.base_anchors[:, :] + shifts[:, :] - all_anchors = all_anchors.cast(paddle.float32).reshape( - [feat_h * feat_w, 4]) - all_anchors = self.rect2rbox(all_anchors) - return all_anchors - - def valid_flags(self, featmap_size, valid_size): - feat_h, feat_w = featmap_size - valid_h, valid_w = valid_size - assert valid_h <= feat_h and valid_w <= feat_w - valid_x = paddle.zeros([feat_w], dtype='int32') - valid_y = paddle.zeros([feat_h], dtype='int32') - valid_x[:valid_w] = 1 - valid_y[:valid_h] = 1 - valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) - valid = valid_xx & valid_yy - valid = paddle.reshape(valid, [-1, 1]) - valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1]) - return valid - - def rect2rbox(self, bboxes): - """ - :param bboxes: shape (L, 4) (xmin, ymin, xmax, ymax) - :return: dbboxes: shape (L, 5) (x_ctr, y_ctr, w, h, angle) - """ - x1, y1, x2, y2 = paddle.split(bboxes, 4, axis=-1) - - x_ctr = (x1 + x2) / 2.0 - y_ctr = (y1 + y2) / 2.0 - edges1 = paddle.abs(x2 - x1) - edges2 = paddle.abs(y2 - y1) - - rbox_w = paddle.maximum(edges1, edges2) - rbox_h = 
paddle.minimum(edges1, edges2) - - # set angle - inds = edges1 < edges2 - inds = paddle.cast(inds, paddle.float32) - rboxes_angle = inds * np.pi / 2.0 - - rboxes = paddle.concat( - (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=-1) - return rboxes diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/embedding_rpn_head.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/embedding_rpn_head.py deleted file mode 100644 index 2917498..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/embedding_rpn_head.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This code is referenced from: https://github.com/open-mmlab/mmdetection - -import paddle -from paddle import nn - -from ppdet.core.workspace import register - -__all__ = ['EmbeddingRPNHead'] - - -@register -class EmbeddingRPNHead(nn.Layer): - __shared__ = ['proposal_embedding_dim'] - - def __init__(self, num_proposals, proposal_embedding_dim=256): - super(EmbeddingRPNHead, self).__init__() - - self.num_proposals = num_proposals - self.proposal_embedding_dim = proposal_embedding_dim - - self._init_layers() - self._init_weights() - - def _init_layers(self): - self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4) - self.init_proposal_features = nn.Embedding(self.num_proposals, - self.proposal_embedding_dim) - - def _init_weights(self): - init_bboxes = paddle.empty_like(self.init_proposal_bboxes.weight) - init_bboxes[:, :2] = 0.5 - init_bboxes[:, 2:] = 1.0 - self.init_proposal_bboxes.weight.set_value(init_bboxes) - - @staticmethod - def bbox_cxcywh_to_xyxy(x): - cxcy, wh = paddle.split(x, 2, axis=-1) - return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) - - def forward(self, img_whwh): - proposal_bboxes = self.init_proposal_bboxes.weight.clone() - proposal_bboxes = self.bbox_cxcywh_to_xyxy(proposal_bboxes) - proposal_bboxes = proposal_bboxes.unsqueeze(0) * img_whwh.unsqueeze(1) - - proposal_features = self.init_proposal_features.weight.clone() - proposal_features = proposal_features.unsqueeze(0).tile( - [img_whwh.shape[0], 1, 1]) - - return proposal_bboxes, proposal_features diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/proposal_generator.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/proposal_generator.py deleted file mode 100644 index b87a72c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/proposal_generator.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
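A short sketch of what `EmbeddingRPNHead.forward()` above computes at initialization time, assuming `img_whwh` holds per-image `[w, h, w, h]`: the learned proposals start as whole-image boxes.

```python
import numpy as np

# EmbeddingRPNHead's initial proposals: (cx, cy, w, h) = (0.5, 0.5, 1, 1),
# i.e. the whole image, converted to xyxy and scaled per image.
proposal_cxcywh = np.array([[0.5, 0.5, 1.0, 1.0]])  # _init_weights() default
cxcy, wh = proposal_cxcywh[:, :2], proposal_cxcywh[:, 2:]
proposal_xyxy = np.concatenate([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)

img_whwh = np.array([640.0, 480.0, 640.0, 480.0])   # assumed [w, h, w, h] layout
print(proposal_xyxy * img_whwh)                     # -> [[0. 0. 640. 480.]]
```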
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdet.core.workspace import register, serializable -from .. import ops - - -@register -@serializable -class ProposalGenerator(object): - """ - Proposal generation module - - For more details, please refer to the document of generate_proposals - in ppdet/modeling/ops.py - - Args: - pre_nms_top_n (int): Number of total bboxes to be kept per - image before NMS. default 6000 - post_nms_top_n (int): Number of total bboxes to be kept per - image after NMS. default 1000 - nms_thresh (float): Threshold in NMS. default 0.5 - min_size (float): Remove predicted boxes with either height or - width < min_size. default 0.1 - eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, - `adaptive_threshold = adaptive_threshold * eta` in each iteration. - default 1. - topk_after_collect (bool): whether to adopt topk after batch - collection. If topk_after_collect is true, box filter will not be - used after NMS at each image in proposal generation. default false - """ - - def __init__(self, - pre_nms_top_n=12000, - post_nms_top_n=2000, - nms_thresh=.5, - min_size=.1, - eta=1., - topk_after_collect=False): - super(ProposalGenerator, self).__init__() - self.pre_nms_top_n = pre_nms_top_n - self.post_nms_top_n = post_nms_top_n - self.nms_thresh = nms_thresh - self.min_size = min_size - self.eta = eta - self.topk_after_collect = topk_after_collect - - def __call__(self, scores, bbox_deltas, anchors, im_shape): - - top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n - variances = paddle.ones_like(anchors) - if hasattr(paddle.vision.ops, "generate_proposals"): - generate_proposals = getattr(paddle.vision.ops, - "generate_proposals") - else: - generate_proposals = ops.generate_proposals - rpn_rois, rpn_rois_prob, rpn_rois_num = generate_proposals( - scores, - bbox_deltas, - im_shape, - anchors, - variances, - pre_nms_top_n=self.pre_nms_top_n, - post_nms_top_n=top_n, - nms_thresh=self.nms_thresh, - min_size=self.min_size, - eta=self.eta, - return_rois_num=True) - - return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/rpn_head.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/rpn_head.py deleted file mode 100644 index 7c56d8d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/rpn_head.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
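The version-compatibility idiom in `ProposalGenerator.__call__` is worth a note: it probes for the fused `paddle.vision.ops.generate_proposals` and falls back to the local `ppdet` op when the installed Paddle lacks it. A self-contained sketch of the same pattern, using `SimpleNamespace` stand-ins rather than real Paddle modules:

```python
from types import SimpleNamespace

# Prefer the framework-provided op when present, else use the local one.
def resolve(vision_ops, local_ops):
    if hasattr(vision_ops, "generate_proposals"):
        return vision_ops.generate_proposals
    return local_ops.generate_proposals

new_paddle = SimpleNamespace(generate_proposals=lambda *a, **k: "fused op")
old_paddle = SimpleNamespace()                  # no fused op available
local = SimpleNamespace(generate_proposals=lambda *a, **k: "ppdet fallback")

print(resolve(new_paddle, local)())  # -> fused op
print(resolve(old_paddle, local)())  # -> ppdet fallback
```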
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal - -from ppdet.core.workspace import register -from .anchor_generator import AnchorGenerator -from .target_layer import RPNTargetAssign -from .proposal_generator import ProposalGenerator -from ..cls_utils import _get_class_default_kwargs - - -class RPNFeat(nn.Layer): - """ - Feature extraction in RPN head - - Args: - in_channel (int): Input channel - out_channel (int): Output channel - """ - - def __init__(self, in_channel=1024, out_channel=1024): - super(RPNFeat, self).__init__() - # rpn feat is shared with each level - self.rpn_conv = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0., std=0.01))) - self.rpn_conv.skip_quant = True - - def forward(self, feats): - rpn_feats = [] - for feat in feats: - rpn_feats.append(F.relu(self.rpn_conv(feat))) - return rpn_feats - - -@register -class RPNHead(nn.Layer): - """ - Region Proposal Network - - Args: - anchor_generator (dict): configure of anchor generation - rpn_target_assign (dict): configure of rpn targets assignment - train_proposal (dict): configure of proposals generation - at the stage of training - test_proposal (dict): configure of proposals generation - at the stage of prediction - in_channel (int): channel of input feature maps which can be - derived by from_config - """ - __shared__ = ['export_onnx'] - __inject__ = ['loss_rpn_bbox'] - - def __init__(self, - anchor_generator=_get_class_default_kwargs(AnchorGenerator), - rpn_target_assign=_get_class_default_kwargs(RPNTargetAssign), - train_proposal=_get_class_default_kwargs(ProposalGenerator, - 12000, 2000), - test_proposal=_get_class_default_kwargs(ProposalGenerator), - in_channel=1024, - export_onnx=False, - loss_rpn_bbox=None): - super(RPNHead, self).__init__() - self.anchor_generator = anchor_generator - self.rpn_target_assign = rpn_target_assign - self.train_proposal = train_proposal - self.test_proposal = test_proposal - self.export_onnx = export_onnx - if isinstance(anchor_generator, dict): - self.anchor_generator = AnchorGenerator(**anchor_generator) - if isinstance(rpn_target_assign, dict): - self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) - if isinstance(train_proposal, dict): - self.train_proposal = ProposalGenerator(**train_proposal) - if isinstance(test_proposal, dict): - self.test_proposal = ProposalGenerator(**test_proposal) - self.loss_rpn_bbox = loss_rpn_bbox - - num_anchors = self.anchor_generator.num_anchors - self.rpn_feat = RPNFeat(in_channel, in_channel) - # rpn head is shared with each level - # rpn roi classification scores - self.rpn_rois_score = nn.Conv2D( - in_channels=in_channel, - out_channels=num_anchors, - kernel_size=1, - padding=0, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0., std=0.01))) - self.rpn_rois_score.skip_quant = True - - # rpn roi bbox regression deltas - self.rpn_rois_delta = nn.Conv2D( - in_channels=in_channel, - out_channels=4 * num_anchors, - kernel_size=1, - padding=0, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0., std=0.01))) - self.rpn_rois_delta.skip_quant = True - - @classmethod - def from_config(cls, cfg, input_shape): - # FPN share same rpn head - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channel': input_shape.channels} - - def forward(self, feats, inputs): - rpn_feats = self.rpn_feat(feats) - scores = [] - deltas = [] - - 
for rpn_feat in rpn_feats: - rrs = self.rpn_rois_score(rpn_feat) - rrd = self.rpn_rois_delta(rpn_feat) - scores.append(rrs) - deltas.append(rrd) - - anchors = self.anchor_generator(rpn_feats) - - rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs) - if self.training: - loss = self.get_loss(scores, deltas, anchors, inputs) - return rois, rois_num, loss - else: - return rois, rois_num, None - - def _gen_proposal(self, scores, bbox_deltas, anchors, inputs): - """ - scores (list[Tensor]): Multi-level scores prediction - bbox_deltas (list[Tensor]): Multi-level deltas prediction - anchors (list[Tensor]): Multi-level anchors - inputs (dict): ground truth info - """ - prop_gen = self.train_proposal if self.training else self.test_proposal - im_shape = inputs['im_shape'] - - # Collect multi-level proposals for each batch - # Get 'topk' of them as final output - - if self.export_onnx: - # bs = 1 when exporting onnx - onnx_rpn_rois_list = [] - onnx_rpn_prob_list = [] - onnx_rpn_rois_num_list = [] - - for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, - anchors): - onnx_rpn_rois, onnx_rpn_rois_prob, onnx_rpn_rois_num, onnx_post_nms_top_n = prop_gen( - scores=rpn_score[0:1], - bbox_deltas=rpn_delta[0:1], - anchors=anchor, - im_shape=im_shape[0:1]) - onnx_rpn_rois_list.append(onnx_rpn_rois) - onnx_rpn_prob_list.append(onnx_rpn_rois_prob) - onnx_rpn_rois_num_list.append(onnx_rpn_rois_num) - - onnx_rpn_rois = paddle.concat(onnx_rpn_rois_list) - onnx_rpn_prob = paddle.concat(onnx_rpn_prob_list).flatten() - - onnx_top_n = paddle.to_tensor(onnx_post_nms_top_n).cast('int32') - onnx_num_rois = paddle.shape(onnx_rpn_prob)[0].cast('int32') - k = paddle.minimum(onnx_top_n, onnx_num_rois) - onnx_topk_prob, onnx_topk_inds = paddle.topk(onnx_rpn_prob, k) - onnx_topk_rois = paddle.gather(onnx_rpn_rois, onnx_topk_inds) - # TODO(wangguanzhong): Now bs_rois_collect in export_onnx is moved outside conditional branch - # due to problems in dy2static of paddle. Will fix it when updating paddle framework. - # bs_rois_collect = [onnx_topk_rois] - # bs_rois_num_collect = paddle.shape(onnx_topk_rois)[0] - - else: - bs_rois_collect = [] - bs_rois_num_collect = [] - - batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) - - # Generate proposals for each level and each batch. - # Discard batch-computing to avoid sorting bbox cross different batches. 
- for i in range(batch_size): - rpn_rois_list = [] - rpn_prob_list = [] - rpn_rois_num_list = [] - - for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, - anchors): - rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( - scores=rpn_score[i:i + 1], - bbox_deltas=rpn_delta[i:i + 1], - anchors=anchor, - im_shape=im_shape[i:i + 1]) - rpn_rois_list.append(rpn_rois) - rpn_prob_list.append(rpn_rois_prob) - rpn_rois_num_list.append(rpn_rois_num) - - if len(scores) > 1: - rpn_rois = paddle.concat(rpn_rois_list) - rpn_prob = paddle.concat(rpn_prob_list).flatten() - - num_rois = paddle.shape(rpn_prob)[0].cast('int32') - if num_rois > post_nms_top_n: - topk_prob, topk_inds = paddle.topk(rpn_prob, - post_nms_top_n) - topk_rois = paddle.gather(rpn_rois, topk_inds) - else: - topk_rois = rpn_rois - topk_prob = rpn_prob - else: - topk_rois = rpn_rois_list[0] - topk_prob = rpn_prob_list[0].flatten() - - bs_rois_collect.append(topk_rois) - bs_rois_num_collect.append(paddle.shape(topk_rois)[0:1]) - - bs_rois_num_collect = paddle.concat(bs_rois_num_collect) - - if self.export_onnx: - output_rois = [onnx_topk_rois] - output_rois_num = paddle.shape(onnx_topk_rois)[0] - else: - output_rois = bs_rois_collect - output_rois_num = bs_rois_num_collect - - return output_rois, output_rois_num - - def get_loss(self, pred_scores, pred_deltas, anchors, inputs): - """ - pred_scores (list[Tensor]): Multi-level scores prediction - pred_deltas (list[Tensor]): Multi-level deltas prediction - anchors (list[Tensor]): Multi-level anchors - inputs (dict): ground truth info, including im, gt_bbox, gt_score - """ - anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors] - anchors = paddle.concat(anchors) - - scores = [ - paddle.reshape( - paddle.transpose( - v, perm=[0, 2, 3, 1]), - shape=(v.shape[0], -1, 1)) for v in pred_scores - ] - scores = paddle.concat(scores, axis=1) - - deltas = [ - paddle.reshape( - paddle.transpose( - v, perm=[0, 2, 3, 1]), - shape=(v.shape[0], -1, 4)) for v in pred_deltas - ] - deltas = paddle.concat(deltas, axis=1) - - score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs, - anchors) - - scores = paddle.reshape(x=scores, shape=(-1, )) - deltas = paddle.reshape(x=deltas, shape=(-1, 4)) - - score_tgt = paddle.concat(score_tgt) - score_tgt.stop_gradient = True - - pos_mask = score_tgt == 1 - pos_ind = paddle.nonzero(pos_mask) - - valid_mask = score_tgt >= 0 - valid_ind = paddle.nonzero(valid_mask) - - # cls loss - if valid_ind.shape[0] == 0: - loss_rpn_cls = paddle.zeros([1], dtype='float32') - else: - score_pred = paddle.gather(scores, valid_ind) - score_label = paddle.gather(score_tgt, valid_ind).cast('float32') - score_label.stop_gradient = True - loss_rpn_cls = F.binary_cross_entropy_with_logits( - logit=score_pred, label=score_label, reduction="sum") - - # reg loss - if pos_ind.shape[0] == 0: - loss_rpn_reg = paddle.zeros([1], dtype='float32') - else: - loc_pred = paddle.gather(deltas, pos_ind) - loc_tgt = paddle.concat(loc_tgt) - loc_tgt = paddle.gather(loc_tgt, pos_ind) - loc_tgt.stop_gradient = True - - if self.loss_rpn_bbox is None: - loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() - else: - loss_rpn_reg = self.loss_rpn_bbox(loc_pred, loc_tgt).sum() - - return { - 'loss_rpn_cls': loss_rpn_cls / norm, - 'loss_rpn_reg': loss_rpn_reg / norm - } diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target.py deleted file mode 100644 index 041b2c7..0000000 --- 
a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target.py +++ /dev/null @@ -1,678 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -from ..bbox_utils import bbox2delta, bbox_overlaps - - -def rpn_anchor_target(anchors, - gt_boxes, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - use_random=True, - batch_size=1, - ignore_thresh=-1, - is_crowd=None, - weights=[1., 1., 1., 1.], - assign_on_cpu=False): - tgt_labels = [] - tgt_bboxes = [] - tgt_deltas = [] - for i in range(batch_size): - gt_bbox = gt_boxes[i] - is_crowd_i = is_crowd[i] if is_crowd else None - # Step1: match anchor and gt_bbox - matches, match_labels = label_box( - anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True, - ignore_thresh, is_crowd_i, assign_on_cpu) - # Step2: sample anchor - fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im, - rpn_fg_fraction, 0, use_random) - # Fill with the ignore label (-1), then set positive and negative labels - labels = paddle.full(match_labels.shape, -1, dtype='int32') - if bg_inds.shape[0] > 0: - labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds)) - if fg_inds.shape[0] > 0: - labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds)) - # Step3: make output - if gt_bbox.shape[0] == 0: - matched_gt_boxes = paddle.zeros([matches.shape[0], 4]) - tgt_delta = paddle.zeros([matches.shape[0], 4]) - else: - matched_gt_boxes = paddle.gather(gt_bbox, matches) - tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights) - matched_gt_boxes.stop_gradient = True - tgt_delta.stop_gradient = True - labels.stop_gradient = True - tgt_labels.append(labels) - tgt_bboxes.append(matched_gt_boxes) - tgt_deltas.append(tgt_delta) - - return tgt_labels, tgt_bboxes, tgt_deltas - - -def label_box(anchors, - gt_boxes, - positive_overlap, - negative_overlap, - allow_low_quality, - ignore_thresh, - is_crowd=None, - assign_on_cpu=False): - if assign_on_cpu: - device = paddle.device.get_device() - paddle.set_device("cpu") - iou = bbox_overlaps(gt_boxes, anchors) - paddle.set_device(device) - - else: - iou = bbox_overlaps(gt_boxes, anchors) - n_gt = gt_boxes.shape[0] - if n_gt == 0 or is_crowd is None: - n_gt_crowd = 0 - else: - n_gt_crowd = paddle.nonzero(is_crowd).shape[0] - if iou.shape[0] == 0 or n_gt_crowd == n_gt: - # No truth, assign everything to background - default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64') - default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32') - return default_matches, default_match_labels - # if ignore_thresh > 0, remove anchor if it is closed to - # one of the crowded ground-truth - if n_gt_crowd > 0: - N_a = anchors.shape[0] - ones = paddle.ones([N_a]) - mask = is_crowd * ones - - if ignore_thresh > 0: - crowd_iou = iou * mask - valid = (paddle.sum((crowd_iou > ignore_thresh).cast('int32'), - axis=0) > 0).cast('float32') - iou = iou * (1 - valid) 
- valid - - # ignore the iou between anchor and crowded ground-truth - iou = iou * (1 - mask) - mask - - matched_vals, matches = paddle.topk(iou, k=1, axis=0) - match_labels = paddle.full(matches.shape, -1, dtype='int32') - # set ignored anchor with iou = -1 - neg_cond = paddle.logical_and(matched_vals > -1, - matched_vals < negative_overlap) - match_labels = paddle.where(neg_cond, - paddle.zeros_like(match_labels), match_labels) - match_labels = paddle.where(matched_vals >= positive_overlap, - paddle.ones_like(match_labels), match_labels) - if allow_low_quality: - highest_quality_foreach_gt = iou.max(axis=1, keepdim=True) - pred_inds_with_highest_quality = paddle.logical_and( - iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum( - 0, keepdim=True) - match_labels = paddle.where(pred_inds_with_highest_quality > 0, - paddle.ones_like(match_labels), - match_labels) - - matches = matches.flatten() - match_labels = match_labels.flatten() - - return matches, match_labels - - -def subsample_labels(labels, - num_samples, - fg_fraction, - bg_label=0, - use_random=True): - positive = paddle.nonzero( - paddle.logical_and(labels != -1, labels != bg_label)) - negative = paddle.nonzero(labels == bg_label) - - fg_num = int(num_samples * fg_fraction) - fg_num = min(positive.numel(), fg_num) - bg_num = num_samples - fg_num - bg_num = min(negative.numel(), bg_num) - if fg_num == 0 and bg_num == 0: - fg_inds = paddle.zeros([0], dtype='int32') - bg_inds = paddle.zeros([0], dtype='int32') - return fg_inds, bg_inds - - # randomly select positive and negative examples - - negative = negative.cast('int32').flatten() - bg_perm = paddle.randperm(negative.numel(), dtype='int32') - bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num]) - if use_random: - bg_inds = paddle.gather(negative, bg_perm) - else: - bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num]) - if fg_num == 0: - fg_inds = paddle.zeros([0], dtype='int32') - return fg_inds, bg_inds - - positive = positive.cast('int32').flatten() - fg_perm = paddle.randperm(positive.numel(), dtype='int32') - fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num]) - if use_random: - fg_inds = paddle.gather(positive, fg_perm) - else: - fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num]) - - return fg_inds, bg_inds - - -def generate_proposal_target(rpn_rois, - gt_classes, - gt_boxes, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh, - num_classes, - ignore_thresh=-1., - is_crowd=None, - use_random=True, - is_cascade=False, - cascade_iou=0.5, - assign_on_cpu=False, - add_gt_as_proposals=True): - - rois_with_gt = [] - tgt_labels = [] - tgt_bboxes = [] - tgt_gt_inds = [] - new_rois_num = [] - - # In cascade rcnn, the threshold for foreground and background - # is used from cascade_iou - fg_thresh = cascade_iou if is_cascade else fg_thresh - bg_thresh = cascade_iou if is_cascade else bg_thresh - for i, rpn_roi in enumerate(rpn_rois): - gt_bbox = gt_boxes[i] - is_crowd_i = is_crowd[i] if is_crowd else None - gt_class = paddle.squeeze(gt_classes[i], axis=-1) - - # Concat RoIs and gt boxes except cascade rcnn or none gt - if add_gt_as_proposals and gt_bbox.shape[0] > 0: - bbox = paddle.concat([rpn_roi, gt_bbox]) - else: - bbox = rpn_roi - - # Step1: label bbox - matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh, - False, ignore_thresh, is_crowd_i, - assign_on_cpu) - # Step2: sample bbox - sampled_inds, sampled_gt_classes = sample_bbox( - matches, match_labels, gt_class, 
batch_size_per_im, fg_fraction, - num_classes, use_random, is_cascade) - - # Step3: make output - rois_per_image = bbox if is_cascade else paddle.gather(bbox, - sampled_inds) - sampled_gt_ind = matches if is_cascade else paddle.gather(matches, - sampled_inds) - if gt_bbox.shape[0] > 0: - sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) - else: - num = rois_per_image.shape[0] - sampled_bbox = paddle.zeros([num, 4], dtype='float32') - - rois_per_image.stop_gradient = True - sampled_gt_ind.stop_gradient = True - sampled_bbox.stop_gradient = True - tgt_labels.append(sampled_gt_classes) - tgt_bboxes.append(sampled_bbox) - rois_with_gt.append(rois_per_image) - tgt_gt_inds.append(sampled_gt_ind) - new_rois_num.append(paddle.shape(sampled_inds)[0:1]) - new_rois_num = paddle.concat(new_rois_num) - return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num - - -def sample_bbox(matches, - match_labels, - gt_classes, - batch_size_per_im, - fg_fraction, - num_classes, - use_random=True, - is_cascade=False): - - n_gt = gt_classes.shape[0] - if n_gt == 0: - # No truth, assign everything to background - gt_classes = paddle.ones(matches.shape, dtype='int32') * num_classes - #return matches, match_labels + num_classes - else: - gt_classes = paddle.gather(gt_classes, matches) - gt_classes = paddle.where(match_labels == 0, - paddle.ones_like(gt_classes) * num_classes, - gt_classes) - gt_classes = paddle.where(match_labels == -1, - paddle.ones_like(gt_classes) * -1, gt_classes) - if is_cascade: - index = paddle.arange(matches.shape[0]) - return index, gt_classes - rois_per_image = int(batch_size_per_im) - - fg_inds, bg_inds = subsample_labels(gt_classes, rois_per_image, fg_fraction, - num_classes, use_random) - if fg_inds.shape[0] == 0 and bg_inds.shape[0] == 0: - # fake output labeled with -1 when all boxes are neither - # foreground nor background - sampled_inds = paddle.zeros([1], dtype='int32') - else: - sampled_inds = paddle.concat([fg_inds, bg_inds]) - sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) - return sampled_inds, sampled_gt_classes - - -def polygons_to_mask(polygons, height, width): - """ - Convert the polygons to mask format - - Args: - polygons (list[ndarray]): each array has shape (Nx2,) - height (int): mask height - width (int): mask width - Returns: - ndarray: a bool mask of shape (height, width) - """ - import pycocotools.mask as mask_util - assert len(polygons) > 0, "COCOAPI does not support empty polygons" - rles = mask_util.frPyObjects(polygons, height, width) - rle = mask_util.merge(rles) - return mask_util.decode(rle).astype(np.bool_) - - -def rasterize_polygons_within_box(poly, box, resolution): - w, h = box[2] - box[0], box[3] - box[1] - polygons = [np.asarray(p, dtype=np.float64) for p in poly] - for p in polygons: - p[0::2] = p[0::2] - box[0] - p[1::2] = p[1::2] - box[1] - - ratio_h = resolution / max(h, 0.1) - ratio_w = resolution / max(w, 0.1) - - if ratio_h == ratio_w: - for p in polygons: - p *= ratio_h - else: - for p in polygons: - p[0::2] *= ratio_w - p[1::2] *= ratio_h - - # 3. 
Rasterize the polygons with coco api - mask = polygons_to_mask(polygons, resolution, resolution) - mask = paddle.to_tensor(mask, dtype='int32') - return mask - - -def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, - num_classes, resolution): - mask_rois = [] - mask_rois_num = [] - tgt_masks = [] - tgt_classes = [] - mask_index = [] - tgt_weights = [] - for k in range(len(rois)): - labels_per_im = labels_int32[k] - # select rois labeled with foreground - fg_inds = paddle.nonzero( - paddle.logical_and(labels_per_im != -1, labels_per_im != - num_classes)) - has_fg = True - # generate fake roi if foreground is empty - if fg_inds.numel() == 0: - has_fg = False - fg_inds = paddle.ones([1, 1], dtype='int64') - inds_per_im = sampled_gt_inds[k] - inds_per_im = paddle.gather(inds_per_im, fg_inds) - - rois_per_im = rois[k] - fg_rois = paddle.gather(rois_per_im, fg_inds) - # Copy the foreground roi to cpu - # to generate mask target with ground-truth - boxes = fg_rois.numpy() - gt_segms_per_im = gt_segms[k] - - new_segm = [] - inds_per_im = inds_per_im.numpy() - if len(gt_segms_per_im) > 0: - for i in inds_per_im: - new_segm.append(gt_segms_per_im[i]) - fg_inds_new = fg_inds.reshape([-1]).numpy() - results = [] - if len(gt_segms_per_im) > 0: - for j in range(fg_inds_new.shape[0]): - results.append( - rasterize_polygons_within_box(new_segm[j], boxes[j], - resolution)) - else: - results.append(paddle.ones([resolution, resolution], dtype='int32')) - - fg_classes = paddle.gather(labels_per_im, fg_inds) - weight = paddle.ones([fg_rois.shape[0]], dtype='float32') - if not has_fg: - # now all sampled classes are background - # which will cause error in loss calculation, - # make fake classes with weight of 0. - fg_classes = paddle.zeros([1], dtype='int32') - weight = weight - 1 - tgt_mask = paddle.stack(results) - tgt_mask.stop_gradient = True - fg_rois.stop_gradient = True - - mask_index.append(fg_inds) - mask_rois.append(fg_rois) - mask_rois_num.append(paddle.shape(fg_rois)[0:1]) - tgt_classes.append(fg_classes) - tgt_masks.append(tgt_mask) - tgt_weights.append(weight) - - mask_index = paddle.concat(mask_index) - mask_rois_num = paddle.concat(mask_rois_num) - tgt_classes = paddle.concat(tgt_classes, axis=0) - tgt_masks = paddle.concat(tgt_masks, axis=0) - tgt_weights = paddle.concat(tgt_weights, axis=0) - - return mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights - - -def libra_sample_pos(max_overlaps, max_classes, pos_inds, num_expected): - if len(pos_inds) <= num_expected: - return pos_inds - else: - unique_gt_inds = np.unique(max_classes[pos_inds]) - num_gts = len(unique_gt_inds) - num_per_gt = int(round(num_expected / float(num_gts)) + 1) - - sampled_inds = [] - for i in unique_gt_inds: - inds = np.nonzero(max_classes == i)[0] - before_len = len(inds) - inds = list(set(inds) & set(pos_inds)) - after_len = len(inds) - if len(inds) > num_per_gt: - inds = np.random.choice(inds, size=num_per_gt, replace=False) - sampled_inds.extend(list(inds)) # combine as a new sampler - if len(sampled_inds) < num_expected: - num_extra = num_expected - len(sampled_inds) - extra_inds = np.array(list(set(pos_inds) - set(sampled_inds))) - assert len(sampled_inds) + len(extra_inds) == len(pos_inds), \ - "sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!".format( - len(sampled_inds), len(extra_inds), len(pos_inds)) - if len(extra_inds) > num_extra: - extra_inds = np.random.choice( - extra_inds, size=num_extra, replace=False) - 
sampled_inds.extend(extra_inds.tolist()) - elif len(sampled_inds) > num_expected: - sampled_inds = np.random.choice( - sampled_inds, size=num_expected, replace=False) - return paddle.to_tensor(sampled_inds) - - -def libra_sample_via_interval(max_overlaps, full_set, num_expected, floor_thr, - num_bins, bg_thresh): - max_iou = max_overlaps.max() - iou_interval = (max_iou - floor_thr) / num_bins - per_num_expected = int(num_expected / num_bins) - - sampled_inds = [] - for i in range(num_bins): - start_iou = floor_thr + i * iou_interval - end_iou = floor_thr + (i + 1) * iou_interval - - tmp_set = set( - np.where( - np.logical_and(max_overlaps >= start_iou, max_overlaps < - end_iou))[0]) - tmp_inds = list(tmp_set & full_set) - - if len(tmp_inds) > per_num_expected: - tmp_sampled_set = np.random.choice( - tmp_inds, size=per_num_expected, replace=False) - else: - tmp_sampled_set = np.array(tmp_inds, dtype=np.int32) - sampled_inds.append(tmp_sampled_set) - - sampled_inds = np.concatenate(sampled_inds) - if len(sampled_inds) < num_expected: - num_extra = num_expected - len(sampled_inds) - extra_inds = np.array(list(full_set - set(sampled_inds))) - assert len(sampled_inds) + len(extra_inds) == len(full_set), \ - "sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!".format( - len(sampled_inds), len(extra_inds), len(full_set)) - - if len(extra_inds) > num_extra: - extra_inds = np.random.choice(extra_inds, num_extra, replace=False) - sampled_inds = np.concatenate([sampled_inds, extra_inds]) - - return sampled_inds - - -def libra_sample_neg(max_overlaps, - max_classes, - neg_inds, - num_expected, - floor_thr=-1, - floor_fraction=0, - num_bins=3, - bg_thresh=0.5): - if len(neg_inds) <= num_expected: - return neg_inds - else: - # balance sampling for negative samples - neg_set = set(neg_inds.tolist()) - if floor_thr > 0: - floor_set = set( - np.where( - np.logical_and(max_overlaps >= 0, max_overlaps < floor_thr)) - [0]) - iou_sampling_set = set(np.where(max_overlaps >= floor_thr)[0]) - elif floor_thr == 0: - floor_set = set(np.where(max_overlaps == 0)[0]) - iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) - else: - floor_set = set() - iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) - floor_thr = 0 - - floor_neg_inds = list(floor_set & neg_set) - iou_sampling_neg_inds = list(iou_sampling_set & neg_set) - - num_expected_iou_sampling = int(num_expected * (1 - floor_fraction)) - if len(iou_sampling_neg_inds) > num_expected_iou_sampling: - if num_bins >= 2: - iou_sampled_inds = libra_sample_via_interval( - max_overlaps, - set(iou_sampling_neg_inds), num_expected_iou_sampling, - floor_thr, num_bins, bg_thresh) - else: - iou_sampled_inds = np.random.choice( - iou_sampling_neg_inds, - size=num_expected_iou_sampling, - replace=False) - else: - iou_sampled_inds = np.array(iou_sampling_neg_inds, dtype=np.int32) - num_expected_floor = num_expected - len(iou_sampled_inds) - if len(floor_neg_inds) > num_expected_floor: - sampled_floor_inds = np.random.choice( - floor_neg_inds, size=num_expected_floor, replace=False) - else: - sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int32) - sampled_inds = np.concatenate((sampled_floor_inds, iou_sampled_inds)) - if len(sampled_inds) < num_expected: - num_extra = num_expected - len(sampled_inds) - extra_inds = np.array(list(neg_set - set(sampled_inds))) - if len(extra_inds) > num_extra: - extra_inds = np.random.choice( - extra_inds, size=num_extra, replace=False) - sampled_inds = np.concatenate((sampled_inds, 
extra_inds)) - return paddle.to_tensor(sampled_inds) - - -def libra_label_box(anchors, gt_boxes, gt_classes, positive_overlap, - negative_overlap, num_classes): - # TODO: use paddle API to speed up - gt_classes = gt_classes.numpy() - gt_overlaps = np.zeros((anchors.shape[0], num_classes)) - matches = np.zeros((anchors.shape[0]), dtype=np.int32) - if len(gt_boxes) > 0: - proposal_to_gt_overlaps = bbox_overlaps(anchors, gt_boxes).numpy() - overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) - overlaps_max = proposal_to_gt_overlaps.max(axis=1) - # Boxes which with non-zero overlap with gt boxes - overlapped_boxes_ind = np.where(overlaps_max > 0)[0] - overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ - overlapped_boxes_ind]] - - for idx in range(len(overlapped_boxes_ind)): - gt_overlaps[overlapped_boxes_ind[idx], overlapped_boxes_gt_classes[ - idx]] = overlaps_max[overlapped_boxes_ind[idx]] - matches[overlapped_boxes_ind[idx]] = overlaps_argmax[ - overlapped_boxes_ind[idx]] - - gt_overlaps = paddle.to_tensor(gt_overlaps) - matches = paddle.to_tensor(matches) - - matched_vals = paddle.max(gt_overlaps, axis=1) - match_labels = paddle.full(matches.shape, -1, dtype='int32') - match_labels = paddle.where(matched_vals < negative_overlap, - paddle.zeros_like(match_labels), match_labels) - match_labels = paddle.where(matched_vals >= positive_overlap, - paddle.ones_like(match_labels), match_labels) - - return matches, match_labels, matched_vals - - -def libra_sample_bbox(matches, - match_labels, - matched_vals, - gt_classes, - batch_size_per_im, - num_classes, - fg_fraction, - fg_thresh, - bg_thresh, - num_bins, - use_random=True, - is_cascade_rcnn=False): - rois_per_image = int(batch_size_per_im) - fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) - bg_rois_per_im = rois_per_image - fg_rois_per_im - - if is_cascade_rcnn: - fg_inds = paddle.nonzero(matched_vals >= fg_thresh) - bg_inds = paddle.nonzero(matched_vals < bg_thresh) - else: - matched_vals_np = matched_vals.numpy() - match_labels_np = match_labels.numpy() - - # sample fg - fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten() - fg_nums = int(np.minimum(fg_rois_per_im, fg_inds.shape[0])) - if (fg_inds.shape[0] > fg_nums) and use_random: - fg_inds = libra_sample_pos(matched_vals_np, match_labels_np, - fg_inds.numpy(), fg_rois_per_im) - fg_inds = fg_inds[:fg_nums] - - # sample bg - bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten() - bg_nums = int(np.minimum(rois_per_image - fg_nums, bg_inds.shape[0])) - if (bg_inds.shape[0] > bg_nums) and use_random: - bg_inds = libra_sample_neg( - matched_vals_np, - match_labels_np, - bg_inds.numpy(), - bg_rois_per_im, - num_bins=num_bins, - bg_thresh=bg_thresh) - bg_inds = bg_inds[:bg_nums] - - sampled_inds = paddle.concat([fg_inds, bg_inds]) - - gt_classes = paddle.gather(gt_classes, matches) - gt_classes = paddle.where(match_labels == 0, - paddle.ones_like(gt_classes) * num_classes, - gt_classes) - gt_classes = paddle.where(match_labels == -1, - paddle.ones_like(gt_classes) * -1, gt_classes) - sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) - - return sampled_inds, sampled_gt_classes - - -def libra_generate_proposal_target(rpn_rois, - gt_classes, - gt_boxes, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh, - num_classes, - use_random=True, - is_cascade_rcnn=False, - max_overlaps=None, - num_bins=3): - - rois_with_gt = [] - tgt_labels = [] - tgt_bboxes = [] - sampled_max_overlaps = [] - tgt_gt_inds = [] - new_rois_num = [] - - for i, rpn_roi 
in enumerate(rpn_rois): - max_overlap = max_overlaps[i] if is_cascade_rcnn else None - gt_bbox = gt_boxes[i] - gt_class = paddle.squeeze(gt_classes[i], axis=-1) - if is_cascade_rcnn: - rpn_roi = filter_roi(rpn_roi, max_overlap) - bbox = paddle.concat([rpn_roi, gt_bbox]) - - # Step1: label bbox - matches, match_labels, matched_vals = libra_label_box( - bbox, gt_bbox, gt_class, fg_thresh, bg_thresh, num_classes) - - # Step2: sample bbox - sampled_inds, sampled_gt_classes = libra_sample_bbox( - matches, match_labels, matched_vals, gt_class, batch_size_per_im, - num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, - use_random, is_cascade_rcnn) - - # Step3: make output - rois_per_image = paddle.gather(bbox, sampled_inds) - sampled_gt_ind = paddle.gather(matches, sampled_inds) - sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) - sampled_overlap = paddle.gather(matched_vals, sampled_inds) - - rois_per_image.stop_gradient = True - sampled_gt_ind.stop_gradient = True - sampled_bbox.stop_gradient = True - sampled_overlap.stop_gradient = True - - tgt_labels.append(sampled_gt_classes) - tgt_bboxes.append(sampled_bbox) - rois_with_gt.append(rois_per_image) - sampled_max_overlaps.append(sampled_overlap) - tgt_gt_inds.append(sampled_gt_ind) - new_rois_num.append(paddle.shape(sampled_inds)[0:1]) - new_rois_num = paddle.concat(new_rois_num) - # rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num - return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target_layer.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target_layer.py deleted file mode 100644 index c010c81..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target_layer.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import paddle -from ppdet.core.workspace import register, serializable - -from .target import rpn_anchor_target, generate_proposal_target, generate_mask_target, libra_generate_proposal_target -import numpy as np - - -@register -@serializable -class RPNTargetAssign(object): - __shared__ = ['assign_on_cpu'] - """ - RPN targets assignment module - - The assignment consists of three steps: - 1. Match anchor and ground-truth box, label the anchor with foreground - or background sample - 2. Sample anchors to keep the properly ratio between foreground and - background - 3. Generate the targets for classification and regression branch - - - Args: - batch_size_per_im (int): Total number of RPN samples per image. - default 256 - fg_fraction (float): Fraction of anchors that is labeled - foreground, default 0.5 - positive_overlap (float): Minimum overlap required between an anchor - and ground-truth box for the (anchor, gt box) pair to be - a foreground sample. 
default 0.7 - negative_overlap (float): Maximum overlap allowed between an anchor - and ground-truth box for the (anchor, gt box) pair to be - a background sample. default 0.3 - ignore_thresh(float): Threshold for ignoring the is_crowd ground-truth - if the value is larger than zero. - use_random (bool): Use random sampling to choose foreground and - background boxes, default true. - assign_on_cpu (bool): In case the number of gt box is too large, - compute IoU on CPU, default false. - """ - - def __init__(self, - batch_size_per_im=256, - fg_fraction=0.5, - positive_overlap=0.7, - negative_overlap=0.3, - ignore_thresh=-1., - use_random=True, - assign_on_cpu=False): - super(RPNTargetAssign, self).__init__() - self.batch_size_per_im = batch_size_per_im - self.fg_fraction = fg_fraction - self.positive_overlap = positive_overlap - self.negative_overlap = negative_overlap - self.ignore_thresh = ignore_thresh - self.use_random = use_random - self.assign_on_cpu = assign_on_cpu - - def __call__(self, inputs, anchors): - """ - inputs: ground-truth instances. - anchor_box (Tensor): [num_anchors, 4], num_anchors are all anchors in all feature maps. - """ - gt_boxes = inputs['gt_bbox'] - is_crowd = inputs.get('is_crowd', None) - batch_size = len(gt_boxes) - tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target( - anchors, - gt_boxes, - self.batch_size_per_im, - self.positive_overlap, - self.negative_overlap, - self.fg_fraction, - self.use_random, - batch_size, - self.ignore_thresh, - is_crowd, - assign_on_cpu=self.assign_on_cpu) - norm = self.batch_size_per_im * batch_size - - return tgt_labels, tgt_bboxes, tgt_deltas, norm - - -@register -class BBoxAssigner(object): - __shared__ = ['num_classes', 'assign_on_cpu'] - """ - RCNN targets assignment module - - The assignment consists of three steps: - 1. Match RoIs and ground-truth box, label the RoIs with foreground - or background sample - 2. Sample anchors to keep the properly ratio between foreground and - background - 3. Generate the targets for classification and regression branch - - Args: - batch_size_per_im (int): Total number of RoIs per image. - default 512 - fg_fraction (float): Fraction of RoIs that is labeled - foreground, default 0.25 - fg_thresh (float): Minimum overlap required between a RoI - and ground-truth box for the (roi, gt box) pair to be - a foreground sample. default 0.5 - bg_thresh (float): Maximum overlap allowed between a RoI - and ground-truth box for the (roi, gt box) pair to be - a background sample. default 0.5 - ignore_thresh(float): Threshold for ignoring the is_crowd ground-truth - if the value is larger than zero. - use_random (bool): Use random sampling to choose foreground and - background boxes, default true - cascade_iou (list[iou]): The list of overlap to select foreground and - background of each stage, which is only used In Cascade RCNN. - num_classes (int): The number of class. - assign_on_cpu (bool): In case the number of gt box is too large, - compute IoU on CPU, default false. 
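Back-of-envelope arithmetic for the RoI sampling that `BBoxAssigner` configures (it delegates to `subsample_labels()`, shown earlier in `target.py`): the defaults cap foreground at `fg_fraction * batch_size_per_im` per image and fill the rest of the budget with background. The per-image counts below are invented for the demo.

```python
batch_size_per_im, fg_fraction = 512, 0.25
available_fg, available_bg = 37, 5000  # hypothetical per-image RoI counts

fg_num = min(available_fg, int(batch_size_per_im * fg_fraction))  # <= 128
bg_num = min(available_bg, batch_size_per_im - fg_num)
print(fg_num, bg_num)  # -> 37 475: scarce foreground is topped up with background
```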
- """ - - def __init__(self, - batch_size_per_im=512, - fg_fraction=.25, - fg_thresh=.5, - bg_thresh=.5, - ignore_thresh=-1., - use_random=True, - cascade_iou=[0.5, 0.6, 0.7], - num_classes=80, - assign_on_cpu=False): - super(BBoxAssigner, self).__init__() - self.batch_size_per_im = batch_size_per_im - self.fg_fraction = fg_fraction - self.fg_thresh = fg_thresh - self.bg_thresh = bg_thresh - self.ignore_thresh = ignore_thresh - self.use_random = use_random - self.cascade_iou = cascade_iou - self.num_classes = num_classes - self.assign_on_cpu = assign_on_cpu - - def __call__(self, - rpn_rois, - rpn_rois_num, - inputs, - stage=0, - is_cascade=False, - add_gt_as_proposals=True): - gt_classes = inputs['gt_class'] - gt_boxes = inputs['gt_bbox'] - is_crowd = inputs.get('is_crowd', None) - # rois, tgt_labels, tgt_bboxes, tgt_gt_inds - # new_rois_num - outs = generate_proposal_target( - rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, - self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, - self.ignore_thresh, is_crowd, self.use_random, is_cascade, - self.cascade_iou[stage], self.assign_on_cpu, add_gt_as_proposals) - rois = outs[0] - rois_num = outs[-1] - # tgt_labels, tgt_bboxes, tgt_gt_inds - targets = outs[1:4] - return rois, rois_num, targets - - -@register -class BBoxLibraAssigner(object): - __shared__ = ['num_classes'] - """ - Libra-RCNN targets assignment module - - The assignment consists of three steps: - 1. Match RoIs and ground-truth box, label the RoIs with foreground - or background sample - 2. Sample anchors to keep the properly ratio between foreground and - background - 3. Generate the targets for classification and regression branch - - Args: - batch_size_per_im (int): Total number of RoIs per image. - default 512 - fg_fraction (float): Fraction of RoIs that is labeled - foreground, default 0.25 - fg_thresh (float): Minimum overlap required between a RoI - and ground-truth box for the (roi, gt box) pair to be - a foreground sample. default 0.5 - bg_thresh (float): Maximum overlap allowed between a RoI - and ground-truth box for the (roi, gt box) pair to be - a background sample. default 0.5 - use_random (bool): Use random sampling to choose foreground and - background boxes, default true - cascade_iou (list[iou]): The list of overlap to select foreground and - background of each stage, which is only used In Cascade RCNN. - num_classes (int): The number of class. - num_bins (int): The number of libra_sample. 
- """ - - def __init__(self, - batch_size_per_im=512, - fg_fraction=.25, - fg_thresh=.5, - bg_thresh=.5, - use_random=True, - cascade_iou=[0.5, 0.6, 0.7], - num_classes=80, - num_bins=3): - super(BBoxLibraAssigner, self).__init__() - self.batch_size_per_im = batch_size_per_im - self.fg_fraction = fg_fraction - self.fg_thresh = fg_thresh - self.bg_thresh = bg_thresh - self.use_random = use_random - self.cascade_iou = cascade_iou - self.num_classes = num_classes - self.num_bins = num_bins - - def __call__(self, - rpn_rois, - rpn_rois_num, - inputs, - stage=0, - is_cascade=False): - gt_classes = inputs['gt_class'] - gt_boxes = inputs['gt_bbox'] - # rois, tgt_labels, tgt_bboxes, tgt_gt_inds - outs = libra_generate_proposal_target( - rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, - self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, - self.use_random, is_cascade, self.cascade_iou[stage], self.num_bins) - rois = outs[0] - rois_num = outs[-1] - # tgt_labels, tgt_bboxes, tgt_gt_inds - targets = outs[1:4] - return rois, rois_num, targets - - -@register -@serializable -class MaskAssigner(object): - __shared__ = ['num_classes', 'mask_resolution'] - """ - Mask targets assignment module - - The assignment consists of three steps: - 1. Select RoIs labels with foreground. - 2. Encode the RoIs and corresponding gt polygons to generate - mask target - - Args: - num_classes (int): The number of class - mask_resolution (int): The resolution of mask target, default 14 - """ - - def __init__(self, num_classes=80, mask_resolution=14): - super(MaskAssigner, self).__init__() - self.num_classes = num_classes - self.mask_resolution = mask_resolution - - def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs): - gt_segms = inputs['gt_poly'] - - outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds, - self.num_classes, self.mask_resolution) - - # mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights - return outs - - -@register -class RBoxAssigner(object): - """ - assigner of rbox - Args: - pos_iou_thr (float): threshold of pos samples - neg_iou_thr (float): threshold of neg samples - min_iou_thr (float): the min threshold of samples - ignore_iof_thr (int): the ignored threshold - """ - - def __init__(self, - pos_iou_thr=0.5, - neg_iou_thr=0.4, - min_iou_thr=0.0, - ignore_iof_thr=-2): - super(RBoxAssigner, self).__init__() - - self.pos_iou_thr = pos_iou_thr - self.neg_iou_thr = neg_iou_thr - self.min_iou_thr = min_iou_thr - self.ignore_iof_thr = ignore_iof_thr - - def anchor_valid(self, anchors): - """ - - Args: - anchor: M x 4 - - Returns: - - """ - if anchors.ndim == 3: - anchors = anchors.reshape(-1, anchors.shape[-1]) - assert anchors.ndim == 2 - anchor_num = anchors.shape[0] - anchor_valid = np.ones((anchor_num), np.int32) - anchor_inds = np.arange(anchor_num) - return anchor_inds - - def rbox2delta(self, - proposals, - gt, - means=[0, 0, 0, 0, 0], - stds=[1, 1, 1, 1, 1]): - """ - Args: - proposals: tensor [N, 5] - gt: gt [N, 5] - means: means [5] - stds: stds [5] - Returns: - - """ - proposals = proposals.astype(np.float64) - - PI = np.pi - - gt_widths = gt[..., 2] - gt_heights = gt[..., 3] - gt_angle = gt[..., 4] - - proposals_widths = proposals[..., 2] - proposals_heights = proposals[..., 3] - proposals_angle = proposals[..., 4] - - coord = gt[..., 0:2] - proposals[..., 0:2] - dx = (np.cos(proposals[..., 4]) * coord[..., 0] + - np.sin(proposals[..., 4]) * coord[..., 1]) / proposals_widths - dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + - 
np.cos(proposals[..., 4]) * coord[..., 1]) / proposals_heights - dw = np.log(gt_widths / proposals_widths) - dh = np.log(gt_heights / proposals_heights) - da = (gt_angle - proposals_angle) - - da = (da + PI / 4) % PI - PI / 4 - da /= PI - - deltas = np.stack([dx, dy, dw, dh, da], axis=-1) - means = np.array(means, dtype=deltas.dtype) - stds = np.array(stds, dtype=deltas.dtype) - deltas = (deltas - means) / stds - deltas = deltas.astype(np.float32) - return deltas - - def assign_anchor(self, - anchors, - gt_bboxes, - gt_labels, - pos_iou_thr, - neg_iou_thr, - min_iou_thr=0.0, - ignore_iof_thr=-2): - assert anchors.shape[1] == 4 or anchors.shape[1] == 5 - assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5 - anchors_xc_yc = anchors - gt_bboxes_xc_yc = gt_bboxes - - # calc rbox iou - anchors_xc_yc = anchors_xc_yc.astype(np.float32) - gt_bboxes_xc_yc = gt_bboxes_xc_yc.astype(np.float32) - anchors_xc_yc = paddle.to_tensor(anchors_xc_yc) - gt_bboxes_xc_yc = paddle.to_tensor(gt_bboxes_xc_yc) - - try: - from ext_op import rbox_iou - except Exception as e: - print("import custom_ops error, try install ext_op " \ - "following ppdet/ext_op/README.md", e) - sys.stdout.flush() - sys.exit(-1) - - iou = rbox_iou(gt_bboxes_xc_yc, anchors_xc_yc) - iou = iou.numpy() - iou = iou.T - - # every gt's anchor's index - gt_bbox_anchor_inds = iou.argmax(axis=0) - gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])] - gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0] - - # every anchor's gt bbox's index - anchor_gt_bbox_inds = iou.argmax(axis=1) - anchor_gt_bbox_iou = iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds] - - # (1) set labels=-2 as default - labels = np.ones((iou.shape[0], ), dtype=np.int32) * ignore_iof_thr - - # (2) assign ignore - labels[anchor_gt_bbox_iou < min_iou_thr] = ignore_iof_thr - - # (3) assign neg_ids -1 - assign_neg_ids1 = anchor_gt_bbox_iou >= min_iou_thr - assign_neg_ids2 = anchor_gt_bbox_iou < neg_iou_thr - assign_neg_ids = np.logical_and(assign_neg_ids1, assign_neg_ids2) - labels[assign_neg_ids] = -1 - - # anchor_gt_bbox_iou_inds - # (4) assign max_iou as pos_ids >=0 - anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds] - # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, anchor_gt_bbox_iou >= min_iou_thr) - labels[gt_bbox_anchor_iou_inds] = gt_labels[anchor_gt_bbox_iou_inds] - - # (5) assign >= pos_iou_thr as pos_ids - iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr - iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids] - labels[iou_pos_iou_thr_ids] = gt_labels[iou_pos_iou_thr_ids_box_inds] - return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels - - def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd): - - assert anchors.ndim == 2 - assert anchors.shape[1] == 5 - assert gt_bboxes.ndim == 2 - assert gt_bboxes.shape[1] == 5 - - pos_iou_thr = self.pos_iou_thr - neg_iou_thr = self.neg_iou_thr - min_iou_thr = self.min_iou_thr - ignore_iof_thr = self.ignore_iof_thr - - anchor_num = anchors.shape[0] - - gt_bboxes = gt_bboxes - is_crowd_slice = is_crowd - not_crowd_inds = np.where(is_crowd_slice == 0) - - # Step1: match anchor and gt_bbox - anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = self.assign_anchor( - anchors, gt_bboxes, - gt_labels.reshape(-1), pos_iou_thr, neg_iou_thr, min_iou_thr, - ignore_iof_thr) - - # Step2: sample anchor - pos_inds = np.where(labels >= 0)[0] - neg_inds = np.where(labels == -1)[0] - - # Step3: make output - anchors_num = anchors.shape[0] - bbox_targets 
= np.zeros_like(anchors) - bbox_weights = np.zeros_like(anchors) - bbox_gt_bboxes = np.zeros_like(anchors) - pos_labels = np.zeros(anchors_num, dtype=np.int32) - pos_labels_weights = np.zeros(anchors_num, dtype=np.float32) - - pos_sampled_anchors = anchors[pos_inds] - pos_sampled_gt_boxes = gt_bboxes[anchor_gt_bbox_inds[pos_inds]] - if len(pos_inds) > 0: - pos_bbox_targets = self.rbox2delta(pos_sampled_anchors, - pos_sampled_gt_boxes) - bbox_targets[pos_inds, :] = pos_bbox_targets - bbox_gt_bboxes[pos_inds, :] = pos_sampled_gt_boxes - bbox_weights[pos_inds, :] = 1.0 - - pos_labels[pos_inds] = labels[pos_inds] - pos_labels_weights[pos_inds] = 1.0 - - if len(neg_inds) > 0: - pos_labels_weights[neg_inds] = 1.0 - return (pos_labels, pos_labels_weights, bbox_targets, bbox_weights, - bbox_gt_bboxes, pos_inds, neg_inds) diff --git a/pdfdet/models/Paddle/ppdet/modeling/rbox_utils.py b/pdfdet/models/Paddle/ppdet/modeling/rbox_utils.py deleted file mode 100644 index a5f19a2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/rbox_utils.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import numpy as np -import cv2 - - -def norm_angle(angle, range=[-np.pi / 4, np.pi]): - return (angle - range[0]) % range[1] + range[0] - - -# rbox function implemented using numpy -def poly2rbox_le135_np(poly): - """convert poly to rbox [-pi / 4, 3 * pi / 4] - - Args: - poly: [x1, y1, x2, y2, x3, y3, x4, y4] - - Returns: - rbox: [cx, cy, w, h, angle] - """ - poly = np.array(poly[:8], dtype=np.float32) - - pt1 = (poly[0], poly[1]) - pt2 = (poly[2], poly[3]) - pt3 = (poly[4], poly[5]) - pt4 = (poly[6], poly[7]) - - edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) * - (pt1[1] - pt2[1])) - edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) * - (pt2[1] - pt3[1])) - - width = max(edge1, edge2) - height = min(edge1, edge2) - - rbox_angle = 0 - if edge1 > edge2: - rbox_angle = np.arctan2(float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0])) - elif edge2 >= edge1: - rbox_angle = np.arctan2(float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0])) - - rbox_angle = norm_angle(rbox_angle) - - x_ctr = float(pt1[0] + pt3[0]) / 2 - y_ctr = float(pt1[1] + pt3[1]) / 2 - return [x_ctr, y_ctr, width, height, rbox_angle] - - -def poly2rbox_oc_np(poly): - """convert poly to rbox (0, pi / 2] - - Args: - poly: [x1, y1, x2, y2, x3, y3, x4, y4] - - Returns: - rbox: [cx, cy, w, h, angle] - """ - points = np.array(poly, dtype=np.float32).reshape((-1, 2)) - (cx, cy), (w, h), angle = cv2.minAreaRect(points) - # using the new OpenCV Rotated BBox definition since 4.5.1 - # if angle < 0, opencv is older than 4.5.1, angle is in [-90, 0) - if angle < 0: - angle += 90 - w, h = h, w - - # convert angle to [0, 90) - if angle == -0.0: - angle = 0.0 - if angle == 90.0: - angle = 0.0 - w, h = h, w - - angle = angle / 180 * np.pi - return [cx, cy, w, h, angle] - - -def poly2rbox_np(polys, rbox_type='oc'): 
- """ - polys: [x0,y0,x1,y1,x2,y2,x3,y3] - to - rboxes: [x_ctr,y_ctr,w,h,angle] - """ - assert rbox_type in ['oc', 'le135'], 'only oc or le135 is supported now' - poly2rbox_fn = poly2rbox_oc_np if rbox_type == 'oc' else poly2rbox_le135_np - rboxes = [] - for poly in polys: - x, y, w, h, angle = poly2rbox_fn(poly) - rbox = np.array([x, y, w, h, angle], dtype=np.float32) - rboxes.append(rbox) - - return np.array(rboxes) - - -def cal_line_length(point1, point2): - return math.sqrt( - math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) - - -def get_best_begin_point_single(coordinate): - x1, y1, x2, y2, x3, y3, x4, y4 = coordinate - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - xmax = max(x1, x2, x3, x4) - ymax = max(y1, y2, y3, y4) - combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], - [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], - [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], - [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] - dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] - force = 100000000.0 - force_flag = 0 - for i in range(4): - temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ - + cal_line_length(combinate[i][1], dst_coordinate[1]) \ - + cal_line_length(combinate[i][2], dst_coordinate[2]) \ - + cal_line_length(combinate[i][3], dst_coordinate[3]) - if temp_force < force: - force = temp_force - force_flag = i - if force_flag != 0: - pass - return np.array(combinate[force_flag]).reshape(8) - - -def rbox2poly_np(rboxes): - """ - rboxes:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - polys = [] - for i in range(len(rboxes)): - x_ctr, y_ctr, width, height, angle = rboxes[i][:5] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) - poly = get_best_begin_point_single(poly) - polys.append(poly) - polys = np.array(polys) - return polys - - -# rbox function implemented using paddle -def box2corners(box): - """convert box coordinate to corners - Args: - box (Tensor): (B, N, 5) with (x, y, w, h, alpha) angle is in [0, 90) - Returns: - corners (Tensor): (B, N, 4, 2) with (x1, y1, x2, y2, x3, y3, x4, y4) - """ - B = box.shape[0] - x, y, w, h, alpha = paddle.split(box, 5, axis=-1) - x4 = paddle.to_tensor( - [0.5, 0.5, -0.5, -0.5], dtype=paddle.float32).reshape( - (1, 1, 4)) # (1,1,4) - x4 = x4 * w # (B, N, 4) - y4 = paddle.to_tensor( - [-0.5, 0.5, 0.5, -0.5], dtype=paddle.float32).reshape((1, 1, 4)) - y4 = y4 * h # (B, N, 4) - corners = paddle.stack([x4, y4], axis=-1) # (B, N, 4, 2) - sin = paddle.sin(alpha) - cos = paddle.cos(alpha) - row1 = paddle.concat([cos, sin], axis=-1) - row2 = paddle.concat([-sin, cos], axis=-1) # (B, N, 2) - rot_T = paddle.stack([row1, row2], axis=-2) # (B, N, 2, 2) - rotated = paddle.bmm(corners.reshape([-1, 4, 2]), rot_T.reshape([-1, 2, 2])) - rotated = rotated.reshape([B, -1, 4, 2]) # (B*N, 4, 2) -> (B, N, 4, 2) - rotated[..., 0] += x - rotated[..., 1] += y - return rotated - - -def paddle_gather(x, dim, index): - index_shape = index.shape - index_flatten = index.flatten() - if dim < 0: - dim = len(x.shape) + dim - nd_index = [] - for k in range(len(x.shape)): - if k == dim: - nd_index.append(index_flatten) - else: - reshape_shape = [1] * 
len(x.shape) - reshape_shape[k] = x.shape[k] - x_arange = paddle.arange(x.shape[k], dtype=index.dtype) - x_arange = x_arange.reshape(reshape_shape) - dim_index = paddle.expand(x_arange, index_shape).flatten() - nd_index.append(dim_index) - ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") - paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) - return paddle_out - - -def check_points_in_polys(points, polys): - """Check whether point is in rotated boxes - Args: - points (tensor): (1, L, 2) anchor points - polys (tensor): [B, N, 4, 2] gt_polys - eps (float): default 1e-9 - Returns: - is_in_polys (tensor): (B, N, L) - """ - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - # [B, N, 4, 2] -> [B, N, 1, 2] - a, b, c, d = polys.split(4, axis=2) - ab = b - a - ad = d - a - # [B, N, L, 2] - ap = points - a - # [B, N, 1] - norm_ab = paddle.sum(ab * ab, axis=-1) - # [B, N, 1] - norm_ad = paddle.sum(ad * ad, axis=-1) - # [B, N, L] dot product - ap_dot_ab = paddle.sum(ap * ab, axis=-1) - # [B, N, L] dot product - ap_dot_ad = paddle.sum(ap * ad, axis=-1) - # [B, N, L] = |A|*|B|*cos(theta) - is_in_polys = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & ( - ap_dot_ad >= 0) & (ap_dot_ad <= norm_ad) - return is_in_polys - - -def check_points_in_rotated_boxes(points, boxes): - """Check whether point is in rotated boxes - - Args: - points (tensor): (1, L, 2) anchor points - boxes (tensor): [B, N, 5] gt_bboxes - eps (float): default 1e-9 - - Returns: - is_in_box (tensor): (B, N, L) - - """ - # [B, N, 5] -> [B, N, 4, 2] - corners = box2corners(boxes) - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - # [B, N, 4, 2] -> [B, N, 1, 2] - a, b, c, d = corners.split(4, axis=2) - ab = b - a - ad = d - a - # [B, N, L, 2] - ap = points - a - # [B, N, L] - norm_ab = paddle.sum(ab * ab, axis=-1) - # [B, N, L] - norm_ad = paddle.sum(ad * ad, axis=-1) - # [B, N, L] dot product - ap_dot_ab = paddle.sum(ap * ab, axis=-1) - # [B, N, L] dot product - ap_dot_ad = paddle.sum(ap * ad, axis=-1) - # [B, N, L] = |A|*|B|*cos(theta) - is_in_box = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (ap_dot_ad >= 0) & ( - ap_dot_ad <= norm_ad) - return is_in_box - - -def rotated_iou_similarity(box1, box2, eps=1e-9, func=''): - """Calculate iou of box1 and box2 - - Args: - box1 (Tensor): box with the shape [N, M1, 5] - box2 (Tensor): box with the shape [N, M2, 5] - - Return: - iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] - """ - from ext_op import rbox_iou - rotated_ious = [] - for b1, b2 in zip(box1, box2): - rotated_ious.append(rbox_iou(b1, b2)) - - return paddle.stack(rotated_ious, axis=0) diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/reid/__init__.py deleted file mode 100644 index 3c176d7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . 
import jde_embedding_head -from . import fairmot_embedding_head -from . import resnet -from . import pyramidal_embedding -from . import pplcnet_embedding -from . import resnet_embedding - -from .fairmot_embedding_head import * -from .jde_embedding_head import * -from .resnet import * -from .pyramidal_embedding import * -from .pplcnet_embedding import * -from .resnet_embedding import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/fairmot_embedding_head.py b/pdfdet/models/Paddle/ppdet/modeling/reid/fairmot_embedding_head.py deleted file mode 100644 index 98ca257..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/fairmot_embedding_head.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import KaimingUniform, Uniform -from ppdet.core.workspace import register -from ppdet.modeling.heads.centernet_head import ConvLayer - -__all__ = ['FairMOTEmbeddingHead'] - - -@register -class FairMOTEmbeddingHead(nn.Layer): - __shared__ = ['num_classes'] - """ - Args: - in_channels (int): the channel number of input to FairMOTEmbeddingHead. - ch_head (int): the channel of features before being fed into the embedding, 256 by default. - ch_emb (int): the channel of the embedding feature, 128 by default. - num_identities_dict (dict): the number of identities of each category, - supports single class and multi-class, {0: 14455} as default.
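At inference time the head below turns the dense reid feature map into one embedding per detection: L2-normalize along channels, flatten the spatial grid, then gather rows at the detected center indices. A condensed sketch of that gather (shapes and indices are made up for illustration):

import paddle
import paddle.nn.functional as F

bs, ch_emb, h, w = 1, 128, 4, 4
reid_feat = paddle.randn([bs, ch_emb, h, w])
bbox_inds = paddle.to_tensor([3, 9])          # flat h*w indices of two detections

emb = F.normalize(reid_feat)                  # L2-normalize along the channel axis
emb = paddle.transpose(emb, [0, 2, 3, 1])     # [bs, h, w, ch_emb]
emb = paddle.reshape(emb, [-1, ch_emb])       # [bs*h*w, ch_emb]
pred_embs = paddle.gather(emb, bbox_inds)     # one 128-d vector per detection
print(pred_embs.shape)                        # [2, 128]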
- """ - - def __init__(self, - in_channels, - ch_head=256, - ch_emb=128, - num_classes=1, - num_identities_dict={0: 14455}): - super(FairMOTEmbeddingHead, self).__init__() - assert num_classes >= 1 - self.num_classes = num_classes - self.ch_emb = ch_emb - self.num_identities_dict = num_identities_dict - self.reid = nn.Sequential( - ConvLayer( - in_channels, ch_head, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - ch_head, ch_emb, kernel_size=1, stride=1, padding=0, bias=True)) - param_attr = paddle.ParamAttr(initializer=KaimingUniform()) - bound = 1 / math.sqrt(ch_emb) - bias_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) - self.reid_loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum') - - if num_classes == 1: - nID = self.num_identities_dict[0] # single class - self.classifier = nn.Linear( - ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) - # When num_identities(nID) is 1, emb_scale is set as 1 - self.emb_scale = math.sqrt(2) * math.log(nID - 1) if nID > 1 else 1 - else: - self.classifiers = dict() - self.emb_scale_dict = dict() - for cls_id, nID in self.num_identities_dict.items(): - self.classifiers[str(cls_id)] = nn.Linear( - ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) - # When num_identities(nID) is 1, emb_scale is set as 1 - self.emb_scale_dict[str(cls_id)] = math.sqrt(2) * math.log( - nID - 1) if nID > 1 else 1 - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels} - - def process_by_class(self, bboxes, embedding, bbox_inds, topk_clses): - pred_dets, pred_embs = [], [] - for cls_id in range(self.num_classes): - inds_masks = topk_clses == cls_id - inds_masks = paddle.cast(inds_masks, 'float32') - - pos_num = inds_masks.sum().numpy() - if pos_num == 0: - continue - - cls_inds_mask = inds_masks > 0 - - bbox_mask = paddle.nonzero(cls_inds_mask) - cls_bboxes = paddle.gather_nd(bboxes, bbox_mask) - pred_dets.append(cls_bboxes) - - cls_inds = paddle.masked_select(bbox_inds, cls_inds_mask) - cls_inds = cls_inds.unsqueeze(-1) - cls_embedding = paddle.gather_nd(embedding, cls_inds) - pred_embs.append(cls_embedding) - - return paddle.concat(pred_dets), paddle.concat(pred_embs) - - def forward(self, - neck_feat, - inputs, - bboxes=None, - bbox_inds=None, - topk_clses=None): - reid_feat = self.reid(neck_feat) - if self.training: - if self.num_classes == 1: - loss = self.get_loss(reid_feat, inputs) - else: - loss = self.get_mc_loss(reid_feat, inputs) - return loss - else: - assert bboxes is not None and bbox_inds is not None - reid_feat = F.normalize(reid_feat) - embedding = paddle.transpose(reid_feat, [0, 2, 3, 1]) - embedding = paddle.reshape(embedding, [-1, self.ch_emb]) - # embedding shape: [bs * h * w, ch_emb] - - if self.num_classes == 1: - pred_dets = bboxes - pred_embs = paddle.gather(embedding, bbox_inds) - else: - pred_dets, pred_embs = self.process_by_class( - bboxes, embedding, bbox_inds, topk_clses) - return pred_dets, pred_embs - - def get_loss(self, feat, inputs): - index = inputs['index'] - mask = inputs['index_mask'] - target = inputs['reid'] - target = paddle.masked_select(target, mask > 0) - target = paddle.unsqueeze(target, 1) - - feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) - feat_n, feat_h, feat_w, feat_c = feat.shape - feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) - index = paddle.unsqueeze(index, 2) - batch_inds = list() - for i in range(feat_n): - batch_ind = paddle.full( 
- shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - feat = paddle.gather_nd(feat, index=index) - - mask = paddle.unsqueeze(mask, axis=2) - mask = paddle.expand_as(mask, feat) - mask.stop_gradient = True - feat = paddle.masked_select(feat, mask > 0) - feat = paddle.reshape(feat, shape=[-1, feat_c]) - feat = F.normalize(feat) - feat = self.emb_scale * feat - logit = self.classifier(feat) - target.stop_gradient = True - loss = self.reid_loss(logit, target) - valid = (target != self.reid_loss.ignore_index) - valid.stop_gradient = True - count = paddle.sum((paddle.cast(valid, dtype=np.int32))) - count.stop_gradient = True - if count > 0: - loss = loss / count - - return loss - - def get_mc_loss(self, feat, inputs): - # feat.shape = [bs, ch_emb, h, w] - assert 'cls_id_map' in inputs and 'cls_tr_ids' in inputs - index = inputs['index'] - mask = inputs['index_mask'] - cls_id_map = inputs['cls_id_map'] # [bs, h, w] - cls_tr_ids = inputs['cls_tr_ids'] # [bs, num_classes, h, w] - - feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) - feat_n, feat_h, feat_w, feat_c = feat.shape - feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) - - index = paddle.unsqueeze(index, 2) - batch_inds = list() - for i in range(feat_n): - batch_ind = paddle.full( - shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - feat = paddle.gather_nd(feat, index=index) - - mask = paddle.unsqueeze(mask, axis=2) - mask = paddle.expand_as(mask, feat) - mask.stop_gradient = True - feat = paddle.masked_select(feat, mask > 0) - feat = paddle.reshape(feat, shape=[-1, feat_c]) - - reid_losses = 0 - for cls_id, id_num in self.num_identities_dict.items(): - # target - cur_cls_tr_ids = paddle.reshape( - cls_tr_ids[:, cls_id, :, :], shape=[feat_n, -1]) # [bs, h*w] - cls_id_target = paddle.gather_nd(cur_cls_tr_ids, index=index) - mask = inputs['index_mask'] - cls_id_target = paddle.masked_select(cls_id_target, mask > 0) - cls_id_target.stop_gradient = True - - # feat - cls_id_feat = self.emb_scale_dict[str(cls_id)] * F.normalize(feat) - cls_id_pred = self.classifiers[str(cls_id)](cls_id_feat) - - loss = self.reid_loss(cls_id_pred, cls_id_target) - valid = (cls_id_target != self.reid_loss.ignore_index) - valid.stop_gradient = True - count = paddle.sum((paddle.cast(valid, dtype=np.int32))) - count.stop_gradient = True - if count > 0: - loss = loss / count - reid_losses += loss - - return reid_losses diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/jde_embedding_head.py b/pdfdet/models/Paddle/ppdet/modeling/reid/jde_embedding_head.py deleted file mode 100644 index 1d1e60f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/jde_embedding_head.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register -from paddle.nn.initializer import Normal, Constant - -__all__ = ['JDEEmbeddingHead'] - - -class LossParam(nn.Layer): - def __init__(self, init_value=0., use_uncertainy=True): - super(LossParam, self).__init__() - self.loss_param = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=init_value)), - dtype="float32") - - def forward(self, inputs): - out = paddle.exp(-self.loss_param) * inputs + self.loss_param - return out * 0.5 - - -@register -class JDEEmbeddingHead(nn.Layer): - __shared__ = ['num_classes'] - __inject__ = ['emb_loss', 'jde_loss'] - """ - JDEEmbeddingHead - Args: - num_classes (int): Number of classes. Only single-class tracking is supported. - num_identities (int): Number of identities. - anchor_levels (int): Number of anchor levels, same as FPN levels. - anchor_scales (int): Number of anchor scales on each FPN level. - embedding_dim (int): Embedding dimension. Default: 512. - emb_loss (object): Instance of 'JDEEmbeddingLoss' - jde_loss (object): Instance of 'JDELoss' - """ - - def __init__( - self, - num_classes=1, - num_identities=14455, # dataset.num_identities_dict[0] - anchor_levels=3, - anchor_scales=4, - embedding_dim=512, - emb_loss='JDEEmbeddingLoss', - jde_loss='JDELoss'): - super(JDEEmbeddingHead, self).__init__() - self.num_classes = num_classes - self.num_identities = num_identities - self.anchor_levels = anchor_levels - self.anchor_scales = anchor_scales - self.embedding_dim = embedding_dim - self.emb_loss = emb_loss - self.jde_loss = jde_loss - - self.emb_scale = math.sqrt(2) * math.log( - self.num_identities - 1) if self.num_identities > 1 else 1 - - self.identify_outputs = [] - self.loss_params_cls = [] - self.loss_params_reg = [] - self.loss_params_ide = [] - for i in range(self.anchor_levels): - name = 'identify_output.{}'.format(i) - identify_output = self.add_sublayer( - name, - nn.Conv2D( - in_channels=64 * (2**self.anchor_levels) // (2**i), - out_channels=self.embedding_dim, - kernel_size=3, - stride=1, - padding=1, - bias_attr=ParamAttr(regularizer=L2Decay(0.)))) - self.identify_outputs.append(identify_output) - - loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15)) - self.loss_params_cls.append(loss_p_cls) - loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85)) - self.loss_params_reg.append(loss_p_reg) - loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3)) - self.loss_params_ide.append(loss_p_ide) - - self.classifier = self.add_sublayer( - 'classifier', - nn.Linear( - self.embedding_dim, - self.num_identities, - weight_attr=ParamAttr( - learning_rate=1., initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - - def forward(self, - identify_feats, - targets, - loss_confs=None, - loss_boxes=None, - bboxes=None, - boxes_idx=None, - nms_keep_idx=None): - assert self.num_classes == 1, 'JDE only supports single-class MOT.'
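The LossParam layer defined above is the learnable task-uncertainty weighting of Kendall et al. (CVPR 2018): each branch loss L is rescaled to 0.5 * (exp(-s) * L + s) with a trainable log-variance s, which is how the per-level classification, regression, and identification losses are balanced. The formula in isolation:

import paddle

s = paddle.to_tensor(0.0, stop_gradient=False)   # trainable log-variance
loss = paddle.to_tensor(2.0)                     # some branch loss
weighted = 0.5 * (paddle.exp(-s) * loss + s)     # what LossParam.forward computes
weighted.backward()
# Prints 1.0 and -0.5; for a large loss the gradient increases s,
# automatically downweighting that noisy task over training.
print(float(weighted), float(s.grad))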
- assert len(identify_feats) == self.anchor_levels - ide_outs = [] - for feat, ide_head in zip(identify_feats, self.identify_outputs): - ide_outs.append(ide_head(feat)) - - if self.training: - assert len(loss_confs) == len(loss_boxes) == self.anchor_levels - loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale, - self.classifier) - jde_losses = self.jde_loss( - loss_confs, loss_boxes, loss_ides, self.loss_params_cls, - self.loss_params_reg, self.loss_params_ide, targets) - return jde_losses - else: - assert bboxes is not None - assert boxes_idx is not None - assert nms_keep_idx is not None - - emb_outs = self.get_emb_outs(ide_outs) - emb_valid = paddle.gather_nd(emb_outs, boxes_idx) - pred_embs = paddle.gather_nd(emb_valid, nms_keep_idx) - - input_shape = targets['image'].shape[2:] - # input_shape: [h, w], before data transforms, set in model config - im_shape = targets['im_shape'][0].numpy() - # im_shape: [new_h, new_w], after data transforms - scale_factor = targets['scale_factor'][0].numpy() - bboxes[:, 2:] = self.scale_coords(bboxes[:, 2:], input_shape, - im_shape, scale_factor) - # cls_ids, scores, tlwhs - pred_dets = bboxes - return pred_dets, pred_embs - - def scale_coords(self, coords, input_shape, im_shape, scale_factor): - ratio = scale_factor[0] - pad_w = (input_shape[1] - int(im_shape[1])) / 2 - pad_h = (input_shape[0] - int(im_shape[0])) / 2 - coords = paddle.cast(coords, 'float32') - coords[:, 0::2] -= pad_w - coords[:, 1::2] -= pad_h - coords[:, 0:4] /= ratio - coords[:, :4] = paddle.clip( - coords[:, :4], min=0, max=coords[:, :4].max()) - return coords.round() - - def get_emb_and_gt_outs(self, ide_outs, targets): - emb_and_gts = [] - for i, p_ide in enumerate(ide_outs): - t_conf = targets['tconf{}'.format(i)] - t_ide = targets['tide{}'.format(i)] - - p_ide = p_ide.transpose((0, 2, 3, 1)) - p_ide_flatten = paddle.reshape(p_ide, [-1, self.embedding_dim]) - - mask = t_conf > 0 - mask = paddle.cast(mask, dtype="int64") - emb_mask = mask.max(1).flatten() - emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() - if len(emb_mask_inds) > 0: - t_ide_flatten = paddle.reshape(t_ide.max(1), [-1, 1]) - tids = paddle.gather(t_ide_flatten, emb_mask_inds) - - embedding = paddle.gather(p_ide_flatten, emb_mask_inds) - embedding = self.emb_scale * F.normalize(embedding) - emb_and_gt = paddle.concat([embedding, tids], axis=1) - emb_and_gts.append(emb_and_gt) - - if len(emb_and_gts) > 0: - return paddle.concat(emb_and_gts, axis=0) - else: - return paddle.zeros((1, self.embedding_dim + 1)) - - def get_emb_outs(self, ide_outs): - emb_outs = [] - for i, p_ide in enumerate(ide_outs): - p_ide = p_ide.transpose((0, 2, 3, 1)) - - p_ide_repeat = paddle.tile(p_ide, [self.anchor_scales, 1, 1, 1]) - embedding = F.normalize(p_ide_repeat, axis=-1) - emb = paddle.reshape(embedding, [-1, self.embedding_dim]) - emb_outs.append(emb) - - if len(emb_outs) > 0: - return paddle.concat(emb_outs, axis=0) - else: - return paddle.zeros((1, self.embedding_dim)) diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/pplcnet_embedding.py b/pdfdet/models/Paddle/ppdet/modeling/reid/pplcnet_embedding.py deleted file mode 100644 index d360f89..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/pplcnet_embedding.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant -from paddle import ParamAttr -from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Linear -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingNormal, XavierNormal -from ppdet.core.workspace import register - -__all__ = ['PPLCNetEmbedding'] - - -# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. -# k: kernel_size -# in_c: input channel number in depthwise block -# out_c: output channel number in depthwise block -# s: stride in depthwise block -# use_se: whether to use SE block - -NET_CONFIG = { - "blocks2": - #k, in_c, out_c, s, use_se - [[3, 16, 32, 1, False]], - "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], - "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], - "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], - [5, 256, 256, 1, False], [5, 256, 256, 1, False], - [5, 256, 256, 1, False], [5, 256, 256, 1, False]], - "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] -} - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - num_groups=1): - super().__init__() - - self.conv = Conv2D( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=num_groups, - weight_attr=ParamAttr(initializer=KaimingNormal()), - bias_attr=False) - - self.bn = BatchNorm2D( - num_filters, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.hardswish = nn.Hardswish() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.hardswish(x) - return x - - -class DepthwiseSeparable(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - dw_size=3, - use_se=False): - super().__init__() - self.use_se = use_se - self.dw_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_channels, - filter_size=dw_size, - stride=stride, - num_groups=num_channels) - if use_se: - self.se = SEModule(num_channels) - self.pw_conv = ConvBNLayer( - num_channels=num_channels, - filter_size=1, - num_filters=num_filters, - stride=1) - - def forward(self, x): - x = self.dw_conv(x) - if self.use_se: - x = self.se(x) - x = self.pw_conv(x) - return x - - -class SEModule(nn.Layer): - def __init__(self, channel, reduction=4): - super().__init__() - self.avg_pool = AdaptiveAvgPool2D(1) - self.conv1 = Conv2D( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0) - self.relu = nn.ReLU() - self.conv2 = Conv2D( - in_channels=channel // reduction, - out_channels=channel, - 
kernel_size=1, - stride=1, - padding=0) - self.hardsigmoid = nn.Hardsigmoid() - - def forward(self, x): - identity = x - x = self.avg_pool(x) - x = self.conv1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.hardsigmoid(x) - x = paddle.multiply(x=identity, y=x) - return x - - -class PPLCNet(nn.Layer): - """ - PP-LCNet, see https://arxiv.org/abs/2109.15099. - This code is different from PPLCNet in ppdet/modeling/backbones/lcnet.py - or in PaddleClas, because the output is the flatten feature of last_conv. - - Args: - scale (float): Scale ratio of channels. - class_expand (int): Number of channels of conv feature. - """ - - def __init__(self, scale=1.0, class_expand=1280): - super(PPLCNet, self).__init__() - self.scale = scale - self.class_expand = class_expand - - self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - num_filters=make_divisible(16 * scale), - stride=2) - - self.blocks2 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) - ]) - - self.blocks3 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) - ]) - - self.blocks4 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) - ]) - - self.blocks5 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) - ]) - - self.blocks6 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) - ]) - - self.avg_pool = AdaptiveAvgPool2D(1) - self.last_conv = Conv2D( - in_channels=make_divisible(NET_CONFIG["blocks6"][-1][2] * scale), - out_channels=self.class_expand, - kernel_size=1, - stride=1, - padding=0, - bias_attr=False) - self.hardswish = nn.Hardswish() - self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) - - def forward(self, x): - x = self.conv1(x) - - x = self.blocks2(x) - x = self.blocks3(x) - x = self.blocks4(x) - x = self.blocks5(x) - x = self.blocks6(x) - - x = self.avg_pool(x) - x = self.last_conv(x) - x = self.hardswish(x) - x = self.flatten(x) - return x - - -class FC(nn.Layer): - def __init__(self, input_ch, output_ch): - super(FC, self).__init__() - weight_attr = ParamAttr(initializer=XavierNormal()) - self.fc = paddle.nn.Linear(input_ch, output_ch, weight_attr=weight_attr) - - def forward(self, x): - out = self.fc(x) - return out - - -@register -class PPLCNetEmbedding(nn.Layer): - """ - PPLCNet Embedding - - Args: - input_ch (int): Number of channels of input conv feature. - output_ch (int): Number of channels of output conv feature. 
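PPLCNetEmbedding below defaults to scale=2.5, and the FC neck's input_ch=1280 equals class_expand, the fixed output width of last_conv. Every trunk width is snapped by the make_divisible helper defined earlier in this file; a quick standalone check of how the defaults line up (make_divisible reproduced verbatim, printed values computed by hand):

def make_divisible(v, divisor=8, min_value=None):
    # Snap v to a multiple of `divisor`, never dropping below 90% of v.
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

print(make_divisible(16 * 2.5))   # 40: stem width at scale=2.5
print(make_divisible(512 * 2.5))  # 1280: channels entering last_conv at scale=2.5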
- """ - def __init__(self, scale=2.5, input_ch=1280, output_ch=512): - super(PPLCNetEmbedding, self).__init__() - self.backbone = PPLCNet(scale=scale) - self.neck = FC(input_ch, output_ch) - - def forward(self, x): - feat = self.backbone(x) - feat_out = self.neck(feat) - return feat_out diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/pyramidal_embedding.py b/pdfdet/models/Paddle/ppdet/modeling/reid/pyramidal_embedding.py deleted file mode 100644 index 6b2a76d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/pyramidal_embedding.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant -from paddle import ParamAttr -from .resnet import ResNet50, ResNet101 -from ppdet.core.workspace import register - -__all__ = ['PCBPyramid'] - - -@register -class PCBPyramid(nn.Layer): - """ - PCB (Part-based Convolutional Baseline), see https://arxiv.org/abs/1711.09349, - Pyramidal Person Re-IDentification, see https://arxiv.org/abs/1810.12193 - - Args: - input_ch (int): Number of channels of the input feature. - num_stripes (int): Number of sub-parts. - used_levels (tuple): Whether the level is used, 1 means used. - num_classes (int): Number of classes for identities, default 751 in - Market-1501 dataset. - last_conv_stride (int): Stride of the last conv. - last_conv_dilation (int): Dilation of the last conv. - num_conv_out_channels (int): Number of channels of conv feature. 
- """ - - def __init__(self, - input_ch=2048, - model_name='ResNet101', - num_stripes=6, - used_levels=(1, 1, 1, 1, 1, 1), - num_classes=751, - last_conv_stride=1, - last_conv_dilation=1, - num_conv_out_channels=128): - super(PCBPyramid, self).__init__() - self.num_stripes = num_stripes - self.used_levels = used_levels - self.num_classes = num_classes - - self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)] - self.num_branches = sum(self.num_in_each_level) - - assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) - self.base = eval(model_name)( - lr_mult=0.1, - last_conv_stride=last_conv_stride, - last_conv_dilation=last_conv_dilation) - self.dropout_layer = nn.Dropout(p=0.2) - self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch( - num_conv_out_channels, input_ch) - - def basic_branch(self, num_conv_out_channels, input_ch): - # the level indexes are defined from fine to coarse, - # the branch will contain one more part than that of its previous level - # the sliding step is set to 1 - pyramid_conv_list = nn.LayerList() - pyramid_fc_list = nn.LayerList() - - idx_levels = 0 - for idx_branches in range(self.num_branches): - if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): - idx_levels += 1 - - pyramid_conv_list.append( - nn.Sequential( - nn.Conv2D(input_ch, num_conv_out_channels, 1), - nn.BatchNorm2D(num_conv_out_channels), nn.ReLU())) - - idx_levels = 0 - for idx_branches in range(self.num_branches): - if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): - idx_levels += 1 - - fc = nn.Linear( - in_features=num_conv_out_channels, - out_features=self.num_classes, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.001)), - bias_attr=ParamAttr(initializer=Constant(value=0.))) - pyramid_fc_list.append(fc) - return pyramid_conv_list, pyramid_fc_list - - def pyramid_forward(self, feat): - each_stripe_size = int(feat.shape[2] / self.num_stripes) - - feat_list, logits_list = [], [] - idx_levels = 0 - used_branches = 0 - for idx_branches in range(self.num_branches): - if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): - idx_levels += 1 - idx_in_each_level = idx_branches - sum(self.num_in_each_level[ - 0:idx_levels]) - stripe_size_in_each_level = each_stripe_size * (idx_levels + 1) - start = idx_in_each_level * each_stripe_size - end = start + stripe_size_in_each_level - - k = feat.shape[-1] - local_feat_avgpool = F.avg_pool2d( - feat[:, :, start:end, :], - kernel_size=(stripe_size_in_each_level, k)) - local_feat_maxpool = F.max_pool2d( - feat[:, :, start:end, :], - kernel_size=(stripe_size_in_each_level, k)) - local_feat = local_feat_avgpool + local_feat_maxpool - - local_feat = self.pyramid_conv_list0[used_branches](local_feat) - local_feat = paddle.reshape( - local_feat, shape=[local_feat.shape[0], -1]) - feat_list.append(local_feat) - - local_logits = self.pyramid_fc_list0[used_branches]( - self.dropout_layer(local_feat)) - logits_list.append(local_logits) - - used_branches += 1 - - return feat_list, logits_list - - def forward(self, x): - feat = self.base(x) - assert feat.shape[2] % self.num_stripes == 0 - feat_list, logits_list = self.pyramid_forward(feat) - feat_out = paddle.concat(feat_list, axis=-1) - return feat_out diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet.py b/pdfdet/models/Paddle/ppdet/modeling/reid/resnet.py deleted file mode 100644 index 2e2a855..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet.py +++ /dev/null @@ -1,312 +0,0 @@ -# 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import math -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal - -__all__ = ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - num_channels, - num_filters, - filter_size, - stride=1, - dilation=1, - groups=1, - act=None, - lr_mult=1.0, - name=None, - data_format="NCHW"): - super(ConvBNLayer, self).__init__() - conv_stdv = filter_size * filter_size * num_filters - self._conv = nn.Conv2D( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - dilation=dilation, - groups=groups, - weight_attr=ParamAttr( - learning_rate=lr_mult, - initializer=Normal(0, math.sqrt(2. / conv_stdv))), - bias_attr=False, - data_format=data_format) - - self._batch_norm = nn.BatchNorm2D(num_filters) - self.act = act - - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - if self.act: - y = getattr(F, self.act)(y) - return y - - -class BottleneckBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - shortcut=True, - name=None, - lr_mult=1.0, - dilation=1, - data_format="NCHW"): - super(BottleneckBlock, self).__init__() - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - dilation=dilation, - act="relu", - lr_mult=lr_mult, - name=name + "_branch2a", - data_format=data_format) - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - dilation=dilation, - stride=stride, - act="relu", - lr_mult=lr_mult, - name=name + "_branch2b", - data_format=data_format) - self.conv2 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * 4, - filter_size=1, - dilation=dilation, - act=None, - lr_mult=lr_mult, - name=name + "_branch2c", - data_format=data_format) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters * 4, - filter_size=1, - dilation=dilation, - stride=stride, - lr_mult=lr_mult, - name=name + "_branch1", - data_format=data_format) - self.shortcut = shortcut - self._num_channels_out = num_filters * 4 - - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) - conv2 = self.conv2(conv1) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - y = paddle.add(x=short, y=conv2) - y = F.relu(y) - return y - - -class BasicBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - shortcut=True, - name=None, - data_format="NCHW"): - super(BasicBlock, self).__init__() - self.stride = stride - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, -
stride=stride, - act="relu", - name=name + "_branch2a", - data_format=data_format) - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - act=None, - name=name + "_branch2b", - data_format=data_format) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride, - name=name + "_branch1", - data_format=data_format) - self.shortcut = shortcut - - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - y = paddle.add(x=short, y=conv1) - y = F.relu(y) - return y - - -class ResNet(nn.Layer): - def __init__(self, - layers=50, - lr_mult=1.0, - last_conv_stride=2, - last_conv_dilation=1): - super(ResNet, self).__init__() - self.layers = layers - self.data_format = "NCHW" - self.input_image_channel = 3 - supported_layers = [18, 34, 50, 101, 152] - assert layers in supported_layers, \ - "supported layers are {} but input layer is {}".format( - supported_layers, layers) - if layers == 18: - depth = [2, 2, 2, 2] - elif layers == 34 or layers == 50: - depth = [3, 4, 6, 3] - elif layers == 101: - depth = [3, 4, 23, 3] - elif layers == 152: - depth = [3, 8, 36, 3] - num_channels = [64, 256, 512, - 1024] if layers >= 50 else [64, 64, 128, 256] - num_filters = [64, 128, 256, 512] - self.conv = ConvBNLayer( - num_channels=self.input_image_channel, - num_filters=64, - filter_size=7, - stride=2, - act="relu", - lr_mult=lr_mult, - name="conv1", - data_format=self.data_format) - self.pool2d_max = nn.MaxPool2D( - kernel_size=3, stride=2, padding=1, data_format=self.data_format) - self.block_list = [] - if layers >= 50: - for block in range(len(depth)): - shortcut = False - for i in range(depth[block]): - if layers in [101, 152] and block == 2: - if i == 0: - conv_name = "res" + str(block + 2) + "a" - else: - conv_name = "res" + str(block + 2) + "b" + str(i) - else: - conv_name = "res" + str(block + 2) + chr(97 + i) - if i != 0 or block == 0: - stride = 1 - elif block == len(depth) - 1: - stride = last_conv_stride - else: - stride = 2 - bottleneck_block = self.add_sublayer( - conv_name, - BottleneckBlock( - num_channels=num_channels[block] - if i == 0 else num_filters[block] * 4, - num_filters=num_filters[block], - stride=stride, - shortcut=shortcut, - name=conv_name, - lr_mult=lr_mult, - dilation=last_conv_dilation - if block == len(depth) - 1 else 1, - data_format=self.data_format)) - self.block_list.append(bottleneck_block) - shortcut = True - else: - for block in range(len(depth)): - shortcut = False - for i in range(depth[block]): - conv_name = "res" + str(block + 2) + chr(97 + i) - basic_block = self.add_sublayer( - conv_name, - BasicBlock( - num_channels=num_channels[block] - if i == 0 else num_filters[block], - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - shortcut=shortcut, - name=conv_name, - data_format=self.data_format)) - self.block_list.append(basic_block) - shortcut = True - - def forward(self, inputs): - y = self.conv(inputs) - y = self.pool2d_max(y) - for block in self.block_list: - y = block(y) - return y - - -def ResNet18(**args): - model = ResNet(layers=18, **args) - return model - - -def ResNet34(**args): - model = ResNet(layers=34, **args) - return model - - -def ResNet50(pretrained=None, **args): - model = ResNet(layers=50, **args) - if pretrained is not None: - if not (os.path.isdir(pretrained) or - os.path.exists(pretrained + '.pdparams')): - 
raise ValueError("Model pretrain path {} does not " - "exists.".format(pretrained)) - param_state_dict = paddle.load(pretrained + '.pdparams') - model.set_dict(param_state_dict) - return model - - -def ResNet101(pretrained=None, **args): - model = ResNet(layers=101, **args) - if pretrained is not None: - if not (os.path.isdir(pretrained) or - os.path.exists(pretrained + '.pdparams')): - raise ValueError("Model pretrain path {} does not " - "exists.".format(pretrained)) - param_state_dict = paddle.load(pretrained + '.pdparams') - model.set_dict(param_state_dict) - return model - - -def ResNet152(**args): - model = ResNet(layers=152, **args) - return model diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet_embedding.py b/pdfdet/models/Paddle/ppdet/modeling/reid/resnet_embedding.py deleted file mode 100644 index 28c11eb..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet_embedding.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import paddle -import paddle.nn.functional as F -from paddle import nn -from .resnet import ResNet50, ResNet101 -from ppdet.core.workspace import register - -__all__ = ['ResNetEmbedding'] - - -@register -class ResNetEmbedding(nn.Layer): - in_planes = 2048 - def __init__(self, model_name='ResNet50', last_stride=1): - super(ResNetEmbedding, self).__init__() - assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) - self.base = eval(model_name)(last_conv_stride=last_stride) - self.gap = nn.AdaptiveAvgPool2D(output_size=1) - self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) - self.bn = nn.BatchNorm1D(self.in_planes, bias_attr=False) - - def forward(self, x): - base_out = self.base(x) - global_feat = self.gap(base_out) - global_feat = self.flatten(global_feat) - global_feat = self.bn(global_feat) - return global_feat diff --git a/pdfdet/models/Paddle/ppdet/modeling/shape_spec.py b/pdfdet/models/Paddle/ppdet/modeling/shape_spec.py deleted file mode 100644 index 81601fd..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/shape_spec.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# The code is based on: -# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py - -from collections import namedtuple - - -class ShapeSpec( - namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): - def __new__(cls, channels=None, height=None, width=None, stride=None): - return super(ShapeSpec, cls).__new__(cls, channels, height, width, - stride) diff --git a/pdfdet/models/Paddle/ppdet/modeling/ssod/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/ssod/__init__.py deleted file mode 100644 index e758857..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ssod/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import utils -from . import losses - -from .utils import * -from .losses import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/ssod/losses.py b/pdfdet/models/Paddle/ppdet/modeling/ssod/losses.py deleted file mode 100644 index e4c5038..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ssod/losses.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
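The `ShapeSpec` removed above is how backbones and necks advertise their output shapes to the `from_config` hooks that appear later in this diff (e.g. `[i.channels for i in input_shape]`). A minimal sketch of that handshake, with illustrative values:
```
from collections import namedtuple

# mirrors the deleted ShapeSpec: a namedtuple whose fields may be None
ShapeSpec = namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])

# illustrative FPN-style outputs; from_config reads only `channels`
input_shape = [
    ShapeSpec(512, None, None, 8),
    ShapeSpec(1024, None, None, 16),
    ShapeSpec(2048, None, None, 32),
]
in_feats_channel = [i.channels for i in input_shape]  # [512, 1024, 2048]
```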
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.losses.iou_loss import GIoULoss -from .utils import QFLv2 - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'SSODFCOSLoss', - 'SSODPPYOLOELoss', -] - - -@register -class SSODFCOSLoss(nn.Layer): - def __init__(self, loss_weight=1.0): - super(SSODFCOSLoss, self).__init__() - self.loss_weight = loss_weight - - def forward(self, student_head_outs, teacher_head_outs, train_cfg): - # for semi-det distill - student_logits, student_deltas, student_quality = student_head_outs - teacher_logits, teacher_deltas, teacher_quality = teacher_head_outs - nc = student_logits[0].shape[1] - - student_logits = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, nc]) - for _ in student_logits - ], - axis=0) - teacher_logits = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, nc]) - for _ in teacher_logits - ], - axis=0) - - student_deltas = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 4]) - for _ in student_deltas - ], - axis=0) - teacher_deltas = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 4]) - for _ in teacher_deltas - ], - axis=0) - - student_quality = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 1]) - for _ in student_quality - ], - axis=0) - teacher_quality = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 1]) - for _ in teacher_quality - ], - axis=0) - - ratio = train_cfg.get('ratio', 0.01) - with paddle.no_grad(): - # Region Selection - count_num = int(teacher_logits.shape[0] * ratio) - teacher_probs = F.sigmoid(teacher_logits) - max_vals = paddle.max(teacher_probs, 1) - sorted_vals, sorted_inds = paddle.topk(max_vals, - teacher_logits.shape[0]) - mask = paddle.zeros_like(max_vals) - mask[sorted_inds[:count_num]] = 1. 
- fg_num = sorted_vals[:count_num].sum() - b_mask = mask > 0 - - # distill_loss_cls - loss_logits = QFLv2( - F.sigmoid(student_logits), - teacher_probs, - weight=mask, - reduction="sum") / fg_num - - # distill_loss_box - inputs = paddle.concat( - (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), - axis=-1) - targets = paddle.concat( - (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), - axis=-1) - iou_loss = GIoULoss(reduction='mean') - loss_deltas = iou_loss(inputs, targets) - - # distill_loss_quality - loss_quality = F.binary_cross_entropy( - F.sigmoid(student_quality[b_mask]), - F.sigmoid(teacher_quality[b_mask]), - reduction='mean') - - return { - "distill_loss_cls": loss_logits, - "distill_loss_box": loss_deltas, - "distill_loss_quality": loss_quality, - "fg_sum": fg_num, - } - - -@register -class SSODPPYOLOELoss(nn.Layer): - def __init__(self, loss_weight=1.0): - super(SSODPPYOLOELoss, self).__init__() - self.loss_weight = loss_weight - - def forward(self, student_head_outs, teacher_head_outs, train_cfg): - # for semi-det distill - # student_probs: already sigmoid - student_probs, student_deltas, student_dfl = student_head_outs - teacher_probs, teacher_deltas, teacher_dfl = teacher_head_outs - bs, l, nc = student_probs.shape[:] # bs, l, num_classes - bs, l, _, reg_ch = student_dfl.shape[:] # bs, l, 4, reg_ch - student_probs = student_probs.reshape([-1, nc]) - teacher_probs = teacher_probs.reshape([-1, nc]) - student_deltas = student_deltas.reshape([-1, 4]) - teacher_deltas = teacher_deltas.reshape([-1, 4]) - student_dfl = student_dfl.reshape([-1, 4, reg_ch]) - teacher_dfl = teacher_dfl.reshape([-1, 4, reg_ch]) - - ratio = train_cfg.get('ratio', 0.01) - - # for contrast loss - curr_iter = train_cfg['curr_iter'] - st_iter = train_cfg['st_iter'] - if curr_iter == st_iter + 1: - # start semi-det training - self.queue_ptr = 0 - self.queue_size = int(bs * l * ratio) - self.queue_feats = paddle.zeros([self.queue_size, nc]) - self.queue_probs = paddle.zeros([self.queue_size, nc]) - contrast_loss_cfg = train_cfg['contrast_loss'] - temperature = contrast_loss_cfg.get('temperature', 0.2) - alpha = contrast_loss_cfg.get('alpha', 0.9) - smooth_iter = contrast_loss_cfg.get('smooth_iter', 100) + st_iter - - with paddle.no_grad(): - # Region Selection - count_num = int(teacher_probs.shape[0] * ratio) - max_vals = paddle.max(teacher_probs, 1) - sorted_vals, sorted_inds = paddle.topk(max_vals, - teacher_probs.shape[0]) - mask = paddle.zeros_like(max_vals) - mask[sorted_inds[:count_num]] = 1. - fg_num = sorted_vals[:count_num].sum() - b_mask = mask > 0. 
- - # for contrast loss - probs = teacher_probs[b_mask].detach() - if curr_iter > smooth_iter: # memory-smoothing - A = paddle.exp( - paddle.mm(teacher_probs[b_mask], self.queue_probs.t()) / - temperature) - A = A / A.sum(1, keepdim=True) - probs = alpha * probs + (1 - alpha) * paddle.mm( - A, self.queue_probs) - n = student_probs[b_mask].shape[0] - # update memory bank - self.queue_feats[self.queue_ptr:self.queue_ptr + - n, :] = teacher_probs[b_mask].detach() - self.queue_probs[self.queue_ptr:self.queue_ptr + - n, :] = teacher_probs[b_mask].detach() - self.queue_ptr = (self.queue_ptr + n) % self.queue_size - - # embedding similarity - sim = paddle.exp( - paddle.mm(student_probs[b_mask], teacher_probs[b_mask].t()) / 0.2) - sim_probs = sim / sim.sum(1, keepdim=True) - # pseudo-label graph with self-loop - Q = paddle.mm(probs, probs.t()) - Q.fill_diagonal_(1) - pos_mask = (Q >= 0.5).astype('float32') - Q = Q * pos_mask - Q = Q / Q.sum(1, keepdim=True) - # contrastive loss - loss_contrast = -(paddle.log(sim_probs + 1e-7) * Q).sum(1) - loss_contrast = loss_contrast.mean() - - # distill_loss_cls - loss_cls = QFLv2( - student_probs, teacher_probs, weight=mask, reduction="sum") / fg_num - - # distill_loss_iou - inputs = paddle.concat( - (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), - -1) - targets = paddle.concat( - (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), - -1) - iou_loss = GIoULoss(reduction='mean') - loss_iou = iou_loss(inputs, targets) - - # distill_loss_dfl - loss_dfl = F.cross_entropy( - student_dfl[b_mask].reshape([-1, reg_ch]), - teacher_dfl[b_mask].reshape([-1, reg_ch]), - soft_label=True, - reduction='mean') - - return { - "distill_loss_cls": loss_cls, - "distill_loss_iou": loss_iou, - "distill_loss_dfl": loss_dfl, - "distill_loss_contrast": loss_contrast, - "fg_sum": fg_num, - } diff --git a/pdfdet/models/Paddle/ppdet/modeling/ssod/utils.py b/pdfdet/models/Paddle/ppdet/modeling/ssod/utils.py deleted file mode 100644 index 6c9e86f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ssod/utils.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
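Both SSOD distillation losses deleted above gate their terms with the same region-selection step: keep only the top `ratio` fraction of locations, ranked by the teacher's peak class score. A standalone sketch of that masking logic, with illustrative shapes:
```
import paddle

teacher_probs = paddle.rand([1000, 80])  # illustrative: 1000 locations, 80 classes
ratio = 0.01

count_num = int(teacher_probs.shape[0] * ratio)
max_vals = paddle.max(teacher_probs, 1)        # teacher's peak score per location
sorted_vals, sorted_inds = paddle.topk(max_vals, teacher_probs.shape[0])
mask = paddle.zeros_like(max_vals)
mask[sorted_inds[:count_num]] = 1.             # soft weight fed to QFLv2
fg_num = sorted_vals[:count_num].sum()         # normalizer for the cls term
b_mask = mask > 0                              # selector for box/quality terms
```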
- -import paddle -import paddle.nn.functional as F - - -def align_weak_strong_shape(data_weak, data_strong): - max_shape_x = max(data_strong['image'].shape[2], - data_weak['image'].shape[2]) - max_shape_y = max(data_strong['image'].shape[3], - data_weak['image'].shape[3]) - - scale_x_s = max_shape_x / data_strong['image'].shape[2] - scale_y_s = max_shape_y / data_strong['image'].shape[3] - scale_x_w = max_shape_x / data_weak['image'].shape[2] - scale_y_w = max_shape_y / data_weak['image'].shape[3] - target_size = [max_shape_x, max_shape_y] - - if scale_x_s != 1 or scale_y_s != 1: - data_strong['image'] = F.interpolate( - data_strong['image'], - size=target_size, - mode='bilinear', - align_corners=False) - if 'gt_bbox' in data_strong: - gt_bboxes = data_strong['gt_bbox'].numpy() - for i in range(len(gt_bboxes)): - if len(gt_bboxes[i]) > 0: - gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_s - gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_s - data_strong['gt_bbox'] = paddle.to_tensor(gt_bboxes) - - if scale_x_w != 1 or scale_y_w != 1: - data_weak['image'] = F.interpolate( - data_weak['image'], - size=target_size, - mode='bilinear', - align_corners=False) - if 'gt_bbox' in data_weak: - gt_bboxes = data_weak['gt_bbox'].numpy() - for i in range(len(gt_bboxes)): - if len(gt_bboxes[i]) > 0: - gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_w - gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_w - data_weak['gt_bbox'] = paddle.to_tensor(gt_bboxes) - return data_weak, data_strong - - -def QFLv2(pred_sigmoid, - teacher_sigmoid, - weight=None, - beta=2.0, - reduction='mean'): - pt = pred_sigmoid - zerolabel = paddle.zeros_like(pt) - loss = F.binary_cross_entropy( - pred_sigmoid, zerolabel, reduction='none') * pt.pow(beta) - pos = weight > 0 - - pt = teacher_sigmoid[pos] - pred_sigmoid[pos] - loss[pos] = F.binary_cross_entropy( - pred_sigmoid[pos], teacher_sigmoid[pos], - reduction='none') * pt.pow(beta) - - valid = weight >= 0 - if reduction == "mean": - loss = loss[valid].mean() - elif reduction == "sum": - loss = loss[valid].sum() - return loss - - -def filter_invalid(bbox, label=None, score=None, thr=0.0, min_size=0): - if score.numel() > 0: - soft_score = score.max(-1) - valid = soft_score >= thr - bbox = bbox[valid] - - if label is not None: - label = label[valid] - score = score[valid] - if min_size is not None and bbox.shape[0] > 0: - bw = bbox[:, 2] - bh = bbox[:, 3] - valid = (bw > min_size) & (bh > min_size) - bbox = bbox[valid] - - if label is not None: - label = label[valid] - score = score[valid] - - return bbox, label, score diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/__init__.py deleted file mode 100644 index 33a1240..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import detr_transformer -from . 
import utils -from . import matchers -from . import position_encoding -from . import deformable_transformer -from . import dino_transformer -from . import group_detr_transformer -from . import mask_dino_transformer -from . import rtdetr_transformer -from . import hybrid_encoder - -from .detr_transformer import * -from .utils import * -from .matchers import * -from .position_encoding import * -from .deformable_transformer import * -from .dino_transformer import * -from .petr_transformer import * -from .group_detr_transformer import * -from .mask_dino_transformer import * -from .rtdetr_transformer import * -from .hybrid_encoder import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/deformable_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/deformable_transformer.py deleted file mode 100644 index 97a9314..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/deformable_transformer.py +++ /dev/null @@ -1,646 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from .position_encoding import PositionEmbedding -from .utils import _get_clones, get_valid_ratio -from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ - -__all__ = ['DeformableTransformer'] - - -class MSDeformableAttention(nn.Layer): - def __init__(self, - embed_dim=256, - num_heads=8, - num_levels=4, - num_points=4, - lr_mult=0.1): - """ - Multi-Scale Deformable Attention Module - """ - super(MSDeformableAttention, self).__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.num_levels = num_levels - self.num_points = num_points - self.total_points = num_heads * num_levels * num_points - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - self.sampling_offsets = nn.Linear( - embed_dim, - self.total_points * 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - self.attention_weights = nn.Linear(embed_dim, self.total_points) - self.value_proj = nn.Linear(embed_dim, embed_dim) - self.output_proj = nn.Linear(embed_dim, embed_dim) - try: - # use cuda op - from deformable_detr_ops import ms_deformable_attn - except: - # use paddle func - from .utils import deformable_attention_core_func as ms_deformable_attn - self.ms_deformable_attn_core = ms_deformable_attn - - self._reset_parameters() - - def _reset_parameters(self): - # sampling_offsets - constant_(self.sampling_offsets.weight) - thetas = 
paddle.arange( - self.num_heads, - dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) - grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) - grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) - grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( - [1, self.num_levels, self.num_points, 1]) - scaling = paddle.arange( - 1, self.num_points + 1, - dtype=paddle.float32).reshape([1, 1, -1, 1]) - grid_init *= scaling - self.sampling_offsets.bias.set_value(grid_init.flatten()) - # attention_weights - constant_(self.attention_weights.weight) - constant_(self.attention_weights.bias) - # proj - xavier_uniform_(self.value_proj.weight) - constant_(self.value_proj.bias) - xavier_uniform_(self.output_proj.weight) - constant_(self.output_proj.bias) - - def forward(self, - query, - reference_points, - value, - value_spatial_shapes, - value_level_start_index, - value_mask=None): - """ - Args: - query (Tensor): [bs, query_length, C] - reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area - value (Tensor): [bs, value_length, C] - value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] - value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, Len_q = query.shape[:2] - Len_v = value.shape[1] - assert int(value_spatial_shapes.prod(1).sum()) == Len_v - - value = self.value_proj(value) - if value_mask is not None: - value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - value *= value_mask - value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) - - sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) - sampling_locations = reference_points.reshape([ - bs, Len_q, 1, self.num_levels, 1, 2 - ]) + sampling_offsets / offset_normalizer - elif reference_points.shape[-1] == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) - else: - raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) - - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - output = self.output_proj(output) - - return output - - -class DeformableTransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_points=4, - lr_mult=0.1, - weight_attr=None, - bias_attr=None): - super(DeformableTransformerEncoderLayer, self).__init__() - # self attention - self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, lr_mult) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.activation = getattr(F, activation) - self.dropout2 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - self.dropout3 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, src): - src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) - src = src + self.dropout3(src2) - src = self.norm2(src) - return src - - def forward(self, - src, - reference_points, - spatial_shapes, - level_start_index, - src_mask=None, - query_pos_embed=None): - # self attention - src2 = self.self_attn( - self.with_pos_embed(src, query_pos_embed), reference_points, src, - spatial_shapes, level_start_index, src_mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - # ffn - src = self.forward_ffn(src) - - return src - - -class DeformableTransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers): - super(DeformableTransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): - valid_ratios = valid_ratios.unsqueeze(1) - reference_points = [] - for i, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = paddle.meshgrid( - paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) - ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * - H) - ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * - W) - reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) - reference_points = paddle.concat(reference_points, 1).unsqueeze(2) - reference_points = reference_points * valid_ratios - return reference_points - - def forward(self, - feat, - spatial_shapes, - level_start_index, - feat_mask=None, - query_pos_embed=None, - valid_ratios=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [feat.shape[0], spatial_shapes.shape[0], 2]) - reference_points = self.get_reference_points(spatial_shapes, - valid_ratios) - for layer in self.layers: - feat = layer(feat, reference_points, spatial_shapes, - level_start_index, feat_mask, query_pos_embed) - - return feat - - -class DeformableTransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_points=4, - lr_mult=0.1, - weight_attr=None, - 
bias_attr=None): - super(DeformableTransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # cross attention - self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, lr_mult) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - return tgt - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - tgt2 = self.self_attn(q, k, value=tgt) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # ffn - tgt = self.forward_ffn(tgt) - - return tgt - - -class DeformableTransformerDecoder(nn.Layer): - def __init__(self, decoder_layer, num_layers, return_intermediate=False): - super(DeformableTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.return_intermediate = return_intermediate - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - memory_mask=None, - query_pos_embed=None): - output = tgt - intermediate = [] - for lid, layer in enumerate(self.layers): - output = layer(output, reference_points, memory, - memory_spatial_shapes, memory_level_start_index, - memory_mask, query_pos_embed) - - if self.return_intermediate: - intermediate.append(output) - - if self.return_intermediate: - return paddle.stack(intermediate) - - return output.unsqueeze(0) - - -@register -class DeformableTransformer(nn.Layer): - __shared__ = ['hidden_dim'] - - def __init__(self, - num_queries=300, - position_embed_type='sine', - return_intermediate_dec=True, - in_feats_channel=[512, 1024, 2048], - num_feature_levels=4, - num_encoder_points=4, - num_decoder_points=4, - hidden_dim=256, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - lr_mult=0.1, - pe_temperature=10000, - pe_offset=-0.5): - super(DeformableTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
- assert len(in_feats_channel) <= num_feature_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_feature_levels = num_feature_levels - - encoder_layer = DeformableTransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - num_feature_levels, num_encoder_points, lr_mult) - self.encoder = DeformableTransformerEncoder(encoder_layer, - num_encoder_layers) - - decoder_layer = DeformableTransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - num_feature_levels, num_decoder_points) - self.decoder = DeformableTransformerDecoder( - decoder_layer, num_decoder_layers, return_intermediate_dec) - - self.level_embed = nn.Embedding(num_feature_levels, hidden_dim) - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) - - self.reference_points = nn.Linear( - hidden_dim, - 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - self.input_proj = nn.LayerList() - for in_channels in in_feats_channel: - self.input_proj.append( - nn.Sequential( - nn.Conv2D( - in_channels, hidden_dim, kernel_size=1), - nn.GroupNorm(32, hidden_dim))) - in_channels = in_feats_channel[-1] - for _ in range(num_feature_levels - len(in_feats_channel)): - self.input_proj.append( - nn.Sequential( - nn.Conv2D( - in_channels, - hidden_dim, - kernel_size=3, - stride=2, - padding=1), - nn.GroupNorm(32, hidden_dim))) - in_channels = hidden_dim - - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset, - eps=1e-4) - - self._reset_parameters() - - def _reset_parameters(self): - normal_(self.level_embed.weight) - normal_(self.tgt_embed.weight) - normal_(self.query_pos_embed.weight) - xavier_uniform_(self.reference_points.weight) - constant_(self.reference_points.bias) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - constant_(l[0].bias) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_feats_channel': [i.channels for i in input_shape], } - - def forward(self, src_feats, src_mask=None, *args, **kwargs): - srcs = [] - for i in range(len(src_feats)): - srcs.append(self.input_proj[i](src_feats[i])) - if self.num_feature_levels > len(srcs): - len_srcs = len(srcs) - for i in range(len_srcs, self.num_feature_levels): - if i == len_srcs: - srcs.append(self.input_proj[i](src_feats[-1])) - else: - srcs.append(self.input_proj[i](srcs[-1])) - src_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for level, src in enumerate(srcs): - src_shape = paddle.shape(src) - bs = src_shape[0:1] - h = src_shape[2:3] - w = src_shape[3:4] - spatial_shapes.append(paddle.concat([h, w])) - src = src.flatten(2).transpose([0, 2, 1]) - src_flatten.append(src) - if src_mask is not None: - mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[level] - lvl_pos_embed_flatten.append(lvl_pos_embed) - mask = mask.flatten(1) - mask_flatten.append(mask) - src_flatten = paddle.concat(src_flatten, 1) - mask_flatten = None if src_mask is None else paddle.concat(mask_flatten, - 1) - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [l, 2] - 
spatial_shapes = paddle.to_tensor(
-            paddle.stack(spatial_shapes).astype('int64'))
-        # [l], start index of each level
-        level_start_index = paddle.concat([
-            paddle.zeros(
-                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
-        ])
-        # [b, l, 2]
-        valid_ratios = paddle.stack(valid_ratios, 1)
-
-        # encoder
-        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
-                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)
-
-        # prepare input for decoder
-        bs, _, c = memory.shape
-        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
-        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
-        reference_points = F.sigmoid(self.reference_points(query_embed))
-        reference_points_input = reference_points.unsqueeze(
-            2) * valid_ratios.unsqueeze(1)
-
-        # decoder
-        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
-                          level_start_index, mask_flatten, query_embed)
-
-        return (hs, memory, reference_points)
-
-
-class QRDeformableTransformerDecoder(DeformableTransformerDecoder):
-    def __init__(self, decoder_layer, num_layers,
-                 start_q=None, end_q=None, return_intermediate=False):
-        super(QRDeformableTransformerDecoder, self).__init__(
-            decoder_layer, num_layers, return_intermediate=return_intermediate)
-        self.start_q = start_q
-        self.end_q = end_q
-
-    def forward(self,
-                tgt,
-                reference_points,
-                memory,
-                memory_spatial_shapes,
-                memory_level_start_index,
-                memory_mask=None,
-                query_pos_embed=None):
-
-        if not self.training:
-            return super(QRDeformableTransformerDecoder, self).forward(
-                tgt, reference_points,
-                memory, memory_spatial_shapes,
-                memory_level_start_index,
-                memory_mask=memory_mask,
-                query_pos_embed=query_pos_embed)
-
-        batchsize = tgt.shape[0]
-        query_list_reserve = [tgt]
-        intermediate = []
-        for lid, layer in enumerate(self.layers):
-
-            start_q = self.start_q[lid]
-            end_q = self.end_q[lid]
-            query_list = query_list_reserve.copy()[start_q:end_q]
-
-            # prepare for parallel process
-            output = paddle.concat(query_list, axis=0)
-            fakesetsize = int(output.shape[0] / batchsize)
-            reference_points_tiled = reference_points.tile([fakesetsize, 1, 1, 1])
-
-            memory_tiled = memory.tile([fakesetsize, 1, 1])
-            query_pos_embed_tiled = query_pos_embed.tile([fakesetsize, 1, 1])
-            memory_mask_tiled = memory_mask.tile([fakesetsize, 1])
-
-            output = layer(output, reference_points_tiled, memory_tiled,
-                           memory_spatial_shapes, memory_level_start_index,
-                           memory_mask_tiled, query_pos_embed_tiled)
-
-            for i in range(fakesetsize):
-                query_list_reserve.append(output[batchsize*i:batchsize*(i+1)])
-
-            if self.return_intermediate:
-                for i in range(fakesetsize):
-                    intermediate.append(output[batchsize*i:batchsize*(i+1)])
-
-        if self.return_intermediate:
-            return paddle.stack(intermediate)
-
-        return output.unsqueeze(0)
-
-
-@register
-class QRDeformableTransformer(DeformableTransformer):
-
-    def __init__(self,
-                 num_queries=300,
-                 position_embed_type='sine',
-                 return_intermediate_dec=True,
-                 in_feats_channel=[512, 1024, 2048],
-                 num_feature_levels=4,
-                 num_encoder_points=4,
-                 num_decoder_points=4,
-                 hidden_dim=256,
-                 nhead=8,
-                 num_encoder_layers=6,
-                 num_decoder_layers=6,
-                 dim_feedforward=1024,
-                 dropout=0.1,
-                 activation="relu",
-                 lr_mult=0.1,
-                 pe_temperature=10000,
-                 pe_offset=-0.5,
-                 start_q=None,
-                 end_q=None):
-        super(QRDeformableTransformer, self).__init__(
-            num_queries=num_queries,
-            position_embed_type=position_embed_type,
-            return_intermediate_dec=return_intermediate_dec,
-            in_feats_channel=in_feats_channel,
-            num_feature_levels=num_feature_levels,
num_encoder_points=num_encoder_points, - num_decoder_points=num_decoder_points, - hidden_dim=hidden_dim, - nhead=nhead, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dim_feedforward=dim_feedforward, - dropout=dropout, - activation=activation, - lr_mult=lr_mult, - pe_temperature=pe_temperature, - pe_offset=pe_offset) - - decoder_layer = DeformableTransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - num_feature_levels, num_decoder_points) - self.decoder = QRDeformableTransformerDecoder( - decoder_layer, num_decoder_layers, start_q, end_q, return_intermediate_dec) diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/detr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/detr_transformer.py deleted file mode 100644 index efeb320..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/detr_transformer.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention, _convert_attention_mask -from .position_encoding import PositionEmbedding -from .utils import _get_clones -from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_ - -__all__ = ['DETRTransformer'] - - -class TransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerEncoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = 
self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -class TransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward(self, src, src_mask=None, pos_embed=None): - output = src - for layer in self.layers: - output = layer(output, src_mask=src_mask, pos_embed=pos_embed) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerDecoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - pos_embed=None, - query_pos_embed=None): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - q = k = self.with_pos_embed(tgt, query_pos_embed) - tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm2(tgt) - q = self.with_pos_embed(tgt, query_pos_embed) - k = self.with_pos_embed(memory, pos_embed) - tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) - tgt = residual + self.dropout2(tgt) - if not self.normalize_before: - tgt = self.norm2(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm3(tgt) - tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = residual + self.dropout3(tgt) - if not self.normalize_before: - tgt = self.norm3(tgt) - return tgt - - -class TransformerDecoder(nn.Layer): - def __init__(self, - decoder_layer, - num_layers, - norm=None, - return_intermediate=False): - super(TransformerDecoder, self).__init__() - self.layers = 
_get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.return_intermediate = return_intermediate - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - pos_embed=None, - query_pos_embed=None): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - output = tgt - intermediate = [] - for layer in self.layers: - output = layer( - output, - memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - pos_embed=pos_embed, - query_pos_embed=query_pos_embed) - if self.return_intermediate: - intermediate.append(self.norm(output)) - - if self.norm is not None: - output = self.norm(output) - - if self.return_intermediate: - return paddle.stack(intermediate) - - return output.unsqueeze(0) - - -@register -class DETRTransformer(nn.Layer): - __shared__ = ['hidden_dim'] - - def __init__(self, - num_queries=100, - position_embed_type='sine', - return_intermediate_dec=True, - backbone_num_channels=2048, - hidden_dim=256, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - pe_temperature=10000, - pe_offset=0., - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(DETRTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'],\ - f'ValueError: position_embed_type not supported {position_embed_type}!' - self.hidden_dim = hidden_dim - self.nhead = nhead - - encoder_layer = TransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, - encoder_norm) - - decoder_layer = TransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - decoder_norm = nn.LayerNorm(hidden_dim) - self.decoder = TransformerDecoder( - decoder_layer, - num_decoder_layers, - decoder_norm, - return_intermediate=return_intermediate_dec) - - self.input_proj = nn.Conv2D( - backbone_num_channels, hidden_dim, kernel_size=1) - self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - conv_init_(self.input_proj) - normal_(self.query_pos_embed.weight) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'backbone_num_channels': [i.channels for i in input_shape][-1], - } - - def _convert_attention_mask(self, mask): - return (mask - 1.0) * 1e9 - - def forward(self, src, src_mask=None, *args, **kwargs): - r""" - Applies a Transformer model on the inputs. - - Parameters: - src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]]. - src_mask (Tensor, optional): A tensor used in multi-head attention - to prevents attention to some unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - [bs, H, W]`. When the data type is bool, the unwanted positions - have `False` values and the others have `True` values. When the - data type is int, the unwanted positions have 0 values and the - others have 1 values. 
When the data type is float, the unwanted
-                positions have `-INF` values and the others have 0 values.
-                It can be None when no positions need to be masked.
-                Default None.
-
-        Returns:
-            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
-            memory (Tensor): [batch_size, hidden_dim, h, w]
-        """
-        # use last level feature map
-        src_proj = self.input_proj(src[-1])
-        bs, c, h, w = paddle.shape(src_proj)
-        # flatten [B, C, H, W] to [B, HxW, C]
-        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
-        if src_mask is not None:
-            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
-        else:
-            src_mask = paddle.ones([bs, h, w])
-        pos_embed = self.position_embedding(src_mask).flatten(1, 2)
-
-        if self.training:
-            src_mask = self._convert_attention_mask(src_mask)
-            src_mask = src_mask.reshape([bs, 1, 1, h * w])
-        else:
-            src_mask = None
-
-        memory = self.encoder(
-            src_flatten, src_mask=src_mask, pos_embed=pos_embed)
-
-        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
-            [bs, 1, 1])
-        tgt = paddle.zeros_like(query_pos_embed)
-        output = self.decoder(
-            tgt,
-            memory,
-            memory_mask=src_mask,
-            pos_embed=pos_embed,
-            query_pos_embed=query_pos_embed)
-
-        if self.training:
-            src_mask = src_mask.reshape([bs, 1, 1, h, w])
-        else:
-            src_mask = None
-
-        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
-                src_proj, src_mask)
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/dino_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/dino_transformer.py
deleted file mode 100644
index d08a0ad..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/dino_transformer.py
+++ /dev/null
@@ -1,528 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
-# Copyright (c) 2020 SenseTime. All Rights Reserved.
-# Modified from detrex (https://github.com/IDEA-Research/detrex)
-# Copyright 2022 The IDEA Authors. All rights reserved.
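The `_convert_attention_mask` helper deleted above turns a {0, 1} validity mask into an additive bias via `(mask - 1.0) * 1e9`: valid positions contribute 0 and padded positions a large negative value that vanishes after softmax. A tiny sketch:
```
import paddle
import paddle.nn.functional as F

mask = paddle.to_tensor([[1., 1., 0.]])  # 1 = keep, 0 = padding
attn_bias = (mask - 1.0) * 1e9           # [[0., 0., -1e9]]
scores = paddle.zeros([1, 3]) + attn_bias
print(F.softmax(scores))                 # padding position gets ~0 weight
```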
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from .position_encoding import PositionEmbedding -from ..heads.detr_head import MLP -from .deformable_transformer import (MSDeformableAttention, - DeformableTransformerEncoderLayer, - DeformableTransformerEncoder) -from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, - bias_init_with_prob) -from .utils import (_get_clones, get_valid_ratio, - get_contrastive_denoising_training_group, - get_sine_pos_embed, inverse_sigmoid) - -__all__ = ['DINOTransformer'] - - -class DINOTransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - lr_mult=1.0, - weight_attr=None, - bias_attr=None): - super(DINOTransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # cross attention - self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, lr_mult) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - if attn_mask is not None: - attn_mask = paddle.where( - attn_mask.astype('bool'), - paddle.zeros(attn_mask.shape, tgt.dtype), - paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # ffn - tgt2 = self.forward_ffn(tgt) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - - return tgt - - -class DINOTransformerDecoder(nn.Layer): - def __init__(self, - hidden_dim, - decoder_layer, - num_layers, - weight_attr=None, - bias_attr=None): - super(DINOTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - self.norm = nn.LayerNorm( - hidden_dim, 
weight_attr=weight_attr, bias_attr=bias_attr) - - def forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - query_pos_head, - valid_ratios=None, - attn_mask=None, - memory_mask=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [memory.shape[0], memory_spatial_shapes.shape[0], 2]) - - output = tgt - intermediate = [] - inter_bboxes = [] - ref_points = F.sigmoid(ref_points_unact) - for i, layer in enumerate(self.layers): - reference_points_input = ref_points.detach().unsqueeze( - 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) - query_pos_embed = get_sine_pos_embed( - reference_points_input[..., 0, :], self.hidden_dim // 2) - query_pos_embed = query_pos_head(query_pos_embed) - - output = layer(output, reference_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - - ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - ref_points.detach())) - - intermediate.append(self.norm(output)) - inter_bboxes.append(ref_points) - - return paddle.stack(intermediate), paddle.stack(inter_bboxes) - - -@register -class DINOTransformer(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=900, - position_embed_type='sine', - in_feats_channel=[512, 1024, 2048], - num_levels=4, - num_encoder_points=4, - num_decoder_points=4, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - lr_mult=1.0, - pe_temperature=10000, - pe_offset=-0.5, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=True, - eps=1e-2): - super(DINOTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
- assert len(in_feats_channel) <= num_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - - weight_attr = ParamAttr(regularizer=L2Decay(0.0)) - bias_attr = ParamAttr(regularizer=L2Decay(0.0)) - # backbone feature projection - self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) - - # Transformer module - encoder_layer = DeformableTransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_encoder_points, lr_mult, weight_attr, bias_attr) - self.encoder = DeformableTransformerEncoder(encoder_layer, - num_encoder_layers) - decoder_layer = DINOTransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_decoder_points, lr_mult, weight_attr, bias_attr) - self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers, weight_attr, - bias_attr) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # position embedding - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - self.level_embed = nn.Embedding(num_levels, hidden_dim) - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(2 * hidden_dim, - hidden_dim, - hidden_dim, - num_layers=2) - - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) - self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - # decoder head - self.dec_score_head = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - linear_init_(self.enc_score_head) - constant_(self.enc_score_head.bias, bias_cls) - constant_(self.enc_bbox_head.layers[-1].weight) - constant_(self.enc_bbox_head.layers[-1].bias) - for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): - linear_init_(cls_) - constant_(cls_.bias, bias_cls) - constant_(reg_.layers[-1].weight) - constant_(reg_.layers[-1].bias) - - linear_init_(self.enc_output[0]) - xavier_uniform_(self.enc_output[0].weight) - normal_(self.level_embed.weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - constant_(l[0].bias) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_feats_channel': [i.channels for i in input_shape], } - - def _build_input_proj_layer(self, - in_feats_channel, - weight_attr=None, - bias_attr=None): - self.input_proj = nn.LayerList() - 
for in_channels in in_feats_channel: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, self.hidden_dim, kernel_size=1)), ( - 'norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = in_feats_channel[-1] - for _ in range(self.num_levels - len(in_feats_channel)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1)), ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats, pad_mask=None): - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - - # get encoder inputs - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for i, feat in enumerate(proj_feats): - bs, _, h, w = paddle.shape(feat) - spatial_shapes.append(paddle.stack([h, w])) - # [b,c,h,w] -> [b,h*w,c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - if pad_mask is not None: - mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - # [b, h*w, c] - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[i] - lvl_pos_embed_flatten.append(lvl_pos_embed) - if pad_mask is not None: - # [b, h*w] - mask_flatten.append(mask.flatten(1)) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - # [b, l] - mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, - 1) - # [b, l, c] - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [num_levels, 2] - spatial_shapes = paddle.to_tensor( - paddle.stack(spatial_shapes).astype('int64')) - # [l] start index of each level - level_start_index = paddle.concat([ - paddle.zeros( - [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] - ]) - # [b, num_levels, 2] - valid_ratios = paddle.stack(valid_ratios, 1) - return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, valid_ratios) - - def forward(self, feats, pad_mask=None, gt_meta=None): - # input projection and embedding - (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, - valid_ratios) = self._get_encoder_input(feats, pad_mask) - - # encoder - memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, - mask_flatten, lvl_pos_embed_flatten, valid_ratios) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input( - memory, spatial_shapes, mask_flatten, denoising_class, - denoising_bbox_unact) - - # decoder - inter_feats, inter_bboxes = self.decoder( - target, init_ref_points_unact, 
memory, spatial_shapes, - level_start_index, self.dec_bbox_head, self.query_pos_head, - valid_ratios, attn_mask, mask_flatten) - out_bboxes = [] - out_logits = [] - for i in range(self.num_decoder_layers): - out_logits.append(self.dec_score_head[i](inter_feats[i])) - if i == 0: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - init_ref_points_unact)) - else: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - inverse_sigmoid(inter_bboxes[i - 1]))) - out_bboxes = paddle.stack(out_bboxes) - out_logits = paddle.stack(out_logits) - - return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) - - def _get_encoder_output_anchors(self, - memory, - spatial_shapes, - memory_mask=None, - grid_size=0.05): - output_anchors = [] - idx = 0 - for lvl, (h, w) in enumerate(spatial_shapes): - if memory_mask is not None: - mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) - valid_H = paddle.sum(mask_[:, :, 0], 1) - valid_W = paddle.sum(mask_[:, 0, :], 1) - else: - valid_H, valid_W = h, w - - grid_y, grid_x = paddle.meshgrid( - paddle.arange(end=h), paddle.arange(end=w)) - grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) - - valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( - [-1, 1, 1, 2]).astype(grid_xy.dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - output_anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - idx += h * w - - output_anchors = paddle.concat(output_anchors, 1) - valid_mask = ((output_anchors > self.eps) * - (output_anchors < 1 - self.eps)).all(-1, keepdim=True) - output_anchors = paddle.log(output_anchors / (1 - output_anchors)) - if memory_mask is not None: - valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 - output_anchors = paddle.where(valid_mask, output_anchors, - paddle.to_tensor(float("inf"))) - - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - output_memory = self.enc_output(memory) - return output_memory, output_anchors - - def _get_decoder_input(self, - memory, - spatial_shapes, - memory_mask=None, - denoising_class=None, - denoising_bbox_unact=None): - bs, _, _ = memory.shape - # prepare input for decoder - output_memory, output_anchors = self._get_encoder_output_anchors( - memory, spatial_shapes, memory_mask) - enc_outputs_class = self.enc_score_head(output_memory) - enc_outputs_coord_unact = self.enc_bbox_head( - output_memory) + output_anchors - - _, topk_ind = paddle.topk( - enc_outputs_class.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, - topk_ind) # unsigmoided. 
-        enc_topk_bboxes = F.sigmoid(reference_points_unact)
-        if denoising_bbox_unact is not None:
-            reference_points_unact = paddle.concat(
-                [denoising_bbox_unact, reference_points_unact], 1)
-        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
-
-        # extract region features
-        if self.learnt_init_query:
-            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
-        else:
-            target = paddle.gather_nd(output_memory, topk_ind).detach()
-        if denoising_class is not None:
-            target = paddle.concat([denoising_class, target], 1)
-
-        return target, reference_points_unact.detach(
-        ), enc_topk_bboxes, enc_topk_logits
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/README.md b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/README.md
deleted file mode 100644
index 290926d..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/README.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# Compiling the multi-scale deformable attention custom OP
-This custom OP is implemented following the [custom external operator](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) guide.
-
-## 1. Requirements
-- Paddle >= 2.3.2
-- gcc 8.2
-
-## 2. Installation
-Build and install the OP from this directory:
-```
-cd PaddleDetection/ppdet/modeling/transformers/ext_op/
-python setup_ms_deformable_attn_op.py install
-```
-
-Once compiled, the OP is ready to use; a usage example of `ms_deformable_attn` follows:
-```
-# import the custom op
-from deformable_detr_ops import ms_deformable_attn
-
-# build fake input tensors
-bs, n_heads, c = 2, 8, 8
-query_length, n_levels, n_points = 2, 2, 2
-spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
-level_start_index = paddle.concat((paddle.to_tensor(
-    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
-value_length = sum([(H * W).item() for H, W in spatial_shapes])
-
-def get_test_tensors(channels):
-    value = paddle.rand(
-        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
-    sampling_locations = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points, 2],
-        dtype=paddle.float32)
-    attention_weights = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points],
-        dtype=paddle.float32) + 1e-5
-    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-        -2, keepdim=True)
-    return [value, sampling_locations, attention_weights]
-
-value, sampling_locations, attention_weights = get_test_tensors(c)
-
-output = ms_deformable_attn(value,
-                            spatial_shapes,
-                            level_start_index,
-                            sampling_locations,
-                            attention_weights)
-```
-
-## 3. Unit tests
-Run the unit tests to verify that the custom operator works correctly, for example:
-```
-python test_ms_deformable_attn_op.py
-```
-On success, the output looks like this:
-```
-*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
-*tensor1 True check_gradient_numerical(D=30)
-*tensor2 True check_gradient_numerical(D=30)
-*tensor3 True check_gradient_numerical(D=30)
-*tensor1 True check_gradient_numerical(D=32)
-*tensor2 True check_gradient_numerical(D=32)
-*tensor3 True check_gradient_numerical(D=32)
-*tensor1 True check_gradient_numerical(D=64)
-*tensor2 True check_gradient_numerical(D=64)
-*tensor3 True check_gradient_numerical(D=64)
-*tensor1 True check_gradient_numerical(D=71)
-*tensor2 True check_gradient_numerical(D=71)
-*tensor3 True check_gradient_numerical(D=71)
-*tensor1 True check_gradient_numerical(D=128)
-*tensor2 True check_gradient_numerical(D=128)
-*tensor3 True check_gradient_numerical(D=128)
-*tensor1 True check_gradient_numerical(D=1024)
-*tensor2 True check_gradient_numerical(D=1024)
-*tensor3 True check_gradient_numerical(D=1024)
-*tensor1 True check_gradient_numerical(D=1025)
-*tensor2 True check_gradient_numerical(D=1025)
-*tensor3 True check_gradient_numerical(D=1025)
-*tensor1 True check_gradient_numerical(D=2048)
-*tensor2 True check_gradient_numerical(D=2048)
-*tensor3 True check_gradient_numerical(D=2048)
-*tensor1 True check_gradient_numerical(D=3096)
-*tensor2 True check_gradient_numerical(D=3096)
-*tensor3 True check_gradient_numerical(D=3096)
-```
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
deleted file mode 100644
index d1758ad..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#include <vector>
-
-// declare GPU implementation
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights);
-
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
-
-//// CPU not implemented
-
-std::vector<std::vector<int64_t>>
-MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
-                           std::vector<int64_t> value_spatial_shapes_shape,
-                           std::vector<int64_t> value_level_start_index_shape,
-                           std::vector<int64_t> sampling_locations_shape,
-                           std::vector<int64_t> attention_weights_shape) {
-  return {{value_shape[0], sampling_locations_shape[1],
-           value_shape[2] * value_shape[3]}};
-}
-
-std::vector<paddle::DataType>
-MSDeformableAttnInferDtype(paddle::DataType value_dtype,
-                           paddle::DataType value_spatial_shapes_dtype,
-                           paddle::DataType value_level_start_index_dtype,
-                           paddle::DataType sampling_locations_dtype,
-                           paddle::DataType attention_weights_dtype) {
-  return {value_dtype};
-}
-
-PD_BUILD_OP(ms_deformable_attn)
-    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
-             "AttentionWeights"})
-    .Outputs({"Out"})
-    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
-
-PD_BUILD_GRAD_OP(ms_deformable_attn)
-    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
-             "AttentionWeights", paddle::Grad("Out")})
-    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
-              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
-              paddle::Grad("AttentionWeights")})
-    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
deleted file mode 100644
index d5a8d16..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
+++ /dev/null
@@ -1,1073 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#define CUDA_KERNEL_LOOP(i, n)                                                \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                \
-       i += blockDim.x * gridDim.x)
-
-const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
-
-// forward bilinear
-template <typename data_t>
-__device__ data_t deformable_attn_bilinear_forward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-  }
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
-
-// forward kernel
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_forward(
-    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,
-    const int64_t *data_level_start_index, const data_t *data_sampling_loc,
-    const data_t *data_attn_weight, const int batch_size,
-    const int value_length, const int num_heads, const int channels,
-    const int num_levels, const int query_length, const int num_points,
-    data_t *output_data_ptr) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    data_t *data_ptr = output_data_ptr + index;
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-    data_t col = 0;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +
-                                                   level_start_id * qid_stride);
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          col += deformable_attn_bilinear_forward(
-                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
-                     h_im, w_im, m_col, c_col) *
-                 weight;
-        }
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-      }
-    }
-    *data_ptr = col;
-  }
-}
-
-#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
-// forward
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto output = paddle::full({batch_size, query_length, num_heads * channels},
-                             0, value.dtype(), paddle::GPUPlace());
-
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  deformable_attn_cuda_kernel_forward<float>
-      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
-         value.stream()>>>(num_kernels, value.data<float>(),
-                           value_spatial_shapes.data<int64_t>(),
-                           value_level_start_index.data<int64_t>(),
-                           sampling_locations.data<float>(),
-                           attention_weights.data<float>(), batch_size,
-                           value_length, num_heads, channels, num_levels,
-                           query_length, num_points, output.data<float>());
-  return {output};
-}
-
-// backward bilinear
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  *grad_attn_weight = top_grad * val;
-  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
-  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
-}
-
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward_gm(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  atomicAdd(grad_attn_weight, top_grad * val);
-  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
-  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
-}
-
-// backward kernels
-// channels > 1024
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
-          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
-          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_gm(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward_gm(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              grad_sampling_loc, grad_attn_weight);
-        }
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// channels <= 1024
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockSize; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// backward branch
-template <typename data_t>
-void deformable_attn_cuda_backward(
-    cudaStream_t stream, const data_t *grad_out, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int num_threads =
-      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  const int num_actual_kernels =
-      batch_size * query_length * num_heads * channels;
-  if (channels > 1024) {
-    if ((channels & 1023) == 0) {
-      deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-             num_threads * 3 * sizeof(data_t), stream>>>(
-              num_kernels, grad_out, data_value, data_spatial_shapes,
-              data_level_start_index, data_sampling_loc, data_attn_weight,
-              batch_size, value_length, num_heads, channels, num_levels,
-              query_length, num_points, grad_value, grad_sampling_loc,
-              grad_attn_weight);
-    } else {
-      deformable_attn_cuda_kernel_backward_gm<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-    }
-  } else {
-    switch (channels) {
-    case 1:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 1>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 2:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 2>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 4:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 4>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 8:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 8>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 16:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 16>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 32:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 32>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 64:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 64>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 128:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 128>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 256:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 256>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 512:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 512>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 1024:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 1024>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    default:
-      if (channels < 64) {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v1<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      } else {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v2<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      }
-    }
-  }
-}
-
-// backward
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-  CHECK_INPUT_GPU(grad_out);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto grad_value =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_spatial_shapes =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_level_start_index =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_sampling_locations =
-      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),
-                   paddle::GPUPlace());
-  auto grad_attention_weights =
-      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),
-                   paddle::GPUPlace());
-
-  deformable_attn_cuda_backward<float>(
-      value.stream(), grad_out.data<float>(), value.data<float>(),
-      value_spatial_shapes.data<int64_t>(),
-      value_level_start_index.data<int64_t>(), sampling_locations.data<float>(),
-      attention_weights.data<float>(), batch_size, value_length, num_heads,
-      channels, num_levels, query_length, num_points, grad_value.data<float>(),
-      grad_sampling_locations.data<float>(),
-      grad_attention_weights.data<float>());
-
-  return {grad_value, grad_spatial_shapes, grad_level_start_index,
-          grad_sampling_locations, grad_attention_weights};
-}
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
deleted file mode 100644
index 7c3c386..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from paddle.utils.cpp_extension import CUDAExtension, setup
-
-if __name__ == "__main__":
-    setup(
-        name='deformable_detr_ops',
-        ext_modules=CUDAExtension(
-            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
deleted file mode 100644
index 94a0573..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import os -import sys -import random -import numpy as np -import paddle -# add python path of PaddleDetection to sys.path -parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) -if parent_path not in sys.path: - sys.path.append(parent_path) - -from ppdet.modeling.transformers.utils import deformable_attention_core_func -ms_deform_attn_core_paddle = deformable_attention_core_func - -try: - gpu_index = int(sys.argv[1]) -except: - gpu_index = 0 -print(f'Use gpu {gpu_index} to test...') -paddle.set_device(f'gpu:{gpu_index}') - -try: - from deformable_detr_ops import ms_deformable_attn -except Exception as e: - print('import deformable_detr_ops error', e) - sys.exit(-1) - -paddle.seed(1) -random.seed(1) -np.random.seed(1) - -bs, n_heads, c = 2, 8, 8 -query_length, n_levels, n_points = 2, 2, 2 -spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) -level_start_index = paddle.concat((paddle.to_tensor( - [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) -value_length = sum([(H * W).item() for H, W in spatial_shapes]) - - -def get_test_tensors(channels): - value = paddle.rand( - [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 - sampling_locations = paddle.rand( - [bs, query_length, n_heads, n_levels, n_points, 2], - dtype=paddle.float32) - attention_weights = paddle.rand( - [bs, query_length, n_heads, n_levels, n_points], - dtype=paddle.float32) + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum( - -2, keepdim=True) - - return [value, sampling_locations, attention_weights] - - -@paddle.no_grad() -def check_forward_equal_with_paddle_float(): - value, sampling_locations, attention_weights = get_test_tensors(c) - - output_paddle = ms_deform_attn_core_paddle( - value, spatial_shapes, level_start_index, sampling_locations, - attention_weights).detach().cpu() - output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, - sampling_locations, - attention_weights).detach().cpu() - fwdok = paddle.allclose( - output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() - max_abs_err = (output_cuda - output_paddle).abs().max().item() - max_rel_err = ( - (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() - - print( - f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' - ) - - -def check_gradient_numerical(channels=4): - value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( - channels) - value_paddle.stop_gradient = False - sampling_locations_paddle.stop_gradient = False - attention_weights_paddle.stop_gradient = False - - value_cuda = value_paddle.detach().clone() - sampling_locations_cuda = sampling_locations_paddle.detach().clone() - attention_weights_cuda = attention_weights_paddle.detach().clone() - value_cuda.stop_gradient = False - sampling_locations_cuda.stop_gradient = False - attention_weights_cuda.stop_gradient = False - - output_paddle = ms_deform_attn_core_paddle( - value_paddle, spatial_shapes, level_start_index, - sampling_locations_paddle, attention_weights_paddle) - output_paddle.sum().backward() - - output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, - level_start_index, sampling_locations_cuda, - attention_weights_cuda) - output_cuda.sum().backward() - - res = paddle.allclose( - value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() - 
print(f'*tensor1 {res} check_gradient_numerical(D={channels})') - - res = paddle.allclose( - sampling_locations_paddle.grad, - sampling_locations_cuda.grad, - rtol=1e-2, - atol=1e-3).item() - print(f'*tensor2 {res} check_gradient_numerical(D={channels})') - - res = paddle.allclose( - attention_weights_paddle.grad, - attention_weights_cuda.grad, - rtol=1e-2, - atol=1e-3).item() - print(f'*tensor3 {res} check_gradient_numerical(D={channels})') - - -if __name__ == '__main__': - check_forward_equal_with_paddle_float() - - for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: - check_gradient_numerical(channels) diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/group_detr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/group_detr_transformer.py deleted file mode 100644 index 31ec617..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/group_detr_transformer.py +++ /dev/null @@ -1,857 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from .position_encoding import PositionEmbedding -from ..heads.detr_head import MLP -from .deformable_transformer import MSDeformableAttention -from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, - bias_init_with_prob) -from .utils import (_get_clones, get_valid_ratio, - get_contrastive_denoising_training_group, - get_sine_pos_embed, inverse_sigmoid) - -__all__ = ['GroupDINOTransformer'] - - -class DINOTransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - weight_attr=None, - bias_attr=None): - super(DINOTransformerEncoderLayer, self).__init__() - # self attention - self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, 1.0) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, - bias_attr) - self.activation = getattr(F, activation) - self.dropout2 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, - bias_attr) - self.dropout3 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, src): - src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) - src = src + self.dropout3(src2) - src = self.norm2(src) - return src - - def forward(self, - src, - reference_points, - spatial_shapes, - level_start_index, - src_mask=None, - query_pos_embed=None): - # self attention - src2 = self.self_attn( - self.with_pos_embed(src, query_pos_embed), reference_points, src, - spatial_shapes, level_start_index, src_mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - # ffn - src = self.forward_ffn(src) - - return src - - -class DINOTransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers): - super(DINOTransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): - valid_ratios = valid_ratios.unsqueeze(1) - reference_points = [] - for i, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = paddle.meshgrid( - paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) - ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * - H) - ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * - W) - reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) - reference_points = paddle.concat(reference_points, 1).unsqueeze(2) - reference_points = reference_points * valid_ratios - return reference_points - - def forward(self, - feat, - spatial_shapes, - 
level_start_index, - feat_mask=None, - query_pos_embed=None, - valid_ratios=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [feat.shape[0], spatial_shapes.shape[0], 2]) - reference_points = self.get_reference_points(spatial_shapes, - valid_ratios) - for layer in self.layers: - feat = layer(feat, reference_points, spatial_shapes, - level_start_index, feat_mask, query_pos_embed) - - return feat - - -class DINOTransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - dual_queries=False, - dual_groups=0, - weight_attr=None, - bias_attr=None): - super(DINOTransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # cross attention - self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, 1.0) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, - bias_attr) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, - bias_attr) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # for dual groups - self.dual_queries = dual_queries - self.dual_groups = dual_groups - self.n_head = n_head - - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - if self.dual_queries: - dual_groups = self.dual_groups - bs, num_queries, n_model = paddle.shape(q) - q = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0) - k = paddle.concat(k.split(dual_groups + 1, axis=1), axis=0) - tgt = paddle.concat(tgt.split(dual_groups + 1, axis=1), axis=0) - - g_num_queries = num_queries // (dual_groups + 1) - if attn_mask is None or attn_mask[0] is None: - attn_mask = None - else: - # [(dual_groups + 1), g_num_queries, g_num_queries] - attn_mask = paddle.concat( - [sa_mask.unsqueeze(0) for sa_mask in attn_mask], axis=0) - # [1, (dual_groups + 1), 1, g_num_queries, g_num_queries] - # --> [bs, (dual_groups + 1), nhead, g_num_queries, g_num_queries] - # --> [bs * (dual_groups + 1), nhead, g_num_queries, g_num_queries] - attn_mask = attn_mask.unsqueeze(0).unsqueeze(2).tile( - [bs, 1, self.n_head, 1, 1]) - attn_mask = attn_mask.reshape([ - bs * (dual_groups + 1), self.n_head, g_num_queries, - g_num_queries - ]) - - if attn_mask is not None: - attn_mask = attn_mask.astype('bool') - - tgt2 = self.self_attn(q, k, value=tgt, 
attn_mask=attn_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm2(tgt) - - # trace back - if self.dual_queries: - tgt = paddle.concat(tgt.split(dual_groups + 1, axis=0), axis=1) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm1(tgt) - - # ffn - tgt2 = self.forward_ffn(tgt) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - - return tgt - - -class DINOTransformerDecoder(nn.Layer): - def __init__(self, - hidden_dim, - decoder_layer, - num_layers, - return_intermediate=True): - super(DINOTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - self.return_intermediate = return_intermediate - - self.norm = nn.LayerNorm( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - query_pos_head, - valid_ratios=None, - attn_mask=None, - memory_mask=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [memory.shape[0], memory_spatial_shapes.shape[0], 2]) - - output = tgt - intermediate = [] - inter_ref_bboxes = [] - for i, layer in enumerate(self.layers): - reference_points_input = reference_points.unsqueeze( - 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) - query_pos_embed = get_sine_pos_embed( - reference_points_input[..., 0, :], self.hidden_dim // 2) - query_pos_embed = query_pos_head(query_pos_embed) - - output = layer(output, reference_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - reference_points)) - - if self.return_intermediate: - intermediate.append(self.norm(output)) - inter_ref_bboxes.append(inter_ref_bbox) - - reference_points = inter_ref_bbox.detach() - - if self.return_intermediate: - return paddle.stack(intermediate), paddle.stack(inter_ref_bboxes) - - return output, reference_points - - -@register -class GroupDINOTransformer(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=900, - position_embed_type='sine', - return_intermediate_dec=True, - backbone_feat_channels=[512, 1024, 2048], - num_levels=4, - num_encoder_points=4, - num_decoder_points=4, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - pe_temperature=10000, - pe_offset=-0.5, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=True, - use_input_proj=True, - dual_queries=False, - dual_groups=0, - eps=1e-2): - super(GroupDINOTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
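Each decoder layer above refines its boxes in logit space: the bbox head predicts an unbounded delta, which is added to inverse_sigmoid of the current reference points and squashed back through a sigmoid, so the refined box always stays in [0, 1]. One refinement step in isolation, assuming an inverse_sigmoid equivalent to the one imported from .utils:

    import paddle
    import paddle.nn.functional as F

    def inverse_sigmoid(x, eps=1e-5):
        # Clip first so the log stays finite at the box boundaries.
        x = x.clip(min=eps, max=1 - eps)
        return paddle.log(x / (1 - x))

    ref = paddle.to_tensor([[0.25, 0.50, 0.10, 0.20]])     # cxcywh in [0, 1]
    delta = paddle.to_tensor([[0.30, -0.20, 0.00, 0.10]])  # unbounded head output
    new_ref = F.sigmoid(delta + inverse_sigmoid(ref))      # refined, still in [0, 1]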
- assert len(backbone_feat_channels) <= num_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - self.use_input_proj = use_input_proj - - if use_input_proj: - # backbone feature projection - self._build_input_proj_layer(backbone_feat_channels) - - # Transformer module - encoder_layer = DINOTransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_encoder_points) - self.encoder = DINOTransformerEncoder(encoder_layer, num_encoder_layers) - decoder_layer = DINOTransformerDecoderLayer( - hidden_dim, - nhead, - dim_feedforward, - dropout, - activation, - num_levels, - num_decoder_points, - dual_queries=dual_queries, - dual_groups=dual_groups) - self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers, - return_intermediate_dec) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # for dual group - self.dual_queries = dual_queries - self.dual_groups = dual_groups - if self.dual_queries: - self.denoising_class_embed_groups = nn.LayerList([ - nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - for _ in range(self.dual_groups) - ]) - - # position embedding - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - self.level_embed = nn.Embedding(num_levels, hidden_dim) - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - normal_(self.tgt_embed.weight) - if self.dual_queries: - self.tgt_embed_dual = nn.LayerList([ - nn.Embedding(num_queries, hidden_dim) - for _ in range(self.dual_groups) - ]) - for dual_tgt_module in self.tgt_embed_dual: - normal_(dual_tgt_module.weight) - self.query_pos_head = MLP(2 * hidden_dim, - hidden_dim, - hidden_dim, - num_layers=2) - - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - if self.dual_queries: - self.enc_output = _get_clones(self.enc_output, self.dual_groups + 1) - else: - self.enc_output = _get_clones(self.enc_output, 1) - - self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - - if self.dual_queries: - self.enc_bbox_head_dq = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for i in range(self.dual_groups) - ]) - self.enc_score_head_dq = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for i in range(self.dual_groups) - ]) - - # decoder head - self.dec_score_head = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - 
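bias_init_with_prob(0.01) above seeds the classification bias so the head starts out predicting foreground with probability 0.01, the standard focal-loss initialization: b = -log((1 - p) / p). Checking the arithmetic:

    import math

    p = 0.01
    b = -math.log((1 - p) / p)                       # ≈ -4.595
    assert abs(1 / (1 + math.exp(-b)) - p) < 1e-9    # sigmoid(b) == p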
linear_init_(self.enc_score_head) - constant_(self.enc_score_head.bias, bias_cls) - constant_(self.enc_bbox_head.layers[-1].weight) - constant_(self.enc_bbox_head.layers[-1].bias) - for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): - linear_init_(cls_) - constant_(cls_.bias, bias_cls) - constant_(reg_.layers[-1].weight) - constant_(reg_.layers[-1].bias) - - for enc_output in self.enc_output: - linear_init_(enc_output[0]) - xavier_uniform_(enc_output[0].weight) - normal_(self.level_embed.weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - normal_(self.denoising_class_embed.weight) - if self.use_input_proj: - for l in self.input_proj: - xavier_uniform_(l[0].weight) - constant_(l[0].bias) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'backbone_feat_channels': [i.channels for i in input_shape], } - - def _build_input_proj_layer(self, backbone_feat_channels): - self.input_proj = nn.LayerList() - for in_channels in backbone_feat_channels: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, self.hidden_dim, kernel_size=1)), - ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = backbone_feat_channels[-1] - for _ in range(self.num_levels - len(backbone_feat_channels)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1)), ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats, pad_mask=None): - if self.use_input_proj: - # get projection features - proj_feats = [ - self.input_proj[i](feat) for i, feat in enumerate(feats) - ] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - else: - proj_feats = feats - # get encoder inputs - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for i, feat in enumerate(proj_feats): - bs, _, h, w = paddle.shape(feat) - spatial_shapes.append(paddle.concat([h, w])) - # [b,c,h,w] -> [b,h*w,c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - if pad_mask is not None: - mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - # [b, h*w, c] - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[i].reshape( - [1, 1, -1]) - lvl_pos_embed_flatten.append(lvl_pos_embed) - if pad_mask is not None: - # [b, h*w] - mask_flatten.append(mask.flatten(1)) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - # [b, l] - mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, - 1) - # [b, l, c] - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [num_levels, 2] - spatial_shapes = paddle.to_tensor( - paddle.stack(spatial_shapes).astype('int64')) - # [l] start index of each level - level_start_index = paddle.concat([ - paddle.zeros( - [1], dtype='int64'), 
spatial_shapes.prod(1).cumsum(0)[:-1] - ]) - # [b, num_levels, 2] - valid_ratios = paddle.stack(valid_ratios, 1) - return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, valid_ratios) - - def forward(self, feats, pad_mask=None, gt_meta=None): - # input projection and embedding - (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, - valid_ratios) = self._get_encoder_input(feats, pad_mask) - - # encoder - memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, - mask_flatten, lvl_pos_embed_flatten, valid_ratios) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - if self.dual_queries: - denoising_class_groups = [] - denoising_bbox_groups = [] - attn_mask_groups = [] - dn_meta_groups = [] - for g_id in range(self.dual_groups): - denoising_class_gid, denoising_bbox_gid, attn_mask_gid, dn_meta_gid = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed_groups[g_id].weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - denoising_class_groups.append(denoising_class_gid) - denoising_bbox_groups.append(denoising_bbox_gid) - attn_mask_groups.append(attn_mask_gid) - dn_meta_groups.append(dn_meta_gid) - - # combine - denoising_class = [denoising_class] + denoising_class_groups - denoising_bbox = [denoising_bbox] + denoising_bbox_groups - attn_mask = [attn_mask] + attn_mask_groups - dn_meta = [dn_meta] + dn_meta_groups - else: - denoising_class, denoising_bbox, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input( - memory, spatial_shapes, mask_flatten, denoising_class, - denoising_bbox) - - # decoder - inter_feats, inter_ref_bboxes = self.decoder( - target, init_ref_points, memory, spatial_shapes, level_start_index, - self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, - mask_flatten) - # solve hang during distributed training - inter_feats[0] += self.denoising_class_embed.weight[0, 0] * 0. 
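The `* 0.` line above (and its dual-group repetition just below) is the usual fix for the distributed hang the comment mentions: adding a parameter times zero to an output changes nothing numerically, but it keeps that parameter in the autograd graph, so every rank produces a gradient for it and the all-reduce over gradients cannot stall on an unused parameter. The trick in isolation:

    import paddle

    embed = paddle.nn.Embedding(10, 4)
    out = paddle.rand([2, 4])
    # Numerically a no-op, but embed.weight now participates in backward.
    out = out + embed.weight[0, 0] * 0.
    out.sum().backward()
    print(embed.weight.grad.abs().sum())   # all-zero gradient instead of None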
- if self.dual_queries: - for g_id in range(self.dual_groups): - inter_feats[0] += self.denoising_class_embed_groups[ - g_id].weight[0, 0] * 0.0 - - out_bboxes = [] - out_logits = [] - for i in range(self.num_decoder_layers): - out_logits.append(self.dec_score_head[i](inter_feats[i])) - if i == 0: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - inverse_sigmoid(init_ref_points))) - else: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - inverse_sigmoid(inter_ref_bboxes[i - 1]))) - - out_bboxes = paddle.stack(out_bboxes) - out_logits = paddle.stack(out_logits) - return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) - - def _get_encoder_output_anchors(self, - memory, - spatial_shapes, - memory_mask=None, - grid_size=0.05): - output_anchors = [] - idx = 0 - for lvl, (h, w) in enumerate(spatial_shapes): - if memory_mask is not None: - mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) - valid_H = paddle.sum(mask_[:, :, 0], 1) - valid_W = paddle.sum(mask_[:, 0, :], 1) - else: - valid_H, valid_W = h, w - - grid_y, grid_x = paddle.meshgrid( - paddle.arange( - end=h, dtype=memory.dtype), - paddle.arange( - end=w, dtype=memory.dtype)) - grid_xy = paddle.stack([grid_x, grid_y], -1) - - valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( - [-1, 1, 1, 2]).astype(grid_xy.dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - output_anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - idx += h * w - - output_anchors = paddle.concat(output_anchors, 1) - valid_mask = ((output_anchors > self.eps) * - (output_anchors < 1 - self.eps)).all(-1, keepdim=True) - output_anchors = paddle.log(output_anchors / (1 - output_anchors)) - if memory_mask is not None: - valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 - output_anchors = paddle.where(valid_mask, output_anchors, - paddle.to_tensor(float("inf"))) - - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - if self.dual_queries: - output_memory = [ - self.enc_output[g_id](memory) - for g_id in range(self.dual_groups + 1) - ] - else: - output_memory = self.enc_output[0](memory) - return output_memory, output_anchors - - def _get_decoder_input(self, - memory, - spatial_shapes, - memory_mask=None, - denoising_class=None, - denoising_bbox=None): - bs, _, _ = memory.shape - # prepare input for decoder - output_memory, output_anchors = self._get_encoder_output_anchors( - memory, spatial_shapes, memory_mask) - if self.dual_queries: - enc_outputs_class = self.enc_score_head(output_memory[0]) - enc_outputs_coord_unact = self.enc_bbox_head(output_memory[ - 0]) + output_anchors - else: - enc_outputs_class = self.enc_score_head(output_memory) - enc_outputs_coord_unact = self.enc_bbox_head( - output_memory) + output_anchors - - _, topk_ind = paddle.topk( - enc_outputs_class.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - topk_coords_unact = paddle.gather_nd(enc_outputs_coord_unact, - topk_ind) # unsigmoided. 
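The top-k proposal selection above is the standard batched gather: paddle.topk gives per-image token indices, which are stacked with a broadcast batch index so gather_nd can pull out the chosen rows. The same pattern at toy scale:

    import paddle

    bs, l, k = 2, 5, 3
    scores = paddle.rand([bs, l])
    feats = paddle.rand([bs, l, 4])

    _, topk_ind = paddle.topk(scores, k, axis=1)              # [bs, k]
    batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
    batch_ind = batch_ind.unsqueeze(-1).tile([1, k])          # [bs, k]
    ind = paddle.stack([batch_ind, topk_ind], axis=-1)        # [bs, k, 2]
    topk_feats = paddle.gather_nd(feats, ind)                 # [bs, k, 4]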
- enc_topk_bboxes = F.sigmoid(topk_coords_unact) - reference_points = enc_topk_bboxes.detach() - enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) - - if self.dual_queries: - enc_topk_logits_groups = [] - enc_topk_bboxes_groups = [] - reference_points_groups = [] - topk_ind_groups = [] - for g_id in range(self.dual_groups): - enc_outputs_class_gid = self.enc_score_head_dq[g_id]( - output_memory[g_id + 1]) - enc_outputs_coord_unact_gid = self.enc_bbox_head_dq[g_id]( - output_memory[g_id + 1]) + output_anchors - _, topk_ind_gid = paddle.topk( - enc_outputs_class_gid.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs, dtype=topk_ind_gid.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind_gid = paddle.stack([batch_ind, topk_ind_gid], axis=-1) - topk_coords_unact_gid = paddle.gather_nd( - enc_outputs_coord_unact_gid, topk_ind_gid) # unsigmoided. - enc_topk_bboxes_gid = F.sigmoid(topk_coords_unact_gid) - reference_points_gid = enc_topk_bboxes_gid.detach() - enc_topk_logits_gid = paddle.gather_nd(enc_outputs_class_gid, - topk_ind_gid) - - # append and combine - topk_ind_groups.append(topk_ind_gid) - enc_topk_logits_groups.append(enc_topk_logits_gid) - enc_topk_bboxes_groups.append(enc_topk_bboxes_gid) - reference_points_groups.append(reference_points_gid) - - enc_topk_bboxes = paddle.concat( - [enc_topk_bboxes] + enc_topk_bboxes_groups, 1) - enc_topk_logits = paddle.concat( - [enc_topk_logits] + enc_topk_logits_groups, 1) - reference_points = paddle.concat( - [reference_points] + reference_points_groups, 1) - topk_ind = paddle.concat([topk_ind] + topk_ind_groups, 1) - - # extract region features - if self.learnt_init_query: - target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) - if self.dual_queries: - target = paddle.concat([target] + [ - self.tgt_embed_dual[g_id].weight.unsqueeze(0).tile( - [bs, 1, 1]) for g_id in range(self.dual_groups) - ], 1) - else: - if self.dual_queries: - target = paddle.gather_nd(output_memory[0], topk_ind) - target_groups = [] - for g_id in range(self.dual_groups): - target_gid = paddle.gather_nd(output_memory[g_id + 1], - topk_ind_groups[g_id]) - target_groups.append(target_gid) - target = paddle.concat([target] + target_groups, 1).detach() - else: - target = paddle.gather_nd(output_memory, topk_ind).detach() - - if denoising_bbox is not None: - if isinstance(denoising_bbox, list) and isinstance( - denoising_class, list) and self.dual_queries: - if denoising_bbox[0] is not None: - reference_points_list = paddle.split( - reference_points, self.dual_groups + 1, axis=1) - reference_points = paddle.concat( - [ - paddle.concat( - [ref, ref_], axis=1) - for ref, ref_ in zip(denoising_bbox, - reference_points_list) - ], - axis=1) - - target_list = paddle.split( - target, self.dual_groups + 1, axis=1) - target = paddle.concat( - [ - paddle.concat( - [tgt, tgt_], axis=1) - for tgt, tgt_ in zip(denoising_class, target_list) - ], - axis=1) - else: - reference_points, target = reference_points, target - else: - reference_points = paddle.concat( - [denoising_bbox, reference_points], 1) - target = paddle.concat([denoising_class, target], 1) - - return target, reference_points, enc_topk_bboxes, enc_topk_logits diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/hybrid_encoder.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/hybrid_encoder.py deleted file mode 100644 index 5694803..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/hybrid_encoder.py 
+++ /dev/null @@ -1,301 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.ops import get_act_fn -from ..shape_spec import ShapeSpec -from ..backbones.csp_darknet import BaseConv -from ..backbones.cspresnet import RepVggBlock -from ppdet.modeling.transformers.detr_transformer import TransformerEncoder -from ..initializer import xavier_uniform_, linear_init_ -from ..layers import MultiHeadAttention -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -__all__ = ['HybridEncoder'] - - -class CSPRepLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - num_blocks=3, - expansion=1.0, - bias=False, - act="silu"): - super(CSPRepLayer, self).__init__() - hidden_channels = int(out_channels * expansion) - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.conv2 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.bottlenecks = nn.Sequential(* [ - RepVggBlock( - hidden_channels, hidden_channels, act=act) - for _ in range(num_blocks) - ]) - if hidden_channels != out_channels: - self.conv3 = BaseConv( - hidden_channels, - out_channels, - ksize=1, - stride=1, - bias=bias, - act=act) - else: - self.conv3 = nn.Identity() - - def forward(self, x): - x_1 = self.conv1(x) - x_1 = self.bottlenecks(x_1) - x_2 = self.conv2(x) - return self.conv3(x_1 + x_2) - - -@register -class TransformerLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=1024, - dropout=0., - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = 
residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -@register -@serializable -class HybridEncoder(nn.Layer): - __shared__ = ['depth_mult', 'act', 'trt', 'eval_size'] - __inject__ = ['encoder_layer'] - - def __init__(self, - in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - hidden_dim=256, - use_encoder_idx=[2], - num_encoder_layers=1, - encoder_layer='TransformerLayer', - pe_temperature=10000, - expansion=1.0, - depth_mult=1.0, - act='silu', - trt=False, - eval_size=None): - super(HybridEncoder, self).__init__() - self.in_channels = in_channels - self.feat_strides = feat_strides - self.hidden_dim = hidden_dim - self.use_encoder_idx = use_encoder_idx - self.num_encoder_layers = num_encoder_layers - self.pe_temperature = pe_temperature - self.eval_size = eval_size - - # channel projection - self.input_proj = nn.LayerList() - for in_channel in in_channels: - self.input_proj.append( - nn.Sequential( - nn.Conv2D( - in_channel, hidden_dim, kernel_size=1, bias_attr=False), - nn.BatchNorm2D( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))))) - # encoder transformer - self.encoder = nn.LayerList([ - TransformerEncoder(encoder_layer, num_encoder_layers) - for _ in range(len(use_encoder_idx)) - ]) - - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - # top-down fpn - self.lateral_convs = nn.LayerList() - self.fpn_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append( - BaseConv( - hidden_dim, hidden_dim, 1, 1, act=act)) - self.fpn_blocks.append( - CSPRepLayer( - hidden_dim * 2, - hidden_dim, - round(3 * depth_mult), - act=act, - expansion=expansion)) - - # bottom-up pan - self.downsample_convs = nn.LayerList() - self.pan_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsample_convs.append( - BaseConv( - hidden_dim, hidden_dim, 3, stride=2, act=act)) - self.pan_blocks.append( - CSPRepLayer( - hidden_dim * 2, - hidden_dim, - round(3 * depth_mult), - act=act, - expansion=expansion)) - - self._reset_parameters() - - def _reset_parameters(self): - if self.eval_size: - for idx in self.use_encoder_idx: - stride = self.feat_strides[idx] - pos_embed = self.build_2d_sincos_position_embedding( - self.eval_size[1] // stride, self.eval_size[0] // stride, - self.hidden_dim, self.pe_temperature) - setattr(self, f'pos_embed{idx}', pos_embed) - - @staticmethod - def build_2d_sincos_position_embedding(w, - h, - embed_dim=256, - temperature=10000.): - grid_w = paddle.arange(int(w), dtype=paddle.float32) - grid_h = paddle.arange(int(h), dtype=paddle.float32) - grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) - assert embed_dim % 4 == 0, \ - 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = 1. 
/ (temperature**omega) - - out_w = grid_w.flatten()[..., None] @omega[None] - out_h = grid_h.flatten()[..., None] @omega[None] - - return paddle.concat( - [ - paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), - paddle.cos(out_h) - ], - axis=1)[None, :, :] - - def forward(self, feats, for_mot=False, is_teacher=False): - assert len(feats) == len(self.in_channels) - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - # encoder - if self.num_encoder_layers > 0: - for i, enc_ind in enumerate(self.use_encoder_idx): - h, w = proj_feats[enc_ind].shape[2:] - # flatten [B, C, H, W] to [B, HxW, C] - src_flatten = proj_feats[enc_ind].flatten(2).transpose( - [0, 2, 1]) - if self.training or self.eval_size is None or is_teacher: - pos_embed = self.build_2d_sincos_position_embedding( - w, h, self.hidden_dim, self.pe_temperature) - else: - pos_embed = getattr(self, f'pos_embed{enc_ind}', None) - memory = self.encoder[i](src_flatten, pos_embed=pos_embed) - proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape( - [-1, self.hidden_dim, h, w]) - - # top-down fpn - inner_outs = [proj_feats[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = proj_feats[idx - 1] - feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( - feat_heigh) - inner_outs[0] = feat_heigh - - upsample_feat = F.interpolate( - feat_heigh, scale_factor=2., mode="nearest") - inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat( - [upsample_feat, feat_low], axis=1)) - inner_outs.insert(0, inner_out) - - # bottom-up pan - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsample_convs[idx](feat_low) - out = self.pan_blocks[idx](paddle.concat( - [downsample_feat, feat_height], axis=1)) - outs.append(out) - - return outs - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'feat_strides': [i.stride for i in input_shape] - } - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.hidden_dim, stride=self.feat_strides[idx]) - for idx in range(len(self.in_channels)) - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/mask_dino_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/mask_dino_transformer.py deleted file mode 100644 index 6b29223..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/mask_dino_transformer.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. 
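build_2d_sincos_position_embedding above is HybridEncoder's analytic positional encoding: a quarter of the channels each go to sin/cos of x and sin/cos of y, at frequencies 1/temperature^(i/pos_dim). A numpy re-derivation of the same construction (embed_dim must be divisible by 4):

    import numpy as np

    def sincos_2d(w, h, dim=8, temp=10000.0):
        assert dim % 4 == 0, 'dim must be divisible by 4'
        pos_dim = dim // 4
        omega = 1.0 / temp ** (np.arange(pos_dim, dtype=np.float32) / pos_dim)
        gw, gh = np.meshgrid(np.arange(w, dtype=np.float32),
                             np.arange(h, dtype=np.float32), indexing='ij')
        out_w = gw.reshape(-1, 1) * omega          # [w*h, pos_dim]
        out_h = gh.reshape(-1, 1) * omega
        return np.concatenate([np.sin(out_w), np.cos(out_w),
                               np.sin(out_h), np.cos(out_h)], axis=1)

    pe = sincos_2d(4, 3)    # 12 positions, 8 channels, all values in [-1, 1]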
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from .position_encoding import PositionEmbedding -from ..heads.detr_head import MLP -from .deformable_transformer import (DeformableTransformerEncoderLayer, - DeformableTransformerEncoder) -from .dino_transformer import (DINOTransformerDecoderLayer) -from ..initializer import (linear_init_, constant_, xavier_uniform_, - bias_init_with_prob) -from .utils import (_get_clones, get_valid_ratio, get_denoising_training_group, - get_sine_pos_embed, inverse_sigmoid, mask_to_box_coordinate) - -__all__ = ['MaskDINO'] - - -class ConvGNBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - groups=1, - num_groups=32, - bias=False, - act=None): - super(ConvGNBlock, self).__init__() - self.conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2, - groups=groups, - bias_attr=bias) - self.norm = nn.GroupNorm( - num_groups, - out_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.act = getattr(F, act) if act is not None else None - - self._init_weights() - - def _init_weights(self): - xavier_uniform_(self.conv.weight) - - def forward(self, x): - x = self.norm(self.conv(x)) - if self.act is not None: - x = self.act(x) - return x - - -class MaskDINOTransformerDecoder(nn.Layer): - def __init__(self, hidden_dim, decoder_layer, num_layers): - super(MaskDINOTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - - def forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - query_pos_head, - dec_norm, - valid_ratios=None, - attn_mask=None, - memory_mask=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [memory.shape[0], memory_spatial_shapes.shape[0], 2]) - - output = tgt - intermediate = [] - inter_bboxes = [] - ref_points = F.sigmoid(ref_points_unact) - for i, layer in enumerate(self.layers): - reference_points_input = ref_points.detach().unsqueeze( - 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) - query_pos_embed = get_sine_pos_embed( - reference_points_input[..., 0, :], self.hidden_dim // 2) - query_pos_embed = query_pos_head(query_pos_embed) - - output = layer(output, reference_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - - ref_points = F.sigmoid( - bbox_head(output) + inverse_sigmoid(ref_points.detach())) - - intermediate.append(dec_norm(output)) - inter_bboxes.append(ref_points) - - return paddle.stack(intermediate), paddle.stack(inter_bboxes) - - -@register -class MaskDINO(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=300, - position_embed_type='sine', - in_feats_channel=[256, 512, 1024, 2048], - num_levels=3, - num_encoder_points=4, - num_decoder_points=4, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=9, - enc_dim_feedforward=1024, - dec_dim_feedforward=2048, - dropout=0., - activation="relu", - lr_mult=1.0, - pe_temperature=10000, - pe_offset=-0.5, - num_denoising=100, - 
label_noise_ratio=0.4, - box_noise_scale=0.4, - learnt_init_query=False, - mask_enhanced=True, - eps=1e-2): - super(MaskDINO, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' - feat0_dim = in_feats_channel.pop(0) - assert len(in_feats_channel) <= num_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - self.mask_enhanced = mask_enhanced - - weight_attr = ParamAttr(regularizer=L2Decay(0.0)) - bias_attr = ParamAttr(regularizer=L2Decay(0.0)) - # backbone feature projection - self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) - - # Transformer module - encoder_layer = DeformableTransformerEncoderLayer( - hidden_dim, nhead, enc_dim_feedforward, dropout, activation, - num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) - self.encoder = DeformableTransformerEncoder(encoder_layer, - num_encoder_layers) - decoder_layer = DINOTransformerDecoderLayer( - hidden_dim, nhead, dec_dim_feedforward, dropout, activation, - num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) - self.decoder = MaskDINOTransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # position embedding - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - self.level_embed = nn.Embedding( - num_levels, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(2 * hidden_dim, - hidden_dim, - hidden_dim, - num_layers=2) - # mask embedding - self.mask_query_head = MLP(hidden_dim, - hidden_dim, - hidden_dim, - num_layers=3) - - # encoder mask head - self.enc_mask_lateral = ConvGNBlock(feat0_dim, hidden_dim, 1) - self.enc_mask_output = nn.Sequential( - ConvGNBlock( - hidden_dim, hidden_dim, 3, act=activation), - nn.Conv2D(hidden_dim, hidden_dim, 1)) - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) - # decoder norm layer - self.dec_norm = nn.LayerNorm( - hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) - # shared prediction head - self.class_head = nn.Linear(hidden_dim, num_classes) - self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - linear_init_(self.class_head) - constant_(self.class_head.bias, bias_cls) - constant_(self.bbox_head.layers[-1].weight) - constant_(self.bbox_head.layers[-1].bias) - - xavier_uniform_(self.enc_mask_output[1].weight) - linear_init_(self.enc_output[0]) - xavier_uniform_(self.enc_output[0].weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - 
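The denoising hyper-parameters above (num_denoising=100, label_noise_ratio=0.4, box_noise_scale=0.4) control DINO-style query denoising: extra queries are built from ground truth whose labels are randomly flipped and whose boxes are jittered, and the decoder is trained to reconstruct the originals. A toy sketch of the two noise operations — illustrative only, not ppdet's get_denoising_training_group:

    import paddle

    labels = paddle.to_tensor([3, 7, 1])
    boxes = paddle.to_tensor([[0.5, 0.5, 0.2, 0.3],
                              [0.3, 0.6, 0.1, 0.1],
                              [0.7, 0.2, 0.4, 0.2]])   # cxcywh

    label_noise_ratio, box_noise_scale, num_classes = 0.4, 0.4, 80
    flip = paddle.rand([3]) < label_noise_ratio * 0.5   # flip a fraction of labels
    noised_labels = paddle.where(flip, paddle.randint(0, num_classes, [3]), labels)
    # Jitter each box by up to half its own width/height, scaled by the noise scale.
    jitter = (paddle.rand(boxes.shape) * 2 - 1) * box_noise_scale
    noised_boxes = (boxes + jitter * boxes[:, 2:].tile([1, 2]) * 0.5).clip(0, 1)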
xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_feats_channel': [i.channels for i in input_shape], } - - def _build_input_proj_layer(self, - in_feats_channel, - weight_attr=None, - bias_attr=None): - self.input_proj = nn.LayerList() - for in_channels in in_feats_channel: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, self.hidden_dim, kernel_size=1)), ( - 'norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = in_feats_channel[-1] - for _ in range(self.num_levels - len(in_feats_channel)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1)), ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats, pad_mask=None): - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - - # get encoder inputs - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for i, feat in enumerate(proj_feats): - bs, _, h, w = paddle.shape(feat) - spatial_shapes.append(paddle.concat([h, w])) - # [b,c,h,w] -> [b,h*w,c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - if pad_mask is not None: - mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - # [b, h*w, c] - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[i] - lvl_pos_embed_flatten.append(lvl_pos_embed) - if pad_mask is not None: - # [b, h*w] - mask_flatten.append(mask.flatten(1)) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - # [b, l] - mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, - 1) - # [b, l, c] - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [num_levels, 2] - spatial_shapes = paddle.to_tensor( - paddle.stack(spatial_shapes).astype('int64')) - # [l] start index of each level - level_start_index = paddle.concat([ - paddle.zeros( - [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] - ]) - # [b, num_levels, 2] - valid_ratios = paddle.stack(valid_ratios, 1) - return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, valid_ratios) - - def forward(self, feats, pad_mask=None, gt_meta=None): - feat0 = feats.pop(0) - # input projection and embedding - (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, - valid_ratios) = self._get_encoder_input(feats, pad_mask) - - # encoder - memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, - mask_flatten, lvl_pos_embed_flatten, valid_ratios) - - mask_feat = self._get_encoder_mask_feature(feat0, memory, - spatial_shapes) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_denoising_training_group(gt_meta, - 
self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points_unact, enc_out, init_out = \ - self._get_decoder_input( - memory, mask_feat, spatial_shapes, mask_flatten, denoising_class, - denoising_bbox_unact) - - # decoder - inter_feats, inter_bboxes = self.decoder( - target, init_ref_points_unact, memory, spatial_shapes, - level_start_index, self.bbox_head, self.query_pos_head, - self.dec_norm, valid_ratios, attn_mask, mask_flatten) - - out_logits = [] - out_bboxes = [] - out_masks = [] - for i in range(self.num_decoder_layers): - if self.training or i == self.num_decoder_layers - 1: - logits_, masks_ = self._get_pred_class_and_mask(inter_feats[i], - mask_feat) - else: - continue - out_logits.append(logits_) - out_masks.append(masks_) - if i == 0: - out_bboxes.append( - F.sigmoid( - self.bbox_head(inter_feats[i]) + init_ref_points_unact)) - else: - out_bboxes.append( - F.sigmoid( - self.bbox_head(inter_feats[i]) + inverse_sigmoid( - inter_bboxes[i - 1]))) - out_bboxes = paddle.stack(out_bboxes) - out_logits = paddle.stack(out_logits) - out_masks = paddle.stack(out_masks) - - return (out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta) - - def _get_encoder_mask_feature(self, in_feat, memory, spatial_shapes): - memory_feat0 = memory.split( - spatial_shapes.prod(1).split(self.num_levels), axis=1)[0] - h, w = spatial_shapes[0] - memory_feat0 = memory_feat0.reshape( - [0, h, w, self.hidden_dim]).transpose([0, 3, 1, 2]) - out = self.enc_mask_lateral(in_feat) + F.interpolate( - memory_feat0, - scale_factor=2.0, - mode='bilinear', - align_corners=False) - return self.enc_mask_output(out) - - def _get_encoder_output_anchors(self, - memory, - spatial_shapes, - memory_mask=None, - grid_size=0.05): - output_anchors = [] - idx = 0 - for lvl, (h, w) in enumerate(spatial_shapes): - if memory_mask is not None: - mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) - valid_H = paddle.sum(mask_[:, :, 0], 1) - valid_W = paddle.sum(mask_[:, 0, :], 1) - else: - valid_H, valid_W = h, w - - grid_y, grid_x = paddle.meshgrid( - paddle.arange(end=h), paddle.arange(end=w)) - grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) - - valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( - [-1, 1, 1, 2]).astype(grid_xy.dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - output_anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - idx += h * w - - output_anchors = paddle.concat(output_anchors, 1) - valid_mask = ((output_anchors > self.eps) * - (output_anchors < 1 - self.eps)).all(-1, keepdim=True) - output_anchors = paddle.log(output_anchors / (1 - output_anchors)) - if memory_mask is not None: - valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 - output_anchors = paddle.where(valid_mask, output_anchors, - paddle.to_tensor(float("inf"))) - - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - output_memory = self.enc_output(memory) - return output_memory, output_anchors - - def _get_decoder_input(self, - memory, - mask_feat, - spatial_shapes, - memory_mask=None, - denoising_class=None, - denoising_bbox_unact=None): - # prepare input for decoder - bs, _, _ = memory.shape - output_memory, output_anchors = self._get_encoder_output_anchors( - memory, spatial_shapes, 
memory_mask) - enc_logits_unact = self.class_head(output_memory) - enc_bboxes_unact = self.bbox_head(output_memory) + output_anchors - - # get topk index - _, topk_ind = paddle.topk( - enc_logits_unact.max(-1), self.num_queries, axis=1) - batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - - # extract content and position query embedding - target = paddle.gather_nd(output_memory, topk_ind) - reference_points_unact = paddle.gather_nd(enc_bboxes_unact, - topk_ind) # unsigmoided. - # get encoder output: {logits, bboxes, masks} - enc_out_logits, enc_out_masks = self._get_pred_class_and_mask(target, - mask_feat) - enc_out_bboxes = F.sigmoid(reference_points_unact) - enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks) - - # concat denoising query - if self.learnt_init_query: - target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) - else: - target = target.detach() - if denoising_class is not None: - target = paddle.concat([denoising_class, target], 1) - if self.mask_enhanced: - # use mask-enhanced anchor box initialization - reference_points = mask_to_box_coordinate( - enc_out_masks > 0, normalize=True, format="xywh") - reference_points_unact = inverse_sigmoid(reference_points) - if denoising_bbox_unact is not None: - reference_points_unact = paddle.concat( - [denoising_bbox_unact, reference_points_unact], 1) - - # direct prediction from the matching and denoising part in the begining - if self.training and denoising_class is not None: - init_out_logits, init_out_masks = self._get_pred_class_and_mask( - target, mask_feat) - init_out_bboxes = F.sigmoid(reference_points_unact) - init_out = (init_out_logits, init_out_bboxes, init_out_masks) - else: - init_out = None - - return target, reference_points_unact.detach(), enc_out, init_out - - def _get_pred_class_and_mask(self, query_embed, mask_feat): - out_query = self.dec_norm(query_embed) - out_logits = self.class_head(out_query) - mask_query_embed = self.mask_query_head(out_query) - _, _, h, w = paddle.shape(mask_feat) - # [b, q, c] x [b, c, h, w] -> [b, q, h, w] - out_mask = paddle.bmm(mask_query_embed, mask_feat.flatten(2)).reshape( - [0, 0, h, w]) - return out_logits, out_mask diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/matchers.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/matchers.py deleted file mode 100644 index d8f85fc..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/matchers.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from scipy.optimize import linear_sum_assignment - -from ppdet.core.workspace import register, serializable -from ..losses.iou_loss import GIoULoss -from .utils import bbox_cxcywh_to_xyxy - -__all__ = ['HungarianMatcher'] - - -@register -@serializable -class HungarianMatcher(nn.Layer): - __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points'] - - def __init__(self, - matcher_coeff={ - 'class': 1, - 'bbox': 5, - 'giou': 2, - 'mask': 1, - 'dice': 1 - }, - use_focal_loss=False, - with_mask=False, - num_sample_points=12544, - alpha=0.25, - gamma=2.0): - r""" - Args: - matcher_coeff (dict): The coefficient of hungarian matcher cost. - """ - super(HungarianMatcher, self).__init__() - self.matcher_coeff = matcher_coeff - self.use_focal_loss = use_focal_loss - self.with_mask = with_mask - self.num_sample_points = num_sample_points - self.alpha = alpha - self.gamma = gamma - - self.giou_loss = GIoULoss() - - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None): - r""" - Args: - boxes (Tensor): [b, query, 4] - logits (Tensor): [b, query, num_classes] - gt_bbox (List(Tensor)): list[[n, 4]] - gt_class (List(Tensor)): list[[n, 1]] - masks (Tensor|None): [b, query, h, w] - gt_mask (List(Tensor)): list[[n, H, W]] - - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - bs, num_queries = boxes.shape[:2] - - num_gts = [len(a) for a in gt_class] - if sum(num_gts) == 0: - return [(paddle.to_tensor( - [], dtype=paddle.int64), paddle.to_tensor( - [], dtype=paddle.int64)) for _ in range(bs)] - - # We flatten to compute the cost matrices in a batch - # [batch_size * num_queries, num_classes] - logits = logits.detach() - out_prob = F.sigmoid(logits.flatten( - 0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1)) - # [batch_size * num_queries, 4] - out_bbox = boxes.detach().flatten(0, 1) - - # Also concat the target labels and boxes - tgt_ids = paddle.concat(gt_class).flatten() - tgt_bbox = paddle.concat(gt_bbox) - - # Compute the classification cost - out_prob = paddle.gather(out_prob, tgt_ids, axis=1) - if self.use_focal_loss: - neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-( - 1 - out_prob + 1e-8).log()) - pos_cost_class = self.alpha * ( - (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log()) - cost_class = pos_cost_class - neg_cost_class - else: - cost_class = -out_prob - - # Compute the L1 cost between boxes - cost_bbox = ( - out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1) - - # Compute the giou cost betwen boxes - giou_loss = self.giou_loss( - bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)), - bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1) - cost_giou = giou_loss - 1 - - # Final cost matrix - C = self.matcher_coeff['class'] * cost_class + \ - self.matcher_coeff['bbox'] * cost_bbox + \ - self.matcher_coeff['giou'] * cost_giou - # Compute the mask cost and dice cost - if self.with_mask: - assert (masks is not None and gt_mask is not None, - 'Make sure the input has `mask` and `gt_mask`') - # all masks share the same set of points 
for efficient matching - sample_points = paddle.rand([bs, 1, self.num_sample_points, 2]) - sample_points = 2.0 * sample_points - 1.0 - - out_mask = F.grid_sample( - masks.detach(), sample_points, align_corners=False).squeeze(-2) - out_mask = out_mask.flatten(0, 1) - - tgt_mask = paddle.concat(gt_mask).unsqueeze(1) - sample_points = paddle.concat([ - a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts) - if b > 0 - ]) - tgt_mask = F.grid_sample( - tgt_mask, sample_points, align_corners=False).squeeze([1, 2]) - - with paddle.amp.auto_cast(enable=False): - # binary cross entropy cost - pos_cost_mask = F.binary_cross_entropy_with_logits( - out_mask, paddle.ones_like(out_mask), reduction='none') - neg_cost_mask = F.binary_cross_entropy_with_logits( - out_mask, paddle.zeros_like(out_mask), reduction='none') - cost_mask = paddle.matmul( - pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul( - neg_cost_mask, 1 - tgt_mask, transpose_y=True) - cost_mask /= self.num_sample_points - - # dice cost - out_mask = F.sigmoid(out_mask) - numerator = 2 * paddle.matmul( - out_mask, tgt_mask, transpose_y=True) - denominator = out_mask.sum( - -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0) - cost_dice = 1 - (numerator + 1) / (denominator + 1) - - C = C + self.matcher_coeff['mask'] * cost_mask + \ - self.matcher_coeff['dice'] * cost_dice - - C = C.reshape([bs, num_queries, -1]) - C = [a.squeeze(0) for a in C.chunk(bs)] - sizes = [a.shape[0] for a in gt_bbox] - if hasattr(paddle.Tensor, "contiguous"): - indices = [ - linear_sum_assignment(c.split(sizes, -1)[i].contiguous().numpy()) - for i, c in enumerate(C) - ] - else: - indices = [ - linear_sum_assignment(c.split(sizes, -1)[i].numpy()) - for i, c in enumerate(C) - ] - return [(paddle.to_tensor( - i, dtype=paddle.int64), paddle.to_tensor( - j, dtype=paddle.int64)) for i, j in indices] diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/petr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/petr_transformer.py deleted file mode 100644 index 7859b0d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/petr_transformer.py +++ /dev/null @@ -1,1198 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
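The assignment step at the heart of HungarianMatcher above is scipy's linear_sum_assignment, whose return convention matches the (index_i, index_j) tuples documented in forward(). A minimal sketch, assuming a toy 3x2 cost matrix (the numbers are illustrative only):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    cost = np.array([[0.9, 0.1],
                     [0.4, 0.6],
                     [0.3, 0.7]])  # [num_queries, num_gt]
    row_ind, col_ind = linear_sum_assignment(cost)
    # row_ind -> selected predictions, col_ind -> matched targets;
    # len(row_ind) == min(num_queries, num_gt). Here the minimum-cost
    # pairing is query 0 -> gt 1 and query 2 -> gt 0.
    print(row_ind, col_ind)  # [0 2] [1 0]

The matcher builds this cost matrix from class, L1 box, and GIoU terms (plus mask and dice terms when with_mask is set) before handing it to the solver.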
-""" -this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/utils/transformer.py -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention, _convert_attention_mask -from .utils import _get_clones -from ..initializer import linear_init_, normal_, constant_, xavier_uniform_ - -__all__ = [ - 'PETRTransformer', 'MultiScaleDeformablePoseAttention', - 'PETR_TransformerDecoderLayer', 'PETR_TransformerDecoder', - 'PETR_DeformableDetrTransformerDecoder', - 'PETR_DeformableTransformerDecoder', 'TransformerEncoderLayer', - 'TransformerEncoder', 'MSDeformableAttention' -] - - -def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -def inverse_sigmoid(x, eps=1e-5): - """Inverse function of sigmoid. - - Args: - x (Tensor): The tensor to do the - inverse. - eps (float): EPS avoid numerical - overflow. Defaults 1e-5. - Returns: - Tensor: The x has passed the inverse - function of sigmoid, has same - shape with input. - """ - x = x.clip(min=0, max=1) - x1 = x.clip(min=eps) - x2 = (1 - x).clip(min=eps) - return paddle.log(x1 / x2) - - -@register -class TransformerEncoderLayer(nn.Layer): - __inject__ = ['attn'] - - def __init__(self, - d_model, - attn=None, - nhead=8, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerEncoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - self.embed_dims = d_model - - if attn is None: - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - else: - self.self_attn = attn - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None, **kwargs): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask, **kwargs) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -@register -class TransformerEncoder(nn.Layer): - __inject__ = ['encoder_layer'] - - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = 
_get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.embed_dims = encoder_layer.embed_dims - - def forward(self, src, src_mask=None, pos_embed=None, **kwargs): - output = src - for layer in self.layers: - output = layer( - output, src_mask=src_mask, pos_embed=pos_embed, **kwargs) - - if self.norm is not None: - output = self.norm(output) - - return output - - -@register -class MSDeformableAttention(nn.Layer): - def __init__(self, - embed_dim=256, - num_heads=8, - num_levels=4, - num_points=4, - lr_mult=0.1): - """ - Multi-Scale Deformable Attention Module - """ - super(MSDeformableAttention, self).__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.num_levels = num_levels - self.num_points = num_points - self.total_points = num_heads * num_levels * num_points - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - self.sampling_offsets = nn.Linear( - embed_dim, - self.total_points * 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - self.attention_weights = nn.Linear(embed_dim, self.total_points) - self.value_proj = nn.Linear(embed_dim, embed_dim) - self.output_proj = nn.Linear(embed_dim, embed_dim) - try: - # use cuda op - print("use deformable_detr_ops in ms_deformable_attn") - from deformable_detr_ops import ms_deformable_attn - except: - # use paddle func - from .utils import deformable_attention_core_func as ms_deformable_attn - self.ms_deformable_attn_core = ms_deformable_attn - - self._reset_parameters() - - def _reset_parameters(self): - # sampling_offsets - constant_(self.sampling_offsets.weight) - thetas = paddle.arange( - self.num_heads, - dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) - grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) - grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) - grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( - [1, self.num_levels, self.num_points, 1]) - scaling = paddle.arange( - 1, self.num_points + 1, - dtype=paddle.float32).reshape([1, 1, -1, 1]) - grid_init *= scaling - self.sampling_offsets.bias.set_value(grid_init.flatten()) - # attention_weights - constant_(self.attention_weights.weight) - constant_(self.attention_weights.bias) - # proj - xavier_uniform_(self.value_proj.weight) - constant_(self.value_proj.bias) - xavier_uniform_(self.output_proj.weight) - constant_(self.output_proj.bias) - - def forward(self, - query, - key, - value, - reference_points, - value_spatial_shapes, - value_level_start_index, - attn_mask=None, - **kwargs): - """ - Args: - query (Tensor): [bs, query_length, C] - reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area - value (Tensor): [bs, value_length, C] - value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] 
- attn_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, Len_q = query.shape[:2] - Len_v = value.shape[1] - assert int(value_spatial_shapes.prod(1).sum()) == Len_v - - value = self.value_proj(value) - if attn_mask is not None: - attn_mask = attn_mask.astype(value.dtype).unsqueeze(-1) - value *= attn_mask - value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) - - sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) - sampling_locations = reference_points.reshape([ - bs, Len_q, 1, self.num_levels, 1, 2 - ]) + sampling_offsets / offset_normalizer - elif reference_points.shape[-1] == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) - else: - raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". - format(reference_points.shape[-1])) - - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - output = self.output_proj(output) - - return output - - -@register -class MultiScaleDeformablePoseAttention(nn.Layer): - """An attention module used in PETR. `End-to-End Multi-Person - Pose Estimation with Transformers`. - - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. - num_heads (int): Parallel attention heads. Default: 8. - num_levels (int): The number of feature map used in - Attention. Default: 4. - num_points (int): The number of sampling points for - each query in each head. Default: 17. - im2col_step (int): The step used in image_to_column. - Default: 64. - dropout (float): A Dropout layer on `inp_residual`. - Default: 0.1. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=17, - im2col_step=64, - dropout=0.1, - norm_cfg=None, - init_cfg=None, - batch_first=False, - lr_mult=0.1): - super().__init__() - if embed_dims % num_heads != 0: - raise ValueError(f'embed_dims must be divisible by num_heads, ' - f'but got {embed_dims} and {num_heads}') - dim_per_head = embed_dims // num_heads - self.norm_cfg = norm_cfg - self.init_cfg = init_cfg - self.dropout = nn.Dropout(dropout) - self.batch_first = batch_first - - # you'd better set dim_per_head to a power of 2 - # which is more efficient in the CUDA implementation - def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError( - 'invalid input for _is_power_of_2: {} (type: {})'.format( - n, type(n))) - return (n & (n - 1) == 0) and n != 0 - - if not _is_power_of_2(dim_per_head): - warnings.warn("You'd better set embed_dims in " - 'MultiScaleDeformAttention to make ' - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = im2col_step - self.embed_dims = embed_dims - self.num_levels = num_levels - self.num_heads = num_heads - self.num_points = num_points - self.sampling_offsets = nn.Linear( - embed_dims, - num_heads * num_levels * num_points * 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - self.attention_weights = nn.Linear(embed_dims, - num_heads * num_levels * num_points) - self.value_proj = nn.Linear(embed_dims, embed_dims) - self.output_proj = nn.Linear(embed_dims, embed_dims) - - try: - # use cuda op - from deformable_detr_ops import ms_deformable_attn - except: - # use paddle func - from .utils import deformable_attention_core_func as ms_deformable_attn - self.ms_deformable_attn_core = ms_deformable_attn - - self.init_weights() - - def init_weights(self): - """Default initialization for Parameters of Module.""" - constant_(self.sampling_offsets.weight) - constant_(self.sampling_offsets.bias) - constant_(self.attention_weights.weight) - constant_(self.attention_weights.bias) - xavier_uniform_(self.value_proj.weight) - constant_(self.value_proj.bias) - xavier_uniform_(self.output_proj.weight) - constant_(self.output_proj.bias) - - def forward(self, - query, - key, - value, - residual=None, - attn_mask=None, - reference_points=None, - value_spatial_shapes=None, - value_level_start_index=None, - **kwargs): - """Forward Function of MultiScaleDeformAttention. - - Args: - query (Tensor): Query of Transformer with shape - (num_query, bs, embed_dims). - key (Tensor): The key tensor with shape (num_key, bs, embed_dims). - value (Tensor): The value tensor with shape - (num_key, bs, embed_dims). - residual (Tensor): The tensor used for addition, with the - same shape as `x`. Default None. If None, `x` will be used. - reference_points (Tensor): The normalized reference points with - shape (bs, num_query, num_levels, K*2), all elements is range - in [0, 1], top-left (0,0), bottom-right (1, 1), including - padding area. - attn_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - value_spatial_shapes (Tensor): Spatial shape of features in - different level. With shape (num_levels, 2), - last dimension represent (h, w). - value_level_start_index (Tensor): The start index of each level. - A tensor has shape (num_levels) and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
- """ - - if key is None: - key = query - if value is None: - value = key - - bs, num_query, _ = query.shape - bs, num_key, _ = value.shape - assert (value_spatial_shapes[:, 0].numpy() * - value_spatial_shapes[:, 1].numpy()).sum() == num_key - - value = self.value_proj(value) - if attn_mask is not None: - # value = value.masked_fill(attn_mask[..., None], 0.0) - value *= attn_mask.unsqueeze(-1) - value = value.reshape([bs, num_key, self.num_heads, -1]) - sampling_offsets = self.sampling_offsets(query).reshape([ - bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 - ]) - attention_weights = self.attention_weights(query).reshape( - [bs, num_query, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights, axis=-1) - - attention_weights = attention_weights.reshape( - [bs, num_query, self.num_heads, self.num_levels, self.num_points]) - if reference_points.shape[-1] == self.num_points * 2: - reference_points_reshape = reference_points.reshape( - (bs, num_query, self.num_levels, -1, 2)).unsqueeze(2) - x1 = reference_points[:, :, :, 0::2].min(axis=-1, keepdim=True) - y1 = reference_points[:, :, :, 1::2].min(axis=-1, keepdim=True) - x2 = reference_points[:, :, :, 0::2].max(axis=-1, keepdim=True) - y2 = reference_points[:, :, :, 1::2].max(axis=-1, keepdim=True) - w = paddle.clip(x2 - x1, min=1e-4) - h = paddle.clip(y2 - y1, min=1e-4) - wh = paddle.concat([w, h], axis=-1)[:, :, None, :, None, :] - - sampling_locations = reference_points_reshape \ - + sampling_offsets * wh * 0.5 - else: - raise ValueError( - f'Last dim of reference_points must be' - f' 2K, but get {reference_points.shape[-1]} instead.') - - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - - output = self.output_proj(output) - return output - - -@register -class PETR_TransformerDecoderLayer(nn.Layer): - __inject__ = ['self_attn', 'cross_attn'] - - def __init__(self, - d_model, - nhead=8, - self_attn=None, - cross_attn=None, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(PETR_TransformerDecoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - if self_attn is None: - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - else: - self.self_attn = self_attn - if cross_attn is None: - self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - else: - self.cross_attn = cross_attn - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - 
pos_embed=None, - query_pos_embed=None, - **kwargs): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - q = k = self.with_pos_embed(tgt, query_pos_embed) - tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm2(tgt) - q = self.with_pos_embed(tgt, query_pos_embed) - key_tmp = tgt - # k = self.with_pos_embed(memory, pos_embed) - tgt = self.cross_attn( - q, key=key_tmp, value=memory, attn_mask=memory_mask, **kwargs) - tgt = residual + self.dropout2(tgt) - if not self.normalize_before: - tgt = self.norm2(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm3(tgt) - tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = residual + self.dropout3(tgt) - if not self.normalize_before: - tgt = self.norm3(tgt) - return tgt - - -@register -class PETR_TransformerDecoder(nn.Layer): - """Implements the decoder in PETR transformer. - - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - __inject__ = ['decoder_layer'] - - def __init__(self, - decoder_layer, - num_layers, - norm=None, - return_intermediate=False, - num_keypoints=17, - **kwargs): - super(PETR_TransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.return_intermediate = return_intermediate - self.num_keypoints = num_keypoints - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - kpt_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - - Args: - query (Tensor): Input query with shape (num_query, bs, embed_dims). - reference_points (Tensor): The reference points of offset, - has shape (bs, num_query, K*2). - valid_ratios (Tensor): The radios of valid points on the feature - map, has shape (bs, num_levels, 2). - kpt_branches: (obj:`nn.LayerList`): Used for refining the - regression results. Only would be passed when `with_box_refine` - is True, otherwise would be passed a `None`. - - Returns: - tuple (Tensor): Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims] and - [num_layers, bs, num_query, K*2]. 
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == self.num_keypoints * 2: - reference_points_input = \ - reference_points[:, :, None] * \ - valid_ratios.tile((1, 1, self.num_keypoints))[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - - if kpt_branches is not None: - tmp = kpt_branches[lid](output) - if reference_points.shape[-1] == self.num_keypoints * 2: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = F.sigmoid(new_reference_points) - else: - raise NotImplementedError - reference_points = new_reference_points.detach() - - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return paddle.stack(intermediate), paddle.stack( - intermediate_reference_points) - - return output, reference_points - - -@register -class PETR_DeformableTransformerDecoder(nn.Layer): - __inject__ = ['decoder_layer'] - - def __init__(self, decoder_layer, num_layers, return_intermediate=False): - super(PETR_DeformableTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.return_intermediate = return_intermediate - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_mask=None, - query_pos_embed=None): - output = tgt - intermediate = [] - for lid, layer in enumerate(self.layers): - output = layer(output, reference_points, memory, - memory_spatial_shapes, memory_mask, query_pos_embed) - - if self.return_intermediate: - intermediate.append(output) - - if self.return_intermediate: - return paddle.stack(intermediate) - - return output.unsqueeze(0) - - -@register -class PETR_DeformableDetrTransformerDecoder(PETR_DeformableTransformerDecoder): - """Implements the decoder in DETR transformer. - - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, return_intermediate=False, **kwargs): - - super(PETR_DeformableDetrTransformerDecoder, self).__init__(*args, - **kwargs) - self.return_intermediate = return_intermediate - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - reg_branch: (obj:`nn.LayerList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. - - Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. 
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = reference_points[:, :, None] * \ - paddle.concat([valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - if reference_points.shape[-1] == 4: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = F.sigmoid(new_reference_points) - else: - assert reference_points.shape[-1] == 2 - new_reference_points = tmp - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points) - new_reference_points = F.sigmoid(new_reference_points) - reference_points = new_reference_points.detach() - - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return paddle.stack(intermediate), paddle.stack( - intermediate_reference_points) - - return output, reference_points - - -@register -class PETRTransformer(nn.Layer): - """Implements the PETR transformer. - - Args: - as_two_stage (bool): Generate query from encoder features. - Default: False. - num_feature_levels (int): Number of feature maps from FPN: - Default: 4. - two_stage_num_proposals (int): Number of proposals when set - `as_two_stage` as True. Default: 300. - """ - __inject__ = ["encoder", "decoder", "hm_encoder", "refine_decoder"] - - def __init__(self, - encoder="", - decoder="", - hm_encoder="", - refine_decoder="", - as_two_stage=True, - num_feature_levels=4, - two_stage_num_proposals=300, - num_keypoints=17, - **kwargs): - super(PETRTransformer, self).__init__(**kwargs) - self.as_two_stage = as_two_stage - self.num_feature_levels = num_feature_levels - self.two_stage_num_proposals = two_stage_num_proposals - self.num_keypoints = num_keypoints - self.encoder = encoder - self.decoder = decoder - self.embed_dims = self.encoder.embed_dims - self.hm_encoder = hm_encoder - self.refine_decoder = refine_decoder - self.init_layers() - self.init_weights() - - def init_layers(self): - """Initialize layers of the DeformableDetrTransformer.""" - #paddle.create_parameter - self.level_embeds = paddle.create_parameter( - (self.num_feature_levels, self.embed_dims), dtype="float32") - - if self.as_two_stage: - self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) - self.enc_output_norm = nn.LayerNorm(self.embed_dims) - self.refine_query_embedding = nn.Embedding(self.num_keypoints, - self.embed_dims * 2) - else: - self.reference_points = nn.Linear(self.embed_dims, - 2 * self.num_keypoints) - - def init_weights(self): - """Initialize the transformer weights.""" - for p in self.parameters(): - if p.rank() > 1: - xavier_uniform_(p) - if hasattr(p, 'bias') and p.bias is not None: - constant_(p.bais) - for m in self.sublayers(): - if isinstance(m, MSDeformableAttention): - m._reset_parameters() - for m in self.sublayers(): - if isinstance(m, MultiScaleDeformablePoseAttention): - m.init_weights() - if not self.as_two_stage: - xavier_uniform_(self.reference_points.weight) - constant_(self.reference_points.bias) - normal_(self.level_embeds) - normal_(self.refine_query_embedding.weight) - - def gen_encoder_output_proposals(self, memory, 
-        """Generate proposals from encoded memory.
-
-        Args:
-            memory (Tensor): The output of encoder, has shape
-                (bs, num_key, embed_dim). num_key is equal to the number of
-                points on the feature maps from all levels.
-            memory_padding_mask (Tensor): Padding mask for memory,
-                has shape (bs, num_key).
-            spatial_shapes (Tensor): The shape of all feature maps,
-                has shape (num_level, 2).
-
-        Returns:
-            tuple: A tuple of feature map and bbox prediction.
-
-                - output_memory (Tensor): The input of decoder, has shape
-                  (bs, num_key, embed_dim). num_key is equal to the number of
-                  points on the feature maps from all levels.
-                - output_proposals (Tensor): The normalized proposal
-                  after an inverse sigmoid, has shape (bs, num_keys, 4).
-        """
-
-        N, S, C = memory.shape
-        proposals = []
-        _cur = 0
-        for lvl, (H, W) in enumerate(spatial_shapes):
-            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].reshape(
-                [N, H, W, 1])
-            valid_H = paddle.sum(mask_flatten_[:, :, 0, 0], 1)
-            valid_W = paddle.sum(mask_flatten_[:, 0, :, 0], 1)
-
-            grid_y, grid_x = paddle.meshgrid(
-                paddle.linspace(
-                    0, H - 1, H, dtype="float32"),
-                paddle.linspace(
-                    0, W - 1, W, dtype="float32"))
-            grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)],
-                                 -1)
-
-            scale = paddle.concat(
-                [valid_W.unsqueeze(-1),
-                 valid_H.unsqueeze(-1)], 1).reshape([N, 1, 1, 2])
-            grid = (grid.unsqueeze(0).expand((N, -1, -1, -1)) + 0.5) / scale
-            proposal = grid.reshape([N, -1, 2])
-            proposals.append(proposal)
-            _cur += (H * W)
-        output_proposals = paddle.concat(proposals, 1)
-        output_proposals_valid = ((output_proposals > 0.01) &
-                                  (output_proposals < 0.99)).all(
-                                      -1, keepdim=True).astype("bool")
-        output_proposals = paddle.log(output_proposals / (1 - output_proposals))
-        output_proposals = masked_fill(
-            output_proposals, ~memory_padding_mask.astype("bool").unsqueeze(-1),
-            float('inf'))
-        output_proposals = masked_fill(output_proposals,
-                                       ~output_proposals_valid, float('inf'))
-
-        output_memory = memory
-        output_memory = masked_fill(
-            output_memory, ~memory_padding_mask.astype("bool").unsqueeze(-1),
-            float(0))
-        output_memory = masked_fill(output_memory, ~output_proposals_valid,
-                                    float(0))
-        output_memory = self.enc_output_norm(self.enc_output(output_memory))
-        return output_memory, output_proposals
-
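get_reference_points below seeds one reference at every cell center of every level, normalized by the valid portion of the padded feature map. A small numpy sketch for one assumed 2x3 level with a valid ratio of 1:

    import numpy as np

    H, W = 2, 3
    ref_y, ref_x = np.meshgrid(
        np.linspace(0.5, H - 0.5, H),
        np.linspace(0.5, W - 0.5, W), indexing='ij')
    ref = np.stack([ref_x.ravel() / W, ref_y.ravel() / H], -1)
    # x centers: [1/6, 1/2, 5/6]; y centers: [1/4, 3/4] -- all inside (0, 1)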
- """ - reference_points_list = [] - for lvl, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = paddle.meshgrid( - paddle.linspace( - 0.5, H - 0.5, H, dtype="float32"), - paddle.linspace( - 0.5, W - 0.5, W, dtype="float32")) - ref_y = ref_y.reshape( - (-1, ))[None] / (valid_ratios[:, None, lvl, 1] * H) - ref_x = ref_x.reshape( - (-1, ))[None] / (valid_ratios[:, None, lvl, 0] * W) - ref = paddle.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = paddle.concat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points - - def get_valid_ratio(self, mask): - """Get the valid radios of feature maps of all level.""" - _, H, W = mask.shape - valid_H = paddle.sum(mask[:, :, 0].astype('float'), 1) - valid_W = paddle.sum(mask[:, 0, :].astype('float'), 1) - valid_ratio_h = valid_H.astype('float') / H - valid_ratio_w = valid_W.astype('float') / W - valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) - return valid_ratio - - def get_proposal_pos_embed(self, - proposals, - num_pos_feats=128, - temperature=10000): - """Get the position embedding of proposal.""" - scale = 2 * math.pi - dim_t = paddle.arange(num_pos_feats, dtype="float32") - dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) - # N, L, 4 - proposals = F.sigmoid(proposals) * scale - # N, L, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # N, L, 4, 64, 2 - pos = paddle.stack( - (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), - axis=4).flatten(2) - return pos - - def forward(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - kpt_branches=None, - cls_branches=None): - """Forward function for `Transformer`. - - Args: - mlvl_feats (list(Tensor)): Input queries from different level. - Each element has shape [bs, embed_dims, h, w]. - mlvl_masks (list(Tensor)): The key_padding_mask from different - level used for encoder and decoder, each element has shape - [bs, h, w]. - query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. - mlvl_pos_embeds (list(Tensor)): The positional encoding - of feats from different level, has the shape - [bs, embed_dims, h, w]. - kpt_branches (obj:`nn.LayerList`): Keypoint Regression heads for - feature maps from each decoder layer. Only would be passed when - `with_box_refine` is Ture. Default to None. - cls_branches (obj:`nn.LayerList`): Classification heads for - feature maps from each decoder layer. Only would be passed when - `as_two_stage` is Ture. Default to None. - - Returns: - tuple[Tensor]: results of decoder containing the following tensor. - - - inter_states: Outputs from decoder. If - `return_intermediate_dec` is True output has shape \ - (num_dec_layers, bs, num_query, embed_dims), else has \ - shape (1, bs, num_query, embed_dims). - - init_reference_out: The initial value of reference \ - points, has shape (bs, num_queries, 4). - - inter_references_out: The internal value of reference \ - points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of proposals \ - generated from encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_kpt_unact: The regression results generated from \ - encoder's feature maps., has shape (batch, h*w, K*2). - Only would be returned when `as_two_stage` is True, \ - otherwise None. 
- """ - assert self.as_two_stage or query_embed is not None - - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed - ) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose((0, 2, 1)) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose((0, 2, 1)) - lvl_pos_embed = pos_embed + self.level_embeds[lvl].reshape( - [1, 1, -1]) - lvl_pos_embed_flatten.append(lvl_pos_embed) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = paddle.concat(feat_flatten, 1) - mask_flatten = paddle.concat(mask_flatten, 1) - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - spatial_shapes_cumsum = paddle.to_tensor( - np.array(spatial_shapes).prod(1).cumsum(0)) - spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") - level_start_index = paddle.concat((paddle.zeros( - (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) - valid_ratios = paddle.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - reference_points = \ - self.get_reference_points(spatial_shapes, - valid_ratios) - - memory = self.encoder( - src=feat_flatten, - pos_embed=lvl_pos_embed_flatten, - src_mask=mask_flatten, - value_spatial_shapes=spatial_shapes, - reference_points=reference_points, - value_level_start_index=level_start_index, - valid_ratios=valid_ratios) - - bs, _, c = memory.shape - - hm_proto = None - if self.training: - hm_memory = paddle.slice( - memory, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1]) - hm_pos_embed = paddle.slice( - lvl_pos_embed_flatten, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1]) - hm_mask = paddle.slice( - mask_flatten, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1]) - hm_reference_points = paddle.slice( - reference_points, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1])[:, :, :1, :] - - # official code make a mistake of pos_embed to pose_embed, which disable pos_embed - hm_memory = self.hm_encoder( - src=hm_memory, - pose_embed=hm_pos_embed, - src_mask=hm_mask, - value_spatial_shapes=spatial_shapes[[0]], - reference_points=hm_reference_points, - value_level_start_index=level_start_index[0], - valid_ratios=valid_ratios[:, :1, :]) - hm_memory = hm_memory.reshape((bs, spatial_shapes[0, 0], - spatial_shapes[0, 1], -1)) - hm_proto = (hm_memory, mlvl_masks[0]) - - if self.as_two_stage: - output_memory, output_proposals = \ - self.gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) - enc_outputs_class = cls_branches[self.decoder.num_layers]( - output_memory) - enc_outputs_kpt_unact = \ - kpt_branches[self.decoder.num_layers](output_memory) - enc_outputs_kpt_unact[..., 0::2] += output_proposals[..., 0:1] - enc_outputs_kpt_unact[..., 1::2] += output_proposals[..., 1:2] - - topk = self.two_stage_num_proposals - topk_proposals = paddle.topk( - enc_outputs_class[..., 0], topk, axis=1)[1].unsqueeze(-1) - - #paddle.take_along_axis 对应torch.gather - topk_kpts_unact = paddle.take_along_axis(enc_outputs_kpt_unact, - topk_proposals, 1) - topk_kpts_unact = topk_kpts_unact.detach() - - reference_points = F.sigmoid(topk_kpts_unact) - init_reference_out = reference_points - # learnable query and query_pos - query_pos, query = paddle.split( - query_embed, query_embed.shape[1] // c, axis=1) - query_pos = query_pos.unsqueeze(0).expand((bs, -1, 
-1)) - query = query.unsqueeze(0).expand((bs, -1, -1)) - else: - query_pos, query = paddle.split( - query_embed, query_embed.shape[1] // c, axis=1) - query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) - query = query.unsqueeze(0).expand((bs, -1, -1)) - reference_points = F.sigmoid(self.reference_points(query_pos)) - init_reference_out = reference_points - - # decoder - inter_states, inter_references = self.decoder( - query=query, - memory=memory, - query_pos_embed=query_pos, - memory_mask=mask_flatten, - reference_points=reference_points, - value_spatial_shapes=spatial_shapes, - value_level_start_index=level_start_index, - valid_ratios=valid_ratios, - kpt_branches=kpt_branches) - - inter_references_out = inter_references - if self.as_two_stage: - return inter_states, init_reference_out, \ - inter_references_out, enc_outputs_class, \ - enc_outputs_kpt_unact, hm_proto, memory - return inter_states, init_reference_out, \ - inter_references_out, None, None, None, None, None, hm_proto - - def forward_refine(self, - mlvl_masks, - memory, - reference_points_pose, - img_inds, - kpt_branches=None, - **kwargs): - mask_flatten = [] - spatial_shapes = [] - for lvl, mask in enumerate(mlvl_masks): - bs, h, w = mask.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - mask = mask.flatten(1) - mask_flatten.append(mask) - mask_flatten = paddle.concat(mask_flatten, 1) - spatial_shapes_cumsum = paddle.to_tensor( - np.array( - spatial_shapes, dtype='int64').prod(1).cumsum(0)) - spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") - level_start_index = paddle.concat((paddle.zeros( - (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) - valid_ratios = paddle.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - # pose refinement (17 queries corresponding to 17 keypoints) - # learnable query and query_pos - refine_query_embedding = self.refine_query_embedding.weight - query_pos, query = paddle.split(refine_query_embedding, 2, axis=1) - pos_num = reference_points_pose.shape[0] - query_pos = query_pos.unsqueeze(0).expand((pos_num, -1, -1)) - query = query.unsqueeze(0).expand((pos_num, -1, -1)) - reference_points = reference_points_pose.reshape( - (pos_num, reference_points_pose.shape[1] // 2, 2)) - pos_memory = memory[img_inds] - mask_flatten = mask_flatten[img_inds] - valid_ratios = valid_ratios[img_inds] - if img_inds.size == 1: - pos_memory = pos_memory.unsqueeze(0) - mask_flatten = mask_flatten.unsqueeze(0) - valid_ratios = valid_ratios.unsqueeze(0) - inter_states, inter_references = self.refine_decoder( - query=query, - memory=pos_memory, - query_pos_embed=query_pos, - memory_mask=mask_flatten, - reference_points=reference_points, - value_spatial_shapes=spatial_shapes, - value_level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=kpt_branches, - **kwargs) - # [num_decoder, num_query, bs, embed_dim] - - init_reference_out = reference_points - return inter_states, init_reference_out, inter_references diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/position_encoding.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/position_encoding.py deleted file mode 100644 index a2c3260..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/position_encoding.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register, serializable - - -@register -@serializable -class PositionEmbedding(nn.Layer): - def __init__(self, - num_pos_feats=128, - temperature=10000, - normalize=True, - scale=2 * math.pi, - embed_type='sine', - num_embeddings=50, - offset=0., - eps=1e-6): - super(PositionEmbedding, self).__init__() - assert embed_type in ['sine', 'learned'] - - self.embed_type = embed_type - self.offset = offset - self.eps = eps - if self.embed_type == 'sine': - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - self.scale = scale - elif self.embed_type == 'learned': - self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) - self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) - else: - raise ValueError(f"{self.embed_type} is not supported.") - - def forward(self, mask): - """ - Args: - mask (Tensor): [B, H, W] - Returns: - pos (Tensor): [B, H, W, C] - """ - if self.embed_type == 'sine': - y_embed = mask.cumsum(1) - x_embed = mask.cumsum(2) - if self.normalize: - y_embed = (y_embed + self.offset) / ( - y_embed[:, -1:, :] + self.eps) * self.scale - x_embed = (x_embed + self.offset) / ( - x_embed[:, :, -1:] + self.eps) * self.scale - - dim_t = 2 * (paddle.arange(self.num_pos_feats) // - 2).astype('float32') - dim_t = self.temperature**(dim_t / self.num_pos_feats) - - pos_x = x_embed.unsqueeze(-1) / dim_t - pos_y = y_embed.unsqueeze(-1) / dim_t - pos_x = paddle.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), - axis=4).flatten(3) - pos_y = paddle.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), - axis=4).flatten(3) - return paddle.concat((pos_y, pos_x), axis=3) - elif self.embed_type == 'learned': - h, w = mask.shape[-2:] - i = paddle.arange(w) - j = paddle.arange(h) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - return paddle.concat( - [ - x_emb.unsqueeze(0).tile([h, 1, 1]), - y_emb.unsqueeze(1).tile([1, w, 1]), - ], - axis=-1).unsqueeze(0) - else: - raise ValueError(f"not supported {self.embed_type}") diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/rtdetr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/rtdetr_transformer.py deleted file mode 100644 index f3d021f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/rtdetr_transformer.py +++ /dev/null @@ -1,557 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from ..heads.detr_head import MLP -from .deformable_transformer import MSDeformableAttention -from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, - bias_init_with_prob) -from .utils import (_get_clones, get_sine_pos_embed, - get_contrastive_denoising_training_group, inverse_sigmoid) - -__all__ = ['RTDETRTransformer'] - - -class PPMSDeformableAttention(MSDeformableAttention): - def forward(self, - query, - reference_points, - value, - value_spatial_shapes, - value_level_start_index, - value_mask=None): - """ - Args: - query (Tensor): [bs, query_length, C] - reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area - value (Tensor): [bs, value_length, C] - value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] - value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, Len_q = query.shape[:2] - Len_v = value.shape[1] - - value = self.value_proj(value) - if value_mask is not None: - value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - value *= value_mask - value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) - - sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = paddle.to_tensor(value_spatial_shapes) - offset_normalizer = offset_normalizer.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) - sampling_locations = reference_points.reshape([ - bs, Len_q, 1, self.num_levels, 1, 2 - ]) + sampling_offsets / offset_normalizer - elif reference_points.shape[-1] == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) - else: - raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) - - if not isinstance(query, paddle.Tensor): - from ppdet.modeling.transformers.utils import deformable_attention_core_func - output = deformable_attention_core_func( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - else: - value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) - value_level_start_index = paddle.to_tensor(value_level_start_index) - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - output = self.output_proj(output) - - return output - - -class TransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - weight_attr=None, - bias_attr=None): - super(TransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # cross attention - self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, - n_points, 1.0) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, - bias_attr) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, - bias_attr) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - if attn_mask is not None: - attn_mask = paddle.where( - attn_mask.astype('bool'), - paddle.zeros(attn_mask.shape, tgt.dtype), - paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # ffn - tgt2 = self.forward_ffn(tgt) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - - return tgt - - -class TransformerDecoder(nn.Layer): - def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): - super(TransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx - - def 
forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - score_head, - query_pos_head, - attn_mask=None, - memory_mask=None, - query_pos_head_inv_sig=False): - output = tgt - dec_out_bboxes = [] - dec_out_logits = [] - ref_points_detach = F.sigmoid(ref_points_unact) - for i, layer in enumerate(self.layers): - ref_points_input = ref_points_detach.unsqueeze(2) - if not query_pos_head_inv_sig: - query_pos_embed = query_pos_head(ref_points_detach) - else: - query_pos_embed = query_pos_head( - inverse_sigmoid(ref_points_detach)) - - output = layer(output, ref_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - - inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - ref_points_detach)) - - if self.training: - dec_out_logits.append(score_head[i](output)) - if i == 0: - dec_out_bboxes.append(inter_ref_bbox) - else: - dec_out_bboxes.append( - F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - ref_points))) - elif i == self.eval_idx: - dec_out_logits.append(score_head[i](output)) - dec_out_bboxes.append(inter_ref_bbox) - break - - ref_points = inter_ref_bbox - ref_points_detach = inter_ref_bbox.detach( - ) if self.training else inter_ref_bbox - - return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) - - -@register -class RTDETRTransformer(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=300, - position_embed_type='sine', - backbone_feat_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - num_levels=3, - num_decoder_points=4, - nhead=8, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=True, - query_pos_head_inv_sig=False, - eval_size=None, - eval_idx=-1, - eps=1e-2): - super(RTDETRTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
- assert len(backbone_feat_channels) <= num_levels - assert len(feat_strides) == len(backbone_feat_channels) - for _ in range(num_levels - len(feat_strides)): - feat_strides.append(feat_strides[-1] * 2) - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.feat_strides = feat_strides - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - self.eval_size = eval_size - - # backbone feature projection - self._build_input_proj_layer(backbone_feat_channels) - - # Transformer module - decoder_layer = TransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_decoder_points) - self.decoder = TransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers, eval_idx) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) - self.query_pos_head_inv_sig = query_pos_head_inv_sig - - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - - # decoder head - self.dec_score_head = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - linear_init_(self.enc_score_head) - constant_(self.enc_score_head.bias, bias_cls) - constant_(self.enc_bbox_head.layers[-1].weight) - constant_(self.enc_bbox_head.layers[-1].bias) - for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): - linear_init_(cls_) - constant_(cls_.bias, bias_cls) - constant_(reg_.layers[-1].weight) - constant_(reg_.layers[-1].bias) - - linear_init_(self.enc_output[0]) - xavier_uniform_(self.enc_output[0].weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - - # init encoder output anchors and valid_mask - if self.eval_size: - self.anchors, self.valid_mask = self._generate_anchors() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'backbone_feat_channels': [i.channels for i in input_shape]} - - def _build_input_proj_layer(self, backbone_feat_channels): - self.input_proj = nn.LayerList() - for in_channels in backbone_feat_channels: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=1, - bias_attr=False)), ('norm', nn.BatchNorm2D( - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = backbone_feat_channels[-1] - 
for _ in range(self.num_levels - len(backbone_feat_channels)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1, - bias_attr=False)), ('norm', nn.BatchNorm2D( - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats): - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - - # get encoder inputs - feat_flatten = [] - spatial_shapes = [] - level_start_index = [0, ] - for i, feat in enumerate(proj_feats): - _, _, h, w = feat.shape - # [b, c, h, w] -> [b, h*w, c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - # [num_levels, 2] - spatial_shapes.append([h, w]) - # [l], start index of each level - level_start_index.append(h * w + level_start_index[-1]) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - level_start_index.pop() - return (feat_flatten, spatial_shapes, level_start_index) - - def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): - # input projection and embedding - (memory, spatial_shapes, - level_start_index) = self._get_encoder_input(feats) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input( - memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) - - # decoder - out_bboxes, out_logits = self.decoder( - target, - init_ref_points_unact, - memory, - spatial_shapes, - level_start_index, - self.dec_bbox_head, - self.dec_score_head, - self.query_pos_head, - attn_mask=attn_mask, - memory_mask=None, - query_pos_head_inv_sig=self.query_pos_head_inv_sig) - return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) - - def _generate_anchors(self, - spatial_shapes=None, - grid_size=0.05, - dtype="float32"): - if spatial_shapes is None: - spatial_shapes = [ - [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] - for s in self.feat_strides - ] - anchors = [] - for lvl, (h, w) in enumerate(spatial_shapes): - grid_y, grid_x = paddle.meshgrid( - paddle.arange( - end=h, dtype=dtype), - paddle.arange( - end=w, dtype=dtype)) - grid_xy = paddle.stack([grid_x, grid_y], -1) - - valid_WH = paddle.to_tensor([h, w]).astype(dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - - anchors = paddle.concat(anchors, 1) - valid_mask = ((anchors > self.eps) * - (anchors < 1 - self.eps)).all(-1, keepdim=True) - anchors = paddle.log(anchors / (1 - anchors)) - anchors = paddle.where(valid_mask, anchors, - paddle.to_tensor(float("inf"))) - return anchors, valid_mask - - def _get_decoder_input(self, - memory, - spatial_shapes, 
- denoising_class=None, - denoising_bbox_unact=None, - is_teacher=False): - bs, _, _ = memory.shape - # prepare input for decoder - if self.training or self.eval_size is None or is_teacher: - anchors, valid_mask = self._generate_anchors(spatial_shapes) - else: - anchors, valid_mask = self.anchors, self.valid_mask - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - output_memory = self.enc_output(memory) - - enc_outputs_class = self.enc_score_head(output_memory) - enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors - - _, topk_ind = paddle.topk( - enc_outputs_class.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - - reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, - topk_ind) # unsigmoided. - enc_topk_bboxes = F.sigmoid(reference_points_unact) - if denoising_bbox_unact is not None: - reference_points_unact = paddle.concat( - [denoising_bbox_unact, reference_points_unact], 1) - if self.training: - reference_points_unact = reference_points_unact.detach() - enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) - - # extract region features - if self.learnt_init_query: - target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) - else: - target = paddle.gather_nd(output_memory, topk_ind) - if self.training: - target = target.detach() - if denoising_class is not None: - target = paddle.concat([denoising_class, target], 1) - - return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/utils.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/utils.py deleted file mode 100644 index a6f211a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/utils.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. 
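
# (editor's note) The _get_decoder_input logic deleted above seeds the decoder
# with the num_queries encoder positions whose best class score is highest,
# gathered per image with batched indices. A small self-contained sketch of
# that top-k gather pattern (all shapes are illustrative):
import paddle

bs, seq_len, num_classes, num_queries = 2, 100, 5, 10
enc_outputs_class = paddle.rand([bs, seq_len, num_classes])

# score each position by its best class, then keep the top num_queries
_, topk_ind = paddle.topk(enc_outputs_class.max(-1), num_queries, axis=1)
batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
batch_ind = batch_ind.unsqueeze(-1).tile([1, num_queries])
gather_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

topk_logits = paddle.gather_nd(enc_outputs_class, gather_ind)
assert topk_logits.shape == [bs, num_queries, num_classes]
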
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..bbox_utils import bbox_overlaps - -__all__ = [ - '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy', - 'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid', - 'deformable_attention_core_func', 'varifocal_loss_with_logits' -] - - -def _get_clones(module, N): - return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) - - -def bbox_cxcywh_to_xyxy(x): - cxcy, wh = paddle.split(x, 2, axis=-1) - return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) - - -def bbox_xyxy_to_cxcywh(x): - x1, y1, x2, y2 = x.split(4, axis=-1) - return paddle.concat( - [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1) - - -def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): - prob = F.sigmoid(logit) - ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none") - p_t = prob * label + (1 - prob) * (1 - label) - loss = ce_loss * ((1 - p_t)**gamma) - - if alpha >= 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - loss = alpha_t * loss - return loss.mean(1).sum() / normalizer - - -def inverse_sigmoid(x, eps=1e-5): - x = x.clip(min=0., max=1.) - return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps)) - - -def deformable_attention_core_func(value, value_spatial_shapes, - value_level_start_index, sampling_locations, - attention_weights): - """ - Args: - value (Tensor): [bs, value_length, n_head, c] - value_spatial_shapes (Tensor|List): [n_levels, 2] - value_level_start_index (Tensor|List): [n_levels] - sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] - attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, _, n_head, c = value.shape - _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape - - split_shape = [h * w for h, w in value_spatial_shapes] - value_list = value.split(split_shape, axis=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for level, (h, w) in enumerate(value_spatial_shapes): - # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = value_list[level].flatten(2).transpose( - [0, 2, 1]).reshape([bs * n_head, c, h, w]) - # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = sampling_grids[:, :, :, level].transpose( - [0, 2, 1, 3, 4]).flatten(0, 1) - # N_*M_, D_, Lq_, P_ - sampling_value_l_ = F.grid_sample( - value_l_, - sampling_grid_l_, - mode='bilinear', - padding_mode='zeros', - align_corners=False) - sampling_value_list.append(sampling_value_l_) - # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) - attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( - [bs * n_head, 1, Len_q, n_levels * n_points]) - output = (paddle.stack( - sampling_value_list, axis=-2).flatten(-2) * - attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) - - return output.transpose([0, 2, 1]) - - -def get_valid_ratio(mask): - _, H, W = paddle.shape(mask) - valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H - valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W - # [b, 2] - return paddle.stack([valid_ratio_w, valid_ratio_h], -1) - - -def get_denoising_training_group(targets, - num_classes, - num_queries, - class_embed, - num_denoising=100, - 
label_noise_ratio=0.5, - box_noise_scale=1.0): - if num_denoising <= 0: - return None, None, None, None - num_gts = [len(t) for t in targets["gt_class"]] - max_gt_num = max(num_gts) - if max_gt_num == 0: - return None, None, None, None - - num_group = num_denoising // max_gt_num - num_group = 1 if num_group == 0 else num_group - # pad gt to max_num of a batch - bs = len(targets["gt_class"]) - input_query_class = paddle.full( - [bs, max_gt_num], num_classes, dtype='int32') - input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) - pad_gt_mask = paddle.zeros([bs, max_gt_num]) - for i in range(bs): - num_gt = num_gts[i] - if num_gt > 0: - input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) - input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] - pad_gt_mask[i, :num_gt] = 1 - - input_query_class = input_query_class.tile([1, num_group]) - input_query_bbox = input_query_bbox.tile([1, num_group, 1]) - pad_gt_mask = pad_gt_mask.tile([1, num_group]) - - dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1] - dn_positive_idx = paddle.split(dn_positive_idx, - [n * num_group for n in num_gts]) - # total denoising queries - num_denoising = int(max_gt_num * num_group) - - if label_noise_ratio > 0: - input_query_class = input_query_class.flatten() - pad_gt_mask = pad_gt_mask.flatten() - # half of bbox prob - mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) - chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) - # randomly put a new one here - new_label = paddle.randint_like( - chosen_idx, 0, num_classes, dtype=input_query_class.dtype) - input_query_class.scatter_(chosen_idx, new_label) - input_query_class.reshape_([bs, num_denoising]) - pad_gt_mask.reshape_([bs, num_denoising]) - - if box_noise_scale > 0: - diff = paddle.concat( - [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], - axis=-1) * box_noise_scale - diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0) - input_query_bbox += diff - input_query_bbox = inverse_sigmoid(input_query_bbox) - - class_embed = paddle.concat( - [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) - input_query_class = paddle.gather( - class_embed, input_query_class.flatten(), - axis=0).reshape([bs, num_denoising, -1]) - - tgt_size = num_denoising + num_queries - attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 - # match query cannot see the reconstruction - attn_mask[num_denoising:, :num_denoising] = True - # reconstruct cannot see each other - for i in range(num_group): - if i == 0: - attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): - num_denoising] = True - if i == num_group - 1: - attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * - i] = True - else: - attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): - num_denoising] = True - attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * - i] = True - attn_mask = ~attn_mask - dn_meta = { - "dn_positive_idx": dn_positive_idx, - "dn_num_group": num_group, - "dn_num_split": [num_denoising, num_queries] - } - - return input_query_class, input_query_bbox, attn_mask, dn_meta - - -def get_contrastive_denoising_training_group(targets, - num_classes, - num_queries, - class_embed, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0): - if num_denoising <= 0: - return None, None, None, None - num_gts = [len(t) for t in targets["gt_class"]] - max_gt_num = max(num_gts) - if max_gt_num == 0: - return None, None, None, None - - num_group = num_denoising // max_gt_num - num_group = 1 if num_group == 0 else 
num_group - # pad gt to max_num of a batch - bs = len(targets["gt_class"]) - input_query_class = paddle.full( - [bs, max_gt_num], num_classes, dtype='int32') - input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) - pad_gt_mask = paddle.zeros([bs, max_gt_num]) - for i in range(bs): - num_gt = num_gts[i] - if num_gt > 0: - input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) - input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] - pad_gt_mask[i, :num_gt] = 1 - # each group has positive and negative queries. - input_query_class = input_query_class.tile([1, 2 * num_group]) - input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) - pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) - # positive and negative mask - negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) - negative_gt_mask[:, max_gt_num:] = 1 - negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) - positive_gt_mask = 1 - negative_gt_mask - # contrastive denoising training positive index - positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask - dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] - dn_positive_idx = paddle.split(dn_positive_idx, - [n * num_group for n in num_gts]) - # total denoising queries - num_denoising = int(max_gt_num * 2 * num_group) - - if label_noise_ratio > 0: - input_query_class = input_query_class.flatten() - pad_gt_mask = pad_gt_mask.flatten() - # half of bbox prob - mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) - chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) - # randomly put a new one here - new_label = paddle.randint_like( - chosen_idx, 0, num_classes, dtype=input_query_class.dtype) - input_query_class.scatter_(chosen_idx, new_label) - input_query_class.reshape_([bs, num_denoising]) - pad_gt_mask.reshape_([bs, num_denoising]) - - if box_noise_scale > 0: - known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) - - diff = paddle.tile(input_query_bbox[..., 2:] * 0.5, - [1, 1, 2]) * box_noise_scale - - rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 - rand_part = paddle.rand(input_query_bbox.shape) - rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( - 1 - negative_gt_mask) - rand_part *= rand_sign - known_bbox += rand_part * diff - known_bbox.clip_(min=0.0, max=1.0) - input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox) - input_query_bbox = inverse_sigmoid(input_query_bbox) - - class_embed = paddle.concat( - [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) - input_query_class = paddle.gather( - class_embed, input_query_class.flatten(), - axis=0).reshape([bs, num_denoising, -1]) - - tgt_size = num_denoising + num_queries - attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 - # match query cannot see the reconstruction - attn_mask[num_denoising:, :num_denoising] = True - # reconstruct cannot see each other - for i in range(num_group): - if i == 0: - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * - 2 * (i + 1):num_denoising] = True - if i == num_group - 1: - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * - i * 2] = True - else: - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * - 2 * (i + 1):num_denoising] = True - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * - 2 * i] = True - attn_mask = ~attn_mask - dn_meta = { - "dn_positive_idx": dn_positive_idx, - "dn_num_group": num_group, - "dn_num_split": [num_denoising, num_queries] - } - - return input_query_class, input_query_bbox, attn_mask, dn_meta - - -def 
get_sine_pos_embed(pos_tensor, - num_pos_feats=128, - temperature=10000, - exchange_xy=True): - """generate sine position embedding from a position tensor - - Args: - pos_tensor (Tensor): Shape as `(None, n)`. - num_pos_feats (int): projected shape for each float in the tensor. Default: 128 - temperature (int): The temperature used for scaling - the position embedding. Default: 10000. - exchange_xy (bool, optional): exchange pos x and pos y. \ - For example, input tensor is `[x, y]`, the results will # noqa - be `[pos(y), pos(x)]`. Defaults: True. - - Returns: - Tensor: Returned position embedding # noqa - with shape `(None, n * num_pos_feats)`. - """ - scale = 2. * math.pi - dim_t = 2. * paddle.floor_divide( - paddle.arange(num_pos_feats), paddle.to_tensor(2)) - dim_t = scale / temperature**(dim_t / num_pos_feats) - - def sine_func(x): - x *= dim_t - return paddle.stack( - (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2) - - pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] - if exchange_xy: - pos_res[0], pos_res[1] = pos_res[1], pos_res[0] - pos_res = paddle.concat(pos_res, axis=2) - return pos_res - - -def mask_to_box_coordinate(mask, - normalize=False, - format="xyxy", - dtype="float32"): - """ - Compute the bounding boxes around the provided mask. - Args: - mask (Tensor:bool): [b, c, h, w] - - Returns: - bbox (Tensor): [b, c, 4] - """ - assert mask.ndim == 4 - assert format in ["xyxy", "xywh"] - if mask.sum() == 0: - return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype) - - h, w = mask.shape[-2:] - y, x = paddle.meshgrid( - paddle.arange( - end=h, dtype=dtype), paddle.arange( - end=w, dtype=dtype)) - - x_mask = x * mask - x_max = x_mask.flatten(-2).max(-1) + 1 - x_min = paddle.where(mask, x_mask, - paddle.to_tensor(1e8)).flatten(-2).min(-1) - - y_mask = y * mask - y_max = y_mask.flatten(-2).max(-1) + 1 - y_min = paddle.where(mask, y_mask, - paddle.to_tensor(1e8)).flatten(-2).min(-1) - out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1) - if normalize: - out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype) - - return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox) - - -def varifocal_loss_with_logits(pred_logits, - gt_score, - label, - normalizer=1.0, - alpha=0.75, - gamma=2.0): - pred_score = F.sigmoid(pred_logits) - weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label - loss = F.binary_cross_entropy_with_logits( - pred_logits, gt_score, weight=weight, reduction='none') - return loss.mean(1).sum() / normalizer diff --git a/pdfdet/models/Paddle/ppdet/optimizer/__init__.py b/pdfdet/models/Paddle/ppdet/optimizer/__init__.py deleted file mode 100644 index aa690dc..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import optimizer -from . 
import ema - -from .optimizer import * -from .ema import * diff --git a/pdfdet/models/Paddle/ppdet/optimizer/adamw.py b/pdfdet/models/Paddle/ppdet/optimizer/adamw.py deleted file mode 100644 index 12ab619..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/adamw.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from paddle.optimizer import AdamW -from functools import partial -import re - -IS_PADDLE_LATER_2_4 = ( - int(paddle.version.major) >= 2 and - int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0 - - -def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): - """ - Args: - decay_rate (float): - The layer-wise decay ratio. - name_dict (dict): - The keys of name_dict is dynamic name of model while the value - of name_dict is static name. - Use model.named_parameters() to get name_dict. - n_layers (int): - Total number of layers in the transformer encoder. - """ - ratio = 1.0 - static_name = name_dict[param.name] - if 'blocks.' in static_name or 'layers.' in static_name: - idx_1 = static_name.find('blocks.') - idx_2 = static_name.find('layers.') - assert any([x >= 0 for x in [idx_1, idx_2]]), '' - idx = idx_1 if idx_1 >= 0 else idx_2 - # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0] - - layer = int(static_name[idx:].split('.')[1]) - ratio = decay_rate**(n_layers - layer) - - elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name: - ratio = decay_rate**(n_layers + 1) - - if IS_PADDLE_LATER_2_4: - return ratio - else: - param.optimize_attr['learning_rate'] *= ratio - - -class AdamWDL(AdamW): - r""" - The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. - Generally it's used for transformer model. - - We use "layerwise_lr_decay" as default dynamic lr setting method of AdamWDL. - “Layer-wise decay” means exponentially decaying the learning rates of individual - layers in a top-down manner. For example, suppose the 24-th layer uses a learning - rate l, and the Layer-wise decay rate is α, then the learning rate of layer m - is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. - - .. math:: - & t = t + 1 - - & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad - - & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad - - & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} - - & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) - - Args: - learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LRScheduler. The default value is 0.001. - beta1 (float, optional): The exponential decay rate for the 1st moment estimates. 
-            It should be a float number or a Tensor with shape [1] and data type as float32.
-            The default value is 0.9.
-        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
-            It should be a float number or a Tensor with shape [1] and data type as float32.
-            The default value is 0.999.
-        epsilon (float, optional): A small float value for numerical stability.
-            It should be a float number or a Tensor with shape [1] and data type as float32.
-            The default value is 1e-08.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, in which case all parameters will be updated.
-        weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
-        apply_decay_param_fun (function|None, optional): If it is not None,
-            only tensors that make apply_decay_param_fun(Tensor.name)==True
-            will be updated. It only works when we want to specify tensors.
-            Default: None.
-        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
-            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
-            The accumulators are updated at every step. Every element of the two moving-averages
-            is updated in both dense mode and sparse mode. If the size of a parameter is very large,
-            the update may be very slow. Lazy mode only updates the elements that have
-            gradients in the current mini-batch, so it will be much faster. But this mode has
-            different semantics from the original Adam algorithm and may lead to different results.
-            The default value is False.
-        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
-        layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0.
-        n_layers (int, optional): The total number of encoder layers. Defaults to 12.
-        set_param_lr_fun (function|None, optional): If it's not None, set_param_lr_fun() will set the parameter
-            learning rate before it executes the Adam operator. Defaults to :ref:`layerwise_lr_decay`.
-        name_dict (dict, optional): The keys of name_dict are the dynamic names of the model while the values
-            of name_dict are the static names. Use model.named_parameters() to get name_dict.
-        name (str, optional): Normally there is no need for the user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
-
-    Examples:
-        ..
code-block:: python - - import paddle - from paddlenlp.ops.optimizer import AdamWDL - def simple_lr_setting(decay_rate, name_dict, n_layers, param): - ratio = 1.0 - static_name = name_dict[param.name] - if "weight" in static_name: - ratio = decay_rate**0.5 - param.optimize_attr["learning_rate"] *= ratio - - linear = paddle.nn.Linear(10, 10) - - name_dict = dict() - for n, p in linear.named_parameters(): - name_dict[p.name] = n - - inp = paddle.rand([10,10], dtype="float32") - out = linear(inp) - loss = paddle.mean(out) - - adamwdl = AdamWDL( - learning_rate=1e-4, - parameters=linear.parameters(), - set_param_lr_fun=simple_lr_setting, - layerwise_decay=0.8, - name_dict=name_dict) - - loss.backward() - adamwdl.step() - adamwdl.clear_grad() - """ - - def __init__(self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameters=None, - weight_decay=0.01, - apply_decay_param_fun=None, - grad_clip=None, - lazy_mode=False, - multi_precision=False, - layerwise_decay=1.0, - n_layers=12, - set_param_lr_func=None, - name_dict=None, - name=None): - if not isinstance(layerwise_decay, float): - raise TypeError("coeff should be float or Tensor.") - self.layerwise_decay = layerwise_decay - self.n_layers = n_layers - self.set_param_lr_func = partial( - set_param_lr_func, layerwise_decay, name_dict, - n_layers) if set_param_lr_func is not None else set_param_lr_func - - if IS_PADDLE_LATER_2_4: - super(AdamWDL, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - apply_decay_param_fun=apply_decay_param_fun, - weight_decay=weight_decay, - lazy_mode=lazy_mode, - multi_precision=multi_precision, - lr_ratio=self.set_param_lr_func) - else: - super(AdamWDL, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - apply_decay_param_fun=apply_decay_param_fun, - weight_decay=weight_decay, - lazy_mode=lazy_mode, - multi_precision=multi_precision) - - -def _append_optimize_op(self, block, param_and_grad): - if self.set_param_lr_func is None: - return super(AdamWDL, self)._append_optimize_op(block, param_and_grad) - - self._append_decoupled_weight_decay(block, param_and_grad) - prev_lr = param_and_grad[0].optimize_attr["learning_rate"] - self.set_param_lr_func(param_and_grad[0]) - # excute Adam op - res = super(AdamW, self)._append_optimize_op(block, param_and_grad) - param_and_grad[0].optimize_attr["learning_rate"] = prev_lr - return res - - -if not IS_PADDLE_LATER_2_4: - AdamWDL._append_optimize_op = _append_optimize_op - - -def build_adamwdl(model, - lr=1e-4, - weight_decay=0.05, - betas=(0.9, 0.999), - layer_decay=0.65, - num_layers=None, - filter_bias_and_bn=True, - skip_decay_names=None, - set_param_lr_func='layerwise_lr_decay'): - - if skip_decay_names and filter_bias_and_bn: - decay_dict = { - param.name: not (len(param.shape) == 1 or name.endswith('.bias') or - any([_n in name for _n in skip_decay_names])) - for name, param in model.named_parameters() - } - parameters = [p for p in model.parameters()] - - else: - parameters = model.parameters() - - opt_args = dict( - parameters=parameters, learning_rate=lr, weight_decay=weight_decay) - - if decay_dict is not None: - opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n] - - if isinstance(set_param_lr_func, str): - set_param_lr_func = eval(set_param_lr_func) - opt_args['set_param_lr_func'] = set_param_lr_func - - opt_args['beta1'] = 
betas[0]
-    opt_args['beta2'] = betas[1]
-
-    opt_args['layerwise_decay'] = layer_decay
-    name_dict = {p.name: n for n, p in model.named_parameters()}
-
-    opt_args['name_dict'] = name_dict
-    opt_args['n_layers'] = num_layers
-
-    optimizer = AdamWDL(**opt_args)
-
-    return optimizer
diff --git a/pdfdet/models/Paddle/ppdet/optimizer/ema.py b/pdfdet/models/Paddle/ppdet/optimizer/ema.py
deleted file mode 100644
index 84cc9ac..0000000
--- a/pdfdet/models/Paddle/ppdet/optimizer/ema.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import paddle
-import weakref
-from copy import deepcopy
-
-from .utils import get_bn_running_state_names
-
-__all__ = ['ModelEMA', 'SimpleModelEMA']
-
-
-class ModelEMA(object):
-    """
-    Exponential Weighted Average for Deep Neural Networks
-    Args:
-        model (nn.Layer): The detector model.
-        decay (float): The decay used for updating the EMA parameters.
-            EMA parameters are updated with the formula:
-            `ema_param = decay * ema_param + (1 - decay) * cur_param`.
-            Default is 0.9998.
-        ema_decay_type (str): type in ['threshold', 'normal', 'exponential'],
-            'threshold' as default.
-        cycle_epoch (int): The interval, in epochs, at which ema_param and
-            step are reset. Default is -1, which means never reset. It adds
-            a regularizing effect to the EMA; the value is chosen empirically
-            and is effective when the total number of training epochs is large.
-        ema_black_list (set|list|tuple, optional): The custom EMA black list.
-            Blacklist of weight names that will not participate in the EMA
-            calculation. Default: None.
- """ - - def __init__(self, - model, - decay=0.9998, - ema_decay_type='threshold', - cycle_epoch=-1, - ema_black_list=None, - ema_filter_no_grad=False): - self.step = 0 - self.epoch = 0 - self.decay = decay - self.ema_decay_type = ema_decay_type - self.cycle_epoch = cycle_epoch - self.ema_black_list = self._match_ema_black_list( - model.state_dict().keys(), ema_black_list) - bn_states_names = get_bn_running_state_names(model) - if ema_filter_no_grad: - for n, p in model.named_parameters(): - if p.stop_gradient and n not in bn_states_names: - self.ema_black_list.add(n) - - self.state_dict = dict() - for k, v in model.state_dict().items(): - if k in self.ema_black_list: - self.state_dict[k] = v - else: - self.state_dict[k] = paddle.zeros_like(v, dtype='float32') - - self._model_state = { - k: weakref.ref(p) - for k, p in model.state_dict().items() - } - - def reset(self): - self.step = 0 - self.epoch = 0 - for k, v in self.state_dict.items(): - if k in self.ema_black_list: - self.state_dict[k] = v - else: - self.state_dict[k] = paddle.zeros_like(v) - - def resume(self, state_dict, step=0): - for k, v in state_dict.items(): - if k in self.state_dict: - if self.state_dict[k].dtype == v.dtype: - self.state_dict[k] = v - else: - self.state_dict[k] = v.astype(self.state_dict[k].dtype) - self.step = step - - def update(self, model=None): - if self.ema_decay_type == 'threshold': - decay = min(self.decay, (1 + self.step) / (10 + self.step)) - elif self.ema_decay_type == 'exponential': - decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000)) - else: - decay = self.decay - self._decay = decay - - if model is not None: - model_dict = model.state_dict() - else: - model_dict = {k: p() for k, p in self._model_state.items()} - assert all( - [v is not None for _, v in model_dict.items()]), 'python gc.' - - for k, v in self.state_dict.items(): - if k not in self.ema_black_list: - v = decay * v + (1 - decay) * model_dict[k].astype('float32') - v.stop_gradient = True - self.state_dict[k] = v - self.step += 1 - - def apply(self): - if self.step == 0: - return self.state_dict - state_dict = dict() - model_dict = {k: p() for k, p in self._model_state.items()} - for k, v in self.state_dict.items(): - if k in self.ema_black_list: - v.stop_gradient = True - state_dict[k] = v - else: - if self.ema_decay_type != 'exponential': - v = v / (1 - self._decay**self.step) - v = v.astype(model_dict[k].dtype) - v.stop_gradient = True - state_dict[k] = v - self.epoch += 1 - if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: - self.reset() - - return state_dict - - def _match_ema_black_list(self, weight_name, ema_black_list=None): - out_list = set() - if ema_black_list: - for name in weight_name: - for key in ema_black_list: - if key in name: - out_list.add(name) - return out_list - - -class SimpleModelEMA(object): - """ - Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models - Keep a moving average of everything in the model state_dict (parameters and buffers). - This is intended to allow functionality like - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage - A smoothed version of the weights is necessary for some training schemes to perform well. - This class is sensitive where it is initialized in the sequence of model init, - GPU assignment and distributed training wrappers. - """ - - def __init__(self, model=None, decay=0.9996): - """ - Args: - model (nn.Module): model to apply EMA. - decay (float): ema decay reate. 
- """ - self.model = deepcopy(model) - self.decay = decay - - def update(self, model, decay=None): - if decay is None: - decay = self.decay - - with paddle.no_grad(): - state = {} - msd = model.state_dict() - for k, v in self.model.state_dict().items(): - if paddle.is_floating_point(v): - v *= decay - v += (1.0 - decay) * msd[k].detach() - state[k] = v - self.model.set_state_dict(state) - - def resume(self, state_dict, step=0): - state = {} - msd = state_dict - for k, v in self.model.state_dict().items(): - if paddle.is_floating_point(v): - v = msd[k].detach() - state[k] = v - self.model.set_state_dict(state) - self.step = step diff --git a/pdfdet/models/Paddle/ppdet/optimizer/optimizer.py b/pdfdet/models/Paddle/ppdet/optimizer/optimizer.py deleted file mode 100644 index 3c528fc..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/optimizer.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys -import math -import paddle -import paddle.nn as nn - -import paddle.optimizer as optimizer -import paddle.regularizer as regularizer - -from ppdet.core.workspace import register, serializable -import copy - -from .adamw import AdamWDL, build_adamwdl - -__all__ = ['LearningRate', 'OptimizerBuilder'] - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@serializable -class CosineDecay(object): - """ - Cosine learning rate decay - - Args: - max_epochs (int): max epochs for the training process. - if you commbine cosine decay with warmup, it is recommended that - the max_iters is much larger than the warmup iter - use_warmup (bool): whether to use warmup. Default: True. - min_lr_ratio (float): minimum learning rate ratio. Default: 0. - last_plateau_epochs (int): use minimum learning rate in - the last few epochs. Default: 0. 
- """ - - def __init__(self, - max_epochs=1000, - use_warmup=True, - min_lr_ratio=0., - last_plateau_epochs=0): - self.max_epochs = max_epochs - self.use_warmup = use_warmup - self.min_lr_ratio = min_lr_ratio - self.last_plateau_epochs = last_plateau_epochs - - def __call__(self, - base_lr=None, - boundary=None, - value=None, - step_per_epoch=None): - assert base_lr is not None, "either base LR or values should be provided" - - max_iters = self.max_epochs * int(step_per_epoch) - last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch) - min_lr = base_lr * self.min_lr_ratio - if boundary is not None and value is not None and self.use_warmup: - # use warmup - warmup_iters = len(boundary) - for i in range(int(boundary[-1]), max_iters): - boundary.append(i) - if i < max_iters - last_plateau_iters: - decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( - (i - warmup_iters) * math.pi / - (max_iters - warmup_iters - last_plateau_iters)) + 1) - value.append(decayed_lr) - else: - value.append(min_lr) - return optimizer.lr.PiecewiseDecay(boundary, value) - elif last_plateau_iters > 0: - # not use warmup, but set `last_plateau_epochs` > 0 - boundary = [] - value = [] - for i in range(max_iters): - if i < max_iters - last_plateau_iters: - decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( - i * math.pi / (max_iters - last_plateau_iters)) + 1) - value.append(decayed_lr) - else: - value.append(min_lr) - if i > 0: - boundary.append(i) - return optimizer.lr.PiecewiseDecay(boundary, value) - - return optimizer.lr.CosineAnnealingDecay( - base_lr, T_max=max_iters, eta_min=min_lr) - - -@serializable -class PiecewiseDecay(object): - """ - Multi step learning rate decay - - Args: - gamma (float | list): decay factor - milestones (list): steps at which to decay learning rate - """ - - def __init__(self, - gamma=[0.1, 0.01], - milestones=[8, 11], - values=None, - use_warmup=True): - super(PiecewiseDecay, self).__init__() - if type(gamma) is not list: - self.gamma = [] - for i in range(len(milestones)): - self.gamma.append(gamma / 10**i) - else: - self.gamma = gamma - self.milestones = milestones - self.values = values - self.use_warmup = use_warmup - - def __call__(self, - base_lr=None, - boundary=None, - value=None, - step_per_epoch=None): - if boundary is not None and self.use_warmup: - boundary.extend([int(step_per_epoch) * i for i in self.milestones]) - else: - # do not use LinearWarmup - boundary = [int(step_per_epoch) * i for i in self.milestones] - value = [base_lr] # during step[0, boundary[0]] is base_lr - - # self.values is setted directly in config - if self.values is not None: - assert len(self.milestones) + 1 == len(self.values) - return optimizer.lr.PiecewiseDecay(boundary, self.values) - - # value is computed by self.gamma - value = value if value is not None else [base_lr] - for i in self.gamma: - value.append(base_lr * i) - - return optimizer.lr.PiecewiseDecay(boundary, value) - - -@serializable -class LinearWarmup(object): - """ - Warm up learning rate linearly - - Args: - steps (int): warm up steps - start_factor (float): initial learning rate factor - epochs (int|None): use epochs as warm up steps, the priority - of `epochs` is higher than `steps`. Default: None. - """ - - def __init__(self, steps=500, start_factor=1. 
/ 3, epochs=None, epochs_first=True): - super(LinearWarmup, self).__init__() - self.steps = steps - self.start_factor = start_factor - self.epochs = epochs - self.epochs_first = epochs_first - - def __call__(self, base_lr, step_per_epoch): - boundary = [] - value = [] - if self.epochs_first and self.epochs is not None: - warmup_steps = self.epochs * step_per_epoch - else: - warmup_steps = self.steps - warmup_steps = max(warmup_steps, 1) - for i in range(warmup_steps + 1): - if warmup_steps > 0: - alpha = i / warmup_steps - factor = self.start_factor * (1 - alpha) + alpha - lr = base_lr * factor - value.append(lr) - if i > 0: - boundary.append(i) - return boundary, value - - -@serializable -class ExpWarmup(object): - """ - Warm up learning rate in exponential mode - Args: - steps (int): warm up steps. - epochs (int|None): use epochs as warm up steps, the priority - of `epochs` is higher than `steps`. Default: None. - power (int): Exponential coefficient. Default: 2. - """ - - def __init__(self, steps=1000, epochs=None, power=2): - super(ExpWarmup, self).__init__() - self.steps = steps - self.epochs = epochs - self.power = power - - def __call__(self, base_lr, step_per_epoch): - boundary = [] - value = [] - warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps - warmup_steps = max(warmup_steps, 1) - for i in range(warmup_steps + 1): - factor = (i / float(warmup_steps))**self.power - value.append(base_lr * factor) - if i > 0: - boundary.append(i) - return boundary, value - - -@register -class LearningRate(object): - """ - Learning Rate configuration - - Args: - base_lr (float): base learning rate - schedulers (list): learning rate schedulers - """ - __category__ = 'optim' - - def __init__(self, - base_lr=0.01, - schedulers=[PiecewiseDecay(), LinearWarmup()]): - super(LearningRate, self).__init__() - self.base_lr = base_lr - self.schedulers = [] - - schedulers = copy.deepcopy(schedulers) - for sched in schedulers: - if isinstance(sched, dict): - # support dict sched instantiate - module = sys.modules[__name__] - type = sched.pop("name") - scheduler = getattr(module, type)(**sched) - self.schedulers.append(scheduler) - else: - self.schedulers.append(sched) - - def __call__(self, step_per_epoch): - assert len(self.schedulers) >= 1 - if not self.schedulers[0].use_warmup: - return self.schedulers[0](base_lr=self.base_lr, - step_per_epoch=step_per_epoch) - - # TODO: split warmup & decay - # warmup - boundary, value = self.schedulers[1](self.base_lr, step_per_epoch) - # decay - decay_lr = self.schedulers[0](self.base_lr, boundary, value, - step_per_epoch) - return decay_lr - - -@register -class OptimizerBuilder(): - """ - Build optimizer handles - Args: - regularizer (object): an `Regularizer` instance - optimizer (object): an `Optimizer` instance - """ - __category__ = 'optim' - - def __init__(self, - clip_grad_by_norm=None, - clip_grad_by_value=None, - regularizer={'type': 'L2', - 'factor': .0001}, - optimizer={'type': 'Momentum', - 'momentum': .9}): - self.clip_grad_by_norm = clip_grad_by_norm - self.clip_grad_by_value = clip_grad_by_value - self.regularizer = regularizer - self.optimizer = optimizer - - def __call__(self, learning_rate, model=None): - if self.clip_grad_by_norm is not None: - grad_clip = nn.ClipGradByGlobalNorm( - clip_norm=self.clip_grad_by_norm) - elif self.clip_grad_by_value is not None: - var = abs(self.clip_grad_by_value) - grad_clip = nn.ClipGradByValue(min=-var, max=var) - else: - grad_clip = None - if self.regularizer and self.regularizer 
!= 'None': - reg_type = self.regularizer['type'] + 'Decay' - reg_factor = self.regularizer['factor'] - regularization = getattr(regularizer, reg_type)(reg_factor) - else: - regularization = None - - optim_args = self.optimizer.copy() - optim_type = optim_args['type'] - del optim_args['type'] - - if optim_type == 'AdamWDL': - return build_adamwdl(model, lr=learning_rate, **optim_args) - - if optim_type != 'AdamW': - optim_args['weight_decay'] = regularization - - op = getattr(optimizer, optim_type) - - if 'param_groups' in optim_args: - assert isinstance(optim_args['param_groups'], list), '' - - param_groups = optim_args.pop('param_groups') - - params, visited = [], [] - for group in param_groups: - assert isinstance(group, - dict) and 'params' in group and isinstance( - group['params'], list), '' - _params = { - n: p - for n, p in model.named_parameters() - if any([k in n - for k in group['params']]) and p.trainable is True - } - _group = group.copy() - _group.update({'params': list(_params.values())}) - - params.append(_group) - visited.extend(list(_params.keys())) - - ext_params = [ - p for n, p in model.named_parameters() - if n not in visited and p.trainable is True - ] - - if len(ext_params) < len(model.parameters()): - params.append({'params': ext_params}) - - elif len(ext_params) > len(model.parameters()): - raise RuntimeError - - else: - _params = model.parameters() - params = [param for param in _params if param.trainable is True] - - return op(learning_rate=learning_rate, - parameters=params, - grad_clip=grad_clip, - **optim_args) diff --git a/pdfdet/models/Paddle/ppdet/optimizer/utils.py b/pdfdet/models/Paddle/ppdet/optimizer/utils.py deleted file mode 100644 index ce2de49..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn - -from typing import List - - -def get_bn_running_state_names(model: nn.Layer) -> List[str]: - """Get all bn state full names including running mean and variance - """ - names = [] - for n, m in model.named_sublayers(): - if isinstance(m, (nn.BatchNorm2D, nn.SyncBatchNorm)): - assert hasattr(m, '_mean'), f'assert {m} has _mean' - assert hasattr(m, '_variance'), f'assert {m} has _variance' - running_mean = f'{n}._mean' - running_var = f'{n}._variance' - names.extend([running_mean, running_var]) - - return names diff --git a/pdfdet/models/Paddle/ppdet/slim/__init__.py b/pdfdet/models/Paddle/ppdet/slim/__init__.py deleted file mode 100644 index 7129190..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/__init__.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import distill_loss -from . import distill_model -from . import ofa -from . import prune -from . import quant -from . import unstructured_prune - -from .distill_loss import * -from .distill_model import * -from .ofa import * -from .prune import * -from .quant import * -from .unstructured_prune import * - -import yaml -from ppdet.core.workspace import load_config -from ppdet.utils.checkpoint import load_pretrain_weight - - -def build_slim_model(cfg, slim_cfg, mode='train'): - with open(slim_cfg) as f: - slim_load_cfg = yaml.load(f, Loader=yaml.Loader) - - if mode != 'train' and slim_load_cfg['slim'] == 'Distill': - return cfg - - if slim_load_cfg['slim'] == 'Distill': - if "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "FGD": - model = FGDDistillModel(cfg, slim_cfg) - elif "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "LD": - model = LDDistillModel(cfg, slim_cfg) - elif "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "CWD": - model = CWDDistillModel(cfg, slim_cfg) - elif "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "PPYOLOEDistill": - model = PPYOLOEDistillModel(cfg, slim_cfg) - else: - # common distillation model - model = DistillModel(cfg, slim_cfg) - cfg['model'] = model - cfg['slim_type'] = cfg.slim - elif slim_load_cfg['slim'] == 'OFA': - load_config(slim_cfg) - model = create(cfg.architecture) - load_pretrain_weight(model, cfg.weights) - slim = create(cfg.slim) - cfg['slim'] = slim - cfg['model'] = slim(model, model.state_dict()) - cfg['slim_type'] = cfg.slim - elif slim_load_cfg['slim'] == 'DistillPrune': - if mode == 'train': - model = DistillModel(cfg, slim_cfg) - pruner = create(cfg.pruner) - pruner(model.student_model) - else: - model = create(cfg.architecture) - weights = cfg.weights - load_config(slim_cfg) - pruner = create(cfg.pruner) - model = pruner(model) - load_pretrain_weight(model, weights) - cfg['model'] = model - cfg['slim_type'] = cfg.slim - elif slim_load_cfg['slim'] == 'PTQ': - model = create(cfg.architecture) - load_config(slim_cfg) - load_pretrain_weight(model, cfg.weights) - slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - cfg['slim'] = slim - cfg['model'] = slim(model) - elif slim_load_cfg['slim'] == 'UnstructuredPruner': - load_config(slim_cfg) - slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - cfg['slim'] = slim - cfg['unstructured_prune'] = True - else: - load_config(slim_cfg) - model = create(cfg.architecture) - if mode == 'train': - load_pretrain_weight(model, cfg.pretrain_weights) - slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - # TODO: fix quant export model in framework. 
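        # (editor's note) A sketch of the export-time workaround the TODO
        # above refers to, applied by the lines just below: when exporting a
        # QAT model in test mode, activation pre-processing is switched off
        # before the model is wrapped. The starting value 'PACT' is an
        # assumption for illustration only:
        #
        #     quant_config = {'activation_preprocess_type': 'PACT'}
        #     if mode == 'test':
        #         quant_config['activation_preprocess_type'] = None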
- if mode == 'test' and 'QAT' in slim_load_cfg['slim']: - slim.quant_config['activation_preprocess_type'] = None - cfg['model'] = slim(model) - cfg['slim'] = slim - if mode != 'train': - load_pretrain_weight(cfg['model'], cfg.weights) - - return cfg diff --git a/pdfdet/models/Paddle/ppdet/slim/distill_loss.py b/pdfdet/models/Paddle/ppdet/slim/distill_loss.py deleted file mode 100644 index d325a5b..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/distill_loss.py +++ /dev/null @@ -1,919 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr - -from ppdet.core.workspace import register -from ppdet.modeling import ops -from ppdet.modeling.losses.iou_loss import GIoULoss -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'DistillYOLOv3Loss', - 'KnowledgeDistillationKLDivLoss', - 'DistillPPYOLOELoss', - 'FGDFeatureLoss', - 'CWDFeatureLoss', - 'PKDFeatureLoss', - 'MGDFeatureLoss', -] - - -def parameter_init(mode="kaiming", value=0.): - if mode == "kaiming": - weight_attr = paddle.nn.initializer.KaimingUniform() - elif mode == "constant": - weight_attr = paddle.nn.initializer.Constant(value=value) - else: - weight_attr = paddle.nn.initializer.KaimingUniform() - - weight_init = ParamAttr(initializer=weight_attr) - return weight_init - - -def feature_norm(feat): - # Normalize the feature maps to have zero mean and unit variances. 
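    # (editor's note) Concretely, for every channel c this computes, over all
    # N * H * W positions of that channel:
    #     feat[:, c] = (feat[:, c] - mean_c) / (std_c + 1e-6)
    # where the 1e-6 keeps the division stable for near-constant channels.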
- assert len(feat.shape) == 4 - N, C, H, W = feat.shape - feat = feat.transpose([1, 0, 2, 3]).reshape([C, -1]) - mean = feat.mean(axis=-1, keepdim=True) - std = feat.std(axis=-1, keepdim=True) - feat = (feat - mean) / (std + 1e-6) - return feat.reshape([C, N, H, W]).transpose([1, 0, 2, 3]) - - -@register -class DistillYOLOv3Loss(nn.Layer): - def __init__(self, weight=1000): - super(DistillYOLOv3Loss, self).__init__() - self.loss_weight = weight - - def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj): - loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx)) - loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty)) - loss_w = paddle.abs(sw - tw) - loss_h = paddle.abs(sh - th) - loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h]) - weighted_loss = paddle.mean(loss * F.sigmoid(tobj)) - return weighted_loss - - def obj_weighted_cls(self, scls, tcls, tobj): - loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls)) - weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj))) - return weighted_loss - - def obj_loss(self, sobj, tobj): - obj_mask = paddle.cast(tobj > 0., dtype="float32") - obj_mask.stop_gradient = True - loss = paddle.mean( - ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask)) - return loss - - def forward(self, teacher_model, student_model): - teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs - student_distill_pairs = student_model.yolo_head.loss.distill_pairs - distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], [] - for s_pair, t_pair in zip(student_distill_pairs, teacher_distill_pairs): - distill_reg_loss.append( - self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[ - 3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4])) - distill_cls_loss.append( - self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4])) - distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4])) - distill_reg_loss = paddle.add_n(distill_reg_loss) - distill_cls_loss = paddle.add_n(distill_cls_loss) - distill_obj_loss = paddle.add_n(distill_obj_loss) - loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss - ) * self.loss_weight - return loss - - -@register -class KnowledgeDistillationKLDivLoss(nn.Layer): - """Loss function for knowledge distilling using KL divergence. - - Args: - reduction (str): Options are `'none'`, `'mean'` and `'sum'`. - loss_weight (float): Loss weight of current loss. - T (int): Temperature for distillation. - """ - - def __init__(self, reduction='mean', loss_weight=1.0, T=10): - super(KnowledgeDistillationKLDivLoss, self).__init__() - assert reduction in ('none', 'mean', 'sum') - assert T >= 1 - self.reduction = reduction - self.loss_weight = loss_weight - self.T = T - - def knowledge_distillation_kl_div_loss(self, - pred, - soft_label, - T, - detach_target=True): - r"""Loss function for knowledge distilling using KL divergence. - - Args: - pred (Tensor): Predicted logits with shape (N, n + 1). - soft_label (Tensor): Target logits with shape (N, N + 1). - T (int): Temperature for distillation. - detach_target (bool): Remove soft_label from automatic differentiation - """ - assert pred.shape == soft_label.shape - target = F.softmax(soft_label / T, axis=1) - if detach_target: - target = target.detach() - - kd_loss = F.kl_div( - F.log_softmax( - pred / T, axis=1), target, reduction='none').mean(1) * (T * T) - - return kd_loss - - def forward(self, - pred, - soft_label, - weight=None, - avg_factor=None, - reduction_override=None): - """Forward function. 
- - Args: - pred (Tensor): Predicted logits with shape (N, n + 1). - soft_label (Tensor): Target logits with shape (N, N + 1). - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - - reduction = (reduction_override - if reduction_override else self.reduction) - - loss_kd_out = self.knowledge_distillation_kl_div_loss( - pred, soft_label, T=self.T) - - if weight is not None: - loss_kd_out = weight * loss_kd_out - - if avg_factor is None: - if reduction == 'none': - loss = loss_kd_out - elif reduction == 'mean': - loss = loss_kd_out.mean() - elif reduction == 'sum': - loss = loss_kd_out.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - loss = loss_kd_out.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError( - 'avg_factor can not be used with reduction="sum"') - - loss_kd = self.loss_weight * loss - return loss_kd - - -@register -class DistillPPYOLOELoss(nn.Layer): - def __init__( - self, - loss_weight={'logits': 4.0, - 'feat': 1.0}, - logits_distill=True, - logits_loss_weight={'class': 1.0, - 'iou': 2.5, - 'dfl': 0.5}, - logits_ld_distill=False, - logits_ld_params={'weight': 20000, - 'T': 10}, - feat_distill=True, - feat_distiller='fgd', - feat_distill_place='neck_feats', - teacher_width_mult=1.0, # L - student_width_mult=0.75, # M - feat_out_channels=[768, 384, 192]): - super(DistillPPYOLOELoss, self).__init__() - self.loss_weight_logits = loss_weight['logits'] - self.loss_weight_feat = loss_weight['feat'] - self.logits_distill = logits_distill - self.logits_ld_distill = logits_ld_distill - self.feat_distill = feat_distill - - if logits_distill and self.loss_weight_logits > 0: - self.bbox_loss_weight = logits_loss_weight['iou'] - self.dfl_loss_weight = logits_loss_weight['dfl'] - self.qfl_loss_weight = logits_loss_weight['class'] - self.loss_bbox = GIoULoss() - - if logits_ld_distill: - self.loss_kd = KnowledgeDistillationKLDivLoss( - loss_weight=logits_ld_params['weight'], T=logits_ld_params['T']) - - if feat_distill and self.loss_weight_feat > 0: - assert feat_distiller in ['cwd', 'fgd', 'pkd', 'mgd', 'mimic'] - assert feat_distill_place in ['backbone_feats', 'neck_feats'] - self.feat_distill_place = feat_distill_place - self.t_channel_list = [ - int(c * teacher_width_mult) for c in feat_out_channels - ] - self.s_channel_list = [ - int(c * student_width_mult) for c in feat_out_channels - ] - self.distill_feat_loss_modules = [] - for i in range(len(feat_out_channels)): - if feat_distiller == 'cwd': - feat_loss_module = CWDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True) - elif feat_distiller == 'fgd': - feat_loss_module = FGDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True, - alpha_fgd=0.00001, - beta_fgd=0.000005, - gamma_fgd=0.00001, - lambda_fgd=0.00000005) - elif feat_distiller == 'pkd': - feat_loss_module = PKDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True, - resize_stu=True) - elif feat_distiller == 'mgd': - 
feat_loss_module = MGDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True, - loss_func='ssim') - elif feat_distiller == 'mimic': - feat_loss_module = MimicFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True) - else: - raise ValueError - self.distill_feat_loss_modules.append(feat_loss_module) - - def quality_focal_loss(self, - pred_logits, - soft_target_logits, - beta=2.0, - use_sigmoid=False, - num_total_pos=None): - if use_sigmoid: - func = F.binary_cross_entropy_with_logits - soft_target = F.sigmoid(soft_target_logits) - pred_sigmoid = F.sigmoid(pred_logits) - preds = pred_logits - else: - func = F.binary_cross_entropy - soft_target = soft_target_logits - pred_sigmoid = pred_logits - preds = pred_sigmoid - - scale_factor = pred_sigmoid - soft_target - loss = func( - preds, soft_target, reduction='none') * scale_factor.abs().pow(beta) - loss = loss.sum(1) - - if num_total_pos is not None: - loss = loss.sum() / num_total_pos - else: - loss = loss.mean() - return loss - - def bbox_loss(self, s_bbox, t_bbox, weight_targets=None): - # [x,y,w,h] - if weight_targets is not None: - loss = paddle.sum(self.loss_bbox(s_bbox, t_bbox) * weight_targets) - avg_factor = weight_targets.sum() - loss = loss / avg_factor - else: - loss = paddle.mean(self.loss_bbox(s_bbox, t_bbox)) - return loss - - def distribution_focal_loss(self, - pred_corners, - target_corners, - weight_targets=None): - target_corners_label = F.softmax(target_corners, axis=-1) - loss_dfl = F.cross_entropy( - pred_corners, - target_corners_label, - soft_label=True, - reduction='none') - loss_dfl = loss_dfl.sum(1) - - if weight_targets is not None: - loss_dfl = loss_dfl * (weight_targets.expand([-1, 4]).reshape([-1])) - loss_dfl = loss_dfl.sum(-1) / weight_targets.sum() - else: - loss_dfl = loss_dfl.mean(-1) - return loss_dfl / 4.0 # 4 direction - - def main_kd(self, mask_positive, pred_scores, soft_cls, num_classes): - num_pos = mask_positive.sum() - if num_pos > 0: - cls_mask = mask_positive.unsqueeze(-1).tile([1, 1, num_classes]) - pred_scores_pos = paddle.masked_select( - pred_scores, cls_mask).reshape([-1, num_classes]) - soft_cls_pos = paddle.masked_select( - soft_cls, cls_mask).reshape([-1, num_classes]) - loss_kd = self.loss_kd( - pred_scores_pos, soft_cls_pos, avg_factor=num_pos) - else: - loss_kd = paddle.zeros([1]) - return loss_kd - - def forward(self, teacher_model, student_model): - teacher_distill_pairs = teacher_model.yolo_head.distill_pairs - student_distill_pairs = student_model.yolo_head.distill_pairs - if self.logits_distill and self.loss_weight_logits > 0: - distill_bbox_loss, distill_dfl_loss, distill_cls_loss = [], [], [] - - distill_cls_loss.append( - self.quality_focal_loss( - student_distill_pairs['pred_cls_scores'].reshape( - (-1, student_distill_pairs['pred_cls_scores'].shape[-1] - )), - teacher_distill_pairs['pred_cls_scores'].detach().reshape( - (-1, teacher_distill_pairs['pred_cls_scores'].shape[-1] - )), - num_total_pos=student_distill_pairs['pos_num'], - use_sigmoid=False)) - - distill_bbox_loss.append( - self.bbox_loss(student_distill_pairs['pred_bboxes_pos'], - teacher_distill_pairs['pred_bboxes_pos'].detach(), - weight_targets=student_distill_pairs['bbox_weight'] - ) if 'pred_bboxes_pos' in student_distill_pairs and \ - 'pred_bboxes_pos' in teacher_distill_pairs and \ - 'bbox_weight' in student_distill_pairs - else paddle.zeros([1])) - - distill_dfl_loss.append( - 
self.distribution_focal_loss( - student_distill_pairs['pred_dist_pos'].reshape((-1, student_distill_pairs['pred_dist_pos'].shape[-1])), - teacher_distill_pairs['pred_dist_pos'].detach().reshape((-1, teacher_distill_pairs['pred_dist_pos'].shape[-1])), \ - weight_targets=student_distill_pairs['bbox_weight'] - ) if 'pred_dist_pos' in student_distill_pairs and \ - 'pred_dist_pos' in teacher_distill_pairs and \ - 'bbox_weight' in student_distill_pairs - else paddle.zeros([1])) - - distill_cls_loss = paddle.add_n(distill_cls_loss) - distill_bbox_loss = paddle.add_n(distill_bbox_loss) - distill_dfl_loss = paddle.add_n(distill_dfl_loss) - logits_loss = distill_bbox_loss * self.bbox_loss_weight + distill_cls_loss * self.qfl_loss_weight + distill_dfl_loss * self.dfl_loss_weight - - if self.logits_ld_distill: - loss_kd = self.main_kd( - student_distill_pairs['mask_positive_select'], - student_distill_pairs['pred_cls_scores'], - teacher_distill_pairs['pred_cls_scores'], - student_model.yolo_head.num_classes, ) - logits_loss += loss_kd - else: - logits_loss = paddle.zeros([1]) - - if self.feat_distill and self.loss_weight_feat > 0: - feat_loss_list = [] - inputs = student_model.inputs - assert 'gt_bbox' in inputs - assert self.feat_distill_place in student_distill_pairs - assert self.feat_distill_place in teacher_distill_pairs - stu_feats = student_distill_pairs[self.feat_distill_place] - tea_feats = teacher_distill_pairs[self.feat_distill_place] - for i, loss_module in enumerate(self.distill_feat_loss_modules): - feat_loss_list.append( - loss_module(stu_feats[i], tea_feats[i], inputs)) - feat_loss = paddle.add_n(feat_loss_list) - else: - feat_loss = paddle.zeros([1]) - - student_model.yolo_head.distill_pairs.clear() - teacher_model.yolo_head.distill_pairs.clear() - return logits_loss * self.loss_weight_logits, feat_loss * self.loss_weight_feat - - -@register -class CWDFeatureLoss(nn.Layer): - def __init__(self, - student_channels, - teacher_channels, - normalize=False, - tau=1.0, - weight=1.0): - super(CWDFeatureLoss, self).__init__() - self.normalize = normalize - self.tau = tau - self.loss_weight = weight - - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0) - else: - self.align = None - - def distill_softmax(self, x, tau): - _, _, w, h = paddle.shape(x) - x = paddle.reshape(x, [-1, w * h]) - x /= tau - return F.softmax(x, axis=1) - - def forward(self, preds_s, preds_t, inputs=None): - assert preds_s.shape[-2:] == preds_t.shape[-2:] - N, C, H, W = preds_s.shape - eps = 1e-5 - if self.align is not None: - preds_s = self.align(preds_s) - - if self.normalize: - preds_s = feature_norm(preds_s) - preds_t = feature_norm(preds_t) - - softmax_pred_s = self.distill_softmax(preds_s, self.tau) - softmax_pred_t = self.distill_softmax(preds_t, self.tau) - - loss = paddle.sum(-softmax_pred_t * paddle.log(eps + softmax_pred_s) + - softmax_pred_t * paddle.log(eps + softmax_pred_t)) - return self.loss_weight * loss / (C * N) - - -@register -class FGDFeatureLoss(nn.Layer): - """ - Focal and Global Knowledge Distillation for Detectors - The code is reference from https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py - - Args: - student_channels (int): The number of channels in the student's FPN feature map. Default to 256. - teacher_channels (int): The number of channels in the teacher's FPN feature map. Default to 256. - normalize (bool): Whether to normalize the feature maps. 
- temp (float, optional): The temperature coefficient. Defaults to 0.5. - alpha_fgd (float, optional): The weight of fg_loss. Defaults to 0.001 - beta_fgd (float, optional): The weight of bg_loss. Defaults to 0.0005 - gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001 - lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005 - """ - - def __init__(self, - student_channels, - teacher_channels, - normalize=False, - loss_weight=1.0, - temp=0.5, - alpha_fgd=0.001, - beta_fgd=0.0005, - gamma_fgd=0.001, - lambda_fgd=0.000005): - super(FGDFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - self.temp = temp - self.alpha_fgd = alpha_fgd - self.beta_fgd = beta_fgd - self.gamma_fgd = gamma_fgd - self.lambda_fgd = lambda_fgd - kaiming_init = parameter_init("kaiming") - zeros_init = parameter_init("constant", 0.0) - - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=kaiming_init) - student_channels = teacher_channels - else: - self.align = None - - self.conv_mask_s = nn.Conv2D( - student_channels, 1, kernel_size=1, weight_attr=kaiming_init) - self.conv_mask_t = nn.Conv2D( - teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init) - - self.stu_conv_block = nn.Sequential( - nn.Conv2D( - student_channels, - student_channels // 2, - kernel_size=1, - weight_attr=zeros_init), - nn.LayerNorm([student_channels // 2, 1, 1]), - nn.ReLU(), - nn.Conv2D( - student_channels // 2, - student_channels, - kernel_size=1, - weight_attr=zeros_init)) - self.tea_conv_block = nn.Sequential( - nn.Conv2D( - teacher_channels, - teacher_channels // 2, - kernel_size=1, - weight_attr=zeros_init), - nn.LayerNorm([teacher_channels // 2, 1, 1]), - nn.ReLU(), - nn.Conv2D( - teacher_channels // 2, - teacher_channels, - kernel_size=1, - weight_attr=zeros_init)) - - def spatial_channel_attention(self, x, t=0.5): - shape = paddle.shape(x) - N, C, H, W = shape - _f = paddle.abs(x) - spatial_map = paddle.reshape( - paddle.mean( - _f, axis=1, keepdim=True) / t, [N, -1]) - spatial_map = F.softmax(spatial_map, axis=1, dtype="float32") * H * W - spatial_att = paddle.reshape(spatial_map, [N, H, W]) - - channel_map = paddle.mean( - paddle.mean( - _f, axis=2, keepdim=False), axis=2, keepdim=False) - channel_att = F.softmax(channel_map / t, axis=1, dtype="float32") * C - return [spatial_att, channel_att] - - def spatial_pool(self, x, mode="teacher"): - batch, channel, width, height = x.shape - x_copy = x - x_copy = paddle.reshape(x_copy, [batch, channel, height * width]) - x_copy = x_copy.unsqueeze(1) - if mode.lower() == "student": - context_mask = self.conv_mask_s(x) - else: - context_mask = self.conv_mask_t(x) - - context_mask = paddle.reshape(context_mask, [batch, 1, height * width]) - context_mask = F.softmax(context_mask, axis=2) - context_mask = context_mask.unsqueeze(-1) - context = paddle.matmul(x_copy, context_mask) - context = paddle.reshape(context, [batch, channel, 1, 1]) - return context - - def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att, - tea_spatial_att): - def _func(a, b): - return paddle.sum(paddle.abs(a - b)) / len(a) - - mask_loss = _func(stu_channel_att, tea_channel_att) + _func( - stu_spatial_att, tea_spatial_att) - return mask_loss - - def feature_loss(self, stu_feature, tea_feature, mask_fg, mask_bg, - tea_channel_att, tea_spatial_att): - mask_fg = mask_fg.unsqueeze(axis=1) - mask_bg = 
mask_bg.unsqueeze(axis=1) - tea_channel_att = tea_channel_att.unsqueeze(axis=-1).unsqueeze(axis=-1) - tea_spatial_att = tea_spatial_att.unsqueeze(axis=1) - - fea_t = paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att)) - fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att)) - fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_fg)) - bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_bg)) - - fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att)) - fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att)) - fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_fg)) - bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_bg)) - - fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction="sum") / len(mask_fg) - bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction="sum") / len(mask_bg) - return fg_loss, bg_loss - - def relation_loss(self, stu_feature, tea_feature): - context_s = self.spatial_pool(stu_feature, "student") - context_t = self.spatial_pool(tea_feature, "teacher") - out_s = stu_feature + self.stu_conv_block(context_s) - out_t = tea_feature + self.tea_conv_block(context_t) - rela_loss = F.mse_loss(out_s, out_t, reduction="sum") / len(out_s) - return rela_loss - - def mask_value(self, mask, xl, xr, yl, yr, value): - mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value) - return mask - - def forward(self, stu_feature, tea_feature, inputs): - assert stu_feature.shape[-2:] == stu_feature.shape[-2:] - assert "gt_bbox" in inputs.keys() and "im_shape" in inputs.keys() - gt_bboxes = inputs['gt_bbox'] - ins_shape = [ - inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0]) - ] - index_gt = [] - for i in range(len(gt_bboxes)): - if gt_bboxes[i].size > 2: - index_gt.append(i) - # only distill feature with labeled GTbox - if len(index_gt) != len(gt_bboxes): - index_gt_t = paddle.to_tensor(index_gt) - stu_feature = paddle.index_select(stu_feature, index_gt_t) - tea_feature = paddle.index_select(tea_feature, index_gt_t) - - ins_shape = [ins_shape[c] for c in index_gt] - gt_bboxes = [gt_bboxes[c] for c in index_gt] - assert len(gt_bboxes) == tea_feature.shape[0] - - if self.align is not None: - stu_feature = self.align(stu_feature) - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - tea_spatial_att, tea_channel_att = self.spatial_channel_attention( - tea_feature, self.temp) - stu_spatial_att, stu_channel_att = self.spatial_channel_attention( - stu_feature, self.temp) - - mask_fg = paddle.zeros(tea_spatial_att.shape) - mask_bg = paddle.ones_like(tea_spatial_att) - one_tmp = paddle.ones([*tea_spatial_att.shape[1:]]) - zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]]) - mask_fg.stop_gradient = True - mask_bg.stop_gradient = True - one_tmp.stop_gradient = True - zero_tmp.stop_gradient = True - - wmin, wmax, hmin, hmax = [], [], [], [] - - if len(gt_bboxes) == 0: - loss = self.relation_loss(stu_feature, tea_feature) - return self.lambda_fgd * loss - - N, _, H, W = stu_feature.shape - for i in range(N): - tmp_box = paddle.ones_like(gt_bboxes[i]) - tmp_box.stop_gradient = True - tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W - tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W - tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H - tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H - - zero = paddle.zeros_like(tmp_box[:, 0], dtype="int32") - ones = paddle.ones_like(tmp_box[:, 2], dtype="int32") - zero.stop_gradient = True - ones.stop_gradient = True - wmin.append( - 
paddle.cast(paddle.floor(tmp_box[:, 0]), "int32").maximum(zero)) - wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]), "int32")) - hmin.append( - paddle.cast(paddle.floor(tmp_box[:, 1]), "int32").maximum(zero)) - hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), "int32")) - - area_recip = 1.0 / ( - hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / ( - wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1])) - - for j in range(len(gt_bboxes[i])): - if gt_bboxes[i][j].sum() > 0: - mask_fg[i] = self.mask_value( - mask_fg[i], hmin[i][j], hmax[i][j] + 1, wmin[i][j], - wmax[i][j] + 1, area_recip[0][j]) - - mask_bg[i] = paddle.where(mask_fg[i] > zero_tmp, zero_tmp, one_tmp) - - if paddle.sum(mask_bg[i]): - mask_bg[i] /= paddle.sum(mask_bg[i]) - - fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, mask_fg, - mask_bg, tea_channel_att, - tea_spatial_att) - mask_loss = self.mask_loss(stu_channel_att, tea_channel_att, - stu_spatial_att, tea_spatial_att) - rela_loss = self.relation_loss(stu_feature, tea_feature) - loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \ - + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss - return loss * self.loss_weight - - -@register -class PKDFeatureLoss(nn.Layer): - """ - PKD: General Distillation Framework for Object Detectors via Pearson Correlation Coefficient. - - Args: - loss_weight (float): Weight of loss. Defaults to 1.0. - resize_stu (bool): If True, we'll down/up sample the features of the - student model to the spatial size of those of the teacher model if - their spatial sizes are different. And vice versa. Defaults to - True. - """ - - def __init__(self, - student_channels=256, - teacher_channels=256, - normalize=True, - loss_weight=1.0, - resize_stu=True): - super(PKDFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - self.resize_stu = resize_stu - - def forward(self, stu_feature, tea_feature, inputs=None): - size_s, size_t = stu_feature.shape[2:], tea_feature.shape[2:] - if size_s[0] != size_t[0]: - if self.resize_stu: - stu_feature = F.interpolate( - stu_feature, size_t, mode='bilinear') - else: - tea_feature = F.interpolate( - tea_feature, size_s, mode='bilinear') - assert stu_feature.shape == tea_feature.shape - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - loss = F.mse_loss(stu_feature, tea_feature) / 2 - return loss * self.loss_weight - - -@register -class MimicFeatureLoss(nn.Layer): - def __init__(self, - student_channels=256, - teacher_channels=256, - normalize=True, - loss_weight=1.0): - super(MimicFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - self.mse_loss = nn.MSELoss() - - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0) - else: - self.align = None - - def forward(self, stu_feature, tea_feature, inputs=None): - if self.align is not None: - stu_feature = self.align(stu_feature) - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - loss = self.mse_loss(stu_feature, tea_feature) - return loss * self.loss_weight - - -@register -class MGDFeatureLoss(nn.Layer): - def __init__(self, - student_channels=256, - teacher_channels=256, - normalize=True, - loss_weight=1.0, - loss_func='mse'): - super(MGDFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - assert loss_func in 
['mse', 'ssim'] - self.loss_func = loss_func - self.mse_loss = nn.MSELoss(reduction='sum') - self.ssim_loss = SSIM(11) - - kaiming_init = parameter_init("kaiming") - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=kaiming_init, - bias_attr=False) - else: - self.align = None - - self.generation = nn.Sequential( - nn.Conv2D( - teacher_channels, teacher_channels, kernel_size=3, padding=1), - nn.ReLU(), - nn.Conv2D( - teacher_channels, teacher_channels, kernel_size=3, padding=1)) - - def forward(self, stu_feature, tea_feature, inputs=None): - N = stu_feature.shape[0] - if self.align is not None: - stu_feature = self.align(stu_feature) - stu_feature = self.generation(stu_feature) - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - if self.loss_func == 'mse': - loss = self.mse_loss(stu_feature, tea_feature) / N - elif self.loss_func == 'ssim': - ssim_loss = self.ssim_loss(stu_feature, tea_feature) - loss = paddle.clip((1 - ssim_loss) / 2, 0, 1) - else: - raise ValueError - return loss * self.loss_weight - - -class SSIM(nn.Layer): - def __init__(self, window_size=11, size_average=True): - super(SSIM, self).__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = self.create_window(window_size, self.channel) - - def gaussian(self, window_size, sigma): - gauss = paddle.to_tensor([ - math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) - for x in range(window_size) - ]) - return gauss / gauss.sum() - - def create_window(self, window_size, channel): - _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) - window = _2D_window.expand([channel, 1, window_size, window_size]) - return window - - def _ssim(self, img1, img2, window, window_size, channel, - size_average=True): - mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) - mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1 * mu2 - - sigma1_sq = F.conv2d( - img1 * img1, window, padding=window_size // 2, - groups=channel) - mu1_sq - sigma2_sq = F.conv2d( - img2 * img2, window, padding=window_size // 2, - groups=channel) - mu2_sq - sigma12 = F.conv2d( - img1 * img2, window, padding=window_size // 2, - groups=channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( - 1e-12 + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - else: - return ssim_map.mean([1, 2, 3]) - - def forward(self, img1, img2): - channel = img1.shape[1] - if channel == self.channel and self.window.dtype == img1.dtype: - window = self.window - else: - window = self.create_window(self.window_size, channel) - self.window = window - self.channel = channel - - return self._ssim(img1, img2, window, self.window_size, channel, - self.size_average) diff --git a/pdfdet/models/Paddle/ppdet/slim/distill_model.py b/pdfdet/models/Paddle/ppdet/slim/distill_model.py deleted file mode 100644 index 4fa3ccc..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/distill_model.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
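
As an aside on the SSIM layer above: stripped of the Gaussian window and per-channel grouping, the structural-similarity index reduces to a ratio built from means, variances, and covariance with the C1/C2 stabilizers. A windowless sketch in plain Paddle, illustrative only and not the deleted layer itself:

    import paddle

    def global_ssim(x, y, c1=0.01 ** 2, c2=0.03 ** 2):
        # SSIM over whole tensors: no sliding window, one global statistic.
        mu_x, mu_y = x.mean(), y.mean()
        var_x = ((x - mu_x) ** 2).mean()
        var_y = ((y - mu_y) ** 2).mean()
        cov_xy = ((x - mu_x) * (y - mu_y)).mean()
        num = (2 * mu_x * mu_y + c1) * (2 * cov_xy + c2)
        den = (mu_x ** 2 + mu_y ** 2 + c1) * (var_x + var_y + c2)
        return num / den

    x = paddle.rand([1, 8, 16, 16])
    print(global_ssim(x, x))  # ~1.0 for identical inputs
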
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register, create, load_config -from ppdet.utils.checkpoint import load_pretrain_weight -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'DistillModel', - 'FGDDistillModel', - 'CWDDistillModel', - 'LDDistillModel', - 'PPYOLOEDistillModel', -] - - -@register -class DistillModel(nn.Layer): - """ - Build common distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. - """ - - def __init__(self, cfg, slim_cfg): - super(DistillModel, self).__init__() - self.arch = cfg.architecture - - self.stu_cfg = cfg - self.student_model = create(self.stu_cfg.architecture) - if 'pretrain_weights' in self.stu_cfg and self.stu_cfg.pretrain_weights: - stu_pretrain = self.stu_cfg.pretrain_weights - else: - stu_pretrain = None - - slim_cfg = load_config(slim_cfg) - self.tea_cfg = slim_cfg - self.teacher_model = create(self.tea_cfg.architecture) - if 'pretrain_weights' in self.tea_cfg and self.tea_cfg.pretrain_weights: - tea_pretrain = self.tea_cfg.pretrain_weights - else: - tea_pretrain = None - self.distill_cfg = slim_cfg - - # load pretrain weights - self.is_inherit = False - if stu_pretrain: - if self.is_inherit and tea_pretrain: - load_pretrain_weight(self.student_model, tea_pretrain) - logger.debug( - "Inheriting! loading teacher weights to student model!") - load_pretrain_weight(self.student_model, stu_pretrain) - logger.info("Student model has loaded pretrain weights!") - if tea_pretrain: - load_pretrain_weight(self.teacher_model, tea_pretrain) - logger.info("Teacher model has loaded pretrain weights!") - - self.teacher_model.eval() - for param in self.teacher_model.parameters(): - param.trainable = False - - self.distill_loss = self.build_loss(self.distill_cfg) - - def build_loss(self, distill_cfg): - if 'distill_loss' in distill_cfg and distill_cfg.distill_loss: - return create(distill_cfg.distill_loss) - else: - return None - - def parameters(self): - return self.student_model.parameters() - - def forward(self, inputs): - if self.training: - student_loss = self.student_model(inputs) - with paddle.no_grad(): - teacher_loss = self.teacher_model(inputs) - - loss = self.distill_loss(self.teacher_model, self.student_model) - student_loss['distill_loss'] = loss - student_loss['teacher_loss'] = teacher_loss['loss'] - student_loss['loss'] += student_loss['distill_loss'] - return student_loss - else: - return self.student_model(inputs) - - -@register -class FGDDistillModel(DistillModel): - """ - Build FGD distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. 
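
DistillModel above captures the standard wrapper: freeze the teacher, run both networks on the same batch, and add a distillation term to the student loss while the optimizer only sees student parameters. A minimal sketch of that pattern with toy modules and a stand-in loss (both hypothetical):

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    class ToyDistill(nn.Layer):
        def __init__(self, student, teacher):
            super().__init__()
            self.student, self.teacher = student, teacher
            self.teacher.eval()
            for p in self.teacher.parameters():   # freeze teacher weights
                p.trainable = False

        def parameters(self):                     # optimizer sees only the student
            return self.student.parameters()

        def forward(self, x):
            s_out = self.student(x)
            with paddle.no_grad():
                t_out = self.teacher(x)
            return F.mse_loss(s_out, t_out)       # stand-in distillation loss

    student, teacher = nn.Linear(4, 2), nn.Linear(4, 2)
    loss = ToyDistill(student, teacher)(paddle.rand([8, 4]))
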
- """ - - def __init__(self, cfg, slim_cfg): - super(FGDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['RetinaNet', 'PicoDet' - ], 'Unsupported arch: {}'.format(self.arch) - self.is_inherit = True - - def build_loss(self, distill_cfg): - assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name - assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss - loss_func = dict() - name_list = distill_cfg.distill_loss_name - for name in name_list: - loss_func[name] = create(distill_cfg.distill_loss) - return loss_func - - def forward(self, inputs): - if self.training: - s_body_feats = self.student_model.backbone(inputs) - s_neck_feats = self.student_model.neck(s_body_feats) - with paddle.no_grad(): - t_body_feats = self.teacher_model.backbone(inputs) - t_neck_feats = self.teacher_model.neck(t_body_feats) - - loss_dict = {} - for idx, k in enumerate(self.distill_loss): - loss_dict[k] = self.distill_loss[k](s_neck_feats[idx], - t_neck_feats[idx], inputs) - if self.arch == "RetinaNet": - loss = self.student_model.head(s_neck_feats, inputs) - elif self.arch == "PicoDet": - head_outs = self.student_model.head( - s_neck_feats, self.student_model.export_post_process) - loss_gfl = self.student_model.head.get_loss(head_outs, inputs) - total_loss = paddle.add_n(list(loss_gfl.values())) - loss = {} - loss.update(loss_gfl) - loss.update({'loss': total_loss}) - else: - raise ValueError(f"Unsupported model {self.arch}") - - for k in loss_dict: - loss['loss'] += loss_dict[k] - loss[k] = loss_dict[k] - return loss - else: - body_feats = self.student_model.backbone(inputs) - neck_feats = self.student_model.neck(body_feats) - head_outs = self.student_model.head(neck_feats) - if self.arch == "RetinaNet": - bbox, bbox_num = self.student_model.head.post_process( - head_outs, inputs['im_shape'], inputs['scale_factor']) - return {'bbox': bbox, 'bbox_num': bbox_num} - elif self.arch == "PicoDet": - head_outs = self.student_model.head( - neck_feats, self.student_model.export_post_process) - scale_factor = inputs['scale_factor'] - bboxes, bbox_num = self.student_model.head.post_process( - head_outs, - scale_factor, - export_nms=self.student_model.export_nms) - return {'bbox': bboxes, 'bbox_num': bbox_num} - else: - raise ValueError(f"Unsupported model {self.arch}") - - -@register -class CWDDistillModel(DistillModel): - """ - Build CWD distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. 
- """ - - def __init__(self, cfg, slim_cfg): - super(CWDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['GFL', 'RetinaNet'], 'Unsupported arch: {}'.format( - self.arch) - - def build_loss(self, distill_cfg): - assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name - assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss - loss_func = dict() - name_list = distill_cfg.distill_loss_name - for name in name_list: - loss_func[name] = create(distill_cfg.distill_loss) - return loss_func - - def get_loss_retinanet(self, stu_fea_list, tea_fea_list, inputs): - loss = self.student_model.head(stu_fea_list, inputs) - loss_dict = {} - for idx, k in enumerate(self.distill_loss): - loss_dict[k] = self.distill_loss[k](stu_fea_list[idx], - tea_fea_list[idx]) - - loss['loss'] += loss_dict[k] - loss[k] = loss_dict[k] - return loss - - def get_loss_gfl(self, stu_fea_list, tea_fea_list, inputs): - loss = {} - head_outs = self.student_model.head(stu_fea_list) - loss_gfl = self.student_model.head.get_loss(head_outs, inputs) - loss.update(loss_gfl) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - - feat_loss = {} - loss_dict = {} - s_cls_feat, t_cls_feat = [], [] - for s_neck_f, t_neck_f in zip(stu_fea_list, tea_fea_list): - conv_cls_feat, _ = self.student_model.head.conv_feat(s_neck_f) - cls_score = self.student_model.head.gfl_head_cls(conv_cls_feat) - t_conv_cls_feat, _ = self.teacher_model.head.conv_feat(t_neck_f) - t_cls_score = self.teacher_model.head.gfl_head_cls(t_conv_cls_feat) - s_cls_feat.append(cls_score) - t_cls_feat.append(t_cls_score) - - for idx, k in enumerate(self.distill_loss): - loss_dict[k] = self.distill_loss[k](s_cls_feat[idx], - t_cls_feat[idx]) - feat_loss[f"neck_f_{idx}"] = self.distill_loss[k](stu_fea_list[idx], - tea_fea_list[idx]) - - for k in feat_loss: - loss['loss'] += feat_loss[k] - loss[k] = feat_loss[k] - - for k in loss_dict: - loss['loss'] += loss_dict[k] - loss[k] = loss_dict[k] - return loss - - def forward(self, inputs): - if self.training: - s_body_feats = self.student_model.backbone(inputs) - s_neck_feats = self.student_model.neck(s_body_feats) - with paddle.no_grad(): - t_body_feats = self.teacher_model.backbone(inputs) - t_neck_feats = self.teacher_model.neck(t_body_feats) - - if self.arch == "RetinaNet": - loss = self.get_loss_retinanet(s_neck_feats, t_neck_feats, - inputs) - elif self.arch == "GFL": - loss = self.get_loss_gfl(s_neck_feats, t_neck_feats, inputs) - else: - raise ValueError(f"unsupported arch {self.arch}") - return loss - else: - body_feats = self.student_model.backbone(inputs) - neck_feats = self.student_model.neck(body_feats) - head_outs = self.student_model.head(neck_feats) - if self.arch == "RetinaNet": - bbox, bbox_num = self.student_model.head.post_process( - head_outs, inputs['im_shape'], inputs['scale_factor']) - return {'bbox': bbox, 'bbox_num': bbox_num} - elif self.arch == "GFL": - bbox_pred, bbox_num = head_outs - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - else: - raise ValueError(f"unsupported arch {self.arch}") - - -@register -class LDDistillModel(DistillModel): - """ - Build LD distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. 
- """ - - def __init__(self, cfg, slim_cfg): - super(LDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['GFL'], 'Unsupported arch: {}'.format(self.arch) - - def forward(self, inputs): - if self.training: - s_body_feats = self.student_model.backbone(inputs) - s_neck_feats = self.student_model.neck(s_body_feats) - s_head_outs = self.student_model.head(s_neck_feats) - with paddle.no_grad(): - t_body_feats = self.teacher_model.backbone(inputs) - t_neck_feats = self.teacher_model.neck(t_body_feats) - t_head_outs = self.teacher_model.head(t_neck_feats) - - soft_label_list = t_head_outs[0] - soft_targets_list = t_head_outs[1] - student_loss = self.student_model.head.get_loss( - s_head_outs, inputs, soft_label_list, soft_targets_list) - total_loss = paddle.add_n(list(student_loss.values())) - student_loss['loss'] = total_loss - return student_loss - else: - return self.student_model(inputs) - - -@register -class PPYOLOEDistillModel(DistillModel): - """ - Build PPYOLOE distill model, only used in PPYOLOE - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. - """ - - def __init__(self, cfg, slim_cfg): - super(PPYOLOEDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['PPYOLOE'], 'Unsupported arch: {}'.format( - self.arch) - - def forward(self, inputs, alpha=0.125): - if self.training: - with paddle.no_grad(): - teacher_loss = self.teacher_model(inputs) - if hasattr(self.teacher_model.yolo_head, "assigned_labels"): - self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores = \ - self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores - delattr(self.teacher_model.yolo_head, "assigned_labels") - delattr(self.teacher_model.yolo_head, "assigned_bboxes") - delattr(self.teacher_model.yolo_head, "assigned_scores") - student_loss = self.student_model(inputs) - - logits_loss, feat_loss = self.distill_loss(self.teacher_model, - self.student_model) - det_total_loss = student_loss['loss'] - total_loss = alpha * (det_total_loss + logits_loss + feat_loss) - student_loss['loss'] = total_loss - student_loss['det_loss'] = det_total_loss - student_loss['logits_loss'] = logits_loss - student_loss['feat_loss'] = feat_loss - return student_loss - else: - return self.student_model(inputs) diff --git a/pdfdet/models/Paddle/ppdet/slim/ofa.py b/pdfdet/models/Paddle/ppdet/slim/ofa.py deleted file mode 100644 index b75edac..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/ofa.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import load_config, merge_config, create -from ppdet.utils.checkpoint import load_weight, load_pretrain_weight -from ppdet.utils.logger import setup_logger -from ppdet.core.workspace import register, serializable - -from paddle.utils import try_import - -logger = setup_logger(__name__) - - -@register -@serializable -class OFA(object): - def __init__(self, ofa_config): - super(OFA, self).__init__() - self.ofa_config = ofa_config - - def __call__(self, model, param_state_dict): - - paddleslim = try_import('paddleslim') - from paddleslim.nas.ofa import OFA, RunConfig, utils - from paddleslim.nas.ofa.convert_super import Convert, supernet - task = self.ofa_config['task'] - 
expand_ratio = self.ofa_config['expand_ratio'] - - skip_neck = self.ofa_config['skip_neck'] - skip_head = self.ofa_config['skip_head'] - - run_config = self.ofa_config['RunConfig'] - if 'skip_layers' in run_config: - skip_layers = run_config['skip_layers'] - else: - skip_layers = [] - - # supernet config - sp_config = supernet(expand_ratio=expand_ratio) - # convert to supernet - model = Convert(sp_config).convert(model) - - skip_names = [] - if skip_neck: - skip_names.append('neck.') - if skip_head: - skip_names.append('head.') - - for name, sublayer in model.named_sublayers(): - for n in skip_names: - if n in name: - skip_layers.append(name) - - run_config['skip_layers'] = skip_layers - run_config = RunConfig(**run_config) - - # build ofa model - ofa_model = OFA(model, run_config=run_config) - - ofa_model.set_epoch(0) - ofa_model.set_task(task) - - input_spec = [{ - "image": paddle.ones( - shape=[1, 3, 640, 640], dtype='float32'), - "im_shape": paddle.full( - [1, 2], 640, dtype='float32'), - "scale_factor": paddle.ones( - shape=[1, 2], dtype='float32') - }] - - ofa_model._clear_search_space(input_spec=input_spec) - ofa_model._build_ss = True - check_ss = ofa_model._sample_config('expand_ratio', phase=None) - # tokenize the search space - ofa_model.tokenize() - # check token map, search cands and search space - logger.info('Token map is {}'.format(ofa_model.token_map)) - logger.info('Search candidates is {}'.format(ofa_model.search_cands)) - logger.info('The length of search_space is {}, search_space is {}'. - format(len(ofa_model._ofa_layers), ofa_model._ofa_layers)) - # set model state dict into ofa model - utils.set_state_dict(ofa_model.model, param_state_dict) - return ofa_model diff --git a/pdfdet/models/Paddle/ppdet/slim/prune.py b/pdfdet/models/Paddle/ppdet/slim/prune.py deleted file mode 100644 index 28ffb75..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/prune.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
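
In outline, the OFA hook above converts a trained network into a PaddleSlim supernet whose channel widths can shrink, then wraps it for progressive-shrinking search. A minimal sketch of that call sequence on a toy network (requires paddleslim; RunConfig fields beyond skip_layers are left at defaults and may vary by version):

    import paddle.nn as nn
    from paddleslim.nas.ofa import OFA, RunConfig
    from paddleslim.nas.ofa.convert_super import Convert, supernet

    # Toy conv stack standing in for a detector backbone.
    model = nn.Sequential(
        nn.Conv2D(3, 16, 3, padding=1), nn.ReLU(),
        nn.Conv2D(16, 32, 3, padding=1))

    # Make channel counts elastic by these ratios, then wrap as an OFA supernet.
    sp_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
    super_model = Convert(sp_config).convert(model)
    ofa_model = OFA(super_model, run_config=RunConfig(skip_layers=[]))
    ofa_model.set_epoch(0)
    ofa_model.set_task('expand_ratio')
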
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from paddle.utils import try_import - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -def print_prune_params(model): - model_dict = model.state_dict() - for key in model_dict.keys(): - weight_name = model_dict[key].name - logger.info('Parameter name: {}, shape: {}'.format( - weight_name, model_dict[key].shape)) - - -@register -@serializable -class Pruner(object): - def __init__(self, - criterion, - pruned_params, - pruned_ratios, - print_params=False): - super(Pruner, self).__init__() - assert criterion in ['l1_norm', 'fpgm'], \ - "unsupported prune criterion: {}".format(criterion) - self.criterion = criterion - self.pruned_params = pruned_params - self.pruned_ratios = pruned_ratios - self.print_params = print_params - - def __call__(self, model): - # FIXME: adapt to network graph when Training and inference are - # inconsistent, now only supports prune inference network graph. - model.eval() - paddleslim = try_import('paddleslim') - from paddleslim.analysis import dygraph_flops as flops - input_spec = [{ - "image": paddle.ones( - shape=[1, 3, 640, 640], dtype='float32'), - "im_shape": paddle.full( - [1, 2], 640, dtype='float32'), - "scale_factor": paddle.ones( - shape=[1, 2], dtype='float32') - }] - if self.print_params: - print_prune_params(model) - - ori_flops = flops(model, input_spec) / (1000**3) - logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) - if self.criterion == 'fpgm': - pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) - elif self.criterion == 'l1_norm': - pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) - - logger.info("pruned params: {}".format(self.pruned_params)) - pruned_ratios = [float(n) for n in self.pruned_ratios] - ratios = {} - for i, param in enumerate(self.pruned_params): - ratios[param] = pruned_ratios[i] - pruner.prune_vars(ratios, [0]) - pruned_flops = flops(model, input_spec) / (1000**3) - logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( - pruned_flops, (ori_flops - pruned_flops) / ori_flops)) - - return model - - -@register -@serializable -class PrunerQAT(object): - def __init__(self, criterion, pruned_params, pruned_ratios, - print_prune_params, quant_config, print_qat_model): - super(PrunerQAT, self).__init__() - assert criterion in ['l1_norm', 'fpgm'], \ - "unsupported prune criterion: {}".format(criterion) - # Pruner hyperparameter - self.criterion = criterion - self.pruned_params = pruned_params - self.pruned_ratios = pruned_ratios - self.print_prune_params = print_prune_params - # QAT hyperparameter - self.quant_config = quant_config - self.print_qat_model = print_qat_model - - def __call__(self, model): - # FIXME: adapt to network graph when Training and inference are - # inconsistent, now only supports prune inference network graph. 
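
The bookkeeping inside Pruner.__call__ is straightforward: pair each parameter name with its ratio, prune along the output-channel axis, and report the relative FLOPs saved. A dependency-free sketch of that arithmetic (parameter names and FLOPs figures hypothetical):

    pruned_params = ['conv2.weights', 'conv3.weights']   # hypothetical names
    pruned_ratios = ['0.3', '0.5']                       # YAML often yields strings

    ratios = {p: float(r) for p, r in zip(pruned_params, pruned_ratios)}
    # e.g. pruner.prune_vars(ratios, [0]) prunes axis 0 (output channels)

    ori_gflops, pruned_gflops = 41.8, 27.6               # e.g. from dygraph_flops
    print('pruned ratio: {:.3f}'.format(
        (ori_gflops - pruned_gflops) / ori_gflops))
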
- model.eval() - paddleslim = try_import('paddleslim') - from paddleslim.analysis import dygraph_flops as flops - input_spec = [{ - "image": paddle.ones( - shape=[1, 3, 640, 640], dtype='float32'), - "im_shape": paddle.full( - [1, 2], 640, dtype='float32'), - "scale_factor": paddle.ones( - shape=[1, 2], dtype='float32') - }] - if self.print_prune_params: - print_prune_params(model) - - ori_flops = flops(model, input_spec) / 1000 - logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) - if self.criterion == 'fpgm': - pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) - elif self.criterion == 'l1_norm': - pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) - - logger.info("pruned params: {}".format(self.pruned_params)) - pruned_ratios = [float(n) for n in self.pruned_ratios] - ratios = {} - for i, param in enumerate(self.pruned_params): - ratios[param] = pruned_ratios[i] - pruner.prune_vars(ratios, [0]) - pruned_flops = flops(model, input_spec) / 1000 - logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( - pruned_flops, (ori_flops - pruned_flops) / ori_flops)) - - self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) - - self.quanter.quantize(model) - - if self.print_qat_model: - logger.info("Quantized model:") - logger.info(model) - - return model - - def save_quantized_model(self, layer, path, input_spec=None, **config): - self.quanter.save_quantized_model( - model=layer, path=path, input_spec=input_spec, **config) diff --git a/pdfdet/models/Paddle/ppdet/slim/quant.py b/pdfdet/models/Paddle/ppdet/slim/quant.py deleted file mode 100644 index 4450819..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/quant.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from paddle.utils import try_import - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class QAT(object): - def __init__(self, quant_config, print_model): - super(QAT, self).__init__() - self.quant_config = quant_config - self.print_model = print_model - - def __call__(self, model): - paddleslim = try_import('paddleslim') - self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) - if self.print_model: - logger.info("Model before quant:") - logger.info(model) - - # For PP-YOLOE, convert model to deploy firstly. 
- for layer in model.sublayers(): - if hasattr(layer, 'convert_to_deploy'): - layer.convert_to_deploy() - - self.quanter.quantize(model) - - if self.print_model: - logger.info("Quantized model:") - logger.info(model) - - return model - - def save_quantized_model(self, layer, path, input_spec=None, **config): - self.quanter.save_quantized_model( - model=layer, path=path, input_spec=input_spec, **config) - - -@register -@serializable -class PTQ(object): - def __init__(self, - ptq_config, - quant_batch_num=10, - output_dir='output_inference', - fuse=True, - fuse_list=None): - super(PTQ, self).__init__() - self.ptq_config = ptq_config - self.quant_batch_num = quant_batch_num - self.output_dir = output_dir - self.fuse = fuse - self.fuse_list = fuse_list - - def __call__(self, model): - paddleslim = try_import('paddleslim') - self.ptq = paddleslim.PTQ(**self.ptq_config) - model.eval() - quant_model = self.ptq.quantize( - model, fuse=self.fuse, fuse_list=self.fuse_list) - - return quant_model - - def save_quantized_model(self, - quant_model, - quantize_model_path, - input_spec=None): - self.ptq.save_quantized_model(quant_model, quantize_model_path, - input_spec) diff --git a/pdfdet/models/Paddle/ppdet/slim/unstructured_prune.py b/pdfdet/models/Paddle/ppdet/slim/unstructured_prune.py deleted file mode 100644 index 1dc876a..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/unstructured_prune.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
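
For orientation, the QAT wrapper above delegates to PaddleSlim's dygraph quantizer: build it from a config dict, call quantize() to insert fake-quant ops, fine-tune, then export. A minimal sketch (the config keys are common PaddleSlim options and may need adjusting per version):

    import paddle.nn as nn
    from paddleslim.dygraph.quant import QAT as SlimQAT

    quant_config = {
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
    }

    model = nn.Sequential(nn.Conv2D(3, 8, 3), nn.ReLU())
    quanter = SlimQAT(config=quant_config)
    quanter.quantize(model)   # rewrites the model in place with fake-quant ops
    # ... run quantization-aware fine-tuning, then:
    # quanter.save_quantized_model(model, 'output/qat_model', input_spec=[...])
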
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from paddle.utils import try_import - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class UnstructuredPruner(object): - def __init__(self, - stable_epochs, - pruning_epochs, - tunning_epochs, - pruning_steps, - ratio, - initial_ratio, - prune_params_type=None): - self.stable_epochs = stable_epochs - self.pruning_epochs = pruning_epochs - self.tunning_epochs = tunning_epochs - self.ratio = ratio - self.prune_params_type = prune_params_type - self.initial_ratio = initial_ratio - self.pruning_steps = pruning_steps - - def __call__(self, model, steps_per_epoch, skip_params_func=None): - paddleslim = try_import('paddleslim') - from paddleslim import GMPUnstructuredPruner - configs = { - 'pruning_strategy': 'gmp', - 'stable_iterations': self.stable_epochs * steps_per_epoch, - 'pruning_iterations': self.pruning_epochs * steps_per_epoch, - 'tunning_iterations': self.tunning_epochs * steps_per_epoch, - 'resume_iteration': 0, - 'pruning_steps': self.pruning_steps, - 'initial_ratio': self.initial_ratio, - } - - pruner = GMPUnstructuredPruner( - model, - ratio=self.ratio, - skip_params_func=skip_params_func, - prune_params_type=self.prune_params_type, - local_sparsity=True, - configs=configs) - - return pruner diff --git a/pdfdet/models/Paddle/ppdet/utils/__init__.py b/pdfdet/models/Paddle/ppdet/utils/__init__.py deleted file mode 100644 index d0c32e2..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
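
The GMP configuration above is essentially unit conversion: the YAML speaks in epochs, the pruner in iterations, so each phase length is multiplied by steps_per_epoch. A sketch of that arithmetic with hypothetical settings:

    steps_per_epoch = 500                    # hypothetical
    stable_epochs, pruning_epochs, tunning_epochs = 1, 8, 3

    configs = {
        'pruning_strategy': 'gmp',
        'stable_iterations': stable_epochs * steps_per_epoch,    # train dense
        'pruning_iterations': pruning_epochs * steps_per_epoch,  # ramp sparsity
        'tunning_iterations': tunning_epochs * steps_per_epoch,  # fixed-mask tune
        'resume_iteration': 0,
        'pruning_steps': 100,      # how many times the ratio is raised
        'initial_ratio': 0.15,     # starting sparsity on the ramp
    }
    print(configs['stable_iterations'], configs['pruning_iterations'])
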
diff --git a/pdfdet/models/Paddle/ppdet/utils/cam_utils.py b/pdfdet/models/Paddle/ppdet/utils/cam_utils.py deleted file mode 100644 index d2f7a47..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/cam_utils.py +++ /dev/null @@ -1,343 +0,0 @@ -import numpy as np -import cv2 -import os -import sys -import glob -from ppdet.utils.logger import setup_logger -import copy -logger = setup_logger('ppdet_cam') - -import paddle -from ppdet.engine import Trainer - - -def get_test_images(infer_dir, infer_img): - """ - Get image path list in TEST mode - """ - assert infer_img is not None or infer_dir is not None, \ - "--infer_img or --infer_dir should be set" - assert infer_img is None or os.path.isfile(infer_img), \ - "{} is not a file".format(infer_img) - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - - # infer_img has a higher priority - if infer_img and os.path.isfile(infer_img): - return [infer_img] - - images = set() - infer_dir = os.path.abspath(infer_dir) - assert os.path.isdir(infer_dir), \ - "infer_dir {} is not a directory".format(infer_dir) - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - - assert len(images) > 0, "no image found in {}".format(infer_dir) - logger.info("Found {} inference images in total.".format(len(images))) - - return images - - -def compute_ious(boxes1, boxes2): - """[Compute pairwise IOU matrix for given two sets of boxes] - - Args: - boxes1 ([numpy ndarray with shape N,4]): [representing bounding boxes with format (xmin,ymin,xmax,ymax)] - boxes2 ([numpy ndarray with shape M,4]): [representing bounding boxes with format (xmin,ymin,xmax,ymax)] - Returns: - pairwise IOU maxtrix with shape (N,M),where the value at ith row jth column hold the iou between ith - box and jth box from box1 and box2 respectively. - """ - lu = np.maximum( - boxes1[:, None, :2], boxes2[:, :2] - ) # lu with shape N,M,2 ; boxes1[:,None,:2] with shape (N,1,2) boxes2 with shape(M,2) - rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:]) # rd same to lu - intersection_wh = np.maximum(0.0, rd - lu) - intersection_area = intersection_wh[:, :, - 0] * intersection_wh[:, :, - 1] # with shape (N,M) - boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2]) - boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1] # with shape (N,) - boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2]) - boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1] # with shape (M,) - union_area = np.maximum( - boxes1_area[:, None] + boxes2_area - intersection_area, - 1e-8) # with shape (N,M) - ious = np.clip(intersection_area / union_area, 0.0, 1.0) - return ious - - -def grad_cam(feat, grad): - """ - - Args: - feat: CxHxW - grad: CxHxW - - Returns: - cam: HxW - """ - exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0) - exp = np.maximum(-exp, 0) - return exp - - -def resize_cam(explanation, resize_shape) -> np.ndarray: - """ - - Args: - explanation: (width, height) - resize_shape: (width, height) - - Returns: - - """ - assert len(explanation.shape) == 2, f"{explanation.shape}. " \ - f"Currently support 2D explanation results for visualization. " \ - "Reduce higher dimensions to 2D for visualization." 
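
compute_ious above gets its (N, M) pairwise matrix from NumPy broadcasting: inserting an axis with boxes1[:, None, :2] lines every box in the first set up against every box in the second. A self-contained check of the same trick:

    import numpy as np

    boxes1 = np.array([[0., 0., 10., 10.]])          # one box
    boxes2 = np.array([[0., 0., 10., 10.],
                       [5., 5., 15., 15.]])          # identical + half-overlapping

    lu = np.maximum(boxes1[:, None, :2], boxes2[:, :2])   # (N, M, 2) upper-left
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])   # (N, M, 2) lower-right
    wh = np.maximum(0.0, rd - lu)
    inter = wh[..., 0] * wh[..., 1]
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    ious = inter / (area1[:, None] + area2 - inter)
    print(ious)   # [[1.0, 25/175 ~= 0.143]]
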
- - explanation = (explanation - explanation.min()) / ( - explanation.max() - explanation.min()) - - explanation = cv2.resize(explanation, resize_shape) - explanation = np.uint8(255 * explanation) - explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET) - explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB) - - return explanation - - -class BBoxCAM: - def __init__(self, FLAGS, cfg): - self.FLAGS = FLAGS - self.cfg = cfg - # build model - self.trainer = self.build_trainer(cfg) - # num_class - self.num_class = cfg.num_classes - # set hook for extraction of featuremaps and grads - self.set_hook(cfg) - self.nms_idx_need_divid_numclass_arch = ['FasterRCNN', 'MaskRCNN', 'CascadeRCNN'] - """ - In these networks, the bbox array shape before nms contain num_class, - the nms_keep_idx of the bbox need to divide the num_class; - """ - - # cam image output_dir - try: - os.makedirs(FLAGS.cam_out) - except: - print('Path already exists.') - pass - - def build_trainer(self, cfg): - # build trainer - trainer = Trainer(cfg, mode='test') - # load weights - trainer.load_weights(cfg.weights) - - # set for get extra_data before nms - trainer.model.use_extra_data=True - # set for record the bbox index before nms - if cfg.architecture in ['FasterRCNN', 'MaskRCNN']: - trainer.model.bbox_post_process.nms.return_index = True - elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']: - if trainer.model.post_process is not None: - # anchor based YOLOs: YOLOv3,PP-YOLO - trainer.model.post_process.nms.return_index = True - else: - # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ - trainer.model.yolo_head.nms.return_index = True - elif cfg.architecture=='BlazeFace' or cfg.architecture=='SSD': - trainer.model.post_process.nms.return_index = True - elif cfg.architecture=='RetinaNet': - trainer.model.head.nms.return_index = True - else: - print( - cfg.architecture+' is not supported for cam temporarily!' - ) - sys.exit() - # Todo: Unify the head/post_process name in each model - - return trainer - - def set_hook(self, cfg): - # set hook for extraction of featuremaps and grads - self.target_feats = {} - self.target_layer_name = cfg.target_feature_layer_name - # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor - - def hook(layer, input, output): - self.target_feats[layer._layer_name_for_hook] = output - - try: - exec('self.trainer.'+self.target_layer_name+'._layer_name_for_hook = self.target_layer_name') - # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name - exec('self.trainer.'+self.target_layer_name+'.register_forward_post_hook(hook)') - # self.trainer.target_layer_name.register_forward_post_hook(hook) - except: - print("Error! " - "The target_layer_name--"+self.target_layer_name+" is not in model! 
" - "Please check the spelling and " - "the network's architecture!") - sys.exit() - - def get_bboxes(self): - # get inference images - images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img) - - # inference - result = self.trainer.predict( - images, - draw_threshold=self.FLAGS.draw_threshold, - output_dir=self.FLAGS.output_dir, - save_results=self.FLAGS.save_results, - visualize=False)[0] - return result - - def get_bboxes_cams(self): - # Get the bboxes prediction(after nms result) of the input - inference_result = self.get_bboxes() - - # read input image - # Todo: Support folder multi-images process - from PIL import Image - img = np.array(Image.open(self.cfg.infer_img)) - - # data for calaulate bbox grad_cam - extra_data = inference_result['extra_data'] - """ - Example of Faster_RCNN based architecture: - extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] - 'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1] - } - Example of YOLOv3 based architecture: - extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400] - 'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1] - } - """ - - # array index of the predicted bbox before nms - if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch: - # some network's bbox array shape before nms may be like [num_of_bboxes_before_nms, num_classes, 4], - # we need to divide num_classes to get the before_nms_index; - # currently, only include the rcnn architectures (fasterrcnn, maskrcnn, cascadercnn); - before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy( - ) // self.num_class # num_class - else : - before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy() - - # Calculate and visualize the heatmap of per predict bbox - for index, target_bbox in enumerate(inference_result['bbox']): - # target_bbox: [cls, score, x1, y1, x2, y2] - # filter bboxes with low predicted scores - if target_bbox[1] < self.FLAGS.draw_threshold: - continue - - target_bbox_before_nms = int(before_nms_indexes[index]) - - if len(extra_data['scores'].shape)==2: - score_out = extra_data['scores'][target_bbox_before_nms] - else: - score_out = extra_data['scores'][0, :, target_bbox_before_nms] - """ - There are two kinds array shape of bbox score output : - 1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] - 2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000] - """ - - - # construct one_hot label and do backward to get the gradients - predicted_label = paddle.argmax(score_out) - label_onehot = paddle.nn.functional.one_hot( - predicted_label, num_classes=len(score_out)) - label_onehot = label_onehot.squeeze() - target = paddle.sum(score_out * label_onehot) - target.backward(retain_graph=True) - - - if 'backbone' in self.target_layer_name or \ - 'neck' in self.target_layer_name: # backbone/neck level feature - if isinstance(self.target_feats[self.target_layer_name], list): - # when the featuremap contains of multiple scales, - # take the featuremap of the last scale - # Todo: fuse the cam result from multisclae featuremaps - if self.target_feats[self.target_layer_name][ - -1].shape[-1]==1: - """ - if the last level featuremap is 1x1 size, - we take the second last one - """ - cam_grad = self.target_feats[self.target_layer_name][ - -2].grad.squeeze().cpu().numpy() - cam_feat = self.target_feats[self.target_layer_name][ - 
-2].squeeze().cpu().numpy() - else: - cam_grad = self.target_feats[self.target_layer_name][ - -1].grad.squeeze().cpu().numpy() - cam_feat = self.target_feats[self.target_layer_name][ - -1].squeeze().cpu().numpy() - else: - cam_grad = self.target_feats[ - self.target_layer_name].grad.squeeze().cpu().numpy() - cam_feat = self.target_feats[ - self.target_layer_name].squeeze().cpu().numpy() - else: # roi level feature - cam_grad = self.target_feats[ - self.target_layer_name].grad.squeeze().cpu().numpy()[target_bbox_before_nms] - cam_feat = self.target_feats[ - self.target_layer_name].squeeze().cpu().numpy()[target_bbox_before_nms] - - # grad_cam: - exp = grad_cam(cam_feat, cam_grad) - - if 'backbone' in self.target_layer_name or \ - 'neck' in self.target_layer_name: - """ - when use backbone/neck featuremap, - we first do the cam on whole image, - and then set the area outside the predic bbox to 0 - """ - # reshape the cam image to the input image size - resized_exp = resize_cam(exp, (img.shape[1], img.shape[0])) - mask = np.zeros((img.shape[0], img.shape[1], 3)) - mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[2]): - int(target_bbox[4]), :] = 1 - resized_exp = resized_exp * mask - # add the bbox cam back to the input image - overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6) - elif 'roi' in self.target_layer_name: - # get the bbox part of the image - bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(target_bbox[5]), - int(target_bbox[2]):int(target_bbox[4]), :]) - # reshape the cam image to the bbox size - resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0])) - # add the bbox cam back to the bbox image - bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6) - # put the bbox_cam image to the original image - overlay_vis = copy.deepcopy(img) - overlay_vis[int(target_bbox[3]):int(target_bbox[5]), - int(target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis - else: - print( - 'Only supported cam for backbone/neck feature and roi feature, the others are not supported temporarily!' - ) - sys.exit() - - # put the bbox rectangle on image - cv2.rectangle( - overlay_vis, (int(target_bbox[2]), int(target_bbox[3])), - (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2) - - # save visualization result - cam_image = Image.fromarray(overlay_vis) - cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg') - - # clear gradients after each bbox grad_cam - target.clear_gradient() - for n, v in self.trainer.model.named_sublayers(): - v.clear_gradients() diff --git a/pdfdet/models/Paddle/ppdet/utils/check.py b/pdfdet/models/Paddle/ppdet/utils/check.py deleted file mode 100644 index 7690ade..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/check.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
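
The grad_cam and resize_cam helpers used in get_bboxes_cams above are defined elsewhere in the tree and are not part of this diff. As a rough sketch of what they compute, assuming the standard Grad-CAM recipe (channel weights from globally average-pooled gradients, a ReLU'd weighted sum, then min-max normalization) — the actual ppdet implementation may differ:

import cv2
import numpy as np

def grad_cam(feat, grad):
    # feat/grad: [C, H, W] arrays taken from target_feats as above
    weights = grad.mean(axis=(1, 2), keepdims=True)    # GAP of gradients -> [C, 1, 1]
    cam = np.maximum((weights * feat).sum(axis=0), 0)  # weighted sum + ReLU -> [H, W]
    return (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)

def resize_cam(cam, target_size):
    # scale to the requested (width, height) and colorize for overlaying
    cam = cv2.resize(cam, target_size)
    return cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
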
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import paddle -import six -import paddle.version as paddle_version - -from .logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version', - 'check_config' -] - - -def check_mlu(use_mlu): - """ - Log error and exit when set use_mlu=true in paddlepaddle - cpu/gpu/xpu/npu version. - """ - err = "Config use_mlu cannot be set as true while you are " \ - "using paddlepaddle cpu/gpu/xpu/npu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-mlu to run model on MLU \n" \ - "\t2. Set use_mlu as false in config file to run " \ - "model on CPU/GPU/XPU/NPU" - - try: - if use_mlu and not paddle.is_compiled_with_mlu(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_npu(use_npu): - """ - Log error and exit when set use_npu=true in paddlepaddle - version without paddle-custom-npu installed. - """ - err = "Config use_npu cannot be set as true while you are " \ - "using paddlepaddle version without paddle-custom-npu " \ - "installed! \nPlease try: \n" \ - "\t1. Install paddle-custom-npu to run model on NPU \n" \ - "\t2. Set use_npu as false in config file to run " \ - "model on other devices supported." - - try: - if use_npu and not 'npu' in paddle.device.get_all_custom_device_type(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_xpu(use_xpu): - """ - Log error and exit when set use_xpu=true in paddlepaddle - cpu/gpu/npu version. - """ - err = "Config use_xpu cannot be set as true while you are " \ - "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ - "\t2. Set use_xpu as false in config file to run " \ - "model on CPU/GPU/NPU" - - try: - if use_xpu and not paddle.is_compiled_with_xpu(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_gpu(use_gpu): - """ - Log error and exit when set use_gpu=true in paddlepaddle - cpu version. - """ - err = "Config use_gpu cannot be set as true while you are " \ - "using paddlepaddle cpu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ - "\t2. Set use_gpu as false in config file to run " \ - "model on CPU" - - try: - if use_gpu and not paddle.is_compiled_with_cuda(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_version(version='2.2'): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version {} or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code.".format(version) - - version_installed = [ - paddle_version.major, paddle_version.minor, paddle_version.patch, - paddle_version.rc - ] - - if version_installed == ['0', '0', '0', '0']: - return - - version_split = version.split('.') - - length = min(len(version_installed), len(version_split)) - for i in six.moves.range(length): - if version_installed[i] > version_split[i]: - return - if version_installed[i] < version_split[i]: - raise Exception(err) - - -def check_config(cfg): - """ - Check the correctness of the configuration file. Log error and exit - when Config is not compliant. - """ - err = "'{}' not specified in config file. Please set it in config file." 
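
check_version above walks the dotted version components left to right, returning as soon as an installed component is strictly greater than the required one and raising when one falls short. One subtlety: it compares the components as strings, which mis-orders multi-digit parts ('10' < '9' lexicographically); a standalone integer-based sketch of the same walk avoids that (names hypothetical):

def version_satisfied(installed, required):
    # compare dotted versions component-wise, left to right
    a = [int(x) for x in installed.split('.')]
    b = [int(x) for x in required.split('.')]
    for i in range(min(len(a), len(b))):
        if a[i] > b[i]:
            return True
        if a[i] < b[i]:
            return False
    return True  # an equal prefix counts as satisfied

assert version_satisfied('2.4.1', '2.2')
assert not version_satisfied('2.1.0', '2.2')
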
- check_list = ['architecture', 'num_classes'] - try: - for var in check_list: - if not var in cfg: - logger.error(err.format(var)) - sys.exit(1) - except Exception as e: - pass - - if 'log_iter' not in cfg: - cfg.log_iter = 20 - - return cfg diff --git a/pdfdet/models/Paddle/ppdet/utils/checkpoint.py b/pdfdet/models/Paddle/ppdet/utils/checkpoint.py deleted file mode 100644 index 8672c98..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/checkpoint.py +++ /dev/null @@ -1,377 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import numpy as np -import paddle -import paddle.nn as nn -from .download import get_weights_path - -from .logger import setup_logger -logger = setup_logger(__name__) - - -def is_url(path): - """ - Whether path is URL. - Args: - path (string): URL string or not. - """ - return path.startswith('http://') \ - or path.startswith('https://') \ - or path.startswith('ppdet://') - - -def _strip_postfix(path): - path, ext = os.path.splitext(path) - assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ - "Unknown postfix {} from weights".format(ext) - return path - - -def load_weight(model, weight, optimizer=None, ema=None, exchange=True): - if is_url(weight): - weight = get_weights_path(weight) - - path = _strip_postfix(weight) - pdparam_path = path + '.pdparams' - if not os.path.exists(pdparam_path): - raise ValueError("Model pretrain path {} does not " - "exists.".format(pdparam_path)) - - if ema is not None and os.path.exists(path + '.pdema'): - if exchange: - # Exchange model and ema_model to load - logger.info('Exchange model and ema_model to load:') - ema_state_dict = paddle.load(pdparam_path) - logger.info('Loading ema_model weights from {}'.format(path + - '.pdparams')) - param_state_dict = paddle.load(path + '.pdema') - logger.info('Loading model weights from {}'.format(path + '.pdema')) - else: - ema_state_dict = paddle.load(path + '.pdema') - logger.info('Loading ema_model weights from {}'.format(path + - '.pdema')) - param_state_dict = paddle.load(pdparam_path) - logger.info('Loading model weights from {}'.format(path + - '.pdparams')) - else: - ema_state_dict = None - param_state_dict = paddle.load(pdparam_path) - - if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): - print('Loading pretrain weights for Teacher-Student framework.') - print('Loading pretrain weights for Student model.') - student_model_dict = model.modelStudent.state_dict() - student_param_state_dict = match_state_dict( - student_model_dict, param_state_dict, mode='student') - model.modelStudent.set_dict(student_param_state_dict) - print('Loading pretrain weights for Teacher model.') - teacher_model_dict = model.modelTeacher.state_dict() - - teacher_param_state_dict = match_state_dict( - teacher_model_dict, param_state_dict, mode='teacher') - 
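
load_weight delegates the key alignment to match_state_dict, defined below, which scores every (model key, weight key) pair and keeps the longest suffix match. In miniature, using the toy keys from its own docstring (purely illustrative):

import numpy as np

model_keys = ['backbone.res2.res2a.branch2a.conv.weight']
weight_keys = ['res2.res2a.branch2a.conv.weight', 'branch2a.conv.weight']

def match(a, b):
    return a == b or a.endswith('.' + b)

scores = np.zeros([len(model_keys), len(weight_keys)])
for i, m in enumerate(model_keys):
    for j, w in enumerate(weight_keys):
        if match(m, w):
            scores[i, j] = len(w)  # prefer the longest matching weight key
best = scores.argmax(1)  # -> [0]: the longer 'res2...' key wins
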
model.modelTeacher.set_dict(teacher_param_state_dict) - - else: - model_dict = model.state_dict() - model_weight = {} - incorrect_keys = 0 - for key in model_dict.keys(): - if key in param_state_dict.keys(): - model_weight[key] = param_state_dict[key] - else: - logger.info('Unmatched key: {}'.format(key)) - incorrect_keys += 1 - assert incorrect_keys == 0, "Load weight {} incorrectly, \ - {} keys unmatched, please check again.".format(weight, - incorrect_keys) - logger.info('Finish resuming model weights: {}'.format(pdparam_path)) - model.set_dict(model_weight) - - last_epoch = 0 - if optimizer is not None and os.path.exists(path + '.pdopt'): - optim_state_dict = paddle.load(path + '.pdopt') - # to solve resume bug, will it be fixed in paddle 2.0 - for key in optimizer.state_dict().keys(): - if not key in optim_state_dict.keys(): - optim_state_dict[key] = optimizer.state_dict()[key] - if 'last_epoch' in optim_state_dict: - last_epoch = optim_state_dict.pop('last_epoch') - optimizer.set_state_dict(optim_state_dict) - - if ema_state_dict is not None: - ema.resume(ema_state_dict, - optim_state_dict['LR_Scheduler']['last_epoch']) - elif ema_state_dict is not None: - ema.resume(ema_state_dict) - return last_epoch - - -def match_state_dict(model_state_dict, weight_state_dict, mode='default'): - """ - Match between the model state dict and pretrained weight state dict. - Return the matched state dict. - - The method supposes that all the names in pretrained weight state dict are - subclass of the names in models`, if the prefix 'backbone.' in pretrained weight - keys is stripped. And we could get the candidates for each model key. Then we - select the name with the longest matched size as the final match result. For - example, the model state dict has the name of - 'backbone.res2.res2a.branch2a.conv.weight' and the pretrained weight as - name of 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight'. We - match the 'res2.res2a.branch2a.conv.weight' to the model key. - """ - - model_keys = sorted(model_state_dict.keys()) - weight_keys = sorted(weight_state_dict.keys()) - - def teacher_match(a, b): - # skip student params - if b.startswith('modelStudent'): - return False - return a == b or a.endswith("." + b) or b.endswith("." + a) - - def student_match(a, b): - # skip teacher params - if b.startswith('modelTeacher'): - return False - return a == b or a.endswith("." + b) or b.endswith("." + a) - - def match(a, b): - if b.startswith('backbone.res5'): - b = b[9:] - return a == b or a.endswith("." 
+ b)
-
-    if mode == 'student':
-        match_op = student_match
-    elif mode == 'teacher':
-        match_op = teacher_match
-    else:
-        match_op = match
-
-    match_matrix = np.zeros([len(model_keys), len(weight_keys)])
-    for i, m_k in enumerate(model_keys):
-        for j, w_k in enumerate(weight_keys):
-            if match_op(m_k, w_k):
-                match_matrix[i, j] = len(w_k)
-    max_id = match_matrix.argmax(1)
-    max_len = match_matrix.max(1)
-    max_id[max_len == 0] = -1
-    load_id = set(max_id)
-    load_id.discard(-1)
-    not_load_weight_name = []
-    if weight_keys[0].startswith('modelStudent') or weight_keys[0].startswith(
-            'modelTeacher'):
-        for match_idx in range(len(max_id)):
-            if max_id[match_idx] == -1:
-                not_load_weight_name.append(model_keys[match_idx])
-        if len(not_load_weight_name) > 0:
-            logger.info('{} in model is not matched with pretrained weights, '
-                        'and it will be trained from scratch'.format(
-                            not_load_weight_name))
-
-    else:
-        for idx in range(len(weight_keys)):
-            if idx not in load_id:
-                not_load_weight_name.append(weight_keys[idx])
-
-        if len(not_load_weight_name) > 0:
-            logger.info('{} in pretrained weight is not used in the model, '
-                        'and it will not be loaded'.format(
-                            not_load_weight_name))
-    matched_keys = {}
-    result_state_dict = {}
-    for model_id, weight_id in enumerate(max_id):
-        if weight_id == -1:
-            continue
-        model_key = model_keys[model_id]
-        weight_key = weight_keys[weight_id]
-        weight_value = weight_state_dict[weight_key]
-        model_value_shape = list(model_state_dict[model_key].shape)
-
-        if list(weight_value.shape) != model_value_shape:
-            logger.info(
-                'The shape {} in pretrained weight {} is unmatched with '
-                'the shape {} in model {}. And the weight {} will not be '
-                'loaded'.format(weight_value.shape, weight_key,
-                                model_value_shape, model_key, weight_key))
-            continue
-
-        assert model_key not in result_state_dict
-        result_state_dict[model_key] = weight_value
-        if weight_key in matched_keys:
-            raise ValueError('Ambiguous weight {} loaded, it matches at least '
-                             '{} and {} in the model'.format(
-                                 weight_key, model_key,
-                                 matched_keys[weight_key]))
-        matched_keys[weight_key] = model_key
-    return result_state_dict
-
-
-def load_pretrain_weight(model, pretrain_weight, ARSL_eval=False):
-    if is_url(pretrain_weight):
-        pretrain_weight = get_weights_path(pretrain_weight)
-
-    path = _strip_postfix(pretrain_weight)
-    if not (os.path.isdir(path) or os.path.isfile(path) or
-            os.path.exists(path + '.pdparams')):
-        raise ValueError("Model pretrain path `{}` does not exist. "
-                         "If you don't want to load a pretrained model, "
-                         "please delete the `pretrain_weights` field in "
-                         "the config file.".format(path))
-    teacher_student_flag = False
-    if not ARSL_eval:
-        if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'):
-            print('Loading pretrain weights for Teacher-Student framework.')
-            print(
-                'Assume the Teacher model has the same structure as the Student model.'
- ) - model_dict = model.modelStudent.state_dict() - teacher_student_flag = True - else: - model_dict = model.state_dict() - - weights_path = path + '.pdparams' - param_state_dict = paddle.load(weights_path) - param_state_dict = match_state_dict(model_dict, param_state_dict) - for k, v in param_state_dict.items(): - if isinstance(v, np.ndarray): - v = paddle.to_tensor(v) - if model_dict[k].dtype != v.dtype: - param_state_dict[k] = v.astype(model_dict[k].dtype) - - if teacher_student_flag: - model.modelStudent.set_dict(param_state_dict) - model.modelTeacher.set_dict(param_state_dict) - else: - model.set_dict(param_state_dict) - logger.info('Finish loading model weights: {}'.format(weights_path)) - - else: - weights_path = path + '.pdparams' - param_state_dict = paddle.load(weights_path) - student_model_dict = model.modelStudent.state_dict() - student_param_state_dict = match_state_dict( - student_model_dict, param_state_dict, mode='student') - model.modelStudent.set_dict(student_param_state_dict) - print('Loading pretrain weights for Teacher model.') - teacher_model_dict = model.modelTeacher.state_dict() - - teacher_param_state_dict = match_state_dict( - teacher_model_dict, param_state_dict, mode='teacher') - model.modelTeacher.set_dict(teacher_param_state_dict) - logger.info('Finish loading model weights: {}'.format(weights_path)) - - -def save_model(model, - optimizer, - save_dir, - save_name, - last_epoch, - ema_model=None): - """ - save model into disk. - - Args: - model (dict): the model state_dict to save parameters. - optimizer (paddle.optimizer.Optimizer): the Optimizer instance to - save optimizer states. - save_dir (str): the directory to be saved. - save_name (str): the path to be saved. - last_epoch (int): the epoch index. - ema_model (dict|None): the ema_model state_dict to save parameters. - """ - if paddle.distributed.get_rank() != 0: - return - - save_dir = os.path.normpath(save_dir) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - if save_name == "best_model": - best_model_path = os.path.join(save_dir, 'best_model') - if not os.path.exists(best_model_path): - os.makedirs(best_model_path) - - save_path = os.path.join(save_dir, save_name) - # save model - if isinstance(model, nn.Layer): - paddle.save(model.state_dict(), save_path + ".pdparams") - best_model = model.state_dict() - else: - assert isinstance(model, - dict), 'model is not a instance of nn.layer or dict' - if ema_model is None: - paddle.save(model, save_path + ".pdparams") - best_model = model - else: - assert isinstance(ema_model, - dict), ("ema_model is not a instance of dict, " - "please call model.state_dict() to get.") - # Exchange model and ema_model to save - paddle.save(ema_model, save_path + ".pdparams") - paddle.save(model, save_path + ".pdema") - best_model = ema_model - - if save_name == 'best_model': - best_model_path = os.path.join(best_model_path, 'model') - paddle.save(best_model, best_model_path + ".pdparams") - # save optimizer - state_dict = optimizer.state_dict() - state_dict['last_epoch'] = last_epoch - paddle.save(state_dict, save_path + ".pdopt") - logger.info("Save checkpoint: {}".format(save_dir)) - - -def save_semi_model(teacher_model, student_model, optimizer, save_dir, - save_name, last_epoch, last_iter): - """ - save teacher and student model into disk. - Args: - teacher_model (dict): the teacher_model state_dict to save parameters. - student_model (dict): the student_model state_dict to save parameters. 
- optimizer (paddle.optimizer.Optimizer): the Optimizer instance to
-            save optimizer states.
-        save_dir (str): the directory to be saved.
-        save_name (str): the path to be saved.
-        last_epoch (int): the epoch index.
-        last_iter (int): the iter index.
-    """
-    if paddle.distributed.get_rank() != 0:
-        return
-    assert isinstance(teacher_model, dict), (
-        "teacher_model is not an instance of dict, "
-        "please call teacher_model.state_dict() to get.")
-    assert isinstance(student_model, dict), (
-        "student_model is not an instance of dict, "
-        "please call student_model.state_dict() to get.")
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-    save_path = os.path.join(save_dir, save_name)
-    # save model
-    paddle.save(teacher_model, save_path + str(last_epoch) + "epoch_t.pdparams")
-    paddle.save(student_model, save_path + str(last_epoch) + "epoch_s.pdparams")
-
-    # save optimizer
-    state_dict = optimizer.state_dict()
-    state_dict['last_epoch'] = last_epoch
-    state_dict['last_iter'] = last_iter
-    paddle.save(state_dict, save_path + str(last_epoch) + "epoch.pdopt")
-    logger.info("Save checkpoint: {}".format(save_dir))
diff --git a/pdfdet/models/Paddle/ppdet/utils/cli.py b/pdfdet/models/Paddle/ppdet/utils/cli.py
deleted file mode 100644
index 2c5acc0..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/cli.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from argparse import ArgumentParser, RawDescriptionHelpFormatter
-
-import yaml
-import re
-from ppdet.core.workspace import get_registered_modules, dump_value
-
-__all__ = ['ColorTTY', 'ArgsParser']
-
-
-class ColorTTY(object):
-    def __init__(self):
-        super(ColorTTY, self).__init__()
-        self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan']
-
-    def __getattr__(self, attr):
-        if attr in self.colors:
-            color = self.colors.index(attr) + 31
-
-            def color_message(message):
-                return "\033[{}m{}\033[0m".format(color, message)
-
-            setattr(self, attr, color_message)
-            return color_message
-
-    def bold(self, message):
-        return self.with_code('01', message)
-
-    def with_code(self, code, message):
-        return "\033[{}m{}\033[0m".format(code, message)
-
-
-class ArgsParser(ArgumentParser):
-    def __init__(self):
-        super(ArgsParser, self).__init__(
-            formatter_class=RawDescriptionHelpFormatter)
-        self.add_argument("-c", "--config", help="configuration file to use")
-        self.add_argument(
-            "-o", "--opt", nargs='*', help="set configuration options")
-
-    def parse_args(self, argv=None):
-        args = super(ArgsParser, self).parse_args(argv)
-        assert args.config is not None, \
-            "Please specify --config=configure_file_path."
-        args.opt = self._parse_opt(args.opt)
-        return args
-
-    def _parse_opt(self, opts):
-        config = {}
-        if not opts:
-            return config
-        for s in opts:
-            s = s.strip()
-            k, v = s.split('=', 1)
-            if '.' not in k:
-                config[k] = yaml.load(v, Loader=yaml.Loader)
-            else:
-                keys = k.split('.')
-                if keys[0] not in config:
-                    config[keys[0]] = {}
-                cur = config[keys[0]]
-                for idx, key in enumerate(keys[1:]):
-                    if idx == len(keys) - 2:
-                        cur[key] = yaml.load(v, Loader=yaml.Loader)
-                    else:
-                        cur[key] = {}
-                        cur = cur[key]
-        return config
-
-
-def merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']):
-    for k, v in vars(args).items():
-        if k not in exclude_args:
-            config[k] = v
-    return config
-
-
-def print_total_cfg(config):
-    modules = get_registered_modules()
-    color_tty = ColorTTY()
-    green = '___{}___'.format(color_tty.colors.index('green') + 31)
-
-    styled = {}
-    for key in config.keys():
-        if not config[key]:  # empty schema
-            continue
-
-        if key not in modules and not hasattr(config[key], '__dict__'):
-            styled[key] = config[key]
-            continue
-        elif key in modules:
-            module = modules[key]
-        else:
-            type_name = type(config[key]).__name__
-            if type_name in modules:
-                module = modules[type_name].copy()
-                module.update({
-                    k: v
-                    for k, v in config[key].__dict__.items()
-                    if k in module.schema
-                })
-                key += " ({})".format(type_name)
-        default = module.find_default_keys()
-        missing = module.find_missing_keys()
-        mismatch = module.find_mismatch_keys()
-        extra = module.find_extra_keys()
-        dep_missing = []
-        for dep in module.inject:
-            if isinstance(module[dep], str) and module[dep] != '':
-                if module[dep] not in modules:  # not a valid module
-                    dep_missing.append(dep)
-                else:
-                    dep_mod = modules[module[dep]]
-                    # empty dict but mandatory
-                    if not dep_mod and dep_mod.mandatory():
-                        dep_missing.append(dep)
-        override = list(
-            set(module.keys()) - set(default) - set(extra) - set(dep_missing))
-        replacement = {}
-        for name in set(override + default + extra + mismatch + missing):
-            new_name = name
-            if name in missing:
-                value = "<missing>"
-            else:
-                value = module[name]
-
-            if name in extra:
-                value = dump_value(value) + " <extraneous>"
-            elif name in mismatch:
-                value = dump_value(value) + " <type mismatch>"
-            elif name in dep_missing:
-                value = dump_value(value) + " <module config missing>"
-            elif name in override and value != '':
-                mark = green
-                new_name = mark + name
-            replacement[new_name] = value
-        styled[key] = replacement
-    buffer = yaml.dump(styled, default_flow_style=False, default_style='')
-    buffer = (re.sub(r"<missing>", r"\033[31m<missing>\033[0m", buffer))
-    buffer = (re.sub(r"<extraneous>", r"\033[33m<extraneous>\033[0m", buffer))
-    buffer = (re.sub(r"<type mismatch>", r"\033[31m<type mismatch>\033[0m", buffer))
-    buffer = (re.sub(r"<module config missing>",
-                     r"\033[31m<module config missing>\033[0m", buffer))
-    buffer = re.sub(r"___(\d+)___(.*?):", r"\033[\1m\2\033[0m:", buffer)
-    print(buffer)
diff --git a/pdfdet/models/Paddle/ppdet/utils/colormap.py b/pdfdet/models/Paddle/ppdet/utils/colormap.py
deleted file mode 100644
index 67c68dc..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/colormap.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
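
For reference, _parse_opt above expands dotted -o overrides into nested dicts, with the values typed by YAML. A quick sketch of the resulting behavior (the config path is a placeholder):

parser = ArgsParser()
args = parser.parse_args(
    ['-c', 'config.yml', '-o', 'use_gpu=false', 'TrainReader.batch_size=2'])
# YAML-typed scalars and nested keys:
# args.opt == {'use_gpu': False, 'TrainReader': {'batch_size': 2}}
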
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - - -def colormap(rgb=False): - """ - Get colormap - - The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py - """ - color_list = np.array([ - 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, - 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, - 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, - 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, - 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, - 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, - 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, - 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, - 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, - 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, - 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, - 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, - 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, - 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, - 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, - 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, - 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, - 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, - 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, - 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, - 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, - 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, - 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, - 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 - ]).astype(np.float32) - color_list = color_list.reshape((-1, 3)) * 255 - if not rgb: - color_list = color_list[:, ::-1] - return color_list.astype('int32') diff --git a/pdfdet/models/Paddle/ppdet/utils/compact.py b/pdfdet/models/Paddle/ppdet/utils/compact.py deleted file mode 100644 index b2f803b..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/compact.py +++ /dev/null @@ -1,11 +0,0 @@ -import PIL - -def imagedraw_textsize_c(draw, text, font=None): - if int(PIL.__version__.split('.')[0]) < 10: - tw, th = draw.textsize(text, font=font) - else: - left, top, right, bottom = draw.textbbox((0, 0), text, font=font) - tw, th = right - left, bottom - top - - return tw, th - diff --git a/pdfdet/models/Paddle/ppdet/utils/download.py b/pdfdet/models/Paddle/ppdet/utils/download.py deleted file mode 100644 index a7909b8..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/download.py +++ /dev/null @@ -1,560 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
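
The Pillow shim imagedraw_textsize_c above exists because ImageDraw.textsize was removed in Pillow 10; textbbox is its replacement, and the shim measures text the same way on both sides of that version boundary. A minimal usage sketch:

from PIL import Image, ImageDraw

draw = ImageDraw.Draw(Image.new('RGB', (200, 50)))
tw, th = imagedraw_textsize_c(draw, 'hello')  # works on Pillow <10 and >=10
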
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import os.path as osp -import sys -import yaml -import time -import shutil -import requests -import tqdm -import hashlib -import base64 -import binascii -import tarfile -import zipfile -import errno - -from paddle.utils.download import _get_unique_endpoints -from ppdet.core.workspace import BASE_KEY -from .logger import setup_logger -from .voc_utils import create_list - -logger = setup_logger(__name__) - -__all__ = [ - 'get_weights_path', 'get_dataset_path', 'get_config_path', - 'download_dataset', 'create_voc_list' -] - -WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") -DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") -CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") - -# dict of {dataset_name: (download_info, sub_dirs)} -# download info: [(url, md5sum)] -DATASETS = { - 'coco': ([ - ( - 'http://images.cocodataset.org/zips/train2017.zip', - 'cced6f7f71b7629ddf16f17bbcfab6b2', ), - ( - 'http://images.cocodataset.org/zips/val2017.zip', - '442b8da7639aecaf257c1dceb8ba8c80', ), - ( - 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', - 'f4bbac642086de4f52a3fdda2de5fa2c', ), - ], ["annotations", "train2017", "val2017"]), - 'voc': ([ - ( - 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', - '6cd6e144f989b92b3379bac3b3de84fd', ), - ( - 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', - 'c52e279531787c972589f7e41ab4ae64', ), - ( - 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', - 'b6e924de25625d8de591ea690078ad9f', ), - ( - 'https://paddledet.bj.bcebos.com/data/label_list.txt', - '5ae5d62183cfb6f6d3ac109359d06a1b', ), - ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), - 'wider_face': ([ - ( - 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', - '3fedf70df600953d25982bcd13d91ba2', ), - ( - 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', - 'dfa7d7e790efa35df3788964cf0bbaea', ), - ( - 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', - 'a4a898d6193db4b9ef3260a68bad0dc7', ), - ], ["WIDER_train", "WIDER_val", "wider_face_split"]), - 'fruit': ([( - 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar', - 'baa8806617a54ccf3685fa7153388ae6', ), ], - ['Annotations', 'JPEGImages']), - 'roadsign_voc': ([( - 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar', - '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']), - 'roadsign_coco': ([( - 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', - '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), - 'spine_coco': ([( - 'https://paddledet.bj.bcebos.com/data/spine.tar', - '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']), - 'coco_ce': ([( - 'https://paddledet.bj.bcebos.com/data/coco_ce.tar', - 'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], []), - 'culane': ([('https://bj.bcebos.com/v1/paddledet/data/culane.tar', None, ), ], []) -} - -DOWNLOAD_DATASETS_LIST = DATASETS.keys() - -DOWNLOAD_RETRY_LIMIT = 3 - -PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/' - - -# When running unit tests, there could be multiple processes that -# trying to create DATA_HOME directory simultaneously, so we cannot -# use a if condition to check for the existence of the directory; -# 
instead, we use the filesystem as the synchronization mechanism by -# catching returned errors. -def must_mkdirs(path): - try: - os.makedirs(path) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass - - -def parse_url(url): - url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX) - return url - - -def get_weights_path(url): - """Get weights path from WEIGHTS_HOME, if not exists, - download it from url. - """ - url = parse_url(url) - path, _ = get_path(url, WEIGHTS_HOME) - return path - - -def get_config_path(url): - """Get weights path from CONFIGS_HOME, if not exists, - download it from url. - """ - url = parse_url(url) - path = map_path(url, CONFIGS_HOME, path_depth=2) - if os.path.isfile(path): - return path - - # config file not found, try download - # 1. clear configs directory - if osp.isdir(CONFIGS_HOME): - shutil.rmtree(CONFIGS_HOME) - - # 2. get url - try: - from ppdet import __version__ as version - except ImportError: - version = None - - cfg_url = "ppdet://configs/{}/configs.tar".format(version) \ - if version else "ppdet://configs/configs.tar" - cfg_url = parse_url(cfg_url) - - # 3. download and decompress - cfg_fullname = _download_dist(cfg_url, osp.dirname(CONFIGS_HOME)) - _decompress_dist(cfg_fullname) - - # 4. check config file existing - if os.path.isfile(path): - return path - else: - logger.error("Get config {} failed after download, please contact us on " \ - "https://github.com/PaddlePaddle/PaddleDetection/issues".format(path)) - sys.exit(1) - - -def get_dataset_path(path, annotation, image_dir): - """ - If path exists, return path. - Otherwise, get dataset path from DATASET_HOME, if not exists, - download it. - """ - if _dataset_exists(path, annotation, image_dir): - return path - - data_name = os.path.split(path.strip().lower())[-1] - if data_name not in DOWNLOAD_DATASETS_LIST: - raise ValueError( - "Dataset {} is not valid for reason above, please check again.". - format(osp.realpath(path))) - else: - logger.warning( - "Dataset {} is not valid for reason above, try searching {} or " - "downloading dataset...".format(osp.realpath(path), DATASET_HOME)) - - for name, dataset in DATASETS.items(): - if data_name == name: - logger.debug("Parse dataset_dir {} as dataset " - "{}".format(path, name)) - data_dir = osp.join(DATASET_HOME, name) - - if name == "spine_coco": - if _dataset_exists(data_dir, annotation, image_dir): - return data_dir - - # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007 - if name in ['voc', 'fruit', 'roadsign_voc']: - exists = True - for sub_dir in dataset[1]: - check_dir = osp.join(data_dir, sub_dir) - if osp.exists(check_dir): - logger.info("Found {}".format(check_dir)) - else: - exists = False - if exists: - return data_dir - - # voc exist is checked above, voc is not exist here - check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc' - for url, md5sum in dataset[0]: - get_path(url, data_dir, md5sum, check_exist) - - # voc should create list after download - if name == 'voc': - create_voc_list(data_dir) - return data_dir - - raise ValueError("Dataset automaticly downloading Error.") - - -def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): - logger.debug("Create voc file list...") - devkit_dir = osp.join(data_dir, devkit_subdir) - years = ['2007', '2012'] - - # NOTE: since using auto download VOC - # dataset, VOC default label list should be used, - # do not generate label_list.txt here. 
For default - # label, see ../data/source/voc.py - create_list(devkit_dir, years, data_dir) - logger.debug("Create voc file list finished") - - -def map_path(url, root_dir, path_depth=1): - # parse path after download to decompress under root_dir - assert path_depth > 0, "path_depth should be a positive integer" - dirname = url - for _ in range(path_depth): - dirname = osp.dirname(dirname) - fpath = osp.relpath(url, dirname) - - zip_formats = ['.zip', '.tar', '.gz'] - for zip_format in zip_formats: - fpath = fpath.replace(zip_format, '') - return osp.join(root_dir, fpath) - - -def get_path(url, root_dir, md5sum=None, check_exist=True): - """ Download from given url to root_dir. - if file or directory specified by url is exists under - root_dir, return the path directly, otherwise download - from url and decompress it, return the path. - - url (str): download url - root_dir (str): root dir for downloading, it should be - WEIGHTS_HOME or DATASET_HOME - md5sum (str): md5 sum of download package - """ - # parse path after download to decompress under root_dir - fullpath = map_path(url, root_dir) - - # For same zip file, decompressed directory name different - # from zip file name, rename by following map - decompress_name_map = { - "VOCtrainval_11-May-2012": "VOCdevkit/VOC2012", - "VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007", - "VOCtest_06-Nov-2007": "VOCdevkit/VOC2007", - "annotations_trainval": "annotations" - } - for k, v in decompress_name_map.items(): - if fullpath.find(k) >= 0: - fullpath = osp.join(osp.split(fullpath)[0], v) - - if osp.exists(fullpath) and check_exist: - if not osp.isfile(fullpath) or \ - _check_exist_file_md5(fullpath, md5sum, url): - logger.debug("Found {}".format(fullpath)) - return fullpath, True - else: - os.remove(fullpath) - - fullname = _download_dist(url, root_dir, md5sum) - - # new weights format which postfix is 'pdparams' not - # need to decompress - if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml', '.ttf']: - _decompress_dist(fullname) - - return fullpath, False - - -def download_dataset(path, dataset=None): - if dataset not in DATASETS.keys(): - logger.error("Unknown dataset {}, it should be " - "{}".format(dataset, DATASETS.keys())) - return - dataset_info = DATASETS[dataset][0] - for info in dataset_info: - get_path(info[0], path, info[1], False) - logger.debug("Download dataset {} finished.".format(dataset)) - - -def _dataset_exists(path, annotation, image_dir): - """ - Check if user define dataset exists - """ - if not osp.exists(path): - logger.warning("Config dataset_dir {} is not exits, " - "dataset config is not valid".format(path)) - return False - - if annotation: - annotation_path = osp.join(path, annotation) - if not osp.isfile(annotation_path): - logger.warning("Config annotation {} is not a " - "file, dataset config is not " - "valid".format(annotation_path)) - return False - if image_dir: - image_path = osp.join(path, image_dir) - if not osp.isdir(image_path): - logger.warning("Config image_dir {} is not a " - "directory, dataset config is not " - "valid".format(image_path)) - return False - return True - - -def _download(url, path, md5sum=None): - """ - Download from url, save to path. 
- - url (str): download url - path (str): download to given path - """ - must_mkdirs(path) - - fname = osp.split(url)[-1] - fullname = osp.join(path, fname) - retry_cnt = 0 - - while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum, - url)): - if retry_cnt < DOWNLOAD_RETRY_LIMIT: - retry_cnt += 1 - else: - raise RuntimeError("Download from {} failed. " - "Retry limit reached".format(url)) - - logger.info("Downloading {} from {}".format(fname, url)) - - # NOTE: windows path join may incur \, which is invalid in url - if sys.platform == "win32": - url = url.replace('\\', '/') - - req = requests.get(url, stream=True) - if req.status_code != 200: - raise RuntimeError("Downloading from {} failed with code " - "{}!".format(url, req.status_code)) - - # For protecting download interupted, download to - # tmp_fullname firstly, move tmp_fullname to fullname - # after download finished - tmp_fullname = fullname + "_tmp" - total_size = req.headers.get('content-length') - with open(tmp_fullname, 'wb') as f: - if total_size: - for chunk in tqdm.tqdm( - req.iter_content(chunk_size=1024), - total=(int(total_size) + 1023) // 1024, - unit='KB'): - f.write(chunk) - else: - for chunk in req.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - shutil.move(tmp_fullname, fullname) - return fullname - - -def _download_dist(url, path, md5sum=None): - env = os.environ - if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: - # Mainly used to solve the problem of downloading data from - # different machines in the case of multiple machines. - # Different nodes will download data, and the same node - # will only download data once. - # Reference https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/utils/download.py#L108 - rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) - num_trainers = int(env['PADDLE_TRAINERS_NUM']) - if num_trainers <= 1: - return _download(url, path, md5sum) - else: - fname = osp.split(url)[-1] - fullname = osp.join(path, fname) - lock_path = fullname + '.download.lock' - - must_mkdirs(path) - - if not osp.exists(fullname): - with open(lock_path, 'w'): # touch - os.utime(lock_path, None) - if rank_id_curr_node == 0: - _download(url, path, md5sum) - os.remove(lock_path) - else: - while os.path.exists(lock_path): - time.sleep(0.5) - return fullname - else: - return _download(url, path, md5sum) - - -def _check_exist_file_md5(filename, md5sum, url): - # if md5sum is None, and file to check is weights file, - # read md5um from url and check, else check md5sum directly - return _md5check_from_url(filename, url) if md5sum is None \ - and filename.endswith('pdparams') \ - else _md5check(filename, md5sum) - - -def _md5check_from_url(filename, url): - # For weights in bcebos URLs, MD5 value is contained - # in request header as 'content_md5' - req = requests.get(url, stream=True) - content_md5 = req.headers.get('content-md5') - req.close() - if not content_md5 or _md5check( - filename, - binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( - )): - return True - else: - return False - - -def _md5check(fullname, md5sum=None): - if md5sum is None: - return True - - logger.debug("File {} md5 checking...".format(fullname)) - md5 = hashlib.md5() - with open(fullname, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - md5.update(chunk) - calc_md5sum = md5.hexdigest() - - if calc_md5sum != md5sum: - logger.warning("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) - return False - 
return True
-
-
-def _decompress(fname):
-    """
-    Decompress zip and tar files.
-    """
-    logger.info("Decompressing {}...".format(fname))
-
-    # To protect against interrupted decompression, decompress
-    # into the fpath_tmp directory first; if decompression
-    # succeeds, move the decompressed files to fpath, then delete
-    # fpath_tmp and remove the downloaded compressed file.
-    fpath = osp.split(fname)[0]
-    fpath_tmp = osp.join(fpath, 'tmp')
-    if osp.isdir(fpath_tmp):
-        shutil.rmtree(fpath_tmp)
-    os.makedirs(fpath_tmp)
-
-    if fname.find('tar') >= 0:
-        with tarfile.open(fname) as tf:
-            tf.extractall(path=fpath_tmp)
-    elif fname.find('zip') >= 0:
-        with zipfile.ZipFile(fname) as zf:
-            zf.extractall(path=fpath_tmp)
-    elif fname.find('.txt') >= 0:
-        return
-    else:
-        raise TypeError("Unsupported compressed file type {}".format(fname))
-
-    for f in os.listdir(fpath_tmp):
-        src_dir = osp.join(fpath_tmp, f)
-        dst_dir = osp.join(fpath, f)
-        _move_and_merge_tree(src_dir, dst_dir)
-
-    shutil.rmtree(fpath_tmp)
-    os.remove(fname)
-
-
-def _decompress_dist(fname):
-    env = os.environ
-    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
-        trainer_id = int(env['PADDLE_TRAINER_ID'])
-        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
-        if num_trainers <= 1:
-            _decompress(fname)
-        else:
-            lock_path = fname + '.decompress.lock'
-            from paddle.distributed import ParallelEnv
-            unique_endpoints = _get_unique_endpoints(ParallelEnv()
-                                                     .trainer_endpoints[:])
-            # NOTE(dkp): _decompress_dist is always performed after
-            # _download_dist; in _download_dist, sub-trainers wait for the
-            # download lock file to be released by sleeping. If decompression
-            # is very fast and finishes within the sleeping gap (e.g. on tiny
-            # datasets such as coco_ce or spine_coco), the main trainer may
-            # finish decompressing and release the lock file early, so we
-            # only create the lock file in the main trainer and have all
-            # sub-trainers wait 1s for the main trainer to create it; since
-            # 1s is twice the sleeping gap, this waiting time keeps the whole
-            # trainer pipeline in order
-            # **change this if you have a more elegant method**
-            if ParallelEnv().current_endpoint in unique_endpoints:
-                with open(lock_path, 'w'):  # touch
-                    os.utime(lock_path, None)
-                _decompress(fname)
-                os.remove(lock_path)
-            else:
-                time.sleep(1)
-                while os.path.exists(lock_path):
-                    time.sleep(0.5)
-    else:
-        _decompress(fname)
-
-
-def _move_and_merge_tree(src, dst):
-    """
-    Move the src directory to dst; if dst already exists,
-    merge src into dst.
-    """
-    if not osp.exists(dst):
-        shutil.move(src, dst)
-    elif osp.isfile(src):
-        shutil.move(src, dst)
-    else:
-        for fp in os.listdir(src):
-            src_fp = osp.join(src, fp)
-            dst_fp = osp.join(dst, fp)
-            if osp.isdir(src_fp):
-                if osp.isdir(dst_fp):
-                    _move_and_merge_tree(src_fp, dst_fp)
-                else:
-                    shutil.move(src_fp, dst_fp)
-            elif osp.isfile(src_fp) and \
-                    not osp.isfile(dst_fp):
-                shutil.move(src_fp, dst_fp)
diff --git a/pdfdet/models/Paddle/ppdet/utils/fuse_utils.py b/pdfdet/models/Paddle/ppdet/utils/fuse_utils.py
deleted file mode 100644
index 647fa99..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/fuse_utils.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import paddle -import paddle.nn as nn - -__all__ = ['fuse_conv_bn'] - - -def fuse_conv_bn(model): - is_train = False - if model.training: - model.eval() - is_train = True - fuse_list = [] - tmp_pair = [None, None] - for name, layer in model.named_sublayers(): - if isinstance(layer, nn.Conv2D): - tmp_pair[0] = name - if isinstance(layer, nn.BatchNorm2D): - tmp_pair[1] = name - - if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: - fuse_list.append(tmp_pair) - tmp_pair = [None, None] - model = fuse_layers(model, fuse_list) - if is_train: - model.train() - return model - - -def find_parent_layer_and_sub_name(model, name): - """ - Given the model and the name of a layer, find the parent layer and - the sub_name of the layer. - For example, if name is 'block_1/convbn_1/conv_1', the parent layer is - 'block_1/convbn_1' and the sub_name is `conv_1`. - Args: - model(paddle.nn.Layer): the model to be quantized. - name(string): the name of a layer - - Returns: - parent_layer, subname - """ - assert isinstance(model, nn.Layer), \ - "The model must be the instance of paddle.nn.Layer." - assert len(name) > 0, "The input (name) should not be empty." - - last_idx = 0 - idx = 0 - parent_layer = model - while idx < len(name): - if name[idx] == '.': - sub_name = name[last_idx:idx] - if hasattr(parent_layer, sub_name): - parent_layer = getattr(parent_layer, sub_name) - last_idx = idx + 1 - idx += 1 - sub_name = name[last_idx:idx] - return parent_layer, sub_name - - -class Identity(nn.Layer): - '''a layer to replace bn or relu layers''' - - def __init__(self, *args, **kwargs): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -def fuse_layers(model, layers_to_fuse, inplace=False): - ''' - fuse layers in layers_to_fuse - - Args: - model(nn.Layer): The model to be fused. - layers_to_fuse(list): The layers' names to be fused. For - example,"fuse_list = [["conv1", "bn1"], ["conv2", "bn2"]]". - A TypeError would be raised if "fuse" was set as - True but "fuse_list" was None. - Default: None. - inplace(bool): Whether apply fusing to the input model. - Default: False. - - Return - fused_model(paddle.nn.Layer): The fused model. 
- ''' - if not inplace: - model = copy.deepcopy(model) - for layers_list in layers_to_fuse: - layer_list = [] - for layer_name in layers_list: - parent_layer, sub_name = find_parent_layer_and_sub_name(model, - layer_name) - layer_list.append(getattr(parent_layer, sub_name)) - new_layers = _fuse_func(layer_list) - for i, item in enumerate(layers_list): - parent_layer, sub_name = find_parent_layer_and_sub_name(model, item) - setattr(parent_layer, sub_name, new_layers[i]) - return model - - -def _fuse_func(layer_list): - '''choose the fuser method and fuse layers''' - types = tuple(type(m) for m in layer_list) - fusion_method = types_to_fusion_method.get(types, None) - new_layers = [None] * len(layer_list) - fused_layer = fusion_method(*layer_list) - for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items(): - fused_layer.register_forward_pre_hook(pre_hook_fn) - del layer_list[0]._forward_pre_hooks[handle_id] - for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items(): - fused_layer.register_forward_post_hook(hook_fn) - del layer_list[-1]._forward_post_hooks[handle_id] - new_layers[0] = fused_layer - for i in range(1, len(layer_list)): - identity = Identity() - identity.training = layer_list[0].training - new_layers[i] = identity - return new_layers - - -def _fuse_conv_bn(conv, bn): - '''fuse conv and bn for train or eval''' - assert(conv.training == bn.training),\ - "Conv and BN both must be in the same mode (train or eval)." - if conv.training: - assert bn._num_features == conv._out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' - raise NotImplementedError - else: - return _fuse_conv_bn_eval(conv, bn) - - -def _fuse_conv_bn_eval(conv, bn): - '''fuse conv and bn for eval''' - assert (not (conv.training or bn.training)), "Fusion only for eval!" - fused_conv = copy.deepcopy(conv) - - fused_weight, fused_bias = _fuse_conv_bn_weights( - fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon, - bn.weight, bn.bias) - fused_conv.weight.set_value(fused_weight) - if fused_conv.bias is None: - fused_conv.bias = paddle.create_parameter( - shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype) - fused_conv.bias.set_value(fused_bias) - return fused_conv - - -def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): - '''fuse weights and bias of conv and bn''' - if conv_b is None: - conv_b = paddle.zeros_like(bn_rm) - if bn_w is None: - bn_w = paddle.ones_like(bn_rm) - if bn_b is None: - bn_b = paddle.zeros_like(bn_rm) - bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps) - conv_w = conv_w * \ - (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) - conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b - return conv_w, conv_b - - -types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, } diff --git a/pdfdet/models/Paddle/ppdet/utils/logger.py b/pdfdet/models/Paddle/ppdet/utils/logger.py deleted file mode 100644 index 51e2962..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/logger.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
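
The weight folding in _fuse_conv_bn_weights above implements the usual conv+BN fusion identity: w' = w * gamma / sqrt(var + eps) and b' = (b - mean) * gamma / sqrt(var + eps) + beta, so that the fused conv reproduces bn(conv(x)) in eval mode. A small numeric check of that identity (shapes arbitrary; assumes a Paddle runtime and the _fuse_conv_bn_weights helper deleted above):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

conv = nn.Conv2D(3, 8, 3, bias_attr=False)
bn = nn.BatchNorm2D(8)
conv.eval(); bn.eval()

x = paddle.randn([1, 3, 16, 16])
y_ref = bn(conv(x))

w, b = _fuse_conv_bn_weights(conv.weight, conv.bias, bn._mean, bn._variance,
                             bn._epsilon, bn.weight, bn.bias)
y_fused = F.conv2d(x, w, b)
print(float((y_ref - y_fused).abs().max()))  # should be ~1e-6
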
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import sys - -import paddle.distributed as dist - -__all__ = ['setup_logger'] - -logger_initialized = [] - - -def setup_logger(name="ppdet", output=None): - """ - Initialize logger and set its verbosity level to INFO. - Args: - output (str): a file name or a directory to save log. If None, will not save log file. - If ends with ".txt" or ".log", assumed to be a file name. - Otherwise, logs will be saved to `output/log.txt`. - name (str): the root module name of this logger - - Returns: - logging.Logger: a logger - """ - logger = logging.getLogger(name) - if name in logger_initialized: - return logger - - logger.setLevel(logging.INFO) - logger.propagate = False - - formatter = logging.Formatter( - "[%(asctime)s] %(name)s %(levelname)s: %(message)s", - datefmt="%m/%d %H:%M:%S") - # stdout logging: master only - local_rank = dist.get_rank() - if local_rank == 0: - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(logging.DEBUG) - ch.setFormatter(formatter) - logger.addHandler(ch) - - # file logging: all workers - if output is not None: - if output.endswith(".txt") or output.endswith(".log"): - filename = output - else: - filename = os.path.join(output, "log.txt") - if local_rank > 0: - filename = filename + ".rank{}".format(local_rank) - os.makedirs(os.path.dirname(filename)) - fh = logging.FileHandler(filename, mode='a') - fh.setLevel(logging.DEBUG) - fh.setFormatter(logging.Formatter()) - logger.addHandler(fh) - logger_initialized.append(name) - return logger diff --git a/pdfdet/models/Paddle/ppdet/utils/profiler.py b/pdfdet/models/Paddle/ppdet/utils/profiler.py deleted file mode 100644 index 28ac467..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/profiler.py +++ /dev/null @@ -1,129 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import paddle -import paddle.profiler as profiler - -# A global variable to record the number of calling times for profiler -# functions. It is used to specify the tracing range of training steps. -_profiler_step_id = 0 - -# A global variable to avoid parsing from string every time. -_profiler_options = None -_prof = None - -class ProfilerOptions(object): - ''' - Use a string to initialize a ProfilerOptions. - The string should be in the format: "key1=value1;key2=value;key3=value3". 
For example:
-    "profile_path=model.profile"
-    "batch_range=[50, 60]; profile_path=model.profile"
-    "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
-
-    ProfilerOptions supports the following key-value pairs:
-    batch_range      - an integer list, e.g. [100, 110].
-    state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
-    sorted_key       - a string, the optional values are 'calls', 'total',
-                       'max', 'min' or 'ave'.
-    tracer_option    - a string, the optional values are 'Default', 'OpDetail',
-                       'AllOpDetail'.
-    profile_path     - a string, the path to save the serialized profile data,
-                       which can be used to generate a timeline.
-    exit_on_finished - a boolean.
-    '''
-
-    def __init__(self, options_str):
-        assert isinstance(options_str, str)
-
-        self._options = {
-            'batch_range': [10, 20],
-            'state': 'All',
-            'sorted_key': 'total',
-            'tracer_option': 'Default',
-            'profile_path': '/tmp/profile',
-            'exit_on_finished': True,
-            'timer_only': True
-        }
-        self._parse_from_string(options_str)
-
-    def _parse_from_string(self, options_str):
-        for kv in options_str.replace(' ', '').split(';'):
-            key, value = kv.split('=')
-            if key == 'batch_range':
-                value_list = value.replace('[', '').replace(']', '').split(',')
-                value_list = list(map(int, value_list))
-                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
-                        1] > value_list[0]:
-                    self._options[key] = value_list
-            elif key == 'exit_on_finished':
-                self._options[key] = value.lower() in ("yes", "true", "t", "1")
-            elif key in [
-                    'state', 'sorted_key', 'tracer_option', 'profile_path'
-            ]:
-                self._options[key] = value
-            elif key == 'timer_only':
-                self._options[key] = value
-
-    def __getitem__(self, name):
-        if self._options.get(name, None) is None:
-            raise ValueError(
-                "ProfilerOptions does not have an option named %s." % name)
-        return self._options[name]
-
-
-def add_profiler_step(options_str=None):
-    '''
-    Enable operator-level timing using PaddlePaddle's profiler.
-    The profiler uses an independent variable to count the profiler steps;
-    one call of this function is treated as one profiler step.
-    Args:
-        profiler_options - a string used to initialize the ProfilerOptions.
-                           Default is None, and the profiler is disabled.
-    '''
-    if options_str is None:
-        return
-
-    global _prof
-    global _profiler_step_id
-    global _profiler_options
-
-    if _profiler_options is None:
-        _profiler_options = ProfilerOptions(options_str)
-    # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan
-    # timer_only = True: only the model's throughput and time overhead are displayed
-    # timer_only = False: calling summary can print a statistical form that presents performance data from different perspectives.
- # timer_only = False the output Timeline information can be found in the profiler_log directory - if _prof is None: - _timer_only = str(_profiler_options['timer_only']) == str(True) - _prof = profiler.Profiler( - scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), - on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), - timer_only = _timer_only) - _prof.start() - else: - _prof.step() - - if _profiler_step_id == _profiler_options['batch_range'][1]: - _prof.stop() - _prof.summary( - op_detail=True, - thread_sep=False, - time_unit='ms') - _prof = None - if _profiler_options['exit_on_finished']: - sys.exit(0) - - _profiler_step_id += 1 diff --git a/pdfdet/models/Paddle/ppdet/utils/stats.py b/pdfdet/models/Paddle/ppdet/utils/stats.py deleted file mode 100644 index c070e65..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/stats.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import numpy as np - -__all__ = ['SmoothedValue', 'TrainingStats'] - - -class SmoothedValue(object): - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({avg:.4f})" - self.deque = collections.deque(maxlen=window_size) - self.fmt = fmt - self.total = 0. 
- self.count = 0 - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - @property - def median(self): - return np.median(self.deque) - - @property - def avg(self): - return np.mean(self.deque) - - @property - def max(self): - return np.max(self.deque) - - @property - def value(self): - return self.deque[-1] - - @property - def global_avg(self): - return self.total / self.count - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, max=self.max, value=self.value) - - -class TrainingStats(object): - def __init__(self, window_size, delimiter=' '): - self.meters = None - self.window_size = window_size - self.delimiter = delimiter - - def update(self, stats): - if self.meters is None: - self.meters = { - k: SmoothedValue(self.window_size) - for k in stats.keys() - } - for k, v in self.meters.items(): - v.update(float(stats[k])) - - def get(self, extras=None): - stats = collections.OrderedDict() - if extras: - for k, v in extras.items(): - stats[k] = v - for k, v in self.meters.items(): - stats[k] = format(v.median, '.6f') - - return stats - - def log(self, extras=None): - d = self.get(extras) - strs = [] - for k, v in d.items(): - strs.append("{}: {}".format(k, str(v))) - return self.delimiter.join(strs) diff --git a/pdfdet/models/Paddle/ppdet/utils/visualizer.py b/pdfdet/models/Paddle/ppdet/utils/visualizer.py deleted file mode 100644 index e29a189..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/visualizer.py +++ /dev/null @@ -1,465 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
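For reference, this is how the `SmoothedValue`/`TrainingStats` pair removed above was typically driven from a training loop; a minimal sketch with invented metric values, using the import path as it existed before this deletion:

from ppdet.utils.stats import TrainingStats

stats = TrainingStats(window_size=20)
for step in range(100):
    # any dict of scalar metrics works; these values are invented
    stats.update({'loss': 2.0 / (step + 1), 'lr': 0.01})

# get() reports the windowed median of each metric formatted to 6 decimal
# places; log() flattens that into 'key: value' pairs, extras first.
print(stats.log(extras={'epoch': 1}))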
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import numpy as np -from PIL import Image, ImageDraw, ImageFont -import cv2 -import math - -from .colormap import colormap -from ppdet.utils.logger import setup_logger -from ppdet.utils.compact import imagedraw_textsize_c -from ppdet.utils.download import get_path -logger = setup_logger(__name__) - -__all__ = ['visualize_results'] - - -def visualize_results(image, - bbox_res, - mask_res, - segm_res, - keypoint_res, - pose3d_res, - im_id, - catid2name, - threshold=0.5): - """ - Visualize bbox and mask results - """ - if bbox_res is not None: - image = draw_bbox(image, im_id, catid2name, bbox_res, threshold) - if mask_res is not None: - image = draw_mask(image, im_id, mask_res, threshold) - if segm_res is not None: - image = draw_segm(image, im_id, catid2name, segm_res, threshold) - if keypoint_res is not None: - image = draw_pose(image, keypoint_res, threshold) - if pose3d_res is not None: - pose3d = np.array(pose3d_res[0]['pose3d']) * 1000 - image = draw_pose3d(image, pose3d, visual_thread=threshold) - return image - - -def draw_mask(image, im_id, segms, threshold, alpha=0.7): - """ - Draw mask on image - """ - mask_color_id = 0 - w_ratio = .4 - color_list = colormap(rgb=True) - img_array = np.array(image).astype('float32') - for dt in np.array(segms): - if im_id != dt['image_id']: - continue - segm, score = dt['segmentation'], dt['score'] - if score < threshold: - continue - import pycocotools.mask as mask_util - mask = mask_util.decode(segm) * 255 - color_mask = color_list[mask_color_id % len(color_list), 0:3] - mask_color_id += 1 - for c in range(3): - color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 - idx = np.nonzero(mask) - img_array[idx[0], idx[1], :] *= 1.0 - alpha - img_array[idx[0], idx[1], :] += alpha * color_mask - return Image.fromarray(img_array.astype('uint8')) - - -def draw_bbox(image, im_id, catid2name, bboxes, threshold): - """ - Draw bbox on image - """ - font_url = "https://paddledet.bj.bcebos.com/simfang.ttf" - font_path , _ = get_path(font_url, "~/.cache/paddle/") - font_size = 18 - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - - draw = ImageDraw.Draw(image) - - catid2color = {} - color_list = colormap(rgb=True)[:40] - for dt in np.array(bboxes): - if im_id != dt['image_id']: - continue - catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] - if score < threshold: - continue - - if catid not in catid2color: - idx = np.random.randint(len(color_list)) - catid2color[catid] = color_list[idx] - color = tuple(catid2color[catid]) - - # draw bbox - if len(bbox) == 4: - # draw bbox - xmin, ymin, w, h = bbox - xmax = xmin + w - ymax = ymin + h - draw.line( - [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), - (xmin, ymin)], - width=2, - fill=color) - elif len(bbox) == 8: - x1, y1, x2, y2, x3, y3, x4, y4 = bbox - draw.line( - [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], - width=2, - fill=color) - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - else: - logger.error('the shape of bbox must be [M, 4] or [M, 8]!') - - # draw label - text = "{} {:.2f}".format(catid2name[catid], score) - tw, th = imagedraw_textsize_c(draw, text, font=font) - draw.rectangle( - [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) - draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255), font=font) - - return image - - -def save_result(save_path, results, 
catid2name, threshold):
-    """
-    save result as txt
-    """
-    img_id = int(results["im_id"])
-    with open(save_path, 'w') as f:
-        if "bbox_res" in results:
-            for dt in results["bbox_res"]:
-                catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']
-                if score < threshold:
-                    continue
-                # each bbox result as a line
-                # for rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4
-                # for bbox: classname score x1 y1 w h
-                bbox_pred = '{} {} '.format(catid2name[catid],
-                                            score) + ' '.join(
-                                                [str(e) for e in bbox])
-                f.write(bbox_pred + '\n')
-        elif "keypoint_res" in results:
-            for dt in results["keypoint_res"]:
-                kpts = dt['keypoints']
-                scores = dt['score']
-                keypoint_pred = [img_id, scores, kpts]
-                print(keypoint_pred, file=f)
-        else:
-            print("No valid results found, skipping txt save")
-
-
-def draw_segm(image,
-              im_id,
-              catid2name,
-              segms,
-              threshold,
-              alpha=0.7,
-              draw_box=True):
-    """
-    Draw segmentation on image
-    """
-    mask_color_id = 0
-    w_ratio = .4
-    color_list = colormap(rgb=True)
-    img_array = np.array(image).astype('float32')
-    for dt in np.array(segms):
-        if im_id != dt['image_id']:
-            continue
-        segm, score, catid = dt['segmentation'], dt['score'], dt['category_id']
-        if score < threshold:
-            continue
-        import pycocotools.mask as mask_util
-        mask = mask_util.decode(segm) * 255
-        color_mask = color_list[mask_color_id % len(color_list), 0:3]
-        mask_color_id += 1
-        for c in range(3):
-            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
-        idx = np.nonzero(mask)
-        img_array[idx[0], idx[1], :] *= 1.0 - alpha
-        img_array[idx[0], idx[1], :] += alpha * color_mask
-
-        if not draw_box:
-            from scipy import ndimage  # needed for center_of_mass
-            center_y, center_x = ndimage.center_of_mass(mask)
-            label_text = "{}".format(catid2name[catid])
-            vis_pos = (max(int(center_x) - 10, 0), int(center_y))
-            cv2.putText(img_array, label_text, vis_pos,
-                        cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255))
-        else:
-            mask = mask_util.decode(segm) * 255
-            sum_x = np.sum(mask, axis=0)
-            x = np.where(sum_x > 0.5)[0]
-            sum_y = np.sum(mask, axis=1)
-            y = np.where(sum_y > 0.5)[0]
-            x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]
-            cv2.rectangle(img_array, (x0, y0), (x1, y1),
-                          tuple(color_mask.astype('int32').tolist()), 1)
-            bbox_text = '%s %.2f' % (catid2name[catid], score)
-            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
-            cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0],
-                                                y0 - t_size[1] - 3),
-                          tuple(color_mask.astype('int32').tolist()), -1)
-            cv2.putText(
-                img_array,
-                bbox_text, (x0, y0 - 2),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                0.3, (0, 0, 0),
-                1,
-                lineType=cv2.LINE_AA)
-
-    return Image.fromarray(img_array.astype('uint8'))
-
-
-def draw_pose(image,
-              results,
-              visual_thread=0.6,
-              save_name='pose.jpg',
-              save_dir='output',
-              returnimg=False,
-              ids=None):
-    try:
-        import matplotlib.pyplot as plt
-        import matplotlib
-        plt.switch_backend('agg')
-    except Exception as e:
-        logger.error('Matplotlib not found, please install matplotlib. '
- 'for example: `pip install matplotlib`.') - raise e - - skeletons = np.array([item['keypoints'] for item in results]) - kpt_nums = 17 - if len(skeletons) > 0: - kpt_nums = int(skeletons.shape[1] / 3) - skeletons = skeletons.reshape(-1, kpt_nums, 3) - if kpt_nums == 17: #plot coco keypoint - EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), - (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), - (13, 15), (14, 16), (11, 12)] - else: #plot mpii keypoint - EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8), - (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12), - (8, 13)] - NUM_EDGES = len(EDGES) - - colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ - [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ - [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] - cmap = matplotlib.cm.get_cmap('hsv') - plt.figure() - - img = np.array(image).astype('float32') - - color_set = results['colors'] if 'colors' in results else None - - if 'bbox' in results and ids is None: - bboxs = results['bbox'] - for j, rect in enumerate(bboxs): - xmin, ymin, xmax, ymax = rect - color = colors[0] if color_set is None else colors[color_set[j] % - len(colors)] - cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) - - canvas = img.copy() - for i in range(kpt_nums): - for j in range(len(skeletons)): - if skeletons[j][i, 2] < visual_thread: - continue - if ids is None: - color = colors[i] if color_set is None else colors[color_set[j] - % - len(colors)] - else: - color = get_color(ids[j]) - - cv2.circle( - canvas, - tuple(skeletons[j][i, 0:2].astype('int32')), - 2, - color, - thickness=-1) - - to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0) - fig = matplotlib.pyplot.gcf() - - stickwidth = 2 - - for i in range(NUM_EDGES): - for j in range(len(skeletons)): - edge = EDGES[i] - if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[ - 1], 2] < visual_thread: - continue - - cur_canvas = canvas.copy() - X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] - Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 - angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) - if ids is None: - color = colors[i] if color_set is None else colors[color_set[j] - % - len(colors)] - else: - color = get_color(ids[j]) - cv2.fillConvexPoly(cur_canvas, polygon, color) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) - image = Image.fromarray(canvas.astype('uint8')) - plt.close() - return image - - -def draw_pose3d(image, - pose3d, - pose2d=None, - visual_thread=0.6, - save_name='pose3d.jpg', - returnimg=True): - try: - import matplotlib.pyplot as plt - import matplotlib - plt.switch_backend('agg') - except Exception as e: - logger.error('Matplotlib not found, please install matplotlib.' 
-                     ' For example: `pip install matplotlib`.')
-        raise e
-
-    if pose3d.shape[0] == 24:
-        joints_connectivity_dict = [
-            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1],
-            [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0],
-            [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1],
-            [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0],
-            [23, 21, 1]
-        ]
-    elif pose3d.shape[0] == 14:
-        joints_connectivity_dict = [
-            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0],
-            [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1],
-            [8, 12, 0], [9, 12, 1], [12, 13, 1]
-        ]
-    else:
-        print(
-            "undefined joint count {}; cannot visualize because the joint connectivity is unknown".
-            format(pose3d.shape[0]))
-        return
-
-    def draw3Dpose(pose3d,
-                   ax,
-                   lcolor="#3498db",
-                   rcolor="#e74c3c",
-                   add_labels=False):
-        # pose3d = orthographic_projection(pose3d, cam)
-        for i in joints_connectivity_dict:
-            x, y, z = [
-                np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3)
-            ]
-            ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor)
-
-        RADIUS = 1000
-        center_xy = 2 if pose3d.shape[0] == 14 else 14
-        x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy,
-                                                                     2]
-        ax.set_xlim3d([-RADIUS + x, RADIUS + x])
-        ax.set_ylim3d([-RADIUS + y, RADIUS + y])
-        ax.set_zlim3d([-RADIUS + z, RADIUS + z])
-
-        ax.set_xlabel("x")
-        ax.set_ylabel("y")
-        ax.set_zlabel("z")
-
-    def draw2Dpose(pose2d,
-                   ax,
-                   lcolor="#3498db",
-                   rcolor="#e74c3c",
-                   add_labels=False):
-        for i in joints_connectivity_dict:
-            if pose2d[i[0], 2] and pose2d[i[1], 2]:
-                x, y = [
-                    np.array([pose2d[i[0], j], pose2d[i[1], j]])
-                    for j in range(2)
-                ]
-                ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor)
-
-    def draw_img_pose(pose3d,
-                      pose2d=None,
-                      frame=None,
-                      figsize=(12, 12),
-                      savepath=None):
-        fig = plt.figure(figsize=figsize, dpi=80)
-        # fig.clear()
-        fig.tight_layout()
-
-        ax = fig.add_subplot(221)
-        if frame is not None:
-            ax.imshow(frame, interpolation='nearest')
-        if pose2d is not None:
-            draw2Dpose(pose2d, ax)
-
-        ax = fig.add_subplot(222, projection='3d')
-        ax.view_init(45, 45)
-        draw3Dpose(pose3d, ax)
-        ax = fig.add_subplot(223, projection='3d')
-        ax.view_init(0, 0)
-        draw3Dpose(pose3d, ax)
-        ax = fig.add_subplot(224, projection='3d')
-        ax.view_init(0, 90)
-        draw3Dpose(pose3d, ax)
-
-        if savepath is not None:
-            plt.savefig(savepath)
-            plt.close()
-        else:
-            return fig
-
-    def fig2data(fig):
-        """
-        fig = plt.figure()
-        image = fig2data(fig)
-        @brief Convert a Matplotlib figure to a 3D numpy array with RGBA channels and return it
-        @param fig a matplotlib figure
-        @return a numpy 3D array of RGBA values
-        """
-        # draw the renderer
-        fig.canvas.draw()
-
-        # Get the RGBA buffer from the figure
-        w, h = fig.canvas.get_width_height()
-        buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
-        buf.shape = (w, h, 4)
-
-        # canvas.tostring_argb gives a pixmap in ARGB format. Roll the alpha channel to convert it to RGBA
-        buf = np.roll(buf, 3, axis=2)
-        image = Image.frombytes("RGBA", (w, h), buf.tobytes())
-        return image.convert("RGB")
-
-    fig = draw_img_pose(pose3d, pose2d, frame=image)
-    data = fig2data(fig)
-    if returnimg is False:
-        data.save(save_name)
-    else:
-        return data
diff --git a/pdfdet/models/Paddle/ppdet/utils/voc_utils.py b/pdfdet/models/Paddle/ppdet/utils/voc_utils.py
deleted file mode 100644
index cd6d9f9..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/voc_utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors.
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import os.path as osp -import re -import random - -__all__ = ['create_list'] - - -def create_list(devkit_dir, years, output_dir): - """ - create following list: - 1. trainval.txt - 2. test.txt - """ - trainval_list = [] - test_list = [] - for year in years: - trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) - trainval_list.extend(trainval) - test_list.extend(test) - - random.shuffle(trainval_list) - with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: - for item in trainval_list: - ftrainval.write(item[0] + ' ' + item[1] + '\n') - - with open(osp.join(output_dir, 'test.txt'), 'w') as fval: - ct = 0 - for item in test_list: - ct += 1 - fval.write(item[0] + ' ' + item[1] + '\n') - - -def _get_voc_dir(devkit_dir, year, type): - return osp.join(devkit_dir, 'VOC' + year, type) - - -def _walk_voc_dir(devkit_dir, year, output_dir): - filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') - annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') - img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') - trainval_list = [] - test_list = [] - added = set() - - for _, _, files in os.walk(filelist_dir): - for fname in files: - img_ann_list = [] - if re.match(r'[a-z]+_trainval\.txt', fname): - img_ann_list = trainval_list - elif re.match(r'[a-z]+_test\.txt', fname): - img_ann_list = test_list - else: - continue - fpath = osp.join(filelist_dir, fname) - for line in open(fpath): - name_prefix = line.strip().split()[0] - if name_prefix in added: - continue - added.add(name_prefix) - ann_path = osp.join( - osp.relpath(annotation_dir, output_dir), - name_prefix + '.xml') - img_path = osp.join( - osp.relpath(img_dir, output_dir), name_prefix + '.jpg') - img_ann_list.append((img_path, ann_path)) - - return trainval_list, test_list diff --git a/requirements.txt b/requirements.txt index 8fe0fc1..ac390ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ +cnstd paddlepaddle==2.6.0 +paddledet==2.6.0 +PyMuPDF==1.23.26 tqdm opencv-python -PyMuPDF==1.23.26 pyyaml requests six @@ -9,5 +11,4 @@ scipy scikit-learn pycocotools pandas -shapely -cnstd \ No newline at end of file +shapely \ No newline at end of file
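A closing note on the `requirements.txt` hunk: pinning `paddledet==2.6.0` next to `paddlepaddle==2.6.0` is consistent with deleting the vendored `ppdet` tree throughout this diff — the package is presumably meant to come from PyPI from now on. A minimal smoke test of that assumption, after `pip install -r requirements.txt`:

# Confirm `ppdet` now resolves to the installed paddledet wheel rather than
# the deleted vendored copy under pdfdet/models/Paddle/ppdet.
import ppdet
print(ppdet.__file__)  # expected: .../site-packages/ppdet/__init__.py

# Exercise a couple of paddledet's public entry points.
from ppdet.engine import Trainer
from ppdet.core.workspace import load_config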