diff --git a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
index fb36a74..d63d5e5 100644
--- a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
+++ b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout.yml
@@ -17,7 +17,7 @@ PicoDet:
   backbone: LCNet
   neck: CSPPAN
   head: PicoHead
-  nms_cpu: True
+  #nms_cpu: True
 
 LCNet:
   scale: 1.0
diff --git a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml
index 251a3dd..ebe7201 100644
--- a/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml
+++ b/pdfdet/models/Paddle/configs/picodet/legacy_model/application/layout_analysis/picodet_lcnet_x1_0_layout1.yml
@@ -17,7 +17,7 @@ PicoDet:
   backbone: LCNet
   neck: CSPPAN
   head: PicoHead
-  nms_cpu: True
+  #nms_cpu: True
 
 LCNet:
   scale: 1.0
diff --git a/pdfdet/models/Paddle/paddle_cdla.py b/pdfdet/models/Paddle/paddle_cdla.py
index ef65277..16efe91 100644
--- a/pdfdet/models/Paddle/paddle_cdla.py
+++ b/pdfdet/models/Paddle/paddle_cdla.py
@@ -13,8 +13,34 @@
 
 from pdfdet.models.baseModel import base_module
 from ppdet.core.workspace import load_config
-from ppdet.engine import Trainer
+from ppdet.engine import Trainer as Trainer1
+from ppdet.core.workspace import create
+
+
+class Trainer(Trainer1):
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+
+    def predict(self, images):
+        # Register the input images and build a test reader over them.
+        self.dataset.set_images(images)
+        loader = create('TestReader')(self.dataset, 0)
+
+        # Run inference batch by batch.
+        self.model.eval()
+        results = []
+        for step_id, data in enumerate(loader):
+            # forward
+            outs = self.model(data)
+
+            # Convert paddle tensors to numpy before collecting.
+            for key, value in outs.items():
+                if hasattr(value, 'numpy'):
+                    outs[key] = value.numpy()
+            results.append(outs)
+
+        return results
 
 
 class paddle_cdla_model(base_module):
     def __init__(self, *args, **kwargs) -> None:
diff --git a/pdfdet/models/Paddle/paddle_pub.py b/pdfdet/models/Paddle/paddle_pub.py
index 93da3cf..3bc7fb9 100644
--- a/pdfdet/models/Paddle/paddle_pub.py
+++ b/pdfdet/models/Paddle/paddle_pub.py
@@ -7,9 +7,9 @@
 parent_path = os.path.abspath(os.path.join(__file__, *([".."] * 1)))
 sys.path.insert(0, parent_path)
 
-from .paddle_cdla import paddle_cdla_model
+from .paddle_cdla import paddle_cdla_model, Trainer
 from ppdet.core.workspace import load_config
-from ppdet.engine import Trainer
+
 
 
 class paddle_pub_model(paddle_cdla_model):
diff --git a/pdfdet/models/Paddle/ppdet/__init__.py b/pdfdet/models/Paddle/ppdet/__init__.py
deleted file mode 100644
index ac53c97..0000000
--- a/pdfdet/models/Paddle/ppdet/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import (core, data, engine, modeling, model_zoo, optimizer, metrics,
-               utils, slim)
-
-
diff --git a/pdfdet/models/Paddle/ppdet/core/__init__.py b/pdfdet/models/Paddle/ppdet/core/__init__.py
deleted file mode 100644
index d042771..0000000
--- a/pdfdet/models/Paddle/ppdet/core/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import config
diff --git a/pdfdet/models/Paddle/ppdet/core/config/__init__.py b/pdfdet/models/Paddle/ppdet/core/config/__init__.py
deleted file mode 100644
index d0c32e2..0000000
--- a/pdfdet/models/Paddle/ppdet/core/config/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/pdfdet/models/Paddle/ppdet/core/config/schema.py b/pdfdet/models/Paddle/ppdet/core/config/schema.py
deleted file mode 100644
index 2e41b5c..0000000
--- a/pdfdet/models/Paddle/ppdet/core/config/schema.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import inspect -import importlib -import re - -try: - from docstring_parser import parse as doc_parse -except Exception: - - def doc_parse(*args): - pass - - -try: - from typeguard import check_type -except Exception: - - def check_type(*args): - pass - - -__all__ = ['SchemaValue', 'SchemaDict', 'SharedConfig', 'extract_schema'] - - -class SchemaValue(object): - def __init__(self, name, doc='', type=None): - super(SchemaValue, self).__init__() - self.name = name - self.doc = doc - self.type = type - - def set_default(self, value): - self.default = value - - def has_default(self): - return hasattr(self, 'default') - - -class SchemaDict(dict): - def __init__(self, **kwargs): - super(SchemaDict, self).__init__() - self.schema = {} - self.strict = False - self.doc = "" - self.update(kwargs) - - def __setitem__(self, key, value): - # XXX also update regular dict to SchemaDict?? - if isinstance(value, dict) and key in self and isinstance(self[key], - SchemaDict): - self[key].update(value) - else: - super(SchemaDict, self).__setitem__(key, value) - - def __missing__(self, key): - if self.has_default(key): - return self.schema[key].default - elif key in self.schema: - return self.schema[key] - else: - raise KeyError(key) - - def copy(self): - newone = SchemaDict() - newone.__dict__.update(self.__dict__) - newone.update(self) - return newone - - def set_schema(self, key, value): - assert isinstance(value, SchemaValue) - self.schema[key] = value - - def set_strict(self, strict): - self.strict = strict - - def has_default(self, key): - return key in self.schema and self.schema[key].has_default() - - def is_default(self, key): - if not self.has_default(key): - return False - if hasattr(self[key], '__dict__'): - return True - else: - return key not in self or self[key] == self.schema[key].default - - def find_default_keys(self): - return [ - k for k in list(self.keys()) + list(self.schema.keys()) - if self.is_default(k) - ] - - def mandatory(self): - return any([k for k in self.schema.keys() if not self.has_default(k)]) - - def find_missing_keys(self): - missing = [ - k for k in self.schema.keys() - if k not in self and not self.has_default(k) - ] - placeholders = [k for k in self if self[k] in ('', '')] - return missing + placeholders - - def find_extra_keys(self): - return list(set(self.keys()) - set(self.schema.keys())) - - def find_mismatch_keys(self): - mismatch_keys = [] - for arg in self.schema.values(): - if arg.type is not None: - try: - check_type("{}.{}".format(self.name, arg.name), - self[arg.name], arg.type) - except Exception: - mismatch_keys.append(arg.name) - return mismatch_keys - - def validate(self): - missing_keys = self.find_missing_keys() - if missing_keys: - raise ValueError("Missing param for class<{}>: {}".format( - self.name, ", ".join(missing_keys))) - extra_keys = self.find_extra_keys() - if extra_keys and self.strict: - raise ValueError("Extraneous param for class<{}>: {}".format( - self.name, ", ".join(extra_keys))) - mismatch_keys = self.find_mismatch_keys() - if mismatch_keys: - raise TypeError("Wrong param type for class<{}>: {}".format( - self.name, ", ".join(mismatch_keys))) - - -class SharedConfig(object): - """ - Representation class for `__shared__` annotations, which work as follows: - - - if `key` is set for the module in config file, its value will take - precedence - - if `key` is not set for the module but present in the config file, its - value 
will be used - - otherwise, use the provided `default_value` as fallback - - Args: - key: config[key] will be injected - default_value: fallback value - """ - - def __init__(self, key, default_value=None): - super(SharedConfig, self).__init__() - self.key = key - self.default_value = default_value - - -def extract_schema(cls): - """ - Extract schema from a given class - - Args: - cls (type): Class from which to extract. - - Returns: - schema (SchemaDict): Extracted schema. - """ - ctor = cls.__init__ - # python 2 compatibility - if hasattr(inspect, 'getfullargspec'): - argspec = inspect.getfullargspec(ctor) - annotations = argspec.annotations - has_kwargs = argspec.varkw is not None - else: - argspec = inspect.getfullargspec(ctor) - # python 2 type hinting workaround, see pep-3107 - # however, since `typeguard` does not support python 2, type checking - # is still python 3 only for now - annotations = getattr(ctor, '__annotations__', {}) - has_kwargs = argspec.varkw is not None - - names = [arg for arg in argspec.args if arg != 'self'] - defaults = argspec.defaults - num_defaults = argspec.defaults is not None and len(argspec.defaults) or 0 - num_required = len(names) - num_defaults - - docs = cls.__doc__ - if docs is None and getattr(cls, '__category__', None) == 'op': - docs = cls.__call__.__doc__ - try: - docstring = doc_parse(docs) - except Exception: - docstring = None - - if docstring is None: - comments = {} - else: - comments = {} - for p in docstring.params: - match_obj = re.match('^([a-zA-Z_]+[a-zA-Z_0-9]*).*', p.arg_name) - if match_obj is not None: - comments[match_obj.group(1)] = p.description - - schema = SchemaDict() - schema.name = cls.__name__ - schema.doc = "" - if docs is not None: - start_pos = docs[0] == '\n' and 1 or 0 - schema.doc = docs[start_pos:].split("\n")[0].strip() - # XXX handle paddle's weird doc convention - if '**' == schema.doc[:2] and '**' == schema.doc[-2:]: - schema.doc = schema.doc[2:-2].strip() - schema.category = hasattr(cls, '__category__') and getattr( - cls, '__category__') or 'module' - schema.strict = not has_kwargs - schema.pymodule = importlib.import_module(cls.__module__) - schema.inject = getattr(cls, '__inject__', []) - schema.shared = getattr(cls, '__shared__', []) - for idx, name in enumerate(names): - comment = name in comments and comments[name] or name - if name in schema.inject: - type_ = None - else: - type_ = name in annotations and annotations[name] or None - value_schema = SchemaValue(name, comment, type_) - if name in schema.shared: - assert idx >= num_required, "shared config must have default value" - default = defaults[idx - num_required] - value_schema.set_default(SharedConfig(name, default)) - elif idx >= num_required: - default = defaults[idx - num_required] - value_schema.set_default(default) - schema.set_schema(name, value_schema) - - return schema diff --git a/pdfdet/models/Paddle/ppdet/core/config/yaml_helpers.py b/pdfdet/models/Paddle/ppdet/core/config/yaml_helpers.py deleted file mode 100644 index 181cfe6..0000000 --- a/pdfdet/models/Paddle/ppdet/core/config/yaml_helpers.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import importlib -import inspect - -import yaml -from .schema import SharedConfig - -__all__ = ['serializable', 'Callable'] - - -def represent_dictionary_order(self, dict_data): - return self.represent_mapping('tag:yaml.org,2002:map', dict_data.items()) - - -def setup_orderdict(): - from collections import OrderedDict - yaml.add_representer(OrderedDict, represent_dictionary_order) - - -def _make_python_constructor(cls): - def python_constructor(loader, node): - if isinstance(node, yaml.SequenceNode): - args = loader.construct_sequence(node, deep=True) - return cls(*args) - else: - kwargs = loader.construct_mapping(node, deep=True) - try: - return cls(**kwargs) - except Exception as ex: - print("Error when construct {} instance from yaml config". - format(cls.__name__)) - raise ex - - return python_constructor - - -def _make_python_representer(cls): - # python 2 compatibility - if hasattr(inspect, 'getfullargspec'): - argspec = inspect.getfullargspec(cls) - else: - argspec = inspect.getfullargspec(cls.__init__) - argnames = [arg for arg in argspec.args if arg != 'self'] - - def python_representer(dumper, obj): - if argnames: - data = {name: getattr(obj, name) for name in argnames} - else: - data = obj.__dict__ - if '_id' in data: - del data['_id'] - return dumper.represent_mapping(u'!{}'.format(cls.__name__), data) - - return python_representer - - -def serializable(cls): - """ - Add loader and dumper for given class, which must be - "trivially serializable" - - Args: - cls: class to be serialized - - Returns: cls - """ - yaml.add_constructor(u'!{}'.format(cls.__name__), - _make_python_constructor(cls)) - yaml.add_representer(cls, _make_python_representer(cls)) - return cls - - -yaml.add_representer(SharedConfig, - lambda d, o: d.represent_data(o.default_value)) - - -@serializable -class Callable(object): - """ - Helper to be used in Yaml for creating arbitrary class objects - - Args: - full_type (str): the full module path to target function - """ - - def __init__(self, full_type, args=[], kwargs={}): - super(Callable, self).__init__() - self.full_type = full_type - self.args = args - self.kwargs = kwargs - - def __call__(self): - if '.' in self.full_type: - idx = self.full_type.rfind('.') - module = importlib.import_module(self.full_type[:idx]) - func_name = self.full_type[idx + 1:] - else: - try: - module = importlib.import_module('builtins') - except Exception: - module = importlib.import_module('__builtin__') - func_name = self.full_type - - func = getattr(module, func_name) - return func(*self.args, **self.kwargs) diff --git a/pdfdet/models/Paddle/ppdet/core/workspace.py b/pdfdet/models/Paddle/ppdet/core/workspace.py deleted file mode 100644 index 6735bcf..0000000 --- a/pdfdet/models/Paddle/ppdet/core/workspace.py +++ /dev/null @@ -1,292 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import importlib -import os -import sys - -import yaml -import collections - -try: - collectionsAbc = collections.abc -except AttributeError: - collectionsAbc = collections - -from .config.schema import SchemaDict, SharedConfig, extract_schema -from .config.yaml_helpers import serializable - -__all__ = [ - 'global_config', - 'load_config', - 'merge_config', - 'get_registered_modules', - 'create', - 'register', - 'serializable', - 'dump_value', -] - - -def dump_value(value): - # XXX this is hackish, but collections.abc is not available in python 2 - if hasattr(value, '__dict__') or isinstance(value, (dict, tuple, list)): - value = yaml.dump(value, default_flow_style=True) - value = value.replace('\n', '') - value = value.replace('...', '') - return "'{}'".format(value) - else: - # primitive types - return str(value) - - -class AttrDict(dict): - """Single level attribute dict, NOT recursive""" - - def __init__(self, **kwargs): - super(AttrDict, self).__init__() - super(AttrDict, self).update(kwargs) - - def __getattr__(self, key): - if key in self: - return self[key] - raise AttributeError("object has no attribute '{}'".format(key)) - - def __setattr__(self, key, value): - self[key] = value - - def copy(self): - new_dict = AttrDict() - for k, v in self.items(): - new_dict.update({k: v}) - return new_dict - - -global_config = AttrDict() - -BASE_KEY = '_BASE_' - - -# parse and load _BASE_ recursively -def _load_config_with_base(file_path): - with open(file_path) as f: - file_cfg = yaml.load(f, Loader=yaml.Loader) - - # NOTE: cfgs outside have higher priority than cfgs in _BASE_ - if BASE_KEY in file_cfg: - all_base_cfg = AttrDict() - base_ymls = list(file_cfg[BASE_KEY]) - for base_yml in base_ymls: - if base_yml.startswith("~"): - base_yml = os.path.expanduser(base_yml) - if not base_yml.startswith('/'): - base_yml = os.path.join(os.path.dirname(file_path), base_yml) - - with open(base_yml) as f: - base_cfg = _load_config_with_base(base_yml) - all_base_cfg = merge_config(base_cfg, all_base_cfg) - - del file_cfg[BASE_KEY] - return merge_config(file_cfg, all_base_cfg) - - return file_cfg - - -def load_config(file_path): - """ - Load config from file. - - Args: - file_path (str): Path of the config file to be loaded. - - Returns: global config - """ - _, ext = os.path.splitext(file_path) - assert ext in ['.yml', '.yaml'], "only support yaml files for now" - - # load config from file and merge into global config - cfg = _load_config_with_base(file_path) - cfg['filename'] = os.path.splitext(os.path.split(file_path)[-1])[0] - merge_config(cfg) - - return global_config - - -def dict_merge(dct, merge_dct): - """ Recursive dict merge. Inspired by :meth:``dict.update()``, instead of - updating only top-level keys, dict_merge recurses down into dicts nested - to an arbitrary depth, updating keys. The ``merge_dct`` is merged into - ``dct``. 
- - Args: - dct: dict onto which the merge is executed - merge_dct: dct merged into dct - - Returns: dct - """ - for k, v in merge_dct.items(): - if (k in dct and isinstance(dct[k], dict) and - isinstance(merge_dct[k], collectionsAbc.Mapping)): - dict_merge(dct[k], merge_dct[k]) - else: - dct[k] = merge_dct[k] - return dct - - -def merge_config(config, another_cfg=None): - """ - Merge config into global config or another_cfg. - - Args: - config (dict): Config to be merged. - - Returns: global config - """ - global global_config - dct = another_cfg or global_config - return dict_merge(dct, config) - - -def get_registered_modules(): - return {k: v for k, v in global_config.items() if isinstance(v, SchemaDict)} - - -def make_partial(cls): - op_module = importlib.import_module(cls.__op__.__module__) - op = getattr(op_module, cls.__op__.__name__) - cls.__category__ = getattr(cls, '__category__', None) or 'op' - - def partial_apply(self, *args, **kwargs): - kwargs_ = self.__dict__.copy() - kwargs_.update(kwargs) - return op(*args, **kwargs_) - - if getattr(cls, '__append_doc__', True): # XXX should default to True? - if sys.version_info[0] > 2: - cls.__doc__ = "Wrapper for `{}` OP".format(op.__name__) - cls.__init__.__doc__ = op.__doc__ - cls.__call__ = partial_apply - cls.__call__.__doc__ = op.__doc__ - else: - # XXX work around for python 2 - partial_apply.__doc__ = op.__doc__ - cls.__call__ = partial_apply - return cls - - -def register(cls): - """ - Register a given module class. - - Args: - cls (type): Module class to be registered. - - Returns: cls - """ - if cls.__name__ in global_config: - raise ValueError("Module class already registered: {}".format( - cls.__name__)) - if hasattr(cls, '__op__'): - cls = make_partial(cls) - global_config[cls.__name__] = extract_schema(cls) - return cls - - -def create(cls_or_name, **kwargs): - """ - Create an instance of given module class. - - Args: - cls_or_name (type or str): Class of which to create instance. 
- - Returns: instance of type `cls_or_name` - """ - assert type(cls_or_name) in [type, str - ], "should be a class or name of a class" - name = type(cls_or_name) == str and cls_or_name or cls_or_name.__name__ - if name in global_config: - if isinstance(global_config[name], SchemaDict): - pass - elif hasattr(global_config[name], "__dict__"): - # support instance return directly - return global_config[name] - else: - raise ValueError("The module {} is not registered".format(name)) - else: - raise ValueError("The module {} is not registered".format(name)) - - config = global_config[name] - cls = getattr(config.pymodule, name) - cls_kwargs = {} - cls_kwargs.update(global_config[name]) - - # parse `shared` annoation of registered modules - if getattr(config, 'shared', None): - for k in config.shared: - target_key = config[k] - shared_conf = config.schema[k].default - assert isinstance(shared_conf, SharedConfig) - if target_key is not None and not isinstance(target_key, - SharedConfig): - continue # value is given for the module - elif shared_conf.key in global_config: - # `key` is present in config - cls_kwargs[k] = global_config[shared_conf.key] - else: - cls_kwargs[k] = shared_conf.default_value - - # parse `inject` annoation of registered modules - if getattr(cls, 'from_config', None): - cls_kwargs.update(cls.from_config(config, **kwargs)) - - if getattr(config, 'inject', None): - for k in config.inject: - target_key = config[k] - # optional dependency - if target_key is None: - continue - - if isinstance(target_key, dict) or hasattr(target_key, '__dict__'): - if 'name' not in target_key.keys(): - continue - inject_name = str(target_key['name']) - if inject_name not in global_config: - raise ValueError( - "Missing injection name {} and check it's name in cfg file". - format(k)) - target = global_config[inject_name] - for i, v in target_key.items(): - if i == 'name': - continue - target[i] = v - if isinstance(target, SchemaDict): - cls_kwargs[k] = create(inject_name) - elif isinstance(target_key, str): - if target_key not in global_config: - raise ValueError("Missing injection config:", target_key) - target = global_config[target_key] - if isinstance(target, SchemaDict): - cls_kwargs[k] = create(target_key) - elif hasattr(target, '__dict__'): # serialized object - cls_kwargs[k] = target - else: - raise ValueError("Unsupported injection type:", target_key) - # prevent modification of global config values of reference types - # (e.g., list, dict) from within the created module instances - #kwargs = copy.deepcopy(kwargs) - return cls(**cls_kwargs) diff --git a/pdfdet/models/Paddle/ppdet/data/__init__.py b/pdfdet/models/Paddle/ppdet/data/__init__.py deleted file mode 100644 index a12aa32..0000000 --- a/pdfdet/models/Paddle/ppdet/data/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import source -from . import transform -from . 
import reader - -from .source import * -from .transform import * -from .reader import * diff --git a/pdfdet/models/Paddle/ppdet/data/crop_utils/__init__.py b/pdfdet/models/Paddle/ppdet/data/crop_utils/__init__.py deleted file mode 100644 index 61d5aa2..0000000 --- a/pdfdet/models/Paddle/ppdet/data/crop_utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/data/crop_utils/annotation_cropper.py b/pdfdet/models/Paddle/ppdet/data/crop_utils/annotation_cropper.py deleted file mode 100644 index e288fab..0000000 --- a/pdfdet/models/Paddle/ppdet/data/crop_utils/annotation_cropper.py +++ /dev/null @@ -1,580 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import math -import random -import numpy as np -from copy import deepcopy -from typing import List, Tuple -from collections import defaultdict - -from .chip_box_utils import nms, transform_chip_boxes2image_boxes -from .chip_box_utils import find_chips_to_cover_overlaped_boxes -from .chip_box_utils import transform_chip_box -from .chip_box_utils import intersection_over_box - - -class AnnoCropper(object): - def __init__(self, - image_target_sizes: List[int], - valid_box_ratio_ranges: List[List[float]], - chip_target_size: int, - chip_target_stride: int, - use_neg_chip: bool=False, - max_neg_num_per_im: int=8, - max_per_img: int=-1, - nms_thresh: int=0.5): - """ - Generate chips by chip_target_size and chip_target_stride. - These two parameters just like kernel_size and stride in cnn. - - Each image has its raw size. After resizing, then get its target size. - The resizing scale = target_size / raw_size. - So are chips of the image. - box_ratio = box_raw_size / image_raw_size = box_target_size / image_target_size - The 'size' above mentioned is the size of long-side of image, box or chip. 
- - :param image_target_sizes: [2000, 1000] - :param valid_box_ratio_ranges: [[-1, 0.1],[0.08, -1]] - :param chip_target_size: 500 - :param chip_target_stride: 200 - """ - self.target_sizes = image_target_sizes - self.valid_box_ratio_ranges = valid_box_ratio_ranges - assert len(self.target_sizes) == len(self.valid_box_ratio_ranges) - self.scale_num = len(self.target_sizes) - self.chip_target_size = chip_target_size # is target size - self.chip_target_stride = chip_target_stride # is target stride - self.use_neg_chip = use_neg_chip - self.max_neg_num_per_im = max_neg_num_per_im - self.max_per_img = max_per_img - self.nms_thresh = nms_thresh - - def crop_anno_records(self, records: List[dict]): - """ - The main logic: - # foreach record(image): - # foreach scale: - # 1 generate chips by chip size and stride for each scale - # 2 get pos chips - # - validate boxes: current scale; h,w >= 1 - # - find pos chips greedily by valid gt boxes in each scale - # - for every valid gt box, find its corresponding pos chips in each scale - # 3 get neg chips - # - If given proposals, find neg boxes in them which are not in pos chips - # - If got neg boxes in last step, we find neg chips and assign neg boxes to neg chips such as 2. - # 4 sample neg chips if too much each image - # transform this image-scale annotations to chips(pos chips&neg chips) annotations - - :param records, standard coco_record but with extra key `proposals`(Px4), which are predicted by stage1 - model and maybe have neg boxes in them. - :return: new_records, list of dict like - { - 'im_file': 'fake_image1.jpg', - 'im_id': np.array([1]), # new _global_chip_id as im_id - 'h': h, # chip height - 'w': w, # chip width - 'is_crowd': is_crowd, # Nx1 -> Mx1 - 'gt_class': gt_class, # Nx1 -> Mx1 - 'gt_bbox': gt_bbox, # Nx4 -> Mx4, 4 represents [x1,y1,x2,y2] - 'gt_poly': gt_poly, # [None]xN -> [None]xM - 'chip': [x1, y1, x2, y2] # added - } - - Attention: - ------------------------------>x - | - | (x1,y1)------ - | | | - | | | - | | | - | | | - | | | - | ---------- - | (x2,y2) - | - ↓ - y - - If we use [x1, y1, x2, y2] to represent boxes or chips, - (x1,y1) is the left-top point which is in the box, - but (x2,y2) is the right-bottom point which is not in the box. - So x1 in [0, w-1], x2 in [1, w], y1 in [0, h-1], y2 in [1,h]. - And you can use x2-x1 to get width, and you can use image[y1:y2, x1:x2] to get the box area. - """ - - self.chip_records = [] - self._global_chip_id = 1 - for r in records: - self._cur_im_pos_chips = [ - ] # element: (chip, boxes_idx), chip is [x1, y1, x2, y2], boxes_ids is List[int] - self._cur_im_neg_chips = [] # element: (chip, neg_box_num) - for scale_i in range(self.scale_num): - self._get_current_scale_parameters(scale_i, r) - - # Cx4 - chips = self._create_chips(r['h'], r['w'], self._cur_scale) - - # # dict: chipid->[box_id, ...] 
- pos_chip2boxes_idx = self._get_valid_boxes_and_pos_chips( - r['gt_bbox'], chips) - - # dict: chipid->neg_box_num - neg_chip2box_num = self._get_neg_boxes_and_chips( - chips, - list(pos_chip2boxes_idx.keys()), r.get('proposals', None)) - - self._add_to_cur_im_chips(chips, pos_chip2boxes_idx, - neg_chip2box_num) - - cur_image_records = self._trans_all_chips2annotations(r) - self.chip_records.extend(cur_image_records) - return self.chip_records - - def _add_to_cur_im_chips(self, chips, pos_chip2boxes_idx, neg_chip2box_num): - for pos_chipid, boxes_idx in pos_chip2boxes_idx.items(): - chip = np.array(chips[pos_chipid]) # copy chips slice - self._cur_im_pos_chips.append((chip, boxes_idx)) - - if neg_chip2box_num is None: - return - - for neg_chipid, neg_box_num in neg_chip2box_num.items(): - chip = np.array(chips[neg_chipid]) - self._cur_im_neg_chips.append((chip, neg_box_num)) - - def _trans_all_chips2annotations(self, r): - gt_bbox = r['gt_bbox'] - im_file = r['im_file'] - is_crowd = r['is_crowd'] - gt_class = r['gt_class'] - # gt_poly = r['gt_poly'] # [None]xN - # remaining keys: im_id, h, w - chip_records = self._trans_pos_chips2annotations(im_file, gt_bbox, - is_crowd, gt_class) - - if not self.use_neg_chip: - return chip_records - - sampled_neg_chips = self._sample_neg_chips() - neg_chip_records = self._trans_neg_chips2annotations(im_file, - sampled_neg_chips) - chip_records.extend(neg_chip_records) - return chip_records - - def _trans_pos_chips2annotations(self, im_file, gt_bbox, is_crowd, - gt_class): - chip_records = [] - for chip, boxes_idx in self._cur_im_pos_chips: - chip_bbox, final_boxes_idx = transform_chip_box(gt_bbox, boxes_idx, - chip) - x1, y1, x2, y2 = chip - chip_h = y2 - y1 - chip_w = x2 - x1 - rec = { - 'im_file': im_file, - 'im_id': np.array([self._global_chip_id]), - 'h': chip_h, - 'w': chip_w, - 'gt_bbox': chip_bbox, - 'is_crowd': is_crowd[final_boxes_idx].copy(), - 'gt_class': gt_class[final_boxes_idx].copy(), - # 'gt_poly': [None] * len(final_boxes_idx), - 'chip': chip - } - self._global_chip_id += 1 - chip_records.append(rec) - return chip_records - - def _sample_neg_chips(self): - pos_num = len(self._cur_im_pos_chips) - neg_num = len(self._cur_im_neg_chips) - sample_num = min(pos_num + 2, self.max_neg_num_per_im) - assert sample_num >= 1 - if neg_num <= sample_num: - return self._cur_im_neg_chips - - candidate_num = int(sample_num * 1.5) - candidate_neg_chips = sorted( - self._cur_im_neg_chips, key=lambda x: -x[1])[:candidate_num] - random.shuffle(candidate_neg_chips) - sampled_neg_chips = candidate_neg_chips[:sample_num] - return sampled_neg_chips - - def _trans_neg_chips2annotations(self, - im_file: str, - sampled_neg_chips: List[Tuple]): - chip_records = [] - for chip, neg_box_num in sampled_neg_chips: - x1, y1, x2, y2 = chip - chip_h = y2 - y1 - chip_w = x2 - x1 - rec = { - 'im_file': im_file, - 'im_id': np.array([self._global_chip_id]), - 'h': chip_h, - 'w': chip_w, - 'gt_bbox': np.zeros( - (0, 4), dtype=np.float32), - 'is_crowd': np.zeros( - (0, 1), dtype=np.int32), - 'gt_class': np.zeros( - (0, 1), dtype=np.int32), - # 'gt_poly': [], - 'chip': chip - } - self._global_chip_id += 1 - chip_records.append(rec) - return chip_records - - def _get_current_scale_parameters(self, scale_i, r): - im_size = max(r['h'], r['w']) - im_target_size = self.target_sizes[scale_i] - self._cur_im_size, self._cur_im_target_size = im_size, im_target_size - self._cur_scale = self._get_current_scale(im_target_size, im_size) - self._cur_valid_ratio_range = 
self.valid_box_ratio_ranges[scale_i] - - def _get_current_scale(self, im_target_size, im_size): - return im_target_size / im_size - - def _create_chips(self, h: int, w: int, scale: float): - """ - Generate chips by chip_target_size and chip_target_stride. - These two parameters just like kernel_size and stride in cnn. - :return: chips, Cx4, xy in raw size dimension - """ - chip_size = self.chip_target_size # omit target for simplicity - stride = self.chip_target_stride - width = int(scale * w) - height = int(scale * h) - min_chip_location_diff = 20 # in target size - - assert chip_size >= stride - chip_overlap = chip_size - stride - if (width - chip_overlap - ) % stride > min_chip_location_diff: # 不能被stride整除的部分比较大,则保留 - w_steps = max(1, int(math.ceil((width - chip_overlap) / stride))) - else: # 不能被stride整除的部分比较小,则丢弃 - w_steps = max(1, int(math.floor((width - chip_overlap) / stride))) - if (height - chip_overlap) % stride > min_chip_location_diff: - h_steps = max(1, int(math.ceil((height - chip_overlap) / stride))) - else: - h_steps = max(1, int(math.floor((height - chip_overlap) / stride))) - - chips = list() - for j in range(h_steps): - for i in range(w_steps): - x1 = i * stride - y1 = j * stride - x2 = min(x1 + chip_size, width) - y2 = min(y1 + chip_size, height) - chips.append([x1, y1, x2, y2]) - - # check chip size - for item in chips: - if item[2] - item[0] > chip_size * 1.1 or item[3] - item[ - 1] > chip_size * 1.1: - raise ValueError(item) - chips = np.array(chips, dtype=np.float32) - - raw_size_chips = chips / scale - return raw_size_chips - - def _get_valid_boxes_and_pos_chips(self, gt_bbox, chips): - valid_ratio_range = self._cur_valid_ratio_range - im_size = self._cur_im_size - scale = self._cur_scale - # Nx4 N - valid_boxes, valid_boxes_idx = self._validate_boxes( - valid_ratio_range, im_size, gt_bbox, scale) - # dict: chipid->[box_id, ...] - pos_chip2boxes_idx = self._find_pos_chips(chips, valid_boxes, - valid_boxes_idx) - return pos_chip2boxes_idx - - def _validate_boxes(self, - valid_ratio_range: List[float], - im_size: int, - gt_boxes: 'np.array of Nx4', - scale: float): - """ - :return: valid_boxes: Nx4, valid_boxes_idx: N - """ - ws = (gt_boxes[:, 2] - gt_boxes[:, 0]).astype(np.int32) - hs = (gt_boxes[:, 3] - gt_boxes[:, 1]).astype(np.int32) - maxs = np.maximum(ws, hs) - box_ratio = maxs / im_size - mins = np.minimum(ws, hs) - target_mins = mins * scale - - low = valid_ratio_range[0] if valid_ratio_range[0] > 0 else 0 - high = valid_ratio_range[1] if valid_ratio_range[1] > 0 else np.finfo( - np.float32).max - - valid_boxes_idx = np.nonzero((low <= box_ratio) & (box_ratio < high) & ( - target_mins >= 2))[0] - valid_boxes = gt_boxes[valid_boxes_idx] - return valid_boxes, valid_boxes_idx - - def _find_pos_chips(self, - chips: 'Cx4', - valid_boxes: 'Bx4', - valid_boxes_idx: 'B'): - """ - :return: pos_chip2boxes_idx, dict: chipid->[box_id, ...] - """ - iob = intersection_over_box(chips, valid_boxes) # overlap, CxB - - iob_threshold_to_find_chips = 1. 
- pos_chip_ids, _ = self._find_chips_to_cover_overlaped_boxes( - iob, iob_threshold_to_find_chips) - pos_chip_ids = set(pos_chip_ids) - - iob_threshold_to_assign_box = 0.5 - pos_chip2boxes_idx = self._assign_boxes_to_pos_chips( - iob, iob_threshold_to_assign_box, pos_chip_ids, valid_boxes_idx) - return pos_chip2boxes_idx - - def _find_chips_to_cover_overlaped_boxes(self, iob, overlap_threshold): - return find_chips_to_cover_overlaped_boxes(iob, overlap_threshold) - - def _assign_boxes_to_pos_chips(self, iob, overlap_threshold, pos_chip_ids, - valid_boxes_idx): - chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) - pos_chip2boxes_idx = defaultdict(list) - for chip_id, box_id in zip(chip_ids, box_ids): - if chip_id not in pos_chip_ids: - continue - raw_gt_box_idx = valid_boxes_idx[box_id] - pos_chip2boxes_idx[chip_id].append(raw_gt_box_idx) - return pos_chip2boxes_idx - - def _get_neg_boxes_and_chips(self, - chips: 'Cx4', - pos_chip_ids: 'D', - proposals: 'Px4'): - """ - :param chips: - :param pos_chip_ids: - :param proposals: - :return: neg_chip2box_num, None or dict: chipid->neg_box_num - """ - if not self.use_neg_chip: - return None - - # train proposals maybe None - if proposals is None or len(proposals) < 1: - return None - - valid_ratio_range = self._cur_valid_ratio_range - im_size = self._cur_im_size - scale = self._cur_scale - - valid_props, _ = self._validate_boxes(valid_ratio_range, im_size, - proposals, scale) - neg_boxes = self._find_neg_boxes(chips, pos_chip_ids, valid_props) - neg_chip2box_num = self._find_neg_chips(chips, pos_chip_ids, neg_boxes) - return neg_chip2box_num - - def _find_neg_boxes(self, - chips: 'Cx4', - pos_chip_ids: 'D', - valid_props: 'Px4'): - """ - :return: neg_boxes: Nx4 - """ - if len(pos_chip_ids) == 0: - return valid_props - - pos_chips = chips[pos_chip_ids] - iob = intersection_over_box(pos_chips, valid_props) - overlap_per_prop = np.max(iob, axis=0) - non_overlap_props_idx = overlap_per_prop < 0.5 - neg_boxes = valid_props[non_overlap_props_idx] - return neg_boxes - - def _find_neg_chips(self, chips: 'Cx4', pos_chip_ids: 'D', - neg_boxes: 'Nx4'): - """ - :return: neg_chip2box_num, dict: chipid->neg_box_num - """ - neg_chip_ids = np.setdiff1d(np.arange(len(chips)), pos_chip_ids) - neg_chips = chips[neg_chip_ids] - - iob = intersection_over_box(neg_chips, neg_boxes) - iob_threshold_to_find_chips = 0.7 - chosen_neg_chip_ids, chip_id2overlap_box_num = \ - self._find_chips_to_cover_overlaped_boxes(iob, iob_threshold_to_find_chips) - - neg_chipid2box_num = {} - for cid in chosen_neg_chip_ids: - box_num = chip_id2overlap_box_num[cid] - raw_chip_id = neg_chip_ids[cid] - neg_chipid2box_num[raw_chip_id] = box_num - return neg_chipid2box_num - - def crop_infer_anno_records(self, records: List[dict]): - """ - transform image record to chips record - :param records: - :return: new_records, list of dict like - { - 'im_file': 'fake_image1.jpg', - 'im_id': np.array([1]), # new _global_chip_id as im_id - 'h': h, # chip height - 'w': w, # chip width - 'chip': [x1, y1, x2, y2] # added - 'ori_im_h': ori_im_h # added, origin image height - 'ori_im_w': ori_im_w # added, origin image width - 'scale_i': 0 # added, - } - """ - self.chip_records = [] - self._global_chip_id = 1 # im_id start from 1 - self._global_chip_id2img_id = {} - - for r in records: - for scale_i in range(self.scale_num): - self._get_current_scale_parameters(scale_i, r) - # Cx4 - chips = self._create_chips(r['h'], r['w'], self._cur_scale) - cur_img_chip_record = self._get_chips_records(r, chips, 
scale_i) - self.chip_records.extend(cur_img_chip_record) - - return self.chip_records - - def _get_chips_records(self, rec, chips, scale_i): - cur_img_chip_records = [] - ori_im_h = rec["h"] - ori_im_w = rec["w"] - im_file = rec["im_file"] - ori_im_id = rec["im_id"] - for id, chip in enumerate(chips): - chip_rec = {} - x1, y1, x2, y2 = chip - chip_h = y2 - y1 - chip_w = x2 - x1 - chip_rec["im_file"] = im_file - chip_rec["im_id"] = self._global_chip_id - chip_rec["h"] = chip_h - chip_rec["w"] = chip_w - chip_rec["chip"] = chip - chip_rec["ori_im_h"] = ori_im_h - chip_rec["ori_im_w"] = ori_im_w - chip_rec["scale_i"] = scale_i - - self._global_chip_id2img_id[self._global_chip_id] = int(ori_im_id) - self._global_chip_id += 1 - cur_img_chip_records.append(chip_rec) - - return cur_img_chip_records - - def aggregate_chips_detections(self, results, records=None): - """ - # 1. transform chip dets to image dets - # 2. nms boxes per image; - # 3. format output results - :param results: - :param roidb: - :return: - """ - results = deepcopy(results) - records = records if records else self.chip_records - img_id2bbox = self._transform_chip2image_bboxes(results, records) - nms_img_id2bbox = self._nms_dets(img_id2bbox) - aggregate_results = self._reformat_results(nms_img_id2bbox) - return aggregate_results - - def _transform_chip2image_bboxes(self, results, records): - # 1. Transform chip dets to image dets; - # 2. Filter valid range; - # 3. Reformat and Aggregate chip dets to Get scale_cls_dets - img_id2bbox = defaultdict(list) - for result in results: - bbox_locs = result['bbox'] - bbox_nums = result['bbox_num'] - if len(bbox_locs) == 1 and bbox_locs[0][ - 0] == -1: # current batch has no detections - # bbox_locs = array([[-1.]], dtype=float32); bbox_nums = [[1]] - # MultiClassNMS output: If there is no detected boxes for all images, lod will be set to {1} and Out only contains one value which is -1. - continue - im_ids = result['im_id'] # replace with range(len(bbox_nums)) - - last_bbox_num = 0 - for idx, im_id in enumerate(im_ids): - - cur_bbox_len = bbox_nums[idx] - bboxes = bbox_locs[last_bbox_num:last_bbox_num + cur_bbox_len] - last_bbox_num += cur_bbox_len - # box: [num_id, score, xmin, ymin, xmax, ymax] - if len(bboxes) == 0: # current image has no detections - continue - - chip_rec = records[int(im_id) - - 1] # im_id starts from 1, type is np.int64 - image_size = max(chip_rec["ori_im_h"], chip_rec["ori_im_w"]) - - bboxes = transform_chip_boxes2image_boxes( - bboxes, chip_rec["chip"], chip_rec["ori_im_h"], - chip_rec["ori_im_w"]) - - scale_i = chip_rec["scale_i"] - cur_scale = self._get_current_scale(self.target_sizes[scale_i], - image_size) - _, valid_boxes_idx = self._validate_boxes( - self.valid_box_ratio_ranges[scale_i], image_size, - bboxes[:, 2:], cur_scale) - ori_img_id = self._global_chip_id2img_id[int(im_id)] - - img_id2bbox[ori_img_id].append(bboxes[valid_boxes_idx]) - - return img_id2bbox - - def _nms_dets(self, img_id2bbox): - # 1. NMS on each image-class - # 2. 
Limit number of detections to MAX_PER_IMAGE if requested - max_per_img = self.max_per_img - nms_thresh = self.nms_thresh - - for img_id in img_id2bbox: - box = img_id2bbox[ - img_id] # list of np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] - box = np.concatenate(box, axis=0) - nms_dets = nms(box, nms_thresh) - if max_per_img > 0: - if len(nms_dets) > max_per_img: - keep = np.argsort(-nms_dets[:, 1])[:max_per_img] - nms_dets = nms_dets[keep] - - img_id2bbox[img_id] = nms_dets - - return img_id2bbox - - def _reformat_results(self, img_id2bbox): - """reformat results""" - im_ids = img_id2bbox.keys() - results = [] - for img_id in im_ids: # output by original im_id order - if len(img_id2bbox[img_id]) == 0: - bbox = np.array( - [[-1., 0., 0., 0., 0., 0.]]) # edge case: no detections - bbox_num = np.array([0]) - else: - # np.array of shape [N, 6], 6 is [label, score, x1, y1, x2, y2] - bbox = img_id2bbox[img_id] - bbox_num = np.array([len(bbox)]) - res = dict(im_id=np.array([[img_id]]), bbox=bbox, bbox_num=bbox_num) - results.append(res) - return results diff --git a/pdfdet/models/Paddle/ppdet/data/crop_utils/chip_box_utils.py b/pdfdet/models/Paddle/ppdet/data/crop_utils/chip_box_utils.py deleted file mode 100644 index cfa1e39..0000000 --- a/pdfdet/models/Paddle/ppdet/data/crop_utils/chip_box_utils.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np - - -def bbox_area(boxes): - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -def intersection_over_box(chips, boxes): - """ - intersection area over box area - :param chips: C - :param boxes: B - :return: iob, CxB - """ - M = chips.shape[0] - N = boxes.shape[0] - if M * N == 0: - return np.zeros([M, N], dtype='float32') - - box_area = bbox_area(boxes) # B - - inter_x2y2 = np.minimum(np.expand_dims(chips, 1)[:, :, 2:], - boxes[:, 2:]) # CxBX2 - inter_x1y1 = np.maximum(np.expand_dims(chips, 1)[:, :, :2], - boxes[:, :2]) # CxBx2 - inter_wh = inter_x2y2 - inter_x1y1 - inter_wh = np.clip(inter_wh, a_min=0, a_max=None) - inter_area = inter_wh[:, :, 0] * inter_wh[:, :, 1] # CxB - - iob = inter_area / np.expand_dims(box_area, 0) - return iob - - -def clip_boxes(boxes, im_shape): - """ - Clip boxes to image boundaries. 
- :param boxes: [N, 4] - :param im_shape: tuple of 2, [h, w] - :return: [N, 4] - """ - # x1 >= 0 - boxes[:, 0] = np.clip(boxes[:, 0], 0, im_shape[1] - 1) - # y1 >= 0 - boxes[:, 1] = np.clip(boxes[:, 1], 0, im_shape[0] - 1) - # x2 < im_shape[1] - boxes[:, 2] = np.clip(boxes[:, 2], 1, im_shape[1]) - # y2 < im_shape[0] - boxes[:, 3] = np.clip(boxes[:, 3], 1, im_shape[0]) - return boxes - - -def transform_chip_box(gt_bbox: 'Gx4', boxes_idx: 'B', chip: '4'): - boxes_idx = np.array(boxes_idx) - cur_gt_bbox = gt_bbox[boxes_idx].copy() # Bx4 - x1, y1, x2, y2 = chip - cur_gt_bbox[:, 0] -= x1 - cur_gt_bbox[:, 1] -= y1 - cur_gt_bbox[:, 2] -= x1 - cur_gt_bbox[:, 3] -= y1 - h = y2 - y1 - w = x2 - x1 - cur_gt_bbox = clip_boxes(cur_gt_bbox, (h, w)) - ws = (cur_gt_bbox[:, 2] - cur_gt_bbox[:, 0]).astype(np.int32) - hs = (cur_gt_bbox[:, 3] - cur_gt_bbox[:, 1]).astype(np.int32) - valid_idx = (ws >= 2) & (hs >= 2) - return cur_gt_bbox[valid_idx], boxes_idx[valid_idx] - - -def find_chips_to_cover_overlaped_boxes(iob, overlap_threshold): - chip_ids, box_ids = np.nonzero(iob >= overlap_threshold) - chip_id2overlap_box_num = np.bincount(chip_ids) # 1d array - chip_id2overlap_box_num = np.pad( - chip_id2overlap_box_num, (0, len(iob) - len(chip_id2overlap_box_num)), - constant_values=0) - - chosen_chip_ids = [] - while len(box_ids) > 0: - value_counts = np.bincount(chip_ids) # 1d array - max_count_chip_id = np.argmax(value_counts) - assert max_count_chip_id not in chosen_chip_ids - chosen_chip_ids.append(max_count_chip_id) - - box_ids_in_cur_chip = box_ids[chip_ids == max_count_chip_id] - ids_not_in_cur_boxes_mask = np.logical_not( - np.isin(box_ids, box_ids_in_cur_chip)) - chip_ids = chip_ids[ids_not_in_cur_boxes_mask] - box_ids = box_ids[ids_not_in_cur_boxes_mask] - return chosen_chip_ids, chip_id2overlap_box_num - - -def transform_chip_boxes2image_boxes(chip_boxes, chip, img_h, img_w): - chip_boxes = np.array(sorted(chip_boxes, key=lambda item: -item[1])) - xmin, ymin, _, _ = chip - # Transform to origin image loc - chip_boxes[:, 2] += xmin - chip_boxes[:, 4] += xmin - chip_boxes[:, 3] += ymin - chip_boxes[:, 5] += ymin - chip_boxes = clip_boxes(chip_boxes, (img_h, img_w)) - return chip_boxes - - -def nms(dets, thresh): - """Apply classic DPM-style greedy NMS.""" - if dets.shape[0] == 0: - return dets[[], :] - scores = dets[:, 1] - x1 = dets[:, 2] - y1 = dets[:, 3] - x2 = dets[:, 4] - y2 = dets[:, 5] - - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - - # nominal indices - # _i, _j - # sorted indices - # i, j - # temp variables for box i's (the box currently under consideration) - # ix1, iy1, ix2, iy2, iarea - - # variables for computing overlap with box j (lower scoring box) - # xx1, yy1, xx2, yy2 - # w, h - # inter, ovr - - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - ovr = inter / (iarea + areas[j] - inter) - if ovr >= thresh: - suppressed[j] = 1 - keep = np.where(suppressed == 0)[0] - dets = dets[keep, :] - return dets diff --git a/pdfdet/models/Paddle/ppdet/data/culane_utils.py b/pdfdet/models/Paddle/ppdet/data/culane_utils.py deleted file mode 
100644 index ea8c948..0000000 --- a/pdfdet/models/Paddle/ppdet/data/culane_utils.py +++ /dev/null @@ -1,130 +0,0 @@ -import math -import numpy as np -from imgaug.augmentables.lines import LineString -from scipy.interpolate import InterpolatedUnivariateSpline - - -def lane_to_linestrings(lanes): - lines = [] - for lane in lanes: - lines.append(LineString(lane)) - - return lines - - -def linestrings_to_lanes(lines): - lanes = [] - for line in lines: - lanes.append(line.coords) - - return lanes - - -def sample_lane(points, sample_ys, img_w): - # this function expects the points to be sorted - points = np.array(points) - if not np.all(points[1:, 1] < points[:-1, 1]): - raise Exception('Annotaion points have to be sorted') - x, y = points[:, 0], points[:, 1] - - # interpolate points inside domain - assert len(points) > 1 - interp = InterpolatedUnivariateSpline( - y[::-1], x[::-1], k=min(3, len(points) - 1)) - domain_min_y = y.min() - domain_max_y = y.max() - sample_ys_inside_domain = sample_ys[(sample_ys >= domain_min_y) & ( - sample_ys <= domain_max_y)] - assert len(sample_ys_inside_domain) > 0 - interp_xs = interp(sample_ys_inside_domain) - - # extrapolate lane to the bottom of the image with a straight line using the 2 points closest to the bottom - two_closest_points = points[:2] - extrap = np.polyfit( - two_closest_points[:, 1], two_closest_points[:, 0], deg=1) - extrap_ys = sample_ys[sample_ys > domain_max_y] - extrap_xs = np.polyval(extrap, extrap_ys) - all_xs = np.hstack((extrap_xs, interp_xs)) - - # separate between inside and outside points - inside_mask = (all_xs >= 0) & (all_xs < img_w) - xs_inside_image = all_xs[inside_mask] - xs_outside_image = all_xs[~inside_mask] - - return xs_outside_image, xs_inside_image - - -def filter_lane(lane): - assert lane[-1][1] <= lane[0][1] - filtered_lane = [] - used = set() - for p in lane: - if p[1] not in used: - filtered_lane.append(p) - used.add(p[1]) - - return filtered_lane - - -def transform_annotation(img_w, img_h, max_lanes, n_offsets, offsets_ys, - n_strips, strip_size, anno): - old_lanes = anno['lanes'] - - # removing lanes with less than 2 points - old_lanes = filter(lambda x: len(x) > 1, old_lanes) - # sort lane points by Y (bottom to top of the image) - old_lanes = [sorted(lane, key=lambda x: -x[1]) for lane in old_lanes] - # remove points with same Y (keep first occurrence) - old_lanes = [filter_lane(lane) for lane in old_lanes] - # normalize the annotation coordinates - old_lanes = [[[x * img_w / float(img_w), y * img_h / float(img_h)] - for x, y in lane] for lane in old_lanes] - # create tranformed annotations - lanes = np.ones( - (max_lanes, 2 + 1 + 1 + 2 + n_offsets), dtype=np.float32 - ) * -1e5 # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, S+1 coordinates - lanes_endpoints = np.ones((max_lanes, 2)) - # lanes are invalid by default - lanes[:, 0] = 1 - lanes[:, 1] = 0 - for lane_idx, lane in enumerate(old_lanes): - if lane_idx >= max_lanes: - break - - try: - xs_outside_image, xs_inside_image = sample_lane(lane, offsets_ys, - img_w) - except AssertionError: - continue - if len(xs_inside_image) <= 1: - continue - all_xs = np.hstack((xs_outside_image, xs_inside_image)) - lanes[lane_idx, 0] = 0 - lanes[lane_idx, 1] = 1 - lanes[lane_idx, 2] = len(xs_outside_image) / n_strips - lanes[lane_idx, 3] = xs_inside_image[0] - - thetas = [] - for i in range(1, len(xs_inside_image)): - theta = math.atan( - i * strip_size / - (xs_inside_image[i] - xs_inside_image[0] + 1e-5)) / math.pi - theta = theta if theta > 0 else 1 - abs(theta) - 
thetas.append(theta) - - theta_far = sum(thetas) / len(thetas) - - # lanes[lane_idx, - # 4] = (theta_closest + theta_far) / 2 # averaged angle - lanes[lane_idx, 4] = theta_far - lanes[lane_idx, 5] = len(xs_inside_image) - lanes[lane_idx, 6:6 + len(all_xs)] = all_xs - lanes_endpoints[lane_idx, 0] = (len(all_xs) - 1) / n_strips - lanes_endpoints[lane_idx, 1] = xs_inside_image[-1] - - new_anno = { - 'label': lanes, - 'old_anno': anno, - 'lane_endpoints': lanes_endpoints - } - return new_anno diff --git a/pdfdet/models/Paddle/ppdet/data/reader.py b/pdfdet/models/Paddle/ppdet/data/reader.py deleted file mode 100644 index c40f3c3..0000000 --- a/pdfdet/models/Paddle/ppdet/data/reader.py +++ /dev/null @@ -1,615 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import os -import traceback -import six -import sys -if sys.version_info >= (3, 0): - pass -else: - pass -import numpy as np -import paddle -import paddle.nn.functional as F - -from copy import deepcopy - -from paddle.io import DataLoader, DistributedBatchSampler -from .utils import default_collate_fn - -from ppdet.core.workspace import register -from . import transform -from .shm_utils import _get_shared_memory_size_in_M - -from ppdet.utils.logger import setup_logger -logger = setup_logger('reader') - -MAIN_PID = os.getpid() - - -class Compose(object): - def __init__(self, transforms, num_classes=80): - self.transforms = transforms - self.transforms_cls = [] - for t in self.transforms: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - - self.transforms_cls.append(f) - - def __call__(self, data): - for f in self.transforms_cls: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map sample transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - return data - - -class BatchCompose(Compose): - def __init__(self, transforms, num_classes=80, collate_batch=True): - super(BatchCompose, self).__init__(transforms, num_classes) - self.collate_batch = collate_batch - - def __call__(self, data): - for f in self.transforms_cls: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map batch transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - # remove keys which is not needed by model - extra_key = ['h', 'w', 'flipped'] - for k in extra_key: - for sample in data: - if k in sample: - sample.pop(k) - - # batch data, if user-define batch function needed - # use user-defined here - if self.collate_batch: - batch_data = default_collate_fn(data) - else: - batch_data = {} - for k in data[0].keys(): - tmp_data = [] - for i in range(len(data)): - tmp_data.append(data[i][k]) - if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: - tmp_data = np.stack(tmp_data, axis=0) - 
batch_data[k] = tmp_data - return batch_data - - -class BaseDataLoader(object): - """ - Base DataLoader implementation for detection models - - Args: - sample_transforms (list): a list of transforms to perform - on each sample - batch_transforms (list): a list of transforms to perform - on batch - batch_size (int): batch size for batch collating, default 1. - shuffle (bool): whether to shuffle samples - drop_last (bool): whether to drop the last incomplete, - default False - num_classes (int): class number of dataset, default 80 - collate_batch (bool): whether to collate batch in dataloader. - If set to True, the samples will collate into batch according - to the batch size. Otherwise, the ground-truth will not collate, - which is used when the number of ground-truch is different in - samples. - use_shared_memory (bool): whether to use shared memory to - accelerate data loading, enable this only if you - are sure that the shared memory size of your OS - is larger than memory cost of input datas of model. - Note that shared memory will be automatically - disabled if the shared memory of OS is less than - 1G, which is not enough for detection models. - Default False. - """ - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=80, - collate_batch=True, - use_shared_memory=False, - **kwargs): - # sample transform - self._sample_transforms = Compose( - sample_transforms, num_classes=num_classes) - - # batch transfrom - self._batch_transforms = BatchCompose(batch_transforms, num_classes, - collate_batch) - self.batch_size = batch_size - self.shuffle = shuffle - self.drop_last = drop_last - self.use_shared_memory = use_shared_memory - self.kwargs = kwargs - - def __call__(self, - dataset, - worker_num, - batch_sampler=None, - return_list=False): - self.dataset = dataset - self.dataset.check_or_download_dataset() - self.dataset.parse_dataset() - # get data - self.dataset.set_transform(self._sample_transforms) - # set kwargs - self.dataset.set_kwargs(**self.kwargs) - # batch sampler - if batch_sampler is None: - self._batch_sampler = DistributedBatchSampler( - self.dataset, - batch_size=self.batch_size, - shuffle=self.shuffle, - drop_last=self.drop_last) - else: - self._batch_sampler = batch_sampler - - # DataLoader do not start sub-process in Windows and Mac - # system, do not need to use shared memory - use_shared_memory = self.use_shared_memory and \ - sys.platform not in ['win32', 'darwin'] - # check whether shared memory size is bigger than 1G(1024M) - if use_shared_memory: - shm_size = _get_shared_memory_size_in_M() - if shm_size is not None and shm_size < 1024.: - logger.warning("Shared memory size is less than 1G, " - "disable shared_memory in DataLoader") - use_shared_memory = False - - self.dataloader = DataLoader( - dataset=self.dataset, - batch_sampler=self._batch_sampler, - collate_fn=self._batch_transforms, - num_workers=worker_num, - return_list=return_list, - use_shared_memory=use_shared_memory) - self.loader = iter(self.dataloader) - - return self - - def __len__(self): - return len(self._batch_sampler) - - def __iter__(self): - return self - - def __next__(self): - try: - return next(self.loader) - except StopIteration: - self.loader = iter(self.dataloader) - six.reraise(*sys.exc_info()) - - def next(self): - # python2 compatibility - return self.__next__() - - -@register -class TrainReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - 
batch_transforms=[], - batch_size=1, - shuffle=True, - drop_last=True, - num_classes=80, - collate_batch=True, - **kwargs): - super(TrainReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, collate_batch, **kwargs) - - -@register -class EvalReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=80, - **kwargs): - super(EvalReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -@register -class TestReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=80, - **kwargs): - super(TestReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -@register -class EvalMOTReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=1, - **kwargs): - super(EvalMOTReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -@register -class TestMOTReader(BaseDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - batch_transforms=[], - batch_size=1, - shuffle=False, - drop_last=False, - num_classes=1, - **kwargs): - super(TestMOTReader, self).__init__(sample_transforms, batch_transforms, - batch_size, shuffle, drop_last, - num_classes, **kwargs) - - -# For Semi-Supervised Object Detection (SSOD) -class Compose_SSOD(object): - def __init__(self, base_transforms, weak_aug, strong_aug, num_classes=80): - self.base_transforms = base_transforms - self.base_transforms_cls = [] - for t in self.base_transforms: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - self.base_transforms_cls.append(f) - - self.weak_augs = weak_aug - self.weak_augs_cls = [] - for t in self.weak_augs: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - self.weak_augs_cls.append(f) - - self.strong_augs = strong_aug - self.strong_augs_cls = [] - for t in self.strong_augs: - for k, v in t.items(): - op_cls = getattr(transform, k) - f = op_cls(**v) - if hasattr(f, 'num_classes'): - f.num_classes = num_classes - self.strong_augs_cls.append(f) - - def __call__(self, data): - for f in self.base_transforms_cls: - try: - data = f(data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map sample transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - weak_data = deepcopy(data) - strong_data = deepcopy(data) - for f in self.weak_augs_cls: - try: - weak_data = f(weak_data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map weak aug [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - for f in self.strong_augs_cls: - try: - strong_data = f(strong_data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map strong aug [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e 
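The three compose stages above (base_transforms, weak_augs, strong_augs) implement the usual teacher-student split: one shared base pipeline, then weak and strong branches applied to deep copies of the same sample, with the strong view attached to the weak one just below. A toy sketch of that control flow, using stand-in transforms rather than ppdet ops:

    from copy import deepcopy

    def base(sample):    # shared decode/resize transforms
        sample["decoded"] = True
        return sample

    def weak(sample):    # e.g. random flip only
        sample["aug"] = "weak"
        return sample

    def strong(sample):  # e.g. color jitter + cutout
        sample["aug"] = "strong"
        return sample

    def compose_ssod(sample):
        sample = base(sample)
        weak_data, strong_data = deepcopy(sample), deepcopy(sample)
        weak_data, strong_data = weak(weak_data), strong(strong_data)
        weak_data["strong_aug"] = strong_data  # strong view rides along
        return weak_data

    out = compose_ssod({"image": "..."})
    assert out["aug"] == "weak" and out["strong_aug"]["aug"] == "strong"

BatchCompose_SSOD later pops 'strong_aug' back out so the two views can be collated into separate batches.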
- - weak_data['strong_aug'] = strong_data - return weak_data - - -class BatchCompose_SSOD(Compose): - def __init__(self, transforms, num_classes=80, collate_batch=True): - super(BatchCompose_SSOD, self).__init__(transforms, num_classes) - self.collate_batch = collate_batch - - def __call__(self, data): - # split strong_data from data(weak_data) - strong_data = [] - for sample in data: - strong_data.append(sample['strong_aug']) - sample.pop('strong_aug') - - for f in self.transforms_cls: - try: - data = f(data) - if 'BatchRandomResizeForSSOD' in f._id: - strong_data = f(strong_data, data[1])[0] - data = data[0] - else: - strong_data = f(strong_data) - except Exception as e: - stack_info = traceback.format_exc() - logger.warning("fail to map batch transform [{}] " - "with error: {} and stack:\n{}".format( - f, e, str(stack_info))) - raise e - - # remove keys which is not needed by model - extra_key = ['h', 'w', 'flipped'] - for k in extra_key: - for sample in data: - if k in sample: - sample.pop(k) - for sample in strong_data: - if k in sample: - sample.pop(k) - - # batch data, if user-define batch function needed - # use user-defined here - if self.collate_batch: - batch_data = default_collate_fn(data) - strong_batch_data = default_collate_fn(strong_data) - return batch_data, strong_batch_data - else: - batch_data = {} - for k in data[0].keys(): - tmp_data = [] - for i in range(len(data)): - tmp_data.append(data[i][k]) - if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: - tmp_data = np.stack(tmp_data, axis=0) - batch_data[k] = tmp_data - - strong_batch_data = {} - for k in strong_data[0].keys(): - tmp_data = [] - for i in range(len(strong_data)): - tmp_data.append(strong_data[i][k]) - if not 'gt_' in k and not 'is_crowd' in k and not 'difficult' in k: - tmp_data = np.stack(tmp_data, axis=0) - strong_batch_data[k] = tmp_data - - return batch_data, strong_batch_data - - -class CombineSSODLoader(object): - def __init__(self, label_loader, unlabel_loader): - self.label_loader = label_loader - self.unlabel_loader = unlabel_loader - - def __iter__(self): - while True: - try: - label_samples = next(self.label_loader_iter) - except: - self.label_loader_iter = iter(self.label_loader) - label_samples = next(self.label_loader_iter) - - try: - unlabel_samples = next(self.unlabel_loader_iter) - except: - self.unlabel_loader_iter = iter(self.unlabel_loader) - unlabel_samples = next(self.unlabel_loader_iter) - - yield ( - label_samples[0], # sup weak - label_samples[1], # sup strong - unlabel_samples[0], # unsup weak - unlabel_samples[1] # unsup strong - ) - - def __call__(self): - return self.__iter__() - - -class BaseSemiDataLoader(object): - def __init__(self, - sample_transforms=[], - weak_aug=[], - strong_aug=[], - sup_batch_transforms=[], - unsup_batch_transforms=[], - sup_batch_size=1, - unsup_batch_size=1, - shuffle=True, - drop_last=True, - num_classes=80, - collate_batch=True, - use_shared_memory=False, - **kwargs): - # sup transforms - self._sample_transforms_label = Compose_SSOD( - sample_transforms, weak_aug, strong_aug, num_classes=num_classes) - self._batch_transforms_label = BatchCompose_SSOD( - sup_batch_transforms, num_classes, collate_batch) - self.batch_size_label = sup_batch_size - - # unsup transforms - self._sample_transforms_unlabel = Compose_SSOD( - sample_transforms, weak_aug, strong_aug, num_classes=num_classes) - self._batch_transforms_unlabel = BatchCompose_SSOD( - unsup_batch_transforms, num_classes, collate_batch) - self.batch_size_unlabel = 
unsup_batch_size - - # common - self.shuffle = shuffle - self.drop_last = drop_last - self.use_shared_memory = use_shared_memory - self.kwargs = kwargs - - def __call__(self, - dataset_label, - dataset_unlabel, - worker_num, - batch_sampler_label=None, - batch_sampler_unlabel=None, - return_list=False): - # sup dataset - self.dataset_label = dataset_label - self.dataset_label.check_or_download_dataset() - self.dataset_label.parse_dataset() - self.dataset_label.set_transform(self._sample_transforms_label) - self.dataset_label.set_kwargs(**self.kwargs) - if batch_sampler_label is None: - self._batch_sampler_label = DistributedBatchSampler( - self.dataset_label, - batch_size=self.batch_size_label, - shuffle=self.shuffle, - drop_last=self.drop_last) - else: - self._batch_sampler_label = batch_sampler_label - - # unsup dataset - self.dataset_unlabel = dataset_unlabel - self.dataset_unlabel.length = self.dataset_label.__len__() - self.dataset_unlabel.check_or_download_dataset() - self.dataset_unlabel.parse_dataset() - self.dataset_unlabel.set_transform(self._sample_transforms_unlabel) - self.dataset_unlabel.set_kwargs(**self.kwargs) - if batch_sampler_unlabel is None: - self._batch_sampler_unlabel = DistributedBatchSampler( - self.dataset_unlabel, - batch_size=self.batch_size_unlabel, - shuffle=self.shuffle, - drop_last=self.drop_last) - else: - self._batch_sampler_unlabel = batch_sampler_unlabel - - # DataLoader do not start sub-process in Windows and Mac - # system, do not need to use shared memory - use_shared_memory = self.use_shared_memory and \ - sys.platform not in ['win32', 'darwin'] - # check whether shared memory size is bigger than 1G(1024M) - if use_shared_memory: - shm_size = _get_shared_memory_size_in_M() - if shm_size is not None and shm_size < 1024.: - logger.warning("Shared memory size is less than 1G, " - "disable shared_memory in DataLoader") - use_shared_memory = False - - self.dataloader_label = DataLoader( - dataset=self.dataset_label, - batch_sampler=self._batch_sampler_label, - collate_fn=self._batch_transforms_label, - num_workers=worker_num, - return_list=return_list, - use_shared_memory=use_shared_memory) - - self.dataloader_unlabel = DataLoader( - dataset=self.dataset_unlabel, - batch_sampler=self._batch_sampler_unlabel, - collate_fn=self._batch_transforms_unlabel, - num_workers=worker_num, - return_list=return_list, - use_shared_memory=use_shared_memory) - - self.dataloader = CombineSSODLoader(self.dataloader_label, - self.dataloader_unlabel) - self.loader = iter(self.dataloader) - return self - - def __len__(self): - return len(self._batch_sampler_label) - - def __iter__(self): - return self - - def __next__(self): - return next(self.loader) - - def next(self): - # python2 compatibility - return self.__next__() - - -@register -class SemiTrainReader(BaseSemiDataLoader): - __shared__ = ['num_classes'] - - def __init__(self, - sample_transforms=[], - weak_aug=[], - strong_aug=[], - sup_batch_transforms=[], - unsup_batch_transforms=[], - sup_batch_size=1, - unsup_batch_size=1, - shuffle=True, - drop_last=True, - num_classes=80, - collate_batch=True, - **kwargs): - super(SemiTrainReader, self).__init__( - sample_transforms, weak_aug, strong_aug, sup_batch_transforms, - unsup_batch_transforms, sup_batch_size, unsup_batch_size, shuffle, - drop_last, num_classes, collate_batch, **kwargs) diff --git a/pdfdet/models/Paddle/ppdet/data/shm_utils.py b/pdfdet/models/Paddle/ppdet/data/shm_utils.py deleted file mode 100644 index a929a80..0000000 --- 
a/pdfdet/models/Paddle/ppdet/data/shm_utils.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - -SIZE_UNIT = ['K', 'M', 'G', 'T'] -SHM_QUERY_CMD = 'df -h' -SHM_KEY = 'shm' -SHM_DEFAULT_MOUNT = '/dev/shm' - -# [ shared memory size check ] -# In detection models, image/target data occupies a lot of memory, and -# will occupy lots of shared memory in multi-process DataLoader, we use -# following code to get shared memory size and perform a size check to -# disable shared memory use if shared memory size is not enough. -# Shared memory getting process as follows: -# 1. use `df -h` get all mount info -# 2. pick up spaces whose mount info contains 'shm' -# 3. if 'shm' space number is only 1, return its size -# 4. if there are multiple 'shm' space, try to find the default mount -# directory '/dev/shm' is Linux-like system, otherwise return the -# biggest space size. - - -def _parse_size_in_M(size_str): - if size_str[-1] == 'B': - num, unit = size_str[:-2], size_str[-2] - else: - num, unit = size_str[:-1], size_str[-1] - assert unit in SIZE_UNIT, \ - "unknown shm size unit {}".format(unit) - return float(num) * \ - (1024 ** (SIZE_UNIT.index(unit) - 1)) - - -def _get_shared_memory_size_in_M(): - try: - df_infos = os.popen(SHM_QUERY_CMD).readlines() - except: - return None - else: - shm_infos = [] - for df_info in df_infos: - info = df_info.strip() - if info.find(SHM_KEY) >= 0: - shm_infos.append(info.split()) - - if len(shm_infos) == 0: - return None - elif len(shm_infos) == 1: - return _parse_size_in_M(shm_infos[0][3]) - else: - default_mount_infos = [ - si for si in shm_infos if si[-1] == SHM_DEFAULT_MOUNT - ] - if default_mount_infos: - return _parse_size_in_M(default_mount_infos[0][3]) - else: - return max([_parse_size_in_M(si[3]) for si in shm_infos]) diff --git a/pdfdet/models/Paddle/ppdet/data/source/__init__.py b/pdfdet/models/Paddle/ppdet/data/source/__init__.py deleted file mode 100644 index 2821ff5..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/__init__.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import coco -# from . import voc -# from . import widerface -# from . import category -# from . import keypoint_coco -# from . import mot -# from . import sniper_coco -# from . 
import culane - -from .coco import * -# from .voc import * -# from .widerface import * -# from .category import * -# from .keypoint_coco import * -# from .mot import * -# from .sniper_coco import SniperCOCODataSet -# from .dataset import ImageFolder -# from .pose3d_cmb import * -# from .culane import * diff --git a/pdfdet/models/Paddle/ppdet/data/source/category.py b/pdfdet/models/Paddle/ppdet/data/source/category.py deleted file mode 100644 index 8ed1f9e..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/category.py +++ /dev/null @@ -1,942 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os - -from ppdet.data.source.voc import pascalvoc_label -from ppdet.data.source.widerface import widerface_label -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['get_categories'] - - -def get_categories(metric_type, anno_file=None, arch=None): - """ - Get class id to category id map and category id - to category name map from annotation file. - - Args: - metric_type (str): metric type, currently support 'coco', 'voc', 'oid' - and 'widerface'. - anno_file (str): annotation file path - """ - if arch == 'keypoint_arch': - return (None, {'id': 'keypoint'}) - - if anno_file == None or (not os.path.isfile(anno_file)): - logger.warning( - "anno_file '{}' is None or not set or not exist, " - "please recheck TrainDataset/EvalDataset/TestDataset.anno_path, " - "otherwise the default categories will be used by metric_type.". - format(anno_file)) - - if metric_type.lower() == 'coco' or metric_type.lower( - ) == 'rbox' or metric_type.lower() == 'snipercoco': - if anno_file and os.path.isfile(anno_file): - if anno_file.endswith('json'): - # lazy import pycocotools here - from pycocotools.coco import COCO - coco = COCO(anno_file) - cats = coco.loadCats(coco.getCatIds()) - - clsid2catid = {i: cat['id'] for i, cat in enumerate(cats)} - catid2name = {cat['id']: cat['name'] for cat in cats} - - elif anno_file.endswith('txt'): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - if cats[0] == 'background': cats = cats[1:] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - else: - raise ValueError("anno_file {} should be json or txt.".format( - anno_file)) - return clsid2catid, catid2name - - # anno file not exist, load default categories of COCO17 - else: - if metric_type.lower() == 'rbox': - logger.warning( - "metric_type: {}, load default categories of DOTA.".format( - metric_type)) - return _dota_category() - logger.warning("metric_type: {}, load default categories of COCO.". 
- format(metric_type)) - return _coco17_category() - - elif metric_type.lower() == 'voc': - if anno_file and os.path.isfile(anno_file): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - - if cats[0] == 'background': - cats = cats[1:] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - # anno file not exist, load default categories of - # VOC all 20 categories - else: - logger.warning("metric_type: {}, load default categories of VOC.". - format(metric_type)) - return _vocall_category() - - elif metric_type.lower() == 'oid': - if anno_file and os.path.isfile(anno_file): - logger.warning("only default categories support for OID19") - return _oid19_category() - - elif metric_type.lower() == 'widerface': - return _widerface_category() - - elif metric_type.lower() in [ - 'keypointtopdowncocoeval', 'keypointtopdownmpiieval', - 'keypointtopdowncocowholebadyhandeval' - ]: - return (None, {'id': 'keypoint'}) - - elif metric_type.lower() == 'pose3deval': - return (None, {'id': 'pose3d'}) - - elif metric_type.lower() in ['mot', 'motdet', 'reid']: - if anno_file and os.path.isfile(anno_file): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - if cats[0] == 'background': - cats = cats[1:] - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - return clsid2catid, catid2name - # anno file not exist, load default category 'pedestrian'. - else: - logger.warning( - "metric_type: {}, load default categories of pedestrian MOT.". - format(metric_type)) - return _mot_category(category='pedestrian') - - elif metric_type.lower() in ['kitti', 'bdd100kmot']: - return _mot_category(category='vehicle') - - elif metric_type.lower() in ['mcmot']: - if anno_file and os.path.isfile(anno_file): - cats = [] - with open(anno_file) as f: - for line in f.readlines(): - cats.append(line.strip()) - if cats[0] == 'background': - cats = cats[1:] - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - return clsid2catid, catid2name - # anno file not exist, load default categories of visdrone all 10 categories - else: - logger.warning( - "metric_type: {}, load default categories of VisDrone.".format( - metric_type)) - return _visdrone_category() - - else: - raise ValueError("unknown metric type {}".format(metric_type)) - - -def _mot_category(category='pedestrian'): - """ - Get class id to category id map and category id - to category name map of mot dataset - """ - label_map = {category: 0} - label_map = sorted(label_map.items(), key=lambda x: x[1]) - cats = [l[0] for l in label_map] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - -def _coco17_category(): - """ - Get class id to category id map and category id - to category name map of COCO2017 dataset - - """ - clsid2catid = { - 1: 1, - 2: 2, - 3: 3, - 4: 4, - 5: 5, - 6: 6, - 7: 7, - 8: 8, - 9: 9, - 10: 10, - 11: 11, - 12: 13, - 13: 14, - 14: 15, - 15: 16, - 16: 17, - 17: 18, - 18: 19, - 19: 20, - 20: 21, - 21: 22, - 22: 23, - 23: 24, - 24: 25, - 25: 27, - 26: 28, - 27: 31, - 28: 32, - 29: 33, - 30: 34, - 31: 35, - 32: 36, - 33: 37, - 34: 38, - 35: 39, - 36: 40, - 37: 41, - 38: 42, - 39: 43, - 40: 44, - 41: 46, - 42: 47, - 43: 48, - 44: 49, - 45: 50, - 46: 51, - 47: 52, - 48: 53, - 49: 54, - 50: 
55, - 51: 56, - 52: 57, - 53: 58, - 54: 59, - 55: 60, - 56: 61, - 57: 62, - 58: 63, - 59: 64, - 60: 65, - 61: 67, - 62: 70, - 63: 72, - 64: 73, - 65: 74, - 66: 75, - 67: 76, - 68: 77, - 69: 78, - 70: 79, - 71: 80, - 72: 81, - 73: 82, - 74: 84, - 75: 85, - 76: 86, - 77: 87, - 78: 88, - 79: 89, - 80: 90 - } - - catid2name = { - 0: 'background', - 1: 'person', - 2: 'bicycle', - 3: 'car', - 4: 'motorcycle', - 5: 'airplane', - 6: 'bus', - 7: 'train', - 8: 'truck', - 9: 'boat', - 10: 'traffic light', - 11: 'fire hydrant', - 13: 'stop sign', - 14: 'parking meter', - 15: 'bench', - 16: 'bird', - 17: 'cat', - 18: 'dog', - 19: 'horse', - 20: 'sheep', - 21: 'cow', - 22: 'elephant', - 23: 'bear', - 24: 'zebra', - 25: 'giraffe', - 27: 'backpack', - 28: 'umbrella', - 31: 'handbag', - 32: 'tie', - 33: 'suitcase', - 34: 'frisbee', - 35: 'skis', - 36: 'snowboard', - 37: 'sports ball', - 38: 'kite', - 39: 'baseball bat', - 40: 'baseball glove', - 41: 'skateboard', - 42: 'surfboard', - 43: 'tennis racket', - 44: 'bottle', - 46: 'wine glass', - 47: 'cup', - 48: 'fork', - 49: 'knife', - 50: 'spoon', - 51: 'bowl', - 52: 'banana', - 53: 'apple', - 54: 'sandwich', - 55: 'orange', - 56: 'broccoli', - 57: 'carrot', - 58: 'hot dog', - 59: 'pizza', - 60: 'donut', - 61: 'cake', - 62: 'chair', - 63: 'couch', - 64: 'potted plant', - 65: 'bed', - 67: 'dining table', - 70: 'toilet', - 72: 'tv', - 73: 'laptop', - 74: 'mouse', - 75: 'remote', - 76: 'keyboard', - 77: 'cell phone', - 78: 'microwave', - 79: 'oven', - 80: 'toaster', - 81: 'sink', - 82: 'refrigerator', - 84: 'book', - 85: 'clock', - 86: 'vase', - 87: 'scissors', - 88: 'teddy bear', - 89: 'hair drier', - 90: 'toothbrush' - } - - clsid2catid = {k - 1: v for k, v in clsid2catid.items()} - catid2name.pop(0) - - return clsid2catid, catid2name - - -def _dota_category(): - """ - Get class id to category id map and category id - to category name map of dota dataset - """ - catid2name = { - 0: 'background', - 1: 'plane', - 2: 'baseball-diamond', - 3: 'bridge', - 4: 'ground-track-field', - 5: 'small-vehicle', - 6: 'large-vehicle', - 7: 'ship', - 8: 'tennis-court', - 9: 'basketball-court', - 10: 'storage-tank', - 11: 'soccer-ball-field', - 12: 'roundabout', - 13: 'harbor', - 14: 'swimming-pool', - 15: 'helicopter' - } - catid2name.pop(0) - clsid2catid = {i: i + 1 for i in range(len(catid2name))} - return clsid2catid, catid2name - - -def _vocall_category(): - """ - Get class id to category id map and category id - to category name map of mixup voc dataset - - """ - label_map = pascalvoc_label() - label_map = sorted(label_map.items(), key=lambda x: x[1]) - cats = [l[0] for l in label_map] - - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - -def _widerface_category(): - label_map = widerface_label() - label_map = sorted(label_map.items(), key=lambda x: x[1]) - cats = [l[0] for l in label_map] - clsid2catid = {i: i for i in range(len(cats))} - catid2name = {i: name for i, name in enumerate(cats)} - - return clsid2catid, catid2name - - -def _oid19_category(): - clsid2catid = {k: k + 1 for k in range(500)} - - catid2name = { - 0: "background", - 1: "Infant bed", - 2: "Rose", - 3: "Flag", - 4: "Flashlight", - 5: "Sea turtle", - 6: "Camera", - 7: "Animal", - 8: "Glove", - 9: "Crocodile", - 10: "Cattle", - 11: "House", - 12: "Guacamole", - 13: "Penguin", - 14: "Vehicle registration plate", - 15: "Bench", - 16: "Ladybug", - 17: "Human nose", - 18: "Watermelon", - 19: "Flute", - 20: 
"Butterfly", - 21: "Washing machine", - 22: "Raccoon", - 23: "Segway", - 24: "Taco", - 25: "Jellyfish", - 26: "Cake", - 27: "Pen", - 28: "Cannon", - 29: "Bread", - 30: "Tree", - 31: "Shellfish", - 32: "Bed", - 33: "Hamster", - 34: "Hat", - 35: "Toaster", - 36: "Sombrero", - 37: "Tiara", - 38: "Bowl", - 39: "Dragonfly", - 40: "Moths and butterflies", - 41: "Antelope", - 42: "Vegetable", - 43: "Torch", - 44: "Building", - 45: "Power plugs and sockets", - 46: "Blender", - 47: "Billiard table", - 48: "Cutting board", - 49: "Bronze sculpture", - 50: "Turtle", - 51: "Broccoli", - 52: "Tiger", - 53: "Mirror", - 54: "Bear", - 55: "Zucchini", - 56: "Dress", - 57: "Volleyball", - 58: "Guitar", - 59: "Reptile", - 60: "Golf cart", - 61: "Tart", - 62: "Fedora", - 63: "Carnivore", - 64: "Car", - 65: "Lighthouse", - 66: "Coffeemaker", - 67: "Food processor", - 68: "Truck", - 69: "Bookcase", - 70: "Surfboard", - 71: "Footwear", - 72: "Bench", - 73: "Necklace", - 74: "Flower", - 75: "Radish", - 76: "Marine mammal", - 77: "Frying pan", - 78: "Tap", - 79: "Peach", - 80: "Knife", - 81: "Handbag", - 82: "Laptop", - 83: "Tent", - 84: "Ambulance", - 85: "Christmas tree", - 86: "Eagle", - 87: "Limousine", - 88: "Kitchen & dining room table", - 89: "Polar bear", - 90: "Tower", - 91: "Football", - 92: "Willow", - 93: "Human head", - 94: "Stop sign", - 95: "Banana", - 96: "Mixer", - 97: "Binoculars", - 98: "Dessert", - 99: "Bee", - 100: "Chair", - 101: "Wood-burning stove", - 102: "Flowerpot", - 103: "Beaker", - 104: "Oyster", - 105: "Woodpecker", - 106: "Harp", - 107: "Bathtub", - 108: "Wall clock", - 109: "Sports uniform", - 110: "Rhinoceros", - 111: "Beehive", - 112: "Cupboard", - 113: "Chicken", - 114: "Man", - 115: "Blue jay", - 116: "Cucumber", - 117: "Balloon", - 118: "Kite", - 119: "Fireplace", - 120: "Lantern", - 121: "Missile", - 122: "Book", - 123: "Spoon", - 124: "Grapefruit", - 125: "Squirrel", - 126: "Orange", - 127: "Coat", - 128: "Punching bag", - 129: "Zebra", - 130: "Billboard", - 131: "Bicycle", - 132: "Door handle", - 133: "Mechanical fan", - 134: "Ring binder", - 135: "Table", - 136: "Parrot", - 137: "Sock", - 138: "Vase", - 139: "Weapon", - 140: "Shotgun", - 141: "Glasses", - 142: "Seahorse", - 143: "Belt", - 144: "Watercraft", - 145: "Window", - 146: "Giraffe", - 147: "Lion", - 148: "Tire", - 149: "Vehicle", - 150: "Canoe", - 151: "Tie", - 152: "Shelf", - 153: "Picture frame", - 154: "Printer", - 155: "Human leg", - 156: "Boat", - 157: "Slow cooker", - 158: "Croissant", - 159: "Candle", - 160: "Pancake", - 161: "Pillow", - 162: "Coin", - 163: "Stretcher", - 164: "Sandal", - 165: "Woman", - 166: "Stairs", - 167: "Harpsichord", - 168: "Stool", - 169: "Bus", - 170: "Suitcase", - 171: "Human mouth", - 172: "Juice", - 173: "Skull", - 174: "Door", - 175: "Violin", - 176: "Chopsticks", - 177: "Digital clock", - 178: "Sunflower", - 179: "Leopard", - 180: "Bell pepper", - 181: "Harbor seal", - 182: "Snake", - 183: "Sewing machine", - 184: "Goose", - 185: "Helicopter", - 186: "Seat belt", - 187: "Coffee cup", - 188: "Microwave oven", - 189: "Hot dog", - 190: "Countertop", - 191: "Serving tray", - 192: "Dog bed", - 193: "Beer", - 194: "Sunglasses", - 195: "Golf ball", - 196: "Waffle", - 197: "Palm tree", - 198: "Trumpet", - 199: "Ruler", - 200: "Helmet", - 201: "Ladder", - 202: "Office building", - 203: "Tablet computer", - 204: "Toilet paper", - 205: "Pomegranate", - 206: "Skirt", - 207: "Gas stove", - 208: "Cookie", - 209: "Cart", - 210: "Raven", - 211: "Egg", - 212: "Burrito", - 213: "Goat", - 214: 
"Kitchen knife", - 215: "Skateboard", - 216: "Salt and pepper shakers", - 217: "Lynx", - 218: "Boot", - 219: "Platter", - 220: "Ski", - 221: "Swimwear", - 222: "Swimming pool", - 223: "Drinking straw", - 224: "Wrench", - 225: "Drum", - 226: "Ant", - 227: "Human ear", - 228: "Headphones", - 229: "Fountain", - 230: "Bird", - 231: "Jeans", - 232: "Television", - 233: "Crab", - 234: "Microphone", - 235: "Home appliance", - 236: "Snowplow", - 237: "Beetle", - 238: "Artichoke", - 239: "Jet ski", - 240: "Stationary bicycle", - 241: "Human hair", - 242: "Brown bear", - 243: "Starfish", - 244: "Fork", - 245: "Lobster", - 246: "Corded phone", - 247: "Drink", - 248: "Saucer", - 249: "Carrot", - 250: "Insect", - 251: "Clock", - 252: "Castle", - 253: "Tennis racket", - 254: "Ceiling fan", - 255: "Asparagus", - 256: "Jaguar", - 257: "Musical instrument", - 258: "Train", - 259: "Cat", - 260: "Rifle", - 261: "Dumbbell", - 262: "Mobile phone", - 263: "Taxi", - 264: "Shower", - 265: "Pitcher", - 266: "Lemon", - 267: "Invertebrate", - 268: "Turkey", - 269: "High heels", - 270: "Bust", - 271: "Elephant", - 272: "Scarf", - 273: "Barrel", - 274: "Trombone", - 275: "Pumpkin", - 276: "Box", - 277: "Tomato", - 278: "Frog", - 279: "Bidet", - 280: "Human face", - 281: "Houseplant", - 282: "Van", - 283: "Shark", - 284: "Ice cream", - 285: "Swim cap", - 286: "Falcon", - 287: "Ostrich", - 288: "Handgun", - 289: "Whiteboard", - 290: "Lizard", - 291: "Pasta", - 292: "Snowmobile", - 293: "Light bulb", - 294: "Window blind", - 295: "Muffin", - 296: "Pretzel", - 297: "Computer monitor", - 298: "Horn", - 299: "Furniture", - 300: "Sandwich", - 301: "Fox", - 302: "Convenience store", - 303: "Fish", - 304: "Fruit", - 305: "Earrings", - 306: "Curtain", - 307: "Grape", - 308: "Sofa bed", - 309: "Horse", - 310: "Luggage and bags", - 311: "Desk", - 312: "Crutch", - 313: "Bicycle helmet", - 314: "Tick", - 315: "Airplane", - 316: "Canary", - 317: "Spatula", - 318: "Watch", - 319: "Lily", - 320: "Kitchen appliance", - 321: "Filing cabinet", - 322: "Aircraft", - 323: "Cake stand", - 324: "Candy", - 325: "Sink", - 326: "Mouse", - 327: "Wine", - 328: "Wheelchair", - 329: "Goldfish", - 330: "Refrigerator", - 331: "French fries", - 332: "Drawer", - 333: "Treadmill", - 334: "Picnic basket", - 335: "Dice", - 336: "Cabbage", - 337: "Football helmet", - 338: "Pig", - 339: "Person", - 340: "Shorts", - 341: "Gondola", - 342: "Honeycomb", - 343: "Doughnut", - 344: "Chest of drawers", - 345: "Land vehicle", - 346: "Bat", - 347: "Monkey", - 348: "Dagger", - 349: "Tableware", - 350: "Human foot", - 351: "Mug", - 352: "Alarm clock", - 353: "Pressure cooker", - 354: "Human hand", - 355: "Tortoise", - 356: "Baseball glove", - 357: "Sword", - 358: "Pear", - 359: "Miniskirt", - 360: "Traffic sign", - 361: "Girl", - 362: "Roller skates", - 363: "Dinosaur", - 364: "Porch", - 365: "Human beard", - 366: "Submarine sandwich", - 367: "Screwdriver", - 368: "Strawberry", - 369: "Wine glass", - 370: "Seafood", - 371: "Racket", - 372: "Wheel", - 373: "Sea lion", - 374: "Toy", - 375: "Tea", - 376: "Tennis ball", - 377: "Waste container", - 378: "Mule", - 379: "Cricket ball", - 380: "Pineapple", - 381: "Coconut", - 382: "Doll", - 383: "Coffee table", - 384: "Snowman", - 385: "Lavender", - 386: "Shrimp", - 387: "Maple", - 388: "Cowboy hat", - 389: "Goggles", - 390: "Rugby ball", - 391: "Caterpillar", - 392: "Poster", - 393: "Rocket", - 394: "Organ", - 395: "Saxophone", - 396: "Traffic light", - 397: "Cocktail", - 398: "Plastic bag", - 399: "Squash", - 400: 
"Mushroom", - 401: "Hamburger", - 402: "Light switch", - 403: "Parachute", - 404: "Teddy bear", - 405: "Winter melon", - 406: "Deer", - 407: "Musical keyboard", - 408: "Plumbing fixture", - 409: "Scoreboard", - 410: "Baseball bat", - 411: "Envelope", - 412: "Adhesive tape", - 413: "Briefcase", - 414: "Paddle", - 415: "Bow and arrow", - 416: "Telephone", - 417: "Sheep", - 418: "Jacket", - 419: "Boy", - 420: "Pizza", - 421: "Otter", - 422: "Office supplies", - 423: "Couch", - 424: "Cello", - 425: "Bull", - 426: "Camel", - 427: "Ball", - 428: "Duck", - 429: "Whale", - 430: "Shirt", - 431: "Tank", - 432: "Motorcycle", - 433: "Accordion", - 434: "Owl", - 435: "Porcupine", - 436: "Sun hat", - 437: "Nail", - 438: "Scissors", - 439: "Swan", - 440: "Lamp", - 441: "Crown", - 442: "Piano", - 443: "Sculpture", - 444: "Cheetah", - 445: "Oboe", - 446: "Tin can", - 447: "Mango", - 448: "Tripod", - 449: "Oven", - 450: "Mouse", - 451: "Barge", - 452: "Coffee", - 453: "Snowboard", - 454: "Common fig", - 455: "Salad", - 456: "Marine invertebrates", - 457: "Umbrella", - 458: "Kangaroo", - 459: "Human arm", - 460: "Measuring cup", - 461: "Snail", - 462: "Loveseat", - 463: "Suit", - 464: "Teapot", - 465: "Bottle", - 466: "Alpaca", - 467: "Kettle", - 468: "Trousers", - 469: "Popcorn", - 470: "Centipede", - 471: "Spider", - 472: "Sparrow", - 473: "Plate", - 474: "Bagel", - 475: "Personal care", - 476: "Apple", - 477: "Brassiere", - 478: "Bathroom cabinet", - 479: "studio couch", - 480: "Computer keyboard", - 481: "Table tennis racket", - 482: "Sushi", - 483: "Cabinetry", - 484: "Street light", - 485: "Towel", - 486: "Nightstand", - 487: "Rabbit", - 488: "Dolphin", - 489: "Dog", - 490: "Jug", - 491: "Wok", - 492: "Fire hydrant", - 493: "Human eye", - 494: "Skyscraper", - 495: "Backpack", - 496: "Potato", - 497: "Paper towel", - 498: "Lifejacket", - 499: "Bicycle wheel", - 500: "Toilet", - } - - return clsid2catid, catid2name - - -def _visdrone_category(): - clsid2catid = {i: i for i in range(10)} - - catid2name = { - 0: 'pedestrian', - 1: 'people', - 2: 'bicycle', - 3: 'car', - 4: 'van', - 5: 'truck', - 6: 'tricycle', - 7: 'awning-tricycle', - 8: 'bus', - 9: 'motor' - } - return clsid2catid, catid2name diff --git a/pdfdet/models/Paddle/ppdet/data/source/coco.py b/pdfdet/models/Paddle/ppdet/data/source/coco.py deleted file mode 100644 index 4120327..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/coco.py +++ /dev/null @@ -1,596 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import copy -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -import numpy as np -from ppdet.core.workspace import register, serializable -from .dataset import DetDataset - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'COCODataSet', 'SlicedCOCODataSet', 'SemiCOCODataSet', 'COCODetDataset' -] - - -@register -@serializable -class COCODataSet(DetDataset): - """ - Load dataset with COCO format. - - Args: - dataset_dir (str): root directory for dataset. - image_dir (str): directory for images. - anno_path (str): coco annotation file path. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - load_crowd (bool): whether to load crowded ground-truth. - False as default - allow_empty (bool): whether to load empty entry. False as default - empty_ratio (float): the ratio of empty record number to total - record's, if empty_ratio is out of [0. ,1.), do not sample the - records and use all the empty entries. 1. as default - repeat (int): repeat times for dataset, use in benchmark. - """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=False, - empty_ratio=1., - repeat=1): - super(COCODataSet, self).__init__( - dataset_dir, - image_dir, - anno_path, - data_fields, - sample_num, - repeat=repeat) - self.load_image_only = False - self.load_semantic = False - self.load_crowd = load_crowd - self.allow_empty = allow_empty - self.empty_ratio = empty_ratio - - def _sample_empty(self, records, num): - # if empty_ratio is out of [0. ,1.), do not sample the records - if self.empty_ratio < 0. 
or self.empty_ratio >= 1.: - return records - import random - sample_num = min( - int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) - records = random.sample(records, sample_num) - return records - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - assert anno_path.endswith('.json'), \ - 'invalid coco annotation file: ' + anno_path - from pycocotools.coco import COCO - coco = COCO(anno_path) - img_ids = coco.getImgIds() - img_ids.sort() - cat_ids = coco.getCatIds() - records = [] - empty_records = [] - ct = 0 - - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - self.cname2cid = dict({ - coco.loadCats(catid)[0]['name']: clsid - for catid, clsid in self.catid2clsid.items() - }) - - if 'annotations' not in coco.dataset: - self.load_image_only = True - logger.warning('Annotation file: {} does not contains ground truth ' - 'and load image information only.'.format(anno_path)) - - for img_id in img_ids: - img_anno = coco.loadImgs([img_id])[0] - im_fname = img_anno['file_name'] - im_w = float(img_anno['width']) - im_h = float(img_anno['height']) - - im_path = os.path.join(image_dir, - im_fname) if image_dir else im_fname - is_empty = False - if not os.path.exists(im_path): - logger.warning('Illegal image file: {}, and it will be ' - 'ignored'.format(im_path)) - continue - - if im_w < 0 or im_h < 0: - logger.warning('Illegal width: {} or height: {} in annotation, ' - 'and im_id: {} will be ignored'.format( - im_w, im_h, img_id)) - continue - - coco_rec = { - 'im_file': im_path, - 'im_id': np.array([img_id]), - 'h': im_h, - 'w': im_w, - } if 'image' in self.data_fields else {} - - if not self.load_image_only: - ins_anno_ids = coco.getAnnIds( - imgIds=[img_id], iscrowd=None if self.load_crowd else False) - instances = coco.loadAnns(ins_anno_ids) - - bboxes = [] - is_rbox_anno = False - for inst in instances: - # check gt bbox - if inst.get('ignore', False): - continue - if 'bbox' not in inst.keys(): - continue - else: - if not any(np.array(inst['bbox'])): - continue - - x1, y1, box_w, box_h = inst['bbox'] - x2 = x1 + box_w - y2 = y1 + box_h - eps = 1e-5 - if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: - inst['clean_bbox'] = [ - round(float(x), 3) for x in [x1, y1, x2, y2] - ] - bboxes.append(inst) - else: - logger.warning( - 'Found an invalid bbox in annotations: im_id: {}, ' - 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( - img_id, float(inst['area']), x1, y1, x2, y2)) - - num_bbox = len(bboxes) - if num_bbox <= 0 and not self.allow_empty: - continue - elif num_bbox <= 0: - is_empty = True - - gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - gt_class = np.zeros((num_bbox, 1), dtype=np.int32) - is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) - gt_poly = [None] * num_bbox - gt_track_id = -np.ones((num_bbox, 1), dtype=np.int32) - - has_segmentation = False - has_track_id = False - for i, box in enumerate(bboxes): - catid = box['category_id'] - gt_class[i][0] = self.catid2clsid[catid] - gt_bbox[i, :] = box['clean_bbox'] - is_crowd[i][0] = box['iscrowd'] - # check RLE format - if 'segmentation' in box and box['iscrowd'] == 1: - gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] - elif 'segmentation' in box and box['segmentation']: - if not np.array( - box['segmentation'], - dtype=object).size > 0 and not self.allow_empty: - bboxes.pop(i) - gt_poly.pop(i) - np.delete(is_crowd, i) - np.delete(gt_class, i) - np.delete(gt_bbox, i) - else: 
- gt_poly[i] = box['segmentation'] - has_segmentation = True - - if 'track_id' in box: - gt_track_id[i][0] = box['track_id'] - has_track_id = True - - if has_segmentation and not any( - gt_poly) and not self.allow_empty: - continue - - gt_rec = { - 'is_crowd': is_crowd, - 'gt_class': gt_class, - 'gt_bbox': gt_bbox, - 'gt_poly': gt_poly, - } - if has_track_id: - gt_rec.update({'gt_track_id': gt_track_id}) - - for k, v in gt_rec.items(): - if k in self.data_fields: - coco_rec[k] = v - - # TODO: remove load_semantic - if self.load_semantic and 'semantic' in self.data_fields: - seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', - 'train2017', im_fname[:-3] + 'png') - coco_rec.update({'semantic': seg_path}) - - logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( - im_path, img_id, im_h, im_w)) - if is_empty: - empty_records.append(coco_rec) - else: - records.append(coco_rec) - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any coco record in %s' % (anno_path) - logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. - format(ct, len(img_ids) - ct, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs = records - - -@register -@serializable -class SlicedCOCODataSet(COCODataSet): - """Sliced COCODataSet""" - - def __init__( - self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=False, - empty_ratio=1., - repeat=1, - sliced_size=[640, 640], - overlap_ratio=[0.25, 0.25], ): - super(SlicedCOCODataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - load_crowd=load_crowd, - allow_empty=allow_empty, - empty_ratio=empty_ratio, - repeat=repeat, ) - self.sliced_size = sliced_size - self.overlap_ratio = overlap_ratio - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - assert anno_path.endswith('.json'), \ - 'invalid coco annotation file: ' + anno_path - from pycocotools.coco import COCO - coco = COCO(anno_path) - img_ids = coco.getImgIds() - img_ids.sort() - cat_ids = coco.getCatIds() - records = [] - empty_records = [] - ct = 0 - ct_sub = 0 - - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - self.cname2cid = dict({ - coco.loadCats(catid)[0]['name']: clsid - for catid, clsid in self.catid2clsid.items() - }) - - if 'annotations' not in coco.dataset: - self.load_image_only = True - logger.warning('Annotation file: {} does not contains ground truth ' - 'and load image information only.'.format(anno_path)) - try: - import sahi - from sahi.slicing import slice_image - except Exception as e: - logger.error( - 'sahi not found, plaese install sahi. ' - 'for example: `pip install sahi`, see https://github.com/obss/sahi.' 
- ) - raise e - - sub_img_ids = 0 - for img_id in img_ids: - img_anno = coco.loadImgs([img_id])[0] - im_fname = img_anno['file_name'] - im_w = float(img_anno['width']) - im_h = float(img_anno['height']) - - im_path = os.path.join(image_dir, - im_fname) if image_dir else im_fname - is_empty = False - if not os.path.exists(im_path): - logger.warning('Illegal image file: {}, and it will be ' - 'ignored'.format(im_path)) - continue - - if im_w < 0 or im_h < 0: - logger.warning('Illegal width: {} or height: {} in annotation, ' - 'and im_id: {} will be ignored'.format( - im_w, im_h, img_id)) - continue - - slice_image_result = sahi.slicing.slice_image( - image=im_path, - slice_height=self.sliced_size[0], - slice_width=self.sliced_size[1], - overlap_height_ratio=self.overlap_ratio[0], - overlap_width_ratio=self.overlap_ratio[1]) - - sub_img_num = len(slice_image_result) - for _ind in range(sub_img_num): - im = slice_image_result.images[_ind] - coco_rec = { - 'image': im, - 'im_id': np.array([sub_img_ids + _ind]), - 'h': im.shape[0], - 'w': im.shape[1], - 'ori_im_id': np.array([img_id]), - 'st_pix': np.array( - slice_image_result.starting_pixels[_ind], - dtype=np.float32), - 'is_last': 1 if _ind == sub_img_num - 1 else 0, - } if 'image' in self.data_fields else {} - records.append(coco_rec) - ct_sub += sub_img_num - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any coco record in %s' % (anno_path) - logger.info('{} samples and slice to {} sub_samples in file {}'.format( - ct, ct_sub, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs = records - - -@register -@serializable -class SemiCOCODataSet(COCODataSet): - """Semi-COCODataSet used for supervised and unsupervised dataSet""" - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=False, - empty_ratio=1., - repeat=1, - supervised=True): - super(SemiCOCODataSet, self).__init__( - dataset_dir, image_dir, anno_path, data_fields, sample_num, - load_crowd, allow_empty, empty_ratio, repeat) - self.supervised = supervised - self.length = -1 # defalut -1 means all - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - assert anno_path.endswith('.json'), \ - 'invalid coco annotation file: ' + anno_path - from pycocotools.coco import COCO - coco = COCO(anno_path) - img_ids = coco.getImgIds() - img_ids.sort() - cat_ids = coco.getCatIds() - records = [] - empty_records = [] - ct = 0 - - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - self.cname2cid = dict({ - coco.loadCats(catid)[0]['name']: clsid - for catid, clsid in self.catid2clsid.items() - }) - - if 'annotations' not in coco.dataset or self.supervised == False: - self.load_image_only = True - logger.warning('Annotation file: {} does not contains ground truth ' - 'and load image information only.'.format(anno_path)) - - for img_id in img_ids: - img_anno = coco.loadImgs([img_id])[0] - im_fname = img_anno['file_name'] - im_w = float(img_anno['width']) - im_h = float(img_anno['height']) - - im_path = os.path.join(image_dir, - im_fname) if image_dir else im_fname - is_empty = False - if not os.path.exists(im_path): - logger.warning('Illegal image file: {}, and it will be ' - 'ignored'.format(im_path)) - continue - - if 
im_w < 0 or im_h < 0: - logger.warning('Illegal width: {} or height: {} in annotation, ' - 'and im_id: {} will be ignored'.format( - im_w, im_h, img_id)) - continue - - coco_rec = { - 'im_file': im_path, - 'im_id': np.array([img_id]), - 'h': im_h, - 'w': im_w, - } if 'image' in self.data_fields else {} - - if not self.load_image_only: - ins_anno_ids = coco.getAnnIds( - imgIds=[img_id], iscrowd=None if self.load_crowd else False) - instances = coco.loadAnns(ins_anno_ids) - - bboxes = [] - is_rbox_anno = False - for inst in instances: - # check gt bbox - if inst.get('ignore', False): - continue - if 'bbox' not in inst.keys(): - continue - else: - if not any(np.array(inst['bbox'])): - continue - - x1, y1, box_w, box_h = inst['bbox'] - x2 = x1 + box_w - y2 = y1 + box_h - eps = 1e-5 - if inst['area'] > 0 and x2 - x1 > eps and y2 - y1 > eps: - inst['clean_bbox'] = [ - round(float(x), 3) for x in [x1, y1, x2, y2] - ] - bboxes.append(inst) - else: - logger.warning( - 'Found an invalid bbox in annotations: im_id: {}, ' - 'area: {} x1: {}, y1: {}, x2: {}, y2: {}.'.format( - img_id, float(inst['area']), x1, y1, x2, y2)) - - num_bbox = len(bboxes) - if num_bbox <= 0 and not self.allow_empty: - continue - elif num_bbox <= 0: - is_empty = True - - gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - gt_class = np.zeros((num_bbox, 1), dtype=np.int32) - is_crowd = np.zeros((num_bbox, 1), dtype=np.int32) - gt_poly = [None] * num_bbox - - has_segmentation = False - for i, box in enumerate(bboxes): - catid = box['category_id'] - gt_class[i][0] = self.catid2clsid[catid] - gt_bbox[i, :] = box['clean_bbox'] - is_crowd[i][0] = box['iscrowd'] - # check RLE format - if 'segmentation' in box and box['iscrowd'] == 1: - gt_poly[i] = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] - elif 'segmentation' in box and box['segmentation']: - if not np.array(box['segmentation'] - ).size > 0 and not self.allow_empty: - bboxes.pop(i) - gt_poly.pop(i) - np.delete(is_crowd, i) - np.delete(gt_class, i) - np.delete(gt_bbox, i) - else: - gt_poly[i] = box['segmentation'] - has_segmentation = True - - if has_segmentation and not any( - gt_poly) and not self.allow_empty: - continue - - gt_rec = { - 'is_crowd': is_crowd, - 'gt_class': gt_class, - 'gt_bbox': gt_bbox, - 'gt_poly': gt_poly, - } - - for k, v in gt_rec.items(): - if k in self.data_fields: - coco_rec[k] = v - - # TODO: remove load_semantic - if self.load_semantic and 'semantic' in self.data_fields: - seg_path = os.path.join(self.dataset_dir, 'stuffthingmaps', - 'train2017', im_fname[:-3] + 'png') - coco_rec.update({'semantic': seg_path}) - - logger.debug('Load file: {}, im_id: {}, h: {}, w: {}.'.format( - im_path, img_id, im_h, im_w)) - if is_empty: - empty_records.append(coco_rec) - else: - records.append(coco_rec) - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any coco record in %s' % (anno_path) - logger.info('Load [{} samples valid, {} samples invalid] in file {}.'. 
- format(ct, len(img_ids) - ct, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs = records - - if self.supervised: - logger.info(f'Use {len(self.roidbs)} sup_samples data as LABELED') - else: - if self.length > 0: # unsup length will be decide by sup length - all_roidbs = self.roidbs.copy() - selected_idxs = [ - np.random.choice(len(all_roidbs)) - for _ in range(self.length) - ] - self.roidbs = [all_roidbs[i] for i in selected_idxs] - logger.info( - f'Use {len(self.roidbs)} unsup_samples data as UNLABELED') - - def __getitem__(self, idx): - n = len(self.roidbs) - if self.repeat > 1: - idx %= n - # data batch - roidb = copy.deepcopy(self.roidbs[idx]) - if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: - roidb = [roidb, ] + [ - copy.deepcopy(self.roidbs[np.random.randint(n)]) - for _ in range(4) - ] - if isinstance(roidb, Sequence): - for r in roidb: - r['curr_iter'] = self._curr_iter - else: - roidb['curr_iter'] = self._curr_iter - self._curr_iter += 1 - - return self.transform(roidb) - - -# for PaddleX -@register -@serializable -class COCODetDataset(COCODataSet): - pass diff --git a/pdfdet/models/Paddle/ppdet/data/source/culane.py b/pdfdet/models/Paddle/ppdet/data/source/culane.py deleted file mode 100644 index 977d608..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/culane.py +++ /dev/null @@ -1,206 +0,0 @@ -from ppdet.core.workspace import register, serializable -import cv2 -import os -import tarfile -import numpy as np -import os.path as osp -from ppdet.data.source.dataset import DetDataset -from imgaug.augmentables.lines import LineStringsOnImage -from imgaug.augmentables.segmaps import SegmentationMapsOnImage -from ppdet.data.culane_utils import lane_to_linestrings -import pickle as pkl -from ppdet.utils.logger import setup_logger -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from .dataset import DetDataset, _make_dataset, _is_valid_file -from ppdet.utils.download import download_dataset - -logger = setup_logger(__name__) - - -@register -@serializable -class CULaneDataSet(DetDataset): - def __init__( - self, - dataset_dir, - cut_height, - list_path, - split='train', - data_fields=['image'], - video_file=None, - frame_rate=-1, ): - super(CULaneDataSet, self).__init__( - dataset_dir=dataset_dir, - cut_height=cut_height, - split=split, - data_fields=data_fields) - self.dataset_dir = dataset_dir - self.list_path = osp.join(dataset_dir, list_path) - self.cut_height = cut_height - self.data_fields = data_fields - self.split = split - self.training = 'train' in split - self.data_infos = [] - self.video_file = video_file - self.frame_rate = frame_rate - self._imid2path = {} - self.predict_dir = None - - def __len__(self): - return len(self.data_infos) - - def check_or_download_dataset(self): - if not osp.exists(self.dataset_dir): - download_dataset("dataset", dataset="culane") - # extract .tar files in self.dataset_dir - for fname in os.listdir(self.dataset_dir): - logger.info("Decompressing {}...".format(fname)) - # ignore .* files - if fname.startswith('.'): - continue - if fname.find('.tar.gz') >= 0: - with 
tarfile.open(osp.join(self.dataset_dir, fname)) as tf: - tf.extractall(path=self.dataset_dir) - logger.info("Dataset files are ready.") - - def parse_dataset(self): - logger.info('Loading CULane annotations...') - if self.predict_dir is not None: - logger.info('switch to predict mode') - return - # Waiting for the dataset to load is tedious, let's cache it - os.makedirs('cache', exist_ok=True) - cache_path = 'cache/culane_paddle_{}.pkl'.format(self.split) - if os.path.exists(cache_path): - with open(cache_path, 'rb') as cache_file: - self.data_infos = pkl.load(cache_file) - self.max_lanes = max( - len(anno['lanes']) for anno in self.data_infos) - return - - with open(self.list_path) as list_file: - for line in list_file: - infos = self.load_annotation(line.split()) - self.data_infos.append(infos) - - # cache data infos to file - with open(cache_path, 'wb') as cache_file: - pkl.dump(self.data_infos, cache_file) - - def load_annotation(self, line): - infos = {} - img_line = line[0] - img_line = img_line[1 if img_line[0] == '/' else 0::] - img_path = os.path.join(self.dataset_dir, img_line) - infos['img_name'] = img_line - infos['img_path'] = img_path - if len(line) > 1: - mask_line = line[1] - mask_line = mask_line[1 if mask_line[0] == '/' else 0::] - mask_path = os.path.join(self.dataset_dir, mask_line) - infos['mask_path'] = mask_path - - if len(line) > 2: - exist_list = [int(l) for l in line[2:]] - infos['lane_exist'] = np.array(exist_list) - - anno_path = img_path[: - -3] + 'lines.txt' # remove sufix jpg and add lines.txt - with open(anno_path, 'r') as anno_file: - data = [ - list(map(float, line.split())) for line in anno_file.readlines() - ] - lanes = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2) - if lane[i] >= 0 and lane[i + 1] >= 0] for lane in data] - lanes = [list(set(lane)) for lane in lanes] # remove duplicated points - lanes = [lane for lane in lanes - if len(lane) > 2] # remove lanes with less than 2 points - - lanes = [sorted( - lane, key=lambda x: x[1]) for lane in lanes] # sort by y - infos['lanes'] = lanes - - return infos - - def set_images(self, images): - self.predict_dir = images - self.data_infos = self._load_images() - - def _find_images(self): - predict_dir = self.predict_dir - if not isinstance(predict_dir, Sequence): - predict_dir = [predict_dir] - images = [] - for im_dir in predict_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.predict_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._find_images() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = { - 'im_id': np.array([ct]), - "img_path": os.path.abspath(image), - "img_name": os.path.basename(image), - "lanes": [] - } - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def __getitem__(self, idx): - data_info = self.data_infos[idx] - img = cv2.imread(data_info['img_path']) - img = img[self.cut_height:, :, :] - sample = data_info.copy() - sample.update({'image': img}) - img_org = sample['image'] - - if self.training: - label = cv2.imread(sample['mask_path'], cv2.IMREAD_UNCHANGED) - if len(label.shape) > 2: - label = label[:, :, 0] - label = label.squeeze() - label 
= label[self.cut_height:, :] - sample.update({'mask': label}) - if self.cut_height != 0: - new_lanes = [] - for i in sample['lanes']: - lanes = [] - for p in i: - lanes.append((p[0], p[1] - self.cut_height)) - new_lanes.append(lanes) - sample.update({'lanes': new_lanes}) - - sample['mask'] = SegmentationMapsOnImage( - sample['mask'], shape=img_org.shape) - - sample['full_img_path'] = data_info['img_path'] - sample['img_name'] = data_info['img_name'] - sample['im_id'] = np.array([idx]) - - sample['image'] = sample['image'].copy().astype(np.uint8) - sample['lanes'] = lane_to_linestrings(sample['lanes']) - sample['lanes'] = LineStringsOnImage( - sample['lanes'], shape=img_org.shape) - sample['seg'] = np.zeros(img_org.shape) - - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/source/dataset.py b/pdfdet/models/Paddle/ppdet/data/source/dataset.py deleted file mode 100644 index 4f22b22..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/dataset.py +++ /dev/null @@ -1,307 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import copy -import numpy as np -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from paddle.io import Dataset -from ppdet.core.workspace import register, serializable -from ppdet.utils.download import get_dataset_path -from ppdet.data import source - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@serializable -class DetDataset(Dataset): - """ - Load detection dataset. - - Args: - dataset_dir (str): root directory for dataset. - image_dir (str): directory for images. - anno_path (str): annotation file path. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - use_default_label (bool): whether to load default label list. - repeat (int): repeat times for dataset, use in benchmark. 
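[Editor's note] The CULane `parse_dataset` above caches the slow annotation parse with pickle. A minimal standalone sketch of that cache-then-parse pattern; the function and path names here are illustrative, not from the diff:

```python
import os
import pickle as pkl

def load_or_parse(cache_path, parse_fn):
    """Return parsed annotations, reusing a pickle cache when present."""
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            return pkl.load(f)                    # fast path: reuse cache
    data = parse_fn()                             # slow path: full parse
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    with open(cache_path, 'wb') as f:
        pkl.dump(data, f)                         # cache for the next run
    return data

infos = load_or_parse('cache/culane_train.pkl', lambda: [{'lanes': []}])
```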
- """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - use_default_label=None, - repeat=1, - **kwargs): - super(DetDataset, self).__init__() - self.dataset_dir = dataset_dir if dataset_dir is not None else '' - self.anno_path = anno_path - self.image_dir = image_dir if image_dir is not None else '' - self.data_fields = data_fields - self.sample_num = sample_num - self.use_default_label = use_default_label - self.repeat = repeat - self._epoch = 0 - self._curr_iter = 0 - - def __len__(self, ): - return len(self.roidbs) * self.repeat - - def __call__(self, *args, **kwargs): - return self - - def __getitem__(self, idx): - n = len(self.roidbs) - if self.repeat > 1: - idx %= n - # data batch - roidb = copy.deepcopy(self.roidbs[idx]) - if self.mixup_epoch == 0 or self._epoch < self.mixup_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.cutmix_epoch == 0 or self._epoch < self.cutmix_epoch: - idx = np.random.randint(n) - roidb = [roidb, copy.deepcopy(self.roidbs[idx])] - elif self.mosaic_epoch == 0 or self._epoch < self.mosaic_epoch: - roidb = [roidb, ] + [ - copy.deepcopy(self.roidbs[np.random.randint(n)]) - for _ in range(4) - ] - elif self.pre_img_epoch == 0 or self._epoch < self.pre_img_epoch: - # Add previous image as input, only used in CenterTrack - idx_pre_img = idx - 1 - if idx_pre_img < 0: - idx_pre_img = idx + 1 - roidb = [roidb, ] + [copy.deepcopy(self.roidbs[idx_pre_img])] - if isinstance(roidb, Sequence): - for r in roidb: - r['curr_iter'] = self._curr_iter - else: - roidb['curr_iter'] = self._curr_iter - self._curr_iter += 1 - - return self.transform(roidb) - - def check_or_download_dataset(self): - self.dataset_dir = get_dataset_path(self.dataset_dir, self.anno_path, - self.image_dir) - - def set_kwargs(self, **kwargs): - self.mixup_epoch = kwargs.get('mixup_epoch', -1) - self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) - self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) - self.pre_img_epoch = kwargs.get('pre_img_epoch', -1) - - def set_transform(self, transform): - self.transform = transform - - def set_epoch(self, epoch_id): - self._epoch = epoch_id - - def parse_dataset(self, ): - raise NotImplementedError( - "Need to implement parse_dataset method of Dataset") - - def get_anno(self): - if self.anno_path is None: - return - return os.path.join(self.dataset_dir, self.anno_path) - - -def _is_valid_file(f, extensions=('.jpg', '.jpeg', '.png', '.bmp')): - return f.lower().endswith(extensions) - - -def _make_dataset(dir): - dir = os.path.expanduser(dir) - if not os.path.isdir(dir): - raise ('{} should be a dir'.format(dir)) - images = [] - for root, _, fnames in sorted(os.walk(dir, followlinks=True)): - for fname in sorted(fnames): - path = os.path.join(root, fname) - if _is_valid_file(path): - images.append(path) - return images - - -@register -@serializable -class ImageFolder(DetDataset): - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - sample_num=-1, - use_default_label=None, - **kwargs): - super(ImageFolder, self).__init__( - dataset_dir, - image_dir, - anno_path, - sample_num=sample_num, - use_default_label=use_default_label) - self._imid2path = {} - self.roidbs = None - self.sample_num = sample_num - - def check_or_download_dataset(self): - return - - def get_anno(self): - if self.anno_path is None: - return - if self.dataset_dir: - return os.path.join(self.dataset_dir, self.anno_path) - else: - return self.anno_path - - def 
parse_dataset(self, ): - if not self.roidbs: - self.roidbs = self._load_images() - - def _parse(self): - image_dir = self.image_dir - if not isinstance(image_dir, Sequence): - image_dir = [image_dir] - images = [] - for im_dir in image_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.dataset_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._parse() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = {'im_id': np.array([ct]), 'im_file': image} - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def set_images(self, images): - self.image_dir = images - self.roidbs = self._load_images() - - def set_slice_images(self, - images, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25]): - self.image_dir = images - ori_records = self._load_images() - try: - import sahi - from sahi.slicing import slice_image - except Exception as e: - logger.error( - 'sahi not found, plaese install sahi. ' - 'for example: `pip install sahi`, see https://github.com/obss/sahi.' - ) - raise e - - sub_img_ids = 0 - ct = 0 - ct_sub = 0 - records = [] - for i, ori_rec in enumerate(ori_records): - im_path = ori_rec['im_file'] - slice_image_result = sahi.slicing.slice_image( - image=im_path, - slice_height=slice_size[0], - slice_width=slice_size[1], - overlap_height_ratio=overlap_ratio[0], - overlap_width_ratio=overlap_ratio[1]) - - sub_img_num = len(slice_image_result) - for _ind in range(sub_img_num): - im = slice_image_result.images[_ind] - rec = { - 'image': im, - 'im_id': np.array([sub_img_ids + _ind]), - 'h': im.shape[0], - 'w': im.shape[1], - 'ori_im_id': np.array([ori_rec['im_id'][0]]), - 'st_pix': np.array( - slice_image_result.starting_pixels[_ind], - dtype=np.float32), - 'is_last': 1 if _ind == sub_img_num - 1 else 0, - } if 'image' in self.data_fields else {} - records.append(rec) - ct_sub += sub_img_num - ct += 1 - logger.info('{} samples and slice to {} sub_samples.'.format(ct, - ct_sub)) - self.roidbs = records - - def get_label_list(self): - # Only VOC dataset needs label list in ImageFold - return self.anno_path - - -@register -class CommonDataset(object): - def __init__(self, **dataset_args): - super(CommonDataset, self).__init__() - dataset_args = copy.deepcopy(dataset_args) - type = dataset_args.pop("name") - self.dataset = getattr(source, type)(**dataset_args) - - def __call__(self): - return self.dataset - - -@register -class TrainDataset(CommonDataset): - pass - - -@register -class EvalMOTDataset(CommonDataset): - pass - - -@register -class TestMOTDataset(CommonDataset): - pass - - -@register -class EvalDataset(CommonDataset): - pass - - -@register -class TestDataset(CommonDataset): - pass diff --git a/pdfdet/models/Paddle/ppdet/data/source/keypoint_coco.py b/pdfdet/models/Paddle/ppdet/data/source/keypoint_coco.py deleted file mode 100644 index 86d8343..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/keypoint_coco.py +++ /dev/null @@ -1,845 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
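[Editor's note] `set_slice_images` above relies on SAHI for tiled (sliced) inference. A hedged usage sketch that mirrors only the arguments appearing in the deleted code; the input path is illustrative (see https://github.com/obss/sahi):

```python
from sahi.slicing import slice_image

result = slice_image(
    image='page.jpg',             # illustrative input
    slice_height=640,
    slice_width=640,
    overlap_height_ratio=0.25,
    overlap_width_ratio=0.25)

# Each sub-image comes with the top-left pixel of its slice, which is
# what the deleted code stores as 'st_pix' to map detections back.
for patch, origin in zip(result.images, result.starting_pixels):
    print(patch.shape, origin)
```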
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is base on https://github.com/open-mmlab/mmpose -""" -import os -import cv2 -import numpy as np -import json -import copy -import pycocotools -from pycocotools.coco import COCO -from .dataset import DetDataset -from ppdet.core.workspace import register, serializable - - -@serializable -class KeypointBottomUpBaseDataset(DetDataset): - """Base class for bottom-up datasets. - - All datasets should subclass it. - All subclasses should overwrite: - Methods:`_get_imganno` - - Args: - dataset_dir (str): Root path to the dataset. - anno_path (str): Relative path to the annotation file. - image_dir (str): Path to a directory where images are held. - Default: None. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - shard (list): [rank, worldsize], the distributed env params - test_mode (bool): Store True when building test or - validation dataset. Default: False. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[], - shard=[0, 1], - test_mode=False): - super().__init__(dataset_dir, image_dir, anno_path) - self.image_info = {} - self.ann_info = {} - - self.img_prefix = os.path.join(dataset_dir, image_dir) - self.transform = transform - self.test_mode = test_mode - - self.ann_info['num_joints'] = num_joints - self.img_ids = [] - - def parse_dataset(self): - pass - - def __len__(self): - """Get dataset length.""" - return len(self.img_ids) - - def _get_imganno(self, idx): - """Get anno for a single image.""" - raise NotImplementedError - - def __getitem__(self, idx): - """Prepare image for training given the index.""" - records = copy.deepcopy(self._get_imganno(idx)) - records['image'] = cv2.imread(records['image_file']) - records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) - if 'mask' in records: - records['mask'] = (records['mask'] + 0).astype('uint8') - records = self.transform(records) - return records - - def parse_dataset(self): - return - - -@register -@serializable -class KeypointBottomUpCocoDataset(KeypointBottomUpBaseDataset): - """COCO dataset for bottom-up pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - COCO keypoint indexes:: - - 0: 'nose', - 1: 'left_eye', - 2: 'right_eye', - 3: 'left_ear', - 4: 'right_ear', - 5: 'left_shoulder', - 6: 'right_shoulder', - 7: 'left_elbow', - 8: 'right_elbow', - 9: 'left_wrist', - 10: 'right_wrist', - 11: 'left_hip', - 12: 'right_hip', - 13: 'left_knee', - 14: 'right_knee', - 15: 'left_ankle', - 16: 'right_ankle' - - Args: - dataset_dir (str): Root path to the dataset. - anno_path (str): Relative path to the annotation file. - image_dir (str): Path to a directory where images are held. - Default: None. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - shard (list): [rank, worldsize], the distributed env params - test_mode (bool): Store True when building test or - validation dataset. Default: False. 
- """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[], - shard=[0, 1], - test_mode=False, - return_mask=True, - return_bbox=True, - return_area=True, - return_class=True): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform, shard, test_mode) - - self.ann_file = os.path.join(dataset_dir, anno_path) - self.shard = shard - self.test_mode = test_mode - self.return_mask = return_mask - self.return_bbox = return_bbox - self.return_area = return_area - self.return_class = return_class - - def parse_dataset(self): - self.coco = COCO(self.ann_file) - - self.img_ids = self.coco.getImgIds() - if not self.test_mode: - self.img_ids_tmp = [] - for img_id in self.img_ids: - ann_ids = self.coco.getAnnIds(imgIds=img_id) - anno = self.coco.loadAnns(ann_ids) - anno = [obj for obj in anno if obj['iscrowd'] == 0] - if len(anno) == 0: - continue - self.img_ids_tmp.append(img_id) - self.img_ids = self.img_ids_tmp - - blocknum = int(len(self.img_ids) / self.shard[1]) - self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( - self.shard[0] + 1))] - self.num_images = len(self.img_ids) - self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) - self.dataset_name = 'coco' - - cat_ids = self.coco.getCatIds() - self.catid2clsid = dict({catid: i for i, catid in enumerate(cat_ids)}) - print('=> num_images: {}'.format(self.num_images)) - - @staticmethod - def _get_mapping_id_name(imgs): - """ - Args: - imgs (dict): dict of image info. - - Returns: - tuple: Image name & id mapping dicts. - - - id2name (dict): Mapping image id to name. - - name2id (dict): Mapping image name to id. - """ - id2name = {} - name2id = {} - for image_id, image in imgs.items(): - file_name = image['file_name'] - id2name[image_id] = file_name - name2id[file_name] = image_id - - return id2name, name2id - - def _get_imganno(self, idx): - """Get anno for a single image. 
- - Args: - idx (int): image idx - - Returns: - dict: info for model training - """ - coco = self.coco - img_id = self.img_ids[idx] - ann_ids = coco.getAnnIds(imgIds=img_id) - anno = coco.loadAnns(ann_ids) - - anno = [ - obj for obj in anno - if obj['iscrowd'] == 0 and obj['num_keypoints'] > 0 - ] - - db_rec = {} - joints, orgsize = self._get_joints(anno, idx) - db_rec['gt_joints'] = joints - db_rec['im_shape'] = orgsize - - if self.return_bbox: - db_rec['gt_bbox'] = self._get_bboxs(anno, idx) - - if self.return_class: - db_rec['gt_class'] = self._get_labels(anno, idx) - - if self.return_area: - db_rec['gt_areas'] = self._get_areas(anno, idx) - - if self.return_mask: - db_rec['mask'] = self._get_mask(anno, idx) - - db_rec['im_id'] = img_id - db_rec['image_file'] = os.path.join(self.img_prefix, - self.id2name[img_id]) - - return db_rec - - def _get_joints(self, anno, idx): - """Get joints for all people in an image.""" - num_people = len(anno) - - joints = np.zeros( - (num_people, self.ann_info['num_joints'], 3), dtype=np.float32) - - for i, obj in enumerate(anno): - joints[i, :self.ann_info['num_joints'], :3] = \ - np.array(obj['keypoints']).reshape([-1, 3]) - - img_info = self.coco.loadImgs(self.img_ids[idx])[0] - orgsize = np.array([img_info['height'], img_info['width'], 1]) - - return joints, orgsize - - def _get_bboxs(self, anno, idx): - num_people = len(anno) - gt_bboxes = np.zeros((num_people, 4), dtype=np.float32) - - for idx, obj in enumerate(anno): - if 'bbox' in obj: - gt_bboxes[idx, :] = obj['bbox'] - - gt_bboxes[:, 2] += gt_bboxes[:, 0] - gt_bboxes[:, 3] += gt_bboxes[:, 1] - return gt_bboxes - - def _get_labels(self, anno, idx): - num_people = len(anno) - gt_labels = np.zeros((num_people, 1), dtype=np.float32) - - for idx, obj in enumerate(anno): - if 'category_id' in obj: - catid = obj['category_id'] - gt_labels[idx, 0] = self.catid2clsid[catid] - return gt_labels - - def _get_areas(self, anno, idx): - num_people = len(anno) - gt_areas = np.zeros((num_people, ), dtype=np.float32) - - for idx, obj in enumerate(anno): - if 'area' in obj: - gt_areas[idx, ] = obj['area'] - return gt_areas - - def _get_mask(self, anno, idx): - """Get ignore masks to mask out losses.""" - coco = self.coco - img_info = coco.loadImgs(self.img_ids[idx])[0] - - m = np.zeros((img_info['height'], img_info['width']), dtype=np.float32) - - for obj in anno: - if 'segmentation' in obj: - if obj['iscrowd']: - rle = pycocotools.mask.frPyObjects(obj['segmentation'], - img_info['height'], - img_info['width']) - m += pycocotools.mask.decode(rle) - elif obj['num_keypoints'] == 0: - rles = pycocotools.mask.frPyObjects(obj['segmentation'], - img_info['height'], - img_info['width']) - for rle in rles: - m += pycocotools.mask.decode(rle) - - return m < 0.5 - - -@register -@serializable -class KeypointBottomUpCrowdPoseDataset(KeypointBottomUpCocoDataset): - """CrowdPose dataset for bottom-up pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - CrowdPose keypoint indexes:: - - 0: 'left_shoulder', - 1: 'right_shoulder', - 2: 'left_elbow', - 3: 'right_elbow', - 4: 'left_wrist', - 5: 'right_wrist', - 6: 'left_hip', - 7: 'right_hip', - 8: 'left_knee', - 9: 'right_knee', - 10: 'left_ankle', - 11: 'right_ankle', - 12: 'top_head', - 13: 'neck' - - Args: - dataset_dir (str): Root path to the dataset. - anno_path (str): Relative path to the annotation file. - image_dir (str): Path to a directory where images are held. 
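[Editor's note] `_get_bboxs` above converts COCO's `[x, y, w, h]` boxes to `[x1, y1, x2, y2]` in place. The same two lines on a toy array:

```python
import numpy as np

boxes = np.array([[10., 20., 30., 40.]])  # one COCO-style [x, y, w, h] box
boxes[:, 2] += boxes[:, 0]                # x2 = x + w
boxes[:, 3] += boxes[:, 1]                # y2 = y + h
print(boxes)                              # [[10. 20. 40. 60.]]
```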
- Default: None. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - shard (list): [rank, worldsize], the distributed env params - test_mode (bool): Store True when building test or - validation dataset. Default: False. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[], - shard=[0, 1], - test_mode=False): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform, shard, test_mode) - - self.ann_file = os.path.join(dataset_dir, anno_path) - self.shard = shard - self.test_mode = test_mode - - def parse_dataset(self): - self.coco = COCO(self.ann_file) - - self.img_ids = self.coco.getImgIds() - if not self.test_mode: - self.img_ids = [ - img_id for img_id in self.img_ids - if len(self.coco.getAnnIds( - imgIds=img_id, iscrowd=None)) > 0 - ] - blocknum = int(len(self.img_ids) / self.shard[1]) - self.img_ids = self.img_ids[(blocknum * self.shard[0]):(blocknum * ( - self.shard[0] + 1))] - self.num_images = len(self.img_ids) - self.id2name, self.name2id = self._get_mapping_id_name(self.coco.imgs) - - self.dataset_name = 'crowdpose' - print('=> num_images: {}'.format(self.num_images)) - - -@serializable -class KeypointTopDownBaseDataset(DetDataset): - """Base class for top_down datasets. - - All datasets should subclass it. - All subclasses should overwrite: - Methods:`_get_db` - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. - num_joints (int): keypoint numbers - transform (composed(operators)): A sequence of data transforms. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[]): - super().__init__(dataset_dir, image_dir, anno_path) - self.image_info = {} - self.ann_info = {} - - self.img_prefix = os.path.join(dataset_dir, image_dir) - self.transform = transform - - self.ann_info['num_joints'] = num_joints - self.db = [] - - def __len__(self): - """Get dataset length.""" - return len(self.db) - - def _get_db(self): - """Get a sample""" - raise NotImplementedError - - def __getitem__(self, idx): - """Prepare sample for training given the index.""" - records = copy.deepcopy(self.db[idx]) - records['image'] = cv2.imread(records['image_file'], cv2.IMREAD_COLOR | - cv2.IMREAD_IGNORE_ORIENTATION) - records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) - records['score'] = records['score'] if 'score' in records else 1 - records = self.transform(records) - # print('records', records) - return records - - -@register -@serializable -class KeypointTopDownCocoDataset(KeypointTopDownBaseDataset): - """COCO dataset for top-down pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - COCO keypoint indexes: - - 0: 'nose', - 1: 'left_eye', - 2: 'right_eye', - 3: 'left_ear', - 4: 'right_ear', - 5: 'left_shoulder', - 6: 'right_shoulder', - 7: 'left_elbow', - 8: 'right_elbow', - 9: 'left_wrist', - 10: 'right_wrist', - 11: 'left_hip', - 12: 'right_hip', - 13: 'left_knee', - 14: 'right_knee', - 15: 'left_ankle', - 16: 'right_ankle' - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. 
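[Editor's note] The `__getitem__` above reads images with `cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION`; the second flag stops OpenCV from auto-rotating JPEGs that carry an EXIF orientation tag, so pixel coordinates stay aligned with the stored annotations. In isolation (path illustrative):

```python
import cv2

flags = cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION
img = cv2.imread('demo.jpg', flags)          # BGR, never EXIF-rotated
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)   # models expect RGB
```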
- num_joints (int): Keypoint numbers - trainsize (list):[w, h] Image target size - transform (composed(operators)): A sequence of data transforms. - bbox_file (str): Path to a detection bbox file - Default: None. - use_gt_bbox (bool): Whether to use ground truth bbox - Default: True. - pixel_std (int): The pixel std of the scale - Default: 200. - image_thre (float): The threshold to filter the detection box - Default: 0.0. - """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - trainsize, - transform=[], - bbox_file=None, - use_gt_bbox=True, - pixel_std=200, - image_thre=0.0, - center_scale=None): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform) - - self.bbox_file = bbox_file - self.use_gt_bbox = use_gt_bbox - self.trainsize = trainsize - self.pixel_std = pixel_std - self.image_thre = image_thre - self.center_scale = center_scale - self.dataset_name = 'coco' - - def parse_dataset(self): - if self.use_gt_bbox: - self.db = self._load_coco_keypoint_annotations() - else: - self.db = self._load_coco_person_detection_results() - - def _load_coco_keypoint_annotations(self): - coco = COCO(self.get_anno()) - img_ids = coco.getImgIds() - gt_db = [] - for index in img_ids: - im_ann = coco.loadImgs(index)[0] - width = im_ann['width'] - height = im_ann['height'] - file_name = im_ann['file_name'] - im_id = int(im_ann["id"]) - - annIds = coco.getAnnIds(imgIds=index, iscrowd=False) - objs = coco.loadAnns(annIds) - - valid_objs = [] - for obj in objs: - x, y, w, h = obj['bbox'] - x1 = np.max((0, x)) - y1 = np.max((0, y)) - x2 = np.min((width - 1, x1 + np.max((0, w - 1)))) - y2 = np.min((height - 1, y1 + np.max((0, h - 1)))) - if obj['area'] > 0 and x2 >= x1 and y2 >= y1: - obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] - valid_objs.append(obj) - objs = valid_objs - - rec = [] - for obj in objs: - if max(obj['keypoints']) == 0: - continue - - joints = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - joints_vis = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - for ipt in range(self.ann_info['num_joints']): - joints[ipt, 0] = obj['keypoints'][ipt * 3 + 0] - joints[ipt, 1] = obj['keypoints'][ipt * 3 + 1] - joints[ipt, 2] = 0 - t_vis = obj['keypoints'][ipt * 3 + 2] - if t_vis > 1: - t_vis = 1 - joints_vis[ipt, 0] = t_vis - joints_vis[ipt, 1] = t_vis - joints_vis[ipt, 2] = 0 - - center, scale = self._box2cs(obj['clean_bbox'][:4]) - rec.append({ - 'image_file': os.path.join(self.img_prefix, file_name), - 'center': center, - 'scale': scale, - 'gt_joints': joints, - 'joints_vis': joints_vis, - 'im_id': im_id, - }) - gt_db.extend(rec) - - return gt_db - - def _box2cs(self, box): - x, y, w, h = box[:4] - center = np.zeros((2), dtype=np.float32) - center[0] = x + w * 0.5 - center[1] = y + h * 0.5 - aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] - - if self.center_scale is not None and np.random.rand() < 0.3: - center += self.center_scale * (np.random.rand(2) - 0.5) * [w, h] - - if w > aspect_ratio * h: - h = w * 1.0 / aspect_ratio - elif w < aspect_ratio * h: - w = h * aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - if center[0] != -1: - scale = scale * 1.25 - - return center, scale - - def _load_coco_person_detection_results(self): - all_boxes = None - bbox_file_path = os.path.join(self.dataset_dir, self.bbox_file) - with open(bbox_file_path, 'r') as f: - all_boxes = json.load(f) - - if not all_boxes: - print('=> Load %s fail!' 
% bbox_file_path) - return None - - kpt_db = [] - for n_img in range(0, len(all_boxes)): - det_res = all_boxes[n_img] - if det_res['category_id'] != 1: - continue - file_name = det_res[ - 'filename'] if 'filename' in det_res else '%012d.jpg' % det_res[ - 'image_id'] - img_name = os.path.join(self.img_prefix, file_name) - box = det_res['bbox'] - score = det_res['score'] - im_id = int(det_res['image_id']) - - if score < self.image_thre: - continue - - center, scale = self._box2cs(box) - joints = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - joints_vis = np.ones( - (self.ann_info['num_joints'], 3), dtype=np.float32) - kpt_db.append({ - 'image_file': img_name, - 'im_id': im_id, - 'center': center, - 'scale': scale, - 'score': score, - 'gt_joints': joints, - 'joints_vis': joints_vis, - }) - - return kpt_db - - -@register -@serializable -class KeypointTopDownCocoWholeBodyHandDataset(KeypointTopDownBaseDataset): - """CocoWholeBody dataset for top-down hand pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - COCO-WholeBody Hand keypoint indexes: - - 0: 'wrist', - 1: 'thumb1', - 2: 'thumb2', - 3: 'thumb3', - 4: 'thumb4', - 5: 'forefinger1', - 6: 'forefinger2', - 7: 'forefinger3', - 8: 'forefinger4', - 9: 'middle_finger1', - 10: 'middle_finger2', - 11: 'middle_finger3', - 12: 'middle_finger4', - 13: 'ring_finger1', - 14: 'ring_finger2', - 15: 'ring_finger3', - 16: 'ring_finger4', - 17: 'pinky_finger1', - 18: 'pinky_finger2', - 19: 'pinky_finger3', - 20: 'pinky_finger4' - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. - num_joints (int): Keypoint numbers - trainsize (list):[w, h] Image target size - transform (composed(operators)): A sequence of data transforms. - pixel_std (int): The pixel std of the scale - Default: 200. 
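[Editor's note] `_box2cs` above turns a box into the center/scale pair used by top-down pose pipelines: fit the box to the training aspect ratio, express its size in `pixel_std` units, then enlarge by 25%. A condensed version; the `trainsize` default is illustrative and the random `center_scale` jitter is omitted:

```python
import numpy as np

def box2cs(box, trainsize=(288, 384), pixel_std=200):
    x, y, w, h = box[:4]
    center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32)
    aspect = trainsize[0] / trainsize[1]
    if w > aspect * h:
        h = w / aspect                    # box too wide: grow height
    elif w < aspect * h:
        w = h * aspect                    # box too tall: grow width
    scale = np.array([w / pixel_std, h / pixel_std], dtype=np.float32)
    if center[0] != -1:
        scale = scale * 1.25              # enlarge the crop by 25%
    return center, scale

print(box2cs([10, 20, 100, 50]))  # center [60. 45.], scale ~[0.625 0.833]
```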
- """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - trainsize, - transform=[], - pixel_std=200): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform) - - self.trainsize = trainsize - self.pixel_std = pixel_std - self.dataset_name = 'coco_wholebady_hand' - - def _box2cs(self, box): - x, y, w, h = box[:4] - center = np.zeros((2), dtype=np.float32) - center[0] = x + w * 0.5 - center[1] = y + h * 0.5 - aspect_ratio = self.trainsize[0] * 1.0 / self.trainsize[1] - - if w > aspect_ratio * h: - h = w * 1.0 / aspect_ratio - elif w < aspect_ratio * h: - w = h * aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - if center[0] != -1: - scale = scale * 1.25 - - return center, scale - - def parse_dataset(self): - gt_db = [] - num_joints = self.ann_info['num_joints'] - coco = COCO(self.get_anno()) - img_ids = list(coco.imgs.keys()) - for img_id in img_ids: - im_ann = coco.loadImgs(img_id)[0] - image_file = os.path.join(self.img_prefix, im_ann['file_name']) - im_id = int(im_ann["id"]) - - ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) - objs = coco.loadAnns(ann_ids) - - for obj in objs: - for type in ['left', 'right']: - if (obj[f'{type}hand_valid'] and - max(obj[f'{type}hand_kpts']) > 0): - - joints = np.zeros((num_joints, 3), dtype=np.float32) - joints_vis = np.zeros((num_joints, 3), dtype=np.float32) - - keypoints = np.array(obj[f'{type}hand_kpts']) - keypoints = keypoints.reshape(-1, 3) - joints[:, :2] = keypoints[:, :2] - joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) - - center, scale = self._box2cs(obj[f'{type}hand_box'][:4]) - gt_db.append({ - 'image_file': image_file, - 'center': center, - 'scale': scale, - 'gt_joints': joints, - 'joints_vis': joints_vis, - 'im_id': im_id, - }) - - self.db = gt_db - - -@register -@serializable -class KeypointTopDownMPIIDataset(KeypointTopDownBaseDataset): - """MPII dataset for topdown pose estimation. - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - MPII keypoint indexes:: - - 0: 'right_ankle', - 1: 'right_knee', - 2: 'right_hip', - 3: 'left_hip', - 4: 'left_knee', - 5: 'left_ankle', - 6: 'pelvis', - 7: 'thorax', - 8: 'upper_neck', - 9: 'head_top', - 10: 'right_wrist', - 11: 'right_elbow', - 12: 'right_shoulder', - 13: 'left_shoulder', - 14: 'left_elbow', - 15: 'left_wrist', - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - anno_path (str): Relative path to the annotation file. - num_joints (int): Keypoint numbers - trainsize (list):[w, h] Image target size - transform (composed(operators)): A sequence of data transforms. 
- """ - - def __init__(self, - dataset_dir, - image_dir, - anno_path, - num_joints, - transform=[]): - super().__init__(dataset_dir, image_dir, anno_path, num_joints, - transform) - - self.dataset_name = 'mpii' - - def parse_dataset(self): - with open(self.get_anno()) as anno_file: - anno = json.load(anno_file) - - gt_db = [] - for a in anno: - image_name = a['image'] - im_id = a['image_id'] if 'image_id' in a else int( - os.path.splitext(image_name)[0]) - - c = np.array(a['center'], dtype=np.float32) - s = np.array([a['scale'], a['scale']], dtype=np.float32) - - # Adjust center/scale slightly to avoid cropping limbs - if c[0] != -1: - c[1] = c[1] + 15 * s[1] - s = s * 1.25 - c = c - 1 - - joints = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - joints_vis = np.zeros( - (self.ann_info['num_joints'], 3), dtype=np.float32) - if 'gt_joints' in a: - joints_ = np.array(a['gt_joints']) - joints_[:, 0:2] = joints_[:, 0:2] - 1 - joints_vis_ = np.array(a['joints_vis']) - assert len(joints_) == self.ann_info[ - 'num_joints'], 'joint num diff: {} vs {}'.format( - len(joints_), self.ann_info['num_joints']) - - joints[:, 0:2] = joints_[:, 0:2] - joints_vis[:, 0] = joints_vis_[:] - joints_vis[:, 1] = joints_vis_[:] - - gt_db.append({ - 'image_file': os.path.join(self.img_prefix, image_name), - 'im_id': im_id, - 'center': c, - 'scale': s, - 'gt_joints': joints, - 'joints_vis': joints_vis - }) - print("number length: {}".format(len(gt_db))) - self.db = gt_db diff --git a/pdfdet/models/Paddle/ppdet/data/source/mot.py b/pdfdet/models/Paddle/ppdet/data/source/mot.py deleted file mode 100644 index 90a8a1f..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/mot.py +++ /dev/null @@ -1,638 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import cv2 -import glob -import numpy as np -from collections import OrderedDict, defaultdict -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from .dataset import DetDataset, _make_dataset, _is_valid_file -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class MOTDataSet(DetDataset): - """ - Load dataset with MOT format, only support single class MOT. - - Args: - dataset_dir (str): root directory for dataset. - image_lists (str|list): mot data image lists, muiti-source mot dataset. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - repeat (int): repeat times for dataset, use in benchmark. - - Notes: - MOT datasets root directory following this: - dataset/mot - |——————image_lists - | |——————caltech.train - | |——————caltech.val - | |——————mot16.train - | |——————mot17.train - | ...... - |——————Caltech - |——————MOT17 - |——————...... 
- - All the MOT datasets have the following structure: - Caltech - |——————images - | └——————00001.jpg - | |—————— ... - | └——————0000N.jpg - └——————labels_with_ids - └——————00001.txt - |—————— ... - └——————0000N.txt - or - - MOT17 - |——————images - | └——————train - | └——————test - └——————labels_with_ids - └——————train - """ - - def __init__(self, - dataset_dir=None, - image_lists=[], - data_fields=['image'], - sample_num=-1, - repeat=1): - super(MOTDataSet, self).__init__( - dataset_dir=dataset_dir, - data_fields=data_fields, - sample_num=sample_num, - repeat=repeat) - self.dataset_dir = dataset_dir - self.image_lists = image_lists - if isinstance(self.image_lists, str): - self.image_lists = [self.image_lists] - self.roidbs = None - self.cname2cid = None - - def get_anno(self): - if self.image_lists == []: - return - # only used to get categories and metric - # only check first data, but the label_list of all data should be same. - first_mot_data = self.image_lists[0].split('.')[0] - anno_file = os.path.join(self.dataset_dir, first_mot_data, - 'label_list.txt') - return anno_file - - def parse_dataset(self): - self.img_files = OrderedDict() - self.img_start_index = OrderedDict() - self.label_files = OrderedDict() - self.tid_num = OrderedDict() - self.tid_start_index = OrderedDict() - - img_index = 0 - for data_name in self.image_lists: - # check every data image list - image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') - assert os.path.isdir(image_lists_dir), \ - "The {} is not a directory.".format(image_lists_dir) - - list_path = os.path.join(image_lists_dir, data_name) - assert os.path.exists(list_path), \ - "The list path {} does not exist.".format(list_path) - - # record img_files, filter out empty ones - with open(list_path, 'r') as file: - self.img_files[data_name] = file.readlines() - self.img_files[data_name] = [ - os.path.join(self.dataset_dir, x.strip()) - for x in self.img_files[data_name] - ] - self.img_files[data_name] = list( - filter(lambda x: len(x) > 0, self.img_files[data_name])) - - self.img_start_index[data_name] = img_index - img_index += len(self.img_files[data_name]) - - # record label_files - self.label_files[data_name] = [ - x.replace('images', 'labels_with_ids').replace( - '.png', '.txt').replace('.jpg', '.txt') - for x in self.img_files[data_name] - ] - - for data_name, label_paths in self.label_files.items(): - max_index = -1 - for lp in label_paths: - lb = np.loadtxt(lp) - if len(lb) < 1: - continue - if len(lb.shape) < 2: - img_max = lb[1] - else: - img_max = np.max(lb[:, 1]) - if img_max > max_index: - max_index = img_max - self.tid_num[data_name] = int(max_index + 1) - - last_index = 0 - for i, (k, v) in enumerate(self.tid_num.items()): - self.tid_start_index[k] = last_index - last_index += v - - self.num_identities_dict = defaultdict(int) - self.num_identities_dict[0] = int(last_index + 1) # single class - self.num_imgs_each_data = [len(x) for x in self.img_files.values()] - self.total_imgs = sum(self.num_imgs_each_data) - - logger.info('MOT dataset summary: ') - logger.info(self.tid_num) - logger.info('Total images: {}'.format(self.total_imgs)) - logger.info('Image start index: {}'.format(self.img_start_index)) - logger.info('Total identities: {}'.format(self.num_identities_dict[0])) - logger.info('Identity start index: {}'.format(self.tid_start_index)) - - records = [] - cname2cid = mot_label() - - for img_index in range(self.total_imgs): - for i, (k, v) in enumerate(self.img_start_index.items()): - if img_index >= v: - data_name = 
list(self.label_files.keys())[i] - start_index = v - img_file = self.img_files[data_name][img_index - start_index] - lbl_file = self.label_files[data_name][img_index - start_index] - - if not os.path.exists(img_file): - logger.warning('Illegal image file: {}, and it will be ignored'. - format(img_file)) - continue - if not os.path.isfile(lbl_file): - logger.warning('Illegal label file: {}, and it will be ignored'. - format(lbl_file)) - continue - - labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) - # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] - - cx, cy = labels[:, 2], labels[:, 3] - w, h = labels[:, 4], labels[:, 5] - gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') - gt_class = labels[:, 0:1].astype('int32') - gt_score = np.ones((len(labels), 1)).astype('float32') - gt_ide = labels[:, 1:2].astype('int32') - for i, _ in enumerate(gt_ide): - if gt_ide[i] > -1: - gt_ide[i] += self.tid_start_index[data_name] - - mot_rec = { - 'im_file': img_file, - 'im_id': img_index, - } if 'image' in self.data_fields else {} - - gt_rec = { - 'gt_class': gt_class, - 'gt_score': gt_score, - 'gt_bbox': gt_bbox, - 'gt_ide': gt_ide, - } - - for k, v in gt_rec.items(): - if k in self.data_fields: - mot_rec[k] = v - - records.append(mot_rec) - if self.sample_num > 0 and img_index >= self.sample_num: - break - assert len(records) > 0, 'not found any mot record in %s' % ( - self.image_lists) - self.roidbs, self.cname2cid = records, cname2cid - - -@register -@serializable -class MCMOTDataSet(DetDataset): - """ - Load dataset with MOT format, support multi-class MOT. - - Args: - dataset_dir (str): root directory for dataset. - image_lists (list(str)): mcmot data image lists, muiti-source mcmot dataset. - data_fields (list): key name of data dictionary, at least have 'image'. - label_list (str): if use_default_label is False, will load - mapping between category and class index. - sample_num (int): number of samples to load, -1 means all. - - Notes: - MCMOT datasets root directory following this: - dataset/mot - |——————image_lists - | |——————visdrone_mcmot.train - | |——————visdrone_mcmot.val - visdrone_mcmot - |——————images - | └——————train - | └——————val - └——————labels_with_ids - └——————train - """ - - def __init__(self, - dataset_dir=None, - image_lists=[], - data_fields=['image'], - label_list=None, - sample_num=-1): - super(MCMOTDataSet, self).__init__( - dataset_dir=dataset_dir, - data_fields=data_fields, - sample_num=sample_num) - self.dataset_dir = dataset_dir - self.image_lists = image_lists - if isinstance(self.image_lists, str): - self.image_lists = [self.image_lists] - self.label_list = label_list - self.roidbs = None - self.cname2cid = None - - def get_anno(self): - if self.image_lists == []: - return - # only used to get categories and metric - # only check first data, but the label_list of all data should be same. 
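[Editor's note] Two conventions from the MOT loader above, shown standalone: label files mirror image paths under `labels_with_ids`, and each label row is `[gt_class, gt_identity, cx, cy, w, h]`:

```python
import numpy as np

def label_path_for(img_path):
    return (img_path.replace('images', 'labels_with_ids')
                    .replace('.png', '.txt').replace('.jpg', '.txt'))

print(label_path_for('MOT17/images/train/seq/000001.jpg'))
# -> MOT17/labels_with_ids/train/seq/000001.txt

row = np.array([[0, 7, 0.5, 0.5, 0.1, 0.2]], dtype=np.float32)
gt_class = row[:, 0:1].astype('int32')   # class index (0 = person)
gt_ide = row[:, 1:2].astype('int32')     # track id, later offset per dataset
gt_bbox = row[:, 2:6]                    # center-x, center-y, width, height
```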
- first_mot_data = self.image_lists[0].split('.')[0] - anno_file = os.path.join(self.dataset_dir, first_mot_data, - 'label_list.txt') - return anno_file - - def parse_dataset(self): - self.img_files = OrderedDict() - self.img_start_index = OrderedDict() - self.label_files = OrderedDict() - self.tid_num = OrderedDict() - self.tid_start_idx_of_cls_ids = defaultdict(dict) # for MCMOT - - img_index = 0 - for data_name in self.image_lists: - # check every data image list - image_lists_dir = os.path.join(self.dataset_dir, 'image_lists') - assert os.path.isdir(image_lists_dir), \ - "The {} is not a directory.".format(image_lists_dir) - - list_path = os.path.join(image_lists_dir, data_name) - assert os.path.exists(list_path), \ - "The list path {} does not exist.".format(list_path) - - # record img_files, filter out empty ones - with open(list_path, 'r') as file: - self.img_files[data_name] = file.readlines() - self.img_files[data_name] = [ - os.path.join(self.dataset_dir, x.strip()) - for x in self.img_files[data_name] - ] - self.img_files[data_name] = list( - filter(lambda x: len(x) > 0, self.img_files[data_name])) - - self.img_start_index[data_name] = img_index - img_index += len(self.img_files[data_name]) - - # record label_files - self.label_files[data_name] = [ - x.replace('images', 'labels_with_ids').replace( - '.png', '.txt').replace('.jpg', '.txt') - for x in self.img_files[data_name] - ] - - for data_name, label_paths in self.label_files.items(): - # using max_ids_dict rather than max_index - max_ids_dict = defaultdict(int) - for lp in label_paths: - lb = np.loadtxt(lp) - if len(lb) < 1: - continue - lb = lb.reshape(-1, 6) - for item in lb: - if item[1] > max_ids_dict[int(item[0])]: - # item[0]: cls_id - # item[1]: track id - max_ids_dict[int(item[0])] = int(item[1]) - # track id number - self.tid_num[data_name] = max_ids_dict - - last_idx_dict = defaultdict(int) - for i, (k, v) in enumerate(self.tid_num.items()): # each sub dataset - for cls_id, id_num in v.items(): # v is a max_ids_dict - self.tid_start_idx_of_cls_ids[k][cls_id] = last_idx_dict[cls_id] - last_idx_dict[cls_id] += id_num - - self.num_identities_dict = defaultdict(int) - for k, v in last_idx_dict.items(): - self.num_identities_dict[k] = int(v) # total ids of each category - - self.num_imgs_each_data = [len(x) for x in self.img_files.values()] - self.total_imgs = sum(self.num_imgs_each_data) - - # cname2cid and cid2cname - cname2cid = {} - if self.label_list is not None: - # if use label_list for multi source mix dataset, - # please make sure label_list in the first sub_dataset at least. - sub_dataset = self.image_lists[0].split('.')[0] - label_path = os.path.join(self.dataset_dir, sub_dataset, - self.label_list) - if not os.path.exists(label_path): - logger.info( - "Note: label_list {} does not exists, use VisDrone 10 classes labels as default.". 
- format(label_path)) - cname2cid = visdrone_mcmot_label() - else: - with open(label_path, 'r') as fr: - label_id = 0 - for line in fr.readlines(): - cname2cid[line.strip()] = label_id - label_id += 1 - else: - cname2cid = visdrone_mcmot_label() - - cid2cname = dict([(v, k) for (k, v) in cname2cid.items()]) - - logger.info('MCMOT dataset summary: ') - logger.info(self.tid_num) - logger.info('Total images: {}'.format(self.total_imgs)) - logger.info('Image start index: {}'.format(self.img_start_index)) - - logger.info('Total identities of each category: ') - num_identities_dict = sorted( - self.num_identities_dict.items(), key=lambda x: x[0]) - total_IDs_all_cats = 0 - for (k, v) in num_identities_dict: - logger.info('Category {} [{}] has {} IDs.'.format(k, cid2cname[k], - v)) - total_IDs_all_cats += v - logger.info('Total identities of all categories: {}'.format( - total_IDs_all_cats)) - - logger.info('Identity start index of each category: ') - for k, v in self.tid_start_idx_of_cls_ids.items(): - sorted_v = sorted(v.items(), key=lambda x: x[0]) - for (cls_id, start_idx) in sorted_v: - logger.info('Start index of dataset {} category {:d} is {:d}' - .format(k, cls_id, start_idx)) - - records = [] - for img_index in range(self.total_imgs): - for i, (k, v) in enumerate(self.img_start_index.items()): - if img_index >= v: - data_name = list(self.label_files.keys())[i] - start_index = v - img_file = self.img_files[data_name][img_index - start_index] - lbl_file = self.label_files[data_name][img_index - start_index] - - if not os.path.exists(img_file): - logger.warning('Illegal image file: {}, and it will be ignored'. - format(img_file)) - continue - if not os.path.isfile(lbl_file): - logger.warning('Illegal label file: {}, and it will be ignored'. - format(lbl_file)) - continue - - labels = np.loadtxt(lbl_file, dtype=np.float32).reshape(-1, 6) - # each row in labels (N, 6) is [gt_class, gt_identity, cx, cy, w, h] - - cx, cy = labels[:, 2], labels[:, 3] - w, h = labels[:, 4], labels[:, 5] - gt_bbox = np.stack((cx, cy, w, h)).T.astype('float32') - gt_class = labels[:, 0:1].astype('int32') - gt_score = np.ones((len(labels), 1)).astype('float32') - gt_ide = labels[:, 1:2].astype('int32') - for i, _ in enumerate(gt_ide): - if gt_ide[i] > -1: - cls_id = int(gt_class[i]) - start_idx = self.tid_start_idx_of_cls_ids[data_name][cls_id] - gt_ide[i] += start_idx - - mot_rec = { - 'im_file': img_file, - 'im_id': img_index, - } if 'image' in self.data_fields else {} - - gt_rec = { - 'gt_class': gt_class, - 'gt_score': gt_score, - 'gt_bbox': gt_bbox, - 'gt_ide': gt_ide, - } - - for k, v in gt_rec.items(): - if k in self.data_fields: - mot_rec[k] = v - - records.append(mot_rec) - if self.sample_num > 0 and img_index >= self.sample_num: - break - assert len(records) > 0, 'not found any mot record in %s' % ( - self.image_lists) - self.roidbs, self.cname2cid = records, cname2cid - - -@register -@serializable -class MOTImageFolder(DetDataset): - """ - Load MOT dataset with MOT format from image folder or video . - Args: - video_file (str): path of the video file, default ''. - frame_rate (int): frame rate of the video, use cv2 VideoCapture if not set. - dataset_dir (str): root directory for dataset. - keep_ori_im (bool): whether to keep original image, default False. - Set True when used during MOT model inference while saving - images or video, or used in DeepSORT. 
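[Editor's note] The MCMOT bookkeeping above makes track ids globally unique per class: each sub-dataset reports its max id per class, and start indices accumulate across sub-datasets. A minimal sketch with illustrative counts:

```python
from collections import defaultdict

tid_num = {'setA': {0: 5, 1: 3}, 'setB': {0: 2}}   # max ids per class
start_idx = defaultdict(dict)
last = defaultdict(int)
for name, per_cls in tid_num.items():
    for cls_id, id_num in per_cls.items():
        start_idx[name][cls_id] = last[cls_id]     # offset for this subset
        last[cls_id] += id_num
print(dict(start_idx))   # {'setA': {0: 0, 1: 0}, 'setB': {0: 5}}
```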
- """ - - def __init__(self, - video_file=None, - frame_rate=-1, - dataset_dir=None, - data_root=None, - image_dir=None, - sample_num=-1, - keep_ori_im=False, - anno_path=None, - **kwargs): - super(MOTImageFolder, self).__init__( - dataset_dir, image_dir, sample_num=sample_num) - self.video_file = video_file - self.data_root = data_root - self.keep_ori_im = keep_ori_im - self._imid2path = {} - self.roidbs = None - self.frame_rate = frame_rate - self.anno_path = anno_path - - def check_or_download_dataset(self): - return - - def parse_dataset(self, ): - if not self.roidbs: - if self.video_file is None: - self.frame_rate = 30 # set as default if infer image folder - self.roidbs = self._load_images() - else: - self.roidbs = self._load_video_images() - - def _load_video_images(self): - if self.frame_rate == -1: - # if frame_rate is not set for video, use cv2.VideoCapture - cap = cv2.VideoCapture(self.video_file) - self.frame_rate = int(cap.get(cv2.CAP_PROP_FPS)) - - extension = self.video_file.split('.')[-1] - output_path = self.video_file.replace('.{}'.format(extension), '') - frames_path = video2frames(self.video_file, output_path, - self.frame_rate) - self.video_frames = sorted( - glob.glob(os.path.join(frames_path, '*.png'))) - - self.video_length = len(self.video_frames) - logger.info('Length of the video: {:d} frames.'.format( - self.video_length)) - ct = 0 - records = [] - for image in self.video_frames: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = {'im_id': np.array([ct]), 'im_file': image} - if self.keep_ori_im: - rec.update({'keep_ori_im': 1}) - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def _find_images(self): - image_dir = self.image_dir - if not isinstance(image_dir, Sequence): - image_dir = [image_dir] - images = [] - for im_dir in image_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.dataset_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._find_images() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - rec = {'im_id': np.array([ct]), 'im_file': image} - if self.keep_ori_im: - rec.update({'keep_ori_im': 1}) - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def set_images(self, images): - self.image_dir = images - self.roidbs = self._load_images() - - def set_video(self, video_file, frame_rate): - # update video_file and frame_rate by command line of tools/infer_mot.py - self.video_file = video_file - self.frame_rate = frame_rate - assert os.path.isfile(self.video_file) and _is_valid_video(self.video_file), \ - "wrong or unsupported file format: {}".format(self.video_file) - self.roidbs = self._load_video_images() - - def get_anno(self): - return self.anno_path - - -def _is_valid_video(f, extensions=('.mp4', '.avi', '.mov', '.rmvb', 'flv')): - return f.lower().endswith(extensions) - - -def video2frames(video_path, outpath, frame_rate, **kargs): - def _dict2str(kargs): - cmd_str = '' - for k, v in kargs.items(): - cmd_str += (' ' + str(k) + ' ' + str(v)) 
- return cmd_str - - ffmpeg = ['ffmpeg ', ' -y -loglevel ', ' error '] - vid_name = os.path.basename(video_path).split('.')[0] - out_full_path = os.path.join(outpath, vid_name) - - if not os.path.exists(out_full_path): - os.makedirs(out_full_path) - - # video file name - outformat = os.path.join(out_full_path, '%08d.png') - - cmd = ffmpeg - cmd = ffmpeg + [ - ' -i ', video_path, ' -r ', str(frame_rate), ' -f image2 ', outformat - ] - cmd = ''.join(cmd) + _dict2str(kargs) - - if os.system(cmd) != 0: - raise RuntimeError('ffmpeg process video: {} error'.format(video_path)) - sys.exit(-1) - - sys.stdout.flush() - return out_full_path - - -def mot_label(): - labels_map = {'person': 0} - return labels_map - - -def visdrone_mcmot_label(): - labels_map = { - 'pedestrian': 0, - 'people': 1, - 'bicycle': 2, - 'car': 3, - 'van': 4, - 'truck': 5, - 'tricycle': 6, - 'awning-tricycle': 7, - 'bus': 8, - 'motor': 9, - } - return labels_map diff --git a/pdfdet/models/Paddle/ppdet/data/source/pose3d_cmb.py b/pdfdet/models/Paddle/ppdet/data/source/pose3d_cmb.py deleted file mode 100644 index 06dbdd9..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/pose3d_cmb.py +++ /dev/null @@ -1,380 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import cv2 -import numpy as np -import json -import copy -import pycocotools -from pycocotools.coco import COCO -from .dataset import DetDataset -from ppdet.core.workspace import register, serializable -from paddle.io import Dataset - - -@serializable -class Pose3DDataset(DetDataset): - """Pose3D Dataset class. - - Args: - dataset_dir (str): Root path to the dataset. - anno_list (list of str): each of the element is a relative path to the annotation file. - image_dirs (list of str): each of path is a relative path where images are held. - transform (composed(operators)): A sequence of data transforms. - test_mode (bool): Store True when building test or - validation dataset. Default: False. 
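[Editor's note] `video2frames` above shells out to ffmpeg via `os.system`. A rough equivalent using `subprocess` for clarity; paths and rate are illustrative, and the output directory must exist first, as the original ensures with `os.makedirs`:

```python
import subprocess

subprocess.run(
    ['ffmpeg', '-y', '-loglevel', 'error',
     '-i', 'input.mp4',            # source video
     '-r', '25',                   # output frame rate
     '-f', 'image2', 'out/%08d.png'],
    check=True)                    # raises CalledProcessError on failure
```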
- 24 joints order: - 0-2: 'R_Ankle', 'R_Knee', 'R_Hip', - 3-5:'L_Hip', 'L_Knee', 'L_Ankle', - 6-8:'R_Wrist', 'R_Elbow', 'R_Shoulder', - 9-11:'L_Shoulder','L_Elbow','L_Wrist', - 12-14:'Neck','Top_of_Head','Pelvis', - 15-18:'Thorax','Spine','Jaw','Head', - 19-23:'Nose','L_Eye','R_Eye','L_Ear','R_Ear' - """ - - def __init__(self, - dataset_dir, - image_dirs, - anno_list, - transform=[], - num_joints=24, - test_mode=False): - super().__init__(dataset_dir, image_dirs, anno_list) - self.image_info = {} - self.ann_info = {} - self.num_joints = num_joints - - self.transform = transform - self.test_mode = test_mode - - self.img_ids = [] - self.dataset_dir = dataset_dir - self.image_dirs = image_dirs - self.anno_list = anno_list - - def get_mask(self, mvm_percent=0.3): - num_joints = self.num_joints - mjm_mask = np.ones((num_joints, 1)).astype(np.float32) - if self.test_mode == False: - pb = np.random.random_sample() - masked_num = int( - pb * mvm_percent * - num_joints) # at most x% of the joints could be masked - indices = np.random.choice( - np.arange(num_joints), replace=False, size=masked_num) - mjm_mask[indices, :] = 0.0 - # return mjm_mask - - num_joints = 10 - mvm_mask = np.ones((num_joints, 1)).astype(np.float) - if self.test_mode == False: - num_vertices = num_joints - pb = np.random.random_sample() - masked_num = int( - pb * mvm_percent * - num_vertices) # at most x% of the vertices could be masked - indices = np.random.choice( - np.arange(num_vertices), replace=False, size=masked_num) - mvm_mask[indices, :] = 0.0 - - mjm_mask = np.concatenate([mjm_mask, mvm_mask], axis=0) - return mjm_mask - - def filterjoints(self, x): - if self.num_joints == 24: - return x - elif self.num_joints == 14: - return x[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18], :] - elif self.num_joints == 17: - return x[ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19], :] - else: - raise ValueError( - "unsupported joint numbers, only [24 or 17 or 14] is supported!") - - def parse_dataset(self): - print("Loading annotations..., please wait") - self.annos = [] - im_id = 0 - self.human36m_num = 0 - for idx, annof in enumerate(self.anno_list): - img_prefix = os.path.join(self.dataset_dir, self.image_dirs[idx]) - dataf = os.path.join(self.dataset_dir, annof) - with open(dataf, 'r') as rf: - anno_data = json.load(rf) - annos = anno_data['data'] - new_annos = [] - print("{} has annos numbers: {}".format(dataf, len(annos))) - for anno in annos: - new_anno = {} - new_anno['im_id'] = im_id - im_id += 1 - imagename = anno['imageName'] - if imagename.startswith("COCO_train2014_"): - imagename = imagename[len("COCO_train2014_"):] - elif imagename.startswith("COCO_val2014_"): - imagename = imagename[len("COCO_val2014_"):] - imagename = os.path.join(img_prefix, imagename) - if not os.path.exists(imagename): - if "train2017" in imagename: - imagename = imagename.replace("train2017", - "val2017") - if not os.path.exists(imagename): - print("cannot find imagepath:{}".format( - imagename)) - continue - else: - print("cannot find imagepath:{}".format(imagename)) - continue - new_anno['imageName'] = imagename - if 'human3.6m' in imagename: - self.human36m_num += 1 - new_anno['bbox_center'] = anno['bbox_center'] - new_anno['bbox_scale'] = anno['bbox_scale'] - new_anno['joints_2d'] = np.array(anno[ - 'gt_keypoint_2d']).astype(np.float32) - if new_anno['joints_2d'].shape[0] == 49: - #if the joints_2d is in SPIN format(which generated by eft), choose the last 24 public joints - #for detail please refer: 
https://github.com/nkolot/SPIN/blob/master/constants.py - new_anno['joints_2d'] = new_anno['joints_2d'][25:] - new_anno['joints_3d'] = np.array(anno[ - 'pose3d'])[:, :3].astype(np.float32) - new_anno['mjm_mask'] = self.get_mask() - if not 'has_3d_joints' in anno: - new_anno['has_3d_joints'] = int(1) - new_anno['has_2d_joints'] = int(1) - else: - new_anno['has_3d_joints'] = int(anno['has_3d_joints']) - new_anno['has_2d_joints'] = int(anno['has_2d_joints']) - new_anno['joints_2d'] = self.filterjoints(new_anno[ - 'joints_2d']) - self.annos.append(new_anno) - del annos - - def get_temp_num(self): - """get temporal data number, like human3.6m""" - return self.human36m_num - - def __len__(self): - """Get dataset length.""" - return len(self.annos) - - def _get_imganno(self, idx): - """Get anno for a single image.""" - return self.annos[idx] - - def __getitem__(self, idx): - """Prepare image for training given the index.""" - records = copy.deepcopy(self._get_imganno(idx)) - imgpath = records['imageName'] - assert os.path.exists(imgpath), "cannot find image {}".format(imgpath) - records['image'] = cv2.imread(imgpath) - records['image'] = cv2.cvtColor(records['image'], cv2.COLOR_BGR2RGB) - records = self.transform(records) - return records - - def check_or_download_dataset(self): - alldatafind = True - for image_dir in self.image_dirs: - image_dir = os.path.join(self.dataset_dir, image_dir) - if not os.path.isdir(image_dir): - print("dataset [{}] is not found".format(image_dir)) - alldatafind = False - if not alldatafind: - raise ValueError( - "Some dataset is not valid and cannot download automatically now, please prepare the dataset first" - ) - - -@register -@serializable -class Keypoint3DMultiFramesDataset(Dataset): - """24 keypoints 3D dataset for pose estimation. - - each item is a list of images - - The dataset loads raw features and apply specified transforms - to return a dict containing the image tensors and other information. - - Args: - dataset_dir (str): Root path to the dataset. - image_dir (str): Path to a directory where images are held. - """ - - def __init__( - self, - dataset_dir, # 数据集根目录 - image_dir, # 图像文件夹 - p3d_dir, # 3D关键点文件夹 - json_path, - img_size, #图像resize大小 - num_frames, # 帧序列长度 - anno_path=None, ): - - self.dataset_dir = dataset_dir - self.image_dir = image_dir - self.p3d_dir = p3d_dir - self.json_path = json_path - self.img_size = img_size - self.num_frames = num_frames - self.anno_path = anno_path - - self.data_labels, self.mf_inds = self._generate_multi_frames_list() - - def _generate_multi_frames_list(self): - act_list = os.listdir(self.dataset_dir) # 动作列表 - count = 0 - mf_list = [] - annos_dict = {'images': [], 'annotations': [], 'act_inds': []} - for act in act_list: #对每个动作,生成帧序列 - if '.' 
in act: - continue - - json_path = os.path.join(self.dataset_dir, act, self.json_path) - with open(json_path, 'r') as j: - annos = json.load(j) - length = len(annos['images']) - for k, v in annos.items(): - if k in annos_dict: - annos_dict[k].extend(v) - annos_dict['act_inds'].extend([act] * length) - - mf = [[i + j + count for j in range(self.num_frames)] - for i in range(0, length - self.num_frames + 1)] - mf_list.extend(mf) - count += length - - print("total data number:", len(mf_list)) - return annos_dict, mf_list - - def __call__(self, *args, **kwargs): - return self - - def __getitem__(self, index): # 拿一个连续的序列 - inds = self.mf_inds[ - index] # 如[568, 569, 570, 571, 572, 573],长度为num_frames - - images = self.data_labels['images'] # all images - annots = self.data_labels['annotations'] # all annots - - act = self.data_labels['act_inds'][inds[0]] # 动作名(文件夹名) - - kps3d_list = [] - kps3d_vis_list = [] - names = [] - - h, w = 0, 0 - for ind in inds: # one image - height = float(images[ind]['height']) - width = float(images[ind]['width']) - name = images[ind]['file_name'] # 图像名称,带有后缀 - - kps3d_name = name.split('.')[0] + '.obj' - kps3d_path = os.path.join(self.dataset_dir, act, self.p3d_dir, - kps3d_name) - - joints, joints_vis = self.kps3d_process(kps3d_path) - joints_vis = np.array(joints_vis, dtype=np.float32) - - kps3d_list.append(joints) - kps3d_vis_list.append(joints_vis) - names.append(name) - - kps3d = np.array(kps3d_list) # (6, 24, 3),(num_frames, joints_num, 3) - kps3d_vis = np.array(kps3d_vis_list) - - # read image - imgs = [] - for name in names: - img_path = os.path.join(self.dataset_dir, act, self.image_dir, name) - - image = cv2.imread(img_path, cv2.IMREAD_COLOR | - cv2.IMREAD_IGNORE_ORIENTATION) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - - imgs.append(np.expand_dims(image, axis=0)) - - imgs = np.concatenate(imgs, axis=0) - imgs = imgs.astype( - np.float32) # (6, 1080, 1920, 3),(num_frames, h, w, c) - - # attention: 此时图像和标注是镜像的 - records = { - 'kps3d': kps3d, - 'kps3d_vis': kps3d_vis, - "image": imgs, - 'act': act, - 'names': names, - 'im_id': index - } - - return self.transform(records) - - def kps3d_process(self, kps3d_path): - count = 0 - kps = [] - kps_vis = [] - - with open(kps3d_path, 'r') as f: - lines = f.readlines() - for line in lines: - if line[0] == 'v': - kps.append([]) - line = line.strip('\n').split(' ')[1:] - for kp in line: - kps[-1].append(float(kp)) - count += 1 - - kps_vis.append([1, 1, 1]) - - kps = np.array(kps) # 52,3 - kps_vis = np.array(kps_vis) - - kps *= 10 # scale points - kps -= kps[[0], :] # set root point to zero - - kps = np.concatenate((kps[0:23], kps[[37]]), axis=0) # 24,3 - - kps *= 10 - - kps_vis = np.concatenate((kps_vis[0:23], kps_vis[[37]]), axis=0) # 24,3 - - return kps, kps_vis - - def __len__(self): - return len(self.mf_inds) - - def get_anno(self): - if self.anno_path is None: - return - return os.path.join(self.dataset_dir, self.anno_path) - - def check_or_download_dataset(self): - return - - def parse_dataset(self, ): - return - - def set_transform(self, transform): - self.transform = transform - - def set_epoch(self, epoch_id): - self._epoch = epoch_id - - def set_kwargs(self, **kwargs): - self.mixup_epoch = kwargs.get('mixup_epoch', -1) - self.cutmix_epoch = kwargs.get('cutmix_epoch', -1) - self.mosaic_epoch = kwargs.get('mosaic_epoch', -1) diff --git a/pdfdet/models/Paddle/ppdet/data/source/sniper_coco.py b/pdfdet/models/Paddle/ppdet/data/source/sniper_coco.py deleted file mode 100644 index 1b07e7a..0000000 --- 
a/pdfdet/models/Paddle/ppdet/data/source/sniper_coco.py +++ /dev/null @@ -1,194 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import cv2 -import json -import copy -import numpy as np - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -from ppdet.core.workspace import register, serializable -from ppdet.data.crop_utils.annotation_cropper import AnnoCropper -from .coco import COCODataSet -from .dataset import _make_dataset, _is_valid_file -from ppdet.utils.logger import setup_logger - -logger = setup_logger('sniper_coco_dataset') - - -@register -@serializable -class SniperCOCODataSet(COCODataSet): - """SniperCOCODataSet""" - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - proposals_file=None, - data_fields=['image'], - sample_num=-1, - load_crowd=False, - allow_empty=True, - empty_ratio=1., - is_trainset=True, - image_target_sizes=[2000, 1000], - valid_box_ratio_ranges=[[-1, 0.1],[0.08, -1]], - chip_target_size=500, - chip_target_stride=200, - use_neg_chip=False, - max_neg_num_per_im=8, - max_per_img=-1, - nms_thresh=0.5): - super(SniperCOCODataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - load_crowd=load_crowd, - allow_empty=allow_empty, - empty_ratio=empty_ratio - ) - self.proposals_file = proposals_file - self.proposals = None - self.anno_cropper = None - self.is_trainset = is_trainset - self.image_target_sizes = image_target_sizes - self.valid_box_ratio_ranges = valid_box_ratio_ranges - self.chip_target_size = chip_target_size - self.chip_target_stride = chip_target_stride - self.use_neg_chip = use_neg_chip - self.max_neg_num_per_im = max_neg_num_per_im - self.max_per_img = max_per_img - self.nms_thresh = nms_thresh - - - def parse_dataset(self): - if not hasattr(self, "roidbs"): - super(SniperCOCODataSet, self).parse_dataset() - if self.is_trainset: - self._parse_proposals() - self._merge_anno_proposals() - self.ori_roidbs = copy.deepcopy(self.roidbs) - self.init_anno_cropper() - self.roidbs = self.generate_chips_roidbs(self.roidbs, self.is_trainset) - - def set_proposals_file(self, file_path): - self.proposals_file = file_path - - def init_anno_cropper(self): - logger.info("Init AnnoCropper...") - self.anno_cropper = AnnoCropper( - image_target_sizes=self.image_target_sizes, - valid_box_ratio_ranges=self.valid_box_ratio_ranges, - chip_target_size=self.chip_target_size, - chip_target_stride=self.chip_target_stride, - use_neg_chip=self.use_neg_chip, - max_neg_num_per_im=self.max_neg_num_per_im, - max_per_img=self.max_per_img, - nms_thresh=self.nms_thresh - ) - - def generate_chips_roidbs(self, roidbs, is_trainset): - if is_trainset: - roidbs = self.anno_cropper.crop_anno_records(roidbs) - else: - roidbs = self.anno_cropper.crop_infer_anno_records(roidbs) - return roidbs - - def _parse_proposals(self): - if 
self.proposals_file: - self.proposals = {} - logger.info("Parse proposals file:{}".format(self.proposals_file)) - with open(self.proposals_file, 'r') as f: - proposals = json.load(f) - for prop in proposals: - image_id = prop["image_id"] - if image_id not in self.proposals: - self.proposals[image_id] = [] - x, y, w, h = prop["bbox"] - self.proposals[image_id].append([x, y, x + w, y + h]) - - def _merge_anno_proposals(self): - assert self.roidbs - if self.proposals and len(self.proposals.keys()) > 0: - logger.info("merge proposals to annos") - for id, record in enumerate(self.roidbs): - image_id = int(record["im_id"]) - if image_id not in self.proposals.keys(): - logger.info("image id :{} no proposals".format(image_id)) - record["proposals"] = np.array(self.proposals.get(image_id, []), dtype=np.float32) - self.roidbs[id] = record - - def get_ori_roidbs(self): - if not hasattr(self, "ori_roidbs"): - return None - return self.ori_roidbs - - def get_roidbs(self): - if not hasattr(self, "roidbs"): - self.parse_dataset() - return self.roidbs - - def set_roidbs(self, roidbs): - self.roidbs = roidbs - - def check_or_download_dataset(self): - return - - def _parse(self): - image_dir = self.image_dir - if not isinstance(image_dir, Sequence): - image_dir = [image_dir] - images = [] - for im_dir in image_dir: - if os.path.isdir(im_dir): - im_dir = os.path.join(self.dataset_dir, im_dir) - images.extend(_make_dataset(im_dir)) - elif os.path.isfile(im_dir) and _is_valid_file(im_dir): - images.append(im_dir) - return images - - def _load_images(self): - images = self._parse() - ct = 0 - records = [] - for image in images: - assert image != '' and os.path.isfile(image), \ - "Image {} not found".format(image) - if self.sample_num > 0 and ct >= self.sample_num: - break - im = cv2.imread(image) - h, w, c = im.shape - rec = {'im_id': np.array([ct]), 'im_file': image, "h": h, "w": w} - self._imid2path[ct] = image - ct += 1 - records.append(rec) - assert len(records) > 0, "No image file found" - return records - - def get_imid2path(self): - return self._imid2path - - def set_images(self, images): - self._imid2path = {} - self.image_dir = images - self.roidbs = self._load_images() - diff --git a/pdfdet/models/Paddle/ppdet/data/source/voc.py b/pdfdet/models/Paddle/ppdet/data/source/voc.py deleted file mode 100644 index 2f10358..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/voc.py +++ /dev/null @@ -1,234 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np - -import xml.etree.ElementTree as ET - -from ppdet.core.workspace import register, serializable - -from .dataset import DetDataset - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class VOCDataSet(DetDataset): - """ - Load dataset with PascalVOC format. - - Notes: - `anno_path` must contains xml file and image file path for annotations. - - Args: - dataset_dir (str): root directory for dataset. 
- image_dir (str): directory for images. - anno_path (str): voc annotation file path. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - label_list (str): if use_default_label is False, will load - mapping between category and class index. - allow_empty (bool): whether to load empty entry. False as default - empty_ratio (float): the ratio of empty record number to total - record's, if empty_ratio is out of [0. ,1.), do not sample the - records and use all the empty entries. 1. as default - repeat (int): repeat times for dataset, use in benchmark. - """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - label_list=None, - allow_empty=False, - empty_ratio=1., - repeat=1): - super(VOCDataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - repeat=repeat) - self.label_list = label_list - self.allow_empty = allow_empty - self.empty_ratio = empty_ratio - - def _sample_empty(self, records, num): - # if empty_ratio is out of [0. ,1.), do not sample the records - if self.empty_ratio < 0. or self.empty_ratio >= 1.: - return records - import random - sample_num = min( - int(num * self.empty_ratio / (1 - self.empty_ratio)), len(records)) - records = random.sample(records, sample_num) - return records - - def parse_dataset(self, ): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - # mapping category name to class id - # first_class:0, second_class:1, ... - records = [] - empty_records = [] - ct = 0 - cname2cid = {} - if self.label_list: - label_path = os.path.join(self.dataset_dir, self.label_list) - if not os.path.exists(label_path): - raise ValueError("label_list {} does not exists".format( - label_path)) - with open(label_path, 'r') as fr: - label_id = 0 - for line in fr.readlines(): - cname2cid[line.strip()] = label_id - label_id += 1 - else: - cname2cid = pascalvoc_label() - - with open(anno_path, 'r') as fr: - while True: - line = fr.readline() - if not line: - break - img_file, xml_file = [os.path.join(image_dir, x) \ - for x in line.strip().split()[:2]] - if not os.path.exists(img_file): - logger.warning( - 'Illegal image file: {}, and it will be ignored'.format( - img_file)) - continue - if not os.path.isfile(xml_file): - logger.warning( - 'Illegal xml file: {}, and it will be ignored'.format( - xml_file)) - continue - tree = ET.parse(xml_file) - if tree.find('id') is None: - im_id = np.array([ct]) - else: - im_id = np.array([int(tree.find('id').text)]) - - objs = tree.findall('object') - im_w = float(tree.find('size').find('width').text) - im_h = float(tree.find('size').find('height').text) - if im_w < 0 or im_h < 0: - logger.warning( - 'Illegal width: {} or height: {} in annotation, ' - 'and {} will be ignored'.format(im_w, im_h, xml_file)) - continue - - num_bbox, i = len(objs), 0 - gt_bbox = np.zeros((num_bbox, 4), dtype=np.float32) - gt_class = np.zeros((num_bbox, 1), dtype=np.int32) - gt_score = np.zeros((num_bbox, 1), dtype=np.float32) - difficult = np.zeros((num_bbox, 1), dtype=np.int32) - for obj in objs: - cname = obj.find('name').text - - # user dataset may not contain difficult field - _difficult = obj.find('difficult') - _difficult = int( - _difficult.text) if _difficult is not None else 0 - - x1 = float(obj.find('bndbox').find('xmin').text) - y1 = 
float(obj.find('bndbox').find('ymin').text) - x2 = float(obj.find('bndbox').find('xmax').text) - y2 = float(obj.find('bndbox').find('ymax').text) - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(im_w - 1, x2) - y2 = min(im_h - 1, y2) - if x2 > x1 and y2 > y1: - gt_bbox[i, :] = [x1, y1, x2, y2] - gt_class[i, 0] = cname2cid[cname] - gt_score[i, 0] = 1. - difficult[i, 0] = _difficult - i += 1 - else: - logger.warning( - 'Found an invalid bbox in annotations: xml_file: {}' - ', x1: {}, y1: {}, x2: {}, y2: {}.'.format( - xml_file, x1, y1, x2, y2)) - gt_bbox = gt_bbox[:i, :] - gt_class = gt_class[:i, :] - gt_score = gt_score[:i, :] - difficult = difficult[:i, :] - - voc_rec = { - 'im_file': img_file, - 'im_id': im_id, - 'h': im_h, - 'w': im_w - } if 'image' in self.data_fields else {} - - gt_rec = { - 'gt_class': gt_class, - 'gt_score': gt_score, - 'gt_bbox': gt_bbox, - 'difficult': difficult - } - for k, v in gt_rec.items(): - if k in self.data_fields: - voc_rec[k] = v - - if len(objs) == 0: - empty_records.append(voc_rec) - else: - records.append(voc_rec) - - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert ct > 0, 'not found any voc record in %s' % (self.anno_path) - logger.debug('{} samples in file {}'.format(ct, anno_path)) - if self.allow_empty and len(empty_records) > 0: - empty_records = self._sample_empty(empty_records, len(records)) - records += empty_records - self.roidbs, self.cname2cid = records, cname2cid - - def get_label_list(self): - return os.path.join(self.dataset_dir, self.label_list) - - -def pascalvoc_label(): - labels_map = { - 'aeroplane': 0, - 'bicycle': 1, - 'bird': 2, - 'boat': 3, - 'bottle': 4, - 'bus': 5, - 'car': 6, - 'cat': 7, - 'chair': 8, - 'cow': 9, - 'diningtable': 10, - 'dog': 11, - 'horse': 12, - 'motorbike': 13, - 'person': 14, - 'pottedplant': 15, - 'sheep': 16, - 'sofa': 17, - 'train': 18, - 'tvmonitor': 19 - } - return labels_map diff --git a/pdfdet/models/Paddle/ppdet/data/source/widerface.py b/pdfdet/models/Paddle/ppdet/data/source/widerface.py deleted file mode 100644 index a17c2aa..0000000 --- a/pdfdet/models/Paddle/ppdet/data/source/widerface.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np - -from ppdet.core.workspace import register, serializable -from .dataset import DetDataset - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class WIDERFaceDataSet(DetDataset): - """ - Load WiderFace records with 'anno_path' - - Args: - dataset_dir (str): root directory for dataset. - image_dir (str): directory for images. - anno_path (str): WiderFace annotation data. - data_fields (list): key name of data dictionary, at least have 'image'. - sample_num (int): number of samples to load, -1 means all. - with_lmk (bool): whether to load face landmark keypoint labels. 
- """ - - def __init__(self, - dataset_dir=None, - image_dir=None, - anno_path=None, - data_fields=['image'], - sample_num=-1, - with_lmk=False): - super(WIDERFaceDataSet, self).__init__( - dataset_dir=dataset_dir, - image_dir=image_dir, - anno_path=anno_path, - data_fields=data_fields, - sample_num=sample_num, - with_lmk=with_lmk) - self.anno_path = anno_path - self.sample_num = sample_num - self.roidbs = None - self.cname2cid = None - self.with_lmk = with_lmk - - def parse_dataset(self): - anno_path = os.path.join(self.dataset_dir, self.anno_path) - image_dir = os.path.join(self.dataset_dir, self.image_dir) - - txt_file = anno_path - - records = [] - ct = 0 - file_lists = self._load_file_list(txt_file) - cname2cid = widerface_label() - - for item in file_lists: - im_fname = item[0] - im_id = np.array([ct]) - gt_bbox = np.zeros((len(item) - 1, 4), dtype=np.float32) - gt_class = np.zeros((len(item) - 1, 1), dtype=np.int32) - gt_lmk_labels = np.zeros((len(item) - 1, 10), dtype=np.float32) - lmk_ignore_flag = np.zeros((len(item) - 1, 1), dtype=np.int32) - for index_box in range(len(item)): - if index_box < 1: - continue - gt_bbox[index_box - 1] = item[index_box][0] - if self.with_lmk: - gt_lmk_labels[index_box - 1] = item[index_box][1] - lmk_ignore_flag[index_box - 1] = item[index_box][2] - im_fname = os.path.join(image_dir, - im_fname) if image_dir else im_fname - widerface_rec = { - 'im_file': im_fname, - 'im_id': im_id, - } if 'image' in self.data_fields else {} - gt_rec = { - 'gt_bbox': gt_bbox, - 'gt_class': gt_class, - } - for k, v in gt_rec.items(): - if k in self.data_fields: - widerface_rec[k] = v - if self.with_lmk: - widerface_rec['gt_keypoint'] = gt_lmk_labels - widerface_rec['keypoint_ignore'] = lmk_ignore_flag - - if len(item) != 0: - records.append(widerface_rec) - - ct += 1 - if self.sample_num > 0 and ct >= self.sample_num: - break - assert len(records) > 0, 'not found any widerface in %s' % (anno_path) - logger.debug('{} samples in file {}'.format(ct, anno_path)) - self.roidbs, self.cname2cid = records, cname2cid - - def _load_file_list(self, input_txt): - with open(input_txt, 'r') as f_dir: - lines_input_txt = f_dir.readlines() - - file_dict = {} - num_class = 0 - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for i in range(len(lines_input_txt)): - line_txt = lines_input_txt[i].strip('\n\t\r') - split_str = line_txt.split(' ') - if len(split_str) == 1: - img_file_name = os.path.split(split_str[0])[1] - split_txt = img_file_name.split('.') - if len(split_txt) < 2: - continue - elif split_txt[-1] in exts: - if i != 0: - num_class += 1 - file_dict[num_class] = [line_txt] - else: - if len(line_txt) <= 6: - continue - result_boxs = [] - xmin = float(split_str[0]) - ymin = float(split_str[1]) - w = float(split_str[2]) - h = float(split_str[3]) - # Filter out wrong labels - if w < 0 or h < 0: - logger.warning('Illegal box with w: {}, h: {} in ' - 'img: {}, and it will be ignored'.format( - w, h, file_dict[num_class][0])) - continue - xmin = max(0, xmin) - ymin = max(0, ymin) - xmax = xmin + w - ymax = ymin + h - gt_bbox = [xmin, ymin, xmax, ymax] - result_boxs.append(gt_bbox) - if self.with_lmk: - assert len(split_str) > 18, 'When `with_lmk=True`, the number' \ - 'of characters per line in the annotation file should' \ - 'exceed 18.' 
- lmk0_x = float(split_str[5]) - lmk0_y = float(split_str[6]) - lmk1_x = float(split_str[8]) - lmk1_y = float(split_str[9]) - lmk2_x = float(split_str[11]) - lmk2_y = float(split_str[12]) - lmk3_x = float(split_str[14]) - lmk3_y = float(split_str[15]) - lmk4_x = float(split_str[17]) - lmk4_y = float(split_str[18]) - lmk_ignore_flag = 0 if lmk0_x == -1 else 1 - gt_lmk_label = [ - lmk0_x, lmk0_y, lmk1_x, lmk1_y, lmk2_x, lmk2_y, lmk3_x, - lmk3_y, lmk4_x, lmk4_y - ] - result_boxs.append(gt_lmk_label) - result_boxs.append(lmk_ignore_flag) - file_dict[num_class].append(result_boxs) - - return list(file_dict.values()) - - -def widerface_label(): - labels_map = {'face': 0} - return labels_map diff --git a/pdfdet/models/Paddle/ppdet/data/transform/__init__.py b/pdfdet/models/Paddle/ppdet/data/transform/__init__.py deleted file mode 100644 index d45cf47..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# from . import operators -# from . import batch_operators -# from . import keypoint_operators -# from . import mot_operators -# from . import rotated_operators -# from . import keypoints_3d_operators -# from . import culane_operators - -from .operators import Decode,Resize,NormalizeImage,Permute -from .batch_operators import PadBatch -# from .keypoint_operators import * -# from .mot_operators import * -# from .rotated_operators import * -# from .keypoints_3d_operators import * -# from .culane_operators import * - -# __all__ = [] -# __all__ += registered_ops -# __all__ += keypoint_operators.__all__ -# __all__ += mot_operators.__all__ -# __all__ += culane_operators.__all__ diff --git a/pdfdet/models/Paddle/ppdet/data/transform/atss_assigner.py b/pdfdet/models/Paddle/ppdet/data/transform/atss_assigner.py deleted file mode 100644 index 686b140..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/atss_assigner.py +++ /dev/null @@ -1,421 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
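The next removed file, atss_assigner.py, builds ATSS label assignment on top of a NumPy bbox_overlaps helper. For orientation, here is a minimal pairwise-IoU sketch over corner-format [x1, y1, x2, y2] boxes; this is an independent illustration, not the deleted implementation, which additionally supports batched inputs and the iof/giou/diou modes:

import numpy as np

def iou_matrix(boxes_a, boxes_b, eps=1e-6):
    """Pairwise IoU between (m, 4) and (n, 4) corner-format box arrays."""
    lt = np.maximum(boxes_a[:, None, :2], boxes_b[None, :, :2])  # (m, n, 2) top-left of intersection
    rb = np.minimum(boxes_a[:, None, 2:], boxes_b[None, :, 2:])  # (m, n, 2) bottom-right of intersection
    wh = np.clip(rb - lt, 0, None)                               # clamp non-overlapping pairs to zero
    inter = wh[..., 0] * wh[..., 1]
    area_a = (boxes_a[:, 2] - boxes_a[:, 0]) * (boxes_a[:, 3] - boxes_a[:, 1])
    area_b = (boxes_b[:, 2] - boxes_b[:, 0]) * (boxes_b[:, 3] - boxes_b[:, 1])
    union = area_a[:, None] + area_b[None, :] - inter
    return inter / np.maximum(union, eps)

# e.g. iou_matrix(np.array([[0., 0., 10., 10.]]), np.array([[5., 5., 15., 15.]]))
# -> [[25 / 175]] ≈ [[0.1429]]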
- -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/atss_assigner.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -def bbox_overlaps(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): - """Calculate overlap between two set of bboxes. - If ``is_aligned `` is ``False``, then calculate the overlaps between each - bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned - pair of bboxes1 and bboxes2. - Args: - bboxes1 (Tensor): shape (B, m, 4) in format or empty. - bboxes2 (Tensor): shape (B, n, 4) in format or empty. - B indicates the batch dim, in shape (B1, B2, ..., Bn). - If ``is_aligned `` is ``True``, then m and n must be equal. - mode (str): "iou" (intersection over union) or "iof" (intersection over - foreground). - is_aligned (bool, optional): If True, then m and n must be equal. - Default False. - eps (float, optional): A value added to the denominator for numerical - stability. Default 1e-6. - Returns: - Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,) - """ - assert mode in ['iou', 'iof', 'giou', 'diou'], 'Unsupported mode {}'.format( - mode) - # Either the boxes are empty or the length of boxes's last dimenstion is 4 - assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0) - assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0) - - # Batch dim must be the same - # Batch dim: (B1, B2, ... Bn) - assert bboxes1.shape[:-2] == bboxes2.shape[:-2] - batch_shape = bboxes1.shape[:-2] - - rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0 - cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0 - if is_aligned: - assert rows == cols - - if rows * cols == 0: - if is_aligned: - return np.random.random(batch_shape + (rows, )) - else: - return np.random.random(batch_shape + (rows, cols)) - - area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * ( - bboxes1[..., 3] - bboxes1[..., 1]) - area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * ( - bboxes2[..., 3] - bboxes2[..., 1]) - - if is_aligned: - lt = np.maximum(bboxes1[..., :2], bboxes2[..., :2]) # [B, rows, 2] - rb = np.minimum(bboxes1[..., 2:], bboxes2[..., 2:]) # [B, rows, 2] - - wh = (rb - lt).clip(min=0) # [B, rows, 2] - overlap = wh[..., 0] * wh[..., 1] - - if mode in ['iou', 'giou']: - union = area1 + area2 - overlap - else: - union = area1 - if mode == 'giou': - enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) - enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) - if mode == 'diou': - enclosed_lt = np.minimum(bboxes1[..., :2], bboxes2[..., :2]) - enclosed_rb = np.maximum(bboxes1[..., 2:], bboxes2[..., 2:]) - b1_x1, b1_y1 = bboxes1[..., 0], bboxes1[..., 1] - b1_x2, b1_y2 = bboxes1[..., 2], bboxes1[..., 3] - b2_x1, b2_y1 = bboxes2[..., 0], bboxes2[..., 1] - b2_x2, b2_y2 = bboxes2[..., 2], bboxes2[..., 3] - else: - lt = np.maximum(bboxes1[..., :, None, :2], - bboxes2[..., None, :, :2]) # [B, rows, cols, 2] - rb = np.minimum(bboxes1[..., :, None, 2:], - bboxes2[..., None, :, 2:]) # [B, rows, cols, 2] - - wh = (rb - lt).clip(min=0) # [B, rows, cols, 2] - overlap = wh[..., 0] * wh[..., 1] - - if mode in ['iou', 'giou']: - union = area1[..., None] + area2[..., None, :] - overlap - else: - union = area1[..., None] - if mode == 'giou': - enclosed_lt = np.minimum(bboxes1[..., :, None, :2], - bboxes2[..., None, :, :2]) - enclosed_rb = np.maximum(bboxes1[..., :, 
None, 2:], - bboxes2[..., None, :, 2:]) - if mode == 'diou': - enclosed_lt = np.minimum(bboxes1[..., :, None, :2], - bboxes2[..., None, :, :2]) - enclosed_rb = np.maximum(bboxes1[..., :, None, 2:], - bboxes2[..., None, :, 2:]) - b1_x1, b1_y1 = bboxes1[..., :, None, 0], bboxes1[..., :, None, 1] - b1_x2, b1_y2 = bboxes1[..., :, None, 2], bboxes1[..., :, None, 3] - b2_x1, b2_y1 = bboxes2[..., None, :, 0], bboxes2[..., None, :, 1] - b2_x2, b2_y2 = bboxes2[..., None, :, 2], bboxes2[..., None, :, 3] - - eps = np.array([eps]) - union = np.maximum(union, eps) - ious = overlap / union - if mode in ['iou', 'iof']: - return ious - # calculate gious - if mode in ['giou']: - enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) - enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] - enclose_area = np.maximum(enclose_area, eps) - gious = ious - (enclose_area - union) / enclose_area - return gious - if mode in ['diou']: - left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4 - right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4 - rho2 = left + right - enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0) - enclose_c = enclose_wh[..., 0]**2 + enclose_wh[..., 1]**2 - enclose_c = np.maximum(enclose_c, eps) - dious = ious - rho2 / enclose_c - return dious - - -def topk_(input, k, axis=1, largest=True): - x = -input if largest else input - if axis == 0: - row_index = np.arange(input.shape[1 - axis]) - if k == x.shape[0]: # argpartition requires index < len(input) - topk_index = np.argpartition(x, k - 1, axis=axis)[0:k, :] - else: - topk_index = np.argpartition(x, k, axis=axis)[0:k, :] - - topk_data = x[topk_index, row_index] - - topk_index_sort = np.argsort(topk_data, axis=axis) - topk_data_sort = topk_data[topk_index_sort, row_index] - topk_index_sort = topk_index[0:k, :][topk_index_sort, row_index] - else: - column_index = np.arange(x.shape[1 - axis])[:, None] - topk_index = np.argpartition(x, k, axis=axis)[:, 0:k] - topk_data = x[column_index, topk_index] - topk_data = -topk_data if largest else topk_data - topk_index_sort = np.argsort(topk_data, axis=axis) - topk_data_sort = topk_data[column_index, topk_index_sort] - topk_index_sort = topk_index[:, 0:k][column_index, topk_index_sort] - - return topk_data_sort, topk_index_sort - - -class ATSSAssigner(object): - """Assign a corresponding gt bbox or background to each bbox. - - Each proposals will be assigned with `0` or a positive integer - indicating the ground truth index. - - - 0: negative sample, no assigned gt - - positive integer: positive sample, index (1-based) of assigned gt - - Args: - topk (float): number of bbox selected in each level - """ - - def __init__(self, topk=9): - self.topk = topk - - def __call__(self, - bboxes, - num_level_bboxes, - gt_bboxes, - gt_bboxes_ignore=None, - gt_labels=None): - """Assign gt to bboxes. - The assignment is done in following steps - 1. compute iou between all bbox (bbox of all pyramid levels) and gt - 2. compute center distance between all bbox and gt - 3. on each pyramid level, for each gt, select k bbox whose center - are closest to the gt center, so we total select k*l bbox as - candidates for each gt - 4. get corresponding iou for the these candidates, and compute the - mean and std, set mean + std as the iou threshold - 5. select these candidates whose iou are greater than or equal to - the threshold as postive - 6. limit the positive sample's center in gt - Args: - bboxes (np.array): Bounding boxes to be assigned, shape(n, 4). 
- num_level_bboxes (List): num of bboxes in each level - gt_bboxes (np.array): Groundtruth boxes, shape (k, 4). - gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are - labelled as `ignored`, e.g., crowd boxes in COCO. - gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ). - """ - bboxes = bboxes[:, :4] - num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] - - # assign 0 by default - assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) - - if num_gt == 0 or num_bboxes == 0: - # No ground truth or boxes, return empty assignment - max_overlaps = np.zeros((num_bboxes, )) - if num_gt == 0: - # No truth, assign everything to background - assigned_gt_inds[:] = 0 - if not np.any(gt_labels): - assigned_labels = None - else: - assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) - return assigned_gt_inds, max_overlaps - - # compute iou between all bbox and gt - overlaps = bbox_overlaps(bboxes, gt_bboxes) - # compute center distance between all bbox and gt - gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 - gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 - gt_points = np.stack((gt_cx, gt_cy), axis=1) - - bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 - bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 - bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) - - distances = np.sqrt( - np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) - .sum(-1)) - - # Selecting candidates based on the center distance - candidate_idxs = [] - start_idx = 0 - for bboxes_per_level in num_level_bboxes: - # on each pyramid level, for each gt, - # select k bbox whose center are closest to the gt center - end_idx = start_idx + bboxes_per_level - distances_per_level = distances[start_idx:end_idx, :] - selectable_k = min(self.topk, bboxes_per_level) - _, topk_idxs_per_level = topk_( - distances_per_level, selectable_k, axis=0, largest=False) - candidate_idxs.append(topk_idxs_per_level + start_idx) - start_idx = end_idx - candidate_idxs = np.concatenate(candidate_idxs, axis=0) - - # get corresponding iou for the these candidates, and compute the - # mean and std, set mean + std as the iou threshold - candidate_overlaps = overlaps[candidate_idxs, np.arange(num_gt)] - overlaps_mean_per_gt = candidate_overlaps.mean(0) - overlaps_std_per_gt = candidate_overlaps.std(0) - overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt - - is_pos = candidate_overlaps >= overlaps_thr_per_gt[None, :] - - # limit the positive sample's center in gt - for gt_idx in range(num_gt): - candidate_idxs[:, gt_idx] += gt_idx * num_bboxes - ep_bboxes_cx = np.broadcast_to( - bboxes_cx.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) - ep_bboxes_cy = np.broadcast_to( - bboxes_cy.reshape(1, -1), [num_gt, num_bboxes]).reshape(-1) - candidate_idxs = candidate_idxs.reshape(-1) - - # calculate the left, top, right, bottom distance between positive - # bbox center and gt side - l_ = ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 0] - t_ = ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - gt_bboxes[:, 1] - r_ = gt_bboxes[:, 2] - ep_bboxes_cx[candidate_idxs].reshape(-1, num_gt) - b_ = gt_bboxes[:, 3] - ep_bboxes_cy[candidate_idxs].reshape(-1, num_gt) - is_in_gts = np.stack([l_, t_, r_, b_], axis=1).min(axis=1) > 0.01 - is_pos = is_pos & is_in_gts - - # if an anchor box is assigned to multiple gts, - # the one with the highest IoU will be selected. 
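# review note: the block below implements that rule by scattering the
# candidates' IoUs into a flat (num_gt * num_bboxes,) buffer initialised to
# -inf, then reshaping back to (num_bboxes, num_gt): argmax over the gt axis
# picks, for each anchor, the positive gt with the highest IoU, while anchors
# whose row is all -inf remain background (assigned_gt_inds stays 0).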
- overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) - index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] - overlaps_inf[index] = overlaps.T.reshape(-1)[index] - overlaps_inf = overlaps_inf.reshape(num_gt, -1).T - - max_overlaps = overlaps_inf.max(axis=1) - argmax_overlaps = overlaps_inf.argmax(axis=1) - assigned_gt_inds[max_overlaps != - -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 - - return assigned_gt_inds, max_overlaps - - def get_vlr_region(self, - bboxes, - num_level_bboxes, - gt_bboxes, - gt_bboxes_ignore=None, - gt_labels=None): - """get vlr region for ld distillation. - Args: - bboxes (np.array): Bounding boxes to be assigned, shape(n, 4). - num_level_bboxes (List): num of bboxes in each level - gt_bboxes (np.array): Groundtruth boxes, shape (k, 4). - gt_bboxes_ignore (np.array, optional): Ground truth bboxes that are - labelled as `ignored`, e.g., crowd boxes in COCO. - gt_labels (np.array, optional): Label of gt_bboxes, shape (k, ). - """ - bboxes = bboxes[:, :4] - - num_gt, num_bboxes = gt_bboxes.shape[0], bboxes.shape[0] - - # compute iou between all bbox and gt - overlaps = bbox_overlaps(bboxes, gt_bboxes) - - # compute diou between all bbox and gt - diou = bbox_overlaps(bboxes, gt_bboxes, mode='diou') - - # assign 0 by default - assigned_gt_inds = np.zeros((num_bboxes, ), dtype=np.int64) - - vlr_region_iou = (assigned_gt_inds + 0).astype(np.float32) - - if num_gt == 0 or num_bboxes == 0: - # No ground truth or boxes, return empty assignment - max_overlaps = np.zeros((num_bboxes, )) - if num_gt == 0: - # No truth, assign everything to background - assigned_gt_inds[:] = 0 - if not np.any(gt_labels): - assigned_labels = None - else: - assigned_labels = -np.ones((num_bboxes, ), dtype=np.int64) - return assigned_gt_inds, max_overlaps - - # compute center distance between all bbox and gt - gt_cx = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 - gt_cy = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 - gt_points = np.stack((gt_cx, gt_cy), axis=1) - - bboxes_cx = (bboxes[:, 0] + bboxes[:, 2]) / 2.0 - bboxes_cy = (bboxes[:, 1] + bboxes[:, 3]) / 2.0 - bboxes_points = np.stack((bboxes_cx, bboxes_cy), axis=1) - - distances = np.sqrt( - np.power((bboxes_points[:, None, :] - gt_points[None, :, :]), 2) - .sum(-1)) - - # Selecting candidates based on the center distance - candidate_idxs = [] - candidate_idxs_t = [] - start_idx = 0 - for bboxes_per_level in num_level_bboxes: - # on each pyramid level, for each gt, - # select k bbox whose center are closest to the gt center - end_idx = start_idx + bboxes_per_level - distances_per_level = distances[start_idx:end_idx, :] - selectable_t = min(self.topk, bboxes_per_level) - selectable_k = bboxes_per_level #k for all - _, topt_idxs_per_level = topk_( - distances_per_level, selectable_t, axis=0, largest=False) - _, topk_idxs_per_level = topk_( - distances_per_level, selectable_k, axis=0, largest=False) - candidate_idxs_t.append(topt_idxs_per_level + start_idx) - candidate_idxs.append(topk_idxs_per_level + start_idx) - start_idx = end_idx - - candidate_idxs_t = np.concatenate(candidate_idxs_t, axis=0) - candidate_idxs = np.concatenate(candidate_idxs, axis=0) - - # get corresponding iou for the these candidates, and compute the - # mean and std, set mean + std as the iou threshold - candidate_overlaps_t = overlaps[candidate_idxs_t, np.arange(num_gt)] - - # compute tdiou - t_diou = diou[candidate_idxs, np.arange(num_gt)] - - overlaps_mean_per_gt = candidate_overlaps_t.mean(0) - overlaps_std_per_gt = candidate_overlaps_t.std( - 0, 
ddof=1) # NOTE: use Bessel correction - overlaps_thr_per_gt = overlaps_mean_per_gt + overlaps_std_per_gt - - # compute region - is_pos = (t_diou < overlaps_thr_per_gt[None, :]) & ( - t_diou >= 0.25 * overlaps_thr_per_gt[None, :]) - - # limit the positive sample's center in gt - for gt_idx in range(num_gt): - candidate_idxs[:, gt_idx] += gt_idx * num_bboxes - - candidate_idxs = candidate_idxs.reshape(-1) - - # if an anchor box is assigned to multiple gts, - # the one with the highest IoU will be selected. - overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) - index = candidate_idxs.reshape(-1)[is_pos.reshape(-1)] - - overlaps_inf[index] = overlaps.T.reshape(-1)[index] - overlaps_inf = overlaps_inf.reshape(num_gt, -1).T - - max_overlaps = overlaps_inf.max(axis=1) - argmax_overlaps = overlaps_inf.argmax(axis=1) - - overlaps_inf = -np.inf * np.ones_like(overlaps).T.reshape(-1) - overlaps_inf = overlaps_inf.reshape(num_gt, -1).T - - assigned_gt_inds[max_overlaps != - -np.inf] = argmax_overlaps[max_overlaps != -np.inf] + 1 - - vlr_region_iou[max_overlaps != - -np.inf] = max_overlaps[max_overlaps != -np.inf] + 0 - - return vlr_region_iou diff --git a/pdfdet/models/Paddle/ppdet/data/transform/autoaugment_utils.py b/pdfdet/models/Paddle/ppdet/data/transform/autoaugment_utils.py deleted file mode 100644 index cfa89d3..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/autoaugment_utils.py +++ /dev/null @@ -1,1586 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# Reference: -# https://github.com/tensorflow/tpu/blob/master/models/official/detection/utils/autoaugment_utils.py -"""AutoAugment util file.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import inspect -import math -from PIL import Image, ImageEnhance -import numpy as np -import cv2 -from copy import deepcopy - -# This signifies the max integer that the controller RNN could predict for the -# augmentation scheme. -_MAX_LEVEL = 10. - -# Represents an invalid bounding box that is used for checking for padding -# lists of bounding box coordinates for a few augmentation operations -_INVALID_BOX = [[-1.0, -1.0, -1.0, -1.0]] - - -def policy_v0(): - """Autoaugment policy that was used in AutoAugment Detection Paper.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. 
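# review note: concretely, a tuple such as ('TranslateX_BBox', 0.6, 4) means
# "apply TranslateX (with boxes adjusted accordingly) with probability 0.6 at
# magnitude 4 on the 0-10 scale (_MAX_LEVEL)"; the tuples of one sub-policy
# are applied in sequence to the same image.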
- policy = [ - [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], - [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], - [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], - [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], - ] - return policy - - -def policy_v1(): - """Autoaugment policy that was used in AutoAugment Detection Paper.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. - policy = [ - [('TranslateX_BBox', 0.6, 4), ('Equalize', 0.8, 10)], - [('TranslateY_Only_BBoxes', 0.2, 2), ('Cutout', 0.8, 8)], - [('Sharpness', 0.0, 8), ('ShearX_BBox', 0.4, 0)], - [('ShearY_BBox', 1.0, 2), ('TranslateY_Only_BBoxes', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('Color', 1.0, 6)], - [('Color', 0.0, 0), ('ShearX_Only_BBoxes', 0.8, 4)], - [('ShearY_Only_BBoxes', 0.8, 2), ('Flip_Only_BBoxes', 0.0, 10)], - [('Equalize', 0.6, 10), ('TranslateX_BBox', 0.2, 2)], - [('Color', 1.0, 10), ('TranslateY_Only_BBoxes', 0.4, 6)], - [('Rotate_BBox', 0.8, 10), ('Contrast', 0.0, 10)], # , - [('Cutout', 0.2, 2), ('Brightness', 0.8, 10)], - [('Color', 1.0, 6), ('Equalize', 1.0, 2)], - [('Cutout_Only_BBoxes', 0.4, 6), ('TranslateY_Only_BBoxes', 0.8, 2)], - [('Color', 0.2, 8), ('Rotate_BBox', 0.8, 10)], - [('Sharpness', 0.4, 4), ('TranslateY_Only_BBoxes', 0.0, 4)], - [('Sharpness', 1.0, 4), ('SolarizeAdd', 0.4, 4)], - [('Rotate_BBox', 1.0, 8), ('Sharpness', 0.2, 8)], - [('ShearY_BBox', 0.6, 10), ('Equalize_Only_BBoxes', 0.6, 8)], - [('ShearX_BBox', 0.2, 6), ('TranslateY_Only_BBoxes', 0.2, 10)], - [('SolarizeAdd', 0.6, 8), ('Brightness', 0.8, 10)], - ] - return policy - - -def policy_vtest(): - """Autoaugment test policy for debugging.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. - policy = [[('TranslateX_BBox', 1.0, 4), ('Equalize', 1.0, 10)], ] - return policy - - -def policy_v2(): - """Additional policy that performs well on object detection.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. 
- policy = [ - [('Color', 0.0, 6), ('Cutout', 0.6, 8), ('Sharpness', 0.4, 8)], - [('Rotate_BBox', 0.4, 8), ('Sharpness', 0.4, 2), - ('Rotate_BBox', 0.8, 10)], - [('TranslateY_BBox', 1.0, 8), ('AutoContrast', 0.8, 2)], - [('AutoContrast', 0.4, 6), ('ShearX_BBox', 0.8, 8), - ('Brightness', 0.0, 10)], - [('SolarizeAdd', 0.2, 6), ('Contrast', 0.0, 10), - ('AutoContrast', 0.6, 0)], - [('Cutout', 0.2, 0), ('Solarize', 0.8, 8), ('Color', 1.0, 4)], - [('TranslateY_BBox', 0.0, 4), ('Equalize', 0.6, 8), - ('Solarize', 0.0, 10)], - [('TranslateY_BBox', 0.2, 2), ('ShearY_BBox', 0.8, 8), - ('Rotate_BBox', 0.8, 8)], - [('Cutout', 0.8, 8), ('Brightness', 0.8, 8), ('Cutout', 0.2, 2)], - [('Color', 0.8, 4), ('TranslateY_BBox', 1.0, 6), - ('Rotate_BBox', 0.6, 6)], - [('Rotate_BBox', 0.6, 10), ('BBox_Cutout', 1.0, 4), ('Cutout', 0.2, 8)], - [('Rotate_BBox', 0.0, 0), ('Equalize', 0.6, 6), - ('ShearY_BBox', 0.6, 8)], - [('Brightness', 0.8, 8), ('AutoContrast', 0.4, 2), - ('Brightness', 0.2, 2)], - [('TranslateY_BBox', 0.4, 8), ('Solarize', 0.4, 6), - ('SolarizeAdd', 0.2, 10)], - [('Contrast', 1.0, 10), ('SolarizeAdd', 0.2, 8), ('Equalize', 0.2, 4)], - ] - return policy - - -def policy_v3(): - """"Additional policy that performs well on object detection.""" - # Each tuple is an augmentation operation of the form - # (operation, probability, magnitude). Each element in policy is a - # sub-policy that will be applied sequentially on the image. - policy = [ - [('Posterize', 0.8, 2), ('TranslateX_BBox', 1.0, 8)], - [('BBox_Cutout', 0.2, 10), ('Sharpness', 1.0, 8)], - [('Rotate_BBox', 0.6, 8), ('Rotate_BBox', 0.8, 10)], - [('Equalize', 0.8, 10), ('AutoContrast', 0.2, 10)], - [('SolarizeAdd', 0.2, 2), ('TranslateY_BBox', 0.2, 8)], - [('Sharpness', 0.0, 2), ('Color', 0.4, 8)], - [('Equalize', 1.0, 8), ('TranslateY_BBox', 1.0, 8)], - [('Posterize', 0.6, 2), ('Rotate_BBox', 0.0, 10)], - [('AutoContrast', 0.6, 0), ('Rotate_BBox', 1.0, 6)], - [('Equalize', 0.0, 4), ('Cutout', 0.8, 10)], - [('Brightness', 1.0, 2), ('TranslateY_BBox', 1.0, 6)], - [('Contrast', 0.0, 2), ('ShearY_BBox', 0.8, 0)], - [('AutoContrast', 0.8, 10), ('Contrast', 0.2, 10)], - [('Rotate_BBox', 1.0, 10), ('Cutout', 1.0, 10)], - [('SolarizeAdd', 0.8, 6), ('Equalize', 0.8, 8)], - ] - return policy - - -def _equal(val1, val2, eps=1e-8): - return abs(val1 - val2) <= eps - - -def blend(image1, image2, factor): - """Blend image1 and image2 using 'factor'. - - Factor can be above 0.0. A value of 0.0 means only image1 is used. - A value of 1.0 means only image2 is used. A value between 0.0 and - 1.0 means we linearly interpolate the pixel values between the two - images. A value greater than 1.0 "extrapolates" the difference - between the two pixel values, and we clip the results to values - between 0 and 255. - - Args: - image1: An image Tensor of type uint8. - image2: An image Tensor of type uint8. - factor: A floating point value above 0.0. - - Returns: - A blended image Tensor of type uint8. - """ - if factor == 0.0: - return image1 - if factor == 1.0: - return image2 - - image1 = image1.astype(np.float32) - image2 = image2.astype(np.float32) - - difference = image2 - image1 - scaled = factor * difference - - # Do addition in float. - temp = image1 + scaled - - # Interpolate - if factor > 0.0 and factor < 1.0: - # Interpolation means we always stay within 0 and 255. - return temp.astype(np.uint8) - - # Extrapolate: - # - # We need to clip and then cast. 
- return np.clip(temp, a_min=0, a_max=255).astype(np.uint8) - - -def cutout(image, pad_size, replace=0): - """Apply cutout (https://arxiv.org/abs/1708.04552) to image. - - This operation applies a (2*pad_size x 2*pad_size) mask of zeros to - a random location within `img`. The pixel values filled in will be of the - value `replace`. The located where the mask will be applied is randomly - chosen uniformly over the whole image. - - Args: - image: An image Tensor of type uint8. - pad_size: Specifies how big the zero mask that will be generated is that - is applied to the image. The mask will be of size - (2*pad_size x 2*pad_size). - replace: What pixel value to fill in the image in the area that has - the cutout mask applied to it. - - Returns: - An image Tensor that is of type uint8. - Example: - img = cv2.imread( "/home/vis/gry/train/img_data/test.jpg", cv2.COLOR_BGR2RGB ) - new_img = cutout(img, pad_size=50, replace=0) - """ - image_height, image_width = image.shape[0], image.shape[1] - - cutout_center_height = np.random.randint(low=0, high=image_height) - cutout_center_width = np.random.randint(low=0, high=image_width) - - lower_pad = np.maximum(0, cutout_center_height - pad_size) - upper_pad = np.maximum(0, image_height - cutout_center_height - pad_size) - left_pad = np.maximum(0, cutout_center_width - pad_size) - right_pad = np.maximum(0, image_width - cutout_center_width - pad_size) - - cutout_shape = [ - image_height - (lower_pad + upper_pad), - image_width - (left_pad + right_pad) - ] - padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]] - mask = np.pad(np.zeros( - cutout_shape, dtype=image.dtype), - padding_dims, - 'constant', - constant_values=1) - mask = np.expand_dims(mask, -1) - mask = np.tile(mask, [1, 1, 3]) - image = np.where( - np.equal(mask, 0), - np.ones_like( - image, dtype=image.dtype) * replace, - image) - return image.astype(np.uint8) - - -def solarize(image, threshold=128): - # For each pixel in the image, select the pixel - # if the value is less than the threshold. - # Otherwise, subtract 255 from the pixel. - return np.where(image < threshold, image, 255 - image) - - -def solarize_add(image, addition=0, threshold=128): - # For each pixel in the image less than threshold - # we add 'addition' amount to it and then clip the - # pixel value to be between 0 and 255. The value - # of 'addition' is between -128 and 128. - added_image = image.astype(np.int64) + addition - added_image = np.clip(added_image, a_min=0, a_max=255).astype(np.uint8) - return np.where(image < threshold, added_image, image) - - -def color(image, factor): - """use cv2 to deal""" - gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) - degenerate = cv2.cvtColor(gray, cv2.COLOR_GRAY2BGR) - return blend(degenerate, image, factor) - - -# refer to https://github.com/4uiiurz1/pytorch-auto-augment/blob/024b2eac4140c38df8342f09998e307234cafc80/auto_augment.py#L197 -def contrast(img, factor): - img = ImageEnhance.Contrast(Image.fromarray(img)).enhance(factor) - return np.array(img) - - -def brightness(image, factor): - """Equivalent of PIL Brightness.""" - degenerate = np.zeros_like(image) - return blend(degenerate, image, factor) - - -def posterize(image, bits): - """Equivalent of PIL Posterize.""" - shift = 8 - bits - return np.left_shift(np.right_shift(image, shift), shift) - - -def rotate(image, degrees, replace): - """Rotates the image by degrees either clockwise or counterclockwise. - - Args: - image: An image Tensor of type uint8. 
- degrees: Float, a scalar angle in degrees to rotate all images by. If - degrees is positive the image will be rotated clockwise otherwise it will - be rotated counterclockwise. - replace: A one or three value 1D tensor to fill empty pixels caused by - the rotate operation. - - Returns: - The rotated version of image. - """ - image = wrap(image) - image = Image.fromarray(image) - image = image.rotate(degrees) - image = np.array(image, dtype=np.uint8) - return unwrap(image, replace) - - -def random_shift_bbox(image, - bbox, - pixel_scaling, - replace, - new_min_bbox_coords=None): - """Move the bbox and the image content to a slightly new random location. - - Args: - image: 3D uint8 Tensor. - bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x) - of type float that represents the normalized coordinates between 0 and 1. - The potential values for the new min corner of the bbox will be between - [old_min - pixel_scaling * bbox_height/2, - old_min - pixel_scaling * bbox_height/2]. - pixel_scaling: A float between 0 and 1 that specifies the pixel range - that the new bbox location will be sampled from. - replace: A one or three value 1D tensor to fill empty pixels. - new_min_bbox_coords: If not None, then this is a tuple that specifies the - (min_y, min_x) coordinates of the new bbox. Normally this is randomly - specified, but this allows it to be manually set. The coordinates are - the absolute coordinates between 0 and image height/width and are int32. - - Returns: - The new image that will have the shifted bbox location in it along with - the new bbox that contains the new coordinates. - """ - # Obtains image height and width and create helper clip functions. - image_height, image_width = image.shape[0], image.shape[1] - image_height = float(image_height) - image_width = float(image_width) - - def clip_y(val): - return np.clip(val, a_min=0, a_max=image_height - 1).astype(np.int32) - - def clip_x(val): - return np.clip(val, a_min=0, a_max=image_width - 1).astype(np.int32) - - # Convert bbox to pixel coordinates. - min_y = int(image_height * bbox[0]) - min_x = int(image_width * bbox[1]) - max_y = clip_y(image_height * bbox[2]) - max_x = clip_x(image_width * bbox[3]) - - bbox_height, bbox_width = (max_y - min_y + 1, max_x - min_x + 1) - image_height = int(image_height) - image_width = int(image_width) - - # Select the new min/max bbox ranges that are used for sampling the - # new min x/y coordinates of the shifted bbox. - minval_y = clip_y(min_y - np.int32(pixel_scaling * float(bbox_height) / - 2.0)) - maxval_y = clip_y(min_y + np.int32(pixel_scaling * float(bbox_height) / - 2.0)) - minval_x = clip_x(min_x - np.int32(pixel_scaling * float(bbox_width) / 2.0)) - maxval_x = clip_x(min_x + np.int32(pixel_scaling * float(bbox_width) / 2.0)) - - # Sample and calculate the new unclipped min/max coordinates of the new bbox. - if new_min_bbox_coords is None: - unclipped_new_min_y = np.random.randint( - low=minval_y, high=maxval_y, dtype=np.int32) - unclipped_new_min_x = np.random.randint( - low=minval_x, high=maxval_x, dtype=np.int32) - else: - unclipped_new_min_y, unclipped_new_min_x = ( - clip_y(new_min_bbox_coords[0]), clip_x(new_min_bbox_coords[1])) - unclipped_new_max_y = unclipped_new_min_y + bbox_height - 1 - unclipped_new_max_x = unclipped_new_min_x + bbox_width - 1 - - # Determine if any of the new bbox was shifted outside the current image. - # This is used for determining if any of the original bbox content should be - # discarded. 
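# review note: the arithmetic below maps the clipped destination box back
# onto the source bbox: if clipping trimmed d pixels off one edge of the new
# location, the same d pixels are trimmed from the matching edge of the source
# region (shifted_min_*/shifted_max_*), so only content that survives the
# shift gets copied.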
- new_min_y, new_min_x, new_max_y, new_max_x = ( - clip_y(unclipped_new_min_y), clip_x(unclipped_new_min_x), - clip_y(unclipped_new_max_y), clip_x(unclipped_new_max_x)) - shifted_min_y = (new_min_y - unclipped_new_min_y) + min_y - shifted_max_y = max_y - (unclipped_new_max_y - new_max_y) - shifted_min_x = (new_min_x - unclipped_new_min_x) + min_x - shifted_max_x = max_x - (unclipped_new_max_x - new_max_x) - - # Create the new bbox tensor by converting pixel integer values to floats. - new_bbox = np.stack([ - float(new_min_y) / float(image_height), float(new_min_x) / - float(image_width), float(new_max_y) / float(image_height), - float(new_max_x) / float(image_width) - ]) - - # Copy the contents in the bbox and fill the old bbox location - # with gray (128). - bbox_content = image[shifted_min_y:shifted_max_y + 1, shifted_min_x: - shifted_max_x + 1, :] - - def mask_and_add_image(min_y_, min_x_, max_y_, max_x_, mask, content_tensor, - image_): - """Applies mask to bbox region in image then adds content_tensor to it.""" - mask = np.pad(mask, [[min_y_, (image_height - 1) - max_y_], - [min_x_, (image_width - 1) - max_x_], [0, 0]], - 'constant', - constant_values=1) - - content_tensor = np.pad(content_tensor, - [[min_y_, (image_height - 1) - max_y_], - [min_x_, (image_width - 1) - max_x_], [0, 0]], - 'constant', - constant_values=0) - return image_ * mask + content_tensor - - # Zero out original bbox location. - mask = np.zeros_like(image)[min_y:max_y + 1, min_x:max_x + 1, :] - grey_tensor = np.zeros_like(mask) + replace[0] - image = mask_and_add_image(min_y, min_x, max_y, max_x, mask, grey_tensor, - image) - - # Fill in bbox content to new bbox location. - mask = np.zeros_like(bbox_content) - image = mask_and_add_image(new_min_y, new_min_x, new_max_y, new_max_x, mask, - bbox_content, image) - - return image.astype(np.uint8), new_bbox - - -def _clip_bbox(min_y, min_x, max_y, max_x): - """Clip bounding box coordinates between 0 and 1. - - Args: - min_y: Normalized bbox coordinate of type float between 0 and 1. - min_x: Normalized bbox coordinate of type float between 0 and 1. - max_y: Normalized bbox coordinate of type float between 0 and 1. - max_x: Normalized bbox coordinate of type float between 0 and 1. - - Returns: - Clipped coordinate values between 0 and 1. - """ - min_y = np.clip(min_y, a_min=0, a_max=1.0) - min_x = np.clip(min_x, a_min=0, a_max=1.0) - max_y = np.clip(max_y, a_min=0, a_max=1.0) - max_x = np.clip(max_x, a_min=0, a_max=1.0) - return min_y, min_x, max_y, max_x - - -def _check_bbox_area(min_y, min_x, max_y, max_x, delta=0.05): - """Adjusts bbox coordinates to make sure the area is > 0. - - Args: - min_y: Normalized bbox coordinate of type float between 0 and 1. - min_x: Normalized bbox coordinate of type float between 0 and 1. - max_y: Normalized bbox coordinate of type float between 0 and 1. - max_x: Normalized bbox coordinate of type float between 0 and 1. - delta: Float, this is used to create a gap of size 2 * delta between - bbox min/max coordinates that are the same on the boundary. - This prevents the bbox from having an area of zero. - - Returns: - Tuple of new bbox coordinates between 0 and 1 that will now have a - guaranteed area > 0. - """ - height = max_y - min_y - width = max_x - min_x - - def _adjust_bbox_boundaries(min_coord, max_coord): - # Make sure max is never 0 and min is never 1. 
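# review note: this only separates coordinates that collapsed on the 0/1
# boundary (min == max == 0 pushes max up to delta; min == max == 1 pushes
# min down to 1 - delta); a zero-area box in the interior, e.g.
# min == max == 0.5, passes through unchanged.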
-    max_coord = np.maximum(max_coord, 0.0 + delta)
-    min_coord = np.minimum(min_coord, 1.0 - delta)
-    return min_coord, max_coord
-
-  if _equal(height, 0):
-    min_y, max_y = _adjust_bbox_boundaries(min_y, max_y)
-
-  if _equal(width, 0):
-    min_x, max_x = _adjust_bbox_boundaries(min_x, max_x)
-
-  return min_y, min_x, max_y, max_x
-
-
-def _scale_bbox_only_op_probability(prob):
-  """Reduce the probability of the bbox-only operation.
-
-  Probability is reduced so that we do not distort the content of too many
-  bounding boxes that are close to each other. The value of 3.0 was a
-  hyperparameter chosen when designing the autoaugment algorithm that we
-  found empirically to work well.
-
-  Args:
-    prob: Float that is the probability of applying the bbox-only operation.
-
-  Returns:
-    Reduced probability.
-  """
-  return prob / 3.0
-
-
-def _apply_bbox_augmentation(image, bbox, augmentation_func, *args):
-  """Applies augmentation_func to the subsection of image indicated by bbox.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    augmentation_func: Augmentation function that will be applied to the
-      subsection of image.
-    *args: Additional parameters that will be passed into augmentation_func
-      when it is called.
-
-  Returns:
-    A modified version of image, where the bbox location in the image will
-    have `augmentation_func` applied to it.
-  """
-  image_height = image.shape[0]
-  image_width = image.shape[1]
-
-  min_y = int(image_height * bbox[0])
-  min_x = int(image_width * bbox[1])
-  max_y = int(image_height * bbox[2])
-  max_x = int(image_width * bbox[3])
-
-  # Clip to be sure the max values do not fall out of range.
-  max_y = np.minimum(max_y, image_height - 1)
-  max_x = np.minimum(max_x, image_width - 1)
-
-  # Get the sub-tensor that is the image within the bounding box region.
-  bbox_content = image[min_y:max_y + 1, min_x:max_x + 1, :]
-
-  # Apply the augmentation function to the bbox portion of the image.
-  augmented_bbox_content = augmentation_func(bbox_content, *args)
-
-  # Pad the augmented_bbox_content and the mask to match the shape of original
-  # image.
-  augmented_bbox_content = np.pad(
-      augmented_bbox_content, [[min_y, (image_height - 1) - max_y],
-                               [min_x, (image_width - 1) - max_x], [0, 0]],
-      'constant',
-      constant_values=1)
-
-  # Create a mask that will be used to zero out a part of the original image.
-  mask_tensor = np.zeros_like(bbox_content)
-
-  mask_tensor = np.pad(mask_tensor,
-                       [[min_y, (image_height - 1) - max_y],
-                        [min_x, (image_width - 1) - max_x], [0, 0]],
-                       'constant',
-                       constant_values=1)
-  # Replace the old bbox content with the new augmented content.
-  image = image * mask_tensor + augmented_bbox_content
-  return image.astype(np.uint8)
-
-
-def _concat_bbox(bbox, bboxes):
-  """Helper function that concatenates bbox to bboxes along the first dimension."""
-
-  # Note if all elements in bboxes are -1 (_INVALID_BOX), then this means
-  # we discard bboxes and start the bboxes Tensor with the current bbox.
-  bboxes_sum_check = np.sum(bboxes)
-  bbox = np.expand_dims(bbox, 0)
-  # This check will be true when it is an _INVALID_BOX
-  if _equal(bboxes_sum_check, -4):
-    bboxes = bbox
-  else:
-    bboxes = np.concatenate([bboxes, bbox], 0)
-  return bboxes
-
-
-def _apply_bbox_augmentation_wrapper(image, bbox, new_bboxes, prob,
-                                     augmentation_func, func_changes_bbox,
-                                     *args):
-  """Applies _apply_bbox_augmentation with probability prob.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    new_bboxes: 2D Tensor that is a list of the bboxes in the image after they
-      have been altered by aug_func. These will only be changed when
-      func_changes_bbox is set to true. Each bbox has 4 elements
-      (min_y, min_x, max_y, max_x) of type float that are the normalized
-      bbox coordinates between 0 and 1.
-    prob: Float that is the probability of applying _apply_bbox_augmentation.
-    augmentation_func: Augmentation function that will be applied to the
-      subsection of image.
-    func_changes_bbox: Boolean. Whether augmentation_func returns the bbox in
-      addition to the image.
-    *args: Additional parameters that will be passed into augmentation_func
-      when it is called.
-
-  Returns:
-    A tuple. First element is a modified version of image, where the bbox
-    location in the image will have augmentation_func applied to it if it is
-    chosen to be called with probability `prob`. The second element is a
-    Tensor of Tensors of length 4 that will contain the altered bbox after
-    applying augmentation_func.
-  """
-  should_apply_op = (np.random.rand() + prob >= 1)
-  if func_changes_bbox:
-    if should_apply_op:
-      augmented_image, bbox = augmentation_func(image, bbox, *args)
-    else:
-      augmented_image, bbox = (image, bbox)
-  else:
-    if should_apply_op:
-      augmented_image = _apply_bbox_augmentation(image, bbox,
-                                                 augmentation_func, *args)
-    else:
-      augmented_image = image
-  new_bboxes = _concat_bbox(bbox, new_bboxes)
-  return augmented_image.astype(np.uint8), new_bboxes
-
-
-def _apply_multi_bbox_augmentation(image, bboxes, prob, aug_func,
-                                   func_changes_bbox, *args):
-  """Applies aug_func to the image for each bbox in bboxes.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float.
-    prob: Float that is the probability of applying aug_func to a specific
-      bounding box within the image.
-    aug_func: Augmentation function that will be applied to the
-      subsections of image indicated by the bbox values in bboxes.
-    func_changes_bbox: Boolean. Whether aug_func returns the bbox in addition
-      to the image.
-    *args: Additional parameters that will be passed into augmentation_func
-      when it is called.
-
-  Returns:
-    A modified version of image, where each bbox location in the image will
-    have augmentation_func applied to it if it is chosen to be called with
-    probability prob independently across all bboxes. Also the final
-    bboxes are returned that will be unchanged if func_changes_bbox is set to
-    false and if true, the new altered ones will be returned.
-  """
-  # Will keep track of the new altered bboxes after aug_func is repeatedly
-  # applied. The -1 values are a dummy value and this first Tensor will be
-  # removed upon appending the first real bbox.
-  new_bboxes = np.array(_INVALID_BOX)
-
-  # If the bboxes are empty, then just give it _INVALID_BOX. The result
-  # will be thrown away.
-  bboxes = np.array((_INVALID_BOX)) if bboxes.size == 0 else bboxes
-
-  assert bboxes.shape[1] == 4, "bboxes.shape[1] must be 4!!!!"
-
-  # pylint:disable=g-long-lambda
-  # pylint:disable=line-too-long
-  wrapped_aug_func = lambda _image, bbox, _new_bboxes: _apply_bbox_augmentation_wrapper(_image, bbox, _new_bboxes, prob, aug_func, func_changes_bbox, *args)
-  # pylint:enable=g-long-lambda
-  # pylint:enable=line-too-long
-
-  # Setup the while_loop.
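-  # Added note: the original TF implementation drove this step with
-  # tf.while_loop; the numpy port below emulates it with a plain Python loop
-  # over bbox indices, threading the (image, new_bboxes) pair through each
-  # iteration.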
- num_bboxes = bboxes.shape[0] # We loop until we go over all bboxes. - idx = 0 # Counter for the while loop. - - # Conditional function when to end the loop once we go over all bboxes - # images_and_bboxes contain (_image, _new_bboxes) - def cond(_idx, _images_and_bboxes): - return _idx < num_bboxes - - # Shuffle the bboxes so that the augmentation order is not deterministic if - # we are not changing the bboxes with aug_func. - # if not func_changes_bbox: - # print(bboxes) - # loop_bboxes = np.take(bboxes,np.random.permutation(bboxes.shape[0]),axis=0) - # print(loop_bboxes) - # else: - # loop_bboxes = bboxes - # we can not shuffle the bbox because it does not contain class information here - loop_bboxes = deepcopy(bboxes) - - # Main function of while_loop where we repeatedly apply augmentation on the - # bboxes in the image. - # pylint:disable=g-long-lambda - body = lambda _idx, _images_and_bboxes: [ - _idx + 1, wrapped_aug_func(_images_and_bboxes[0], - loop_bboxes[_idx], - _images_and_bboxes[1])] - while (cond(idx, (image, new_bboxes))): - idx, (image, new_bboxes) = body(idx, (image, new_bboxes)) - - # Either return the altered bboxes or the original ones depending on if - # we altered them in anyway. - if func_changes_bbox: - final_bboxes = new_bboxes - else: - final_bboxes = bboxes - return image, final_bboxes - - -def _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, aug_func, - func_changes_bbox, *args): - """Checks to be sure num bboxes > 0 before calling inner function.""" - num_bboxes = len(bboxes) - new_image = deepcopy(image) - new_bboxes = deepcopy(bboxes) - if num_bboxes != 0: - new_image, new_bboxes = _apply_multi_bbox_augmentation( - new_image, new_bboxes, prob, aug_func, func_changes_bbox, *args) - return new_image, new_bboxes - - -def rotate_only_bboxes(image, bboxes, prob, degrees, replace): - """Apply rotate to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, rotate, func_changes_bbox, degrees, replace) - - -def shear_x_only_bboxes(image, bboxes, prob, level, replace): - """Apply shear_x to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, shear_x, func_changes_bbox, level, replace) - - -def shear_y_only_bboxes(image, bboxes, prob, level, replace): - """Apply shear_y to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, shear_y, func_changes_bbox, level, replace) - - -def translate_x_only_bboxes(image, bboxes, prob, pixels, replace): - """Apply translate_x to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, translate_x, func_changes_bbox, pixels, replace) - - -def translate_y_only_bboxes(image, bboxes, prob, pixels, replace): - """Apply translate_y to each bbox in the image with probability prob.""" - func_changes_bbox = False - prob = _scale_bbox_only_op_probability(prob) - return _apply_multi_bbox_augmentation_wrapper( - image, bboxes, prob, translate_y, func_changes_bbox, pixels, replace) - - -def flip_only_bboxes(image, bboxes, prob): - """Apply flip_lr to each bbox in the image 
with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob,
-                                                np.fliplr, func_changes_bbox)
-
-
-def solarize_only_bboxes(image, bboxes, prob, threshold):
-  """Apply solarize to each bbox in the image with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, solarize,
-                                                func_changes_bbox, threshold)
-
-
-def equalize_only_bboxes(image, bboxes, prob):
-  """Apply equalize to each bbox in the image with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(image, bboxes, prob, equalize,
-                                                func_changes_bbox)
-
-
-def cutout_only_bboxes(image, bboxes, prob, pad_size, replace):
-  """Apply cutout to each bbox in the image with probability prob."""
-  func_changes_bbox = False
-  prob = _scale_bbox_only_op_probability(prob)
-  return _apply_multi_bbox_augmentation_wrapper(
-      image, bboxes, prob, cutout, func_changes_bbox, pad_size, replace)
-
-
-def _rotate_bbox(bbox, image_height, image_width, degrees):
-  """Rotates the bbox coordinates by degrees.
-
-  Args:
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    image_height: Int, height of the image.
-    image_width: Int, width of the image.
-    degrees: Float, a scalar angle in degrees to rotate all images by. If
-      degrees is positive the image will be rotated clockwise otherwise it will
-      be rotated counterclockwise.
-
-  Returns:
-    A tensor of the same shape as bbox, but now with the rotated coordinates.
-  """
-  image_height, image_width = (float(image_height), float(image_width))
-
-  # Convert from degrees to radians.
-  degrees_to_radians = math.pi / 180.0
-  radians = degrees * degrees_to_radians
-
-  # Translate the bbox to the center of the image and turn the normalized 0-1
-  # coordinates to absolute pixel locations.
-  # Y coordinates are made negative as the y axis of images goes down with
-  # increasing pixel values, so we negate to make sure x axis and y axis points
-  # are in the traditionally positive direction.
-  min_y = -int(image_height * (bbox[0] - 0.5))
-  min_x = int(image_width * (bbox[1] - 0.5))
-  max_y = -int(image_height * (bbox[2] - 0.5))
-  max_x = int(image_width * (bbox[3] - 0.5))
-  coordinates = np.stack([[min_y, min_x], [min_y, max_x], [max_y, min_x],
-                          [max_y, max_x]]).astype(np.float32)
-  # Rotate the coordinates according to the rotation matrix clockwise if
-  # radians is positive, else negative
-  rotation_matrix = np.stack([[math.cos(radians), math.sin(radians)],
-                              [-math.sin(radians), math.cos(radians)]])
-  new_coords = np.matmul(rotation_matrix,
-                         np.transpose(coordinates)).astype(np.int32)
-
-  # Find min/max values and convert them back to normalized 0-1 floats.
-  min_y = -(float(np.max(new_coords[0, :])) / image_height - 0.5)
-  min_x = float(np.min(new_coords[1, :])) / image_width + 0.5
-  max_y = -(float(np.min(new_coords[0, :])) / image_height - 0.5)
-  max_x = float(np.max(new_coords[1, :])) / image_width + 0.5
-
-  # Clip the bboxes to be sure they fall between [0, 1].
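-  # Added note, a quick sanity check of the corner math above: rotating the
-  # box (0.25, 0.25, 0.75, 0.75) in a square image by a multiple of 90 degrees
-  # maps its corners onto each other, so the returned box is unchanged; for
-  # other angles the box grows to the axis-aligned hull of the four rotated
-  # corners.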
-  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
-  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
-  return np.stack([min_y, min_x, max_y, max_x])
-
-
-def rotate_with_bboxes(image, bboxes, degrees, replace):
-  """Rotates the image by degrees and rotates the bboxes to match."""
-  # Rotate the image.
-  image = rotate(image, degrees, replace)
-
-  # Convert bbox coordinates to pixel values.
-  image_height, image_width = image.shape[:2]
-  # pylint:disable=g-long-lambda
-  wrapped_rotate_bbox = lambda bbox: _rotate_bbox(bbox, image_height, image_width, degrees)
-  # pylint:enable=g-long-lambda
-  new_bboxes = np.zeros_like(bboxes)
-  for idx in range(len(bboxes)):
-    new_bboxes[idx] = wrapped_rotate_bbox(bboxes[idx])
-  return image, new_bboxes
-
-
-def translate_x(image, pixels, replace):
-  """Equivalent of PIL Translate in X dimension."""
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0))
-  return unwrap(np.array(image), replace)
-
-
-def translate_y(image, pixels, replace):
-  """Equivalent of PIL Translate in Y dimension."""
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels))
-  return unwrap(np.array(image), replace)
-
-
-def _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal):
-  """Shifts the bbox coordinates by pixels.
-
-  Args:
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    image_height: Int, height of the image.
-    image_width: Int, width of the image.
-    pixels: An int. How many pixels to shift the bbox.
-    shift_horizontal: Boolean. If true then shift in X dimension else shift in
-      Y dimension.
-
-  Returns:
-    A tensor of the same shape as bbox, but now with the shifted coordinates.
-  """
-  pixels = int(pixels)
-  # Convert bbox to integer pixel locations.
-  min_y = int(float(image_height) * bbox[0])
-  min_x = int(float(image_width) * bbox[1])
-  max_y = int(float(image_height) * bbox[2])
-  max_x = int(float(image_width) * bbox[3])
-
-  if shift_horizontal:
-    min_x = np.maximum(0, min_x - pixels)
-    max_x = np.minimum(image_width, max_x - pixels)
-  else:
-    min_y = np.maximum(0, min_y - pixels)
-    max_y = np.minimum(image_height, max_y - pixels)
-
-  # Convert bbox back to floats.
-  min_y = float(min_y) / float(image_height)
-  min_x = float(min_x) / float(image_width)
-  max_y = float(max_y) / float(image_height)
-  max_x = float(max_x) / float(image_width)
-
-  # Clip the bboxes to be sure they fall between [0, 1].
-  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
-  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
-  return np.stack([min_y, min_x, max_y, max_x])
-
-
-def translate_bbox(image, bboxes, pixels, replace, shift_horizontal):
-  """Equivalent of PIL Translate in X/Y dimension that shifts image and bbox.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
-      between [0, 1].
-    pixels: An int. How many pixels to shift the image and bboxes
-    replace: A one or three value 1D tensor to fill empty pixels.
-    shift_horizontal: Boolean. If true then shift in X dimension else shift in
-      Y dimension.
-
-  Returns:
-    A tuple containing a 3D uint8 Tensor that will be the result of translating
-    image by pixels. The second element of the tuple is bboxes, where now
-    the coordinates will be shifted to reflect the shifted image.
-  """
-  if shift_horizontal:
-    image = translate_x(image, pixels, replace)
-  else:
-    image = translate_y(image, pixels, replace)
-
-  # Convert bbox coordinates to pixel values.
-  image_height, image_width = image.shape[0], image.shape[1]
-  # pylint:disable=g-long-lambda
-  wrapped_shift_bbox = lambda bbox: _shift_bbox(bbox, image_height, image_width, pixels, shift_horizontal)
-  # pylint:enable=g-long-lambda
-  new_bboxes = deepcopy(bboxes)
-  num_bboxes = len(bboxes)
-  for idx in range(num_bboxes):
-    new_bboxes[idx] = wrapped_shift_bbox(bboxes[idx])
-  return image.astype(np.uint8), new_bboxes
-
-
-def shear_x(image, level, replace):
-  """Equivalent of PIL Shearing in X dimension."""
-  # Shear parallel to x axis is a projective transform
-  # with a matrix form of:
-  # [1  level
-  #  0  1].
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, level, 0, 0, 1, 0))
-  return unwrap(np.array(image), replace)
-
-
-def shear_y(image, level, replace):
-  """Equivalent of PIL Shearing in Y dimension."""
-  # Shear parallel to y axis is a projective transform
-  # with a matrix form of:
-  # [1  0
-  #  level  1].
-  image = Image.fromarray(wrap(image))
-  image = image.transform(image.size, Image.AFFINE, (1, 0, 0, level, 1, 0))
-  return unwrap(np.array(image), replace)
-
-
-def _shear_bbox(bbox, image_height, image_width, level, shear_horizontal):
-  """Shifts the bbox according to how the image was sheared.
-
-  Args:
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    image_height: Int, height of the image.
-    image_width: Int, width of the image.
-    level: Float. How much to shear the image.
-    shear_horizontal: If true then shear in X dimension else shear in
-      the Y dimension.
-
-  Returns:
-    A tensor of the same shape as bbox, but now with the shifted coordinates.
-  """
-  image_height, image_width = (float(image_height), float(image_width))
-
-  # Change bbox coordinates to be pixels.
-  min_y = int(image_height * bbox[0])
-  min_x = int(image_width * bbox[1])
-  max_y = int(image_height * bbox[2])
-  max_x = int(image_width * bbox[3])
-  coordinates = np.stack(
-      [[min_y, min_x], [min_y, max_x], [max_y, min_x], [max_y, max_x]])
-  coordinates = coordinates.astype(np.float32)
-
-  # Shear the coordinates according to the translation matrix.
-  if shear_horizontal:
-    translation_matrix = np.stack([[1, 0], [-level, 1]])
-  else:
-    translation_matrix = np.stack([[1, -level], [0, 1]])
-  translation_matrix = translation_matrix.astype(np.float32)
-  new_coords = np.matmul(translation_matrix,
-                         np.transpose(coordinates)).astype(np.int32)
-
-  # Find min/max values and convert them back to floats.
-  min_y = float(np.min(new_coords[0, :])) / image_height
-  min_x = float(np.min(new_coords[1, :])) / image_width
-  max_y = float(np.max(new_coords[0, :])) / image_height
-  max_x = float(np.max(new_coords[1, :])) / image_width
-
-  # Clip the bboxes to be sure they fall between [0, 1].
-  min_y, min_x, max_y, max_x = _clip_bbox(min_y, min_x, max_y, max_x)
-  min_y, min_x, max_y, max_x = _check_bbox_area(min_y, min_x, max_y, max_x)
-  return np.stack([min_y, min_x, max_y, max_x])
-
-
-def shear_with_bboxes(image, bboxes, level, replace, shear_horizontal):
-  """Applies Shear Transformation to the image and shifts the bboxes.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
-      between [0, 1].
-    level: Float. How much to shear the image. This value will be between
-      -0.3 and 0.3.
-    replace: A one or three value 1D tensor to fill empty pixels.
-    shear_horizontal: Boolean. If true then shear in X dimension else shear in
-      the Y dimension.
-
-  Returns:
-    A tuple containing a 3D uint8 Tensor that will be the result of shearing
-    image by level. The second element of the tuple is bboxes, where now
-    the coordinates will be shifted to reflect the sheared image.
-  """
-  if shear_horizontal:
-    image = shear_x(image, level, replace)
-  else:
-    image = shear_y(image, level, replace)
-
-  # Convert bbox coordinates to pixel values.
-  image_height, image_width = image.shape[:2]
-  # pylint:disable=g-long-lambda
-  wrapped_shear_bbox = lambda bbox: _shear_bbox(bbox, image_height, image_width, level, shear_horizontal)
-  # pylint:enable=g-long-lambda
-  new_bboxes = deepcopy(bboxes)
-  num_bboxes = len(bboxes)
-  for idx in range(num_bboxes):
-    new_bboxes[idx] = wrapped_shear_bbox(bboxes[idx])
-  return image.astype(np.uint8), new_bboxes
-
-
-def autocontrast(image):
-  """Implements Autocontrast function from PIL.
-
-  Args:
-    image: A 3D uint8 tensor.
-
-  Returns:
-    The image after it has had autocontrast applied to it and will be of type
-    uint8.
-  """
-
-  def scale_channel(image):
-    """Scale the 2D image using the autocontrast rule."""
-    # A possibly cheaper version can be done using cumsum/unique_with_counts
-    # over the histogram values, rather than iterating over the entire image
-    # to compute mins and maxes.
-    lo = float(np.min(image))
-    hi = float(np.max(image))
-
-    # Scale the image, making the lowest value 0 and the highest value 255.
-    def scale_values(im):
-      scale = 255.0 / (hi - lo)
-      offset = -lo * scale
-      im = im.astype(np.float32) * scale + offset
-      # Clip to [0, 255] before casting back to uint8.
-      im = np.clip(im, a_min=0, a_max=255.0)
-      return im.astype(np.uint8)
-
-    result = scale_values(image) if hi > lo else image
-    return result
-
-  # Assumes RGB for now. Scales each channel independently
-  # and then stacks the result.
-  s1 = scale_channel(image[:, :, 0])
-  s2 = scale_channel(image[:, :, 1])
-  s3 = scale_channel(image[:, :, 2])
-  image = np.stack([s1, s2, s3], 2)
-  return image
-
-
-def sharpness(image, factor):
-  """Implements Sharpness function from PIL."""
-  orig_image = image
-  image = image.astype(np.float32)
-  # Make image 4D for conv operation.
-  # SMOOTH PIL Kernel.
-  kernel = np.array([[1, 1, 1], [1, 5, 1], [1, 1, 1]], dtype=np.float32) / 13.
-  result = cv2.filter2D(image, -1, kernel).astype(np.uint8)
-
-  # Blend the final result.
-  return blend(result, orig_image, factor)
-
-
-def equalize(image):
-  """Implements the Equalize function from PIL."""
-
-  def scale_channel(im, c):
-    """Scale the data in the channel to implement equalize."""
-    im = im[:, :, c].astype(np.int32)
-    # Compute the histogram of the image channel.
-    histo, _ = np.histogram(im, range=[0, 255], bins=256)
-
-    # For the purposes of computing the step, filter out the zeros.
-    nonzero = np.where(np.not_equal(histo, 0))
-    nonzero_histo = np.reshape(np.take(histo, nonzero), [-1])
-    step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
-
-    def build_lut(histo, step):
-      # Compute the cumulative sum, shifting by step // 2
-      # and then normalizing by step.
-      lut = (np.cumsum(histo) + (step // 2)) // step
-      # Shift lut, prepending with 0.
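-      # Added note: this mirrors PIL's ImageOps.equalize. After the shift
-      # below, a pixel value v is remapped to
-      # (cumsum(histo)[v - 1] + step // 2) // step, so the darkest occupied
-      # bin maps to 0 and the mapping is monotonically non-decreasing.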
-      lut = np.concatenate([[0], lut[:-1]], 0)
-      # Clip the counts to be in range. This is done
-      # in the C code for image.point.
-      return np.clip(lut, a_min=0, a_max=255).astype(np.uint8)
-
-    # If step is zero, return the original image. Otherwise, build
-    # lut from the full histogram and step and then index from it.
-    if step == 0:
-      result = im
-    else:
-      result = np.take(build_lut(histo, step), im)
-
-    return result.astype(np.uint8)
-
-  # Assumes RGB for now. Scales each channel independently
-  # and then stacks the result.
-  s1 = scale_channel(image, 0)
-  s2 = scale_channel(image, 1)
-  s3 = scale_channel(image, 2)
-  image = np.stack([s1, s2, s3], 2)
-  return image
-
-
-def wrap(image):
-  """Returns 'image' with an extra channel set to all 255s."""
-  shape = image.shape
-  extended_channel = 255 * np.ones([shape[0], shape[1], 1], image.dtype)
-  extended = np.concatenate([image, extended_channel], 2).astype(image.dtype)
-  return extended
-
-
-def unwrap(image, replace):
-  """Unwraps an image produced by wrap.
-
-  Where there is a 0 in the last channel for every spatial position,
-  the rest of the three channels in that spatial dimension are grayed
-  (set to 128). Operations like translate and shear on a wrapped
-  Tensor will leave 0s in empty locations. Some transformations look
-  at the intensity of values to do preprocessing, and we want these
-  empty pixels to assume the 'average' value, rather than pure black.
-
-
-  Args:
-    image: A 3D Image Tensor with 4 channels.
-    replace: A one or three value 1D tensor to fill empty pixels.
-
-  Returns:
-    image: A 3D image Tensor with 3 channels.
-  """
-  image_shape = image.shape
-  # Flatten the spatial dimensions.
-  flattened_image = np.reshape(image, [-1, image_shape[2]])
-
-  # Find all pixels where the last channel is zero.
-  alpha_channel = flattened_image[:, 3]
-
-  replace = np.concatenate([replace, np.ones([1], image.dtype)], 0)
-
-  # Where they are zero, fill them in with 'replace'.
-  alpha_channel = np.reshape(alpha_channel, (-1, 1))
-  alpha_channel = np.tile(alpha_channel, reps=(1, flattened_image.shape[1]))
-
-  flattened_image = np.where(
-      np.equal(alpha_channel, 0),
-      np.ones_like(
-          flattened_image, dtype=image.dtype) * replace,
-      flattened_image)
-
-  image = np.reshape(flattened_image, image_shape)
-  image = image[:, :, :3]
-  return image.astype(np.uint8)
-
-
-def _cutout_inside_bbox(image, bbox, pad_fraction):
-  """Generates cutout mask and the mean pixel value of the bbox.
-
-  First a location is randomly chosen within the image as the center where the
-  cutout mask will be applied. Note this can be towards the boundaries of the
-  image, so the full cutout mask may not be applied.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bbox: 1D Tensor that has 4 elements (min_y, min_x, max_y, max_x)
-      of type float that represents the normalized coordinates between 0 and 1.
-    pad_fraction: Float that specifies how large the cutout mask should be in
-      reference to the size of the original bbox. If pad_fraction is 0.25,
-      then the cutout mask will be of shape
-      (0.25 * bbox height, 0.25 * bbox width).
-
-  Returns:
-    A tuple. First element is a tensor of the same shape as image where each
-    element is either a 1 or 0 that is used to determine where the image
-    will have cutout applied. The second element is the mean of the pixels
-    in the image where the bbox is located.
-    mask value: [0,1]
-  """
-  image_height, image_width = image.shape[0], image.shape[1]
-  # Transform from shape [1, 4] to [4].
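-  # Added note: np.squeeze below is a no-op for a 1D box, so this helper
-  # accepts either a [4] bbox or a [1, 4] bbox sliced from a batch.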
-  bbox = np.squeeze(bbox)
-
-  min_y = int(float(image_height) * bbox[0])
-  min_x = int(float(image_width) * bbox[1])
-  max_y = int(float(image_height) * bbox[2])
-  max_x = int(float(image_width) * bbox[3])
-
-  # Calculate the mean pixel values in the bounding box, which will be used
-  # to fill the cutout region.
-  mean = np.mean(image[min_y:max_y + 1, min_x:max_x + 1], axis=(0, 1))
-  # Cutout mask will be size pad_size_height * 2 by pad_size_width * 2 if the
-  # region lies entirely within the bbox.
-  box_height = max_y - min_y + 1
-  box_width = max_x - min_x + 1
-  pad_size_height = int(pad_fraction * (box_height / 2))
-  pad_size_width = int(pad_fraction * (box_width / 2))
-
-  # Sample the center location in the image where the zero mask will be applied.
-  cutout_center_height = np.random.randint(min_y, max_y + 1, dtype=np.int32)
-  cutout_center_width = np.random.randint(min_x, max_x + 1, dtype=np.int32)
-
-  lower_pad = np.maximum(0, cutout_center_height - pad_size_height)
-  upper_pad = np.maximum(
-      0, image_height - cutout_center_height - pad_size_height)
-  left_pad = np.maximum(0, cutout_center_width - pad_size_width)
-  right_pad = np.maximum(0,
-                         image_width - cutout_center_width - pad_size_width)
-
-  cutout_shape = [
-      image_height - (lower_pad + upper_pad),
-      image_width - (left_pad + right_pad)
-  ]
-  padding_dims = [[lower_pad, upper_pad], [left_pad, right_pad]]
-
-  mask = np.pad(np.zeros(
-      cutout_shape, dtype=image.dtype),
-                padding_dims,
-                'constant',
-                constant_values=1)
-
-  mask = np.expand_dims(mask, 2)
-  mask = np.tile(mask, [1, 1, 3])
-  return mask, mean
-
-
-def bbox_cutout(image, bboxes, pad_fraction, replace_with_mean):
-  """Applies cutout to the image according to bbox information.
-
-  This is a cutout variant that uses bbox information to make more informed
-  decisions on where to place the cutout mask.
-
-  Args:
-    image: 3D uint8 Tensor.
-    bboxes: 2D Tensor that is a list of the bboxes in the image. Each bbox
-      has 4 elements (min_y, min_x, max_y, max_x) of type float with values
-      between [0, 1].
-    pad_fraction: Float that specifies how large the cutout mask should be in
-      reference to the size of the original bbox. If pad_fraction is 0.25,
-      then the cutout mask will be of shape
-      (0.25 * bbox height, 0.25 * bbox width).
-    replace_with_mean: Boolean that specifies what value should be filled in
-      where the cutout mask is applied. Since the incoming image will be of
-      uint8 and will not have had any mean normalization applied, by default
-      we set the value to be 128. If replace_with_mean is True then we find
-      the mean pixel values across the channel dimension and use those to fill
-      in where the cutout mask is applied.
-
-  Returns:
-    A tuple. First element is a tensor of the same shape as image that has
-    cutout applied to it. Second element is the bboxes that were passed in
-    that will be unchanged.
-  """
-
-  def apply_bbox_cutout(image, bboxes, pad_fraction):
-    """Applies cutout to a single bounding box within image."""
-    # Choose a single bounding box to apply cutout to.
-    random_index = np.random.randint(0, bboxes.shape[0], dtype=np.int32)
-    # Select the corresponding bbox and apply cutout.
-    chosen_bbox = np.take(bboxes, random_index, axis=0)
-    mask, mean = _cutout_inside_bbox(image, chosen_bbox, pad_fraction)
-
-    # When applying cutout we either set the pixel value to 128 or to the mean
-    # value inside the bbox.
-    replace = mean if replace_with_mean else [128] * 3
-
-    # Apply the cutout mask to the image. Where the mask is 0 we fill it with
-    # `replace`.
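-    # Added note: mask comes back from _cutout_inside_bbox as 1 outside the
-    # cutout window and 0 inside it, and `replace` broadcasts across the
-    # three channels in the np.where below.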
- image = np.where( - np.equal(mask, 0), - np.ones_like( - image, dtype=image.dtype) * replace, - image).astype(image.dtype) - return image - - # Check to see if there are boxes, if so then apply boxcutout. - if len(bboxes) != 0: - image = apply_bbox_cutout(image, bboxes, pad_fraction) - - return image, bboxes - - -NAME_TO_FUNC = { - 'AutoContrast': autocontrast, - 'Equalize': equalize, - 'Posterize': posterize, - 'Solarize': solarize, - 'SolarizeAdd': solarize_add, - 'Color': color, - 'Contrast': contrast, - 'Brightness': brightness, - 'Sharpness': sharpness, - 'Cutout': cutout, - 'BBox_Cutout': bbox_cutout, - 'Rotate_BBox': rotate_with_bboxes, - # pylint:disable=g-long-lambda - 'TranslateX_BBox': lambda image, bboxes, pixels, replace: translate_bbox( - image, bboxes, pixels, replace, shift_horizontal=True), - 'TranslateY_BBox': lambda image, bboxes, pixels, replace: translate_bbox( - image, bboxes, pixels, replace, shift_horizontal=False), - 'ShearX_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( - image, bboxes, level, replace, shear_horizontal=True), - 'ShearY_BBox': lambda image, bboxes, level, replace: shear_with_bboxes( - image, bboxes, level, replace, shear_horizontal=False), - # pylint:enable=g-long-lambda - 'Rotate_Only_BBoxes': rotate_only_bboxes, - 'ShearX_Only_BBoxes': shear_x_only_bboxes, - 'ShearY_Only_BBoxes': shear_y_only_bboxes, - 'TranslateX_Only_BBoxes': translate_x_only_bboxes, - 'TranslateY_Only_BBoxes': translate_y_only_bboxes, - 'Flip_Only_BBoxes': flip_only_bboxes, - 'Solarize_Only_BBoxes': solarize_only_bboxes, - 'Equalize_Only_BBoxes': equalize_only_bboxes, - 'Cutout_Only_BBoxes': cutout_only_bboxes, -} - - -def _randomly_negate_tensor(tensor): - """With 50% prob turn the tensor negative.""" - should_flip = np.floor(np.random.rand() + 0.5) >= 1 - final_tensor = tensor if should_flip else -tensor - return final_tensor - - -def _rotate_level_to_arg(level): - level = (level / _MAX_LEVEL) * 30. - level = _randomly_negate_tensor(level) - return (level, ) - - -def _shrink_level_to_arg(level): - """Converts level to ratio by which we shrink the image content.""" - if level == 0: - return (1.0, ) # if level is zero, do not shrink the image - # Maximum shrinking ratio is 2.9. - level = 2. / (_MAX_LEVEL / level) + 0.9 - return (level, ) - - -def _enhance_level_to_arg(level): - return ((level / _MAX_LEVEL) * 1.8 + 0.1, ) - - -def _shear_level_to_arg(level): - level = (level / _MAX_LEVEL) * 0.3 - # Flip level to negative with 50% chance. - level = _randomly_negate_tensor(level) - return (level, ) - - -def _translate_level_to_arg(level, translate_const): - level = (level / _MAX_LEVEL) * float(translate_const) - # Flip level to negative with 50% chance. 
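-  # Worked example (added note): with _MAX_LEVEL = 10., level = 5 and
-  # translate_const = 250, the magnitude above is 125 pixels before the
-  # random sign flip.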
- level = _randomly_negate_tensor(level) - return (level, ) - - -def _bbox_cutout_level_to_arg(level, hparams): - cutout_pad_fraction = (level / - _MAX_LEVEL) * 0.75 # hparams.cutout_max_pad_fraction - return (cutout_pad_fraction, False) # hparams.cutout_bbox_replace_with_mean - - -def level_to_arg(hparams): - return { - 'AutoContrast': lambda level: (), - 'Equalize': lambda level: (), - 'Posterize': lambda level: (int((level / _MAX_LEVEL) * 4), ), - 'Solarize': lambda level: (int((level / _MAX_LEVEL) * 256), ), - 'SolarizeAdd': lambda level: (int((level / _MAX_LEVEL) * 110), ), - 'Color': _enhance_level_to_arg, - 'Contrast': _enhance_level_to_arg, - 'Brightness': _enhance_level_to_arg, - 'Sharpness': _enhance_level_to_arg, - 'Cutout': - lambda level: (int((level / _MAX_LEVEL) * 100), ), # hparams.cutout_const=100 - # pylint:disable=g-long-lambda - 'BBox_Cutout': lambda level: _bbox_cutout_level_to_arg(level, hparams), - 'TranslateX_BBox': - lambda level: _translate_level_to_arg(level, 250), # hparams.translate_const=250 - 'TranslateY_BBox': - lambda level: _translate_level_to_arg(level, 250), # hparams.translate_cons - # pylint:enable=g-long-lambda - 'ShearX_BBox': _shear_level_to_arg, - 'ShearY_BBox': _shear_level_to_arg, - 'Rotate_BBox': _rotate_level_to_arg, - 'Rotate_Only_BBoxes': _rotate_level_to_arg, - 'ShearX_Only_BBoxes': _shear_level_to_arg, - 'ShearY_Only_BBoxes': _shear_level_to_arg, - # pylint:disable=g-long-lambda - 'TranslateX_Only_BBoxes': - lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const - 'TranslateY_Only_BBoxes': - lambda level: _translate_level_to_arg(level, 120), # hparams.translate_bbox_const - # pylint:enable=g-long-lambda - 'Flip_Only_BBoxes': lambda level: (), - 'Solarize_Only_BBoxes': - lambda level: (int((level / _MAX_LEVEL) * 256), ), - 'Equalize_Only_BBoxes': lambda level: (), - # pylint:disable=g-long-lambda - 'Cutout_Only_BBoxes': - lambda level: (int((level / _MAX_LEVEL) * 50), ), # hparams.cutout_bbox_const - # pylint:enable=g-long-lambda - } - - -def bbox_wrapper(func): - """Adds a bboxes function argument to func and returns unchanged bboxes.""" - - def wrapper(images, bboxes, *args, **kwargs): - return (func(images, *args, **kwargs), bboxes) - - return wrapper - - -def _parse_policy_info(name, prob, level, replace_value, augmentation_hparams): - """Return the function that corresponds to `name` and update `level` param.""" - func = NAME_TO_FUNC[name] - args = level_to_arg(augmentation_hparams)[name](level) - - # Check to see if prob is passed into function. This is used for operations - # where we alter bboxes independently. - # pytype:disable=wrong-arg-types - if 'prob' in inspect.getfullargspec(func)[0]: - args = tuple([prob] + list(args)) - # pytype:enable=wrong-arg-types - - # Add in replace arg if it is required for the function that is being called. - if 'replace' in inspect.getfullargspec(func)[0]: - # Make sure replace is the final argument - assert 'replace' == inspect.getfullargspec(func)[0][-1] - args = tuple(list(args) + [replace_value]) - - # Add bboxes as the second positional argument for the function if it does - # not already exist. 
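-  # Added note: color ops such as Equalize only take an image, so bbox_wrapper
-  # adapts them to the common (image, bboxes, *args) calling convention by
-  # returning the bboxes untouched.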
-  if 'bboxes' not in inspect.getfullargspec(func)[0]:
-    func = bbox_wrapper(func)
-  return (func, prob, args)
-
-
-def _apply_func_with_prob(func, image, args, prob, bboxes):
-  """Apply `func` to image w/ `args` as input with probability `prob`."""
-  assert isinstance(args, tuple)
-  assert 'bboxes' == inspect.getfullargspec(func)[0][1]
-
-  # If prob is a function argument, then this randomness is being handled
-  # inside the function, so make sure it is always called.
-  if 'prob' in inspect.getfullargspec(func)[0]:
-    prob = 1.0
-
-  # Apply the function with probability `prob`.
-  should_apply_op = np.floor(np.random.rand() + prob) >= 1
-  if should_apply_op:
-    augmented_image, augmented_bboxes = func(image, bboxes, *args)
-  else:
-    augmented_image, augmented_bboxes = (image, bboxes)
-  return augmented_image, augmented_bboxes
-
-
-def select_and_apply_random_policy(policies, image, bboxes):
-  """Select a random policy from `policies` and apply it to `image`."""
-  policy_to_select = np.random.randint(0, len(policies), dtype=np.int32)
-  # policy_to_select = 6 # for test
-  for (i, policy) in enumerate(policies):
-    if i == policy_to_select:
-      image, bboxes = policy(image, bboxes)
-  return (image, bboxes)
-
-
-def build_and_apply_nas_policy(policies, image, bboxes, augmentation_hparams):
-  """Build a policy from the given policies passed in and apply to image.
-
-  Args:
-    policies: list of lists of tuples in the form `(func, prob, level)`, `func`
-      is a string name of the augmentation function, `prob` is the probability
-      of applying the `func` operation, `level` is the input argument for
-      `func`.
-    image: numpy array that the resulting policy will be applied to.
-    bboxes: 2D numpy array of the normalized bounding boxes in the image, one
-      (min_y, min_x, max_y, max_x) row per box.
-    augmentation_hparams: Hparams associated with the NAS learned policy.
-
-  Returns:
-    A version of image that now has data augmentation applied to it based on
-    the `policies` passed into the function. Additionally, returns bboxes if
-    a value for them is passed in that is not None
-  """
-  replace_value = [128, 128, 128]
-
-  # func is the string name of the augmentation function, prob is the
-  # probability of applying the operation and level is the parameter
-  # associated with the operation.
-
-  # tf_policies are functions that take in an image and return an augmented
-  # image.
-  tf_policies = []
-  for policy in policies:
-    tf_policy = []
-    # Link string name to the correct python function and make sure the correct
-    # argument is passed into that function.
-    for policy_info in policy:
-      policy_info = list(
-          policy_info) + [replace_value, augmentation_hparams]
-
-      tf_policy.append(_parse_policy_info(*policy_info))
-    # Now build the tf policy that will apply the augmentation procedure
-    # on image.
-    def make_final_policy(tf_policy_):
-      def final_policy(image_, bboxes_):
-        for func, prob, args in tf_policy_:
-          image_, bboxes_ = _apply_func_with_prob(func, image_, args,
-                                                  prob, bboxes_)
-        return image_, bboxes_
-
-      return final_policy
-
-    tf_policies.append(make_final_policy(tf_policy))
-
-  augmented_images, augmented_bboxes = select_and_apply_random_policy(
-      tf_policies, image, bboxes)
-  # If no bounding boxes were specified, then just return the images.
-  return (augmented_images, augmented_bboxes)
-
-
-# TODO(barretzoph): Add in ArXiv link once paper is out.
-def distort_image_with_autoaugment(image, bboxes, augmentation_name):
-  """Applies the AutoAugment policy to `image` and `bboxes`.
-
-  Args:
-    image: `Tensor` of shape [height, width, 3] representing an image.
- bboxes: `Tensor` of shape [N, 4] representing ground truth boxes that are - normalized between [0, 1]. - augmentation_name: The name of the AutoAugment policy to use. The available - options are `v0`, `v1`, `v2`, `v3` and `test`. `v0` is the policy used for - all of the results in the paper and was found to achieve the best results - on the COCO dataset. `v1`, `v2` and `v3` are additional good policies - found on the COCO dataset that have slight variation in what operations - were used during the search procedure along with how many operations are - applied in parallel to a single image (2 vs 3). - - Returns: - A tuple containing the augmented versions of `image` and `bboxes`. - """ - available_policies = { - 'v0': policy_v0, - 'v1': policy_v1, - 'v2': policy_v2, - 'v3': policy_v3, - 'test': policy_vtest - } - if augmentation_name not in available_policies: - raise ValueError('Invalid augmentation_name: {}'.format( - augmentation_name)) - - policy = available_policies[augmentation_name]() - augmentation_hparams = {} - return build_and_apply_nas_policy(policy, image, bboxes, - augmentation_hparams) diff --git a/pdfdet/models/Paddle/ppdet/data/transform/batch_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/batch_operators.py deleted file mode 100644 index f1ea702..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/batch_operators.py +++ /dev/null @@ -1,1532 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import typing - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -import cv2 -import copy -import math -import numpy as np -from .operators import register_op, BaseOperator, Resize -from .op_helper import jaccard_overlap, gaussian2D, gaussian_radius, draw_umich_gaussian -from .atss_assigner import ATSSAssigner -from scipy import ndimage - -from ppdet.modeling import bbox_utils -from ppdet.utils.logger import setup_logger -from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform -logger = setup_logger(__name__) - -__all__ = [ - 'PadBatch', 'BatchRandomResize', 'Gt2YoloTarget', 'Gt2FCOSTarget', - 'Gt2TTFTarget', 'Gt2Solov2Target', 'Gt2SparseTarget', 'PadMaskBatch', - 'Gt2GFLTarget', 'Gt2CenterNetTarget', 'Gt2CenterTrackTarget', 'PadGT', - 'PadRGT', 'BatchRandomResizeForSSOD' -] - - -@register_op -class PadBatch(BaseOperator): - """ - Pad a batch of samples so they can be divisible by a stride. - The layout of each image should be 'CHW'. - Args: - pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure - height and width is divisible by `pad_to_stride`. - """ - - def __init__(self, pad_to_stride=0): - super(PadBatch, self).__init__() - self.pad_to_stride = pad_to_stride - - def __call__(self, samples, context=None): - """ - Args: - samples (list): a batch of sample, each is dict. 
- """ - coarsest_stride = self.pad_to_stride - - # multi scale input is nested list - if isinstance(samples, - typing.Sequence) and len(samples) > 0 and isinstance( - samples[0], typing.Sequence): - inner_samples = samples[0] - else: - inner_samples = samples - - max_shape = np.array( - [data['image'].shape for data in inner_samples]).max(axis=0) - if coarsest_stride > 0: - max_shape[1] = int( - np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride) - max_shape[2] = int( - np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride) - - for data in inner_samples: - im = data['image'] - im_c, im_h, im_w = im.shape[:] - padding_im = np.zeros( - (im_c, max_shape[1], max_shape[2]), dtype=np.float32) - padding_im[:, :im_h, :im_w] = im - data['image'] = padding_im - if 'semantic' in data and data['semantic'] is not None: - semantic = data['semantic'] - padding_sem = np.zeros( - (1, max_shape[1], max_shape[2]), dtype=np.float32) - padding_sem[:, :im_h, :im_w] = semantic - data['semantic'] = padding_sem - if 'gt_segm' in data and data['gt_segm'] is not None: - gt_segm = data['gt_segm'] - padding_segm = np.zeros( - (gt_segm.shape[0], max_shape[1], max_shape[2]), - dtype=np.uint8) - padding_segm[:, :im_h, :im_w] = gt_segm - data['gt_segm'] = padding_segm - - return samples - - -@register_op -class BatchRandomResize(BaseOperator): - """ - Resize image to target size randomly. random target_size and interpolation method - Args: - target_size (int, list, tuple): image target size, if random size is True, must be list or tuple - keep_ratio (bool): whether keep_raio or not, default true - interp (int): the interpolation method - random_size (bool): whether random select target size of image - random_interp (bool): whether random select interpolation method - """ - - def __init__(self, - target_size, - keep_ratio, - interp=cv2.INTER_NEAREST, - random_size=True, - random_interp=False): - super(BatchRandomResize, self).__init__() - self.keep_ratio = keep_ratio - self.interps = [ - cv2.INTER_NEAREST, - cv2.INTER_LINEAR, - cv2.INTER_AREA, - cv2.INTER_CUBIC, - cv2.INTER_LANCZOS4, - ] - self.interp = interp - assert isinstance(target_size, ( - int, Sequence)), "target_size must be int, list or tuple" - if random_size and not isinstance(target_size, list): - raise TypeError( - "Type of target_size is invalid when random_size is True. Must be List, now is {}". 
- format(type(target_size))) - self.target_size = target_size - self.random_size = random_size - self.random_interp = random_interp - - def __call__(self, samples, context=None): - if self.random_size: - index = np.random.choice(len(self.target_size)) - target_size = self.target_size[index] - else: - target_size = self.target_size - - if self.random_interp: - interp = np.random.choice(self.interps) - else: - interp = self.interp - - resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp) - return resizer(samples, context=context) - - -@register_op -class Gt2YoloTarget(BaseOperator): - __shared__ = ['num_classes'] - """ - Generate YOLOv3 targets by groud truth data, this operator is only used in - fine grained YOLOv3 loss mode - """ - - def __init__(self, - anchors, - anchor_masks, - downsample_ratios, - num_classes=80, - iou_thresh=1.): - super(Gt2YoloTarget, self).__init__() - self.anchors = anchors - self.anchor_masks = anchor_masks - self.downsample_ratios = downsample_ratios - self.num_classes = num_classes - self.iou_thresh = iou_thresh - - def __call__(self, samples, context=None): - assert len(self.anchor_masks) == len(self.downsample_ratios), \ - "anchor_masks', and 'downsample_ratios' should have same length." - - h, w = samples[0]['image'].shape[1:3] - an_hw = np.array(self.anchors) / np.array([[w, h]]) - for sample in samples: - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - if 'gt_score' not in sample: - sample['gt_score'] = np.ones( - (gt_bbox.shape[0], 1), dtype=np.float32) - gt_score = sample['gt_score'] - for i, ( - mask, downsample_ratio - ) in enumerate(zip(self.anchor_masks, self.downsample_ratios)): - grid_h = int(h / downsample_ratio) - grid_w = int(w / downsample_ratio) - target = np.zeros( - (len(mask), 6 + self.num_classes, grid_h, grid_w), - dtype=np.float32) - for b in range(gt_bbox.shape[0]): - gx, gy, gw, gh = gt_bbox[b, :] - cls = gt_class[b] - score = gt_score[b] - if gw <= 0. or gh <= 0. or score <= 0.: - continue - - # find best match anchor index - best_iou = 0. - best_idx = -1 - for an_idx in range(an_hw.shape[0]): - iou = jaccard_overlap( - [0., 0., gw, gh], - [0., 0., an_hw[an_idx, 0], an_hw[an_idx, 1]]) - if iou > best_iou: - best_iou = iou - best_idx = an_idx - - gi = int(gx * grid_w) - gj = int(gy * grid_h) - - # gtbox should be regresed in this layes if best match - # anchor index in anchor mask of this layer - if best_idx in mask: - best_n = mask.index(best_idx) - - # x, y, w, h, scale - target[best_n, 0, gj, gi] = gx * grid_w - gi - target[best_n, 1, gj, gi] = gy * grid_h - gj - target[best_n, 2, gj, gi] = np.log( - gw * w / self.anchors[best_idx][0]) - target[best_n, 3, gj, gi] = np.log( - gh * h / self.anchors[best_idx][1]) - target[best_n, 4, gj, gi] = 2.0 - gw * gh - - # objectness record gt_score - target[best_n, 5, gj, gi] = score - - # classification - target[best_n, 6 + cls, gj, gi] = 1. 
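-                        # Added note: along axis 1 each anchor's target is
-                        # laid out as [tx, ty, tw, th, scale, objectness,
-                        # one-hot class scores], which is what the
-                        # fine-grained YOLOv3 loss consumes.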
- - # For non-matched anchors, calculate the target if the iou - # between anchor and gt is larger than iou_thresh - if self.iou_thresh < 1: - for idx, mask_i in enumerate(mask): - if mask_i == best_idx: continue - iou = jaccard_overlap( - [0., 0., gw, gh], - [0., 0., an_hw[mask_i, 0], an_hw[mask_i, 1]]) - if iou > self.iou_thresh and target[idx, 5, gj, - gi] == 0.: - # x, y, w, h, scale - target[idx, 0, gj, gi] = gx * grid_w - gi - target[idx, 1, gj, gi] = gy * grid_h - gj - target[idx, 2, gj, gi] = np.log( - gw * w / self.anchors[mask_i][0]) - target[idx, 3, gj, gi] = np.log( - gh * h / self.anchors[mask_i][1]) - target[idx, 4, gj, gi] = 2.0 - gw * gh - - # objectness record gt_score - target[idx, 5, gj, gi] = score - - # classification - target[idx, 6 + cls, gj, gi] = 1. - sample['target{}'.format(i)] = target - - # remove useless gt_class and gt_score after target calculated - sample.pop('gt_class') - sample.pop('gt_score') - - return samples - - -@register_op -class Gt2FCOSTarget(BaseOperator): - """ - Generate FCOS targets by groud truth data - """ - - def __init__(self, - object_sizes_boundary, - center_sampling_radius, - downsample_ratios, - num_shift=0.5, - multiply_strides_reg_targets=False, - norm_reg_targets=True): - super(Gt2FCOSTarget, self).__init__() - self.center_sampling_radius = center_sampling_radius - self.downsample_ratios = downsample_ratios - self.INF = np.inf - self.object_sizes_boundary = [-1] + object_sizes_boundary + [self.INF] - object_sizes_of_interest = [] - for i in range(len(self.object_sizes_boundary) - 1): - object_sizes_of_interest.append([ - self.object_sizes_boundary[i], self.object_sizes_boundary[i + 1] - ]) - self.object_sizes_of_interest = object_sizes_of_interest - self.num_shift = num_shift - self.multiply_strides_reg_targets = multiply_strides_reg_targets - self.norm_reg_targets = norm_reg_targets - - def _compute_points(self, w, h): - """ - compute the corresponding points in each feature map - :param h: image height - :param w: image width - :return: points from all feature map - """ - locations = [] - for stride in self.downsample_ratios: - shift_x = np.arange(0, w, stride).astype(np.float32) - shift_y = np.arange(0, h, stride).astype(np.float32) - shift_x, shift_y = np.meshgrid(shift_x, shift_y) - shift_x = shift_x.flatten() - shift_y = shift_y.flatten() - location = np.stack( - [shift_x, shift_y], axis=1) + stride * self.num_shift - locations.append(location) - num_points_each_level = [len(location) for location in locations] - locations = np.concatenate(locations, axis=0) - return locations, num_points_each_level - - def _convert_xywh2xyxy(self, gt_bbox, w, h): - """ - convert the bounding box from style xywh to xyxy - :param gt_bbox: bounding boxes normalized into [0, 1] - :param w: image width - :param h: image height - :return: bounding boxes in xyxy style - """ - bboxes = gt_bbox.copy() - bboxes[:, [0, 2]] = bboxes[:, [0, 2]] * w - bboxes[:, [1, 3]] = bboxes[:, [1, 3]] * h - bboxes[:, 2] = bboxes[:, 0] + bboxes[:, 2] - bboxes[:, 3] = bboxes[:, 1] + bboxes[:, 3] - return bboxes - - def _check_inside_boxes_limited(self, gt_bbox, xs, ys, - num_points_each_level): - """ - check if points is within the clipped boxes - :param gt_bbox: bounding boxes - :param xs: horizontal coordinate of points - :param ys: vertical coordinate of points - :return: the mask of points is within gt_box or not - """ - bboxes = np.reshape( - gt_bbox, newshape=[1, gt_bbox.shape[0], gt_bbox.shape[1]]) - bboxes = np.tile(bboxes, reps=[xs.shape[0], 1, 1]) - ct_x = 
(bboxes[:, :, 0] + bboxes[:, :, 2]) / 2 - ct_y = (bboxes[:, :, 1] + bboxes[:, :, 3]) / 2 - beg = 0 - clipped_box = bboxes.copy() - for lvl, stride in enumerate(self.downsample_ratios): - end = beg + num_points_each_level[lvl] - stride_exp = self.center_sampling_radius * stride - clipped_box[beg:end, :, 0] = np.maximum( - bboxes[beg:end, :, 0], ct_x[beg:end, :] - stride_exp) - clipped_box[beg:end, :, 1] = np.maximum( - bboxes[beg:end, :, 1], ct_y[beg:end, :] - stride_exp) - clipped_box[beg:end, :, 2] = np.minimum( - bboxes[beg:end, :, 2], ct_x[beg:end, :] + stride_exp) - clipped_box[beg:end, :, 3] = np.minimum( - bboxes[beg:end, :, 3], ct_y[beg:end, :] + stride_exp) - beg = end - l_res = xs - clipped_box[:, :, 0] - r_res = clipped_box[:, :, 2] - xs - t_res = ys - clipped_box[:, :, 1] - b_res = clipped_box[:, :, 3] - ys - clipped_box_reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) - inside_gt_box = np.min(clipped_box_reg_targets, axis=2) > 0 - return inside_gt_box - - def __call__(self, samples, context=None): - assert len(self.object_sizes_of_interest) == len(self.downsample_ratios), \ - "object_sizes_of_interest', and 'downsample_ratios' should have same length." - - for sample in samples: - im = sample['image'] - bboxes = sample['gt_bbox'] - gt_class = sample['gt_class'] - # calculate the locations - h, w = im.shape[1:3] - points, num_points_each_level = self._compute_points(w, h) - object_scale_exp = [] - for i, num_pts in enumerate(num_points_each_level): - object_scale_exp.append( - np.tile( - np.array([self.object_sizes_of_interest[i]]), - reps=[num_pts, 1])) - object_scale_exp = np.concatenate(object_scale_exp, axis=0) - - gt_area = (bboxes[:, 2] - bboxes[:, 0]) * ( - bboxes[:, 3] - bboxes[:, 1]) - xs, ys = points[:, 0], points[:, 1] - xs = np.reshape(xs, newshape=[xs.shape[0], 1]) - xs = np.tile(xs, reps=[1, bboxes.shape[0]]) - ys = np.reshape(ys, newshape=[ys.shape[0], 1]) - ys = np.tile(ys, reps=[1, bboxes.shape[0]]) - - l_res = xs - bboxes[:, 0] - r_res = bboxes[:, 2] - xs - t_res = ys - bboxes[:, 1] - b_res = bboxes[:, 3] - ys - reg_targets = np.stack([l_res, t_res, r_res, b_res], axis=2) - if self.center_sampling_radius > 0: - is_inside_box = self._check_inside_boxes_limited( - bboxes, xs, ys, num_points_each_level) - else: - is_inside_box = np.min(reg_targets, axis=2) > 0 - # check if the targets is inside the corresponding level - max_reg_targets = np.max(reg_targets, axis=2) - lower_bound = np.tile( - np.expand_dims( - object_scale_exp[:, 0], axis=1), - reps=[1, max_reg_targets.shape[1]]) - high_bound = np.tile( - np.expand_dims( - object_scale_exp[:, 1], axis=1), - reps=[1, max_reg_targets.shape[1]]) - is_match_current_level = \ - (max_reg_targets > lower_bound) & \ - (max_reg_targets < high_bound) - points2gtarea = np.tile( - np.expand_dims( - gt_area, axis=0), reps=[xs.shape[0], 1]) - points2gtarea[is_inside_box == 0] = self.INF - points2gtarea[is_match_current_level == 0] = self.INF - points2min_area = points2gtarea.min(axis=1) - points2min_area_ind = points2gtarea.argmin(axis=1) - labels = gt_class[points2min_area_ind] + 1 - labels[points2min_area == self.INF] = 0 - reg_targets = reg_targets[range(xs.shape[0]), points2min_area_ind] - ctn_targets = np.sqrt((reg_targets[:, [0, 2]].min(axis=1) / \ - reg_targets[:, [0, 2]].max(axis=1)) * \ - (reg_targets[:, [1, 3]].min(axis=1) / \ - reg_targets[:, [1, 3]].max(axis=1))).astype(np.float32) - ctn_targets = np.reshape( - ctn_targets, newshape=[ctn_targets.shape[0], 1]) - ctn_targets[labels <= 0] = 0 - pos_ind = 
np.nonzero(labels != 0)
-            reg_targets_pos = reg_targets[pos_ind[0], :]
-            split_sections = []
-            beg = 0
-            for lvl in range(len(num_points_each_level)):
-                end = beg + num_points_each_level[lvl]
-                split_sections.append(end)
-                beg = end
-            labels_by_level = np.split(labels, split_sections, axis=0)
-            reg_targets_by_level = np.split(reg_targets, split_sections, axis=0)
-            ctn_targets_by_level = np.split(ctn_targets, split_sections, axis=0)
-            for lvl in range(len(self.downsample_ratios)):
-                grid_w = int(np.ceil(w / self.downsample_ratios[lvl]))
-                grid_h = int(np.ceil(h / self.downsample_ratios[lvl]))
-                if self.norm_reg_targets:
-                    if self.multiply_strides_reg_targets:
-                        sample['reg_target{}'.format(lvl)] = np.reshape(
-                            reg_targets_by_level[lvl],
-                            newshape=[grid_h, grid_w, 4])
-                    else:
-                        sample['reg_target{}'.format(lvl)] = \
-                            np.reshape(
-                                reg_targets_by_level[lvl] / \
-                                self.downsample_ratios[lvl],
-                                newshape=[grid_h, grid_w, 4])
-                else:
-                    sample['reg_target{}'.format(lvl)] = np.reshape(
-                        reg_targets_by_level[lvl],
-                        newshape=[grid_h, grid_w, 4])
-                sample['labels{}'.format(lvl)] = np.reshape(
-                    labels_by_level[lvl], newshape=[grid_h, grid_w, 1])
-                sample['centerness{}'.format(lvl)] = np.reshape(
-                    ctn_targets_by_level[lvl], newshape=[grid_h, grid_w, 1])
-
-            sample.pop('is_crowd', None)
-            sample.pop('difficult', None)
-            sample.pop('gt_class', None)
-            sample.pop('gt_bbox', None)
-        return samples
-
-
-@register_op
-class Gt2GFLTarget(BaseOperator):
-    __shared__ = ['num_classes']
-    """
-    Generate GFocal loss targets by ground truth data
-    """
-
-    def __init__(self,
-                 num_classes=80,
-                 downsample_ratios=[8, 16, 32, 64, 128],
-                 grid_cell_scale=4,
-                 cell_offset=0,
-                 compute_vlr_region=False):
-        super(Gt2GFLTarget, self).__init__()
-        self.num_classes = num_classes
-        self.downsample_ratios = downsample_ratios
-        self.grid_cell_scale = grid_cell_scale
-        self.cell_offset = cell_offset
-        self.compute_vlr_region = compute_vlr_region
-
-        self.assigner = ATSSAssigner()
-
-    def get_grid_cells(self, featmap_size, scale, stride, offset=0):
-        """
-        Generate grid cells of a feature map for target assignment.
-        Args:
-            featmap_size: Size of a single level feature map.
-            scale: Grid cell scale.
-            stride: Down sample stride of the feature map.
-            offset: Offset of grid cells.
-        Returns:
-            Grid cells xyxy position. Size should be [feat_w * feat_h, 4]
-        """
-        cell_size = stride * scale
-        h, w = featmap_size
-        x_range = (np.arange(w, dtype=np.float32) + offset) * stride
-        y_range = (np.arange(h, dtype=np.float32) + offset) * stride
-        x, y = np.meshgrid(x_range, y_range)
-        y = y.flatten()
-        x = x.flatten()
-        grid_cells = np.stack(
-            [
-                x - 0.5 * cell_size, y - 0.5 * cell_size, x + 0.5 * cell_size,
-                y + 0.5 * cell_size
-            ],
-            axis=-1)
-        return grid_cells
-
-    def get_sample(self, assign_gt_inds, gt_bboxes):
-        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
-        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
-        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1
-
-        if gt_bboxes.size == 0:
-            # hack for index error case
-            assert pos_assigned_gt_inds.size == 0
-            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
-        else:
-            if len(gt_bboxes.shape) < 2:
-                # reshape, not resize: ndarray.resize works in place and
-                # returns None, which would clobber gt_bboxes here
-                gt_bboxes = gt_bboxes.reshape(-1, 4)
-            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
-        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds
-
-    def __call__(self, samples, context=None):
-        assert len(samples) > 0
-        batch_size = len(samples)
-        # get grid cells of image
-        h, w = samples[0]['image'].shape[1:3]
-        multi_level_grid_cells = []
-        for stride in self.downsample_ratios:
-            featmap_size = (int(math.ceil(h / stride)),
-                            int(math.ceil(w / stride)))
-            multi_level_grid_cells.append(
-                self.get_grid_cells(featmap_size, self.grid_cell_scale, stride,
-                                    self.cell_offset))
-        mlvl_grid_cells_list = [
-            multi_level_grid_cells for i in range(batch_size)
-        ]
-        # pixel cell number of multi-level feature maps
-        num_level_cells = [
-            grid_cells.shape[0] for grid_cells in mlvl_grid_cells_list[0]
-        ]
-        num_level_cells_list = [num_level_cells] * batch_size
-        # concatenate all level cells into a single array
-        for i in range(batch_size):
-            mlvl_grid_cells_list[i] = np.concatenate(mlvl_grid_cells_list[i])
-        # target assign on all images
-        for sample, grid_cells, num_level_cells in zip(
-                samples, mlvl_grid_cells_list, num_level_cells_list):
-            gt_bboxes = sample['gt_bbox']
-            gt_labels = sample['gt_class'].squeeze()
-            if gt_labels.size == 1:
-                gt_labels = np.array([gt_labels]).astype(np.int32)
-            gt_bboxes_ignore = None
-            assign_gt_inds, _ = self.assigner(grid_cells, num_level_cells,
                                              gt_bboxes, gt_bboxes_ignore,
-                                              gt_labels)
-
-            if self.compute_vlr_region:
-                vlr_region = self.assigner.get_vlr_region(
-                    grid_cells, num_level_cells, gt_bboxes, gt_bboxes_ignore,
-                    gt_labels)
-                sample['vlr_regions'] = vlr_region
-
-            pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds = self.get_sample(
-                assign_gt_inds, gt_bboxes)
-
-            num_cells = grid_cells.shape[0]
-            bbox_targets = np.zeros_like(grid_cells)
-            bbox_weights = np.zeros_like(grid_cells)
-            labels = np.ones([num_cells], dtype=np.int64) * self.num_classes
-            label_weights = np.zeros([num_cells], dtype=np.float32)
-
-            if len(pos_inds) > 0:
-                pos_bbox_targets = pos_gt_bboxes
-                bbox_targets[pos_inds, :] = pos_bbox_targets
-                bbox_weights[pos_inds, :] = 1.0
-                if not np.any(gt_labels):
-                    labels[pos_inds] = 0
-                else:
-                    labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
-
-                label_weights[pos_inds] = 1.0
-            if len(neg_inds) > 0:
-                label_weights[neg_inds] = 1.0
-            sample['grid_cells'] = grid_cells
-            sample['labels'] = labels
-            sample['label_weights'] = label_weights
-            sample['bbox_targets'] = bbox_targets
-            sample['pos_num'] = max(pos_inds.size, 1)
-            sample.pop('is_crowd', None)
-            sample.pop('difficult', None)
-            sample.pop('gt_class', None)
-            sample.pop('gt_bbox', None)
-            sample.pop('gt_score', None)
-        return 
samples - - -@register_op -class Gt2TTFTarget(BaseOperator): - __shared__ = ['num_classes'] - """ - Gt2TTFTarget - Generate TTFNet targets by ground truth data - - Args: - num_classes(int): the number of classes. - down_ratio(int): the down ratio from images to heatmap, 4 by default. - alpha(float): the alpha parameter to generate gaussian target. - 0.54 by default. - """ - - def __init__(self, num_classes=80, down_ratio=4, alpha=0.54): - super(Gt2TTFTarget, self).__init__() - self.down_ratio = down_ratio - self.num_classes = num_classes - self.alpha = alpha - - def __call__(self, samples, context=None): - output_size = samples[0]['image'].shape[1] - feat_size = output_size // self.down_ratio - for sample in samples: - heatmap = np.zeros( - (self.num_classes, feat_size, feat_size), dtype='float32') - box_target = np.ones( - (4, feat_size, feat_size), dtype='float32') * -1 - reg_weight = np.zeros((1, feat_size, feat_size), dtype='float32') - - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - - bbox_w = gt_bbox[:, 2] - gt_bbox[:, 0] + 1 - bbox_h = gt_bbox[:, 3] - gt_bbox[:, 1] + 1 - area = bbox_w * bbox_h - boxes_areas_log = np.log(area) - boxes_ind = np.argsort(boxes_areas_log, axis=0)[::-1] - boxes_area_topk_log = boxes_areas_log[boxes_ind] - gt_bbox = gt_bbox[boxes_ind] - gt_class = gt_class[boxes_ind] - - feat_gt_bbox = gt_bbox / self.down_ratio - feat_gt_bbox = np.clip(feat_gt_bbox, 0, feat_size - 1) - feat_hs, feat_ws = (feat_gt_bbox[:, 3] - feat_gt_bbox[:, 1], - feat_gt_bbox[:, 2] - feat_gt_bbox[:, 0]) - - ct_inds = np.stack( - [(gt_bbox[:, 0] + gt_bbox[:, 2]) / 2, - (gt_bbox[:, 1] + gt_bbox[:, 3]) / 2], - axis=1) / self.down_ratio - - h_radiuses_alpha = (feat_hs / 2. * self.alpha).astype('int32') - w_radiuses_alpha = (feat_ws / 2. * self.alpha).astype('int32') - - for k in range(len(gt_bbox)): - cls_id = gt_class[k] - fake_heatmap = np.zeros((feat_size, feat_size), dtype='float32') - self.draw_truncate_gaussian(fake_heatmap, ct_inds[k], - h_radiuses_alpha[k], - w_radiuses_alpha[k]) - - heatmap[cls_id] = np.maximum(heatmap[cls_id], fake_heatmap) - box_target_inds = fake_heatmap > 0 - box_target[:, box_target_inds] = gt_bbox[k][:, None] - - local_heatmap = fake_heatmap[box_target_inds] - ct_div = np.sum(local_heatmap) - local_heatmap *= boxes_area_topk_log[k] - reg_weight[0, box_target_inds] = local_heatmap / ct_div - sample['ttf_heatmap'] = heatmap - sample['ttf_box_target'] = box_target - sample['ttf_reg_weight'] = reg_weight - sample.pop('is_crowd', None) - sample.pop('difficult', None) - sample.pop('gt_class', None) - sample.pop('gt_bbox', None) - sample.pop('gt_score', None) - return samples - - def draw_truncate_gaussian(self, heatmap, center, h_radius, w_radius): - h, w = 2 * h_radius + 1, 2 * w_radius + 1 - sigma_x = w / 6 - sigma_y = h / 6 - gaussian = gaussian2D((h, w), sigma_x, sigma_y) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, w_radius), min(width - x, w_radius + 1) - top, bottom = min(y, h_radius), min(height - y, h_radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[h_radius - top:h_radius + bottom, w_radius - - left:w_radius + right] - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - heatmap[y - top:y + bottom, x - left:x + right] = np.maximum( - masked_heatmap, masked_gaussian) - return heatmap - - -@register_op -class Gt2Solov2Target(BaseOperator): - """Assign mask target and labels in SOLOv2 network. 
- The code of this function is based on: - https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L271 - Args: - num_grids (list): The list of feature map grids size. - scale_ranges (list): The list of mask boundary range. - coord_sigma (float): The coefficient of coordinate area length. - sampling_ratio (float): The ratio of down sampling. - """ - - def __init__(self, - num_grids=[40, 36, 24, 16, 12], - scale_ranges=[[1, 96], [48, 192], [96, 384], [192, 768], - [384, 2048]], - coord_sigma=0.2, - sampling_ratio=4.0): - super(Gt2Solov2Target, self).__init__() - self.num_grids = num_grids - self.scale_ranges = scale_ranges - self.coord_sigma = coord_sigma - self.sampling_ratio = sampling_ratio - - def _scale_size(self, im, scale): - h, w = im.shape[:2] - new_size = (int(w * float(scale) + 0.5), int(h * float(scale) + 0.5)) - resized_img = cv2.resize( - im, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR) - return resized_img - - def __call__(self, samples, context=None): - sample_id = 0 - max_ins_num = [0] * len(self.num_grids) - for sample in samples: - gt_bboxes_raw = sample['gt_bbox'] - gt_labels_raw = sample['gt_class'] + 1 - im_c, im_h, im_w = sample['image'].shape[:] - gt_masks_raw = sample['gt_segm'].astype(np.uint8) - mask_feat_size = [ - int(im_h / self.sampling_ratio), int(im_w / self.sampling_ratio) - ] - gt_areas = np.sqrt((gt_bboxes_raw[:, 2] - gt_bboxes_raw[:, 0]) * - (gt_bboxes_raw[:, 3] - gt_bboxes_raw[:, 1])) - ins_ind_label_list = [] - idx = 0 - for (lower_bound, upper_bound), num_grid \ - in zip(self.scale_ranges, self.num_grids): - - hit_indices = ((gt_areas >= lower_bound) & - (gt_areas <= upper_bound)).nonzero()[0] - num_ins = len(hit_indices) - - ins_label = [] - grid_order = [] - cate_label = np.zeros([num_grid, num_grid], dtype=np.int64) - ins_ind_label = np.zeros([num_grid**2], dtype=np.bool_) - - if num_ins == 0: - ins_label = np.zeros( - [1, mask_feat_size[0], mask_feat_size[1]], - dtype=np.uint8) - ins_ind_label_list.append(ins_ind_label) - sample['cate_label{}'.format(idx)] = cate_label.flatten() - sample['ins_label{}'.format(idx)] = ins_label - sample['grid_order{}'.format(idx)] = np.asarray( - [sample_id * num_grid * num_grid + 0], dtype=np.int32) - idx += 1 - continue - gt_bboxes = gt_bboxes_raw[hit_indices] - gt_labels = gt_labels_raw[hit_indices] - gt_masks = gt_masks_raw[hit_indices, ...] - - half_ws = 0.5 * ( - gt_bboxes[:, 2] - gt_bboxes[:, 0]) * self.coord_sigma - half_hs = 0.5 * ( - gt_bboxes[:, 3] - gt_bboxes[:, 1]) * self.coord_sigma - - for seg_mask, gt_label, half_h, half_w in zip( - gt_masks, gt_labels, half_hs, half_ws): - if seg_mask.sum() == 0: - continue - # mass center - upsampled_size = (mask_feat_size[0] * 4, - mask_feat_size[1] * 4) - center_h, center_w = ndimage.measurements.center_of_mass( - seg_mask) - coord_w = int( - (center_w / upsampled_size[1]) // (1. / num_grid)) - coord_h = int( - (center_h / upsampled_size[0]) // (1. / num_grid)) - - # left, top, right, down - top_box = max(0, - int(((center_h - half_h) / upsampled_size[0]) - // (1. / num_grid))) - down_box = min(num_grid - 1, - int(((center_h + half_h) / upsampled_size[0]) - // (1. / num_grid))) - left_box = max(0, - int(((center_w - half_w) / upsampled_size[1]) - // (1. / num_grid))) - right_box = min(num_grid - 1, - int(((center_w + half_w) / - upsampled_size[1]) // (1. 
/ num_grid))) - - top = max(top_box, coord_h - 1) - down = min(down_box, coord_h + 1) - left = max(coord_w - 1, left_box) - right = min(right_box, coord_w + 1) - - cate_label[top:(down + 1), left:(right + 1)] = gt_label - seg_mask = self._scale_size( - seg_mask, scale=1. / self.sampling_ratio) - for i in range(top, down + 1): - for j in range(left, right + 1): - label = int(i * num_grid + j) - cur_ins_label = np.zeros( - [mask_feat_size[0], mask_feat_size[1]], - dtype=np.uint8) - cur_ins_label[:seg_mask.shape[0], :seg_mask.shape[ - 1]] = seg_mask - ins_label.append(cur_ins_label) - ins_ind_label[label] = True - grid_order.append(sample_id * num_grid * num_grid + - label) - if ins_label == []: - ins_label = np.zeros( - [1, mask_feat_size[0], mask_feat_size[1]], - dtype=np.uint8) - ins_ind_label_list.append(ins_ind_label) - sample['cate_label{}'.format(idx)] = cate_label.flatten() - sample['ins_label{}'.format(idx)] = ins_label - sample['grid_order{}'.format(idx)] = np.asarray( - [sample_id * num_grid * num_grid + 0], dtype=np.int32) - else: - ins_label = np.stack(ins_label, axis=0) - ins_ind_label_list.append(ins_ind_label) - sample['cate_label{}'.format(idx)] = cate_label.flatten() - sample['ins_label{}'.format(idx)] = ins_label - sample['grid_order{}'.format(idx)] = np.asarray( - grid_order, dtype=np.int32) - assert len(grid_order) > 0 - max_ins_num[idx] = max( - max_ins_num[idx], - sample['ins_label{}'.format(idx)].shape[0]) - idx += 1 - ins_ind_labels = np.concatenate([ - ins_ind_labels_level_img - for ins_ind_labels_level_img in ins_ind_label_list - ]) - fg_num = np.sum(ins_ind_labels) - sample['fg_num'] = fg_num - sample_id += 1 - - sample.pop('is_crowd') - sample.pop('gt_class') - sample.pop('gt_bbox') - sample.pop('gt_poly') - sample.pop('gt_segm') - - # padding batch - for data in samples: - for idx in range(len(self.num_grids)): - gt_ins_data = np.zeros( - [ - max_ins_num[idx], - data['ins_label{}'.format(idx)].shape[1], - data['ins_label{}'.format(idx)].shape[2] - ], - dtype=np.uint8) - gt_ins_data[0:data['ins_label{}'.format(idx)].shape[ - 0], :, :] = data['ins_label{}'.format(idx)] - gt_grid_order = np.zeros([max_ins_num[idx]], dtype=np.int32) - gt_grid_order[0:data['grid_order{}'.format(idx)].shape[ - 0]] = data['grid_order{}'.format(idx)] - data['ins_label{}'.format(idx)] = gt_ins_data - data['grid_order{}'.format(idx)] = gt_grid_order - - return samples - - -@register_op -class Gt2SparseTarget(BaseOperator): - def __init__(self, use_padding_shape=False): - super(Gt2SparseTarget, self).__init__() - self.use_padding_shape = use_padding_shape - - def __call__(self, samples, context=None): - for sample in samples: - ori_h, ori_w = sample['h'], sample['w'] - if self.use_padding_shape: - h, w = sample["image"].shape[1:3] - if "scale_factor" in sample: - sf_w, sf_h = sample["scale_factor"][1], sample[ - "scale_factor"][0] - sample["scale_factor_whwh"] = np.array( - [sf_w, sf_h, sf_w, sf_h], dtype=np.float32) - else: - sample["scale_factor_whwh"] = np.array( - [1.0, 1.0, 1.0, 1.0], dtype=np.float32) - else: - h, w = round(sample['im_shape'][0]), round(sample['im_shape'][ - 1]) - sample["scale_factor_whwh"] = np.array( - [w / ori_w, h / ori_h, w / ori_w, h / ori_h], - dtype=np.float32) - - sample["img_whwh"] = np.array([w, h, w, h], dtype=np.float32) - sample["ori_shape"] = np.array([ori_h, ori_w], dtype=np.int32) - - return samples - - -@register_op -class PadMaskBatch(BaseOperator): - """ - Pad a batch of samples so that they can be divisible by a stride. 
-    The layout of each image should be 'CHW'.
-    Args:
-        pad_to_stride (int): If `pad_to_stride > 0`, pad zeros to ensure
-            height and width are divisible by `pad_to_stride`.
-        return_pad_mask (bool): If `return_pad_mask = True`, return
-            `pad_mask` for transformer.
-    """
-
-    def __init__(self, pad_to_stride=0, return_pad_mask=True):
-        super(PadMaskBatch, self).__init__()
-        self.pad_to_stride = pad_to_stride
-        self.return_pad_mask = return_pad_mask
-
-    def __call__(self, samples, context=None):
-        """
-        Args:
-            samples (list): a batch of samples, each is a dict.
-        """
-        coarsest_stride = self.pad_to_stride
-
-        max_shape = np.array([data['image'].shape for data in samples]).max(
-            axis=0)
-        if coarsest_stride > 0:
-            max_shape[1] = int(
-                np.ceil(max_shape[1] / coarsest_stride) * coarsest_stride)
-            max_shape[2] = int(
-                np.ceil(max_shape[2] / coarsest_stride) * coarsest_stride)
-
-        for data in samples:
-            im = data['image']
-            im_c, im_h, im_w = im.shape[:]
-            padding_im = np.zeros(
-                (im_c, max_shape[1], max_shape[2]), dtype=np.float32)
-            padding_im[:, :im_h, :im_w] = im.astype(np.float32)
-            data['image'] = padding_im
-            if 'semantic' in data and data['semantic'] is not None:
-                semantic = data['semantic']
-                padding_sem = np.zeros(
-                    (1, max_shape[1], max_shape[2]), dtype=np.float32)
-                padding_sem[:, :im_h, :im_w] = semantic
-                data['semantic'] = padding_sem
-            if 'gt_segm' in data and data['gt_segm'] is not None:
-                gt_segm = data['gt_segm']
-                padding_segm = np.zeros(
-                    (gt_segm.shape[0], max_shape[1], max_shape[2]),
-                    dtype=np.uint8)
-                padding_segm[:, :im_h, :im_w] = gt_segm
-                data['gt_segm'] = padding_segm
-            if self.return_pad_mask:
-                padding_mask = np.zeros(
-                    (max_shape[1], max_shape[2]), dtype=np.float32)
-                padding_mask[:im_h, :im_w] = 1.
-                data['pad_mask'] = padding_mask
-
-        return samples
-
-
-@register_op
-class Gt2CenterNetTarget(BaseOperator):
-    __shared__ = ['num_classes']
-    """Gt2CenterNetTarget
-    Generate CenterNet targets by ground-truth
-    Args:
-        down_ratio (int): The down sample ratio between output feature and
-            input image.
-        num_classes (int): The number of classes, 80 by default.
-        max_objs (int): The maximum objects detected, 128 by default.
- """ - - def __init__(self, num_classes=80, down_ratio=4, max_objs=128): - super(Gt2CenterNetTarget, self).__init__() - self.nc = num_classes - self.down_ratio = down_ratio - self.max_objs = max_objs - - def __call__(self, sample, context=None): - input_h, input_w = sample['image'].shape[1:] - output_h = input_h // self.down_ratio - output_w = input_w // self.down_ratio - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - - hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) - wh = np.zeros((self.max_objs, 2), dtype=np.float32) - reg = np.zeros((self.max_objs, 2), dtype=np.float32) - ind = np.zeros((self.max_objs), dtype=np.int64) - reg_mask = np.zeros((self.max_objs), dtype=np.int32) - cat_spec_wh = np.zeros((self.max_objs, self.nc * 2), dtype=np.float32) - cat_spec_mask = np.zeros((self.max_objs, self.nc * 2), dtype=np.int32) - - trans_output = get_affine_transform( - center=sample['center'], - input_size=[sample['scale'], sample['scale']], - rot=0, - output_size=[output_w, output_h]) - - gt_det = [] - for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): - cls = int(cls) - bbox[:2] = affine_transform(bbox[:2], trans_output) - bbox[2:] = affine_transform(bbox[2:], trans_output) - bbox_amodal = copy.deepcopy(bbox) - bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) - bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) - h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] - if h > 0 and w > 0: - radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) - radius = max(0, int(radius)) - ct = np.array( - [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], - dtype=np.float32) - ct_int = ct.astype(np.int32) - - # get hm,wh,reg,ind,ind_mask - draw_umich_gaussian(hm[cls], ct_int, radius) - wh[i] = 1. * w, 1. * h - reg[i] = ct - ct_int - ind[i] = ct_int[1] * output_w + ct_int[0] - reg_mask[i] = 1 - cat_spec_wh[i, cls * 2:cls * 2 + 2] = wh[i] - cat_spec_mask[i, cls * 2:cls * 2 + 2] = 1 - gt_det.append([ - ct[0] - w / 2, ct[1] - h / 2, ct[0] + w / 2, ct[1] + h / 2, - 1, cls - ]) - - sample.pop('gt_bbox', None) - sample.pop('gt_class', None) - sample.pop('center', None) - sample.pop('scale', None) - sample.pop('is_crowd', None) - sample.pop('difficult', None) - - sample['index'] = ind - sample['index_mask'] = reg_mask - sample['heatmap'] = hm - sample['size'] = wh - sample['offset'] = reg - return sample - - -@register_op -class PadGT(BaseOperator): - """ - Pad 0 to `gt_class`, `gt_bbox`, `gt_score`... - The num_max_boxes is the largest for batch. - Args: - return_gt_mask (bool): If true, return `pad_gt_mask`, - 1 means bbox, 0 means no bbox. - """ - - def __init__(self, return_gt_mask=True, pad_img=False, minimum_gtnum=0): - super(PadGT, self).__init__() - self.return_gt_mask = return_gt_mask - self.pad_img = pad_img - self.minimum_gtnum = minimum_gtnum - - def _impad(self, - img: np.ndarray, - *, - shape=None, - padding=None, - pad_val=0, - padding_mode='constant') -> np.ndarray: - """Pad the given image to a certain shape or pad on all sides with - specified padding mode and padding value. - - Args: - img (ndarray): Image to be padded. - shape (tuple[int]): Expected padding shape (h, w). Default: None. - padding (int or tuple[int]): Padding on each border. If a single int is - provided this is used to pad all borders. If tuple of length 2 is - provided this is the padding on left/right and top/bottom - respectively. If a tuple of length 4 is provided this is the - padding for the left, top, right and bottom borders respectively. - Default: None. 
Note that `shape` and `padding` can not be both - set. - pad_val (Number | Sequence[Number]): Values to be filled in padding - areas when padding_mode is 'constant'. Default: 0. - padding_mode (str): Type of padding. Should be: constant, edge, - reflect or symmetric. Default: constant. - - constant: pads with a constant value, this value is specified - with pad_val. - - edge: pads with the last value at the edge of the image. - - reflect: pads with reflection of image without repeating the last - value on the edge. For example, padding [1, 2, 3, 4] with 2 - elements on both sides in reflect mode will result in - [3, 2, 1, 2, 3, 4, 3, 2]. - - symmetric: pads with reflection of image repeating the last value - on the edge. For example, padding [1, 2, 3, 4] with 2 elements on - both sides in symmetric mode will result in - [2, 1, 1, 2, 3, 4, 4, 3] - - Returns: - ndarray: The padded image. - """ - - assert (shape is not None) ^ (padding is not None) - if shape is not None: - width = max(shape[1] - img.shape[1], 0) - height = max(shape[0] - img.shape[0], 0) - padding = (0, 0, int(width), int(height)) - - # check pad_val - import numbers - if isinstance(pad_val, tuple): - assert len(pad_val) == img.shape[-1] - elif not isinstance(pad_val, numbers.Number): - raise TypeError('pad_val must be a int or a tuple. ' - f'But received {type(pad_val)}') - - # check padding - if isinstance(padding, tuple) and len(padding) in [2, 4]: - if len(padding) == 2: - padding = (padding[0], padding[1], padding[0], padding[1]) - elif isinstance(padding, numbers.Number): - padding = (padding, padding, padding, padding) - else: - raise ValueError('Padding must be a int or a 2, or 4 element tuple.' - f'But received {padding}') - - # check padding mode - assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] - - border_type = { - 'constant': cv2.BORDER_CONSTANT, - 'edge': cv2.BORDER_REPLICATE, - 'reflect': cv2.BORDER_REFLECT_101, - 'symmetric': cv2.BORDER_REFLECT - } - img = cv2.copyMakeBorder( - img, - padding[1], - padding[3], - padding[0], - padding[2], - border_type[padding_mode], - value=pad_val) - - return img - - def checkmaxshape(self, samples): - maxh, maxw = 0, 0 - for sample in samples: - h, w = sample['im_shape'] - if h > maxh: - maxh = h - if w > maxw: - maxw = w - return (maxh, maxw) - - def __call__(self, samples, context=None): - num_max_boxes = max([len(s['gt_bbox']) for s in samples]) - num_max_boxes = max(self.minimum_gtnum, num_max_boxes) - if self.pad_img: - maxshape = self.checkmaxshape(samples) - for sample in samples: - if self.pad_img: - img = sample['image'] - padimg = self._impad(img, shape=maxshape) - sample['image'] = padimg - if self.return_gt_mask: - sample['pad_gt_mask'] = np.zeros( - (num_max_boxes, 1), dtype=np.float32) - if num_max_boxes == 0: - continue - - num_gt = len(sample['gt_bbox']) - pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32) - pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32) - if num_gt > 0: - pad_gt_class[:num_gt] = sample['gt_class'] - pad_gt_bbox[:num_gt] = sample['gt_bbox'] - sample['gt_class'] = pad_gt_class - sample['gt_bbox'] = pad_gt_bbox - # pad_gt_mask - if 'pad_gt_mask' in sample: - sample['pad_gt_mask'][:num_gt] = 1 - # gt_score - if 'gt_score' in sample: - pad_gt_score = np.zeros((num_max_boxes, 1), dtype=np.float32) - if num_gt > 0: - pad_gt_score[:num_gt] = sample['gt_score'] - sample['gt_score'] = pad_gt_score - if 'is_crowd' in sample: - pad_is_crowd = np.zeros((num_max_boxes, 1), dtype=np.int32) - if num_gt > 0: - 
pad_is_crowd[:num_gt] = sample['is_crowd']
-                sample['is_crowd'] = pad_is_crowd
-            if 'difficult' in sample:
-                pad_diff = np.zeros((num_max_boxes, 1), dtype=np.int32)
-                if num_gt > 0:
-                    pad_diff[:num_gt] = sample['difficult']
-                sample['difficult'] = pad_diff
-            if 'gt_joints' in sample:
-                num_joints = sample['gt_joints'].shape[1]
-                pad_gt_joints = np.zeros(
-                    (num_max_boxes, num_joints, 3), dtype=np.float32)
-                if num_gt > 0:
-                    pad_gt_joints[:num_gt] = sample['gt_joints']
-                sample['gt_joints'] = pad_gt_joints
-            if 'gt_areas' in sample:
-                pad_gt_areas = np.zeros((num_max_boxes, 1), dtype=np.float32)
-                if num_gt > 0:
-                    pad_gt_areas[:num_gt, 0] = sample['gt_areas']
-                sample['gt_areas'] = pad_gt_areas
-        return samples
-
-
-@register_op
-class PadRGT(BaseOperator):
-    """
-    Pad 0 to `gt_class`, `gt_bbox`, `gt_score`...
-    num_max_boxes is the largest number of boxes in the batch.
-    Args:
-        return_gt_mask (bool): If true, return `pad_gt_mask`,
-                                1 means bbox, 0 means no bbox.
-    """
-
-    def __init__(self, return_gt_mask=True):
-        super(PadRGT, self).__init__()
-        self.return_gt_mask = return_gt_mask
-
-    def pad_field(self, sample, field, num_gt):
-        name, shape, dtype = field
-        if name in sample:
-            pad_v = np.zeros(shape, dtype=dtype)
-            if num_gt > 0:
-                pad_v[:num_gt] = sample[name]
-            sample[name] = pad_v
-
-    def __call__(self, samples, context=None):
-        num_max_boxes = max([len(s['gt_bbox']) for s in samples])
-        for sample in samples:
-            if self.return_gt_mask:
-                sample['pad_gt_mask'] = np.zeros(
-                    (num_max_boxes, 1), dtype=np.float32)
-            if num_max_boxes == 0:
-                continue
-
-            num_gt = len(sample['gt_bbox'])
-            pad_gt_class = np.zeros((num_max_boxes, 1), dtype=np.int32)
-            pad_gt_bbox = np.zeros((num_max_boxes, 4), dtype=np.float32)
-            if num_gt > 0:
-                pad_gt_class[:num_gt] = sample['gt_class']
-                pad_gt_bbox[:num_gt] = sample['gt_bbox']
-            sample['gt_class'] = pad_gt_class
-            sample['gt_bbox'] = pad_gt_bbox
-            # pad_gt_mask
-            if 'pad_gt_mask' in sample:
-                sample['pad_gt_mask'][:num_gt] = 1
-            # gt_score
-            names = ['gt_score', 'is_crowd', 'difficult', 'gt_poly', 'gt_rbox']
-            dims = [1, 1, 1, 8, 5]
-            dtypes = [np.float32, np.int32, np.int32, np.float32, np.float32]
-
-            for name, dim, dtype in zip(names, dims, dtypes):
-                self.pad_field(sample, [name, (num_max_boxes, dim), dtype],
-                               num_gt)
-
-        return samples
-
-
-@register_op
-class Gt2CenterTrackTarget(BaseOperator):
-    __shared__ = ['num_classes']
-    """Gt2CenterTrackTarget
-    Generate CenterTrack targets by ground-truth
-    Args:
-        num_classes (int): The number of classes, 1 by default.
-        down_ratio (int): The down sample ratio between output feature and
-            input image.
-        max_objs (int): The maximum objects detected, 256 by default.
-    """
-
-    def __init__(self,
-                 num_classes=1,
-                 down_ratio=4,
-                 max_objs=256,
-                 hm_disturb=0.05,
-                 lost_disturb=0.4,
-                 fp_disturb=0.1,
-                 pre_hm=True,
-                 add_tracking=True,
-                 add_ltrb_amodal=True):
-        super(Gt2CenterTrackTarget, self).__init__()
-        self.nc = num_classes
-        self.down_ratio = down_ratio
-        self.max_objs = max_objs
-
-        self.hm_disturb = hm_disturb
-        self.lost_disturb = lost_disturb
-        self.fp_disturb = fp_disturb
-        self.pre_hm = pre_hm
-        self.add_tracking = add_tracking
-        self.add_ltrb_amodal = add_ltrb_amodal
-
-    def _get_pre_dets(self, input_h, input_w, trans_input_pre, gt_bbox_pre,
-                      gt_class_pre, gt_track_id_pre):
-        hm_h, hm_w = input_h, input_w
-        return_hm = self.pre_hm
-        pre_hm = np.zeros(
-            (1, hm_h, hm_w), dtype=np.float32) if return_hm else None
-        pre_cts, track_ids = [], []
-
-        for i, (
-                bbox, cls, track_id
-        ) in enumerate(zip(gt_bbox_pre, gt_class_pre, gt_track_id_pre)):
-            cls = int(cls)
-            bbox[:2] = affine_transform(bbox[:2], trans_input_pre)
-            bbox[2:] = affine_transform(bbox[2:], trans_input_pre)
-            bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, hm_w - 1)
-            bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, hm_h - 1)
-            h, w = bbox[3] - bbox[1], bbox[2] - bbox[0]
-            max_rad = 1
-            if (h > 0 and w > 0):
-                radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
-                radius = max(0, int(radius))
-                max_rad = max(max_rad, radius)
-                ct = np.array(
-                    [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2],
-                    dtype=np.float32)
-                ct0 = ct.copy()
-                conf = 1
-
-                ct[0] = ct[0] + np.random.randn() * self.hm_disturb * w
-                ct[1] = ct[1] + np.random.randn() * self.hm_disturb * h
-                conf = 1 if np.random.rand() > self.lost_disturb else 0
-
-                ct_int = ct.astype(np.int32)
-                if conf == 0:
-                    pre_cts.append(ct / self.down_ratio)
-                else:
-                    pre_cts.append(ct0 / self.down_ratio)
-
-                track_ids.append(track_id)
-                if return_hm:
-                    draw_umich_gaussian(pre_hm[0], ct_int, radius, k=conf)
-
-                if np.random.rand() < self.fp_disturb and return_hm:
-                    ct2 = ct0.copy()
-                    # Hard-coded heatmap disturb ratio; other values have not been tried.
- ct2[0] = ct2[0] + np.random.randn() * 0.05 * w - ct2[1] = ct2[1] + np.random.randn() * 0.05 * h - ct2_int = ct2.astype(np.int32) - draw_umich_gaussian(pre_hm[0], ct2_int, radius, k=conf) - return pre_hm, pre_cts, track_ids - - def __call__(self, sample, context=None): - input_h, input_w = sample['image'].shape[1:] - output_h = input_h // self.down_ratio - output_w = input_w // self.down_ratio - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - - # init - hm = np.zeros((self.nc, output_h, output_w), dtype=np.float32) - wh = np.zeros((self.max_objs, 2), dtype=np.float32) - reg = np.zeros((self.max_objs, 2), dtype=np.float32) - ind = np.zeros((self.max_objs), dtype=np.int64) - reg_mask = np.zeros((self.max_objs), dtype=np.int32) - if self.add_tracking: - tr = np.zeros((self.max_objs, 2), dtype=np.float32) - if self.add_ltrb_amodal: - ltrb_amodal = np.zeros((self.max_objs, 4), dtype=np.float32) - - trans_output = get_affine_transform( - center=sample['center'], - input_size=[sample['scale'], sample['scale']], - rot=0, - output_size=[output_w, output_h]) - - pre_hm, pre_cts, track_ids = self._get_pre_dets( - input_h, input_w, sample['trans_input'], sample['pre_gt_bbox'], - sample['pre_gt_class'], sample['pre_gt_track_id']) - - for i, (bbox, cls) in enumerate(zip(gt_bbox, gt_class)): - cls = int(cls) - rect = np.array( - [[bbox[0], bbox[1]], [bbox[0], bbox[3]], [bbox[2], bbox[3]], - [bbox[2], bbox[1]]], - dtype=np.float32) - for t in range(4): - rect[t] = affine_transform(rect[t], trans_output) - bbox[:2] = rect[:, 0].min(), rect[:, 1].min() - bbox[2:] = rect[:, 0].max(), rect[:, 1].max() - - bbox_amodal = copy.deepcopy(bbox) - bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, output_w - 1) - bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, output_h - 1) - - h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] - if h > 0 and w > 0: - radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7) - radius = max(0, int(radius)) - ct = np.array( - [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], - dtype=np.float32) - ct_int = ct.astype(np.int32) - - # get hm,wh,reg,ind,ind_mask - draw_umich_gaussian(hm[cls], ct_int, radius) - wh[i] = 1. * w, 1. * h - reg[i] = ct - ct_int - ind[i] = ct_int[1] * output_w + ct_int[0] - reg_mask[i] = 1 - if self.add_tracking: - if sample['gt_track_id'][i] in track_ids: - pre_ct = pre_cts[track_ids.index(sample['gt_track_id'][ - i])] - tr[i] = pre_ct - ct_int - - if self.add_ltrb_amodal: - ltrb_amodal[i] = \ - bbox_amodal[0] - ct_int[0], bbox_amodal[1] - ct_int[1], \ - bbox_amodal[2] - ct_int[0], bbox_amodal[3] - ct_int[1] - - new_sample = {'image': sample['image']} - new_sample['index'] = ind - new_sample['index_mask'] = reg_mask - new_sample['heatmap'] = hm - new_sample['size'] = wh - new_sample['offset'] = reg - if self.add_tracking: - new_sample['tracking'] = tr - if self.add_ltrb_amodal: - new_sample['ltrb_amodal'] = ltrb_amodal - - new_sample['pre_image'] = sample['pre_image'] - new_sample['pre_hm'] = pre_hm - - del sample - return new_sample - - -@register_op -class BatchRandomResizeForSSOD(BaseOperator): - """ - Resize image to target size randomly. 
The target size and interpolation method are selected at random.
-    Args:
-        target_size (int, list, tuple): image target size, if random size is True, must be list or tuple
-        keep_ratio (bool): whether to keep the aspect ratio or not, default true
-        interp (int): the interpolation method
-        random_size (bool): whether to randomly select the target size of the image
-        random_interp (bool): whether to randomly select the interpolation method
-    """
-
-    def __init__(self,
-                 target_size,
-                 keep_ratio,
-                 interp=cv2.INTER_NEAREST,
-                 random_size=True,
-                 random_interp=False):
-        super(BatchRandomResizeForSSOD, self).__init__()
-        self.keep_ratio = keep_ratio
-        self.interps = [
-            cv2.INTER_NEAREST,
-            cv2.INTER_LINEAR,
-            cv2.INTER_AREA,
-            cv2.INTER_CUBIC,
-            cv2.INTER_LANCZOS4,
-        ]
-        self.interp = interp
-        assert isinstance(target_size, (
-            int, Sequence)), "target_size must be int, list or tuple"
-        if random_size and not isinstance(target_size, list):
-            raise TypeError(
-                "Type of target_size is invalid when random_size is True. Must be List, now is {}".
-                format(type(target_size)))
-        self.target_size = target_size
-        self.random_size = random_size
-        self.random_interp = random_interp
-
-    def __call__(self, samples, context=None):
-        index = 0  # default, so the returned index is defined when random_size is False
-        if self.random_size:
-            index = np.random.choice(len(self.target_size))
-            target_size = self.target_size[index]
-        else:
-            target_size = self.target_size
-        if context is not None:
-            target_size = self.target_size[context]
-        if self.random_interp:
-            interp = np.random.choice(self.interps)
-        else:
-            interp = self.interp
-
-        resizer = Resize(target_size, keep_ratio=self.keep_ratio, interp=interp)
-        return [resizer(samples, context=context), index]
diff --git a/pdfdet/models/Paddle/ppdet/data/transform/culane_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/culane_operators.py
deleted file mode 100644
index 4790435..0000000
--- a/pdfdet/models/Paddle/ppdet/data/transform/culane_operators.py
+++ /dev/null
@@ -1,366 +0,0 @@
-import numpy as np
-import imgaug.augmenters as iaa
-from .operators import BaseOperator, register_op
-from ppdet.utils.logger import setup_logger
-from ppdet.data.culane_utils import linestrings_to_lanes, transform_annotation
-
-logger = setup_logger(__name__)
-
-__all__ = [
-    "CULaneTrainProcess", "CULaneDataProcess", "HorizontalFlip",
-    "ChannelShuffle", "CULaneAffine", "CULaneResize", "OneOfBlur",
-    "MultiplyAndAddToBrightness", "AddToHueAndSaturation"
-]
-
-
-def trainTransforms(img_h, img_w):
-    transforms = [{
-        'name': 'Resize',
-        'parameters': dict(size=dict(
-            height=img_h, width=img_w)),
-        'p': 1.0
-    }, {
-        'name': 'HorizontalFlip',
-        'parameters': dict(p=1.0),
-        'p': 0.5
-    }, {
-        'name': 'ChannelShuffle',
-        'parameters': dict(p=1.0),
-        'p': 0.1
-    }, {
-        'name': 'MultiplyAndAddToBrightness',
-        'parameters': dict(
-            mul=(0.85, 1.15), add=(-10, 10)),
-        'p': 0.6
-    }, {
-        'name': 'AddToHueAndSaturation',
-        'parameters': dict(value=(-10, 10)),
-        'p': 0.7
-    }, {
-        'name': 'OneOf',
-        'transforms': [
-            dict(
-                name='MotionBlur', parameters=dict(k=(3, 5))), dict(
-                    name='MedianBlur', parameters=dict(k=(3, 5)))
-        ],
-        'p': 0.2
-    }, {
-        'name': 'Affine',
-        'parameters': dict(
-            translate_percent=dict(
-                x=(-0.1, 0.1), y=(-0.1, 0.1)),
-            rotate=(-10, 10),
-            scale=(0.8, 1.2)),
-        'p': 0.7
-    }, {
-        'name': 'Resize',
-        'parameters': dict(size=dict(
-            height=img_h, width=img_w)),
-        'p': 1.0
-    }]
-    return transforms
-
-
-@register_op
-class CULaneTrainProcess(BaseOperator):
-    def __init__(self, img_w, img_h):
-        super(CULaneTrainProcess, self).__init__()
-        self.img_w = img_w
-        self.img_h = img_h
-        self.transforms = 
trainTransforms(self.img_h, self.img_w)
-
-        if self.transforms is not None:
-            img_transforms = []
-            for aug in self.transforms:
-                p = aug['p']
-                if aug['name'] != 'OneOf':
-                    img_transforms.append(
-                        iaa.Sometimes(
-                            p=p,
-                            then_list=getattr(iaa, aug['name'])(**aug[
-                                'parameters'])))
-                else:
-                    img_transforms.append(
-                        iaa.Sometimes(
-                            p=p,
-                            then_list=iaa.OneOf([
-                                getattr(iaa, aug_['name'])(**aug_['parameters'])
-                                for aug_ in aug['transforms']
-                            ])))
-        else:
-            img_transforms = []
-        self.iaa_transform = iaa.Sequential(img_transforms)
-
-    def apply(self, sample, context=None):
-        img, line_strings, seg = self.iaa_transform(
-            image=sample['image'],
-            line_strings=sample['lanes'],
-            segmentation_maps=sample['mask'])
-        sample['image'] = img
-        sample['lanes'] = line_strings
-        sample['mask'] = seg
-        return sample
-
-
-@register_op
-class CULaneDataProcess(BaseOperator):
-    def __init__(self, img_w, img_h, num_points, max_lanes):
-        super(CULaneDataProcess, self).__init__()
-        self.img_w = img_w
-        self.img_h = img_h
-        self.num_points = num_points
-        self.n_offsets = num_points
-        self.n_strips = num_points - 1
-        self.strip_size = self.img_h / self.n_strips
-
-        self.max_lanes = max_lanes
-        self.offsets_ys = np.arange(self.img_h, -1, -self.strip_size)
-
-    def apply(self, sample, context=None):
-        data = {}
-        line_strings = sample['lanes']
-        line_strings.clip_out_of_image_()
-        new_anno = {'lanes': linestrings_to_lanes(line_strings)}
-
-        for i in range(30):
-            try:
-                annos = transform_annotation(
-                    self.img_w, self.img_h, self.max_lanes, self.n_offsets,
-                    self.offsets_ys, self.n_strips, self.strip_size, new_anno)
-                label = annos['label']
-                lane_endpoints = annos['lane_endpoints']
-                break
-            except Exception:  # bare except would also swallow KeyboardInterrupt
-                if (i + 1) == 30:
-                    logger.critical('Transform annotation failed 30 times :(')
-                    exit()
-
-        sample['image'] = sample['image'].astype(np.float32) / 255.
- data['image'] = sample['image'].transpose(2, 0, 1) - data['lane_line'] = label - data['seg'] = sample['seg'] - data['full_img_path'] = sample['full_img_path'] - data['img_name'] = sample['img_name'] - data['im_id'] = sample['im_id'] - - if 'mask' in sample.keys(): - data['seg'] = sample['mask'].get_arr() - - data['im_shape'] = np.array([self.img_w, self.img_h], dtype=np.float32) - data['scale_factor'] = np.array([1., 1.], dtype=np.float32) - - return data - - -@register_op -class CULaneResize(BaseOperator): - def __init__(self, img_h, img_w, prob=0.5): - super(CULaneResize, self).__init__() - self.img_h = img_h - self.img_w = img_w - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes(self.prob, - iaa.Resize({ - "height": self.img_h, - "width": self.img_w - })) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'].copy().astype(np.uint8), - line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class HorizontalFlip(BaseOperator): - def __init__(self, prob=0.5): - super(HorizontalFlip, self).__init__() - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes(self.prob, iaa.HorizontalFlip(1.0)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class ChannelShuffle(BaseOperator): - def __init__(self, prob=0.1): - super(ChannelShuffle, self).__init__() - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes(self.prob, iaa.ChannelShuffle(1.0)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class MultiplyAndAddToBrightness(BaseOperator): - def __init__(self, mul=(0.85, 1.15), add=(-10, 10), prob=0.5): - super(MultiplyAndAddToBrightness, self).__init__() - self.mul = tuple(mul) - self.add = tuple(add) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, - iaa.MultiplyAndAddToBrightness( - mul=self.mul, add=self.add)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class AddToHueAndSaturation(BaseOperator): - def __init__(self, value=(-10, 10), prob=0.5): - super(AddToHueAndSaturation, self).__init__() - 
self.value = tuple(value) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, iaa.AddToHueAndSaturation(value=self.value)) - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class OneOfBlur(BaseOperator): - def __init__(self, MotionBlur_k=(3, 5), MedianBlur_k=(3, 5), prob=0.5): - super(OneOfBlur, self).__init__() - self.MotionBlur_k = tuple(MotionBlur_k) - self.MedianBlur_k = tuple(MedianBlur_k) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, - iaa.OneOf([ - iaa.MotionBlur(k=self.MotionBlur_k), - iaa.MedianBlur(k=self.MedianBlur_k) - ])) - - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample - - -@register_op -class CULaneAffine(BaseOperator): - def __init__(self, - translate_percent_x=(-0.1, 0.1), - translate_percent_y=(-0.1, 0.1), - rotate=(3, 5), - scale=(0.8, 1.2), - prob=0.5): - super(CULaneAffine, self).__init__() - self.translate_percent = { - 'x': tuple(translate_percent_x), - 'y': tuple(translate_percent_y) - } - self.rotate = tuple(rotate) - self.scale = tuple(scale) - self.prob = prob - - def apply(self, sample, context=None): - transform = iaa.Sometimes( - self.prob, - iaa.Affine( - translate_percent=self.translate_percent, - rotate=self.rotate, - scale=self.scale)) - - if 'mask' in sample.keys(): - img, line_strings, seg = transform( - image=sample['image'], - line_strings=sample['lanes'], - segmentation_maps=sample['mask']) - sample['image'] = img - sample['lanes'] = line_strings - sample['mask'] = seg - else: - img, line_strings = transform( - image=sample['image'], line_strings=sample['lanes']) - sample['image'] = img - sample['lanes'] = line_strings - - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/transform/gridmask_utils.py b/pdfdet/models/Paddle/ppdet/data/transform/gridmask_utils.py deleted file mode 100644 index c187015..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/gridmask_utils.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
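The CULane operators above all share one pattern: wrap a single imgaug augmenter in iaa.Sometimes(prob, ...) and apply it jointly to the image, the lane LineStrings, and the optional segmentation mask, so all three stay geometrically aligned. A minimal self-contained sketch of that pattern (the image size and lane coordinates are illustrative assumptions, not values from this repo):

import numpy as np
import imgaug.augmenters as iaa
from imgaug.augmentables.lines import LineString, LineStringsOnImage

# A dummy 320x800 image with one lane annotated as a line string.
image = np.zeros((320, 800, 3), dtype=np.uint8)
lanes = LineStringsOnImage(
    [LineString([(100, 310), (400, 120)])], shape=image.shape)

# Equivalent of CULaneResize(img_h=160, img_w=400, prob=1.0): image and line
# strings are resized together, so the lane coordinates are rescaled as well.
resize = iaa.Sometimes(1.0, iaa.Resize({"height": 160, "width": 400}))
image_aug, lanes_aug = resize(image=image, line_strings=lanes)
assert image_aug.shape[:2] == (160, 400)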
- -# The code is based on: -# https://github.com/dvlab-research/GridMask/blob/master/detection_grid/maskrcnn_benchmark/data/transforms/grid.py - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import numpy as np -from PIL import Image - - -class Gridmask(object): - def __init__(self, - use_h=True, - use_w=True, - rotate=1, - offset=False, - ratio=0.5, - mode=1, - prob=0.7, - upper_iter=360000): - super(Gridmask, self).__init__() - self.use_h = use_h - self.use_w = use_w - self.rotate = rotate - self.offset = offset - self.ratio = ratio - self.mode = mode - self.prob = prob - self.st_prob = prob - self.upper_iter = upper_iter - - def __call__(self, x, curr_iter): - self.prob = self.st_prob * min(1, 1.0 * curr_iter / self.upper_iter) - if np.random.rand() > self.prob: - return x - h, w, _ = x.shape - hh = int(1.5 * h) - ww = int(1.5 * w) - d = np.random.randint(2, h) - self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) - mask = np.ones((hh, ww), np.float32) - st_h = np.random.randint(d) - st_w = np.random.randint(d) - if self.use_h: - for i in range(hh // d): - s = d * i + st_h - t = min(s + self.l, hh) - mask[s:t, :] *= 0 - if self.use_w: - for i in range(ww // d): - s = d * i + st_w - t = min(s + self.l, ww) - mask[:, s:t] *= 0 - - r = np.random.randint(self.rotate) - mask = Image.fromarray(np.uint8(mask)) - mask = mask.rotate(r) - mask = np.asarray(mask) - mask = mask[(hh - h) // 2:(hh - h) // 2 + h, (ww - w) // 2:(ww - w) // 2 - + w].astype(np.float32) - - if self.mode == 1: - mask = 1 - mask - mask = np.expand_dims(mask, axis=-1) - if self.offset: - offset = (2 * (np.random.rand(h, w) - 0.5)).astype(np.float32) - x = (x * mask + offset * (1 - mask)).astype(x.dtype) - else: - x = (x * mask).astype(x.dtype) - - return x diff --git a/pdfdet/models/Paddle/ppdet/data/transform/keypoint_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/keypoint_operators.py deleted file mode 100644 index d29aa23..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/keypoint_operators.py +++ /dev/null @@ -1,1742 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
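A minimal usage sketch for the Gridmask class above (the image size and iteration numbers are illustrative assumptions): the effective probability ramps linearly from 0 up to prob over upper_iter iterations, so the grid pattern is dropped onto images more often as training progresses.

import numpy as np

# Assumes the Gridmask class defined above is in scope.
gridmask = Gridmask(ratio=0.5, mode=1, prob=0.7, upper_iter=360000)
image = (np.random.rand(640, 640, 3) * 255).astype(np.uint8)  # HWC input

# Halfway through training the gate probability is 0.7 * 0.5 = 0.35; when it
# fires, a periodic grid pattern of the image is zeroed out.
masked = gridmask(image, curr_iter=180000)
assert masked.shape == image.shape and masked.dtype == image.dtype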
-
-# function:
-#    operators to process sample,
-#    eg: decode/resize/crop image
-
-from __future__ import absolute_import
-
-try:
-    from collections.abc import Sequence
-except Exception:
-    from collections import Sequence
-
-import cv2
-import numpy as np
-import math
-import copy
-
-from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix
-from ppdet.core.workspace import serializable
-from ppdet.utils.logger import setup_logger
-logger = setup_logger(__name__)
-
-registered_ops = []
-
-__all__ = [
-    'RandomAffine', 'KeyPointFlip', 'TagGenerate', 'ToHeatmaps',
-    'NormalizePermute', 'EvalAffine', 'RandomFlipHalfBodyTransform',
-    'TopDownRandomFlip', 'TopDownRandomShiftBboxCenter', 'TopDownGetRandomScaleRotation',
-    'TopDownAffine', 'ToHeatmapsTopDown', 'ToHeatmapsTopDown_DARK',
-    'ToHeatmapsTopDown_UDP', 'TopDownEvalAffine',
-    'AugmentationbyInformantionDropping', 'SinglePoseAffine', 'NoiseJitter',
-    'FlipPose', 'PETR_Resize'
-]
-
-
-def register_keypointop(cls):
-    return serializable(cls)
-
-
-@register_keypointop
-class KeyPointFlip(object):
-    """Flip the image horizontally with probability flip_prob, and flip the
-    coords as well. The left and right coords must be exchanged on flip, since
-    a right keypoint becomes a left keypoint after the image is flipped.
-
-    Args:
-        flip_permutation (list[17]): the left-right exchange order list corresponding to [0,1,2,...,16]
-        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
-        flip_prob (float): the probability of flipping the image
-        records(dict): the dict containing the image, mask and coords
-
-    Returns:
-        records(dict): the dict containing the image, mask and coords after the transform
-
-    """
-
-    def __init__(self, flip_permutation, hmsize=None, flip_prob=0.5):
-        super(KeyPointFlip, self).__init__()
-        assert isinstance(flip_permutation, Sequence)
-        self.flip_permutation = flip_permutation
-        self.flip_prob = flip_prob
-        self.hmsize = hmsize
-
-    def _flipjoints(self, records, sizelst):
-        '''
-        records['gt_joints'] is Sequence in higherhrnet
-        '''
-        if not ('gt_joints' in records and len(records['gt_joints']) > 0):
-            return records
-
-        kpts_lst = records['gt_joints']
-        if isinstance(kpts_lst, Sequence):
-            for idx, hmsize in enumerate(sizelst):
-                if kpts_lst[idx].ndim == 3:
-                    kpts_lst[idx] = kpts_lst[idx][:, self.flip_permutation]
-                else:
-                    kpts_lst[idx] = kpts_lst[idx][self.flip_permutation]
-                kpts_lst[idx][..., 0] = hmsize - kpts_lst[idx][..., 0]
-        else:
-            hmsize = sizelst[0]
-            if kpts_lst.ndim == 3:
-                kpts_lst = kpts_lst[:, self.flip_permutation]
-            else:
-                kpts_lst = kpts_lst[self.flip_permutation]
-            kpts_lst[..., 0] = hmsize - kpts_lst[..., 0]
-
-        records['gt_joints'] = kpts_lst
-        return records
-
-    def _flipmask(self, records, sizelst):
-        if not 'mask' in records:
-            return records
-
-        mask_lst = records['mask']
-        for idx, hmsize in enumerate(sizelst):
-            if len(mask_lst) > idx:
-                mask_lst[idx] = mask_lst[idx][:, ::-1]
-        records['mask'] = mask_lst
-        return records
-
-    def _flipbbox(self, records, sizelst):
-        if not 'gt_bbox' in records:
-            return records
-
-        bboxes = records['gt_bbox']
-        hmsize = sizelst[0]
-        bboxes[:, 0::2] = hmsize - bboxes[:, 0::2][:, ::-1]
-        bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, hmsize)
-        records['gt_bbox'] = bboxes
-        return records
-
-    def __call__(self, records):
-        flip = np.random.random() < self.flip_prob
-        if flip:
-            image = records['image']
-            image = image[:, ::-1]
-            records['image'] = image
-            if self.hmsize is None:
-                sizelst = 
[image.shape[1]] - else: - sizelst = self.hmsize - self._flipjoints(records, sizelst) - self._flipmask(records, sizelst) - self._flipbbox(records, sizelst) - - return records - - -@register_keypointop -class RandomAffine(object): - """apply affine transform to image, mask and coords - to achieve the rotate, scale and shift effect for training image - - Args: - max_degree (float): the max abslute rotate degree to apply, transform range is [-max_degree, max_degree] - max_scale (list[2]): the scale range to apply, transform range is [min, max] - max_shift (float): the max abslute shift ratio to apply, transform range is [-max_shift*imagesize, max_shift*imagesize] - hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet - trainsize (list[2]): the standard length used to train, the 'scale_type' of [h,w] will be resize to trainsize for standard - scale_type (str): the length of [h,w] to used for trainsize, chosed between 'short' and 'long' - records(dict): the dict contained the image, mask and coords - - Returns: - records(dict): contain the image, mask and coords after tranformed - - """ - - def __init__(self, - max_degree=30, - scale=[0.75, 1.5], - max_shift=0.2, - hmsize=None, - trainsize=[512, 512], - scale_type='short', - boldervalue=[114, 114, 114]): - super(RandomAffine, self).__init__() - self.max_degree = max_degree - self.min_scale = scale[0] - self.max_scale = scale[1] - self.max_shift = max_shift - self.hmsize = hmsize - self.trainsize = trainsize - self.scale_type = scale_type - self.boldervalue = boldervalue - - def _get_affine_matrix_old(self, center, scale, res, rot=0): - """Generate transformation matrix.""" - h = scale - t = np.zeros((3, 3), dtype=np.float32) - t[0, 0] = float(res[1]) / h - t[1, 1] = float(res[0]) / h - t[0, 2] = res[1] * (-float(center[0]) / h + .5) - t[1, 2] = res[0] * (-float(center[1]) / h + .5) - t[2, 2] = 1 - if rot != 0: - rot = -rot # To match direction of rotation from cropping - rot_mat = np.zeros((3, 3), dtype=np.float32) - rot_rad = rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - rot_mat[0, :2] = [cs, -sn] - rot_mat[1, :2] = [sn, cs] - rot_mat[2, 2] = 1 - # Need to rotate around center - t_mat = np.eye(3) - t_mat[0, 2] = -res[1] / 2 - t_mat[1, 2] = -res[0] / 2 - t_inv = t_mat.copy() - t_inv[:2, 2] *= -1 - t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t))) - return t - - def _get_affine_matrix(self, center, scale, res, rot=0): - """Generate transformation matrix.""" - w, h = scale - t = np.zeros((3, 3), dtype=np.float32) - t[0, 0] = float(res[0]) / w - t[1, 1] = float(res[1]) / h - t[0, 2] = res[0] * (-float(center[0]) / w + .5) - t[1, 2] = res[1] * (-float(center[1]) / h + .5) - t[2, 2] = 1 - if rot != 0: - rot = -rot # To match direction of rotation from cropping - rot_mat = np.zeros((3, 3), dtype=np.float32) - rot_rad = rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - rot_mat[0, :2] = [cs, -sn] - rot_mat[1, :2] = [sn, cs] - rot_mat[2, 2] = 1 - # Need to rotate around center - t_mat = np.eye(3) - t_mat[0, 2] = -res[0] / 2 - t_mat[1, 2] = -res[1] / 2 - t_inv = t_mat.copy() - t_inv[:2, 2] *= -1 - t = np.dot(t_inv, np.dot(rot_mat, np.dot(t_mat, t))) - return t - - def _affine_joints_mask(self, - degree, - center, - roi_size, - dsize, - keypoints=None, - heatmap_mask=None, - gt_bbox=None): - kpts = None - mask = None - bbox = None - mask_affine_mat = self._get_affine_matrix(center, roi_size, dsize, - degree)[:2] - if heatmap_mask is not None: - mask = cv2.warpAffine(heatmap_mask, 
mask_affine_mat, dsize) - mask = ((mask / 255) > 0.5).astype(np.float32) - if keypoints is not None: - kpts = copy.deepcopy(keypoints) - kpts[..., 0:2] = warp_affine_joints(kpts[..., 0:2].copy(), - mask_affine_mat) - kpts[(kpts[..., 0]) > dsize[0], :] = 0 - kpts[(kpts[..., 1]) > dsize[1], :] = 0 - kpts[(kpts[..., 0]) < 0, :] = 0 - kpts[(kpts[..., 1]) < 0, :] = 0 - if gt_bbox is not None: - temp_bbox = gt_bbox[:, [0, 3, 2, 1]] - cat_bbox = np.concatenate((gt_bbox, temp_bbox), axis=-1) - gt_bbox_warped = warp_affine_joints(cat_bbox, mask_affine_mat) - bbox = np.zeros_like(gt_bbox) - bbox[:, 0] = gt_bbox_warped[:, 0::2].min(1).clip(0, dsize[0]) - bbox[:, 2] = gt_bbox_warped[:, 0::2].max(1).clip(0, dsize[0]) - bbox[:, 1] = gt_bbox_warped[:, 1::2].min(1).clip(0, dsize[1]) - bbox[:, 3] = gt_bbox_warped[:, 1::2].max(1).clip(0, dsize[1]) - return kpts, mask, bbox - - def __call__(self, records): - image = records['image'] - shape = np.array(image.shape[:2][::-1]) - keypoints = None - heatmap_mask = None - gt_bbox = None - if 'gt_joints' in records: - keypoints = records['gt_joints'] - - if 'mask' in records: - heatmap_mask = records['mask'] - heatmap_mask *= 255 - - if 'gt_bbox' in records: - gt_bbox = records['gt_bbox'] - - degree = (np.random.random() * 2 - 1) * self.max_degree - center = center = np.array((np.array(shape) / 2)) - - aug_scale = np.random.random() * (self.max_scale - self.min_scale - ) + self.min_scale - if self.scale_type == 'long': - scale = np.array([max(shape[0], shape[1]) / 1.0] * 2) - elif self.scale_type == 'short': - scale = np.array([min(shape[0], shape[1]) / 1.0] * 2) - elif self.scale_type == 'wh': - scale = shape - else: - raise ValueError('Unknown scale type: {}'.format(self.scale_type)) - roi_size = aug_scale * scale - dx = int(0) - dy = int(0) - if self.max_shift > 0: - - dx = np.random.randint(-self.max_shift * roi_size[0], - self.max_shift * roi_size[0]) - dy = np.random.randint(-self.max_shift * roi_size[0], - self.max_shift * roi_size[1]) - - center += np.array([dx, dy]) - input_size = 2 * center - if self.trainsize != -1: - dsize = self.trainsize - imgshape = (dsize) - else: - dsize = scale - imgshape = (shape.tolist()) - - image_affine_mat = self._get_affine_matrix(center, roi_size, dsize, - degree)[:2] - image = cv2.warpAffine( - image, - image_affine_mat, - imgshape, - flags=cv2.INTER_LINEAR, - borderValue=self.boldervalue) - - if self.hmsize is None: - kpts, mask, gt_bbox = self._affine_joints_mask( - degree, center, roi_size, dsize, keypoints, heatmap_mask, - gt_bbox) - records['image'] = image - if kpts is not None: records['gt_joints'] = kpts - if mask is not None: records['mask'] = mask - if gt_bbox is not None: records['gt_bbox'] = gt_bbox - return records - - kpts_lst = [] - mask_lst = [] - for hmsize in self.hmsize: - kpts, mask, gt_bbox = self._affine_joints_mask( - degree, center, roi_size, [hmsize, hmsize], keypoints, - heatmap_mask, gt_bbox) - kpts_lst.append(kpts) - mask_lst.append(mask) - records['image'] = image - - if 'gt_joints' in records: - records['gt_joints'] = kpts_lst - if 'mask' in records: - records['mask'] = mask_lst - if 'gt_bbox' in records: - records['gt_bbox'] = gt_bbox - return records - - -@register_keypointop -class EvalAffine(object): - """apply affine transform to image - resize the short of [h,w] to standard size for eval - - Args: - size (int): the standard length used to train, the 'short' of [h,w] will be resize to trainsize for standard - records(dict): the dict contained the image, mask and coords - - Returns: - 
records(dict): the dict containing the image, mask and coords after the transform
-
-    """
-
-    def __init__(self, size, stride=64):
-        super(EvalAffine, self).__init__()
-        self.size = size
-        self.stride = stride
-
-    def __call__(self, records):
-        image = records['image']
-        mask = records['mask'] if 'mask' in records else None
-        s = self.size
-        h, w, _ = image.shape
-        trans, size_resized = get_affine_mat_kernel(h, w, s, inv=False)
-        image_resized = cv2.warpAffine(image, trans, size_resized)
-        if mask is not None:
-            mask = cv2.warpAffine(mask, trans, size_resized)
-            records['mask'] = mask
-        if 'gt_joints' in records:
-            del records['gt_joints']
-        records['image'] = image_resized
-        records['scale_factor'] = self.size / min(h, w)
-        return records
-
-
-@register_keypointop
-class NormalizePermute(object):
-    def __init__(self,
-                 mean=[123.675, 116.28, 103.53],
-                 std=[58.395, 57.120, 57.375],
-                 is_scale=True):
-        super(NormalizePermute, self).__init__()
-        self.mean = mean
-        self.std = std
-        self.is_scale = is_scale
-
-    def __call__(self, records):
-        image = records['image']
-        image = image.astype(np.float32)
-        if self.is_scale:
-            image /= 255.
-        image = image.transpose((2, 0, 1))
-        mean = np.array(self.mean, dtype=np.float32)
-        std = np.array(self.std, dtype=np.float32)
-        invstd = 1. / std
-        for v, m, s in zip(image, mean, invstd):
-            v.__isub__(m).__imul__(s)
-        records['image'] = image
-        return records
-
-
-@register_keypointop
-class TagGenerate(object):
-    """record gt coords for aeloss to sample coords value in tagmaps
-
-    Args:
-        num_joints (int): the keypoint numbers of dataset to train
-        max_people (int): maximum number of people supported when sampling the AE loss
-        records(dict): the dict containing the image, mask and coords
-
-    Returns:
-        records(dict): contain the gt coords used in tagmap
-
-    """
-
-    def __init__(self, num_joints, max_people=30):
-        super(TagGenerate, self).__init__()
-        self.max_people = max_people
-        self.num_joints = num_joints
-
-    def __call__(self, records):
-        kpts_lst = records['gt_joints']
-        kpts = kpts_lst[0]
-        tagmap = np.zeros((self.max_people, self.num_joints, 4), dtype=np.int64)
-        inds = np.where(kpts[..., 2] > 0)
-        p, j = inds[0], inds[1]
-        visible = kpts[inds]
-        # tagmap is [p, j, 4], where the last dim stores (j, y, x, valid flag)
-        tagmap[p, j, 0] = j
-        tagmap[p, j, 1] = visible[..., 1]  # y
-        tagmap[p, j, 2] = visible[..., 0]  # x
-        tagmap[p, j, 3] = 1
-        records['tagmap'] = tagmap
-        del records['gt_joints']
-        return records
-
-
-@register_keypointop
-class ToHeatmaps(object):
-    """generate the Gaussian heatmaps of keypoints for the heatmap loss
-
-    Args:
-        num_joints (int): the keypoint numbers of dataset to train
-        hmsize (list[2]): output heatmap's shape list of different scale outputs of higherhrnet
-        sigma (float): the std of the generated Gaussian kernel
-        records(dict): the dict containing the image, mask and coords
-
-    Returns:
-        records(dict): contain the heatmaps used by the heatmap loss
-
-    """
-
-    def __init__(self, num_joints, hmsize, sigma=None):
-        super(ToHeatmaps, self).__init__()
-        self.num_joints = num_joints
-        self.hmsize = np.array(hmsize)
-        if sigma is None:
-            sigma = hmsize[0] // 64
-        self.sigma = sigma
-
-        r = 6 * sigma + 3
-        x = np.arange(0, r, 1, np.float32)
-        y = x[:, None]
-        x0, y0 = 3 * sigma + 1, 3 * sigma + 1
-        self.gaussian = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2))
-
-    def __call__(self, records):
-        kpts_lst = records['gt_joints']
-        mask_lst = records['mask']
-        for idx, hmsize in enumerate(self.hmsize):
-            mask = mask_lst[idx]
-            kpts = kpts_lst[idx]
-            heatmaps = np.zeros((self.num_joints, 
hmsize, hmsize)) - inds = np.where(kpts[..., 2] > 0) - visible = kpts[inds].astype(np.int64)[..., :2] - ul = np.round(visible - 3 * self.sigma - 1) - br = np.round(visible + 3 * self.sigma + 2) - sul = np.maximum(0, -ul) - sbr = np.minimum(hmsize, br) - ul - dul = np.clip(ul, 0, hmsize - 1) - dbr = np.clip(br, 0, hmsize) - for i in range(len(visible)): - if visible[i][0] < 0 or visible[i][1] < 0 or visible[i][ - 0] >= hmsize or visible[i][1] >= hmsize: - continue - dx1, dy1 = dul[i] - dx2, dy2 = dbr[i] - sx1, sy1 = sul[i] - sx2, sy2 = sbr[i] - heatmaps[inds[1][i], dy1:dy2, dx1:dx2] = np.maximum( - self.gaussian[sy1:sy2, sx1:sx2], - heatmaps[inds[1][i], dy1:dy2, dx1:dx2]) - records['heatmap_gt{}x'.format(idx + 1)] = heatmaps - records['mask_{}x'.format(idx + 1)] = mask - del records['mask'] - return records - - -@register_keypointop -class RandomFlipHalfBodyTransform(object): - """apply data augment to image and coords - to achieve the flip, scale, rotate and half body transform effect for training image - - Args: - trainsize (list):[w, h], Image target size - upper_body_ids (list): The upper body joint ids - flip_pairs (list): The left-right joints exchange order list - pixel_std (int): The pixel std of the scale - scale (float): The scale factor to transform the image - rot (int): The rotate factor to transform the image - num_joints_half_body (int): The joints threshold of the half body transform - prob_half_body (float): The threshold of the half body transform - flip (bool): Whether to flip the image - - Returns: - records(dict): contain the image and coords after tranformed - - """ - - def __init__(self, - trainsize, - upper_body_ids, - flip_pairs, - pixel_std, - scale=0.35, - rot=40, - num_joints_half_body=8, - prob_half_body=0.3, - flip=True, - rot_prob=0.6): - super(RandomFlipHalfBodyTransform, self).__init__() - self.trainsize = trainsize - self.upper_body_ids = upper_body_ids - self.flip_pairs = flip_pairs - self.pixel_std = pixel_std - self.scale = scale - self.rot = rot - self.num_joints_half_body = num_joints_half_body - self.prob_half_body = prob_half_body - self.flip = flip - self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] - self.rot_prob = rot_prob - - def halfbody_transform(self, joints, joints_vis): - upper_joints = [] - lower_joints = [] - for joint_id in range(joints.shape[0]): - if joints_vis[joint_id][0] > 0: - if joint_id in self.upper_body_ids: - upper_joints.append(joints[joint_id]) - else: - lower_joints.append(joints[joint_id]) - if np.random.randn() < 0.5 and len(upper_joints) > 2: - selected_joints = upper_joints - else: - selected_joints = lower_joints if len( - lower_joints) > 2 else upper_joints - if len(selected_joints) < 2: - return None, None - selected_joints = np.array(selected_joints, dtype=np.float32) - center = selected_joints.mean(axis=0)[:2] - left_top = np.amin(selected_joints, axis=0) - right_bottom = np.amax(selected_joints, axis=0) - w = right_bottom[0] - left_top[0] - h = right_bottom[1] - left_top[1] - if w > self.aspect_ratio * h: - h = w * 1.0 / self.aspect_ratio - elif w < self.aspect_ratio * h: - w = h * self.aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - scale = scale * 1.5 - - return center, scale - - def flip_joints(self, joints, joints_vis, width, matched_parts): - joints[:, 0] = width - joints[:, 0] - 1 - for pair in matched_parts: - joints[pair[0], :], joints[pair[1], :] = \ - joints[pair[1], :], joints[pair[0], :].copy() - joints_vis[pair[0], :], 
joints_vis[pair[1], :] = \ - joints_vis[pair[1], :], joints_vis[pair[0], :].copy() - - return joints * joints_vis, joints_vis - - def __call__(self, records): - image = records['image'] - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - c = records['center'] - s = records['scale'] - r = 0 - if (np.sum(joints_vis[:, 0]) > self.num_joints_half_body and - np.random.rand() < self.prob_half_body): - c_half_body, s_half_body = self.halfbody_transform(joints, - joints_vis) - if c_half_body is not None and s_half_body is not None: - c, s = c_half_body, s_half_body - sf = self.scale - rf = self.rot - s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - r = np.clip(np.random.randn() * rf, -rf * 2, - rf * 2) if np.random.random() <= self.rot_prob else 0 - - if self.flip and np.random.random() <= 0.5: - image = image[:, ::-1, :] - joints, joints_vis = self.flip_joints( - joints, joints_vis, image.shape[1], self.flip_pairs) - c[0] = image.shape[1] - c[0] - 1 - records['image'] = image - records['gt_joints'] = joints - records['joints_vis'] = joints_vis - records['center'] = c - records['scale'] = s - records['rotate'] = r - - return records - - -@register_keypointop -class AugmentationbyInformantionDropping(object): - """AID: Augmentation by Informantion Dropping. Please refer - to https://arxiv.org/abs/2008.07139 - - Args: - prob_cutout (float): The probability of the Cutout augmentation. - offset_factor (float): Offset factor of cutout center. - num_patch (int): Number of patches to be cutout. - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, - trainsize, - prob_cutout=0.0, - offset_factor=0.2, - num_patch=1): - self.prob_cutout = prob_cutout - self.offset_factor = offset_factor - self.num_patch = num_patch - self.trainsize = trainsize - - def _cutout(self, img, joints, joints_vis): - height, width, _ = img.shape - img = img.reshape((height * width, -1)) - feat_x_int = np.arange(0, width) - feat_y_int = np.arange(0, height) - feat_x_int, feat_y_int = np.meshgrid(feat_x_int, feat_y_int) - feat_x_int = feat_x_int.reshape((-1, )) - feat_y_int = feat_y_int.reshape((-1, )) - for _ in range(self.num_patch): - vis_idx, _ = np.where(joints_vis > 0) - occlusion_joint_id = np.random.choice(vis_idx) - center = joints[occlusion_joint_id, 0:2] - offset = np.random.randn(2) * self.trainsize[0] * self.offset_factor - center = center + offset - radius = np.random.uniform(0.1, 0.2) * self.trainsize[0] - x_offset = (center[0] - feat_x_int) / radius - y_offset = (center[1] - feat_y_int) / radius - dis = x_offset**2 + y_offset**2 - keep_pos = np.where((dis <= 1) & (dis >= 0))[0] - img[keep_pos, :] = 0 - img = img.reshape((height, width, -1)) - return img - - def __call__(self, records): - img = records['image'] - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - if np.random.rand() < self.prob_cutout: - img = self._cutout(img, joints, joints_vis) - records['image'] = img - return records - - -@register_keypointop -class TopDownRandomFlip(object): - """Data augmentation with random image flip. - - Args: - flip_perm: (list[tuple]): Pairs of keypoints which are mirrored - (for example, left ear and right ear). - flip_prob (float): Probability of flip. 
- """ - - def __init__(self, flip_perm=[], flip_prob=0.5): - self.flip_perm = flip_perm - self.flip_prob = flip_prob - - def flip_joints(self, joints_3d, joints_3d_visible, img_width, flip_pairs): - assert len(joints_3d) == len(joints_3d_visible) - assert img_width > 0 - - joints_3d_flipped = joints_3d.copy() - joints_3d_visible_flipped = joints_3d_visible.copy() - - # Swap left-right parts - for left, right in flip_pairs: - joints_3d_flipped[left, :] = joints_3d[right, :] - joints_3d_flipped[right, :] = joints_3d[left, :] - - joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :] - joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :] - - # Flip horizontally - joints_3d_flipped[:, 0] = img_width - 1 - joints_3d_flipped[:, 0] - joints_3d_flipped = joints_3d_flipped * (joints_3d_visible_flipped > 0) - - return joints_3d_flipped, joints_3d_visible_flipped - - def __call__(self, results): - """Perform data augmentation with random image flip.""" - if np.random.rand() <= self.flip_prob: - return results - - img = results['image'] - joints_3d = results['gt_joints'] - joints_3d_visible = results['joints_vis'] - center = results['center'] - - # A flag indicating whether the image is flipped, - # which can be used by child class. - if not isinstance(img, list): - img = img[:, ::-1, :] - else: - img = [i[:, ::-1, :] for i in img] - if not isinstance(img, list): - joints_3d, joints_3d_visible = self.flip_joints( - joints_3d, joints_3d_visible, img.shape[1], - self.flip_perm) - center[0] = img.shape[1] - center[0] - 1 - else: - joints_3d, joints_3d_visible = self.flip_joints( - joints_3d, joints_3d_visible, img[0].shape[1], - self.flip_perm) - center[0] = img[0].shape[1] - center[0] - 1 - - results['image'] = img - results['gt_joints'] = joints_3d - results['joints_vis'] = joints_3d_visible - results['center'] = center - - return results - - -@register_keypointop -class TopDownRandomShiftBboxCenter(object): - """Random shift the bbox center. - - Args: - shift_factor (float): The factor to control the shift range, which is - scale*pixel_std*scale_factor. Default: 0.16 - shift_prob (float): Probability of applying random shift. Default: 0.3 - """ - - def __init__(self, shift_factor=0.16, shift_prob=0.3): - self.shift_factor = shift_factor - self.shift_prob = shift_prob - - def __call__(self, results): - center = results['center'] - scale = results['scale'] - if np.random.rand() < self.shift_prob: - center += np.random.uniform( - -1, 1, 2) * self.shift_factor * scale * 200.0 - - results['center'] = center - return results - -@register_keypointop -class TopDownGetRandomScaleRotation(object): - """Data augmentation with random scaling & rotating. - - Args: - rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``. - scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``. - rot_prob (float): Probability of random rotation. 
- """ - - def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): - self.rot_factor = rot_factor - self.scale_factor = scale_factor - self.rot_prob = rot_prob - - def __call__(self, results): - """Perform data augmentation with random scaling & rotating.""" - s = results['scale'] - - sf = self.scale_factor - rf = self.rot_factor - - s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - s = s * s_factor - - r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) - r = r_factor if np.random.rand() <= self.rot_prob else 0 - - results['scale'] = s - results['rotate'] = r - - return results - - -@register_keypointop -class TopDownAffine(object): - """apply affine transform to image and coords - - Args: - trainsize (list): [w, h], the standard size used to train - use_udp (bool): whether to use Unbiased Data Processing. - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, trainsize, use_udp=False): - self.trainsize = trainsize - self.use_udp = use_udp - - def __call__(self, records): - image = records['image'] - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - rot = records['rotate'] if "rotate" in records else 0 - if self.use_udp: - trans = get_warp_matrix( - rot, records['center'] * 2.0, - [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], - records['scale'] * 200.0) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), trans) - else: - trans = get_affine_transform(records['center'], records['scale'] * - 200, rot, self.trainsize) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - for i in range(joints.shape[0]): - if joints_vis[i, 0] > 0.0: - joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) - - records['image'] = image - records['gt_joints'] = joints - - return records - - -@register_keypointop -class SinglePoseAffine(object): - """apply affine transform to image and coords - - Args: - trainsize (list): [w, h], the standard size used to train - use_udp (bool): whether to use Unbiased Data Processing. - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, - trainsize, - rotate=[1.0, 30], - scale=[1.0, 0.25], - use_udp=False): - self.trainsize = trainsize - self.use_udp = use_udp - self.rot_prob = rotate[0] - self.rot_range = rotate[1] - self.scale_prob = scale[0] - self.scale_ratio = scale[1] - - def __call__(self, records): - image = records['image'] - if 'joints_2d' in records: - joints = records['joints_2d'] if 'joints_2d' in records else None - joints_vis = records[ - 'joints_vis'] if 'joints_vis' in records else np.ones( - (len(joints), 1)) - rot = 0 - s = 1. 
- if np.random.random() < self.rot_prob: - rot = np.clip(np.random.randn() * self.rot_range, - -self.rot_range * 2, self.rot_range * 2) - if np.random.random() < self.scale_prob: - s = np.clip(np.random.randn() * self.scale_ratio + 1, - 1 - self.scale_ratio, 1 + self.scale_ratio) - - if self.use_udp: - trans = get_warp_matrix( - rot, - np.array(records['bbox_center']) * 2.0, - [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], - records['bbox_scale'] * 200.0 * s) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - if 'joints_2d' in records: - joints[:, 0:2] = warp_affine_joints(joints[:, 0:2].copy(), - trans) - else: - trans = get_affine_transform( - np.array(records['bbox_center']), - records['bbox_scale'] * s * 200, rot, self.trainsize) - image = cv2.warpAffine( - image, - trans, (int(self.trainsize[0]), int(self.trainsize[1])), - flags=cv2.INTER_LINEAR) - if 'joints_2d' in records: - for i in range(len(joints)): - if joints_vis[i, 0] > 0.0: - joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) - - if 'joints_3d' in records: - pose3d = records['joints_3d'] - if not rot == 0: - trans_3djoints = np.eye(3) - rot_rad = -rot * np.pi / 180 - sn, cs = np.sin(rot_rad), np.cos(rot_rad) - trans_3djoints[0, :2] = [cs, -sn] - trans_3djoints[1, :2] = [sn, cs] - pose3d[:, :3] = np.einsum('ij,kj->ki', trans_3djoints, - pose3d[:, :3]) - records['joints_3d'] = pose3d - - records['image'] = image - if 'joints_2d' in records: - records['joints_2d'] = joints - - return records - - -@register_keypointop -class NoiseJitter(object): - """apply NoiseJitter to image - - Args: - noise_factor (float): the noise factor ratio used to generate the jitter - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, noise_factor=0.4): - self.noise_factor = noise_factor - - def __call__(self, records): - self.pn = np.random.uniform(1 - self.noise_factor, - 1 + self.noise_factor, 3) - rgb_img = records['image'] - rgb_img[:, :, 0] = np.minimum( - 255.0, np.maximum(0.0, rgb_img[:, :, 0] * self.pn[0])) - rgb_img[:, :, 1] = np.minimum( - 255.0, np.maximum(0.0, rgb_img[:, :, 1] * self.pn[1])) - rgb_img[:, :, 2] = np.minimum( - 255.0, np.maximum(0.0, rgb_img[:, :, 2] * self.pn[2])) - records['image'] = rgb_img - return records - - -@register_keypointop -class FlipPose(object): - """random apply flip to image - - Args: - noise_factor (float): the noise factor ratio used to generate the jitter - - Returns: - records (dict): contain the image and coords after tranformed - - """ - - def __init__(self, flip_prob=0.5, img_res=224, num_joints=14): - self.flip_pob = flip_prob - self.img_res = img_res - if num_joints == 24: - self.perm = [ - 5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13, 14, 15, 16, 17, - 18, 19, 21, 20, 23, 22 - ] - elif num_joints == 14: - self.perm = [5, 4, 3, 2, 1, 0, 11, 10, 9, 8, 7, 6, 12, 13] - else: - print("error num_joints in flip :{}".format(num_joints)) - - def __call__(self, records): - - if np.random.random() < self.flip_pob: - img = records['image'] - img = np.fliplr(img) - - if 'joints_2d' in records: - joints_2d = records['joints_2d'] - joints_2d = joints_2d[self.perm] - joints_2d[:, 0] = self.img_res - joints_2d[:, 0] - records['joints_2d'] = joints_2d - - if 'joints_3d' in records: - joints_3d = records['joints_3d'] - joints_3d = joints_3d[self.perm] - joints_3d[:, 0] = -joints_3d[:, 0] - records['joints_3d'] = joints_3d - - records['image'] = img - return records - - 
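For reference, the flip logic used by FlipPose and TopDownRandomFlip above reduces to a joint permutation plus an x mirror. A minimal numpy sketch with a hypothetical three-joint layout (the layout and names are ours, not from the deleted file; the width - 1 - x convention follows TopDownRandomFlip):

import numpy as np

# Hypothetical layout: 0 = left eye, 1 = right eye, 2 = nose.
perm = [1, 0, 2]               # left/right pairs swap, midline joints stay put
img_width = 100
joints = np.array([[30.0, 40.0], [70.0, 40.0], [50.0, 60.0]])

flipped = joints[perm].copy()                    # relabel so "left" stays left
flipped[:, 0] = img_width - flipped[:, 0] - 1    # mirror x across the image
print(flipped)  # [[29. 40.] [69. 40.] [49. 60.]]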
-@register_keypointop
-class TopDownEvalAffine(object):
-    """apply affine transform to the image and coords
-
-    Args:
-        trainsize (list): [w, h], the standard size used to train
-        use_udp (bool): whether to use Unbiased Data Processing.
-        records(dict): the dict containing the image and coords
-
-    Returns:
-        records (dict): contain the image and coords after being transformed
-
-    """
-
-    def __init__(self, trainsize, use_udp=False):
-        self.trainsize = trainsize
-        self.use_udp = use_udp
-
-    def __call__(self, records):
-        image = records['image']
-        rot = 0
-        imshape = records['im_shape'][::-1]
-        center = imshape / 2.
-        scale = imshape
-
-        if self.use_udp:
-            trans = get_warp_matrix(
-                rot, center * 2.0,
-                [self.trainsize[0] - 1.0, self.trainsize[1] - 1.0], scale)
-            image = cv2.warpAffine(
-                image,
-                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
-                flags=cv2.INTER_LINEAR)
-        else:
-            trans = get_affine_transform(center, scale, rot, self.trainsize)
-            image = cv2.warpAffine(
-                image,
-                trans, (int(self.trainsize[0]), int(self.trainsize[1])),
-                flags=cv2.INTER_LINEAR)
-        records['image'] = image
-
-        return records
-
-
-@register_keypointop
-class ToHeatmapsTopDown(object):
-    """generate the gaussian heatmaps of keypoints for the heatmap loss
-
-    Args:
-        hmsize (list): [w, h] output heatmap size
-        sigma (float): the std of the generated gaussian kernel
-        records(dict): the dict containing the image and coords
-
-    Returns:
-        records (dict): contain the heatmaps used by the heatmap loss
-
-    """
-
-    def __init__(self, hmsize, sigma):
-        super(ToHeatmapsTopDown, self).__init__()
-        self.hmsize = np.array(hmsize)
-        self.sigma = sigma
-
-    def __call__(self, records):
-        """refer to
-        https://github.com/leoxiaobin/deep-high-resolution-net.pytorch
-        Copyright (c) Microsoft, under the MIT License.
- """ - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - num_joints = joints.shape[0] - image_size = np.array( - [records['image'].shape[1], records['image'].shape[0]]) - target_weight = np.ones((num_joints, 1), dtype=np.float32) - target_weight[:, 0] = joints_vis[:, 0] - target = np.zeros( - (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) - tmp_size = self.sigma * 3 - feat_stride = image_size / self.hmsize - for joint_id in range(num_joints): - mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) - mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) - # Check that any part of the gaussian is in-bounds - ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] - br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] - if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ - 0] < 0 or br[1] < 0: - # If not, just return the image as is - target_weight[joint_id] = 0 - continue - # # Generate gaussian - size = 2 * tmp_size + 1 - x = np.arange(0, size, 1, np.float32) - y = x[:, np.newaxis] - x0 = y0 = size // 2 - # The gaussian is not normalized, we want the center value to equal 1 - g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) - - # Usable gaussian range - g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] - g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] - # Image range - img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) - img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) - - v = target_weight[joint_id] - if v > 0.5: - target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ - 0]:g_y[1], g_x[0]:g_x[1]] - records['target'] = target - records['target_weight'] = target_weight - del records['gt_joints'], records['joints_vis'] - - return records - - -@register_keypointop -class ToHeatmapsTopDown_DARK(object): - """to generate the gaussin heatmaps of keypoint for heatmap loss - - Args: - hmsize (list): [w, h] output heatmap's size - sigma (float): the std of gaussin kernel genereted - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the heatmaps used to heatmaploss - - """ - - def __init__(self, hmsize, sigma): - super(ToHeatmapsTopDown_DARK, self).__init__() - self.hmsize = np.array(hmsize) - self.sigma = sigma - - def __call__(self, records): - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - num_joints = joints.shape[0] - image_size = np.array( - [records['image'].shape[1], records['image'].shape[0]]) - target_weight = np.ones((num_joints, 1), dtype=np.float32) - target_weight[:, 0] = joints_vis[:, 0] - target = np.zeros( - (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) - tmp_size = self.sigma * 3 - feat_stride = image_size / self.hmsize - for joint_id in range(num_joints): - mu_x = joints[joint_id][0] / feat_stride[0] - mu_y = joints[joint_id][1] / feat_stride[1] - # Check that any part of the gaussian is in-bounds - ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] - br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] - if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ - 0] < 0 or br[1] < 0: - # If not, just return the image as is - target_weight[joint_id] = 0 - continue - - x = np.arange(0, self.hmsize[0], 1, np.float32) - y = np.arange(0, self.hmsize[1], 1, np.float32) - y = y[:, np.newaxis] - - v = target_weight[joint_id] - if v > 0.5: - target[joint_id] = np.exp(-( - (x - mu_x)**2 + (y - mu_y)**2) / (2 * self.sigma**2)) - records['target'] = target - records['target_weight'] = target_weight - del 
records['gt_joints'], records['joints_vis'] - - return records - - -@register_keypointop -class ToHeatmapsTopDown_UDP(object): - """This code is based on: - https://github.com/HuangJunJie2017/UDP-Pose/blob/master/deep-high-resolution-net.pytorch/lib/dataset/JointsDataset.py - - to generate the gaussian heatmaps of keypoint for heatmap loss. - ref: Huang et al. The Devil is in the Details: Delving into Unbiased Data Processing - for Human Pose Estimation (CVPR 2020). - - Args: - hmsize (list): [w, h] output heatmap's size - sigma (float): the std of gaussin kernel genereted - records(dict): the dict contained the image and coords - - Returns: - records (dict): contain the heatmaps used to heatmaploss - """ - - def __init__(self, hmsize, sigma): - super(ToHeatmapsTopDown_UDP, self).__init__() - self.hmsize = np.array(hmsize) - self.sigma = sigma - - def __call__(self, records): - joints = records['gt_joints'] - joints_vis = records['joints_vis'] - num_joints = joints.shape[0] - image_size = np.array( - [records['image'].shape[1], records['image'].shape[0]]) - target_weight = np.ones((num_joints, 1), dtype=np.float32) - target_weight[:, 0] = joints_vis[:, 0] - target = np.zeros( - (num_joints, self.hmsize[1], self.hmsize[0]), dtype=np.float32) - tmp_size = self.sigma * 3 - size = 2 * tmp_size + 1 - x = np.arange(0, size, 1, np.float32) - y = x[:, None] - feat_stride = (image_size - 1.0) / (self.hmsize - 1.0) - for joint_id in range(num_joints): - mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) - mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) - # Check that any part of the gaussian is in-bounds - ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] - br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] - if ul[0] >= self.hmsize[0] or ul[1] >= self.hmsize[1] or br[ - 0] < 0 or br[1] < 0: - # If not, just return the image as is - target_weight[joint_id] = 0 - continue - - mu_x_ac = joints[joint_id][0] / feat_stride[0] - mu_y_ac = joints[joint_id][1] / feat_stride[1] - x0 = y0 = size // 2 - x0 += mu_x_ac - mu_x - y0 += mu_y_ac - mu_y - g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * self.sigma**2)) - # Usable gaussian range - g_x = max(0, -ul[0]), min(br[0], self.hmsize[0]) - ul[0] - g_y = max(0, -ul[1]), min(br[1], self.hmsize[1]) - ul[1] - # Image range - img_x = max(0, ul[0]), min(br[0], self.hmsize[0]) - img_y = max(0, ul[1]), min(br[1], self.hmsize[1]) - - v = target_weight[joint_id] - if v > 0.5: - target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = g[g_y[ - 0]:g_y[1], g_x[0]:g_x[1]] - records['target'] = target - records['target_weight'] = target_weight - del records['gt_joints'], records['joints_vis'] - - return records - - -from typing import Optional, Tuple, Union, List -import numbers - - -def _scale_size( - size: Tuple[int, int], - scale: Union[float, int, tuple], ) -> Tuple[int, int]: - """Rescale a size by a ratio. - - Args: - size (tuple[int]): (w, h). - scale (float | tuple(float)): Scaling factor. - - Returns: - tuple[int]: scaled size. - """ - if isinstance(scale, (float, int)): - scale = (scale, scale) - w, h = size - return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) - - -def rescale_size(old_size: tuple, - scale: Union[float, int, tuple], - return_scale: bool=False) -> tuple: - """Calculate the new size to be rescaled to. - - Args: - old_size (tuple[int]): The old size (w, h) of image. - scale (float | tuple[int]): The scaling factor or maximum size. 
- If it is a float number, then the image will be rescaled by this - factor, else if it is a tuple of 2 integers, then the image will - be rescaled as large as possible within the scale. - return_scale (bool): Whether to return the scaling factor besides the - rescaled image size. - - Returns: - tuple[int]: The new rescaled image size. - """ - w, h = old_size - if isinstance(scale, (float, int)): - if scale <= 0: - raise ValueError(f'Invalid scale {scale}, must be positive.') - scale_factor = scale - elif isinstance(scale, list): - max_long_edge = max(scale) - max_short_edge = min(scale) - scale_factor = min(max_long_edge / max(h, w), - max_short_edge / min(h, w)) - else: - raise TypeError( - f'Scale must be a number or tuple of int, but got {type(scale)}') - - new_size = _scale_size((w, h), scale_factor) - - if return_scale: - return new_size, scale_factor - else: - return new_size - - -def imrescale(img: np.ndarray, - scale: Union[float, Tuple[int, int]], - return_scale: bool=False, - interpolation: str='bilinear', - backend: Optional[str]=None) -> Union[np.ndarray, Tuple[ - np.ndarray, float]]: - """Resize image while keeping the aspect ratio. - - Args: - img (ndarray): The input image. - scale (float | tuple[int]): The scaling factor or maximum size. - If it is a float number, then the image will be rescaled by this - factor, else if it is a tuple of 2 integers, then the image will - be rescaled as large as possible within the scale. - return_scale (bool): Whether to return the scaling factor besides the - rescaled image. - interpolation (str): Same as :func:`resize`. - backend (str | None): Same as :func:`resize`. - - Returns: - ndarray: The rescaled image. - """ - h, w = img.shape[:2] - new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) - rescaled_img = imresize( - img, new_size, interpolation=interpolation, backend=backend) - if return_scale: - return rescaled_img, scale_factor - else: - return rescaled_img - - -def imresize( - img: np.ndarray, - size: Tuple[int, int], - return_scale: bool=False, - interpolation: str='bilinear', - out: Optional[np.ndarray]=None, - backend: Optional[str]=None, - interp=cv2.INTER_LINEAR, ) -> Union[Tuple[np.ndarray, float, float], - np.ndarray]: - """Resize image to a given size. - - Args: - img (ndarray): The input image. - size (tuple[int]): Target size (w, h). - return_scale (bool): Whether to return `w_scale` and `h_scale`. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' - backend, "nearest", "bilinear" for 'pillow' backend. - out (ndarray): The output destination. - backend (str | None): The image resize backend type. Options are `cv2`, - `pillow`, `None`. If backend is None, the global imread_backend - specified by ``mmcv.use_backend()`` will be used. Default: None. - - Returns: - tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or - `resized_img`. - """ - h, w = img.shape[:2] - if backend is None: - backend = imread_backend - if backend not in ['cv2', 'pillow']: - raise ValueError(f'backend: {backend} is not supported for resize.' 
- f"Supported backends are 'cv2', 'pillow'") - - if backend == 'pillow': - assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' - pil_image = Image.fromarray(img) - pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) - resized_img = np.array(pil_image) - else: - resized_img = cv2.resize(img, size, dst=out, interpolation=interp) - if not return_scale: - return resized_img - else: - w_scale = size[0] / w - h_scale = size[1] / h - return resized_img, w_scale, h_scale - - -class PETR_Resize: - """Resize images & bbox & mask. - - This transform resizes the input image to some scale. Bboxes and masks are - then resized with the same scale factor. If the input dict contains the key - "scale", then the scale in the input dict is used, otherwise the specified - scale in the init method is used. If the input dict contains the key - "scale_factor" (if MultiScaleFlipAug does not give img_scale but - scale_factor), the actual scale will be computed by image shape and - scale_factor. - - `img_scale` can either be a tuple (single-scale) or a list of tuple - (multi-scale). There are 3 multiscale modes: - - - ``ratio_range is not None``: randomly sample a ratio from the ratio \ - range and multiply it with the image scale. - - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \ - sample a scale from the multiscale range. - - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \ - sample a scale from multiple scales. - - Args: - img_scale (tuple or list[tuple]): Images scales for resizing. - multiscale_mode (str): Either "range" or "value". - ratio_range (tuple[float]): (min_ratio, max_ratio) - keep_ratio (bool): Whether to keep the aspect ratio when resizing the - image. - bbox_clip_border (bool, optional): Whether to clip the objects outside - the border of the image. In some dataset like MOT17, the gt bboxes - are allowed to cross the border of images. Therefore, we don't - need to clip the gt bboxes in these cases. Defaults to True. - backend (str): Image resize backend, choices are 'cv2' and 'pillow'. - These two backends generates slightly different results. Defaults - to 'cv2'. - interpolation (str): Interpolation method, accepted values are - "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' - backend, "nearest", "bilinear" for 'pillow' backend. - override (bool, optional): Whether to override `scale` and - `scale_factor` so as to call resize twice. Default False. If True, - after the first resizing, the existed `scale` and `scale_factor` - will be ignored so the second resizing can be allowed. - This option is a work-around for multiple times of resize in DETR. - Defaults to False. 
- """ - - def __init__(self, - img_scale=None, - multiscale_mode='range', - ratio_range=None, - keep_ratio=True, - bbox_clip_border=True, - backend='cv2', - interpolation='bilinear', - override=False, - keypoint_clip_border=True): - if img_scale is None: - self.img_scale = None - else: - if isinstance(img_scale, list): - self.img_scale = img_scale - else: - self.img_scale = [img_scale] - assert isinstance(self.img_scale, list) - - if ratio_range is not None: - # mode 1: given a scale and a range of image ratio - assert len(self.img_scale) == 1 - else: - # mode 2: given multiple scales or a range of scales - assert multiscale_mode in ['value', 'range'] - - self.backend = backend - self.multiscale_mode = multiscale_mode - self.ratio_range = ratio_range - self.keep_ratio = keep_ratio - # TODO: refactor the override option in Resize - self.interpolation = interpolation - self.override = override - self.bbox_clip_border = bbox_clip_border - self.keypoint_clip_border = keypoint_clip_border - - @staticmethod - def random_select(img_scales): - """Randomly select an img_scale from given candidates. - - Args: - img_scales (list[tuple]): Images scales for selection. - - Returns: - (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, \ - where ``img_scale`` is the selected image scale and \ - ``scale_idx`` is the selected index in the given candidates. - """ - - assert isinstance(img_scales, list) - scale_idx = np.random.randint(len(img_scales)) - img_scale = img_scales[scale_idx] - return img_scale, scale_idx - - @staticmethod - def random_sample(img_scales): - """Randomly sample an img_scale when ``multiscale_mode=='range'``. - - Args: - img_scales (list[tuple]): Images scale range for sampling. - There must be two tuples in img_scales, which specify the lower - and upper bound of image scales. - - Returns: - (tuple, None): Returns a tuple ``(img_scale, None)``, where \ - ``img_scale`` is sampled scale and None is just a placeholder \ - to be consistent with :func:`random_select`. - """ - - assert isinstance(img_scales, list) and len(img_scales) == 2 - img_scale_long = [max(s) for s in img_scales] - img_scale_short = [min(s) for s in img_scales] - long_edge = np.random.randint( - min(img_scale_long), max(img_scale_long) + 1) - short_edge = np.random.randint( - min(img_scale_short), max(img_scale_short) + 1) - img_scale = (long_edge, short_edge) - return img_scale, None - - @staticmethod - def random_sample_ratio(img_scale, ratio_range): - """Randomly sample an img_scale when ``ratio_range`` is specified. - - A ratio will be randomly sampled from the range specified by - ``ratio_range``. Then it would be multiplied with ``img_scale`` to - generate sampled scale. - - Args: - img_scale (list): Images scale base to multiply with ratio. - ratio_range (tuple[float]): The minimum and maximum ratio to scale - the ``img_scale``. - - Returns: - (tuple, None): Returns a tuple ``(scale, None)``, where \ - ``scale`` is sampled ratio multiplied with ``img_scale`` and \ - None is just a placeholder to be consistent with \ - :func:`random_select`. - """ - - assert isinstance(img_scale, list) and len(img_scale) == 2 - min_ratio, max_ratio = ratio_range - assert min_ratio <= max_ratio - ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio - scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) - return scale, None - - def _random_scale(self, results): - """Randomly sample an img_scale according to ``ratio_range`` and - ``multiscale_mode``. 
- - If ``ratio_range`` is specified, a ratio will be sampled and be - multiplied with ``img_scale``. - If multiple scales are specified by ``img_scale``, a scale will be - sampled according to ``multiscale_mode``. - Otherwise, single scale will be used. - - Args: - results (dict): Result dict from :obj:`dataset`. - - Returns: - dict: Two new keys 'scale` and 'scale_idx` are added into \ - ``results``, which would be used by subsequent pipelines. - """ - - if self.ratio_range is not None: - scale, scale_idx = self.random_sample_ratio(self.img_scale[0], - self.ratio_range) - elif len(self.img_scale) == 1: - scale, scale_idx = self.img_scale[0], 0 - elif self.multiscale_mode == 'range': - scale, scale_idx = self.random_sample(self.img_scale) - elif self.multiscale_mode == 'value': - scale, scale_idx = self.random_select(self.img_scale) - else: - raise NotImplementedError - results['scale'] = scale - results['scale_idx'] = scale_idx - - def _resize_img(self, results): - """Resize images with ``results['scale']``.""" - for key in ['image'] if 'image' in results else []: - if self.keep_ratio: - img, scale_factor = imrescale( - results[key], - results['scale'], - return_scale=True, - interpolation=self.interpolation, - backend=self.backend) - # the w_scale and h_scale has minor difference - # a real fix should be done in the imrescale in the future - new_h, new_w = img.shape[:2] - h, w = results[key].shape[:2] - w_scale = new_w / w - h_scale = new_h / h - else: - img, w_scale, h_scale = imresize( - results[key], - results['scale'], - return_scale=True, - interpolation=self.interpolation, - backend=self.backend) - - scale_factor = np.array( - [w_scale, h_scale, w_scale, h_scale], dtype=np.float32) - results['im_shape'] = np.array(img.shape) - # in case that there is no padding - results['pad_shape'] = img.shape - results['scale_factor'] = scale_factor - results['keep_ratio'] = self.keep_ratio - # img_pad = self.impad(img, shape=results['scale']) - results[key] = img - - def _resize_bboxes(self, results): - """Resize bounding boxes with ``results['scale_factor']``.""" - for key in ['gt_bbox'] if 'gt_bbox' in results else []: - bboxes = results[key] * results['scale_factor'] - if self.bbox_clip_border: - img_shape = results['im_shape'] - bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) - bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) - results[key] = bboxes - - def _resize_masks(self, results): - """Resize masks with ``results['scale']``""" - for key in ['mask'] if 'mask' in results else []: - if results[key] is None: - continue - if self.keep_ratio: - results[key] = results[key].rescale(results['scale']) - else: - results[key] = results[key].resize(results['im_shape'][:2]) - - def _resize_seg(self, results): - """Resize semantic segmentation map with ``results['scale']``.""" - for key in ['seg'] if 'seg' in results else []: - if self.keep_ratio: - gt_seg = imrescale( - results[key], - results['scale'], - interpolation='nearest', - backend=self.backend) - else: - gt_seg = imresize( - results[key], - results['scale'], - interpolation='nearest', - backend=self.backend) - results[key] = gt_seg - - def _resize_keypoints(self, results): - """Resize keypoints with ``results['scale_factor']``.""" - for key in ['gt_joints'] if 'gt_joints' in results else []: - keypoints = results[key].copy() - keypoints[..., 0] = keypoints[..., 0] * results['scale_factor'][0] - keypoints[..., 1] = keypoints[..., 1] * results['scale_factor'][1] - if self.keypoint_clip_border: - img_shape = 
results['im_shape'] - keypoints[..., 0] = np.clip(keypoints[..., 0], 0, img_shape[1]) - keypoints[..., 1] = np.clip(keypoints[..., 1], 0, img_shape[0]) - results[key] = keypoints - - def _resize_areas(self, results): - """Resize mask areas with ``results['scale_factor']``.""" - for key in ['gt_areas'] if 'gt_areas' in results else []: - areas = results[key].copy() - areas = areas * results['scale_factor'][0] * results[ - 'scale_factor'][1] - results[key] = areas - - def __call__(self, results): - """Call function to resize images, bounding boxes, masks, semantic - segmentation map. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Resized results, 'im_shape', 'pad_shape', 'scale_factor', \ - 'keep_ratio' keys are added into result dict. - """ - if 'scale' not in results: - if 'scale_factor' in results: - img_shape = results['image'].shape[:2] - scale_factor = results['scale_factor'][0] - # assert isinstance(scale_factor, float) - results['scale'] = [int(x * scale_factor) - for x in img_shape][::-1] - else: - self._random_scale(results) - else: - if not self.override: - assert 'scale_factor' not in results, ( - 'scale and scale_factor cannot be both set.') - else: - results.pop('scale') - if 'scale_factor' in results: - results.pop('scale_factor') - self._random_scale(results) - - self._resize_img(results) - self._resize_bboxes(results) - self._resize_masks(results) - self._resize_seg(results) - self._resize_keypoints(results) - self._resize_areas(results) - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(img_scale={self.img_scale}, ' - repr_str += f'multiscale_mode={self.multiscale_mode}, ' - repr_str += f'ratio_range={self.ratio_range}, ' - repr_str += f'keep_ratio={self.keep_ratio}, ' - repr_str += f'bbox_clip_border={self.bbox_clip_border})' - repr_str += f'keypoint_clip_border={self.keypoint_clip_border})' - return repr_str diff --git a/pdfdet/models/Paddle/ppdet/data/transform/keypoints_3d_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/keypoints_3d_operators.py deleted file mode 100644 index 13337bc..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/keypoints_3d_operators.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
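As a quick illustration of the keep-ratio arithmetic implemented by the rescale_size helper deleted above: given a (long-edge, short-edge) bound, the factor is the largest one that keeps both edges within bounds. A standalone sketch (the function name is ours, not from the file):

# Keep-ratio factor for fitting a w x h image inside (max_long, max_short).
def keep_ratio_factor(w, h, max_long, max_short):
    return min(max_long / max(h, w), max_short / min(h, w))

f = keep_ratio_factor(1920, 1080, 1333, 800)
print(int(1920 * f + 0.5), int(1080 * f + 0.5))  # 1333 750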
- -from __future__ import absolute_import - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -import cv2 -import numpy as np -import math -import copy -import random -import uuid -from numbers import Number, Integral - -from ...modeling.keypoint_utils import get_affine_mat_kernel, warp_affine_joints, get_affine_transform, affine_transform, get_warp_matrix -from ppdet.core.workspace import serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -registered_ops = [] - -__all__ = [ - 'CropAndFlipImages', 'PermuteImages', 'RandomFlipHalfBody3DTransformImages' -] - -import matplotlib.pyplot as plt -from PIL import Image, ImageDraw -from mpl_toolkits.mplot3d import Axes3D - - -def register_keypointop(cls): - return serializable(cls) - - -def register_op(cls): - registered_ops.append(cls.__name__) - if not hasattr(BaseOperator, cls.__name__): - setattr(BaseOperator, cls.__name__, cls) - else: - raise KeyError("The {} class has been registered.".format(cls.__name__)) - return serializable(cls) - - -class BaseOperator(object): - def __init__(self, name=None): - if name is None: - name = self.__class__.__name__ - self._id = name + '_' + str(uuid.uuid4())[-6:] - - def apply(self, sample, context=None): - """ Process a sample. - Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - return sample - - def __call__(self, sample, context=None): - """ Process a sample. - Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - if isinstance(sample, Sequence): # for batch_size - for i in range(len(sample)): - sample[i] = self.apply(sample[i], context) - else: - # image.shape changed - sample = self.apply(sample, context) - return sample - - def __str__(self): - return str(self._id) - - -@register_keypointop -class CropAndFlipImages(object): - """Crop all images""" - - def __init__(self, crop_range, flip_pairs=None): - super(CropAndFlipImages, self).__init__() - self.crop_range = crop_range - self.flip_pairs = flip_pairs - - def __call__(self, records): # tuple - images = records["image"] - images = images[:, :, ::-1, :] - images = images[:, :, self.crop_range[0]:self.crop_range[1]] - records["image"] = images - - if "kps2d" in records.keys(): - kps2d = records["kps2d"] - - width, height = images.shape[2], images.shape[1] - kps2d = np.array(kps2d) - kps2d[:, :, 0] = kps2d[:, :, 0] - self.crop_range[0] - - for pair in self.flip_pairs: - kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ - kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() - - records["kps2d"] = kps2d - - return records - - -@register_op -class PermuteImages(BaseOperator): - def __init__(self): - """ - Change the channel to be (batch_size, C, H, W) #(6, 3, 1080, 1920) - """ - super(PermuteImages, self).__init__() - - def apply(self, sample, context=None): - images = sample["image"] - images = images.transpose((0, 3, 1, 2)) - - sample["image"] = images - - return sample - - -@register_keypointop -class RandomFlipHalfBody3DTransformImages(object): - """apply data augment to images and coords - to achieve the flip, scale, rotate and half body transform effect for training image - Args: - trainsize (list):[w, h], Image target size - upper_body_ids (list): The upper body joint ids - flip_pairs (list): The left-right joints 
exchange order list - pixel_std (int): The pixel std of the scale - scale (float): The scale factor to transform the image - rot (int): The rotate factor to transform the image - num_joints_half_body (int): The joints threshold of the half body transform - prob_half_body (float): The threshold of the half body transform - flip (bool): Whether to flip the image - Returns: - records(dict): contain the image and coords after tranformed - """ - - def __init__(self, - trainsize, - upper_body_ids, - flip_pairs, - pixel_std, - scale=0.35, - rot=40, - num_joints_half_body=8, - prob_half_body=0.3, - flip=True, - rot_prob=0.6, - do_occlusion=False): - super(RandomFlipHalfBody3DTransformImages, self).__init__() - self.trainsize = trainsize - self.upper_body_ids = upper_body_ids - self.flip_pairs = flip_pairs - self.pixel_std = pixel_std - self.scale = scale - self.rot = rot - self.num_joints_half_body = num_joints_half_body - self.prob_half_body = prob_half_body - self.flip = flip - self.aspect_ratio = trainsize[0] * 1.0 / trainsize[1] - self.rot_prob = rot_prob - self.do_occlusion = do_occlusion - - def halfbody_transform(self, joints, joints_vis): - upper_joints = [] - lower_joints = [] - for joint_id in range(joints.shape[0]): - if joints_vis[joint_id][0] > 0: - if joint_id in self.upper_body_ids: - upper_joints.append(joints[joint_id]) - else: - lower_joints.append(joints[joint_id]) - if np.random.randn() < 0.5 and len(upper_joints) > 2: - selected_joints = upper_joints - else: - selected_joints = lower_joints if len( - lower_joints) > 2 else upper_joints - if len(selected_joints) < 2: - return None, None - selected_joints = np.array(selected_joints, dtype=np.float32) - center = selected_joints.mean(axis=0)[:2] - left_top = np.amin(selected_joints, axis=0) - right_bottom = np.amax(selected_joints, axis=0) - w = right_bottom[0] - left_top[0] - h = right_bottom[1] - left_top[1] - if w > self.aspect_ratio * h: - h = w * 1.0 / self.aspect_ratio - elif w < self.aspect_ratio * h: - w = h * self.aspect_ratio - scale = np.array( - [w * 1.0 / self.pixel_std, h * 1.0 / self.pixel_std], - dtype=np.float32) - scale = scale * 1.5 - - return center, scale - - def flip_joints(self, joints, joints_vis, width, matched_parts, kps2d=None): - # joints: (6, 24, 3),(num_frames, num_joints, 3) - - joints[:, :, 0] = width - joints[:, :, 0] - 1 # x - if kps2d is not None: - kps2d[:, :, 0] = width - kps2d[:, :, 0] - 1 - - for pair in matched_parts: - joints[:, pair[0], :], joints[:,pair[1], :] = \ - joints[:,pair[1], :], joints[:,pair[0], :].copy() - - joints_vis[:,pair[0], :], joints_vis[:,pair[1], :] = \ - joints_vis[:,pair[1], :], joints_vis[:,pair[0], :].copy() - - if kps2d is not None: - kps2d[:, pair[0], :], kps2d[:,pair[1], :] = \ - kps2d[:,pair[1], :], kps2d[:,pair[0], :].copy() - - # move to zero - joints -= joints[:, [0], :] # (batch_size, 24, 3),numpy.ndarray - - return joints, joints_vis, kps2d - - def __call__(self, records): - images = records[ - 'image'] #kps3d, kps3d_vis, images. 
images.shape is (num_frames, height, width, 3)
-
-        joints = records['kps3d']
-        joints_vis = records['kps3d_vis']
-
-        kps2d = None
-        if 'kps2d' in records.keys():
-            kps2d = records['kps2d']
-
-        if self.flip and np.random.random() <= 0.5:
-            images = images[:, :, ::-1, :]  # flip the images horizontally, e.g. (6, 1080, 810, 3)
-            joints, joints_vis, kps2d = self.flip_joints(
-                joints, joints_vis, images.shape[2], self.flip_pairs,
-                kps2d)  # mirror the keypoints left-right
-        occlusion = False
-        if self.do_occlusion and random.random() <= 0.5:  # random occlusion
-            height = images[0].shape[0]
-            width = images[0].shape[1]
-            occlusion = True
-            while True:
-                area_min = 0.0
-                area_max = 0.2
-                synth_area = (random.random() *
-                              (area_max - area_min) + area_min) * width * height
-
-                ratio_min = 0.3
-                ratio_max = 1 / 0.3
-                synth_ratio = (random.random() *
-                               (ratio_max - ratio_min) + ratio_min)
-
-                synth_h = math.sqrt(synth_area * synth_ratio)
-                synth_w = math.sqrt(synth_area / synth_ratio)
-                synth_xmin = random.random() * (width - synth_w - 1)
-                synth_ymin = random.random() * (height - synth_h - 1)
-
-                if synth_xmin >= 0 and synth_ymin >= 0 and synth_xmin + synth_w < width and synth_ymin + synth_h < height:
-                    xmin = int(synth_xmin)
-                    ymin = int(synth_ymin)
-                    w = int(synth_w)
-                    h = int(synth_h)
-
-                    mask = np.random.rand(h, w, 3) * 255
-                    images[:, ymin:ymin + h, xmin:xmin + w, :] = mask[
-                        None, :, :, :]
-                    break
-
-        records['image'] = images
-        records['kps3d'] = joints
-        records['kps3d_vis'] = joints_vis
-        if kps2d is not None:
-            records['kps2d'] = kps2d
-
-        return records
diff --git a/pdfdet/models/Paddle/ppdet/data/transform/mot_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/mot_operators.py
deleted file mode 100644
index e533ea3..0000000
--- a/pdfdet/models/Paddle/ppdet/data/transform/mot_operators.py
+++ /dev/null
@@ -1,627 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
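The occlusion branch in RandomFlipHalfBody3DTransformImages above samples the patch by area fraction and aspect ratio rather than by width and height directly. The same idea as a standalone sketch (the helper name is ours; the original retries until a box fits, whereas here a failed draw just returns None):

import math
import random

def sample_occlusion_box(width, height, area=(0.0, 0.2), ratio=(0.3, 1 / 0.3)):
    # Draw a patch area as a fraction of the image, then an aspect ratio.
    synth_area = random.uniform(*area) * width * height
    synth_ratio = random.uniform(*ratio)
    h = math.sqrt(synth_area * synth_ratio)
    w = math.sqrt(synth_area / synth_ratio)
    x = random.random() * (width - w - 1)
    y = random.random() * (height - h - 1)
    if x >= 0 and y >= 0 and x + w < width and y + h < height:
        return int(x), int(y), int(w), int(h)
    return None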
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence -from numbers import Integral - -import cv2 -import copy -import numpy as np -import random -import math - -from .operators import BaseOperator, register_op -from .batch_operators import Gt2TTFTarget -from ppdet.modeling.bbox_utils import bbox_iou_np_expand -from ppdet.utils.logger import setup_logger -from .op_helper import gaussian_radius -logger = setup_logger(__name__) - -__all__ = [ - 'RGBReverse', 'LetterBoxResize', 'MOTRandomAffine', 'Gt2JDETargetThres', - 'Gt2JDETargetMax', 'Gt2FairMOTTarget' -] - - -@register_op -class RGBReverse(BaseOperator): - """RGB to BGR, or BGR to RGB, sensitive to MOTRandomAffine - """ - - def __init__(self): - super(RGBReverse, self).__init__() - - def apply(self, sample, context=None): - im = sample['image'] - sample['image'] = np.ascontiguousarray(im[:, :, ::-1]) - return sample - - -@register_op -class LetterBoxResize(BaseOperator): - def __init__(self, target_size): - """ - Resize image to target size, convert normalized xywh to pixel xyxy - format ([x_center, y_center, width, height] -> [x0, y0, x1, y1]). - Args: - target_size (int|list): image target size. - """ - super(LetterBoxResize, self).__init__() - if not isinstance(target_size, (Integral, Sequence)): - raise TypeError( - "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". - format(type(target_size))) - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - - def apply_image(self, img, height, width, color=(127.5, 127.5, 127.5)): - # letterbox: resize a rectangular image to a padded rectangular - shape = img.shape[:2] # [height, width] - ratio_h = float(height) / shape[0] - ratio_w = float(width) / shape[1] - ratio = min(ratio_h, ratio_w) - new_shape = (round(shape[1] * ratio), - round(shape[0] * ratio)) # [width, height] - padw = (width - new_shape[0]) / 2 - padh = (height - new_shape[1]) / 2 - top, bottom = round(padh - 0.1), round(padh + 0.1) - left, right = round(padw - 0.1), round(padw + 0.1) - - img = cv2.resize( - img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border - img = cv2.copyMakeBorder( - img, top, bottom, left, right, cv2.BORDER_CONSTANT, - value=color) # padded rectangular - return img, ratio, padw, padh - - def apply_bbox(self, bbox0, h, w, ratio, padw, padh): - bboxes = bbox0.copy() - bboxes[:, 0] = ratio * w * (bbox0[:, 0] - bbox0[:, 2] / 2) + padw - bboxes[:, 1] = ratio * h * (bbox0[:, 1] - bbox0[:, 3] / 2) + padh - bboxes[:, 2] = ratio * w * (bbox0[:, 0] + bbox0[:, 2] / 2) + padw - bboxes[:, 3] = ratio * h * (bbox0[:, 1] + bbox0[:, 3] / 2) + padh - return bboxes - - def apply(self, sample, context=None): - """ Resize the image numpy. 
- """ - im = sample['image'] - h, w = sample['im_shape'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - from PIL import UnidentifiedImageError - raise UnidentifiedImageError( - '{}: image is not 3-dimensional.'.format(self)) - - # apply image - height, width = self.target_size - img, ratio, padw, padh = self.apply_image( - im, height=height, width=width) - - sample['image'] = img - new_shape = (round(h * ratio), round(w * ratio)) - sample['im_shape'] = np.asarray(new_shape, dtype=np.float32) - sample['scale_factor'] = np.asarray([ratio, ratio], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], h, w, ratio, - padw, padh) - return sample - - -@register_op -class MOTRandomAffine(BaseOperator): - """ - Affine transform to image and coords to achieve the rotate, scale and - shift effect for training image. - - Args: - degrees (list[2]): the rotate range to apply, transform range is [min, max] - translate (list[2]): the translate range to apply, transform range is [min, max] - scale (list[2]): the scale range to apply, transform range is [min, max] - shear (list[2]): the shear range to apply, transform range is [min, max] - borderValue (list[3]): value used in case of a constant border when appling - the perspective transformation - reject_outside (bool): reject warped bounding bboxes outside of image - - Returns: - records(dict): contain the image and coords after tranformed - - """ - - def __init__(self, - degrees=(-5, 5), - translate=(0.10, 0.10), - scale=(0.50, 1.20), - shear=(-2, 2), - borderValue=(127.5, 127.5, 127.5), - reject_outside=True): - super(MOTRandomAffine, self).__init__() - self.degrees = degrees - self.translate = translate - self.scale = scale - self.shear = shear - self.borderValue = borderValue - self.reject_outside = reject_outside - - def apply(self, sample, context=None): - # https://medium.com/uruvideo/dataset-augmentation-with-random-homographies-a8f4b44830d4 - border = 0 # width of added border (optional) - - img = sample['image'] - height, width = img.shape[0], img.shape[1] - - # Rotation and Scale - R = np.eye(3) - a = random.random() * (self.degrees[1] - self.degrees[0] - ) + self.degrees[0] - s = random.random() * (self.scale[1] - self.scale[0]) + self.scale[0] - R[:2] = cv2.getRotationMatrix2D( - angle=a, center=(width / 2, height / 2), scale=s) - - # Translation - T = np.eye(3) - T[0, 2] = ( - random.random() * 2 - 1 - ) * self.translate[0] * height + border # x translation (pixels) - T[1, 2] = ( - random.random() * 2 - 1 - ) * self.translate[1] * width + border # y translation (pixels) - - # Shear - S = np.eye(3) - S[0, 1] = math.tan((random.random() * - (self.shear[1] - self.shear[0]) + self.shear[0]) * - math.pi / 180) # x shear (deg) - S[1, 0] = math.tan((random.random() * - (self.shear[1] - self.shear[0]) + self.shear[0]) * - math.pi / 180) # y shear (deg) - - M = S @T @R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
- imw = cv2.warpPerspective( - img, - M, - dsize=(width, height), - flags=cv2.INTER_LINEAR, - borderValue=self.borderValue) # BGR order borderValue - - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - targets = sample['gt_bbox'] - n = targets.shape[0] - points = targets.copy() - area0 = (points[:, 2] - points[:, 0]) * ( - points[:, 3] - points[:, 1]) - - # warp points - xy = np.ones((n * 4, 3)) - xy[:, :2] = points[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( - n * 4, 2) # x1y1, x2y2, x1y2, x2y1 - xy = (xy @M.T)[:, :2].reshape(n, 8) - - # create new boxes - x = xy[:, [0, 2, 4, 6]] - y = xy[:, [1, 3, 5, 7]] - xy = np.concatenate( - (x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T - - # apply angle-based reduction - radians = a * math.pi / 180 - reduction = max(abs(math.sin(radians)), abs(math.cos(radians)))**0.5 - x = (xy[:, 2] + xy[:, 0]) / 2 - y = (xy[:, 3] + xy[:, 1]) / 2 - w = (xy[:, 2] - xy[:, 0]) * reduction - h = (xy[:, 3] - xy[:, 1]) * reduction - xy = np.concatenate( - (x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, n).T - - # reject warped points outside of image - if self.reject_outside: - np.clip(xy[:, 0], 0, width, out=xy[:, 0]) - np.clip(xy[:, 2], 0, width, out=xy[:, 2]) - np.clip(xy[:, 1], 0, height, out=xy[:, 1]) - np.clip(xy[:, 3], 0, height, out=xy[:, 3]) - w = xy[:, 2] - xy[:, 0] - h = xy[:, 3] - xy[:, 1] - area = w * h - ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) - i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) - - if sum(i) > 0: - sample['gt_bbox'] = xy[i].astype(sample['gt_bbox'].dtype) - sample['gt_class'] = sample['gt_class'][i] - if 'difficult' in sample: - sample['difficult'] = sample['difficult'][i] - if 'gt_ide' in sample: - sample['gt_ide'] = sample['gt_ide'][i] - if 'is_crowd' in sample: - sample['is_crowd'] = sample['is_crowd'][i] - sample['image'] = imw - return sample - else: - return sample - - -@register_op -class Gt2JDETargetThres(BaseOperator): - __shared__ = ['num_classes'] - """ - Generate JDE targets by groud truth data when training - Args: - anchors (list): anchors of JDE model - anchor_masks (list): anchor_masks of JDE model - downsample_ratios (list): downsample ratios of JDE model - ide_thresh (float): thresh of identity, higher is groud truth - fg_thresh (float): thresh of foreground, higher is foreground - bg_thresh (float): thresh of background, lower is background - num_classes (int): number of classes - """ - - def __init__(self, - anchors, - anchor_masks, - downsample_ratios, - ide_thresh=0.5, - fg_thresh=0.5, - bg_thresh=0.4, - num_classes=1): - super(Gt2JDETargetThres, self).__init__() - self.anchors = anchors - self.anchor_masks = anchor_masks - self.downsample_ratios = downsample_ratios - self.ide_thresh = ide_thresh - self.fg_thresh = fg_thresh - self.bg_thresh = bg_thresh - self.num_classes = num_classes - - def generate_anchor(self, nGh, nGw, anchor_hw): - nA = len(anchor_hw) - yy, xx = np.meshgrid(np.arange(nGh), np.arange(nGw)) - - mesh = np.stack([xx.T, yy.T], axis=0) # [2, nGh, nGw] - mesh = np.repeat(mesh[None, :], nA, axis=0) # [nA, 2, nGh, nGw] - - anchor_offset_mesh = anchor_hw[:, :, None][:, :, :, None] - anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGh, axis=-2) - anchor_offset_mesh = np.repeat(anchor_offset_mesh, nGw, axis=-1) - - anchor_mesh = np.concatenate( - [mesh, anchor_offset_mesh], axis=1) # [nA, 4, nGh, nGw] - return anchor_mesh - - def encode_delta(self, gt_box_list, fg_anchor_list): - px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ - 
fg_anchor_list[:, 2], fg_anchor_list[:,3] - gx, gy, gw, gh = gt_box_list[:, 0], gt_box_list[:, 1], \ - gt_box_list[:, 2], gt_box_list[:, 3] - dx = (gx - px) / pw - dy = (gy - py) / ph - dw = np.log(gw / pw) - dh = np.log(gh / ph) - return np.stack([dx, dy, dw, dh], axis=1) - - def pad_box(self, sample, num_max): - assert 'gt_bbox' in sample - bbox = sample['gt_bbox'] - gt_num = len(bbox) - pad_bbox = np.zeros((num_max, 4), dtype=np.float32) - if gt_num > 0: - pad_bbox[:gt_num, :] = bbox[:gt_num, :] - sample['gt_bbox'] = pad_bbox - if 'gt_score' in sample: - pad_score = np.zeros((num_max, ), dtype=np.float32) - if gt_num > 0: - pad_score[:gt_num] = sample['gt_score'][:gt_num, 0] - sample['gt_score'] = pad_score - if 'difficult' in sample: - pad_diff = np.zeros((num_max, ), dtype=np.int32) - if gt_num > 0: - pad_diff[:gt_num] = sample['difficult'][:gt_num, 0] - sample['difficult'] = pad_diff - if 'is_crowd' in sample: - pad_crowd = np.zeros((num_max, ), dtype=np.int32) - if gt_num > 0: - pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0] - sample['is_crowd'] = pad_crowd - if 'gt_ide' in sample: - pad_ide = np.zeros((num_max, ), dtype=np.int32) - if gt_num > 0: - pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0] - sample['gt_ide'] = pad_ide - return sample - - def __call__(self, samples, context=None): - assert len(self.anchor_masks) == len(self.downsample_ratios), \ - "anchor_masks', and 'downsample_ratios' should have same length." - h, w = samples[0]['image'].shape[1:3] - - num_max = 0 - for sample in samples: - num_max = max(num_max, len(sample['gt_bbox'])) - - for sample in samples: - gt_bbox = sample['gt_bbox'] - gt_ide = sample['gt_ide'] - for i, (anchor_hw, downsample_ratio - ) in enumerate(zip(self.anchors, self.downsample_ratios)): - anchor_hw = np.array( - anchor_hw, dtype=np.float32) / downsample_ratio - nA = len(anchor_hw) - nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio) - tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32) - tconf = np.zeros((nA, nGh, nGw), dtype=np.float32) - tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32) - - gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy() - gxy[:, 0] = gxy[:, 0] * nGw - gxy[:, 1] = gxy[:, 1] * nGh - gwh[:, 0] = gwh[:, 0] * nGw - gwh[:, 1] = gwh[:, 1] * nGh - gxy[:, 0] = np.clip(gxy[:, 0], 0, nGw - 1) - gxy[:, 1] = np.clip(gxy[:, 1], 0, nGh - 1) - tboxes = np.concatenate([gxy, gwh], axis=1) - - anchor_mesh = self.generate_anchor(nGh, nGw, anchor_hw) - - anchor_list = np.transpose(anchor_mesh, - (0, 2, 3, 1)).reshape(-1, 4) - iou_pdist = bbox_iou_np_expand( - anchor_list, tboxes, x1y1x2y2=False) - - iou_max = np.max(iou_pdist, axis=1) - max_gt_index = np.argmax(iou_pdist, axis=1) - - iou_map = iou_max.reshape(nA, nGh, nGw) - gt_index_map = max_gt_index.reshape(nA, nGh, nGw) - - id_index = iou_map > self.ide_thresh - fg_index = iou_map > self.fg_thresh - bg_index = iou_map < self.bg_thresh - ign_index = (iou_map < self.fg_thresh) * ( - iou_map > self.bg_thresh) - tconf[fg_index] = 1 - tconf[bg_index] = 0 - tconf[ign_index] = -1 - - gt_index = gt_index_map[fg_index] - gt_box_list = tboxes[gt_index] - gt_id_list = gt_ide[gt_index_map[id_index]] - - if np.sum(fg_index) > 0: - tid[id_index] = gt_id_list - - fg_anchor_list = anchor_list.reshape(nA, nGh, nGw, - 4)[fg_index] - delta_target = self.encode_delta(gt_box_list, - fg_anchor_list) - tbox[fg_index] = delta_target - - sample['tbox{}'.format(i)] = tbox - sample['tconf{}'.format(i)] = tconf - sample['tide{}'.format(i)] = tid - sample.pop('gt_class') - sample = 
self.pad_box(sample, num_max)
-        return samples
-
-
-@register_op
-class Gt2JDETargetMax(BaseOperator):
-    __shared__ = ['num_classes']
-    """
-    Generate JDE targets by ground truth data when evaluating
-    Args:
-        anchors (list): anchors of JDE model
-        anchor_masks (list): anchor_masks of JDE model
-        downsample_ratios (list): downsample ratios of JDE model
-        max_iou_thresh (float): iou thresh for high quality anchor
-        num_classes (int): number of classes
-    """
-
-    def __init__(self,
-                 anchors,
-                 anchor_masks,
-                 downsample_ratios,
-                 max_iou_thresh=0.60,
-                 num_classes=1):
-        super(Gt2JDETargetMax, self).__init__()
-        self.anchors = anchors
-        self.anchor_masks = anchor_masks
-        self.downsample_ratios = downsample_ratios
-        self.max_iou_thresh = max_iou_thresh
-        self.num_classes = num_classes
-
-    def __call__(self, samples, context=None):
-        assert len(self.anchor_masks) == len(self.downsample_ratios), \
-            "'anchor_masks' and 'downsample_ratios' should have the same length."
-        h, w = samples[0]['image'].shape[1:3]
-        for sample in samples:
-            gt_bbox = sample['gt_bbox']
-            gt_ide = sample['gt_ide']
-            for i, (anchor_hw, downsample_ratio
-                    ) in enumerate(zip(self.anchors, self.downsample_ratios)):
-                anchor_hw = np.array(
-                    anchor_hw, dtype=np.float32) / downsample_ratio
-                nA = len(anchor_hw)
-                nGh, nGw = int(h / downsample_ratio), int(w / downsample_ratio)
-                tbox = np.zeros((nA, nGh, nGw, 4), dtype=np.float32)
-                tconf = np.zeros((nA, nGh, nGw), dtype=np.float32)
-                tid = -np.ones((nA, nGh, nGw, 1), dtype=np.float32)
-
-                gxy, gwh = gt_bbox[:, 0:2].copy(), gt_bbox[:, 2:4].copy()
-                gxy[:, 0] = gxy[:, 0] * nGw
-                gxy[:, 1] = gxy[:, 1] * nGh
-                gwh[:, 0] = gwh[:, 0] * nGw
-                gwh[:, 1] = gwh[:, 1] * nGh
-                gi = np.clip(gxy[:, 0], 0, nGw - 1).astype(int)
-                gj = np.clip(gxy[:, 1], 0, nGh - 1).astype(int)
-
-                # iou of targets-anchors (using wh only)
-                box1 = gwh
-                box2 = anchor_hw[:, None, :]
-                inter_area = np.minimum(box1, box2).prod(2)
-                iou = inter_area / (
-                    box1.prod(1) + box2.prod(2) - inter_area + 1e-16)
-
-                # Select best iou_pred and anchor
-                iou_best = iou.max(0)  # best anchor [0-2] for each target
-                a = np.argmax(iou, axis=0)
-
-                # Select best unique target-anchor combinations
-                iou_order = np.argsort(-iou_best)  # best to worst
-
-                # Unique anchor selection
-                u = np.stack((gi, gj, a), 0)[:, iou_order]
-                _, first_unique = np.unique(u, axis=1, return_index=True)
-                mask = iou_order[first_unique]
-                # best anchor must share significant commonality (iou) with target
-                # TODO: examine arbitrary threshold
-                idx = mask[iou_best[mask] > self.max_iou_thresh]
-
-                if len(idx) > 0:
-                    a_i, gj_i, gi_i = a[idx], gj[idx], gi[idx]
-                    t_box = gt_bbox[idx]
-                    t_id = gt_ide[idx]
-                    if len(t_box.shape) == 1:
-                        t_box = t_box.reshape(1, 4)
-
-                    gxy, gwh = t_box[:, 0:2].copy(), t_box[:, 2:4].copy()
-                    gxy[:, 0] = gxy[:, 0] * nGw
-                    gxy[:, 1] = gxy[:, 1] * nGh
-                    gwh[:, 0] = gwh[:, 0] * nGw
-                    gwh[:, 1] = gwh[:, 1] * nGh
-
-                    # XY coordinates
-                    tbox[:, :, :, 0:2][a_i, gj_i, gi_i] = gxy - gxy.astype(int)
-                    # Width and height in yolo method
-                    tbox[:, :, :, 2:4][a_i, gj_i, gi_i] = np.log(gwh /
-                                                                 anchor_hw[a_i])
-                    tconf[a_i, gj_i, gi_i] = 1
-                    tid[a_i, gj_i, gi_i] = t_id
-
-                sample['tbox{}'.format(i)] = tbox
-                sample['tconf{}'.format(i)] = tconf
-                sample['tide{}'.format(i)] = tid
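[Editor's note] The "wh-only" IoU in Gt2JDETargetMax above deliberately ignores box centers and compares shapes only, which is the usual YOLO-style anchor assignment: each target is claimed by the anchor whose width/height match it best. A minimal standalone sketch of that computation, with hypothetical anchor/target sizes (not taken from any config in this repo):

import numpy as np

anchor_hw = np.array([[8., 6.], [16., 12.], [32., 24.]])  # nA x 2, grid units
gwh = np.array([[10., 7.], [30., 20.]])                   # nT x 2, grid units

inter = np.minimum(gwh, anchor_hw[:, None, :]).prod(2)    # nA x nT overlap area
iou = inter / (gwh.prod(1) + anchor_hw[:, None, :].prod(2) - inter + 1e-16)
best_anchor = iou.argmax(0)  # index of the best-matching anchor per target

For these toy values the 8x6 anchor wins the 10x7 target and the 32x24 anchor wins the 30x20 one; targets whose best IoU stays below max_iou_thresh are simply left unassigned.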
-
-
-class Gt2FairMOTTarget(Gt2TTFTarget):
-    __shared__ = ['num_classes']
-    """
-    Generate FairMOT targets by ground truth data.
-    The differences between Gt2FairMOTTarget and Gt2TTFTarget are:
-        1. the Gaussian kernel radius used to generate a heatmap.
-        2. the targets needed during training.
-
-    Args:
-        num_classes (int): the number of classes.
-        down_ratio (int): the down ratio from images to heatmap, 4 by default.
-        max_objs (int): the maximum number of ground truth objects in an image, 500 by default.
-    """
-
-    def __init__(self, num_classes=1, down_ratio=4, max_objs=500):
-        super(Gt2TTFTarget, self).__init__()
-        self.down_ratio = down_ratio
-        self.num_classes = num_classes
-        self.max_objs = max_objs
-
-    def __call__(self, samples, context=None):
-        for b_id, sample in enumerate(samples):
-            output_h = sample['image'].shape[1] // self.down_ratio
-            output_w = sample['image'].shape[2] // self.down_ratio
-
-            heatmap = np.zeros(
-                (self.num_classes, output_h, output_w), dtype='float32')
-            bbox_size = np.zeros((self.max_objs, 4), dtype=np.float32)
-            center_offset = np.zeros((self.max_objs, 2), dtype=np.float32)
-            index = np.zeros((self.max_objs, ), dtype=np.int64)
-            index_mask = np.zeros((self.max_objs, ), dtype=np.int32)
-            reid = np.zeros((self.max_objs, ), dtype=np.int64)
-            bbox_xys = np.zeros((self.max_objs, 4), dtype=np.float32)
-            if self.num_classes > 1:
-                # each category corresponds to a set of track ids
-                cls_tr_ids = np.zeros(
-                    (self.num_classes, output_h, output_w), dtype=np.int64)
-                cls_id_map = np.full((output_h, output_w), -1, dtype=np.int64)
-
-            gt_bbox = sample['gt_bbox']
-            gt_class = sample['gt_class']
-            gt_ide = sample['gt_ide']
-
-            for k in range(len(gt_bbox)):
-                cls_id = gt_class[k][0]
-                bbox = gt_bbox[k]
-                ide = gt_ide[k][0]
-                bbox[[0, 2]] = bbox[[0, 2]] * output_w
-                bbox[[1, 3]] = bbox[[1, 3]] * output_h
-                bbox_amodal = copy.deepcopy(bbox)
-                bbox_amodal[0] = bbox_amodal[0] - bbox_amodal[2] / 2.
-                bbox_amodal[1] = bbox_amodal[1] - bbox_amodal[3] / 2.
-                bbox_amodal[2] = bbox_amodal[0] + bbox_amodal[2]
-                bbox_amodal[3] = bbox_amodal[1] + bbox_amodal[3]
-                bbox[0] = np.clip(bbox[0], 0, output_w - 1)
-                bbox[1] = np.clip(bbox[1], 0, output_h - 1)
-                h = bbox[3]
-                w = bbox[2]
-
-                bbox_xy = copy.deepcopy(bbox)
-                bbox_xy[0] = bbox_xy[0] - bbox_xy[2] / 2
-                bbox_xy[1] = bbox_xy[1] - bbox_xy[3] / 2
-                bbox_xy[2] = bbox_xy[0] + bbox_xy[2]
-                bbox_xy[3] = bbox_xy[1] + bbox_xy[3]
-
-                if h > 0 and w > 0:
-                    radius = gaussian_radius((math.ceil(h), math.ceil(w)), 0.7)
-                    radius = max(0, int(radius))
-                    ct = np.array([bbox[0], bbox[1]], dtype=np.float32)
-                    ct_int = ct.astype(np.int32)
-                    self.draw_truncate_gaussian(heatmap[cls_id], ct_int, radius,
-                                                radius)
-                    bbox_size[k] = ct[0] - bbox_amodal[0], ct[1] - bbox_amodal[1], \
-                        bbox_amodal[2] - ct[0], bbox_amodal[3] - ct[1]
-
-                    index[k] = ct_int[1] * output_w + ct_int[0]
-                    center_offset[k] = ct - ct_int
-                    index_mask[k] = 1
-                    reid[k] = ide
-                    bbox_xys[k] = bbox_xy
-                    if self.num_classes > 1:
-                        cls_id_map[ct_int[1], ct_int[0]] = cls_id
-                        cls_tr_ids[cls_id][ct_int[1]][ct_int[0]] = ide - 1
-                        # track id start from 0
-
-            sample['heatmap'] = heatmap
-            sample['index'] = index
-            sample['offset'] = center_offset
-            sample['size'] = bbox_size
-            sample['index_mask'] = index_mask
-            sample['reid'] = reid
-            if self.num_classes > 1:
-                sample['cls_id_map'] = cls_id_map
-                sample['cls_tr_ids'] = cls_tr_ids
-            sample['bbox_xys'] = bbox_xys
-            sample.pop('is_crowd', None)
-            sample.pop('difficult', None)
-            sample.pop('gt_class', None)
-            sample.pop('gt_bbox', None)
-            sample.pop('gt_score', None)
-            sample.pop('gt_ide', None)
-        return samples
diff --git a/pdfdet/models/Paddle/ppdet/data/transform/op_helper.py b/pdfdet/models/Paddle/ppdet/data/transform/op_helper.py
deleted file mode 100644
index 6c40030..0000000
---
a/pdfdet/models/Paddle/ppdet/data/transform/op_helper.py +++ /dev/null @@ -1,494 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# this file contains helper methods for BBOX processing - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import random -import math -import cv2 - - -def meet_emit_constraint(src_bbox, sample_bbox): - center_x = (src_bbox[2] + src_bbox[0]) / 2 - center_y = (src_bbox[3] + src_bbox[1]) / 2 - if center_x >= sample_bbox[0] and \ - center_x <= sample_bbox[2] and \ - center_y >= sample_bbox[1] and \ - center_y <= sample_bbox[3]: - return True - return False - - -def clip_bbox(src_bbox): - src_bbox[0] = max(min(src_bbox[0], 1.0), 0.0) - src_bbox[1] = max(min(src_bbox[1], 1.0), 0.0) - src_bbox[2] = max(min(src_bbox[2], 1.0), 0.0) - src_bbox[3] = max(min(src_bbox[3], 1.0), 0.0) - return src_bbox - - -def bbox_area(src_bbox): - if src_bbox[2] < src_bbox[0] or src_bbox[3] < src_bbox[1]: - return 0. - else: - width = src_bbox[2] - src_bbox[0] - height = src_bbox[3] - src_bbox[1] - return width * height - - -def is_overlap(object_bbox, sample_bbox): - if object_bbox[0] >= sample_bbox[2] or \ - object_bbox[2] <= sample_bbox[0] or \ - object_bbox[1] >= sample_bbox[3] or \ - object_bbox[3] <= sample_bbox[1]: - return False - else: - return True - - -def filter_and_process(sample_bbox, bboxes, labels, scores=None, - keypoints=None): - new_bboxes = [] - new_labels = [] - new_scores = [] - new_keypoints = [] - new_kp_ignore = [] - for i in range(len(bboxes)): - new_bbox = [0, 0, 0, 0] - obj_bbox = [bboxes[i][0], bboxes[i][1], bboxes[i][2], bboxes[i][3]] - if not meet_emit_constraint(obj_bbox, sample_bbox): - continue - if not is_overlap(obj_bbox, sample_bbox): - continue - sample_width = sample_bbox[2] - sample_bbox[0] - sample_height = sample_bbox[3] - sample_bbox[1] - new_bbox[0] = (obj_bbox[0] - sample_bbox[0]) / sample_width - new_bbox[1] = (obj_bbox[1] - sample_bbox[1]) / sample_height - new_bbox[2] = (obj_bbox[2] - sample_bbox[0]) / sample_width - new_bbox[3] = (obj_bbox[3] - sample_bbox[1]) / sample_height - new_bbox = clip_bbox(new_bbox) - if bbox_area(new_bbox) > 0: - new_bboxes.append(new_bbox) - new_labels.append([labels[i][0]]) - if scores is not None: - new_scores.append([scores[i][0]]) - if keypoints is not None: - sample_keypoint = keypoints[0][i] - for j in range(len(sample_keypoint)): - kp_len = sample_height if j % 2 else sample_width - sample_coord = sample_bbox[1] if j % 2 else sample_bbox[0] - sample_keypoint[j] = ( - sample_keypoint[j] - sample_coord) / kp_len - sample_keypoint[j] = max(min(sample_keypoint[j], 1.0), 0.0) - new_keypoints.append(sample_keypoint) - new_kp_ignore.append(keypoints[1][i]) - - bboxes = np.array(new_bboxes) - labels = np.array(new_labels) - scores = np.array(new_scores) - if keypoints is not None: - keypoints = np.array(new_keypoints) - new_kp_ignore = np.array(new_kp_ignore) - 
return bboxes, labels, scores, (keypoints, new_kp_ignore) - return bboxes, labels, scores - - -def bbox_area_sampling(bboxes, labels, scores, target_size, min_size): - new_bboxes = [] - new_labels = [] - new_scores = [] - for i, bbox in enumerate(bboxes): - w = float((bbox[2] - bbox[0]) * target_size) - h = float((bbox[3] - bbox[1]) * target_size) - if w * h < float(min_size * min_size): - continue - else: - new_bboxes.append(bbox) - new_labels.append(labels[i]) - if scores is not None and scores.size != 0: - new_scores.append(scores[i]) - bboxes = np.array(new_bboxes) - labels = np.array(new_labels) - scores = np.array(new_scores) - return bboxes, labels, scores - - -def generate_sample_bbox(sampler): - scale = np.random.uniform(sampler[2], sampler[3]) - aspect_ratio = np.random.uniform(sampler[4], sampler[5]) - aspect_ratio = max(aspect_ratio, (scale**2.0)) - aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) - bbox_width = scale * (aspect_ratio**0.5) - bbox_height = scale / (aspect_ratio**0.5) - xmin_bound = 1 - bbox_width - ymin_bound = 1 - bbox_height - xmin = np.random.uniform(0, xmin_bound) - ymin = np.random.uniform(0, ymin_bound) - xmax = xmin + bbox_width - ymax = ymin + bbox_height - sampled_bbox = [xmin, ymin, xmax, ymax] - return sampled_bbox - - -def generate_sample_bbox_square(sampler, image_width, image_height): - scale = np.random.uniform(sampler[2], sampler[3]) - aspect_ratio = np.random.uniform(sampler[4], sampler[5]) - aspect_ratio = max(aspect_ratio, (scale**2.0)) - aspect_ratio = min(aspect_ratio, 1 / (scale**2.0)) - bbox_width = scale * (aspect_ratio**0.5) - bbox_height = scale / (aspect_ratio**0.5) - if image_height < image_width: - bbox_width = bbox_height * image_height / image_width - else: - bbox_height = bbox_width * image_width / image_height - xmin_bound = 1 - bbox_width - ymin_bound = 1 - bbox_height - xmin = np.random.uniform(0, xmin_bound) - ymin = np.random.uniform(0, ymin_bound) - xmax = xmin + bbox_width - ymax = ymin + bbox_height - sampled_bbox = [xmin, ymin, xmax, ymax] - return sampled_bbox - - -def data_anchor_sampling(bbox_labels, image_width, image_height, scale_array, - resize_width): - num_gt = len(bbox_labels) - # np.random.randint range: [low, high) - rand_idx = np.random.randint(0, num_gt) if num_gt != 0 else 0 - - if num_gt != 0: - norm_xmin = bbox_labels[rand_idx][0] - norm_ymin = bbox_labels[rand_idx][1] - norm_xmax = bbox_labels[rand_idx][2] - norm_ymax = bbox_labels[rand_idx][3] - - xmin = norm_xmin * image_width - ymin = norm_ymin * image_height - wid = image_width * (norm_xmax - norm_xmin) - hei = image_height * (norm_ymax - norm_ymin) - range_size = 0 - - area = wid * hei - for scale_ind in range(0, len(scale_array) - 1): - if area > scale_array[scale_ind] ** 2 and area < \ - scale_array[scale_ind + 1] ** 2: - range_size = scale_ind + 1 - break - - if area > scale_array[len(scale_array) - 2]**2: - range_size = len(scale_array) - 2 - - scale_choose = 0.0 - if range_size == 0: - rand_idx_size = 0 - else: - # np.random.randint range: [low, high) - rng_rand_size = np.random.randint(0, range_size + 1) - rand_idx_size = rng_rand_size % (range_size + 1) - - if rand_idx_size == range_size: - min_resize_val = scale_array[rand_idx_size] / 2.0 - max_resize_val = min(2.0 * scale_array[rand_idx_size], - 2 * math.sqrt(wid * hei)) - scale_choose = random.uniform(min_resize_val, max_resize_val) - else: - min_resize_val = scale_array[rand_idx_size] / 2.0 - max_resize_val = 2.0 * scale_array[rand_idx_size] - scale_choose = 
random.uniform(min_resize_val, max_resize_val) - - sample_bbox_size = wid * resize_width / scale_choose - - w_off_orig = 0.0 - h_off_orig = 0.0 - if sample_bbox_size < max(image_height, image_width): - if wid <= sample_bbox_size: - w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, - xmin) - else: - w_off_orig = np.random.uniform(xmin, - xmin + wid - sample_bbox_size) - - if hei <= sample_bbox_size: - h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, - ymin) - else: - h_off_orig = np.random.uniform(ymin, - ymin + hei - sample_bbox_size) - - else: - w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) - h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0) - - w_off_orig = math.floor(w_off_orig) - h_off_orig = math.floor(h_off_orig) - - # Figure out top left coordinates. - w_off = float(w_off_orig / image_width) - h_off = float(h_off_orig / image_height) - - sampled_bbox = [ - w_off, h_off, w_off + float(sample_bbox_size / image_width), - h_off + float(sample_bbox_size / image_height) - ] - return sampled_bbox - else: - return 0 - - -def jaccard_overlap(sample_bbox, object_bbox): - if sample_bbox[0] >= object_bbox[2] or \ - sample_bbox[2] <= object_bbox[0] or \ - sample_bbox[1] >= object_bbox[3] or \ - sample_bbox[3] <= object_bbox[1]: - return 0 - intersect_xmin = max(sample_bbox[0], object_bbox[0]) - intersect_ymin = max(sample_bbox[1], object_bbox[1]) - intersect_xmax = min(sample_bbox[2], object_bbox[2]) - intersect_ymax = min(sample_bbox[3], object_bbox[3]) - intersect_size = (intersect_xmax - intersect_xmin) * ( - intersect_ymax - intersect_ymin) - sample_bbox_size = bbox_area(sample_bbox) - object_bbox_size = bbox_area(object_bbox) - overlap = intersect_size / ( - sample_bbox_size + object_bbox_size - intersect_size) - return overlap - - -def intersect_bbox(bbox1, bbox2): - if bbox2[0] > bbox1[2] or bbox2[2] < bbox1[0] or \ - bbox2[1] > bbox1[3] or bbox2[3] < bbox1[1]: - intersection_box = [0.0, 0.0, 0.0, 0.0] - else: - intersection_box = [ - max(bbox1[0], bbox2[0]), max(bbox1[1], bbox2[1]), - min(bbox1[2], bbox2[2]), min(bbox1[3], bbox2[3]) - ] - return intersection_box - - -def bbox_coverage(bbox1, bbox2): - inter_box = intersect_bbox(bbox1, bbox2) - intersect_size = bbox_area(inter_box) - - if intersect_size > 0: - bbox1_size = bbox_area(bbox1) - return intersect_size / bbox1_size - else: - return 0. 
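[Editor's note] jaccard_overlap and bbox_coverage above answer different questions: IoU normalizes the intersection by the union of both boxes, while coverage normalizes by the first box's own area, so a small object fully inside a large crop has low IoU but coverage 1.0. A standalone restatement with toy normalized boxes (it mirrors the helpers above rather than importing them):

def iou(b1, b2):
    # intersection over union of [xmin, ymin, xmax, ymax] boxes
    ix = max(0.0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
    iy = max(0.0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
    inter = ix * iy
    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    a2 = (b2[2] - b2[0]) * (b2[3] - b2[1])
    return inter / (a1 + a2 - inter)

def coverage(b1, b2):
    # fraction of b1's area covered by b2
    ix = max(0.0, min(b1[2], b2[2]) - max(b1[0], b2[0]))
    iy = max(0.0, min(b1[3], b2[3]) - max(b1[1], b2[1]))
    a1 = (b1[2] - b1[0]) * (b1[3] - b1[1])
    return ix * iy / a1

gt = [0.40, 0.40, 0.50, 0.50]    # small object
crop = [0.25, 0.25, 0.75, 0.75]  # candidate crop window
print(iou(gt, crop))       # 0.04: tiny relative to the union
print(coverage(gt, crop))  # 1.0: yet the crop fully contains it

This is why satisfy_sample_constraint_coverage below checks sampler slots 6-7 (IoU bounds) and 8-9 (coverage bounds) separately.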
- - -def satisfy_sample_constraint(sampler, - sample_bbox, - gt_bboxes, - satisfy_all=False): - if sampler[6] == 0 and sampler[7] == 0: - return True - satisfied = [] - for i in range(len(gt_bboxes)): - object_bbox = [ - gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] - ] - overlap = jaccard_overlap(sample_bbox, object_bbox) - if sampler[6] != 0 and \ - overlap < sampler[6]: - satisfied.append(False) - continue - if sampler[7] != 0 and \ - overlap > sampler[7]: - satisfied.append(False) - continue - satisfied.append(True) - if not satisfy_all: - return True - - if satisfy_all: - return np.all(satisfied) - else: - return False - - -def satisfy_sample_constraint_coverage(sampler, sample_bbox, gt_bboxes): - if sampler[6] == 0 and sampler[7] == 0: - has_jaccard_overlap = False - else: - has_jaccard_overlap = True - if sampler[8] == 0 and sampler[9] == 0: - has_object_coverage = False - else: - has_object_coverage = True - - if not has_jaccard_overlap and not has_object_coverage: - return True - found = False - for i in range(len(gt_bboxes)): - object_bbox = [ - gt_bboxes[i][0], gt_bboxes[i][1], gt_bboxes[i][2], gt_bboxes[i][3] - ] - if has_jaccard_overlap: - overlap = jaccard_overlap(sample_bbox, object_bbox) - if sampler[6] != 0 and \ - overlap < sampler[6]: - continue - if sampler[7] != 0 and \ - overlap > sampler[7]: - continue - found = True - if has_object_coverage: - object_coverage = bbox_coverage(object_bbox, sample_bbox) - if sampler[8] != 0 and \ - object_coverage < sampler[8]: - continue - if sampler[9] != 0 and \ - object_coverage > sampler[9]: - continue - found = True - if found: - return True - return found - - -def crop_image_sampling(img, sample_bbox, image_width, image_height, - target_size): - # no clipping here - xmin = int(sample_bbox[0] * image_width) - xmax = int(sample_bbox[2] * image_width) - ymin = int(sample_bbox[1] * image_height) - ymax = int(sample_bbox[3] * image_height) - - w_off = xmin - h_off = ymin - width = xmax - xmin - height = ymax - ymin - cross_xmin = max(0.0, float(w_off)) - cross_ymin = max(0.0, float(h_off)) - cross_xmax = min(float(w_off + width - 1.0), float(image_width)) - cross_ymax = min(float(h_off + height - 1.0), float(image_height)) - cross_width = cross_xmax - cross_xmin - cross_height = cross_ymax - cross_ymin - - roi_xmin = 0 if w_off >= 0 else abs(w_off) - roi_ymin = 0 if h_off >= 0 else abs(h_off) - roi_width = cross_width - roi_height = cross_height - - roi_y1 = int(roi_ymin) - roi_y2 = int(roi_ymin + roi_height) - roi_x1 = int(roi_xmin) - roi_x2 = int(roi_xmin + roi_width) - - cross_y1 = int(cross_ymin) - cross_y2 = int(cross_ymin + cross_height) - cross_x1 = int(cross_xmin) - cross_x2 = int(cross_xmin + cross_width) - - sample_img = np.zeros((height, width, 3)) - sample_img[roi_y1: roi_y2, roi_x1: roi_x2] = \ - img[cross_y1: cross_y2, cross_x1: cross_x2] - - sample_img = cv2.resize( - sample_img, (target_size, target_size), interpolation=cv2.INTER_AREA) - - return sample_img - - -def is_poly(segm): - assert isinstance(segm, (list, dict)), \ - "Invalid segm type: {}".format(type(segm)) - return isinstance(segm, list) - - -def gaussian_radius(bbox_size, min_overlap): - height, width = bbox_size - - a1 = 1 - b1 = (height + width) - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = np.sqrt(b1**2 - 4 * a1 * c1) - radius1 = (b1 + sq1) / (2 * a1) - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = np.sqrt(b2**2 - 4 * a2 * c2) - radius2 = (b2 + sq2) / 2 - - a3 = 4 * 
min_overlap - b3 = -2 * min_overlap * (height + width) - c3 = (min_overlap - 1) * width * height - sq3 = np.sqrt(b3**2 - 4 * a3 * c3) - radius3 = (b3 + sq3) / 2 - return min(radius1, radius2, radius3) - - -def draw_gaussian(heatmap, center, radius, k=1, delte=6): - diameter = 2 * radius + 1 - sigma = diameter / delte - gaussian = gaussian2D((diameter, diameter), sigma_x=sigma, sigma_y=sigma) - - x, y = center - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - - -def gaussian2D(shape, sigma_x=1, sigma_y=1): - m, n = [(ss - 1.) / 2. for ss in shape] - y, x = np.ogrid[-m:m + 1, -n:n + 1] - - h = np.exp(-(x * x / (2 * sigma_x * sigma_x) + y * y / (2 * sigma_y * - sigma_y))) - h[h < np.finfo(h.dtype).eps * h.max()] = 0 - return h - - -def draw_umich_gaussian(heatmap, center, radius, k=1): - """ - draw_umich_gaussian, refer to https://github.com/xingyizhou/CenterNet/blob/master/src/lib/utils/image.py#L126 - """ - diameter = 2 * radius + 1 - gaussian = gaussian2D( - (diameter, diameter), sigma_x=diameter / 6, sigma_y=diameter / 6) - - x, y = int(center[0]), int(center[1]) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - np.maximum(masked_heatmap, masked_gaussian * k, out=masked_heatmap) - return heatmap - - -def get_border(border, size): - i = 1 - while size - border // i <= border // i: - i *= 2 - return border // i diff --git a/pdfdet/models/Paddle/ppdet/data/transform/operators.py b/pdfdet/models/Paddle/ppdet/data/transform/operators.py deleted file mode 100644 index 5c51a93..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/operators.py +++ /dev/null @@ -1,4148 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# function: -# operators to process sample, -# eg: decode/resize/crop image - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -from numbers import Number, Integral - -import uuid -import random -import math -import numpy as np -import os -import copy -import logging -import cv2 -from PIL import Image, ImageDraw, ImageEnhance -import pickle -import threading -MUTEX = threading.Lock() - -import paddle -from ppdet.core.workspace import serializable -from ..reader import Compose - -from .op_helper import (satisfy_sample_constraint, filter_and_process, - generate_sample_bbox, clip_bbox, data_anchor_sampling, - satisfy_sample_constraint_coverage, crop_image_sampling, - generate_sample_bbox_square, bbox_area_sampling, - is_poly, get_border) - -from ppdet.utils.logger import setup_logger -from ppdet.utils.compact import imagedraw_textsize_c - -from ppdet.modeling.keypoint_utils import get_affine_transform, affine_transform -logger = setup_logger(__name__) - -registered_ops = [] - - -def register_op(cls): - registered_ops.append(cls.__name__) - if not hasattr(BaseOperator, cls.__name__): - setattr(BaseOperator, cls.__name__, cls) - else: - raise KeyError("The {} class has been registered.".format(cls.__name__)) - return serializable(cls) - - -class BboxError(ValueError): - pass - - -class ImageError(ValueError): - pass - - -class BaseOperator(object): - def __init__(self, name=None): - if name is None: - name = self.__class__.__name__ - self._id = name + '_' + str(uuid.uuid4())[-6:] - - def apply(self, sample, context=None): - """ Process a sample. - Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - return sample - - def __call__(self, sample, context=None): - """ Process a sample. 
- Args: - sample (dict): a dict of sample, eg: {'image':xx, 'label': xxx} - context (dict): info about this sample processing - Returns: - result (dict): a processed sample - """ - if isinstance(sample, Sequence): - for i in range(len(sample)): - sample[i] = self.apply(sample[i], context) - else: - sample = self.apply(sample, context) - return sample - - def __str__(self): - return str(self._id) - - -@register_op -class Decode(BaseOperator): - def __init__(self): - """ Transform the image data to numpy format following the rgb format - """ - super(Decode, self).__init__() - - def apply(self, sample, context=None): - """ load image if 'im_file' field is not empty but 'image' is""" - if 'image' not in sample: - with open(sample['im_file'], 'rb') as f: - sample['image'] = f.read() - sample.pop('im_file') - - try: - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - except: - im = sample['image'] - - sample['image'] = im - if 'h' not in sample: - sample['h'] = im.shape[0] - elif sample['h'] != im.shape[0]: - logger.warning( - "The actual image height: {} is not equal to the " - "height: {} in annotation, and update sample['h'] by actual " - "image height.".format(im.shape[0], sample['h'])) - sample['h'] = im.shape[0] - if 'w' not in sample: - sample['w'] = im.shape[1] - elif sample['w'] != im.shape[1]: - logger.warning( - "The actual image width: {} is not equal to the " - "width: {} in annotation, and update sample['w'] by actual " - "image width.".format(im.shape[1], sample['w'])) - sample['w'] = im.shape[1] - - sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) - sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) - return sample - - -def _make_dirs(dirname): - try: - from pathlib import Path - except ImportError: - from pathlib2 import Path - Path(dirname).mkdir(exist_ok=True) - - -@register_op -class DecodeCache(BaseOperator): - def __init__(self, cache_root=None): - '''decode image and caching - ''' - super(DecodeCache, self).__init__() - - self.use_cache = False if cache_root is None else True - self.cache_root = cache_root - - if cache_root is not None: - _make_dirs(cache_root) - - def apply(self, sample, context=None): - - if self.use_cache and os.path.exists( - self.cache_path(self.cache_root, sample['im_file'])): - path = self.cache_path(self.cache_root, sample['im_file']) - im = self.load(path) - - else: - if 'image' not in sample: - with open(sample['im_file'], 'rb') as f: - sample['image'] = f.read() - - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, 1) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - - if self.use_cache and not os.path.exists( - self.cache_path(self.cache_root, sample['im_file'])): - path = self.cache_path(self.cache_root, sample['im_file']) - self.dump(im, path) - - sample['image'] = im - sample['h'] = im.shape[0] - sample['w'] = im.shape[1] - - sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) - sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) - - sample.pop('im_file') - - return sample - - @staticmethod - def cache_path(dir_oot, im_file): - return os.path.join(dir_oot, os.path.basename(im_file) + '.pkl') - - @staticmethod - def load(path): - with open(path, 'rb') as f: - 
im = pickle.load(f) - return im - - @staticmethod - def dump(obj, path): - MUTEX.acquire() - try: - with open(path, 'wb') as f: - pickle.dump(obj, f) - - except Exception as e: - logger.warning('dump {} occurs exception {}'.format(path, str(e))) - - finally: - MUTEX.release() - - -@register_op -class SniperDecodeCrop(BaseOperator): - def __init__(self): - super(SniperDecodeCrop, self).__init__() - - def __call__(self, sample, context=None): - if 'image' not in sample: - with open(sample['im_file'], 'rb') as f: - sample['image'] = f.read() - sample.pop('im_file') - - im = sample['image'] - data = np.frombuffer(im, dtype='uint8') - im = cv2.imdecode(data, cv2.IMREAD_COLOR) # BGR mode, but need RGB mode - if 'keep_ori_im' in sample and sample['keep_ori_im']: - sample['ori_image'] = im - im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB) - - chip = sample['chip'] - x1, y1, x2, y2 = [int(xi) for xi in chip] - im = im[max(y1, 0):min(y2, im.shape[0]), max(x1, 0):min(x2, im.shape[ - 1]), :] - - sample['image'] = im - h = im.shape[0] - w = im.shape[1] - # sample['im_info'] = [h, w, 1.0] - sample['h'] = h - sample['w'] = w - - sample['im_shape'] = np.array(im.shape[:2], dtype=np.float32) - sample['scale_factor'] = np.array([1., 1.], dtype=np.float32) - return sample - - -@register_op -class Permute(BaseOperator): - def __init__(self): - """ - Change the channel to be (C, H, W) - """ - super(Permute, self).__init__() - - def apply(self, sample, context=None): - im = sample['image'] - im = im.transpose((2, 0, 1)) - sample['image'] = im - - if 'pre_image' in sample: - pre_im = sample['pre_image'] - pre_im = pre_im.transpose((2, 0, 1)) - sample['pre_image'] = pre_im - return sample - - -@register_op -class Lighting(BaseOperator): - """ - Lighting the image by eigenvalues and eigenvectors - Args: - eigval (list): eigenvalues - eigvec (list): eigenvectors - alphastd (float): random weight of lighting, 0.1 by default - """ - - def __init__(self, eigval, eigvec, alphastd=0.1): - super(Lighting, self).__init__() - self.alphastd = alphastd - self.eigval = np.array(eigval).astype('float32') - self.eigvec = np.array(eigvec).astype('float32') - - def apply(self, sample, context=None): - alpha = np.random.normal(scale=self.alphastd, size=(3, )) - sample['image'] += np.dot(self.eigvec, self.eigval * alpha) - - if 'pre_image' in sample: - sample['pre_image'] += np.dot(self.eigvec, self.eigval * alpha) - return sample - - -@register_op -class RandomErasingImage(BaseOperator): - def __init__(self, prob=0.5, lower=0.02, higher=0.4, aspect_ratio=0.3): - """ - Random Erasing Data Augmentation, see https://arxiv.org/abs/1708.04896 - Args: - prob (float): probability to carry out random erasing - lower (float): lower limit of the erasing area ratio - higher (float): upper limit of the erasing area ratio - aspect_ratio (float): aspect ratio of the erasing region - """ - super(RandomErasingImage, self).__init__() - self.prob = prob - self.lower = lower - self.higher = higher - self.aspect_ratio = aspect_ratio - - def apply(self, sample, context=None): - gt_bbox = sample['gt_bbox'] - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image is not a numpy array.".format(self)) - if len(im.shape) != 3: - raise ImageError("{}: image is not 3-dimensional.".format(self)) - - for idx in range(gt_bbox.shape[0]): - if self.prob <= np.random.rand(): - continue - - x1, y1, x2, y2 = gt_bbox[idx, :] - w_bbox = x2 - x1 - h_bbox = y2 - y1 - area = w_bbox * h_bbox - - target_area = random.uniform(self.lower, 
self.higher) * area - aspect_ratio = random.uniform(self.aspect_ratio, - 1 / self.aspect_ratio) - - h = int(round(math.sqrt(target_area * aspect_ratio))) - w = int(round(math.sqrt(target_area / aspect_ratio))) - - if w < w_bbox and h < h_bbox: - off_y1 = random.randint(0, int(h_bbox - h)) - off_x1 = random.randint(0, int(w_bbox - w)) - im[int(y1 + off_y1):int(y1 + off_y1 + h), int(x1 + off_x1):int( - x1 + off_x1 + w), :] = 0 - sample['image'] = im - return sample - - -@register_op -class NormalizeImage(BaseOperator): - def __init__(self, - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225], - is_scale=True, - norm_type='mean_std'): - """ - Args: - mean (list): the pixel mean - std (list): the pixel variance - is_scale (bool): scale the pixel to [0,1] - norm_type (str): type in ['mean_std', 'none'] - """ - super(NormalizeImage, self).__init__() - self.mean = mean - self.std = std - self.is_scale = is_scale - self.norm_type = norm_type - if not (isinstance(self.mean, list) and isinstance(self.std, list) and - isinstance(self.is_scale, bool) and - self.norm_type in ['mean_std', 'none']): - raise TypeError("{}: input type is invalid.".format(self)) - from functools import reduce - if reduce(lambda x, y: x * y, self.std) == 0: - raise ValueError('{}: std is invalid!'.format(self)) - - def apply(self, sample, context=None): - """Normalize the image. - Operators: - 1.(optional) Scale the pixel to [0,1] - 2.(optional) Each pixel minus mean and is divided by std - """ - im = sample['image'] - - im = im.astype(np.float32, copy=False) - if self.is_scale: - scale = 1.0 / 255.0 - im *= scale - - if self.norm_type == 'mean_std': - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - im -= mean - im /= std - - sample['image'] = im - - if 'pre_image' in sample: - pre_im = sample['pre_image'] - pre_im = pre_im.astype(np.float32, copy=False) - if self.is_scale: - scale = 1.0 / 255.0 - pre_im *= scale - - if self.norm_type == 'mean_std': - mean = np.array(self.mean)[np.newaxis, np.newaxis, :] - std = np.array(self.std)[np.newaxis, np.newaxis, :] - pre_im -= mean - pre_im /= std - sample['pre_image'] = pre_im - - return sample - - -@register_op -class GridMask(BaseOperator): - def __init__(self, - use_h=True, - use_w=True, - rotate=1, - offset=False, - ratio=0.5, - mode=1, - prob=0.7, - upper_iter=360000): - """ - GridMask Data Augmentation, see https://arxiv.org/abs/2001.04086 - Args: - use_h (bool): whether to mask vertically - use_w (boo;): whether to mask horizontally - rotate (float): angle for the mask to rotate - offset (float): mask offset - ratio (float): mask ratio - mode (int): gridmask mode - prob (float): max probability to carry out gridmask - upper_iter (int): suggested to be equal to global max_iter - """ - super(GridMask, self).__init__() - self.use_h = use_h - self.use_w = use_w - self.rotate = rotate - self.offset = offset - self.ratio = ratio - self.mode = mode - self.prob = prob - self.upper_iter = upper_iter - - from .gridmask_utils import Gridmask - self.gridmask_op = Gridmask( - use_h, - use_w, - rotate=rotate, - offset=offset, - ratio=ratio, - mode=mode, - prob=prob, - upper_iter=upper_iter) - - def apply(self, sample, context=None): - sample['image'] = self.gridmask_op(sample['image'], sample['curr_iter']) - return sample - - -@register_op -class RandomDistort(BaseOperator): - """Random color distortion. - Args: - hue (list): hue settings. in [lower, upper, probability] format. - saturation (list): saturation settings. 
in [lower, upper, probability] format. - contrast (list): contrast settings. in [lower, upper, probability] format. - brightness (list): brightness settings. in [lower, upper, probability] format. - random_apply (bool): whether to apply in random (yolo) or fixed (SSD) order. - count (int): the number of doing distrot. - random_channel (bool): whether to swap channels randomly. - prob (float): the probability of enhancing the sample. - """ - - def __init__(self, - hue=[-18, 18, 0.5], - saturation=[0.5, 1.5, 0.5], - contrast=[0.5, 1.5, 0.5], - brightness=[0.5, 1.5, 0.5], - random_apply=True, - count=4, - random_channel=False, - prob=1.0): - super(RandomDistort, self).__init__() - self.hue = hue - self.saturation = saturation - self.contrast = contrast - self.brightness = brightness - self.random_apply = random_apply - self.count = count - self.random_channel = random_channel - self.prob = prob - - def apply_hue(self, img): - low, high, prob = self.hue - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = np.array(img.convert('HSV')) - img[:, :, 0] = img[:, :, 0] + delta - img = Image.fromarray(img, mode='HSV').convert('RGB') - return img - - def apply_saturation(self, img): - low, high, prob = self.saturation - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = ImageEnhance.Color(img).enhance(delta) - return img - - def apply_contrast(self, img): - low, high, prob = self.contrast - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = ImageEnhance.Contrast(img).enhance(delta) - return img - - def apply_brightness(self, img): - low, high, prob = self.brightness - if np.random.uniform(0., 1.) < prob: - return img - delta = np.random.uniform(low, high) - img = ImageEnhance.Brightness(img).enhance(delta) - return img - - def apply(self, sample, context=None): - if random.random() > self.prob: - return sample - img = sample['image'] - img = Image.fromarray(img.astype(np.uint8)) - if self.random_apply: - functions = [ - self.apply_brightness, self.apply_contrast, - self.apply_saturation, self.apply_hue - ] - distortions = np.random.permutation(functions)[:self.count] - for func in distortions: - img = func(img) - img = np.asarray(img).astype(np.float32) - sample['image'] = img - return sample - - img = self.apply_brightness(img) - mode = np.random.randint(0, 2) - if mode: - img = self.apply_contrast(img) - img = self.apply_saturation(img) - img = self.apply_hue(img) - if not mode: - img = self.apply_contrast(img) - - img = np.asarray(img).astype(np.float32) - if self.random_channel: - if np.random.randint(0, 2): - img = img[..., np.random.permutation(3)] - sample['image'] = img - return sample - - -@register_op -class PhotoMetricDistortion(BaseOperator): - """Apply photometric distortion to image sequentially, every transformation - is applied with a probability of 0.5. The position of random contrast is in - second or second to last. - - 1. random brightness - 2. random contrast (mode 0) - 3. convert color from BGR to HSV - 4. random saturation - 5. random hue - 6. convert color from HSV to BGR - 7. random contrast (mode 1) - 8. randomly swap channels - - Args: - brightness_delta (int): delta of brightness. - contrast_range (tuple): range of contrast. - saturation_range (tuple): range of saturation. - hue_delta (int): delta of hue. 
- """ - - def __init__(self, - brightness_delta=32, - contrast_range=(0.5, 1.5), - saturation_range=(0.5, 1.5), - hue_delta=18): - super(PhotoMetricDistortion, self).__init__() - self.brightness_delta = brightness_delta - self.contrast_lower, self.contrast_upper = contrast_range - self.saturation_lower, self.saturation_upper = saturation_range - self.hue_delta = hue_delta - - def apply(self, results, context=None): - """Call function to perform photometric distortion on images. - - Args: - results (dict): Result dict from loading pipeline. - - Returns: - dict: Result dict with images distorted. - """ - - img = results['image'] - img = img.astype(np.float32) - # random brightness - if np.random.randint(2): - delta = np.random.uniform(-self.brightness_delta, - self.brightness_delta) - img += delta - - # mode == 0 --> do random contrast first - # mode == 1 --> do random contrast last - mode = np.random.randint(2) - if mode == 1: - if np.random.randint(2): - alpha = np.random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # convert color from BGR to HSV - img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV) - - # random saturation - if np.random.randint(2): - img[..., 1] *= np.random.uniform(self.saturation_lower, - self.saturation_upper) - - # random hue - if np.random.randint(2): - img[..., 0] += np.random.uniform(-self.hue_delta, self.hue_delta) - img[..., 0][img[..., 0] > 360] -= 360 - img[..., 0][img[..., 0] < 0] += 360 - - # convert color from HSV to BGR - img = cv2.cvtColor(img, cv2.COLOR_HSV2BGR) - - # random contrast - if mode == 0: - if np.random.randint(2): - alpha = np.random.uniform(self.contrast_lower, - self.contrast_upper) - img *= alpha - - # randomly swap channels - if np.random.randint(2): - img = img[..., np.random.permutation(3)] - - results['image'] = img - return results - - def __repr__(self): - repr_str = self.__class__.__name__ - repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' - repr_str += 'contrast_range=' - repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' - repr_str += 'saturation_range=' - repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' - repr_str += f'hue_delta={self.hue_delta})' - return repr_str - - -@register_op -class AutoAugment(BaseOperator): - def __init__(self, autoaug_type="v1"): - """ - Args: - autoaug_type (str): autoaug type, support v0, v1, v2, v3, test - """ - super(AutoAugment, self).__init__() - self.autoaug_type = autoaug_type - - def apply(self, sample, context=None): - """ - Learning Data Augmentation Strategies for Object Detection, see https://arxiv.org/abs/1906.11172 - """ - im = sample['image'] - gt_bbox = sample['gt_bbox'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image is not a numpy array.".format(self)) - if len(im.shape) != 3: - raise ImageError("{}: image is not 3-dimensional.".format(self)) - if len(gt_bbox) == 0: - return sample - - height, width, _ = im.shape - norm_gt_bbox = np.ones_like(gt_bbox, dtype=np.float32) - norm_gt_bbox[:, 0] = gt_bbox[:, 1] / float(height) - norm_gt_bbox[:, 1] = gt_bbox[:, 0] / float(width) - norm_gt_bbox[:, 2] = gt_bbox[:, 3] / float(height) - norm_gt_bbox[:, 3] = gt_bbox[:, 2] / float(width) - - from .autoaugment_utils import distort_image_with_autoaugment - im, norm_gt_bbox = distort_image_with_autoaugment(im, norm_gt_bbox, - self.autoaug_type) - - gt_bbox[:, 0] = norm_gt_bbox[:, 1] * float(width) - gt_bbox[:, 1] = norm_gt_bbox[:, 0] * float(height) - gt_bbox[:, 2] = norm_gt_bbox[:, 3] * float(width) - gt_bbox[:, 3] = 
norm_gt_bbox[:, 2] * float(height) - - sample['image'] = im - sample['gt_bbox'] = gt_bbox - return sample - - -@register_op -class RandomFlip(BaseOperator): - def __init__(self, prob=0.5): - """ - Args: - prob (float): the probability of flipping image - """ - super(RandomFlip, self).__init__() - self.prob = prob - if not (isinstance(self.prob, float)): - raise TypeError("{}: input type is invalid.".format(self)) - - def apply_segm(self, segms, height, width): - def _flip_poly(poly, width): - flipped_poly = np.array(poly) - flipped_poly[0::2] = width - np.array(poly[0::2]) - return flipped_poly.tolist() - - def _flip_rle(rle, height, width): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, height, width) - mask = mask_util.decode(rle) - mask = mask[:, ::-1] - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - flipped_segms = [] - for segm in segms: - if is_poly(segm): - # Polygon format - flipped_segms.append([_flip_poly(poly, width) for poly in segm]) - else: - # RLE format - import pycocotools.mask as mask_util - flipped_segms.append(_flip_rle(segm, height, width)) - return flipped_segms - - def apply_keypoint(self, gt_keypoint, width): - for i in range(gt_keypoint.shape[1]): - if i % 2 == 0: - old_x = gt_keypoint[:, i].copy() - gt_keypoint[:, i] = width - old_x - return gt_keypoint - - def apply_image(self, image): - return image[:, ::-1, :] - - def apply_bbox(self, bbox, width): - oldx1 = bbox[:, 0].copy() - oldx2 = bbox[:, 2].copy() - bbox[:, 0] = width - oldx2 - bbox[:, 2] = width - oldx1 - return bbox - - def apply(self, sample, context=None): - """Filp the image and bounding box. - Operators: - 1. Flip the image numpy. - 2. Transform the bboxes' x coordinates. - (Must judge whether the coordinates are normalized!) - 3. Transform the segmentations' x coordinates. - (Must judge whether the coordinates are normalized!) - Output: - sample: the image, bounding box and segmentation part - in sample are flipped. - """ - if np.random.uniform(0, 1) < self.prob: - im = sample['image'] - height, width = im.shape[:2] - im = self.apply_image(im) - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], width) - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], height, - width) - if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: - sample['gt_keypoint'] = self.apply_keypoint( - sample['gt_keypoint'], width) - - if 'semantic' in sample and sample['semantic']: - sample['semantic'] = sample['semantic'][:, ::-1] - - if 'gt_segm' in sample and sample['gt_segm'].any(): - sample['gt_segm'] = sample['gt_segm'][:, :, ::-1] - - sample['flipped'] = True - sample['image'] = im - return sample - - -@register_op -class Resize(BaseOperator): - def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): - """ - Resize image to target size. if keep_ratio is True, - resize the image's long side to the maximum of target_size - if keep_ratio is False, resize the image to target size(h, w) - Args: - target_size (int|list): image target size - keep_ratio (bool): whether keep_ratio or not, default true - interp (int): the interpolation method - """ - super(Resize, self).__init__() - self.keep_ratio = keep_ratio - self.interp = interp - if not isinstance(target_size, (Integral, Sequence)): - raise TypeError( - "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". 
- format(type(target_size))) - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - - def apply_image(self, image, scale): - im_scale_x, im_scale_y = scale - - return cv2.resize( - image, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - - def apply_bbox(self, bbox, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - bbox[:, 0::2] *= im_scale_x - bbox[:, 1::2] *= im_scale_y - bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) - bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) - return bbox - - def apply_area(self, area, scale): - im_scale_x, im_scale_y = scale - return area * im_scale_x * im_scale_y - - def apply_joints(self, joints, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - joints[..., 0] *= im_scale_x - joints[..., 1] *= im_scale_y - joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) - joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) - return joints - - def apply_segm(self, segms, im_size, scale): - def _resize_poly(poly, im_scale_x, im_scale_y): - resized_poly = np.array(poly).astype('float32') - resized_poly[0::2] *= im_scale_x - resized_poly[1::2] *= im_scale_y - return resized_poly.tolist() - - def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, im_h, im_w) - - mask = mask_util.decode(rle) - mask = cv2.resize( - mask, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - im_h, im_w = im_size - im_scale_x, im_scale_y = scale - resized_segms = [] - for segm in segms: - if is_poly(segm): - # Polygon format - resized_segms.append([ - _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm - ]) - else: - # RLE format - import pycocotools.mask as mask_util - resized_segms.append( - _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) - - return resized_segms - - def apply(self, sample, context=None): - """ Resize the image numpy. 
- """ - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - - # apply image - if len(im.shape) == 3: - im_shape = im.shape - else: - im_shape = im[0].shape - - if self.keep_ratio: - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - - target_size_min = np.min(self.target_size) - target_size_max = np.max(self.target_size) - - im_scale = min(target_size_min / im_size_min, - target_size_max / im_size_max) - - resize_h = int(im_scale * float(im_shape[0]) + 0.5) - resize_w = int(im_scale * float(im_shape[1]) + 0.5) - else: - resize_h, resize_w = self.target_size - - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - if len(im.shape) == 3: - im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im.astype(np.float32) - else: - resized_images = [] - for one_im in im: - applied_im = self.apply_image(one_im, [im_scale_x, im_scale_y]) - resized_images.append(applied_im) - - sample['image'] = np.array(resized_images) - - # 2d keypoints resize - if 'kps2d' in sample.keys(): - kps2d = sample['kps2d'] - kps2d[:, :, 0] = kps2d[:, :, 0] * im_scale_x - kps2d[:, :, 1] = kps2d[:, :, 1] * im_scale_y - - sample['kps2d'] = kps2d - - sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - # apply areas - if 'gt_areas' in sample: - sample['gt_areas'] = self.apply_area(sample['gt_areas'], - [im_scale_x, im_scale_y]) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], - [im_scale_x, im_scale_y]) - - # apply semantic - if 'semantic' in sample and sample['semantic']: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic - - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - masks = [ - cv2.resize( - gt_segm, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=cv2.INTER_NEAREST) - for gt_segm in sample['gt_segm'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - - if 'gt_joints' in sample: - sample['gt_joints'] = self.apply_joints(sample['gt_joints'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - return sample - - -@register_op -class MultiscaleTestResize(BaseOperator): - def __init__(self, - origin_target_size=[800, 1333], - target_size=[], - interp=cv2.INTER_LINEAR, - use_flip=True): - """ - Rescale image to the each size in target size, and capped at max_size. - Args: - origin_target_size (list): origin target size of image - target_size (list): A list of target sizes of image. - interp (int): the interpolation method. - use_flip (bool): whether use flip augmentation. 
- """ - super(MultiscaleTestResize, self).__init__() - self.interp = interp - self.use_flip = use_flip - - if not isinstance(target_size, Sequence): - raise TypeError( - "Type of target_size is invalid. Must be List or Tuple, now is {}". - format(type(target_size))) - self.target_size = target_size - - if not isinstance(origin_target_size, Sequence): - raise TypeError( - "Type of origin_target_size is invalid. Must be List or Tuple, now is {}". - format(type(origin_target_size))) - - self.origin_target_size = origin_target_size - - def apply(self, sample, context=None): - """ Resize the image numpy for multi-scale test. - """ - samples = [] - resizer = Resize( - self.origin_target_size, keep_ratio=True, interp=self.interp) - samples.append(resizer(sample.copy(), context)) - if self.use_flip: - flipper = RandomFlip(1.1) - samples.append(flipper(sample.copy(), context=context)) - - for size in self.target_size: - resizer = Resize(size, keep_ratio=True, interp=self.interp) - samples.append(resizer(sample.copy(), context)) - - return samples - - -@register_op -class RandomResize(BaseOperator): - def __init__(self, - target_size, - keep_ratio=True, - interp=cv2.INTER_LINEAR, - random_range=False, - random_size=True, - random_interp=False): - """ - Resize image to target size randomly. random target_size and interpolation method - Args: - target_size (int, list, tuple): image target size, if random size is True, must be list or tuple - keep_ratio (bool): whether keep_raio or not, default true - interp (int): the interpolation method - random_range (bool): whether random select target size of image, the target_size must be - a [[min_short_edge, long_edge], [max_short_edge, long_edge]] - random_size (bool): whether random select target size of image - random_interp (bool): whether random select interpolation method - """ - super(RandomResize, self).__init__() - self.keep_ratio = keep_ratio - self.interp = interp - self.interps = [ - cv2.INTER_NEAREST, - cv2.INTER_LINEAR, - cv2.INTER_AREA, - cv2.INTER_CUBIC, - cv2.INTER_LANCZOS4, - ] - assert isinstance(target_size, ( - Integral, Sequence)), "target_size must be Integer, List or Tuple" - if (random_range or random_size) and not isinstance(target_size, - Sequence): - raise TypeError( - "Type of target_size is invalid when random_size or random_range is True. Must be List or Tuple, now is {}". - format(type(target_size))) - if random_range and not len(target_size) == 2: - raise TypeError( - "target_size must be two list as [[min_short_edge, long_edge], [max_short_edge, long_edge]] when random_range is True." - ) - self.target_size = target_size - self.random_range = random_range - self.random_size = random_size - self.random_interp = random_interp - - def apply(self, sample, context=None): - """ Resize the image numpy. - """ - if self.random_range: - short_edge = np.random.randint(self.target_size[0][0], - self.target_size[1][0] + 1) - long_edge = max(self.target_size[0][1], self.target_size[1][1] + 1) - target_size = [short_edge, long_edge] - else: - if self.random_size: - target_size = random.choice(self.target_size) - else: - target_size = self.target_size - - if self.random_interp: - interp = random.choice(self.interps) - else: - interp = self.interp - - resizer = Resize(target_size, self.keep_ratio, interp) - return resizer(sample, context=context) - - -@register_op -class RandomExpand(BaseOperator): - """Random expand the canvas. - Args: - ratio (float): maximum expansion ratio. - prob (float): probability to expand. 


-@register_op
-class RandomExpand(BaseOperator):
-    """Randomly expand the canvas.
-    Args:
-        ratio (float): maximum expansion ratio.
-        prob (float): probability to expand.
-        fill_value (list): color value used to fill the canvas, in RGB order.
-    """
-
-    def __init__(self, ratio=4., prob=0.5, fill_value=(127.5, 127.5, 127.5)):
-        super(RandomExpand, self).__init__()
-        assert ratio > 1.01, "expand ratio must be larger than 1.01"
-        self.ratio = ratio
-        self.prob = prob
-        assert isinstance(fill_value, (Number, Sequence)), \
-            "fill value must be either float or sequence"
-        if isinstance(fill_value, Number):
-            fill_value = (fill_value, ) * 3
-        if not isinstance(fill_value, tuple):
-            fill_value = tuple(fill_value)
-        self.fill_value = fill_value
-
-    def apply(self, sample, context=None):
-        if np.random.uniform(0., 1.) < self.prob:
-            return sample
-
-        im = sample['image']
-        height, width = im.shape[:2]
-        ratio = np.random.uniform(1., self.ratio)
-        h = int(height * ratio)
-        w = int(width * ratio)
-        if not h > height or not w > width:
-            return sample
-        y = np.random.randint(0, h - height)
-        x = np.random.randint(0, w - width)
-        offsets, size = [x, y], [h, w]
-
-        pad = Pad(size,
-                  pad_mode=-1,
-                  offsets=offsets,
-                  fill_value=self.fill_value)
-
-        return pad(sample, context=context)
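
[Editor's note] The batch_sampler tuples consumed by CropWithSampling below are positional: [max sample, max trial, min scale, max scale, min aspect ratio, max aspect ratio, min overlap, max overlap]. A self-contained sketch of how one entry could be turned into a normalized candidate crop (a hypothetical helper mirroring, not reproducing, ppdet's generate_sample_bbox):

import numpy as np

def sample_crop(sampler):
    _max_sample, _max_trial, smin, smax, armin, armax, _omin, _omax = sampler
    scale = np.random.uniform(smin, smax)
    # clamp aspect ratio so the crop stays inside the unit square
    ar = np.random.uniform(max(armin, scale**2), min(armax, scale**-2))
    w, h = scale * np.sqrt(ar), scale / np.sqrt(ar)
    x = np.random.uniform(0., 1. - w)
    y = np.random.uniform(0., 1. - h)
    return [x, y, x + w, y + h]  # normalized [xmin, ymin, xmax, ymax]

print(sample_crop([1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 1.0]))
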
- """ - assert 'image' in sample, "image data not found" - im = sample['image'] - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - im_height, im_width = im.shape[:2] - gt_score = None - if 'gt_score' in sample: - gt_score = sample['gt_score'] - sampled_bbox = [] - gt_bbox = gt_bbox.tolist() - for sampler in self.batch_sampler: - found = 0 - for i in range(sampler[1]): - if found >= sampler[0]: - break - sample_bbox = generate_sample_bbox(sampler) - if satisfy_sample_constraint(sampler, sample_bbox, gt_bbox, - self.satisfy_all): - sampled_bbox.append(sample_bbox) - found = found + 1 - im = np.array(im) - while sampled_bbox: - idx = int(np.random.uniform(0, len(sampled_bbox))) - sample_bbox = sampled_bbox.pop(idx) - sample_bbox = clip_bbox(sample_bbox) - crop_bbox, crop_class, crop_score = \ - filter_and_process(sample_bbox, gt_bbox, gt_class, scores=gt_score) - if self.avoid_no_bbox: - if len(crop_bbox) < 1: - continue - xmin = int(sample_bbox[0] * im_width) - xmax = int(sample_bbox[2] * im_width) - ymin = int(sample_bbox[1] * im_height) - ymax = int(sample_bbox[3] * im_height) - im = im[ymin:ymax, xmin:xmax] - sample['image'] = im - sample['gt_bbox'] = crop_bbox - sample['gt_class'] = crop_class - sample['gt_score'] = crop_score - return sample - return sample - - -@register_op -class CropWithDataAchorSampling(BaseOperator): - def __init__(self, - batch_sampler, - anchor_sampler=None, - target_size=None, - das_anchor_scales=[16, 32, 64, 128], - sampling_prob=0.5, - min_size=8., - avoid_no_bbox=True): - """ - Args: - anchor_sampler (list): anchor_sampling sets of different - parameters for cropping. - batch_sampler (list): Multiple sets of different - parameters for cropping. - e.g.[[1, 10, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.2, 0.0]] - [[1, 50, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0], - [1, 50, 0.3, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0]] - [max sample, max trial, min scale, max scale, - min aspect ratio, max aspect ratio, - min overlap, max overlap, min coverage, max coverage] - target_size (int): target image size. - das_anchor_scales (list[float]): a list of anchor scales in data - anchor smapling. - min_size (float): minimum size of sampled bbox. - avoid_no_bbox (bool): whether to avoid the - situation where the box does not appear. - """ - super(CropWithDataAchorSampling, self).__init__() - self.anchor_sampler = anchor_sampler - self.batch_sampler = batch_sampler - self.target_size = target_size - self.sampling_prob = sampling_prob - self.min_size = min_size - self.avoid_no_bbox = avoid_no_bbox - self.das_anchor_scales = np.array(das_anchor_scales) - - def apply(self, sample, context): - """ - Crop the image and modify bounding box. - Operators: - 1. Scale the image width and height. - 2. Crop the image according to a radom sample. - 3. Rescale the bounding box. - 4. Determine if the new bbox is satisfied in the new image. - Returns: - sample: the image, bounding box are replaced. 
- """ - assert 'image' in sample, "image data not found" - im = sample['image'] - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - image_height, image_width = im.shape[:2] - gt_bbox[:, 0] /= image_width - gt_bbox[:, 1] /= image_height - gt_bbox[:, 2] /= image_width - gt_bbox[:, 3] /= image_height - gt_score = None - if 'gt_score' in sample: - gt_score = sample['gt_score'] - sampled_bbox = [] - gt_bbox = gt_bbox.tolist() - - prob = np.random.uniform(0., 1.) - if prob > self.sampling_prob: # anchor sampling - assert self.anchor_sampler - for sampler in self.anchor_sampler: - found = 0 - for i in range(sampler[1]): - if found >= sampler[0]: - break - sample_bbox = data_anchor_sampling( - gt_bbox, image_width, image_height, - self.das_anchor_scales, self.target_size) - if sample_bbox == 0: - break - if satisfy_sample_constraint_coverage(sampler, sample_bbox, - gt_bbox): - sampled_bbox.append(sample_bbox) - found = found + 1 - im = np.array(im) - while sampled_bbox: - idx = int(np.random.uniform(0, len(sampled_bbox))) - sample_bbox = sampled_bbox.pop(idx) - - if 'gt_keypoint' in sample.keys(): - keypoints = (sample['gt_keypoint'], - sample['keypoint_ignore']) - crop_bbox, crop_class, crop_score, gt_keypoints = \ - filter_and_process(sample_bbox, gt_bbox, gt_class, - scores=gt_score, - keypoints=keypoints) - else: - crop_bbox, crop_class, crop_score = filter_and_process( - sample_bbox, gt_bbox, gt_class, scores=gt_score) - crop_bbox, crop_class, crop_score = bbox_area_sampling( - crop_bbox, crop_class, crop_score, self.target_size, - self.min_size) - - if self.avoid_no_bbox: - if len(crop_bbox) < 1: - continue - im = crop_image_sampling(im, sample_bbox, image_width, - image_height, self.target_size) - height, width = im.shape[:2] - crop_bbox[:, 0] *= width - crop_bbox[:, 1] *= height - crop_bbox[:, 2] *= width - crop_bbox[:, 3] *= height - sample['image'] = im - sample['gt_bbox'] = crop_bbox - sample['gt_class'] = crop_class - if 'gt_score' in sample: - sample['gt_score'] = crop_score - if 'gt_keypoint' in sample.keys(): - sample['gt_keypoint'] = gt_keypoints[0] - sample['keypoint_ignore'] = gt_keypoints[1] - return sample - return sample - - else: - for sampler in self.batch_sampler: - found = 0 - for i in range(sampler[1]): - if found >= sampler[0]: - break - sample_bbox = generate_sample_bbox_square( - sampler, image_width, image_height) - if satisfy_sample_constraint_coverage(sampler, sample_bbox, - gt_bbox): - sampled_bbox.append(sample_bbox) - found = found + 1 - im = np.array(im) - while sampled_bbox: - idx = int(np.random.uniform(0, len(sampled_bbox))) - sample_bbox = sampled_bbox.pop(idx) - sample_bbox = clip_bbox(sample_bbox) - - if 'gt_keypoint' in sample.keys(): - keypoints = (sample['gt_keypoint'], - sample['keypoint_ignore']) - crop_bbox, crop_class, crop_score, gt_keypoints = \ - filter_and_process(sample_bbox, gt_bbox, gt_class, - scores=gt_score, - keypoints=keypoints) - else: - crop_bbox, crop_class, crop_score = filter_and_process( - sample_bbox, gt_bbox, gt_class, scores=gt_score) - # sampling bbox according the bbox area - crop_bbox, crop_class, crop_score = bbox_area_sampling( - crop_bbox, crop_class, crop_score, self.target_size, - self.min_size) - - if self.avoid_no_bbox: - if len(crop_bbox) < 1: - continue - xmin = int(sample_bbox[0] * image_width) - xmax = int(sample_bbox[2] * image_width) - ymin = int(sample_bbox[1] * image_height) - ymax = int(sample_bbox[3] * image_height) - im = im[ymin:ymax, xmin:xmax] - height, width = im.shape[:2] - crop_bbox[:, 0] 
*= width - crop_bbox[:, 1] *= height - crop_bbox[:, 2] *= width - crop_bbox[:, 3] *= height - sample['image'] = im - sample['gt_bbox'] = crop_bbox - sample['gt_class'] = crop_class - if 'gt_score' in sample: - sample['gt_score'] = crop_score - if 'gt_keypoint' in sample.keys(): - sample['gt_keypoint'] = gt_keypoints[0] - sample['keypoint_ignore'] = gt_keypoints[1] - return sample - return sample - - -@register_op -class RandomCrop(BaseOperator): - """Random crop image and bboxes. - Args: - aspect_ratio (list): aspect ratio of cropped region. - in [min, max] format. - thresholds (list): iou thresholds for decide a valid bbox crop. - scaling (list): ratio between a cropped region and the original image. - in [min, max] format. - num_attempts (int): number of tries before giving up. - allow_no_crop (bool): allow return without actually cropping them. - cover_all_box (bool): ensure all bboxes are covered in the final crop. - is_mask_crop(bool): whether crop the segmentation. - """ - - def __init__(self, - aspect_ratio=[.5, 2.], - thresholds=[.0, .1, .3, .5, .7, .9], - scaling=[.3, 1.], - num_attempts=50, - allow_no_crop=True, - cover_all_box=False, - is_mask_crop=False, - ioumode="iou", - prob=1.0): - super(RandomCrop, self).__init__() - self.aspect_ratio = aspect_ratio - self.thresholds = thresholds - self.scaling = scaling - self.num_attempts = num_attempts - self.allow_no_crop = allow_no_crop - self.cover_all_box = cover_all_box - self.is_mask_crop = is_mask_crop - self.ioumode = ioumode - self.prob = prob - - def crop_segms(self, segms, valid_ids, crop, height, width): - def _crop_poly(segm, crop): - xmin, ymin, xmax, ymax = crop - crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] - crop_p = np.array(crop_coord).reshape(4, 2) - crop_p = Polygon(crop_p) - - crop_segm = list() - for poly in segm: - poly = np.array(poly).reshape(len(poly) // 2, 2) - polygon = Polygon(poly) - if not polygon.is_valid: - exterior = polygon.exterior - multi_lines = exterior.intersection(exterior) - polygons = shapely.ops.polygonize(multi_lines) - polygon = MultiPolygon(polygons) - multi_polygon = list() - if isinstance(polygon, MultiPolygon): - multi_polygon = copy.deepcopy(polygon) - else: - multi_polygon.append(copy.deepcopy(polygon)) - for per_polygon in multi_polygon: - inter = per_polygon.intersection(crop_p) - if not inter: - continue - if isinstance(inter, (MultiPolygon, GeometryCollection)): - for part in inter: - if not isinstance(part, Polygon): - continue - part = np.squeeze( - np.array(part.exterior.coords[:-1]).reshape(1, - -1)) - part[0::2] -= xmin - part[1::2] -= ymin - crop_segm.append(part.tolist()) - elif isinstance(inter, Polygon): - crop_poly = np.squeeze( - np.array(inter.exterior.coords[:-1]).reshape(1, -1)) - crop_poly[0::2] -= xmin - crop_poly[1::2] -= ymin - crop_segm.append(crop_poly.tolist()) - else: - continue - return crop_segm - - def _crop_rle(rle, crop, height, width): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, height, width) - mask = mask_util.decode(rle) - mask = mask[crop[1]:crop[3], crop[0]:crop[2]] - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - crop_segms = [] - for id in valid_ids: - segm = segms[id] - if is_poly(segm): - import copy - import shapely.ops - from shapely.geometry import Polygon, MultiPolygon, GeometryCollection - logging.getLogger("shapely").setLevel(logging.WARNING) - # Polygon format - crop_segms.append(_crop_poly(segm, crop)) - else: - # RLE format - import 
pycocotools.mask as mask_util - crop_segms.append(_crop_rle(segm, crop, height, width)) - return crop_segms - - def set_fake_bboxes(self, sample): - sample['gt_bbox'] = np.array( - [ - [32, 32, 128, 128], - [32, 32, 128, 256], - [32, 64, 128, 128], - [32, 64, 128, 256], - [64, 64, 128, 256], - [64, 64, 256, 256], - [64, 32, 128, 256], - [64, 32, 128, 256], - [96, 32, 128, 256], - [96, 32, 128, 256], - ], - dtype=np.float32) - sample['gt_class'] = np.array( - [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]], np.int32) - return sample - - def apply(self, sample, context=None): - if random.random() > self.prob: - return sample - - if 'gt_bbox' not in sample: - # only used in semi-det as unsup data - sample = self.set_fake_bboxes(sample) - sample = self.random_crop(sample, fake_bboxes=True) - del sample['gt_bbox'] - del sample['gt_class'] - return sample - - if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: - return sample - sample = self.random_crop(sample) - return sample - - def random_crop(self, sample, fake_bboxes=False): - h, w = sample['image'].shape[:2] - gt_bbox = sample['gt_bbox'] - - # NOTE Original method attempts to generate one candidate for each - # threshold then randomly sample one from the resulting list. - # Here a short circuit approach is taken, i.e., randomly choose a - # threshold and attempt to find a valid crop, and simply return the - # first one found. - # The probability is not exactly the same, kinda resembling the - # "Monty Hall" problem. Actually carrying out the attempts will affect - # observability (just like opening doors in the "Monty Hall" game). - thresholds = list(self.thresholds) - if self.allow_no_crop: - thresholds.append('no_crop') - np.random.shuffle(thresholds) - - for thresh in thresholds: - if thresh == 'no_crop': - return sample - - found = False - for i in range(self.num_attempts): - scale = np.random.uniform(*self.scaling) - if self.aspect_ratio is not None: - min_ar, max_ar = self.aspect_ratio - aspect_ratio = np.random.uniform( - max(min_ar, scale**2), min(max_ar, scale**-2)) - h_scale = scale / np.sqrt(aspect_ratio) - w_scale = scale * np.sqrt(aspect_ratio) - else: - h_scale = np.random.uniform(*self.scaling) - w_scale = np.random.uniform(*self.scaling) - crop_h = h * h_scale - crop_w = w * w_scale - if self.aspect_ratio is None: - if crop_h / crop_w < 0.5 or crop_h / crop_w > 2.0: - continue - - crop_h = int(crop_h) - crop_w = int(crop_w) - crop_y = np.random.randint(0, h - crop_h) - crop_x = np.random.randint(0, w - crop_w) - crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] - if self.ioumode == "iof": - iou = self._gtcropiou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - elif self.ioumode == "iou": - iou = self._iou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - if iou.max() < thresh: - continue - - if self.cover_all_box and iou.min() < thresh: - continue - - cropped_box, valid_ids = self._crop_box_with_center_constraint( - gt_bbox, np.array( - crop_box, dtype=np.float32)) - if valid_ids.size > 0: - found = True - break - - if found: - if self.is_mask_crop and 'gt_poly' in sample and len(sample[ - 'gt_poly']) > 0: - crop_polys = self.crop_segms( - sample['gt_poly'], - valid_ids, - np.array( - crop_box, dtype=np.int64), - h, - w) - if [] in crop_polys: - delete_id = list() - valid_polys = list() - for id, crop_poly in enumerate(crop_polys): - if crop_poly == []: - delete_id.append(id) - else: - valid_polys.append(crop_poly) - valid_ids = np.delete(valid_ids, delete_id) - if 
len(valid_polys) == 0: - return sample - sample['gt_poly'] = valid_polys - else: - sample['gt_poly'] = crop_polys - - if 'gt_segm' in sample: - sample['gt_segm'] = self._crop_segm(sample['gt_segm'], - crop_box) - sample['gt_segm'] = np.take( - sample['gt_segm'], valid_ids, axis=0) - - sample['image'] = self._crop_image(sample['image'], crop_box) - if fake_bboxes == True: - return sample - - sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) - sample['gt_class'] = np.take( - sample['gt_class'], valid_ids, axis=0) - if 'gt_score' in sample: - sample['gt_score'] = np.take( - sample['gt_score'], valid_ids, axis=0) - - if 'is_crowd' in sample: - sample['is_crowd'] = np.take( - sample['is_crowd'], valid_ids, axis=0) - - if 'difficult' in sample: - sample['difficult'] = np.take( - sample['difficult'], valid_ids, axis=0) - - if 'gt_joints' in sample: - sample['gt_joints'] = self._crop_joints(sample['gt_joints'], - crop_box) - - return sample - - return sample - - def _iou_matrix(self, a, b): - tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) - br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) - - area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) - area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) - area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) - area_o = (area_a[:, np.newaxis] + area_b - area_i) - return area_i / (area_o + 1e-10) - - def _gtcropiou_matrix(self, a, b): - tl_i = np.maximum(a[:, np.newaxis, :2], b[:, :2]) - br_i = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) - - area_i = np.prod(br_i - tl_i, axis=2) * (tl_i < br_i).all(axis=2) - area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) - area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) - area_o = (area_a[:, np.newaxis] + area_b - area_i) - return area_i / (area_a + 1e-10) - - def _crop_box_with_center_constraint(self, box, crop): - cropped_box = box.copy() - - cropped_box[:, :2] = np.maximum(box[:, :2], crop[:2]) - cropped_box[:, 2:] = np.minimum(box[:, 2:], crop[2:]) - cropped_box[:, :2] -= crop[:2] - cropped_box[:, 2:] -= crop[:2] - - centers = (box[:, :2] + box[:, 2:]) / 2 - valid = np.logical_and(crop[:2] <= centers, - centers < crop[2:]).all(axis=1) - valid = np.logical_and( - valid, (cropped_box[:, :2] < cropped_box[:, 2:]).all(axis=1)) - - return cropped_box, np.where(valid)[0] - - def _crop_image(self, img, crop): - x1, y1, x2, y2 = crop - return img[y1:y2, x1:x2, :] - - def _crop_segm(self, segm, crop): - x1, y1, x2, y2 = crop - return segm[:, y1:y2, x1:x2] - - def _crop_joints(self, joints, crop): - x1, y1, x2, y2 = crop - joints[joints[..., 0] > x2, :] = 0 - joints[joints[..., 1] > y2, :] = 0 - joints[joints[..., 0] < x1, :] = 0 - joints[joints[..., 1] < y1, :] = 0 - joints[..., 0] -= x1 - joints[..., 1] -= y1 - return joints - - -@register_op -class RandomScaledCrop(BaseOperator): - """Resize image and bbox based on long side (with optional random scaling), - then crop or pad image to target size. - Args: - target_size (int|list): target size, "hw" format. - scale_range (list): random scale range. - interp (int): interpolation method, default to `cv2.INTER_LINEAR`. - fill_value (float|list|tuple): color value used to fill the canvas, - in RGB order. 
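
[Editor's note] The _crop_box_with_center_constraint helper above keeps a gt box only if its center falls inside the crop window, then clips and shifts its coordinates into the window's frame. A small standalone illustration (editor's names, not ppdet API):

import numpy as np

def crop_boxes_keep_centered(boxes, crop):
    crop = np.asarray(crop, dtype=np.float32)
    out = boxes.copy()
    out[:, :2] = np.maximum(boxes[:, :2], crop[:2])   # clip to window
    out[:, 2:] = np.minimum(boxes[:, 2:], crop[2:])
    out[:, :2] -= crop[:2]                            # shift to window frame
    out[:, 2:] -= crop[:2]
    centers = (boxes[:, :2] + boxes[:, 2:]) / 2
    keep = np.logical_and(crop[:2] <= centers, centers < crop[2:]).all(axis=1)
    keep &= (out[:, :2] < out[:, 2:]).all(axis=1)     # drop degenerate boxes
    return out[keep], np.where(keep)[0]

boxes = np.array([[10, 10, 50, 50], [90, 90, 120, 120]], dtype=np.float32)
print(crop_boxes_keep_centered(boxes, [0, 0, 60, 60]))  # keeps only the first
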
- """ - - def __init__(self, - target_size=512, - scale_range=[.1, 2.], - interp=cv2.INTER_LINEAR, - fill_value=(123.675, 116.28, 103.53)): - super(RandomScaledCrop, self).__init__() - assert isinstance(target_size, ( - Integral, Sequence)), "target_size must be Integer, List or Tuple" - if isinstance(target_size, Integral): - target_size = [target_size, ] * 2 - - self.target_size = target_size - self.scale_range = scale_range - self.interp = interp - assert isinstance(fill_value, (Number, Sequence)), \ - "fill value must be either float or sequence" - if isinstance(fill_value, Number): - fill_value = (fill_value, ) * 3 - if not isinstance(fill_value, tuple): - fill_value = tuple(fill_value) - self.fill_value = fill_value - - def apply_image(self, img, output_size, offset_x, offset_y): - th, tw = self.target_size - rh, rw = output_size - img = cv2.resize( - img, (rw, rh), interpolation=self.interp).astype(np.float32) - canvas = np.ones([th, tw, 3], dtype=np.float32) - canvas *= np.array(self.fill_value, dtype=np.float32) - canvas[:min(th, rh), :min(tw, rw)] = \ - img[offset_y:offset_y + th, offset_x:offset_x + tw] - return canvas - - def apply_bbox(self, gt_bbox, gt_class, scale, offset_x, offset_y): - th, tw = self.target_size - shift_array = np.array( - [ - offset_x, - offset_y, - ] * 2, dtype=np.float32) - boxes = gt_bbox * scale - shift_array - boxes[:, 0::2] = np.clip(boxes[:, 0::2], 0, tw) - boxes[:, 1::2] = np.clip(boxes[:, 1::2], 0, th) - # filter boxes with no area - area = np.prod(boxes[..., 2:] - boxes[..., :2], axis=1) - valid = (area > 1.).nonzero()[0] - return boxes[valid], gt_class[valid], valid - - def apply_segm(self, segms, output_size, offset_x, offset_y, valid=None): - th, tw = self.target_size - rh, rw = output_size - out_segms = [] - for segm in segms: - segm = cv2.resize(segm, (rw, rh), interpolation=cv2.INTER_NEAREST) - segm = segm.astype(np.float32) - canvas = np.zeros([th, tw], dtype=segm.dtype) - canvas[:min(th, rh), :min(tw, rw)] = \ - segm[offset_y:offset_y + th, offset_x:offset_x + tw] - out_segms.append(canvas) - out_segms = np.stack(out_segms) - return out_segms if valid is None else out_segms[valid] - - def apply(self, sample, context=None): - img = sample['image'] - h, w = img.shape[:2] - random_scale = np.random.uniform(*self.scale_range) - target_scale_size = [t * random_scale for t in self.target_size] - # Compute actual rescaling applied to image. 
-        scale = min(target_scale_size[0] / h, target_scale_size[1] / w)
-        output_size = [int(round(h * scale)), int(round(w * scale))]
-        # get offset
-        offset_x = int(
-            max(0, np.random.uniform(0., output_size[1] - self.target_size[1])))
-        offset_y = int(
-            max(0, np.random.uniform(0., output_size[0] - self.target_size[0])))
-
-        # apply to image
-        sample['image'] = self.apply_image(img, output_size, offset_x, offset_y)
-
-        # apply to bbox
-        valid = None
-        if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0:
-            sample['gt_bbox'], sample['gt_class'], valid = self.apply_bbox(
-                sample['gt_bbox'], sample['gt_class'], scale, offset_x,
-                offset_y)
-
-        # apply to segm
-        if 'gt_segm' in sample and len(sample['gt_segm']) > 0:
-            sample['gt_segm'] = self.apply_segm(sample['gt_segm'], output_size,
-                                                offset_x, offset_y, valid)
-
-        sample['im_shape'] = np.asarray(output_size, dtype=np.float32)
-        scale_factor = sample['scale_factor']
-        sample['scale_factor'] = np.asarray(
-            [scale_factor[0] * scale, scale_factor[1] * scale],
-            dtype=np.float32)
-
-        return sample
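
[Editor's note] In the Cutmix op below, the per-box gt_score encodes the mix weight: boxes from the first image keep weight factor, pasted boxes from the second get 1 - factor. A tiny sketch (editor's helper):

import numpy as np

def cutmix_scores(n1, n2, factor):
    # mirrors the gt_score concatenation in Cutmix.__call__ below
    return np.concatenate([np.full(n1, factor), np.full(n2, 1.0 - factor)])

print(cutmix_scores(2, 3, np.random.beta(1.5, 1.5)))
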


-@register_op
-class Cutmix(BaseOperator):
-    def __init__(self, alpha=1.5, beta=1.5):
-        """
-        CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features, see https://arxiv.org/abs/1905.04899
-        Cutmix image and gt_bbox/gt_score
-        Args:
-            alpha (float): alpha parameter of the beta distribution
-            beta (float): beta parameter of the beta distribution
-        """
-        super(Cutmix, self).__init__()
-        self.alpha = alpha
-        self.beta = beta
-        if self.alpha <= 0.0:
-            raise ValueError("alpha should be positive in {}".format(self))
-        if self.beta <= 0.0:
-            raise ValueError("beta should be positive in {}".format(self))
-
-    def apply_image(self, img1, img2, factor):
-        """ _rand_bbox """
-        h = max(img1.shape[0], img2.shape[0])
-        w = max(img1.shape[1], img2.shape[1])
-        cut_rat = np.sqrt(1. - factor)
-
-        cut_w = np.int32(w * cut_rat)
-        cut_h = np.int32(h * cut_rat)
-
-        # uniform
-        cx = np.random.randint(w)
-        cy = np.random.randint(h)
-
-        bbx1 = np.clip(cx - cut_w // 2, 0, w - 1)
-        bby1 = np.clip(cy - cut_h // 2, 0, h - 1)
-        bbx2 = np.clip(cx + cut_w // 2, 0, w - 1)
-        bby2 = np.clip(cy + cut_h // 2, 0, h - 1)
-
-        img_1_pad = np.zeros((h, w, img1.shape[2]), 'float32')
-        img_1_pad[:img1.shape[0], :img1.shape[1], :] = \
-            img1.astype('float32')
-        img_2_pad = np.zeros((h, w, img2.shape[2]), 'float32')
-        img_2_pad[:img2.shape[0], :img2.shape[1], :] = \
-            img2.astype('float32')
-        img_1_pad[bby1:bby2, bbx1:bbx2, :] = img_2_pad[bby1:bby2, bbx1:bbx2, :]
-        return img_1_pad
-
-    def __call__(self, sample, context=None):
-        if not isinstance(sample, Sequence):
-            return sample
-
-        assert len(sample) == 2, 'cutmix need two samples'
-
-        factor = np.random.beta(self.alpha, self.beta)
-        factor = max(0.0, min(1.0, factor))
-        if factor >= 1.0:
-            return sample[0]
-        if factor <= 0.0:
-            return sample[1]
-        img1 = sample[0]['image']
-        img2 = sample[1]['image']
-        img = self.apply_image(img1, img2, factor)
-        gt_bbox1 = sample[0]['gt_bbox']
-        gt_bbox2 = sample[1]['gt_bbox']
-        gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
-        gt_class1 = sample[0]['gt_class']
-        gt_class2 = sample[1]['gt_class']
-        gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
-        gt_score1 = np.ones_like(sample[0]['gt_class'])
-        gt_score2 = np.ones_like(sample[1]['gt_class'])
-        gt_score = np.concatenate(
-            (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
-        result = copy.deepcopy(sample[0])
-        result['image'] = img
-        result['gt_bbox'] = gt_bbox
-        result['gt_score'] = gt_score
-        result['gt_class'] = gt_class
-        if 'is_crowd' in sample[0]:
-            is_crowd1 = sample[0]['is_crowd']
-            is_crowd2 = sample[1]['is_crowd']
-            is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
-            result['is_crowd'] = is_crowd
-        if 'difficult' in sample[0]:
-            is_difficult1 = sample[0]['difficult']
-            is_difficult2 = sample[1]['difficult']
-            is_difficult = np.concatenate(
-                (is_difficult1, is_difficult2), axis=0)
-            result['difficult'] = is_difficult
-        return result
-
-
-@register_op
-class Mixup(BaseOperator):
-    def __init__(self, alpha=1.5, beta=1.5):
-        """ Mixup image and gt_bbox/gt_score
-        Args:
-            alpha (float): alpha parameter of the beta distribution
-            beta (float): beta parameter of the beta distribution
-        """
-        super(Mixup, self).__init__()
-        self.alpha = alpha
-        self.beta = beta
-        if self.alpha <= 0.0:
-            raise ValueError("alpha should be positive in {}".format(self))
-        if self.beta <= 0.0:
-            raise ValueError("beta should be positive in {}".format(self))
-
-    def apply_image(self, img1, img2, factor):
-        h = max(img1.shape[0], img2.shape[0])
-        w = max(img1.shape[1], img2.shape[1])
-        img = np.zeros((h, w, img1.shape[2]), 'float32')
-        img[:img1.shape[0], :img1.shape[1], :] = \
-            img1.astype('float32') * factor
-        img[:img2.shape[0], :img2.shape[1], :] += \
-            img2.astype('float32') * (1.0 - factor)
-        return img.astype('uint8')
-
-    def __call__(self, sample, context=None):
-        if not isinstance(sample, Sequence):
-            return sample
-
-        assert len(sample) == 2, 'mixup need two samples'
-
-        factor = np.random.beta(self.alpha, self.beta)
-        factor = max(0.0, min(1.0, factor))
-        if factor >= 1.0:
-            return sample[0]
-        if factor <= 0.0:
-            return sample[1]
-        im = self.apply_image(sample[0]['image'], sample[1]['image'], factor)
-        result = copy.deepcopy(sample[0])
-        result['image'] = im
-        # apply bbox and score
-        if 'gt_bbox' in sample[0]:
-            gt_bbox1 = sample[0]['gt_bbox']
-            gt_bbox2 = sample[1]['gt_bbox']
-            gt_bbox = np.concatenate((gt_bbox1, gt_bbox2), axis=0)
-            result['gt_bbox'] = gt_bbox
-        if 'gt_class' in sample[0]:
-            gt_class1 = sample[0]['gt_class']
-            gt_class2 = sample[1]['gt_class']
-            gt_class = np.concatenate((gt_class1, gt_class2), axis=0)
-            result['gt_class'] = gt_class
-
-            gt_score1 = np.ones_like(sample[0]['gt_class'])
-            gt_score2 = np.ones_like(sample[1]['gt_class'])
-            gt_score = np.concatenate(
-                (gt_score1 * factor, gt_score2 * (1. - factor)), axis=0)
-            result['gt_score'] = gt_score.astype('float32')
-        if 'is_crowd' in sample[0]:
-            is_crowd1 = sample[0]['is_crowd']
-            is_crowd2 = sample[1]['is_crowd']
-            is_crowd = np.concatenate((is_crowd1, is_crowd2), axis=0)
-            result['is_crowd'] = is_crowd
-        if 'difficult' in sample[0]:
-            is_difficult1 = sample[0]['difficult']
-            is_difficult2 = sample[1]['difficult']
-            is_difficult = np.concatenate(
-                (is_difficult1, is_difficult2), axis=0)
-            result['difficult'] = is_difficult
-
-        if 'gt_ide' in sample[0]:
-            gt_ide1 = sample[0]['gt_ide']
-            gt_ide2 = sample[1]['gt_ide']
-            gt_ide = np.concatenate((gt_ide1, gt_ide2), axis=0)
-            result['gt_ide'] = gt_ide
-        return result
-
-
-@register_op
-class NormalizeBox(BaseOperator):
-    """Transform the bounding box's coordinates to [0, 1]."""
-
-    def __init__(self):
-        super(NormalizeBox, self).__init__()
-
-    def apply(self, sample, context):
-        im = sample['image']
-        if 'gt_bbox' in sample.keys():
-            gt_bbox = sample['gt_bbox']
-            height, width, _ = im.shape
-            for i in range(gt_bbox.shape[0]):
-                gt_bbox[i][0] = gt_bbox[i][0] / width
-                gt_bbox[i][1] = gt_bbox[i][1] / height
-                gt_bbox[i][2] = gt_bbox[i][2] / width
-                gt_bbox[i][3] = gt_bbox[i][3] / height
-            sample['gt_bbox'] = gt_bbox
-
-            if 'gt_keypoint' in sample.keys():
-                gt_keypoint = sample['gt_keypoint']
-
-                for i in range(gt_keypoint.shape[1]):
-                    if i % 2:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] / height
-                    else:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] / width
-                sample['gt_keypoint'] = gt_keypoint
-
-            return sample
-        else:
-            return sample
-
-
-@register_op
-class BboxXYXY2XYWH(BaseOperator):
-    """
-    Convert bbox XYXY format to XYWH format.
-    """
-
-    def __init__(self):
-        super(BboxXYXY2XYWH, self).__init__()
-
-    def apply(self, sample, context=None):
-        if 'gt_bbox' in sample.keys():
-            bbox = sample['gt_bbox']
-            bbox[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]
-            bbox[:, :2] = bbox[:, :2] + bbox[:, 2:4] / 2.
-            sample['gt_bbox'] = bbox
-            return sample
-        else:
-            return sample
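
[Editor's note] Despite its name, BboxXYXY2XYWH above converts corner boxes to center format [center_x, center_y, w, h]: width/height are computed first, then the top-left corner is shifted by half of them. BboxCXCYWH2XYXY later in this file is its inverse. A minimal demo (editor's helper):

import numpy as np

def xyxy_to_cxcywh(bbox):
    out = bbox.copy()
    out[:, 2:4] = bbox[:, 2:4] - bbox[:, :2]      # w, h
    out[:, :2] = bbox[:, :2] + out[:, 2:4] / 2.   # cx, cy
    return out

box = np.array([[10., 20., 50., 60.]])
print(xyxy_to_cxcywh(box))  # [[30. 40. 40. 40.]]
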


-@register_op
-class PadBox(BaseOperator):
-    def __init__(self, num_max_boxes=50):
-        """
-        Pad zeros to bboxes if number of bboxes is less than num_max_boxes.
-        Args:
-            num_max_boxes (int): the max number of bboxes
-        """
-        self.num_max_boxes = num_max_boxes
-        super(PadBox, self).__init__()
-
-    def apply(self, sample, context=None):
-        assert 'gt_bbox' in sample
-        bbox = sample['gt_bbox']
-        gt_num = min(self.num_max_boxes, len(bbox))
-        num_max = self.num_max_boxes
-        # fields = context['fields'] if context else []
-        pad_bbox = np.zeros((num_max, 4), dtype=np.float32)
-        if gt_num > 0:
-            pad_bbox[:gt_num, :] = bbox[:gt_num, :]
-        sample['gt_bbox'] = pad_bbox
-        if 'gt_class' in sample:
-            pad_class = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_class[:gt_num] = sample['gt_class'][:gt_num, 0]
-            sample['gt_class'] = pad_class
-        if 'gt_score' in sample:
-            pad_score = np.zeros((num_max, ), dtype=np.float32)
-            if gt_num > 0:
-                pad_score[:gt_num] = sample['gt_score'][:gt_num, 0]
-            sample['gt_score'] = pad_score
-        # in training, for example in op ExpandImage,
-        # the bbox and gt_class are expanded, but difficult is not,
-        # so judge by its length
-        if 'difficult' in sample:
-            pad_diff = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_diff[:gt_num] = sample['difficult'][:gt_num, 0]
-            sample['difficult'] = pad_diff
-        if 'is_crowd' in sample:
-            pad_crowd = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_crowd[:gt_num] = sample['is_crowd'][:gt_num, 0]
-            sample['is_crowd'] = pad_crowd
-        if 'gt_ide' in sample:
-            pad_ide = np.zeros((num_max, ), dtype=np.int32)
-            if gt_num > 0:
-                pad_ide[:gt_num] = sample['gt_ide'][:gt_num, 0]
-            sample['gt_ide'] = pad_ide
-        return sample
-
-
-@register_op
-class DebugVisibleImage(BaseOperator):
-    """
-    In debug mode, visualize images according to `gt_box`.
-    (Currently only supported when not cropping or flipping the image.)
-    """
-
-    def __init__(self, output_dir='output/debug', is_normalized=False):
-        super(DebugVisibleImage, self).__init__()
-        self.is_normalized = is_normalized
-        self.output_dir = output_dir
-        if not os.path.isdir(output_dir):
-            os.makedirs(output_dir)
-        if not isinstance(self.is_normalized, bool):
-            raise TypeError("{}: input type is invalid.".format(self))
-
-    def apply(self, sample, context=None):
-        image = Image.fromarray(sample['image'].astype(np.uint8))
-        out_file_name = '{:012d}.jpg'.format(sample['im_id'][0])
-        width = sample['w']
-        height = sample['h']
-        gt_bbox = sample['gt_bbox']
-        gt_class = sample['gt_class']
-        draw = ImageDraw.Draw(image)
-        for i in range(gt_bbox.shape[0]):
-            if self.is_normalized:
-                gt_bbox[i][0] = gt_bbox[i][0] * width
-                gt_bbox[i][1] = gt_bbox[i][1] * height
-                gt_bbox[i][2] = gt_bbox[i][2] * width
-                gt_bbox[i][3] = gt_bbox[i][3] * height
-
-            xmin, ymin, xmax, ymax = gt_bbox[i]
-            draw.line(
-                [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin),
-                 (xmin, ymin)],
-                width=2,
-                fill='green')
-            # draw label
-            text = str(gt_class[i][0])
-            tw, th = imagedraw_textsize_c(draw, text)
-            draw.rectangle(
-                [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green')
-            draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255))
-
-        if 'gt_keypoint' in sample.keys():
-            gt_keypoint = sample['gt_keypoint']
-            if self.is_normalized:
-                for i in range(gt_keypoint.shape[1]):
-                    if i % 2:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] * height
-                    else:
-                        gt_keypoint[:, i] = gt_keypoint[:, i] * width
-            for i in range(gt_keypoint.shape[0]):
-                keypoint = gt_keypoint[i]
-                for j in range(int(keypoint.shape[0] / 2)):
-                    x1 = round(keypoint[2 * j]).astype(np.int32)
-                    y1 = round(keypoint[2 * j + 1]).astype(np.int32)
-                    draw.ellipse(
-                        (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green')
-        save_path = os.path.join(self.output_dir, out_file_name)
-        image.save(save_path, quality=95)
-        return sample
-
-
-@register_op
-class Pad(BaseOperator):
-    def __init__(self,
-                 size=None,
-                 size_divisor=32,
-                 pad_mode=0,
-                 offsets=None,
-                 fill_value=(127.5, 127.5, 127.5)):
-        """
-        Pad image to a specified size or multiple of size_divisor.
-        Args:
-            size (int, Sequence): image target size, if None, pad to multiple of size_divisor, default None
-            size_divisor (int): size divisor, default 32
-            pad_mode (int): pad mode, currently only supports four modes [-1, 0, 1, 2]. if -1, use specified offsets
-                if 0, only pad to right and bottom. if 1, pad according to center. if 2, only pad left and top
-            offsets (list): [offset_x, offset_y], specify offset while padding, only supported pad_mode=-1
-            fill_value (float|list|tuple): rgb value of pad area, default (127.5, 127.5, 127.5)
-        """
-        super(Pad, self).__init__()
-
-        if size is not None and not isinstance(size, (int, Sequence)):
-            raise TypeError(
-                "Type of size is invalid. Must be int, List or Tuple, now is {}".
-                format(type(size)))
-
-        if isinstance(size, int):
-            size = [size, size]
-
-        assert pad_mode in [
-            -1, 0, 1, 2
-        ], 'currently only supports four modes [-1, 0, 1, 2]'
-        if pad_mode == -1:
-            assert offsets, 'if pad_mode is -1, offsets should not be None'
-
-        self.size = size
-        self.size_divisor = size_divisor
-        self.pad_mode = pad_mode
-        self.fill_value = fill_value
-        self.offsets = offsets
-
-    def apply_segm(self, segms, offsets, im_size, size):
-        def _expand_poly(poly, x, y):
-            expanded_poly = np.array(poly)
-            expanded_poly[0::2] += x
-            expanded_poly[1::2] += y
-            return expanded_poly.tolist()
-
-        def _expand_rle(rle, x, y, height, width, h, w):
-            if 'counts' in rle and type(rle['counts']) == list:
-                rle = mask_util.frPyObjects(rle, height, width)
-            mask = mask_util.decode(rle)
-            expanded_mask = np.full((h, w), 0).astype(mask.dtype)
-            expanded_mask[y:y + height, x:x + width] = mask
-            rle = mask_util.encode(
-                np.array(
-                    expanded_mask, order='F', dtype=np.uint8))
-            return rle
-
-        x, y = offsets
-        height, width = im_size
-        h, w = size
-        expanded_segms = []
-        for segm in segms:
-            if is_poly(segm):
-                # Polygon format
-                expanded_segms.append(
-                    [_expand_poly(poly, x, y) for poly in segm])
-            else:
-                # RLE format
-                import pycocotools.mask as mask_util
-                expanded_segms.append(
-                    _expand_rle(segm, x, y, height, width, h, w))
-        return expanded_segms
-
-    def apply_bbox(self, bbox, offsets):
-        return bbox + np.array(offsets * 2, dtype=np.float32)
-
-    def apply_keypoint(self, keypoints, offsets):
-        n = len(keypoints[0]) // 2
-        return keypoints + np.array(offsets * n, dtype=np.float32)
-
-    def apply_image(self, image, offsets, im_size, size):
-        x, y = offsets
-        im_h, im_w = im_size
-        h, w = size
-        canvas = np.ones((h, w, 3), dtype=np.float32)
-        canvas *= np.array(self.fill_value, dtype=np.float32)
-        canvas[y:y + im_h, x:x + im_w, :] = image.astype(np.float32)
-        return canvas
-
-    def apply(self, sample, context=None):
-        im = sample['image']
-        im_h, im_w = im.shape[:2]
-        if self.size:
-            h, w = self.size
-            assert (
-                im_h <= h and im_w <= w
-            ), '(h, w) of target size should be greater than (im_h, im_w)'
-        else:
-            h = int(np.ceil(im_h / self.size_divisor) * self.size_divisor)
-            w = int(np.ceil(im_w / self.size_divisor) * self.size_divisor)
-
-        if h == im_h and w == im_w:
-            sample['image'] = im.astype(np.float32)
-            return sample
-
-        if self.pad_mode == -1:
-            offset_x, offset_y = self.offsets
-        elif self.pad_mode == 0:
-            offset_y,
offset_x = 0, 0 - elif self.pad_mode == 1: - offset_y, offset_x = (h - im_h) // 2, (w - im_w) // 2 - else: - offset_y, offset_x = h - im_h, w - im_w - - offsets, im_size, size = [offset_x, offset_y], [im_h, im_w], [h, w] - - sample['image'] = self.apply_image(im, offsets, im_size, size) - - if self.pad_mode == 0: - return sample - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], offsets) - - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], offsets, - im_size, size) - - if 'gt_keypoint' in sample and len(sample['gt_keypoint']) > 0: - sample['gt_keypoint'] = self.apply_keypoint(sample['gt_keypoint'], - offsets) - - return sample - - -@register_op -class Poly2Mask(BaseOperator): - """ - gt poly to mask annotations. - Args: - del_poly (bool): Whether to delete poly after generating mask. Default: False. - """ - - def __init__(self, del_poly=False): - super(Poly2Mask, self).__init__() - import pycocotools.mask as maskUtils - self.maskutils = maskUtils - self.del_poly = del_poly - - def _poly2mask(self, mask_ann, img_h, img_w): - if isinstance(mask_ann, list): - # polygon -- a single object might consist of multiple parts - # we merge all parts into one mask rle code - rles = self.maskutils.frPyObjects(mask_ann, img_h, img_w) - rle = self.maskutils.merge(rles) - elif isinstance(mask_ann['counts'], list): - # uncompressed RLE - rle = self.maskutils.frPyObjects(mask_ann, img_h, img_w) - else: - # rle - rle = mask_ann - mask = self.maskutils.decode(rle) - return mask - - def apply(self, sample, context=None): - assert 'gt_poly' in sample - im_h, im_w = sample['im_shape'] - masks = [ - self._poly2mask(gt_poly, im_h, im_w) - for gt_poly in sample['gt_poly'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - if self.del_poly: - del (sample['gt_poly']) - - return sample - - -@register_op -class AugmentHSV(BaseOperator): - """ - Augment the SV channel of image data. - Args: - fraction (float): the fraction for augment. Default: 0.5. - is_bgr (bool): whether the image is BGR mode. Default: True. 
-        hgain (float): H channel gains
-        sgain (float): S channel gains
-        vgain (float): V channel gains
-    """
-
-    def __init__(self,
-                 fraction=0.50,
-                 is_bgr=True,
-                 hgain=None,
-                 sgain=None,
-                 vgain=None):
-        super(AugmentHSV, self).__init__()
-        self.fraction = fraction
-        self.is_bgr = is_bgr
-        self.hgain = hgain
-        self.sgain = sgain
-        self.vgain = vgain
-        self.use_hsvgain = False if hgain is None else True
-
-    def apply(self, sample, context=None):
-        img = sample['image']
-        if self.is_bgr:
-            img_hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
-        else:
-            img_hsv = cv2.cvtColor(img, cv2.COLOR_RGB2HSV)
-
-        if self.use_hsvgain:
-            hsv_augs = np.random.uniform(
-                -1, 1, 3) * [self.hgain, self.sgain, self.vgain]
-            # random selection of h, s, v
-            hsv_augs *= np.random.randint(0, 2, 3)
-            img_hsv[..., 0] = (img_hsv[..., 0] + hsv_augs[0]) % 180
-            img_hsv[..., 1] = np.clip(img_hsv[..., 1] + hsv_augs[1], 0, 255)
-            img_hsv[..., 2] = np.clip(img_hsv[..., 2] + hsv_augs[2], 0, 255)
-
-        else:
-            S = img_hsv[:, :, 1].astype(np.float32)
-            V = img_hsv[:, :, 2].astype(np.float32)
-
-            a = (random.random() * 2 - 1) * self.fraction + 1
-            S *= a
-            if a > 1:
-                np.clip(S, a_min=0, a_max=255, out=S)
-
-            a = (random.random() * 2 - 1) * self.fraction + 1
-            V *= a
-            if a > 1:
-                np.clip(V, a_min=0, a_max=255, out=V)
-
-            img_hsv[:, :, 1] = S.astype(np.uint8)
-            img_hsv[:, :, 2] = V.astype(np.uint8)
-
-        if self.is_bgr:
-            cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)
-        else:
-            cv2.cvtColor(img_hsv, cv2.COLOR_HSV2RGB, dst=img)
-
-        sample['image'] = img.astype(np.float32)
-        return sample
-
-
-@register_op
-class Norm2PixelBbox(BaseOperator):
-    """
-    Transform the bounding box's coordinates from [0, 1] to pixels.
-    """
-
-    def __init__(self):
-        super(Norm2PixelBbox, self).__init__()
-
-    def apply(self, sample, context=None):
-        assert 'gt_bbox' in sample
-        bbox = sample['gt_bbox']
-        height, width = sample['image'].shape[:2]
-        bbox[:, 0::2] = bbox[:, 0::2] * width
-        bbox[:, 1::2] = bbox[:, 1::2] * height
-        sample['gt_bbox'] = bbox
-        return sample
-
-
-@register_op
-class BboxCXCYWH2XYXY(BaseOperator):
-    """
-    Convert bbox CXCYWH format to XYXY format.
-    [center_x, center_y, width, height] -> [x0, y0, x1, y1]
-    """
-
-    def __init__(self):
-        super(BboxCXCYWH2XYXY, self).__init__()
-
-    def apply(self, sample, context=None):
-        assert 'gt_bbox' in sample
-        bbox0 = sample['gt_bbox']
-        bbox = bbox0.copy()
-
-        bbox[:, :2] = bbox0[:, :2] - bbox0[:, 2:4] / 2.
-        bbox[:, 2:4] = bbox0[:, :2] + bbox0[:, 2:4] / 2.
-        sample['gt_bbox'] = bbox
-        return sample
-
-
-@register_op
-class RandomResizeCrop(BaseOperator):
-    """Random resize and crop image and bboxes.
-    Args:
-        resizes (list): resize image to one of resizes. if keep_ratio is True and mode is
-            'long', resize the image's long side to the maximum of target_size, if keep_ratio is
-            True and mode is 'short', resize the image's short side to the minimum of target_size.
-        cropsizes (list): crop sizes after resize, [(min_crop_1, max_crop_1), ...]
-        mode (str): resize mode, `long` or `short`. Details see resizes.
-        prob (float): probability of this op.
-        keep_ratio (bool): whether to keep the aspect ratio, default True
-        interp (int): the interpolation method
-        thresholds (list): iou thresholds for deciding a valid bbox crop.
-        num_attempts (int): number of tries before giving up.
-        allow_no_crop (bool): allow return without actually cropping them.
-        cover_all_box (bool): ensure all bboxes are covered in the final crop.
-        is_mask_crop (bool): whether to crop the segmentation masks.
- """ - - def __init__(self, - resizes, - cropsizes, - prob=0.5, - mode='short', - keep_ratio=True, - interp=cv2.INTER_LINEAR, - num_attempts=3, - cover_all_box=False, - allow_no_crop=False, - thresholds=[0.3, 0.5, 0.7], - is_mask_crop=False, - ioumode="iou"): - super(RandomResizeCrop, self).__init__() - - self.resizes = resizes - self.cropsizes = cropsizes - self.prob = prob - self.mode = mode - self.ioumode = ioumode - - self.resizer = Resize(0, keep_ratio=keep_ratio, interp=interp) - self.croper = RandomCrop( - num_attempts=num_attempts, - cover_all_box=cover_all_box, - thresholds=thresholds, - allow_no_crop=allow_no_crop, - is_mask_crop=is_mask_crop) - - def _format_size(self, size): - if isinstance(size, Integral): - size = (size, size) - return size - - def apply(self, sample, context=None): - if random.random() < self.prob: - _resize = self._format_size(random.choice(self.resizes)) - _cropsize = self._format_size(random.choice(self.cropsizes)) - sample = self._resize( - self.resizer, - sample, - size=_resize, - mode=self.mode, - context=context) - sample = self._random_crop( - self.croper, sample, size=_cropsize, context=context) - return sample - - @staticmethod - def _random_crop(croper, sample, size, context=None): - if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: - return sample - - self = croper - h, w = sample['image'].shape[:2] - gt_bbox = sample['gt_bbox'] - cropsize = size - min_crop = min(cropsize) - max_crop = max(cropsize) - - thresholds = list(self.thresholds) - np.random.shuffle(thresholds) - - for thresh in thresholds: - found = False - for _ in range(self.num_attempts): - - crop_h = random.randint(min_crop, min(h, max_crop)) - crop_w = random.randint(min_crop, min(w, max_crop)) - - crop_y = random.randint(0, h - crop_h) - crop_x = random.randint(0, w - crop_w) - - crop_box = [crop_x, crop_y, crop_x + crop_w, crop_y + crop_h] - if self.ioumode == "iof": - iou = self._gtcropiou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - elif self.ioumode == "iou": - iou = self._iou_matrix( - gt_bbox, np.array( - [crop_box], dtype=np.float32)) - if iou.max() < thresh: - continue - - if self.cover_all_box and iou.min() < thresh: - continue - - cropped_box, valid_ids = self._crop_box_with_center_constraint( - gt_bbox, np.array( - crop_box, dtype=np.float32)) - if valid_ids.size > 0: - found = True - break - - if found: - if self.is_mask_crop and 'gt_poly' in sample and len(sample[ - 'gt_poly']) > 0: - crop_polys = self.crop_segms( - sample['gt_poly'], - valid_ids, - np.array( - crop_box, dtype=np.int64), - h, - w) - if [] in crop_polys: - delete_id = list() - valid_polys = list() - for id, crop_poly in enumerate(crop_polys): - if crop_poly == []: - delete_id.append(id) - else: - valid_polys.append(crop_poly) - valid_ids = np.delete(valid_ids, delete_id) - if len(valid_polys) == 0: - return sample - sample['gt_poly'] = valid_polys - else: - sample['gt_poly'] = crop_polys - - if 'gt_segm' in sample: - sample['gt_segm'] = self._crop_segm(sample['gt_segm'], - crop_box) - sample['gt_segm'] = np.take( - sample['gt_segm'], valid_ids, axis=0) - - sample['image'] = self._crop_image(sample['image'], crop_box) - sample['gt_bbox'] = np.take(cropped_box, valid_ids, axis=0) - sample['gt_class'] = np.take( - sample['gt_class'], valid_ids, axis=0) - if 'gt_score' in sample: - sample['gt_score'] = np.take( - sample['gt_score'], valid_ids, axis=0) - - if 'is_crowd' in sample: - sample['is_crowd'] = np.take( - sample['is_crowd'], valid_ids, axis=0) - - if 'gt_areas' in sample: - 
sample['gt_areas'] = np.take( - sample['gt_areas'], valid_ids, axis=0) - - if 'gt_joints' in sample: - gt_joints = self._crop_joints(sample['gt_joints'], crop_box) - sample['gt_joints'] = gt_joints[valid_ids] - return sample - - return sample - - @staticmethod - def _resize(resizer, sample, size, mode='short', context=None): - self = resizer - im = sample['image'] - target_size = size - - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - raise ImageError('{}: image is not 3-dimensional.'.format(self)) - - # apply image - im_shape = im.shape - if self.keep_ratio: - - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - - target_size_min = np.min(target_size) - target_size_max = np.max(target_size) - - if mode == 'long': - im_scale = min(target_size_min / im_size_min, - target_size_max / im_size_max) - else: - im_scale = max(target_size_min / im_size_min, - target_size_max / im_size_max) - - resize_h = int(im_scale * float(im_shape[0]) + 0.5) - resize_w = int(im_scale * float(im_shape[1]) + 0.5) - - im_scale_x = im_scale - im_scale_y = im_scale - else: - resize_h, resize_w = target_size - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im - sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox(sample['gt_bbox'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im_shape[:2], - [im_scale_x, im_scale_y]) - - # apply semantic - if 'semantic' in sample and sample['semantic']: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic - - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - masks = [ - cv2.resize( - gt_segm, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=cv2.INTER_NEAREST) - for gt_segm in sample['gt_segm'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - - if 'gt_joints' in sample: - sample['gt_joints'] = self.apply_joints(sample['gt_joints'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - return sample - - -@register_op -class RandomSelect(BaseOperator): - """ - Randomly choose a transformation between transforms1 and transforms2, - and the probability of choosing transforms1 is p. 
-
-    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
-
-    """
-
-    def __init__(self, transforms1, transforms2, p=0.5):
-        super(RandomSelect, self).__init__()
-        self.transforms1 = Compose(transforms1)
-        self.transforms2 = Compose(transforms2)
-        self.p = p
-
-    def apply(self, sample, context=None):
-        if random.random() < self.p:
-            return self.transforms1(sample)
-        return self.transforms2(sample)
-
-
-@register_op
-class RandomSelects(BaseOperator):
-    """
-    Randomly choose one transformation from transforms_list. If p is given,
-    its entries act as cumulative probability thresholds checked against a
-    single uniform draw; otherwise the choice is uniform.
-
-    The code is based on https://github.com/facebookresearch/detr/blob/main/datasets/transforms.py
-
-    """
-
-    def __init__(self, transforms_list, p=None):
-        super(RandomSelects, self).__init__()
-        if p is not None:
-            assert isinstance(p, (list, tuple))
-            assert len(transforms_list) == len(p)
-        else:
-            assert len(transforms_list) > 0
-        self.transforms = [Compose(t) for t in transforms_list]
-        self.p = p
-
-    def apply(self, sample, context=None):
-        if self.p is None:
-            return random.choice(self.transforms)(sample)
-        else:
-            prob = random.random()
-            for p, t in zip(self.p, self.transforms):
-                if prob <= p:
-                    return t(sample)
-
-
-@register_op
-class RandomShortSideResize(BaseOperator):
-    def __init__(self,
-                 short_side_sizes,
-                 max_size=None,
-                 interp=cv2.INTER_LINEAR,
-                 random_interp=False):
-        """
-        Resize the image randomly according to the short side. If max_size is not None,
-        the long side is capped at max_size. The whole process keeps the aspect ratio.
-        Args:
-            short_side_sizes (list|tuple): Image target short side size.
-            max_size (int): The size of the longest side of image after resize.
-            interp (int): The interpolation method.
-            random_interp (bool): Whether to randomly select the interpolation method.
- """ - super(RandomShortSideResize, self).__init__() - - assert isinstance(short_side_sizes, - Sequence), "short_side_sizes must be List or Tuple" - - self.short_side_sizes = short_side_sizes - self.max_size = max_size - self.interp = interp - self.random_interp = random_interp - self.interps = [ - cv2.INTER_NEAREST, - cv2.INTER_LINEAR, - cv2.INTER_AREA, - cv2.INTER_CUBIC, - cv2.INTER_LANCZOS4, - ] - - def get_size_with_aspect_ratio(self, image_shape, size, max_size=None): - h, w = image_shape - max_clip = False - if max_size is not None: - min_original_size = float(min((w, h))) - max_original_size = float(max((w, h))) - if max_original_size / min_original_size * size > max_size: - size = int(max_size * min_original_size / max_original_size) - max_clip = True - - if (w <= h and w == size) or (h <= w and h == size): - return (w, h) - - if w < h: - ow = size - oh = int(round(size * h / w)) if not max_clip else max_size - else: - oh = size - ow = int(round(size * w / h)) if not max_clip else max_size - - return (ow, oh) - - def resize(self, - sample, - target_size, - max_size=None, - interp=cv2.INTER_LINEAR): - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - raise ImageError('{}: image is not 3-dimensional.'.format(self)) - - target_size = self.get_size_with_aspect_ratio(im.shape[:2], target_size, - max_size) - im_scale_y, im_scale_x = target_size[1] / im.shape[0], target_size[ - 0] / im.shape[1] - - sample['image'] = cv2.resize(im, target_size, interpolation=interp) - sample['im_shape'] = np.asarray(target_size[::-1], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_bbox( - sample['gt_bbox'], [im_scale_x, im_scale_y], target_size) - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], im.shape[:2], - [im_scale_x, im_scale_y]) - # apply semantic - if 'semantic' in sample and sample['semantic']: - semantic = sample['semantic'] - semantic = cv2.resize( - semantic.astype('float32'), - target_size, - interpolation=self.interp) - semantic = np.asarray(semantic).astype('int32') - semantic = np.expand_dims(semantic, 0) - sample['semantic'] = semantic - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - masks = [ - cv2.resize( - gt_segm, target_size, interpolation=cv2.INTER_NEAREST) - for gt_segm in sample['gt_segm'] - ] - sample['gt_segm'] = np.asarray(masks).astype(np.uint8) - - if 'gt_joints' in sample: - sample['gt_joints'] = self.apply_joints( - sample['gt_joints'], [im_scale_x, im_scale_y], target_size) - - # apply areas - if 'gt_areas' in sample: - sample['gt_areas'] = self.apply_area(sample['gt_areas'], - [im_scale_x, im_scale_y]) - - return sample - - def apply_bbox(self, bbox, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - bbox[:, 0::2] *= im_scale_x - bbox[:, 1::2] *= im_scale_y - bbox[:, 0::2] = np.clip(bbox[:, 0::2], 0, resize_w) - bbox[:, 1::2] = np.clip(bbox[:, 1::2], 0, resize_h) - return bbox.astype('float32') - - def apply_joints(self, joints, scale, size): - im_scale_x, im_scale_y = scale - 
resize_w, resize_h = size - joints[..., 0] *= im_scale_x - joints[..., 1] *= im_scale_y - # joints[joints[..., 0] >= resize_w, :] = 0 - # joints[joints[..., 1] >= resize_h, :] = 0 - # joints[joints[..., 0] < 0, :] = 0 - # joints[joints[..., 1] < 0, :] = 0 - joints[..., 0] = np.clip(joints[..., 0], 0, resize_w) - joints[..., 1] = np.clip(joints[..., 1], 0, resize_h) - return joints - - def apply_area(self, area, scale): - im_scale_x, im_scale_y = scale - return area * im_scale_x * im_scale_y - - def apply_segm(self, segms, im_size, scale): - def _resize_poly(poly, im_scale_x, im_scale_y): - resized_poly = np.array(poly).astype('float32') - resized_poly[0::2] *= im_scale_x - resized_poly[1::2] *= im_scale_y - return resized_poly.tolist() - - def _resize_rle(rle, im_h, im_w, im_scale_x, im_scale_y): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, im_h, im_w) - - mask = mask_util.decode(rle) - mask = cv2.resize( - mask, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - im_h, im_w = im_size - im_scale_x, im_scale_y = scale - resized_segms = [] - for segm in segms: - if is_poly(segm): - # Polygon format - resized_segms.append([ - _resize_poly(poly, im_scale_x, im_scale_y) for poly in segm - ]) - else: - # RLE format - import pycocotools.mask as mask_util - resized_segms.append( - _resize_rle(segm, im_h, im_w, im_scale_x, im_scale_y)) - - return resized_segms - - def apply(self, sample, context=None): - target_size = random.choice(self.short_side_sizes) - interp = random.choice( - self.interps) if self.random_interp else self.interp - - return self.resize(sample, target_size, self.max_size, interp) - - -@register_op -class RandomShortSideRangeResize(RandomShortSideResize): - def __init__(self, scales, interp=cv2.INTER_LINEAR, random_interp=False): - """ - Resize the image randomly according to the short side. If max_size is not None, - the long side is scaled according to max_size. The whole process will be keep ratio. - Args: - short_side_sizes (list|tuple): Image target short side size. - interp (int): The interpolation method. - random_interp (bool): Whether random select interpolation method. - """ - super(RandomShortSideRangeResize, self).__init__(scales, None, interp, - random_interp) - - assert isinstance(scales, - Sequence), "short_side_sizes must be List or Tuple" - - self.scales = scales - - def random_sample(self, img_scales): - img_scale_long = [max(s) for s in img_scales] - img_scale_short = [min(s) for s in img_scales] - long_edge = np.random.randint( - min(img_scale_long), max(img_scale_long) + 1) - short_edge = np.random.randint( - min(img_scale_short), max(img_scale_short) + 1) - img_scale = (long_edge, short_edge) - return img_scale - - def apply(self, sample, context=None): - long_edge, short_edge = self.random_sample(self.short_side_sizes) - # print("target size:{}".format((long_edge, short_edge))) - interp = random.choice( - self.interps) if self.random_interp else self.interp - - return self.resize(sample, short_edge, long_edge, interp) - - -@register_op -class RandomSizeCrop(BaseOperator): - """ - Cut the image randomly according to `min_size` and `max_size` - Args: - min_size (int): Min size for edges of cropped image. - max_size (int): Max size for edges of cropped image. If it - is set to larger than length of the input image, - the output will keep the origin length. 
- keep_empty (bool): Whether to keep the cropped result with no object. - If it is set to False, the no-object result will not - be returned, replaced by the original input. - """ - - def __init__(self, min_size, max_size, keep_empty=True): - super(RandomSizeCrop, self).__init__() - self.min_size = min_size - self.max_size = max_size - self.keep_empty = keep_empty - - from paddle.vision.transforms.functional import crop as paddle_crop - self.paddle_crop = paddle_crop - - @staticmethod - def get_crop_params(img_shape, output_size): - """Get parameters for ``crop`` for a random crop. - Args: - img_shape (list|tuple): Image's height and width. - output_size (list|tuple): Expected output size of the crop. - Returns: - tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. - """ - h, w = img_shape - th, tw = output_size - - if h + 1 < th or w + 1 < tw: - raise ValueError( - "Required crop size {} is larger then input image size {}". - format((th, tw), (h, w))) - - if w == tw and h == th: - return 0, 0, h, w - - i = random.randint(0, h - th + 1) - j = random.randint(0, w - tw + 1) - return i, j, th, tw - - def crop(self, sample, region): - keep_index = None - # apply bbox and check whether the cropped result is valid - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - croped_bbox = self.apply_bbox(sample['gt_bbox'], region) - bbox = croped_bbox.reshape([-1, 2, 2]) - area = (bbox[:, 1, :] - bbox[:, 0, :]).prod(axis=1) - keep_index = np.where(area > 0)[0] - - if not self.keep_empty and len(keep_index) == 0: - # When keep_empty is set to False, cropped with no-object will - # not be used and return the origin content. - return sample - - sample['gt_bbox'] = croped_bbox[keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 4], dtype=np.float32) - sample['gt_class'] = sample['gt_class'][keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 1], dtype=np.float32) - if 'gt_score' in sample: - sample['gt_score'] = sample['gt_score'][keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 1], dtype=np.float32) - if 'is_crowd' in sample: - sample['is_crowd'] = sample['is_crowd'][keep_index] if len( - keep_index) > 0 else np.zeros( - [0, 1], dtype=np.float32) - if 'gt_areas' in sample: - sample['gt_areas'] = np.take( - sample['gt_areas'], keep_index, axis=0) - - image_shape = sample['image'].shape[:2] - sample['image'] = self.paddle_crop(sample['image'], *region) - sample['im_shape'] = np.array( - sample['image'].shape[:2], dtype=np.float32) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_segm(sample['gt_poly'], region, - image_shape) - sample['gt_poly'] = np.array(sample['gt_poly']) - if keep_index is not None and len(keep_index) > 0: - sample['gt_poly'] = sample['gt_poly'][keep_index] - sample['gt_poly'] = sample['gt_poly'].tolist() - # apply gt_segm - if 'gt_segm' in sample and len(sample['gt_segm']) > 0: - i, j, h, w = region - sample['gt_segm'] = sample['gt_segm'][:, i:i + h, j:j + w] - if keep_index is not None and len(keep_index) > 0: - sample['gt_segm'] = sample['gt_segm'][keep_index] - - if 'gt_joints' in sample: - gt_joints = self._crop_joints(sample['gt_joints'], region) - sample['gt_joints'] = gt_joints - if keep_index is not None: - sample['gt_joints'] = sample['gt_joints'][keep_index] - - return sample - - def apply_bbox(self, bbox, region): - i, j, h, w = region - region_size = np.asarray([w, h]) - crop_bbox = bbox - np.asarray([j, i, j, i]) - crop_bbox = np.minimum(crop_bbox.reshape([-1, 2, 2]), 
region_size) - crop_bbox = crop_bbox.clip(min=0) - return crop_bbox.reshape([-1, 4]).astype('float32') - - def _crop_joints(self, joints, region): - y1, x1, h, w = region - x2 = x1 + w - y2 = y1 + h - # x1, y1, x2, y2 = crop - joints[..., 0] -= x1 - joints[..., 1] -= y1 - joints[joints[..., 0] > w, :] = 0 - joints[joints[..., 1] > h, :] = 0 - joints[joints[..., 0] < 0, :] = 0 - joints[joints[..., 1] < 0, :] = 0 - return joints - - def apply_segm(self, segms, region, image_shape): - def _crop_poly(segm, crop): - xmin, ymin, xmax, ymax = crop - crop_coord = [xmin, ymin, xmin, ymax, xmax, ymax, xmax, ymin] - crop_p = np.array(crop_coord).reshape(4, 2) - crop_p = Polygon(crop_p) - - crop_segm = list() - for poly in segm: - poly = np.array(poly).reshape(len(poly) // 2, 2) - polygon = Polygon(poly) - if not polygon.is_valid: - exterior = polygon.exterior - multi_lines = exterior.intersection(exterior) - polygons = shapely.ops.polygonize(multi_lines) - polygon = MultiPolygon(polygons) - multi_polygon = list() - if isinstance(polygon, MultiPolygon): - multi_polygon = copy.deepcopy(polygon) - else: - multi_polygon.append(copy.deepcopy(polygon)) - for per_polygon in multi_polygon: - inter = per_polygon.intersection(crop_p) - if not inter: - continue - if isinstance(inter, (MultiPolygon, GeometryCollection)): - for part in inter: - if not isinstance(part, Polygon): - continue - part = np.squeeze( - np.array(part.exterior.coords[:-1]).reshape(1, - -1)) - part[0::2] -= xmin - part[1::2] -= ymin - crop_segm.append(part.tolist()) - elif isinstance(inter, Polygon): - crop_poly = np.squeeze( - np.array(inter.exterior.coords[:-1]).reshape(1, -1)) - crop_poly[0::2] -= xmin - crop_poly[1::2] -= ymin - crop_segm.append(crop_poly.tolist()) - else: - continue - return crop_segm - - def _crop_rle(rle, crop, height, width): - if 'counts' in rle and type(rle['counts']) == list: - rle = mask_util.frPyObjects(rle, height, width) - mask = mask_util.decode(rle) - mask = mask[crop[1]:crop[3], crop[0]:crop[2]] - rle = mask_util.encode(np.array(mask, order='F', dtype=np.uint8)) - return rle - - i, j, h, w = region - crop = [j, i, j + w, i + h] - height, width = image_shape - crop_segms = [] - for segm in segms: - if is_poly(segm): - import copy - import shapely.ops - from shapely.geometry import Polygon, MultiPolygon, GeometryCollection - # Polygon format - crop_segms.append(_crop_poly(segm, crop)) - else: - # RLE format - import pycocotools.mask as mask_util - crop_segms.append(_crop_rle(segm, crop, height, width)) - return crop_segms - - def apply(self, sample, context=None): - h = random.randint(self.min_size, - min(sample['image'].shape[0], self.max_size)) - w = random.randint(self.min_size, - min(sample['image'].shape[1], self.max_size)) - - region = self.get_crop_params(sample['image'].shape[:2], [h, w]) - return self.crop(sample, region) - - -@register_op -class WarpAffine(BaseOperator): - def __init__(self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - scale=0.4, - shift=0.1, - down_ratio=4): - """WarpAffine - Warp affine the image - The code is based on https://github.com/xingyizhou/CenterNet/blob/master/src/lib/datasets/sample/ctdet.py - """ - super(WarpAffine, self).__init__() - self.keep_res = keep_res - self.pad = pad - self.input_h = input_h - self.input_w = input_w - self.scale = scale - self.shift = shift - self.down_ratio = down_ratio - - def apply(self, sample, context=None): - img = sample['image'] - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - - h, w = img.shape[:2] - - if self.keep_res: 
- # True in detection eval/infer - input_h = (h | self.pad) + 1 - input_w = (w | self.pad) + 1 - s = np.array([input_w, input_h], dtype=np.float32) - c = np.array([w // 2, h // 2], dtype=np.float32) - else: - # False in centertrack eval_mot/eval_mot - s = max(h, w) * 1.0 - input_h, input_w = self.input_h, self.input_w - c = np.array([w / 2., h / 2.], dtype=np.float32) - - trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) - img = cv2.resize(img, (w, h)) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) - sample['image'] = inp - - if not self.keep_res: - out_h = input_h // self.down_ratio - out_w = input_w // self.down_ratio - trans_output = get_affine_transform(c, s, 0, [out_w, out_h]) - - sample.update({ - 'center': c, - 'scale': s, - 'out_height': out_h, - 'out_width': out_w, - 'inp_height': input_h, - 'inp_width': input_w, - 'trans_input': trans_input, - 'trans_output': trans_output, - }) - return sample - - -@register_op -class FlipWarpAffine(BaseOperator): - def __init__(self, - keep_res=False, - pad=31, - input_h=512, - input_w=512, - not_rand_crop=False, - scale=0.4, - shift=0.1, - flip=0.5, - is_scale=True, - use_random=True, - add_pre_img=False): - """FlipWarpAffine - 1. Random Crop - 2. Flip the image horizontal - 3. Warp affine the image - 4. (Optinal) Add previous image - """ - super(FlipWarpAffine, self).__init__() - self.keep_res = keep_res - self.pad = pad - self.input_h = input_h - self.input_w = input_w - self.not_rand_crop = not_rand_crop - self.scale = scale - self.shift = shift - self.flip = flip - self.is_scale = is_scale - self.use_random = use_random - self.add_pre_img = add_pre_img - - def __call__(self, samples, context=None): - if self.add_pre_img: - assert isinstance(samples, Sequence) and len(samples) == 2 - sample, pre_sample = samples[0], samples[1] - else: - sample = samples - - img = sample['image'] - img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) - if 'gt_bbox' in sample and len(sample['gt_bbox']) == 0: - return sample - - h, w = img.shape[:2] - flipped = 0 - - if self.keep_res: - input_h = (h | self.pad) + 1 - input_w = (w | self.pad) + 1 - s = np.array([input_w, input_h], dtype=np.float32) - c = np.array([w // 2, h // 2], dtype=np.float32) - else: - # centernet training default - s = max(h, w) * 1.0 - input_h, input_w = self.input_h, self.input_w - c = np.array([w / 2., h / 2.], dtype=np.float32) - - if self.use_random: - gt_bbox = sample['gt_bbox'] - if not self.not_rand_crop: - # centernet default - s = s * np.random.choice(np.arange(0.6, 1.4, 0.1)) - w_border = get_border(128, w) - h_border = get_border(128, h) - c[0] = np.random.randint(low=w_border, high=w - w_border) - c[1] = np.random.randint(low=h_border, high=h - h_border) - else: - sf = self.scale - cf = self.shift - c[0] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) - c[1] += s * np.clip(np.random.randn() * cf, -2 * cf, 2 * cf) - s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - - if np.random.random() < self.flip: - img = img[:, ::-1, :] - c[0] = w - c[0] - 1 - oldx1 = gt_bbox[:, 0].copy() - oldx2 = gt_bbox[:, 2].copy() - gt_bbox[:, 0] = w - oldx2 - 1 - gt_bbox[:, 2] = w - oldx1 - 1 - flipped = 1 - sample['gt_bbox'] = gt_bbox - - trans_input = get_affine_transform(c, s, 0, [input_w, input_h]) - inp = cv2.warpAffine( - img, trans_input, (input_w, input_h), flags=cv2.INTER_LINEAR) - if self.is_scale: - inp = (inp.astype(np.float32) / 255.) 
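
WarpAffine above leans on ppdet's get_affine_transform(center, scale, rot, output_size). As a rough sketch of the rotation-free case (my own reconstruction with plain OpenCV, not the patch's helper), the transform maps the scale x scale square around `center` onto the output canvas:

import cv2
import numpy as np

def center_scale_affine(center, scale, out_w, out_h):
    # Three point correspondences pin down the affine map from the square
    # [cx-s/2, cx+s/2] x [cy-s/2, cy+s/2] to [0, out_w] x [0, out_h].
    cx, cy = center
    src = np.float32([[cx, cy],
                      [cx, cy - scale / 2.],
                      [cx - scale / 2., cy - scale / 2.]])
    dst = np.float32([[out_w / 2., out_h / 2.],
                      [out_w / 2., 0.],
                      [0., 0.]])
    return cv2.getAffineTransform(src, dst)

# For a hypothetical 720x1280 frame: s = max(h, w) = 1280, c = (640, 360),
# M = center_scale_affine((640, 360), 1280, 512, 512)
# inp = cv2.warpAffine(img, M, (512, 512), flags=cv2.INTER_LINEAR)
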
- - sample['image'] = inp - sample['center'] = c - sample['scale'] = s - - if self.add_pre_img: - sample['trans_input'] = trans_input - - # previous image, use same aug trans_input as current image - pre_img = pre_sample['image'] - pre_img = cv2.cvtColor(pre_img, cv2.COLOR_RGB2BGR) - if flipped: - pre_img = pre_img[:, ::-1, :].copy() - pre_inp = cv2.warpAffine( - pre_img, - trans_input, (input_w, input_h), - flags=cv2.INTER_LINEAR) - if self.is_scale: - pre_inp = (pre_inp.astype(np.float32) / 255.) - sample['pre_image'] = pre_inp - - # if empty gt_bbox - if 'gt_bbox' in pre_sample and len(pre_sample['gt_bbox']) == 0: - return sample - pre_gt_bbox = pre_sample['gt_bbox'] - if flipped: - pre_oldx1 = pre_gt_bbox[:, 0].copy() - pre_oldx2 = pre_gt_bbox[:, 2].copy() - pre_gt_bbox[:, 0] = w - pre_oldx1 - 1 - pre_gt_bbox[:, 2] = w - pre_oldx2 - 1 - sample['pre_gt_bbox'] = pre_gt_bbox - - sample['pre_gt_class'] = pre_sample['gt_class'] - sample['pre_gt_track_id'] = pre_sample['gt_track_id'] - del pre_sample - - return sample - - -@register_op -class CenterRandColor(BaseOperator): - """Random color for CenterNet series models. - Args: - saturation (float): saturation settings. - contrast (float): contrast settings. - brightness (float): brightness settings. - """ - - def __init__(self, saturation=0.4, contrast=0.4, brightness=0.4): - super(CenterRandColor, self).__init__() - self.saturation = saturation - self.contrast = contrast - self.brightness = brightness - - def apply_saturation(self, img, img_gray): - alpha = 1. + np.random.uniform( - low=-self.saturation, high=self.saturation) - self._blend(alpha, img, img_gray[:, :, None]) - return img - - def apply_contrast(self, img, img_gray): - alpha = 1. + np.random.uniform(low=-self.contrast, high=self.contrast) - img_mean = img_gray.mean() - self._blend(alpha, img, img_mean) - return img - - def apply_brightness(self, img, img_gray): - alpha = 1 + np.random.uniform( - low=-self.brightness, high=self.brightness) - img *= alpha - return img - - def _blend(self, alpha, img, img_mean): - img *= alpha - img_mean *= (1 - alpha) - img += img_mean - - def apply(self, sample, context=None): - functions = [ - self.apply_brightness, - self.apply_contrast, - self.apply_saturation, - ] - - img = sample['image'] - img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - distortions = np.random.permutation(functions) - for func in distortions: - img = func(img, img_gray) - sample['image'] = img - - if 'pre_image' in sample: - pre_img = sample['pre_image'] - pre_img_gray = cv2.cvtColor(pre_img, cv2.COLOR_BGR2GRAY) - pre_distortions = np.random.permutation(functions) - for func in pre_distortions: - pre_img = func(pre_img, pre_img_gray) - sample['pre_image'] = pre_img - - return sample - - -@register_op -class Mosaic(BaseOperator): - """ Mosaic operator for image and gt_bboxes - The code is based on https://github.com/Megvii-BaseDetection/YOLOX/blob/main/yolox/data/datasets/mosaicdetection.py - - 1. get mosaic coords - 2. clip bbox and get mosaic_labels - 3. random_affine augment - 4. 
Mixup augment as copypaste (optinal), not used in tiny/nano - - Args: - prob (float): probability of using Mosaic, 1.0 as default - input_dim (list[int]): input shape - degrees (list[2]): the rotate range to apply, transform range is [min, max] - translate (list[2]): the translate range to apply, transform range is [min, max] - scale (list[2]): the scale range to apply, transform range is [min, max] - shear (list[2]): the shear range to apply, transform range is [min, max] - enable_mixup (bool): whether to enable Mixup or not - mixup_prob (float): probability of using Mixup, 1.0 as default - mixup_scale (list[int]): scale range of Mixup - remove_outside_box (bool): whether remove outside boxes, False as - default in COCO dataset, True in MOT dataset - """ - - def __init__(self, - prob=1.0, - input_dim=[640, 640], - degrees=[-10, 10], - translate=[-0.1, 0.1], - scale=[0.1, 2], - shear=[-2, 2], - enable_mixup=True, - mixup_prob=1.0, - mixup_scale=[0.5, 1.5], - remove_outside_box=False): - super(Mosaic, self).__init__() - self.prob = prob - if isinstance(input_dim, Integral): - input_dim = [input_dim, input_dim] - self.input_dim = input_dim - self.degrees = degrees - self.translate = translate - self.scale = scale - self.shear = shear - self.enable_mixup = enable_mixup - self.mixup_prob = mixup_prob - self.mixup_scale = mixup_scale - self.remove_outside_box = remove_outside_box - - def get_mosaic_coords(self, mosaic_idx, xc, yc, w, h, input_h, input_w): - # (x1, y1, x2, y2) means coords in large image, - # small_coords means coords in small image in mosaic aug. - if mosaic_idx == 0: - # top left - x1, y1, x2, y2 = max(xc - w, 0), max(yc - h, 0), xc, yc - small_coords = w - (x2 - x1), h - (y2 - y1), w, h - elif mosaic_idx == 1: - # top right - x1, y1, x2, y2 = xc, max(yc - h, 0), min(xc + w, input_w * 2), yc - small_coords = 0, h - (y2 - y1), min(w, x2 - x1), h - elif mosaic_idx == 2: - # bottom left - x1, y1, x2, y2 = max(xc - w, 0), yc, xc, min(input_h * 2, yc + h) - small_coords = w - (x2 - x1), 0, w, min(y2 - y1, h) - elif mosaic_idx == 3: - # bottom right - x1, y1, x2, y2 = xc, yc, min(xc + w, input_w * 2), min(input_h * 2, - yc + h) - small_coords = 0, 0, min(w, x2 - x1), min(y2 - y1, h) - - return (x1, y1, x2, y2), small_coords - - def random_affine_augment(self, - img, - labels=[], - input_dim=[640, 640], - degrees=[-10, 10], - scales=[0.1, 2], - shears=[-2, 2], - translates=[-0.1, 0.1]): - # random rotation and scale - degree = random.uniform(degrees[0], degrees[1]) - scale = random.uniform(scales[0], scales[1]) - assert scale > 0, "Argument scale should be positive." 
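
The quadrant bookkeeping in get_mosaic_coords above is easiest to see with concrete numbers. A small sketch (values hypothetical, mirroring the branches for mosaic_idx 0..3) of where each tile lands on the 2*input_h x 2*input_w canvas:

input_h = input_w = 640
xc, yc = 700, 600            # random mosaic center
w = h = 640                  # resized sub-image size
quads = {
    0: (max(xc - w, 0), max(yc - h, 0), xc, yc),                      # top left
    1: (xc, max(yc - h, 0), min(xc + w, input_w * 2), yc),            # top right
    2: (max(xc - w, 0), yc, xc, min(input_h * 2, yc + h)),            # bottom left
    3: (xc, yc, min(xc + w, input_w * 2), min(input_h * 2, yc + h)),  # bottom right
}
# quads[0] == (60, 0, 700, 600): the first image contributes only its
# bottom-right 640x600 crop, matching the small_coords computed above.
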
- R = cv2.getRotationMatrix2D(angle=degree, center=(0, 0), scale=scale) - M = np.ones([2, 3]) - - # random shear - shear = random.uniform(shears[0], shears[1]) - shear_x = math.tan(shear * math.pi / 180) - shear_y = math.tan(shear * math.pi / 180) - M[0] = R[0] + shear_y * R[1] - M[1] = R[1] + shear_x * R[0] - - # random translation - translate = random.uniform(translates[0], translates[1]) - translation_x = translate * input_dim[0] - translation_y = translate * input_dim[1] - M[0, 2] = translation_x - M[1, 2] = translation_y - - # warpAffine - img = cv2.warpAffine( - img, M, dsize=tuple(input_dim), borderValue=(114, 114, 114)) - - num_gts = len(labels) - if num_gts > 0: - # warp corner points - corner_points = np.ones((4 * num_gts, 3)) - corner_points[:, :2] = labels[:, [0, 1, 2, 3, 0, 3, 2, 1]].reshape( - 4 * num_gts, 2) # x1y1, x2y2, x1y2, x2y1 - # apply affine transform - corner_points = corner_points @M.T - corner_points = corner_points.reshape(num_gts, 8) - - # create new boxes - corner_xs = corner_points[:, 0::2] - corner_ys = corner_points[:, 1::2] - new_bboxes = np.concatenate((corner_xs.min(1), corner_ys.min(1), - corner_xs.max(1), corner_ys.max(1))) - new_bboxes = new_bboxes.reshape(4, num_gts).T - - # clip boxes - new_bboxes[:, 0::2] = np.clip(new_bboxes[:, 0::2], 0, input_dim[0]) - new_bboxes[:, 1::2] = np.clip(new_bboxes[:, 1::2], 0, input_dim[1]) - labels[:, :4] = new_bboxes - - return img, labels - - def __call__(self, sample, context=None): - if not isinstance(sample, Sequence): - return sample - - assert len( - sample) == 5, "Mosaic needs 5 samples, 4 for mosaic and 1 for mixup." - if np.random.uniform(0., 1.) > self.prob: - return sample[0] - - mosaic_gt_bbox, mosaic_gt_class, mosaic_is_crowd, mosaic_difficult = [], [], [], [] - input_h, input_w = self.input_dim - yc = int(random.uniform(0.5 * input_h, 1.5 * input_h)) - xc = int(random.uniform(0.5 * input_w, 1.5 * input_w)) - mosaic_img = np.full((input_h * 2, input_w * 2, 3), 114, dtype=np.uint8) - - # 1. get mosaic coords - for mosaic_idx, sp in enumerate(sample[:4]): - img = sp['image'] - gt_bbox = sp['gt_bbox'] - h0, w0 = img.shape[:2] - scale = min(1. * input_h / h0, 1. * input_w / w0) - img = cv2.resize( - img, (int(w0 * scale), int(h0 * scale)), - interpolation=cv2.INTER_LINEAR) - (h, w, c) = img.shape[:3] - - # suffix l means large image, while s means small image in mosaic aug. - (l_x1, l_y1, l_x2, l_y2), ( - s_x1, s_y1, s_x2, s_y2) = self.get_mosaic_coords( - mosaic_idx, xc, yc, w, h, input_h, input_w) - - mosaic_img[l_y1:l_y2, l_x1:l_x2] = img[s_y1:s_y2, s_x1:s_x2] - padw, padh = l_x1 - s_x1, l_y1 - s_y1 - - # Normalized xywh to pixel xyxy format - _gt_bbox = gt_bbox.copy() - if len(gt_bbox) > 0: - _gt_bbox[:, 0] = scale * gt_bbox[:, 0] + padw - _gt_bbox[:, 1] = scale * gt_bbox[:, 1] + padh - _gt_bbox[:, 2] = scale * gt_bbox[:, 2] + padw - _gt_bbox[:, 3] = scale * gt_bbox[:, 3] + padh - - mosaic_gt_bbox.append(_gt_bbox) - mosaic_gt_class.append(sp['gt_class']) - if 'is_crowd' in sp: - mosaic_is_crowd.append(sp['is_crowd']) - if 'difficult' in sp: - mosaic_difficult.append(sp['difficult']) - - # 2. 
clip bbox and get mosaic_labels([gt_bbox, gt_class, is_crowd]) - if len(mosaic_gt_bbox): - mosaic_gt_bbox = np.concatenate(mosaic_gt_bbox, 0) - mosaic_gt_class = np.concatenate(mosaic_gt_class, 0) - if mosaic_is_crowd: - mosaic_is_crowd = np.concatenate(mosaic_is_crowd, 0) - mosaic_labels = np.concatenate([ - mosaic_gt_bbox, - mosaic_gt_class.astype(mosaic_gt_bbox.dtype), - mosaic_is_crowd.astype(mosaic_gt_bbox.dtype) - ], 1) - elif mosaic_difficult: - mosaic_difficult = np.concatenate(mosaic_difficult, 0) - mosaic_labels = np.concatenate([ - mosaic_gt_bbox, - mosaic_gt_class.astype(mosaic_gt_bbox.dtype), - mosaic_difficult.astype(mosaic_gt_bbox.dtype) - ], 1) - else: - mosaic_labels = np.concatenate([ - mosaic_gt_bbox, mosaic_gt_class.astype(mosaic_gt_bbox.dtype) - ], 1) - if self.remove_outside_box: - # for MOT dataset - flag1 = mosaic_gt_bbox[:, 0] < 2 * input_w - flag2 = mosaic_gt_bbox[:, 2] > 0 - flag3 = mosaic_gt_bbox[:, 1] < 2 * input_h - flag4 = mosaic_gt_bbox[:, 3] > 0 - flag_all = flag1 * flag2 * flag3 * flag4 - mosaic_labels = mosaic_labels[flag_all] - else: - mosaic_labels[:, 0] = np.clip(mosaic_labels[:, 0], 0, - 2 * input_w) - mosaic_labels[:, 1] = np.clip(mosaic_labels[:, 1], 0, - 2 * input_h) - mosaic_labels[:, 2] = np.clip(mosaic_labels[:, 2], 0, - 2 * input_w) - mosaic_labels[:, 3] = np.clip(mosaic_labels[:, 3], 0, - 2 * input_h) - else: - mosaic_labels = np.zeros((1, 6)) - - # 3. random_affine augment - mosaic_img, mosaic_labels = self.random_affine_augment( - mosaic_img, - mosaic_labels, - input_dim=self.input_dim, - degrees=self.degrees, - translates=self.translate, - scales=self.scale, - shears=self.shear) - - # 4. Mixup augment as copypaste, https://arxiv.org/abs/2012.07177 - # optinal, not used(enable_mixup=False) in tiny/nano - if (self.enable_mixup and not len(mosaic_labels) == 0 and - random.random() < self.mixup_prob): - sample_mixup = sample[4] - mixup_img = sample_mixup['image'] - if 'is_crowd' in sample_mixup: - cp_labels = np.concatenate([ - sample_mixup['gt_bbox'], - sample_mixup['gt_class'].astype(mosaic_labels.dtype), - sample_mixup['is_crowd'].astype(mosaic_labels.dtype) - ], 1) - elif 'difficult' in sample_mixup: - cp_labels = np.concatenate([ - sample_mixup['gt_bbox'], - sample_mixup['gt_class'].astype(mosaic_labels.dtype), - sample_mixup['difficult'].astype(mosaic_labels.dtype) - ], 1) - else: - cp_labels = np.concatenate([ - sample_mixup['gt_bbox'], - sample_mixup['gt_class'].astype(mosaic_labels.dtype) - ], 1) - mosaic_img, mosaic_labels = self.mixup_augment( - mosaic_img, mosaic_labels, self.input_dim, cp_labels, mixup_img) - - sample0 = sample[0] - sample0['image'] = mosaic_img.astype(np.uint8) # can not be float32 - sample0['h'] = float(mosaic_img.shape[0]) - sample0['w'] = float(mosaic_img.shape[1]) - sample0['im_shape'][0] = sample0['h'] - sample0['im_shape'][1] = sample0['w'] - sample0['gt_bbox'] = mosaic_labels[:, :4].astype(np.float32) - sample0['gt_class'] = mosaic_labels[:, 4:5].astype(np.float32) - if 'is_crowd' in sample[0]: - sample0['is_crowd'] = mosaic_labels[:, 5:6].astype(np.float32) - if 'difficult' in sample[0]: - sample0['difficult'] = mosaic_labels[:, 5:6].astype(np.float32) - return sample0 - - def mixup_augment(self, origin_img, origin_labels, input_dim, cp_labels, - img): - jit_factor = random.uniform(*self.mixup_scale) - FLIP = random.uniform(0, 1) > 0.5 - if len(img.shape) == 3: - cp_img = np.ones( - (input_dim[0], input_dim[1], 3), dtype=np.uint8) * 114 - else: - cp_img = np.ones(input_dim, dtype=np.uint8) * 114 - - 
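
Downstream of the mosaic assembly above, labels travel as one float array with layout [x1, y1, x2, y2, class(, is_crowd|difficult)]. A tiny sketch with hypothetical values of how the columns are joined and later split back out:

import numpy as np

gt_bbox = np.array([[10., 20., 110., 220.]], dtype=np.float32)
gt_class = np.array([[3]], dtype=np.int32)
is_crowd = np.array([[0]], dtype=np.int32)

labels = np.concatenate(
    [gt_bbox, gt_class.astype(gt_bbox.dtype), is_crowd.astype(gt_bbox.dtype)],
    axis=1)
# labels -> [[ 10.  20. 110. 220.   3.   0.]]
# After augmentation the columns are split back out as gt_bbox = labels[:, :4],
# gt_class = labels[:, 4:5], is_crowd = labels[:, 5:6], as sample0 shows above.
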
cp_scale_ratio = min(input_dim[0] / img.shape[0], - input_dim[1] / img.shape[1]) - resized_img = cv2.resize( - img, (int(img.shape[1] * cp_scale_ratio), - int(img.shape[0] * cp_scale_ratio)), - interpolation=cv2.INTER_LINEAR) - - cp_img[:int(img.shape[0] * cp_scale_ratio), :int(img.shape[ - 1] * cp_scale_ratio)] = resized_img - - cp_img = cv2.resize(cp_img, (int(cp_img.shape[1] * jit_factor), - int(cp_img.shape[0] * jit_factor))) - cp_scale_ratio *= jit_factor - - if FLIP: - cp_img = cp_img[:, ::-1, :] - - origin_h, origin_w = cp_img.shape[:2] - target_h, target_w = origin_img.shape[:2] - padded_img = np.zeros( - (max(origin_h, target_h), max(origin_w, target_w), 3), - dtype=np.uint8) - padded_img[:origin_h, :origin_w] = cp_img - - x_offset, y_offset = 0, 0 - if padded_img.shape[0] > target_h: - y_offset = random.randint(0, padded_img.shape[0] - target_h - 1) - if padded_img.shape[1] > target_w: - x_offset = random.randint(0, padded_img.shape[1] - target_w - 1) - padded_cropped_img = padded_img[y_offset:y_offset + target_h, x_offset: - x_offset + target_w] - - # adjust boxes - cp_bboxes_origin_np = cp_labels[:, :4].copy() - cp_bboxes_origin_np[:, 0::2] = np.clip(cp_bboxes_origin_np[:, 0::2] * - cp_scale_ratio, 0, origin_w) - cp_bboxes_origin_np[:, 1::2] = np.clip(cp_bboxes_origin_np[:, 1::2] * - cp_scale_ratio, 0, origin_h) - - if FLIP: - cp_bboxes_origin_np[:, 0::2] = ( - origin_w - cp_bboxes_origin_np[:, 0::2][:, ::-1]) - cp_bboxes_transformed_np = cp_bboxes_origin_np.copy() - if self.remove_outside_box: - # for MOT dataset - cp_bboxes_transformed_np[:, 0::2] -= x_offset - cp_bboxes_transformed_np[:, 1::2] -= y_offset - else: - cp_bboxes_transformed_np[:, 0::2] = np.clip( - cp_bboxes_transformed_np[:, 0::2] - x_offset, 0, target_w) - cp_bboxes_transformed_np[:, 1::2] = np.clip( - cp_bboxes_transformed_np[:, 1::2] - y_offset, 0, target_h) - - cls_labels = cp_labels[:, 4:5].copy() - box_labels = cp_bboxes_transformed_np - if cp_labels.shape[-1] == 6: - crd_labels = cp_labels[:, 5:6].copy() - labels = np.hstack((box_labels, cls_labels, crd_labels)) - else: - labels = np.hstack((box_labels, cls_labels)) - if self.remove_outside_box: - labels = labels[labels[:, 0] < target_w] - labels = labels[labels[:, 2] > 0] - labels = labels[labels[:, 1] < target_h] - labels = labels[labels[:, 3] > 0] - - origin_labels = np.vstack((origin_labels, labels)) - origin_img = origin_img.astype(np.float32) - origin_img = 0.5 * origin_img + 0.5 * padded_cropped_img.astype( - np.float32) - - return origin_img.astype(np.uint8), origin_labels - - -@register_op -class PadResize(BaseOperator): - """ PadResize for image and gt_bbbox - - Args: - target_size (list[int]): input shape - fill_value (float): pixel value of padded image - """ - - def __init__(self, target_size, fill_value=114): - super(PadResize, self).__init__() - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - self.fill_value = fill_value - - def _resize(self, img, bboxes, labels): - ratio = min(self.target_size[0] / img.shape[0], - self.target_size[1] / img.shape[1]) - w, h = int(img.shape[1] * ratio), int(img.shape[0] * ratio) - resized_img = cv2.resize(img, (w, h), interpolation=cv2.INTER_LINEAR) - - if len(bboxes) > 0: - bboxes *= ratio - mask = np.minimum(bboxes[:, 2] - bboxes[:, 0], - bboxes[:, 3] - bboxes[:, 1]) > 1 - bboxes = bboxes[mask] - labels = labels[mask] - return resized_img, bboxes, labels - - def _pad(self, img): - h, w, _ = img.shape - if h == self.target_size[0] and w == 
self.target_size[1]: - return img - padded_img = np.full( - (self.target_size[0], self.target_size[1], 3), - self.fill_value, - dtype=np.uint8) - padded_img[:h, :w] = img - return padded_img - - def apply(self, sample, context=None): - image = sample['image'] - bboxes = sample['gt_bbox'] - labels = sample['gt_class'] - image, bboxes, labels = self._resize(image, bboxes, labels) - sample['image'] = self._pad(image).astype(np.float32) - sample['gt_bbox'] = bboxes - sample['gt_class'] = labels - return sample - - -@register_op -class RandomShift(BaseOperator): - """ - Randomly shift image - - Args: - prob (float): probability to do random shift. - max_shift (int): max shift pixels - filter_thr (int): filter gt bboxes if one side is smaller than this - """ - - def __init__(self, prob=0.5, max_shift=32, filter_thr=1): - super(RandomShift, self).__init__() - self.prob = prob - self.max_shift = max_shift - self.filter_thr = filter_thr - - def calc_shift_coor(self, im_h, im_w, shift_h, shift_w): - return [ - max(0, shift_w), max(0, shift_h), min(im_w, im_w + shift_w), - min(im_h, im_h + shift_h) - ] - - def apply(self, sample, context=None): - if random.random() > self.prob: - return sample - - im = sample['image'] - gt_bbox = sample['gt_bbox'] - gt_class = sample['gt_class'] - im_h, im_w = im.shape[:2] - shift_h = random.randint(-self.max_shift, self.max_shift) - shift_w = random.randint(-self.max_shift, self.max_shift) - - gt_bbox[:, 0::2] += shift_w - gt_bbox[:, 1::2] += shift_h - gt_bbox[:, 0::2] = np.clip(gt_bbox[:, 0::2], 0, im_w) - gt_bbox[:, 1::2] = np.clip(gt_bbox[:, 1::2], 0, im_h) - gt_bbox_h = gt_bbox[:, 2] - gt_bbox[:, 0] - gt_bbox_w = gt_bbox[:, 3] - gt_bbox[:, 1] - keep = (gt_bbox_w > self.filter_thr) & (gt_bbox_h > self.filter_thr) - if not keep.any(): - return sample - - gt_bbox = gt_bbox[keep] - gt_class = gt_class[keep] - - # shift image - coor_new = self.calc_shift_coor(im_h, im_w, shift_h, shift_w) - # shift frame to the opposite direction - coor_old = self.calc_shift_coor(im_h, im_w, -shift_h, -shift_w) - canvas = np.zeros_like(im) - canvas[coor_new[1]:coor_new[3], coor_new[0]:coor_new[2]] \ - = im[coor_old[1]:coor_old[3], coor_old[0]:coor_old[2]] - - sample['image'] = canvas - sample['gt_bbox'] = gt_bbox - sample['gt_class'] = gt_class - return sample - - -@register_op -class StrongAugImage(BaseOperator): - def __init__(self, transforms): - super(StrongAugImage, self).__init__() - self.transforms = Compose(transforms) - - def apply(self, sample, context=None): - im = sample - im['image'] = sample['image'].astype('uint8') - results = self.transforms(im) - sample['image'] = results['image'].astype('uint8') - return sample - - -@register_op -class RandomColorJitter(BaseOperator): - def __init__(self, - prob=0.8, - brightness=0.4, - contrast=0.4, - saturation=0.4, - hue=0.1): - super(RandomColorJitter, self).__init__() - self.prob = prob - self.brightness = brightness - self.contrast = contrast - self.saturation = saturation - self.hue = hue - - def apply(self, sample, context=None): - if np.random.uniform(0, 1) < self.prob: - from paddle.vision.transforms import ColorJitter - transform = ColorJitter(self.brightness, self.contrast, - self.saturation, self.hue) - sample['image'] = transform(sample['image'].astype(np.uint8)) - sample['image'] = sample['image'].astype(np.float32) - return sample - - -@register_op -class RandomGrayscale(BaseOperator): - def __init__(self, prob=0.2): - super(RandomGrayscale, self).__init__() - self.prob = prob - - def apply(self, sample, 
context=None): - if np.random.uniform(0, 1) < self.prob: - from paddle.vision.transforms import Grayscale - transform = Grayscale(num_output_channels=3) - sample['image'] = transform(sample['image']) - return sample - - -@register_op -class RandomGaussianBlur(BaseOperator): - def __init__(self, prob=0.5, sigma=[0.1, 2.0]): - super(RandomGaussianBlur, self).__init__() - self.prob = prob - self.sigma = sigma - - def apply(self, sample, context=None): - if np.random.uniform(0, 1) < self.prob: - sigma = np.random.uniform(self.sigma[0], self.sigma[1]) - im = cv2.GaussianBlur(sample['image'], (23, 23), sigma) - sample['image'] = im - return sample - - -@register_op -class RandomErasing(BaseOperator): - def __init__(self, - prob=0.5, - scale=(0.02, 0.33), - ratio=(0.3, 3.3), - value=0, - inplace=False): - super(RandomErasing, self).__init__() - assert isinstance(scale, - (tuple, list)), "scale should be a tuple or list" - assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] - ), "scale should be of kind (min, max) and in range [0, 1]" - assert isinstance(ratio, - (tuple, list)), "ratio should be a tuple or list" - assert (ratio[0] >= 0 and - ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" - assert isinstance( - value, (Number, str, tuple, - list)), "value should be a number, tuple, list or str" - if isinstance(value, str) and value != "random": - raise ValueError("value must be 'random' when type is str") - self.prob = prob - self.scale = scale - self.ratio = ratio - self.value = value - self.inplace = inplace - - def _erase(self, img, i, j, h, w, v, inplace=False): - if not inplace: - img = img.copy() - img[i:i + h, j:j + w, ...] = v - return img - - def _get_param(self, img, scale, ratio, value): - shape = np.asarray(img).astype(np.uint8).shape - h, w, c = shape[-3], shape[-2], shape[-1] - img_area = h * w - log_ratio = np.log(ratio) - for _ in range(1): - erase_area = np.random.uniform(*scale) * img_area - aspect_ratio = np.exp(np.random.uniform(*log_ratio)) - erase_h = int(round(np.sqrt(erase_area * aspect_ratio))) - erase_w = int(round(np.sqrt(erase_area / aspect_ratio))) - if erase_h >= h or erase_w >= w: - continue - - if value is None: - v = np.random.normal(size=[erase_h, erase_w, c]) * 255 - else: - v = np.array(value)[None, None, :] - top = np.random.randint(0, h - erase_h + 1) - left = np.random.randint(0, w - erase_w + 1) - return top, left, erase_h, erase_w, v - return 0, 0, h, w, img - - def apply(self, sample, context=None): - if random.random() < self.prob: - if isinstance(self.value, Number): - value = [self.value] - elif isinstance(self.value, str): - value = None - else: - value = self.value - if value is not None and not (len(value) == 1 or len(value) == 3): - raise ValueError( - "Value should be a single number or a sequence with length equals to image's channel." 
- ) - im = sample['image'] - top, left, erase_h, erase_w, v = self._get_param(im, self.scale, - self.ratio, value) - im = self._erase(im, top, left, erase_h, erase_w, v, self.inplace) - sample['image'] = im - return sample - - -@register_op -class RandomErasingCrop(BaseOperator): - def __init__(self): - super(RandomErasingCrop, self).__init__() - self.transform1 = RandomErasing( - prob=0.7, scale=(0.05, 0.2), ratio=(0.3, 3.3), value="random") - self.transform2 = RandomErasing( - prob=0.5, scale=(0.05, 0.2), ratio=(0.1, 6), value="random") - self.transform3 = RandomErasing( - prob=0.3, scale=(0.05, 0.2), ratio=(0.05, 8), value="random") - - def apply(self, sample, context=None): - sample = self.transform1(sample) - sample = self.transform2(sample) - sample = self.transform3(sample) - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/transform/rotated_operators.py b/pdfdet/models/Paddle/ppdet/data/transform/rotated_operators.py deleted file mode 100644 index 5e9cebb..0000000 --- a/pdfdet/models/Paddle/ppdet/data/transform/rotated_operators.py +++ /dev/null @@ -1,480 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -from numbers import Number, Integral - -import cv2 -import numpy as np -import math -import copy - -from .operators import register_op, BaseOperator -from ppdet.modeling.rbox_utils import poly2rbox_le135_np, poly2rbox_oc_np, rbox2poly_np -from ppdet.utils.logger import setup_logger -from ppdet.utils.compact import imagedraw_textsize_c -logger = setup_logger(__name__) - - -@register_op -class RRotate(BaseOperator): - """ Rotate Image, Polygon, Box - - Args: - scale (float): rotate scale - angle (float): rotate angle - fill_value (int, tuple): fill color - auto_bound (bool): whether auto bound or not - """ - - def __init__(self, scale=1.0, angle=0., fill_value=0., auto_bound=True): - super(RRotate, self).__init__() - self.scale = scale - self.angle = angle - self.fill_value = fill_value - self.auto_bound = auto_bound - - def get_rotated_matrix(self, angle, scale, h, w): - center = ((w - 1) * 0.5, (h - 1) * 0.5) - matrix = cv2.getRotationMatrix2D(center, -angle, scale) - # calculate the new size - cos = np.abs(matrix[0, 0]) - sin = np.abs(matrix[0, 1]) - new_w = h * sin + w * cos - new_h = h * cos + w * sin - # calculate offset - n_w = int(np.round(new_w)) - n_h = int(np.round(new_h)) - if self.auto_bound: - ratio = min(w / n_w, h / n_h) - matrix = cv2.getRotationMatrix2D(center, -angle, ratio) - else: - matrix[0, 2] += (new_w - w) * 0.5 - matrix[1, 2] += (new_h - h) * 0.5 - w = n_w - h = n_h - return matrix, h, w - - def get_rect_from_pts(self, pts, h, w): - """ get minimum rectangle of points - """ - assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' - min_x, min_y = 
np.min(pts[:, 0::2], axis=1), np.min(pts[:, 1::2], - axis=1) - max_x, max_y = np.max(pts[:, 0::2], axis=1), np.max(pts[:, 1::2], - axis=1) - min_x, min_y = np.clip(min_x, 0, w), np.clip(min_y, 0, h) - max_x, max_y = np.clip(max_x, 0, w), np.clip(max_y, 0, h) - boxes = np.stack([min_x, min_y, max_x, max_y], axis=-1) - return boxes - - def apply_image(self, image, matrix, h, w): - return cv2.warpAffine( - image, matrix, (w, h), borderValue=self.fill_value) - - def apply_pts(self, pts, matrix, h, w): - assert pts.shape[-1] % 2 == 0, 'the dim of input [pts] is not correct' - # n is number of samples and m is two times the number of points due to (x, y) - _, m = pts.shape - # transpose points - pts_ = pts.reshape(-1, 2).T - # pad 1 to convert the points to homogeneous coordinates - padding = np.ones((1, pts_.shape[1]), pts.dtype) - rotated_pts = np.matmul(matrix, np.concatenate((pts_, padding), axis=0)) - return rotated_pts[:2, :].T.reshape(-1, m) - - def apply(self, sample, context=None): - image = sample['image'] - h, w = image.shape[:2] - matrix, h, w = self.get_rotated_matrix(self.angle, self.scale, h, w) - sample['image'] = self.apply_image(image, matrix, h, w) - polys = sample['gt_poly'] - # TODO: segment or keypoint to be processed - if len(polys) > 0: - pts = self.apply_pts(polys, matrix, h, w) - sample['gt_poly'] = pts - sample['gt_bbox'] = self.get_rect_from_pts(pts, h, w) - - return sample - - -@register_op -class RandomRRotate(BaseOperator): - """ Random Rotate Image - Args: - scale (float, tuple, list): rotate scale - scale_mode (str): mode of scale, [range, value, None] - angle (float, tuple, list): rotate angle - angle_mode (str): mode of angle, [range, value, None] - fill_value (float, tuple, list): fill value - rotate_prob (float): probability of rotation - auto_bound (bool): whether auto bound or not - """ - - def __init__(self, - scale=1.0, - scale_mode=None, - angle=0., - angle_mode=None, - fill_value=0., - rotate_prob=1.0, - auto_bound=True): - super(RandomRRotate, self).__init__() - self.scale = scale - self.scale_mode = scale_mode - self.angle = angle - self.angle_mode = angle_mode - self.fill_value = fill_value - self.rotate_prob = rotate_prob - self.auto_bound = auto_bound - - def get_angle(self, angle, angle_mode): - assert not angle_mode or angle_mode in [ - 'range', 'value' - ], 'angle mode should be in [range, value, None]' - if not angle_mode: - return angle - elif angle_mode == 'range': - low, high = angle - return np.random.rand() * (high - low) + low - elif angle_mode == 'value': - return np.random.choice(angle) - - def get_scale(self, scale, scale_mode): - assert not scale_mode or scale_mode in [ - 'range', 'value' - ], 'scale mode should be in [range, value, None]' - if not scale_mode: - return scale - elif scale_mode == 'range': - low, high = scale - return np.random.rand() * (high - low) + low - elif scale_mode == 'value': - return np.random.choice(scale) - - def apply(self, sample, context=None): - if np.random.rand() > self.rotate_prob: - return sample - - angle = self.get_angle(self.angle, self.angle_mode) - scale = self.get_scale(self.scale, self.scale_mode) - rotator = RRotate(scale, angle, self.fill_value, self.auto_bound) - return rotator(sample) - - -@register_op -class Poly2RBox(BaseOperator): - """ Polygon to Rotated Box, using new OpenCV definition since 4.5.1 - - Args: - filter_threshold (int, float): threshold to filter annotations - filter_mode (str): filter mode, ['area', 'edge'] - rbox_type (str): rbox type, ['le135', 'oc'] - - """ - - def 
__init__(self, filter_threshold=4, filter_mode=None, rbox_type='le135'): - super(Poly2RBox, self).__init__() - self.filter_fn = lambda size: self.filter(size, filter_threshold, filter_mode) - self.rbox_fn = poly2rbox_le135_np if rbox_type == 'le135' else poly2rbox_oc_np - - def filter(self, size, threshold, mode): - if mode == 'area': - if size[0] * size[1] < threshold: - return True - elif mode == 'edge': - if min(size) < threshold: - return True - return False - - def get_rbox(self, polys): - valid_ids, rboxes, bboxes = [], [], [] - for i, poly in enumerate(polys): - cx, cy, w, h, angle = self.rbox_fn(poly) - if self.filter_fn((w, h)): - continue - rboxes.append(np.array([cx, cy, w, h, angle], dtype=np.float32)) - valid_ids.append(i) - xmin, ymin = min(poly[0::2]), min(poly[1::2]) - xmax, ymax = max(poly[0::2]), max(poly[1::2]) - bboxes.append(np.array([xmin, ymin, xmax, ymax], dtype=np.float32)) - - if len(valid_ids) == 0: - rboxes = np.zeros((0, 5), dtype=np.float32) - bboxes = np.zeros((0, 4), dtype=np.float32) - else: - rboxes = np.stack(rboxes) - bboxes = np.stack(bboxes) - - return rboxes, bboxes, valid_ids - - def apply(self, sample, context=None): - rboxes, bboxes, valid_ids = self.get_rbox(sample['gt_poly']) - sample['gt_rbox'] = rboxes - sample['gt_bbox'] = bboxes - for k in ['gt_class', 'gt_score', 'gt_poly', 'is_crowd', 'difficult']: - if k in sample: - sample[k] = sample[k][valid_ids] - - return sample - - -@register_op -class Poly2Array(BaseOperator): - """ convert gt_poly to np.array for rotated bboxes - """ - - def __init__(self): - super(Poly2Array, self).__init__() - - def apply(self, sample, context=None): - if 'gt_poly' in sample: - sample['gt_poly'] = np.array( - sample['gt_poly'], dtype=np.float32).reshape((-1, 8)) - - return sample - - -@register_op -class RResize(BaseOperator): - def __init__(self, target_size, keep_ratio, interp=cv2.INTER_LINEAR): - """ - Resize image to target size. if keep_ratio is True, - resize the image's long side to the maximum of target_size - if keep_ratio is False, resize the image to target size(h, w) - Args: - target_size (int|list): image target size - keep_ratio (bool): whether keep_ratio or not, default true - interp (int): the interpolation method - """ - super(RResize, self).__init__() - self.keep_ratio = keep_ratio - self.interp = interp - if not isinstance(target_size, (Integral, Sequence)): - raise TypeError( - "Type of target_size is invalid. Must be Integer or List or Tuple, now is {}". - format(type(target_size))) - if isinstance(target_size, Integral): - target_size = [target_size, target_size] - self.target_size = target_size - - def apply_image(self, image, scale): - im_scale_x, im_scale_y = scale - - return cv2.resize( - image, - None, - None, - fx=im_scale_x, - fy=im_scale_y, - interpolation=self.interp) - - def apply_pts(self, pts, scale, size): - im_scale_x, im_scale_y = scale - resize_w, resize_h = size - pts[:, 0::2] *= im_scale_x - pts[:, 1::2] *= im_scale_y - pts[:, 0::2] = np.clip(pts[:, 0::2], 0, resize_w) - pts[:, 1::2] = np.clip(pts[:, 1::2], 0, resize_h) - return pts - - def apply(self, sample, context=None): - """ Resize the image numpy. 
- """ - im = sample['image'] - if not isinstance(im, np.ndarray): - raise TypeError("{}: image type is not numpy.".format(self)) - if len(im.shape) != 3: - raise ImageError('{}: image is not 3-dimensional.'.format(self)) - - # apply image - im_shape = im.shape - if self.keep_ratio: - - im_size_min = np.min(im_shape[0:2]) - im_size_max = np.max(im_shape[0:2]) - - target_size_min = np.min(self.target_size) - target_size_max = np.max(self.target_size) - - im_scale = min(target_size_min / im_size_min, - target_size_max / im_size_max) - - resize_h = im_scale * float(im_shape[0]) - resize_w = im_scale * float(im_shape[1]) - - im_scale_x = im_scale - im_scale_y = im_scale - else: - resize_h, resize_w = self.target_size - im_scale_y = resize_h / im_shape[0] - im_scale_x = resize_w / im_shape[1] - - im = self.apply_image(sample['image'], [im_scale_x, im_scale_y]) - sample['image'] = im.astype(np.float32) - sample['im_shape'] = np.asarray([resize_h, resize_w], dtype=np.float32) - if 'scale_factor' in sample: - scale_factor = sample['scale_factor'] - sample['scale_factor'] = np.asarray( - [scale_factor[0] * im_scale_y, scale_factor[1] * im_scale_x], - dtype=np.float32) - else: - sample['scale_factor'] = np.asarray( - [im_scale_y, im_scale_x], dtype=np.float32) - - # apply bbox - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - # apply polygon - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_pts(sample['gt_poly'], - [im_scale_x, im_scale_y], - [resize_w, resize_h]) - - return sample - - -@register_op -class RandomRFlip(BaseOperator): - def __init__(self, prob=0.5): - """ - Args: - prob (float): the probability of flipping image - """ - super(RandomRFlip, self).__init__() - self.prob = prob - if not (isinstance(self.prob, float)): - raise TypeError("{}: input type is invalid.".format(self)) - - def apply_image(self, image): - return image[:, ::-1, :] - - def apply_pts(self, pts, width): - oldx = pts[:, 0::2].copy() - pts[:, 0::2] = width - oldx - 1 - return pts - - def apply(self, sample, context=None): - """Filp the image and bounding box. - Operators: - 1. Flip the image numpy. - 2. Transform the bboxes' x coordinates. - (Must judge whether the coordinates are normalized!) - 3. Transform the segmentations' x coordinates. - (Must judge whether the coordinates are normalized!) - Output: - sample: the image, bounding box and segmentation part - in sample are flipped. - """ - if np.random.uniform(0, 1) < self.prob: - im = sample['image'] - height, width = im.shape[:2] - im = self.apply_image(im) - if 'gt_bbox' in sample and len(sample['gt_bbox']) > 0: - sample['gt_bbox'] = self.apply_pts(sample['gt_bbox'], width) - if 'gt_poly' in sample and len(sample['gt_poly']) > 0: - sample['gt_poly'] = self.apply_pts(sample['gt_poly'], width) - - sample['flipped'] = True - sample['image'] = im - return sample - - -@register_op -class VisibleRBox(BaseOperator): - """ - In debug mode, visualize images according to `gt_box`. - (Currently only supported when not cropping and flipping image.) 
- """ - - def __init__(self, output_dir='debug'): - super(VisibleRBox, self).__init__() - self.output_dir = output_dir - if not os.path.isdir(output_dir): - os.makedirs(output_dir) - - def apply(self, sample, context=None): - image = Image.fromarray(sample['image'].astype(np.uint8)) - out_file_name = '{:012d}.jpg'.format(sample['im_id'][0]) - width = sample['w'] - height = sample['h'] - # gt_poly = sample['gt_rbox'] - gt_poly = sample['gt_poly'] - gt_class = sample['gt_class'] - draw = ImageDraw.Draw(image) - for i in range(gt_poly.shape[0]): - x1, y1, x2, y2, x3, y3, x4, y4 = gt_poly[i] - draw.line( - [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], - width=2, - fill='green') - # draw label - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - text = str(gt_class[i][0]) - tw, th = imagedraw_textsize_c(draw, text) - draw.rectangle( - [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill='green') - draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255)) - - if 'gt_keypoint' in sample.keys(): - gt_keypoint = sample['gt_keypoint'] - if self.is_normalized: - for i in range(gt_keypoint.shape[1]): - if i % 2: - gt_keypoint[:, i] = gt_keypoint[:, i] * height - else: - gt_keypoint[:, i] = gt_keypoint[:, i] * width - for i in range(gt_keypoint.shape[0]): - keypoint = gt_keypoint[i] - for j in range(int(keypoint.shape[0] / 2)): - x1 = round(keypoint[2 * j]).astype(np.int32) - y1 = round(keypoint[2 * j + 1]).astype(np.int32) - draw.ellipse( - (x1, y1, x1 + 5, y1 + 5), fill='green', outline='green') - save_path = os.path.join(self.output_dir, out_file_name) - image.save(save_path, quality=95) - return sample - - -@register_op -class Rbox2Poly(BaseOperator): - """ - Convert rbbox format to poly format. - """ - - def __init__(self): - super(Rbox2Poly, self).__init__() - - def apply(self, sample, context=None): - assert 'gt_rbox' in sample - assert sample['gt_rbox'].shape[1] == 5 - rboxes = sample['gt_rbox'] - polys = rbox2poly_np(rboxes) - sample['gt_poly'] = polys - xmin, ymin = polys[:, 0::2].min(1), polys[:, 1::2].min(1) - xmax, ymax = polys[:, 0::2].max(1), polys[:, 1::2].max(1) - sample['gt_bbox'] = np.stack([xmin, ymin, xmin, ymin], axis=1) - return sample diff --git a/pdfdet/models/Paddle/ppdet/data/utils.py b/pdfdet/models/Paddle/ppdet/data/utils.py deleted file mode 100644 index 02573e6..0000000 --- a/pdfdet/models/Paddle/ppdet/data/utils.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-
-import paddle
-import numbers
-import numpy as np
-
-try:
-    from collections.abc import Sequence, Mapping
-except:
-    from collections import Sequence, Mapping
-
-
-def default_collate_fn(batch):
-    """
-    Default batch collating function for :code:`paddle.io.DataLoader`.
-    It gets input data as a list of samples; each element in the list is
-    the data of one sample, and sample data should be composed of list,
-    dictionary, string, number or numpy array. This function parses input
-    data recursively and stacks number, numpy array and paddle.Tensor
-    fields into batch data. e.g. for the following input data:
-    [{'image': np.array(shape=[3, 224, 224]), 'label': 1},
-     {'image': np.array(shape=[3, 224, 224]), 'label': 3},
-     {'image': np.array(shape=[3, 224, 224]), 'label': 4},
-     {'image': np.array(shape=[3, 224, 224]), 'label': 5},]
-
-
-    This default collate function zips each number and numpy array
-    field together and stacks each field as a batch field as follows:
-    {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])}
-    Args:
-        batch (list of sample data): batch should be a list of sample data.
-
-    Returns:
-        Batched data: each number, numpy array and paddle.Tensor in the
-                      input data is batched.
-    """
-    sample = batch[0]
-    if isinstance(sample, np.ndarray):
-        batch = np.stack(batch, axis=0)
-        return batch
-    elif isinstance(sample, numbers.Number):
-        batch = np.array(batch)
-        return batch
-    elif isinstance(sample, (str, bytes)):
-        return batch
-    elif isinstance(sample, Mapping):
-        return {
-            key: default_collate_fn([d[key] for d in batch])
-            for key in sample
-        }
-    elif isinstance(sample, Sequence):
-        sample_fields_num = len(sample)
-        if not all(len(sample) == sample_fields_num for sample in iter(batch)):
-            raise RuntimeError(
-                "fields number not the same among samples in a batch")
-        return [default_collate_fn(fields) for fields in zip(*batch)]
-
-    raise TypeError("batch data can only contain: tensor, numpy.ndarray, "
-                    "dict, list, number, but got {}".format(type(sample)))
diff --git a/pdfdet/models/Paddle/ppdet/engine/__init__.py b/pdfdet/models/Paddle/ppdet/engine/__init__.py
deleted file mode 100644
index 91166e8..0000000
--- a/pdfdet/models/Paddle/ppdet/engine/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import trainer
-from .trainer import *
-
-from . import trainer_cot
-from .trainer_cot import *
-
-from . import callbacks
-from .callbacks import *
-
-from . import env
-from .env import *
-
-__all__ = trainer.__all__ \
-          + callbacks.__all__ \
-          + env.__all__
-
-from . import tracker
-from .tracker import *
-__all__ = __all__ + tracker.__all__
-
-from .
import trainer_ssod -from .trainer_ssod import * -__all__ = __all__ + trainer_ssod.__all__ diff --git a/pdfdet/models/Paddle/ppdet/engine/callbacks.py b/pdfdet/models/Paddle/ppdet/engine/callbacks.py deleted file mode 100644 index 87dcd61..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/callbacks.py +++ /dev/null @@ -1,693 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import datetime -import six -import copy -import json - -import paddle -import paddle.distributed as dist - -from ppdet.utils.checkpoint import save_model, save_semi_model -from ppdet.metrics import get_infer_results - -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -__all__ = [ - 'Callback', 'ComposeCallback', 'LogPrinter', 'Checkpointer', - 'VisualDLWriter', 'SniperProposalsGenerator' -] - - -class Callback(object): - def __init__(self, model): - self.model = model - - def on_step_begin(self, status): - pass - - def on_step_end(self, status): - pass - - def on_epoch_begin(self, status): - pass - - def on_epoch_end(self, status): - pass - - def on_train_begin(self, status): - pass - - def on_train_end(self, status): - pass - - -class ComposeCallback(object): - def __init__(self, callbacks): - callbacks = [c for c in list(callbacks) if c is not None] - for c in callbacks: - assert isinstance( - c, Callback), "callback should be subclass of Callback" - self._callbacks = callbacks - - def on_step_begin(self, status): - for c in self._callbacks: - c.on_step_begin(status) - - def on_step_end(self, status): - for c in self._callbacks: - c.on_step_end(status) - - def on_epoch_begin(self, status): - for c in self._callbacks: - c.on_epoch_begin(status) - - def on_epoch_end(self, status): - for c in self._callbacks: - c.on_epoch_end(status) - - def on_train_begin(self, status): - for c in self._callbacks: - c.on_train_begin(status) - - def on_train_end(self, status): - for c in self._callbacks: - c.on_train_end(status) - - -class LogPrinter(Callback): - def __init__(self, model): - super(LogPrinter, self).__init__(model) - - def on_step_end(self, status): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - mode = status['mode'] - if mode == 'train': - epoch_id = status['epoch_id'] - step_id = status['step_id'] - steps_per_epoch = status['steps_per_epoch'] - training_staus = status['training_staus'] - batch_time = status['batch_time'] - data_time = status['data_time'] - - epoches = self.model.cfg.epoch - batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( - ))]['batch_size'] - - logs = training_staus.log() - space_fmt = ':' + str(len(str(steps_per_epoch))) + 'd' - if step_id % self.model.cfg.log_iter == 0: - eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id - eta_sec = eta_steps * batch_time.global_avg - eta_str = str(datetime.timedelta(seconds=int(eta_sec))) - 
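
The Callback/ComposeCallback contract defined above means a custom hook only overrides the events it cares about, and ComposeCallback fans each event out to every registered callback. A minimal sketch (EpochPrinter and `model` are hypothetical, not part of the patch):

class EpochPrinter(Callback):
    def on_epoch_end(self, status):
        # Only this event is customized; all other hooks inherit the no-ops.
        if status.get('mode') == 'train':
            print('finished epoch', status.get('epoch_id'))

# compose = ComposeCallback([LogPrinter(model), EpochPrinter(model)])
# compose.on_epoch_end({'mode': 'train', 'epoch_id': 0})
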
ips = float(batch_size) / batch_time.avg - fmt = ' '.join([ - 'Epoch: [{}]', - '[{' + space_fmt + '}/{}]', - 'learning_rate: {lr:.6f}', - '{meters}', - 'eta: {eta}', - 'batch_cost: {btime}', - 'data_cost: {dtime}', - 'ips: {ips:.4f} images/s', - ]) - fmt = fmt.format( - epoch_id, - step_id, - steps_per_epoch, - lr=status['learning_rate'], - meters=logs, - eta=eta_str, - btime=str(batch_time), - dtime=str(data_time), - ips=ips) - logger.info(fmt) - if mode == 'eval': - step_id = status['step_id'] - if step_id % 100 == 0: - logger.info("Eval iter: {}".format(step_id)) - - def on_epoch_end(self, status): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - mode = status['mode'] - if mode == 'eval': - sample_num = status['sample_num'] - cost_time = status['cost_time'] - logger.info('Total sample number: {}, average FPS: {}'.format( - sample_num, sample_num / cost_time)) - - -class Checkpointer(Callback): - def __init__(self, model): - super(Checkpointer, self).__init__(model) - self.best_ap = -1000. - self.save_dir = self.model.cfg.save_dir - if hasattr(self.model.model, 'student_model'): - self.weight = self.model.model.student_model - else: - self.weight = self.model.model - - def on_epoch_end(self, status): - # Checkpointer only performed during training - mode = status['mode'] - epoch_id = status['epoch_id'] - weight = None - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - end_epoch = self.model.cfg.epoch - if ( - epoch_id + 1 - ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: - save_name = str( - epoch_id) if epoch_id != end_epoch - 1 else "model_final" - weight = self.weight.state_dict() - elif mode == 'eval': - if 'save_best_model' in status and status['save_best_model']: - for metric in self.model._metrics: - map_res = metric.get_results() - eval_func = "ap" - if 'pose3d' in map_res: - key = 'pose3d' - eval_func = "mpjpe" - elif 'bbox' in map_res: - key = 'bbox' - elif 'keypoint' in map_res: - key = 'keypoint' - else: - key = 'mask' - if key not in map_res: - logger.warning("Evaluation results empty, this may be due to " \ - "training iterations being too few or not " \ - "loading the correct weights.") - return - if map_res[key][0] >= self.best_ap: - self.best_ap = map_res[key][0] - save_name = 'best_model' - weight = self.weight.state_dict() - logger.info("Best test {} {} is {:0.3f}.".format( - key, eval_func, abs(self.best_ap))) - if weight: - if self.model.use_ema: - exchange_save_model = status.get('exchange_save_model', - False) - if not exchange_save_model: - # save model and ema_model - save_model( - status['weight'], - self.model.optimizer, - self.save_dir, - save_name, - epoch_id + 1, - ema_model=weight) - else: - # save model(student model) and ema_model(teacher model) - # in DenseTeacher SSOD, the teacher model will be higher, - # so exchange when saving pdparams - student_model = status['weight'] # model - teacher_model = weight # ema_model - save_model( - teacher_model, - self.model.optimizer, - self.save_dir, - save_name, - epoch_id + 1, - ema_model=student_model) - del teacher_model - del student_model - else: - save_model(weight, self.model.optimizer, self.save_dir, - save_name, epoch_id + 1) - - -class WiferFaceEval(Callback): - def __init__(self, model): - super(WiferFaceEval, self).__init__(model) - - def on_epoch_begin(self, status): - assert self.model.mode == 'eval', \ - "WiferFaceEval can only be set during evaluation" - for metric in self.model._metrics: - metric.update(self.model.model) 
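
Checkpointer above picks which metric drives best-model selection by a fixed key priority: pose3d, then bbox, then keypoint, falling back to mask. A short sketch of that selection with a hypothetical eval result (the if/elif chain is collapsed into a loop here for brevity):

map_res = {'bbox': [0.512]}          # shape of metric.get_results()
for key in ('pose3d', 'bbox', 'keypoint', 'mask'):
    if key in map_res:
        break                        # here key == 'bbox'
best_ap = -1000.
if map_res[key][0] >= best_ap:
    best_ap = map_res[key][0]        # 0.512 -> checkpoint saved as 'best_model'
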
- sys.exit() - - -class VisualDLWriter(Callback): - """ - Use VisualDL to log data or image - """ - - def __init__(self, model): - super(VisualDLWriter, self).__init__(model) - - assert six.PY3, "VisualDL requires Python >= 3.5" - try: - from visualdl import LogWriter - except Exception as e: - logger.error('visualdl not found, plaese install visualdl. ' - 'for example: `pip install visualdl`.') - raise e - self.vdl_writer = LogWriter( - model.cfg.get('vdl_log_dir', 'vdl_log_dir/scalar')) - self.vdl_loss_step = 0 - self.vdl_mAP_step = 0 - self.vdl_image_step = 0 - self.vdl_image_frame = 0 - - def on_step_end(self, status): - mode = status['mode'] - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - training_staus = status['training_staus'] - for loss_name, loss_value in training_staus.get().items(): - self.vdl_writer.add_scalar(loss_name, loss_value, - self.vdl_loss_step) - self.vdl_loss_step += 1 - elif mode == 'test': - ori_image = status['original_image'] - result_image = status['result_image'] - self.vdl_writer.add_image( - "original/frame_{}".format(self.vdl_image_frame), ori_image, - self.vdl_image_step) - self.vdl_writer.add_image( - "result/frame_{}".format(self.vdl_image_frame), - result_image, self.vdl_image_step) - self.vdl_image_step += 1 - # each frame can display ten pictures at most. - if self.vdl_image_step % 10 == 0: - self.vdl_image_step = 0 - self.vdl_image_frame += 1 - - def on_epoch_end(self, status): - mode = status['mode'] - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'eval': - for metric in self.model._metrics: - for key, map_value in metric.get_results().items(): - self.vdl_writer.add_scalar("{}-mAP".format(key), - map_value[0], - self.vdl_mAP_step) - self.vdl_mAP_step += 1 - - -class WandbCallback(Callback): - def __init__(self, model): - super(WandbCallback, self).__init__(model) - - try: - import wandb - self.wandb = wandb - except Exception as e: - logger.error('wandb not found, please install wandb. ' - 'Use: `pip install wandb`.') - raise e - - self.wandb_params = model.cfg.get('wandb', None) - self.save_dir = self.model.cfg.save_dir - if self.wandb_params is None: - self.wandb_params = {} - for k, v in model.cfg.items(): - if k.startswith("wandb_"): - self.wandb_params.update({k.lstrip("wandb_"): v}) - - self._run = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - _ = self.run - self.run.config.update(self.model.cfg) - self.run.define_metric("epoch") - self.run.define_metric("eval/*", step_metric="epoch") - - self.best_ap = -1000. - self.fps = [] - - @property - def run(self): - if self._run is None: - if self.wandb.run is not None: - logger.info( - "There is an ongoing wandb run which will be used" - "for logging. 
Please use `wandb.finish()` to end that" - "if the behaviour is not intended") - self._run = self.wandb.run - else: - self._run = self.wandb.init(**self.wandb_params) - return self._run - - def save_model(self, - optimizer, - save_dir, - save_name, - last_epoch, - ema_model=None, - ap=None, - fps=None, - tags=None): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - model_path = os.path.join(save_dir, save_name) - metadata = {} - metadata["last_epoch"] = last_epoch - if ap: - metadata["ap"] = ap - - if fps: - metadata["fps"] = fps - - if ema_model is None: - ema_artifact = self.wandb.Artifact( - name="ema_model-{}".format(self.run.id), - type="model", - metadata=metadata) - model_artifact = self.wandb.Artifact( - name="model-{}".format(self.run.id), - type="model", - metadata=metadata) - - ema_artifact.add_file(model_path + ".pdema", name="model_ema") - model_artifact.add_file(model_path + ".pdparams", name="model") - - self.run.log_artifact(ema_artifact, aliases=tags) - self.run.log_artfact(model_artifact, aliases=tags) - else: - model_artifact = self.wandb.Artifact( - name="model-{}".format(self.run.id), - type="model", - metadata=metadata) - model_artifact.add_file(model_path + ".pdparams", name="model") - self.run.log_artifact(model_artifact, aliases=tags) - - def on_step_end(self, status): - - mode = status['mode'] - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - training_status = status['training_staus'].get() - for k, v in training_status.items(): - training_status[k] = float(v) - - # calculate ips, data_cost, batch_cost - batch_time = status['batch_time'] - data_time = status['data_time'] - batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( - ))]['batch_size'] - - ips = float(batch_size) / float(batch_time.avg) - data_cost = float(data_time.avg) - batch_cost = float(batch_time.avg) - - metrics = {"train/" + k: v for k, v in training_status.items()} - - metrics["train/ips"] = ips - metrics["train/data_cost"] = data_cost - metrics["train/batch_cost"] = batch_cost - - self.fps.append(ips) - self.run.log(metrics) - - def on_epoch_end(self, status): - mode = status['mode'] - epoch_id = status['epoch_id'] - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if mode == 'train': - fps = sum(self.fps) / len(self.fps) - self.fps = [] - - end_epoch = self.model.cfg.epoch - if ( - epoch_id + 1 - ) % self.model.cfg.snapshot_epoch == 0 or epoch_id == end_epoch - 1: - save_name = str( - epoch_id) if epoch_id != end_epoch - 1 else "model_final" - tags = ["latest", "epoch_{}".format(epoch_id)] - self.save_model( - self.model.optimizer, - self.save_dir, - save_name, - epoch_id + 1, - self.model.use_ema, - fps=fps, - tags=tags) - if mode == 'eval': - sample_num = status['sample_num'] - cost_time = status['cost_time'] - - fps = sample_num / cost_time - - merged_dict = {} - for metric in self.model._metrics: - for key, map_value in metric.get_results().items(): - merged_dict["eval/{}-mAP".format(key)] = map_value[0] - merged_dict["epoch"] = status["epoch_id"] - merged_dict["eval/fps"] = sample_num / cost_time - - self.run.log(merged_dict) - - if 'save_best_model' in status and status['save_best_model']: - for metric in self.model._metrics: - map_res = metric.get_results() - if 'pose3d' in map_res: - key = 'pose3d' - elif 'bbox' in map_res: - key = 'bbox' - elif 'keypoint' in map_res: - key = 'keypoint' - else: - key = 'mask' - if key not in map_res: - logger.warning("Evaluation results empty, this may be due to " \ - "training 
iterations being too few or not " \ - "loading the correct weights.") - return - if map_res[key][0] >= self.best_ap: - self.best_ap = map_res[key][0] - save_name = 'best_model' - tags = ["best", "epoch_{}".format(epoch_id)] - - self.save_model( - self.model.optimizer, - self.save_dir, - save_name, - last_epoch=epoch_id + 1, - ema_model=self.model.use_ema, - ap=abs(self.best_ap), - fps=fps, - tags=tags) - - def on_train_end(self, status): - self.run.finish() - - -class SniperProposalsGenerator(Callback): - def __init__(self, model): - super(SniperProposalsGenerator, self).__init__(model) - ori_dataset = self.model.dataset - self.dataset = self._create_new_dataset(ori_dataset) - self.loader = self.model.loader - self.cfg = self.model.cfg - self.infer_model = self.model.model - - def _create_new_dataset(self, ori_dataset): - dataset = copy.deepcopy(ori_dataset) - # init anno_cropper - dataset.init_anno_cropper() - # generate infer roidbs - ori_roidbs = dataset.get_ori_roidbs() - roidbs = dataset.anno_cropper.crop_infer_anno_records(ori_roidbs) - # set new roidbs - dataset.set_roidbs(roidbs) - - return dataset - - def _eval_with_loader(self, loader): - results = [] - with paddle.no_grad(): - self.infer_model.eval() - for step_id, data in enumerate(loader): - outs = self.infer_model(data) - for key in ['im_shape', 'scale_factor', 'im_id']: - outs[key] = data[key] - for key, value in outs.items(): - if hasattr(value, 'numpy'): - outs[key] = value.numpy() - - results.append(outs) - - return results - - def on_train_end(self, status): - self.loader.dataset = self.dataset - results = self._eval_with_loader(self.loader) - results = self.dataset.anno_cropper.aggregate_chips_detections(results) - # sniper - proposals = [] - clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} - for outs in results: - batch_res = get_infer_results(outs, clsid2catid) - start = 0 - for i, im_id in enumerate(outs['im_id']): - bbox_num = outs['bbox_num'] - end = start + bbox_num[i] - bbox_res = batch_res['bbox'][start:end] \ - if 'bbox' in batch_res else None - if bbox_res: - proposals += bbox_res - logger.info("save proposals in {}".format(self.cfg.proposals_path)) - with open(self.cfg.proposals_path, 'w') as f: - json.dump(proposals, f) - - -class SemiLogPrinter(LogPrinter): - def __init__(self, model): - super(SemiLogPrinter, self).__init__(model) - - def on_step_end(self, status): - if dist.get_world_size() < 2 or dist.get_rank() == 0: - mode = status['mode'] - if mode == 'train': - epoch_id = status['epoch_id'] - step_id = status['step_id'] - iter_id = status['iter_id'] - steps_per_epoch = status['steps_per_epoch'] - training_staus = status['training_staus'] - batch_time = status['batch_time'] - data_time = status['data_time'] - - epoches = self.model.cfg.epoch - batch_size = self.model.cfg['{}Reader'.format(mode.capitalize( - ))]['batch_size'] - iters = epoches * steps_per_epoch - logs = training_staus.log() - iter_space_fmt = ':' + str(len(str(iters))) + 'd' - space_fmt = ':' + str(len(str(iters))) + 'd' - if step_id % self.model.cfg.log_iter == 0: - eta_steps = (epoches - epoch_id) * steps_per_epoch - step_id - eta_sec = eta_steps * batch_time.global_avg - eta_str = str(datetime.timedelta(seconds=int(eta_sec))) - ips = float(batch_size) / batch_time.avg - fmt = ' '.join([ - '{' + iter_space_fmt + '}/{} iters', - 'Epoch: [{}]', - '[{' + space_fmt + '}/{}]', - 'learning_rate: {lr:.6f}', - '{meters}', - 'eta: {eta}', - 'batch_cost: {btime}', - 'data_cost: {dtime}', - 'ips: {ips:.4f} images/s', - ]) - fmt 
= fmt.format( - iter_id, - iters, - epoch_id, - step_id, - steps_per_epoch, - lr=status['learning_rate'], - meters=logs, - eta=eta_str, - btime=str(batch_time), - dtime=str(data_time), - ips=ips) - logger.info(fmt) - if mode == 'eval': - step_id = status['step_id'] - if step_id % 100 == 0: - logger.info("Eval iter: {}".format(step_id)) - - -class SemiCheckpointer(Checkpointer): - def __init__(self, model): - super(SemiCheckpointer, self).__init__(model) - cfg = self.model.cfg - self.best_ap = 0. - self.save_dir = os.path.join(self.model.cfg.save_dir, - self.model.cfg.filename) - if hasattr(self.model.model, 'student') and hasattr(self.model.model, - 'teacher'): - self.weight = (self.model.model.teacher, self.model.model.student) - elif hasattr(self.model.model, 'student') or hasattr(self.model.model, - 'teacher'): - raise AttributeError( - "model has no attribute 'student' or 'teacher'") - else: - raise AttributeError( - "model has no attribute 'student' and 'teacher'") - - def every_n_iters(self, iter_id, n): - return (iter_id + 1) % n == 0 if n > 0 else False - - def on_step_end(self, status): - # Checkpointer only performed during training - mode = status['mode'] - eval_interval = status['eval_interval'] - save_interval = status['save_interval'] - iter_id = status['iter_id'] - epoch_id = status['epoch_id'] - t_weight = None - s_weight = None - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if self.every_n_iters(iter_id, save_interval) and mode == 'train': - save_name = "last_epoch" - # save_name = str(iter_id + 1) - t_weight = self.weight[0].state_dict() - s_weight = self.weight[1].state_dict() - save_semi_model(t_weight, s_weight, self.model.optimizer, - self.save_dir, save_name, epoch_id + 1, - iter_id + 1) - - def on_epoch_end(self, status): - # Checkpointer only performed during training - mode = status['mode'] - eval_interval = status['eval_interval'] - save_interval = status['save_interval'] - iter_id = status['iter_id'] - epoch_id = status['epoch_id'] - t_weight = None - s_weight = None - save_name = None - if dist.get_world_size() < 2 or dist.get_rank() == 0: - if self.every_n_iters(iter_id, eval_interval) and mode == 'eval': - if 'save_best_model' in status and status['save_best_model']: - for metric in self.model._metrics: - map_res = metric.get_results() - if 'bbox' in map_res: - key = 'bbox' - elif 'keypoint' in map_res: - key = 'keypoint' - else: - key = 'mask' - if key not in map_res: - logger.warning("Evaluation results empty, this may be due to " \ - "training iterations being too few or not " \ - "loading the correct weights.") - return - if map_res[key][0] > self.best_ap: - self.best_ap = map_res[key][0] - save_name = 'best_model' - t_weight = self.weight[0].state_dict() - s_weight = self.weight[1].state_dict() - logger.info("Best teacher test {} ap is {:0.3f}.". - format(key, self.best_ap)) - if t_weight and s_weight: - save_semi_model(t_weight, s_weight, - self.model.optimizer, self.save_dir, - save_name, epoch_id + 1, iter_id + 1) diff --git a/pdfdet/models/Paddle/ppdet/engine/env.py b/pdfdet/models/Paddle/ppdet/engine/env.py deleted file mode 100644 index 0a89657..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/env.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import random -import numpy as np - -import paddle -from paddle.distributed import fleet - -__all__ = ['init_parallel_env', 'set_random_seed', 'init_fleet_env'] - - -def init_fleet_env(find_unused_parameters=False): - strategy = fleet.DistributedStrategy() - strategy.find_unused_parameters = find_unused_parameters - fleet.init(is_collective=True, strategy=strategy) - - -def init_parallel_env(): - env = os.environ - dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env - if dist: - trainer_id = int(env['PADDLE_TRAINER_ID']) - local_seed = (99 + trainer_id) - random.seed(local_seed) - np.random.seed(local_seed) - - paddle.distributed.init_parallel_env() - - -def set_random_seed(seed): - paddle.seed(seed) - random.seed(seed) - np.random.seed(seed) diff --git a/pdfdet/models/Paddle/ppdet/engine/export_utils.py b/pdfdet/models/Paddle/ppdet/engine/export_utils.py deleted file mode 100644 index daaa39a..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/export_utils.py +++ /dev/null @@ -1,373 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
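The env.py helpers deleted above center on reproducibility: every process seeds Python, NumPy and Paddle, and each distributed trainer offsets the seed by its rank (99 + trainer_id in init_parallel_env). A condensed sketch of the same idea, assuming Paddle is importable:

import random
import numpy as np
import paddle

def seed_everything(seed, rank=0):
    # One distinct but reproducible stream per worker, as in the removed file.
    local_seed = seed + rank
    random.seed(local_seed)
    np.random.seed(local_seed)
    paddle.seed(local_seed)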
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import yaml -from collections import OrderedDict - -import paddle -from ppdet.data.source.category import get_categories - -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -# Global dictionary -TRT_MIN_SUBGRAPH = { - 'YOLO': 3, - 'PPYOLOE': 3, - 'SSD': 60, - 'RCNN': 40, - 'RetinaNet': 40, - 'S2ANet': 80, - 'EfficientDet': 40, - 'Face': 3, - 'TTFNet': 60, - 'FCOS': 16, - 'SOLOv2': 60, - 'HigherHRNet': 3, - 'HRNet': 3, - 'DeepSORT': 3, - 'ByteTrack': 10, - 'CenterTrack': 5, - 'JDE': 10, - 'FairMOT': 5, - 'GFL': 16, - 'PicoDet': 3, - 'CenterNet': 5, - 'TOOD': 5, - 'YOLOX': 8, - 'YOLOF': 40, - 'METRO_Body': 3, - 'DETR': 3, - 'CLRNet': 3 -} - -KEYPOINT_ARCH = ['HigherHRNet', 'TopDownHRNet'] -MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] -LANE_ARCH = ['CLRNet'] - -TO_STATIC_SPEC = { - 'yolov3_darknet53_270e_coco': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'is_crowd': paddle.static.InputSpec( - name='is_crowd', shape=[-1, 50], dtype='float32'), - 'gt_bbox': paddle.static.InputSpec( - name='gt_bbox', shape=[-1, 50, 4], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'target0': paddle.static.InputSpec( - name='target0', shape=[-1, 3, 86, -1, -1], dtype='float32'), - 'target1': paddle.static.InputSpec( - name='target1', shape=[-1, 3, 86, -1, -1], dtype='float32'), - 'target2': paddle.static.InputSpec( - name='target2', shape=[-1, 3, 86, -1, -1], dtype='float32'), - }], - 'tinypose_128x96': [{ - 'center': paddle.static.InputSpec( - name='center', shape=[-1, 2], dtype='float32'), - 'scale': paddle.static.InputSpec( - name='scale', shape=[-1, 2], dtype='float32'), - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, 128, 96], dtype='float32'), - 'score': paddle.static.InputSpec( - name='score', shape=[-1], dtype='float32'), - 'rotate': paddle.static.InputSpec( - name='rotate', shape=[-1], dtype='float32'), - 'target': paddle.static.InputSpec( - name='target', shape=[-1, 17, 32, 24], dtype='float32'), - 'target_weight': paddle.static.InputSpec( - name='target_weight', shape=[-1, 17, 1], dtype='float32'), - }], - 'fcos_r50_fpn_1x_coco': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'reg_target0': paddle.static.InputSpec( - name='reg_target0', shape=[-1, 160, 160, 4], dtype='float32'), - 'labels0': paddle.static.InputSpec( - name='labels0', shape=[-1, 160, 160, 1], dtype='int32'), - 'centerness0': paddle.static.InputSpec( - name='centerness0', shape=[-1, 160, 160, 1], dtype='float32'), - 'reg_target1': paddle.static.InputSpec( - 
name='reg_target1', shape=[-1, 80, 80, 4], dtype='float32'), - 'labels1': paddle.static.InputSpec( - name='labels1', shape=[-1, 80, 80, 1], dtype='int32'), - 'centerness1': paddle.static.InputSpec( - name='centerness1', shape=[-1, 80, 80, 1], dtype='float32'), - 'reg_target2': paddle.static.InputSpec( - name='reg_target2', shape=[-1, 40, 40, 4], dtype='float32'), - 'labels2': paddle.static.InputSpec( - name='labels2', shape=[-1, 40, 40, 1], dtype='int32'), - 'centerness2': paddle.static.InputSpec( - name='centerness2', shape=[-1, 40, 40, 1], dtype='float32'), - 'reg_target3': paddle.static.InputSpec( - name='reg_target3', shape=[-1, 20, 20, 4], dtype='float32'), - 'labels3': paddle.static.InputSpec( - name='labels3', shape=[-1, 20, 20, 1], dtype='int32'), - 'centerness3': paddle.static.InputSpec( - name='centerness3', shape=[-1, 20, 20, 1], dtype='float32'), - 'reg_target4': paddle.static.InputSpec( - name='reg_target4', shape=[-1, 10, 10, 4], dtype='float32'), - 'labels4': paddle.static.InputSpec( - name='labels4', shape=[-1, 10, 10, 1], dtype='int32'), - 'centerness4': paddle.static.InputSpec( - name='centerness4', shape=[-1, 10, 10, 1], dtype='float32'), - }], - 'picodet_s_320_coco_lcnet': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'is_crowd': paddle.static.InputSpec( - name='is_crowd', shape=[-1, -1, 1], dtype='float32'), - 'gt_class': paddle.static.InputSpec( - name='gt_class', shape=[-1, -1, 1], dtype='int32'), - 'gt_bbox': paddle.static.InputSpec( - name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'pad_gt_mask': paddle.static.InputSpec( - name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), - }], - 'ppyoloe_crn_s_300e_coco': [{ - 'im_id': paddle.static.InputSpec( - name='im_id', shape=[-1, 1], dtype='float32'), - 'is_crowd': paddle.static.InputSpec( - name='is_crowd', shape=[-1, -1, 1], dtype='float32'), - 'gt_class': paddle.static.InputSpec( - name='gt_class', shape=[-1, -1, 1], dtype='int32'), - 'gt_bbox': paddle.static.InputSpec( - name='gt_bbox', shape=[-1, -1, 4], dtype='float32'), - 'curr_iter': paddle.static.InputSpec( - name='curr_iter', shape=[-1], dtype='float32'), - 'image': paddle.static.InputSpec( - name='image', shape=[-1, 3, -1, -1], dtype='float32'), - 'im_shape': paddle.static.InputSpec( - name='im_shape', shape=[-1, 2], dtype='float32'), - 'scale_factor': paddle.static.InputSpec( - name='scale_factor', shape=[-1, 2], dtype='float32'), - 'pad_gt_mask': paddle.static.InputSpec( - name='pad_gt_mask', shape=[-1, -1, 1], dtype='float32'), - }], -} - - -def apply_to_static(config, model): - filename = config.get('filename', None) - spec = TO_STATIC_SPEC.get(filename, None) - model = paddle.jit.to_static(model, input_spec=spec) - logger.info("Successfully to apply @to_static with specs: {}".format(spec)) - return model - - -def _prune_input_spec(input_spec, program, targets): - # try to prune static program to figure out pruned input spec - # so we perform following operations in static mode - device = paddle.get_device() - paddle.enable_static() - paddle.set_device(device) - pruned_input_spec = [{}] - program = program.clone() - program = 
program._prune(targets=targets) - global_block = program.global_block() - for name, spec in input_spec[0].items(): - try: - v = global_block.var(name) - pruned_input_spec[0][name] = spec - except Exception: - pass - paddle.disable_static(place=device) - return pruned_input_spec - - -def _parse_reader(reader_cfg, dataset_cfg, metric, arch, image_shape): - preprocess_list = [] - label_list = [] - if arch != "lane_arch": - anno_file = dataset_cfg.get_anno() - - clsid2catid, catid2name = get_categories(metric, anno_file, arch) - - label_list = [str(cat) for cat in catid2name.values()] - - fuse_normalize = reader_cfg.get('fuse_normalize', False) - sample_transforms = reader_cfg['sample_transforms'] - for st in sample_transforms[1:]: - for key, value in st.items(): - p = {'type': key} - if key == 'Resize': - if int(image_shape[1]) != -1: - value['target_size'] = image_shape[1:] - value['interp'] = value.get('interp', 1) # cv2.INTER_LINEAR - if fuse_normalize and key == 'NormalizeImage': - continue - p.update(value) - preprocess_list.append(p) - batch_transforms = reader_cfg.get('batch_transforms', None) - if batch_transforms: - for bt in batch_transforms: - for key, value in bt.items(): - # for deploy/infer, use PadStride(stride) instead PadBatch(pad_to_stride) - if key == 'PadBatch': - preprocess_list.append({ - 'type': 'PadStride', - 'stride': value['pad_to_stride'] - }) - break - elif key == "CULaneResize": - # cut and resize - p = {'type': key} - p.update(value) - p.update({"cut_height": dataset_cfg.cut_height}) - preprocess_list.append(p) - break - - return preprocess_list, label_list - - -def _parse_tracker(tracker_cfg): - tracker_params = {} - for k, v in tracker_cfg.items(): - tracker_params.update({k: v}) - return tracker_params - - -def _dump_infer_config(config, path, image_shape, model): - arch_state = False - from ppdet.core.config.yaml_helpers import setup_orderdict - setup_orderdict() - use_dynamic_shape = True if image_shape[2] == -1 else False - infer_cfg = OrderedDict({ - 'mode': 'paddle', - 'draw_threshold': 0.5, - 'metric': config['metric'], - 'use_dynamic_shape': use_dynamic_shape - }) - export_onnx = config.get('export_onnx', False) - export_eb = config.get('export_eb', False) - - infer_arch = config['architecture'] - if 'RCNN' in infer_arch and export_onnx: - logger.warning( - "Exporting RCNN model to ONNX only support batch_size = 1") - infer_cfg['export_onnx'] = True - infer_cfg['export_eb'] = export_eb - - if infer_arch in MOT_ARCH: - if infer_arch == 'DeepSORT': - tracker_cfg = config['DeepSORTTracker'] - elif infer_arch == 'CenterTrack': - tracker_cfg = config['CenterTracker'] - else: - tracker_cfg = config['JDETracker'] - infer_cfg['tracker'] = _parse_tracker(tracker_cfg) - - for arch, min_subgraph_size in TRT_MIN_SUBGRAPH.items(): - if arch in infer_arch: - infer_cfg['arch'] = arch - infer_cfg['min_subgraph_size'] = min_subgraph_size - arch_state = True - break - - if infer_arch == 'PPYOLOEWithAuxHead': - infer_arch = 'PPYOLOE' - - if infer_arch in ['PPYOLOE', 'YOLOX', 'YOLOF']: - infer_cfg['arch'] = infer_arch - infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] - arch_state = True - - if not arch_state: - logger.error( - 'Architecture: {} is not supported for exporting model now.\n'. 
- format(infer_arch) + - 'Please set TRT_MIN_SUBGRAPH in ppdet/engine/export_utils.py') - os._exit(0) - if 'mask_head' in config[config['architecture']] and config[config[ - 'architecture']]['mask_head']: - infer_cfg['mask'] = True - label_arch = 'detection_arch' - if infer_arch in KEYPOINT_ARCH: - label_arch = 'keypoint_arch' - - if infer_arch in LANE_ARCH: - infer_cfg['arch'] = infer_arch - infer_cfg['min_subgraph_size'] = TRT_MIN_SUBGRAPH[infer_arch] - infer_cfg['img_w'] = config['img_w'] - infer_cfg['ori_img_h'] = config['ori_img_h'] - infer_cfg['cut_height'] = config['cut_height'] - label_arch = 'lane_arch' - head_name = "CLRHead" - infer_cfg['conf_threshold'] = config[head_name]['conf_threshold'] - infer_cfg['nms_thres'] = config[head_name]['nms_thres'] - infer_cfg['max_lanes'] = config[head_name]['max_lanes'] - infer_cfg['num_points'] = config[head_name]['num_points'] - arch_state = True - - if infer_arch in MOT_ARCH: - if config['metric'] in ['COCO', 'VOC']: - # MOT model run as Detector - reader_cfg = config['TestReader'] - dataset_cfg = config['TestDataset'] - else: - # 'metric' in ['MOT', 'MCMOT', 'KITTI'] - label_arch = 'mot_arch' - reader_cfg = config['TestMOTReader'] - dataset_cfg = config['TestMOTDataset'] - else: - reader_cfg = config['TestReader'] - dataset_cfg = config['TestDataset'] - - infer_cfg['Preprocess'], infer_cfg['label_list'] = _parse_reader( - reader_cfg, dataset_cfg, config['metric'], label_arch, image_shape[1:]) - - if infer_arch == 'PicoDet': - if hasattr(config, 'export') and config['export'].get( - 'post_process', - False) and not config['export'].get('benchmark', False): - infer_cfg['arch'] = 'GFL' - head_name = 'PicoHeadV2' if config['PicoHeadV2'] else 'PicoHead' - infer_cfg['NMS'] = config[head_name]['nms'] - # In order to speed up the prediction, the threshold of nms - # is adjusted here, which can be changed in infer_cfg.yml - config[head_name]['nms']["score_threshold"] = 0.3 - config[head_name]['nms']["nms_threshold"] = 0.5 - infer_cfg['fpn_stride'] = config[head_name]['fpn_stride'] - - yaml.dump(infer_cfg, open(path, 'w')) - logger.info("Export inference config file to {}".format(os.path.join(path))) diff --git a/pdfdet/models/Paddle/ppdet/engine/tracker.py b/pdfdet/models/Paddle/ppdet/engine/tracker.py deleted file mode 100644 index 90eb0c5..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/tracker.py +++ /dev/null @@ -1,731 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
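_dump_infer_config above ultimately serializes a handful of deploy-time settings to infer_cfg.yml. The removed code registers a YAML representer for OrderedDict via setup_orderdict(); on Python 3.7+ a plain dict preserves insertion order, which keeps this sketch self-contained. The values here are illustrative only:

import yaml

infer_cfg = {
    'mode': 'paddle',
    'draw_threshold': 0.5,
    'arch': 'PicoDet',
    'min_subgraph_size': 3,  # the TRT_MIN_SUBGRAPH entry for PicoDet above
}
with open('infer_cfg.yml', 'w') as f:
    yaml.dump(infer_cfg, f)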
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import glob -import re -import paddle -import paddle.nn as nn -import numpy as np -from tqdm import tqdm -from collections import defaultdict - -from ppdet.core.workspace import create -from ppdet.utils.checkpoint import load_weight, load_pretrain_weight -from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box -from ppdet.modeling.mot.utils import MOTTimer, load_det_results, write_mot_results, save_vis_results -from ppdet.modeling.mot.tracker import JDETracker, CenterTracker -from ppdet.modeling.mot.tracker import DeepSORTTracker, OCSORTTracker, BOTSORTTracker -from ppdet.modeling.architectures import YOLOX -from ppdet.metrics import Metric, MOTMetric, KITTIMOTMetric, MCMOTMetric -from ppdet.data.source.category import get_categories -import ppdet.utils.stats as stats - -from .callbacks import Callback, ComposeCallback - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] -MOT_ARCH_JDE = MOT_ARCH[:2] -MOT_ARCH_SDE = MOT_ARCH[2:4] -MOT_DATA_TYPE = ['mot', 'mcmot', 'kitti'] - -__all__ = ['Tracker'] - - -class Tracker(object): - def __init__(self, cfg, mode='eval'): - self.cfg = cfg - assert mode.lower() in ['test', 'eval'], \ - "mode should be 'test' or 'eval'" - self.mode = mode.lower() - self.optimizer = None - - # build MOT data loader - self.dataset = cfg['{}MOTDataset'.format(self.mode.capitalize())] - - # build model - self.model = create(cfg.architecture) - - if isinstance(self.model.detector, YOLOX): - for k, m in self.model.named_sublayers(): - if isinstance(m, nn.BatchNorm2D): - m._epsilon = 1e-3 # for amp(fp16) - m._momentum = 0.97 # 0.03 in pytorch - - anno_file = self.dataset.get_anno() - clsid2catid, catid2name = get_categories( - self.cfg.metric, anno_file=anno_file) - self.ids2names = [] - for k, v in catid2name.items(): - self.ids2names.append(v) - - self.status = {} - self.start_epoch = 0 - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def _init_callbacks(self): - self._callbacks = [] - self._compose_callback = None - - def _init_metrics(self): - if self.mode in ['test']: - self._metrics = [] - return - - if self.cfg.metric == 'MOT': - self._metrics = [MOTMetric(), ] - elif self.cfg.metric == 'MCMOT': - self._metrics = [MCMOTMetric(self.cfg.num_classes), ] - elif self.cfg.metric == 'KITTI': - self._metrics = [KITTIMOTMetric(), ] - else: - logger.warning("Metric not support for metric type {}".format( - self.cfg.metric)) - self._metrics = [] - - def _reset_metrics(self): - for metric in self._metrics: - metric.reset() - - def register_callbacks(self, callbacks): - callbacks = [h for h in list(callbacks) if h is not None] - for c in callbacks: - assert isinstance(c, Callback), \ - "metrics shoule be instances of subclass of Metric" - self._callbacks.extend(callbacks) - self._compose_callback = ComposeCallback(self._callbacks) - - def register_metrics(self, metrics): - metrics = [m for m in list(metrics) if m is not None] - for m in metrics: - assert isinstance(m, Metric), \ - "metrics shoule be instances of subclass of Metric" - self._metrics.extend(metrics) - - def load_weights_jde(self, weights): - load_weight(self.model, weights, self.optimizer) - - def load_weights_sde(self, det_weights, reid_weights): - with_detector = 
self.model.detector is not None - with_reid = self.model.reid is not None - - if with_detector: - load_weight(self.model.detector, det_weights) - if with_reid: - load_weight(self.model.reid, reid_weights) - else: - load_weight(self.model.reid, reid_weights) - - def _eval_seq_centertrack(self, - dataloader, - save_dir=None, - show_image=False, - frame_rate=30, - draw_threshold=0): - assert isinstance(self.model.tracker, CenterTracker) - if save_dir: - if not os.path.exists(save_dir): os.makedirs(save_dir) - tracker = self.model.tracker - - timer = MOTTimer() - frame_id = 0 - self.status['mode'] = 'track' - self.model.eval() - results = defaultdict(list) # only support single class now - - for step_id, data in enumerate(tqdm(dataloader)): - self.status['step_id'] = step_id - if step_id == 0: - self.model.reset_tracking() - - # forward - timer.tic() - pred_ret = self.model(data) - - online_targets = tracker.update(pred_ret) - online_tlwhs, online_scores, online_ids = [], [], [] - for t in online_targets: - bbox = t['bbox'] - tlwh = [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]] - tscore = float(t['score']) - tid = int(t['tracking_id']) - if tlwh[2] * tlwh[3] > 0: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(tscore) - timer.toc() - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - frame_id += 1 - return results, frame_id, timer.average_time, timer.calls - - def _eval_seq_jde(self, - dataloader, - save_dir=None, - show_image=False, - frame_rate=30, - draw_threshold=0): - if save_dir: - if not os.path.exists(save_dir): os.makedirs(save_dir) - tracker = self.model.tracker - tracker.max_time_lost = int(frame_rate / 30.0 * tracker.track_buffer) - - timer = MOTTimer() - frame_id = 0 - self.status['mode'] = 'track' - self.model.eval() - results = defaultdict(list) # support single class and multi classes - - for step_id, data in enumerate(tqdm(dataloader)): - self.status['step_id'] = step_id - # forward - timer.tic() - pred_dets, pred_embs = self.model(data) - - pred_dets, pred_embs = pred_dets.numpy(), pred_embs.numpy() - online_targets_dict = self.model.tracker.update(pred_dets, - pred_embs) - online_tlwhs = defaultdict(list) - online_scores = defaultdict(list) - online_ids = defaultdict(list) - for cls_id in range(self.cfg.num_classes): - online_targets = online_targets_dict[cls_id] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - tscore = t.score - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs[cls_id].append(tlwh) - online_ids[cls_id].append(tid) - online_scores[cls_id].append(tscore) - # save results - results[cls_id].append( - (frame_id + 1, online_tlwhs[cls_id], online_scores[cls_id], - online_ids[cls_id])) - - timer.toc() - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - frame_id += 1 - - return results, frame_id, timer.average_time, timer.calls - - def _eval_seq_sde(self, - dataloader, - save_dir=None, - show_image=False, - frame_rate=30, - seq_name='', - scaled=False, - det_file='', - draw_threshold=0): - if save_dir: - if not os.path.exists(save_dir): os.makedirs(save_dir) - use_detector = False if not 
self.model.detector else True - use_reid = hasattr(self.model, 'reid') - if use_reid and self.model.reid is not None: - use_reid = True - else: - use_reid = False - - timer = MOTTimer() - results = defaultdict(list) - frame_id = 0 - self.status['mode'] = 'track' - self.model.eval() - if use_reid: - self.model.reid.eval() - if not use_detector: - dets_list = load_det_results(det_file, len(dataloader)) - logger.info('Finish loading detection results file {}.'.format( - det_file)) - - tracker = self.model.tracker - for step_id, data in enumerate(tqdm(dataloader)): - self.status['step_id'] = step_id - ori_image = data['ori_image'] # [bs, H, W, 3] - ori_image_shape = data['ori_image'].shape[1:3] - # ori_image_shape: [H, W] - - input_shape = data['image'].shape[2:] - # input_shape: [h, w], before data transforms, set in model config - - im_shape = data['im_shape'][0].numpy() - # im_shape: [new_h, new_w], after data transforms - scale_factor = data['scale_factor'][0].numpy() - - empty_detections = False - # when it has no detected bboxes, will not inference reid model - # and if visualize, use original image instead - - # forward - timer.tic() - if not use_detector: - dets = dets_list[frame_id] - bbox_tlwh = np.array(dets['bbox'], dtype='float32') - if bbox_tlwh.shape[0] > 0: - # detector outputs: pred_cls_ids, pred_scores, pred_bboxes - pred_cls_ids = np.array(dets['cls_id'], dtype='float32') - pred_scores = np.array(dets['score'], dtype='float32') - pred_bboxes = np.concatenate( - (bbox_tlwh[:, 0:2], - bbox_tlwh[:, 2:4] + bbox_tlwh[:, 0:2]), - axis=1) - else: - logger.warning( - 'Frame {} has not object, try to modify score threshold.'. - format(frame_id)) - empty_detections = True - else: - outs = self.model.detector(data) - outs['bbox'] = outs['bbox'].numpy() - outs['bbox_num'] = outs['bbox_num'].numpy() - - if len(outs['bbox']) > 0 and empty_detections == False: - # detector outputs: pred_cls_ids, pred_scores, pred_bboxes - pred_cls_ids = outs['bbox'][:, 0:1] - pred_scores = outs['bbox'][:, 1:2] - if not scaled: - # Note: scaled=False only in JDE YOLOv3 or other detectors - # with LetterBoxResize and JDEBBoxPostProcess. - # - # 'scaled' means whether the coords after detector outputs - # have been scaled back to the original image, set True - # in general detector, set False in JDE YOLOv3. - pred_bboxes = scale_coords(outs['bbox'][:, 2:], - input_shape, im_shape, - scale_factor) - else: - pred_bboxes = outs['bbox'][:, 2:] - pred_dets_old = np.concatenate( - (pred_cls_ids, pred_scores, pred_bboxes), axis=1) - else: - logger.warning( - 'Frame {} has not detected object, try to modify score threshold.'. - format(frame_id)) - empty_detections = True - - if not empty_detections: - pred_xyxys, keep_idx = clip_box(pred_bboxes, ori_image_shape) - if len(keep_idx[0]) == 0: - logger.warning( - 'Frame {} has not detected object left after clip_box.'. 
- format(frame_id)) - empty_detections = True - - if empty_detections: - timer.toc() - # if visualize, use original image instead - online_ids, online_tlwhs, online_scores = None, None, None - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - frame_id += 1 - # thus will not inference reid model - continue - - pred_cls_ids = pred_cls_ids[keep_idx[0]] - pred_scores = pred_scores[keep_idx[0]] - pred_dets = np.concatenate( - (pred_cls_ids, pred_scores, pred_xyxys), axis=1) - - if use_reid: - crops = get_crops( - pred_xyxys, - ori_image, - w=tracker.input_size[0], - h=tracker.input_size[1]) - crops = paddle.to_tensor(crops) - - data.update({'crops': crops}) - pred_embs = self.model(data)['embeddings'].numpy() - else: - pred_embs = None - - if isinstance(tracker, DeepSORTTracker): - online_tlwhs, online_scores, online_ids = [], [], [] - tracker.predict() - online_targets = tracker.update(pred_dets, pred_embs) - for t in online_targets: - if not t.is_confirmed() or t.time_since_update > 1: - continue - tlwh = t.to_tlwh() - tscore = t.score - tid = t.track_id - if tscore < draw_threshold: continue - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs.append(tlwh) - online_scores.append(tscore) - online_ids.append(tid) - timer.toc() - - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - elif isinstance(tracker, JDETracker): - # trick hyperparams only used for MOTChallenge (MOT17, MOT20) Test-set - tracker.track_buffer, tracker.conf_thres = get_trick_hyperparams( - seq_name, tracker.track_buffer, tracker.conf_thres) - - online_targets_dict = tracker.update(pred_dets_old, pred_embs) - online_tlwhs = defaultdict(list) - online_scores = defaultdict(list) - online_ids = defaultdict(list) - for cls_id in range(self.cfg.num_classes): - online_targets = online_targets_dict[cls_id] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - tscore = t.score - if tlwh[2] * tlwh[3] <= tracker.min_box_area: continue - if tracker.vertical_ratio > 0 and tlwh[2] / tlwh[ - 3] > tracker.vertical_ratio: - continue - online_tlwhs[cls_id].append(tlwh) - online_ids[cls_id].append(tid) - online_scores[cls_id].append(tscore) - # save results - results[cls_id].append( - (frame_id + 1, online_tlwhs[cls_id], - online_scores[cls_id], online_ids[cls_id])) - timer.toc() - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - elif isinstance(tracker, OCSORTTracker): - # OC_SORT Tracker - online_targets = tracker.update(pred_dets_old, pred_embs) - online_tlwhs = [] - online_ids = [] - online_scores = [] - for t in online_targets: - tlwh = [t[0], t[1], t[2] - t[0], t[3] - t[1]] - tscore = float(t[4]) - tid = int(t[5]) - if tlwh[2] * tlwh[3] > 0: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(tscore) - timer.toc() - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - 
elif isinstance(tracker, BOTSORTTracker): - # BOTSORT Tracker - online_targets = tracker.update( - pred_dets_old, img=ori_image.numpy()) - online_tlwhs = [] - online_ids = [] - online_scores = [] - for t in online_targets: - tlwh = t.tlwh - tid = t.track_id - tscore = t.score - if tlwh[2] * tlwh[3] > 0: - online_tlwhs.append(tlwh) - online_ids.append(tid) - online_scores.append(tscore) - timer.toc() - # save results - results[0].append( - (frame_id + 1, online_tlwhs, online_scores, online_ids)) - save_vis_results(data, frame_id, online_ids, online_tlwhs, - online_scores, timer.average_time, show_image, - save_dir, self.cfg.num_classes, self.ids2names) - - else: - raise ValueError(tracker) - frame_id += 1 - - return results, frame_id, timer.average_time, timer.calls - - def mot_evaluate(self, - data_root, - seqs, - output_dir, - data_type='mot', - model_type='JDE', - save_images=False, - save_videos=False, - show_image=False, - scaled=False, - det_results_dir=''): - if not os.path.exists(output_dir): os.makedirs(output_dir) - result_root = os.path.join(output_dir, 'mot_results') - if not os.path.exists(result_root): os.makedirs(result_root) - assert data_type in MOT_DATA_TYPE, \ - "data_type should be 'mot', 'mcmot' or 'kitti'" - assert model_type in MOT_ARCH, \ - "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" - - # run tracking - n_frame = 0 - timer_avgs, timer_calls = [], [] - for seq in seqs: - infer_dir = os.path.join(data_root, seq) - if not os.path.exists(infer_dir) or not os.path.isdir(infer_dir): - logger.warning("Seq {} error, {} has no images.".format( - seq, infer_dir)) - continue - if os.path.exists(os.path.join(infer_dir, 'img1')): - infer_dir = os.path.join(infer_dir, 'img1') - - frame_rate = 30 - seqinfo = os.path.join(data_root, seq, 'seqinfo.ini') - if os.path.exists(seqinfo): - meta_info = open(seqinfo).read() - frame_rate = int(meta_info[meta_info.find('frameRate') + 10: - meta_info.find('\nseqLength')]) - - save_dir = os.path.join(output_dir, 'mot_outputs', - seq) if save_images or save_videos else None - logger.info('Evaluate seq: {}'.format(seq)) - - self.dataset.set_images(self.get_infer_images(infer_dir)) - dataloader = create('EvalMOTReader')(self.dataset, 0) - - result_filename = os.path.join(result_root, '{}.txt'.format(seq)) - - with paddle.no_grad(): - if model_type in MOT_ARCH_JDE: - results, nf, ta, tc = self._eval_seq_jde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate) - elif model_type in MOT_ARCH_SDE: - results, nf, ta, tc = self._eval_seq_sde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate, - seq_name=seq, - scaled=scaled, - det_file=os.path.join(det_results_dir, - '{}.txt'.format(seq))) - elif model_type == 'CenterTrack': - results, nf, ta, tc = self._eval_seq_centertrack( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate) - else: - raise ValueError(model_type) - - write_mot_results(result_filename, results, data_type, - self.cfg.num_classes) - n_frame += nf - timer_avgs.append(ta) - timer_calls.append(tc) - - if save_videos: - output_video_path = os.path.join(save_dir, '..', - '{}_vis.mp4'.format(seq)) - cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( - save_dir, output_video_path) - os.system(cmd_str) - logger.info('Save video in {}.'.format(output_video_path)) - - # update metrics - for metric in self._metrics: - metric.update(data_root, seq, data_type, result_root, - result_filename) - - timer_avgs = 
np.asarray(timer_avgs) - timer_calls = np.asarray(timer_calls) - all_time = np.dot(timer_avgs, timer_calls) - avg_time = all_time / np.sum(timer_calls) - logger.info('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format( - all_time, 1.0 / avg_time)) - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def get_infer_images(self, infer_dir): - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - images = set() - assert os.path.isdir(infer_dir), \ - "infer_dir {} is not a directory".format(infer_dir) - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - images.sort() - assert len(images) > 0, "no image found in {}".format(infer_dir) - logger.info("Found {} inference images in total.".format(len(images))) - return images - - def mot_predict_seq(self, - video_file, - frame_rate, - image_dir, - output_dir, - data_type='mot', - model_type='JDE', - save_images=False, - save_videos=True, - show_image=False, - scaled=False, - det_results_dir='', - draw_threshold=0.5): - assert video_file is not None or image_dir is not None, \ - "--video_file or --image_dir should be set." - assert video_file is None or os.path.isfile(video_file), \ - "{} is not a file".format(video_file) - assert image_dir is None or os.path.isdir(image_dir), \ - "{} is not a directory".format(image_dir) - - if not os.path.exists(output_dir): os.makedirs(output_dir) - result_root = os.path.join(output_dir, 'mot_results') - if not os.path.exists(result_root): os.makedirs(result_root) - assert data_type in MOT_DATA_TYPE, \ - "data_type should be 'mot', 'mcmot' or 'kitti'" - assert model_type in MOT_ARCH, \ - "model_type should be 'JDE', 'DeepSORT', 'FairMOT' or 'ByteTrack'" - - # run tracking - if video_file: - seq = video_file.split('/')[-1].split('.')[0] - self.dataset.set_video(video_file, frame_rate) - logger.info('Starting tracking video {}'.format(video_file)) - elif image_dir: - seq = image_dir.split('/')[-1].split('.')[0] - if os.path.exists(os.path.join(image_dir, 'img1')): - image_dir = os.path.join(image_dir, 'img1') - images = [ - '{}/{}'.format(image_dir, x) for x in os.listdir(image_dir) - ] - images.sort() - self.dataset.set_images(images) - logger.info('Starting tracking folder {}, found {} images'.format( - image_dir, len(images))) - else: - raise ValueError('--video_file or --image_dir should be set.') - - save_dir = os.path.join(output_dir, 'mot_outputs', - seq) if save_images or save_videos else None - - dataloader = create('TestMOTReader')(self.dataset, 0) - result_filename = os.path.join(result_root, '{}.txt'.format(seq)) - if frame_rate == -1: - frame_rate = self.dataset.frame_rate - - with paddle.no_grad(): - if model_type in MOT_ARCH_JDE: - results, nf, ta, tc = self._eval_seq_jde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate, - draw_threshold=draw_threshold) - elif model_type in MOT_ARCH_SDE: - results, nf, ta, tc = self._eval_seq_sde( - dataloader, - save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate, - seq_name=seq, - scaled=scaled, - det_file=os.path.join(det_results_dir, - '{}.txt'.format(seq)), - draw_threshold=draw_threshold) - elif model_type == 'CenterTrack': - results, nf, ta, tc = self._eval_seq_centertrack( - dataloader, - 
save_dir=save_dir, - show_image=show_image, - frame_rate=frame_rate) - else: - raise ValueError(model_type) - - if save_videos: - output_video_path = os.path.join(save_dir, '..', - '{}_vis.mp4'.format(seq)) - cmd_str = 'ffmpeg -f image2 -i {}/%05d.jpg {}'.format( - save_dir, output_video_path) - os.system(cmd_str) - logger.info('Save video in {}'.format(output_video_path)) - - write_mot_results(result_filename, results, data_type, - self.cfg.num_classes) - - -def get_trick_hyperparams(video_name, ori_buffer, ori_thresh): - if video_name[:3] != 'MOT': - # only used for MOTChallenge (MOT17, MOT20) Test-set - return ori_buffer, ori_thresh - - video_name = video_name[:8] - if 'MOT17-05' in video_name: - track_buffer = 14 - elif 'MOT17-13' in video_name: - track_buffer = 25 - else: - track_buffer = ori_buffer - - if 'MOT17-01' in video_name: - track_thresh = 0.65 - elif 'MOT17-06' in video_name: - track_thresh = 0.65 - elif 'MOT17-12' in video_name: - track_thresh = 0.7 - elif 'MOT17-14' in video_name: - track_thresh = 0.67 - else: - track_thresh = ori_thresh - - if 'MOT20-06' in video_name or 'MOT20-08' in video_name: - track_thresh = 0.3 - else: - track_thresh = ori_thresh - - return track_buffer, ori_thresh diff --git a/pdfdet/models/Paddle/ppdet/engine/trainer.py b/pdfdet/models/Paddle/ppdet/engine/trainer.py deleted file mode 100644 index f2d44d1..0000000 --- a/pdfdet/models/Paddle/ppdet/engine/trainer.py +++ /dev/null @@ -1,1321 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
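The timing summary at the end of mot_evaluate above weights each sequence's average per-frame cost by its frame count before inverting into FPS. The same arithmetic on toy numbers:

import numpy as np

timer_avgs = np.asarray([0.02, 0.03])  # seconds per frame, per sequence
timer_calls = np.asarray([600, 400])   # frames tracked, per sequence
all_time = np.dot(timer_avgs, timer_calls)   # total seconds
avg_time = all_time / np.sum(timer_calls)    # weighted seconds per frame
print('Time elapsed: {:.2f} seconds, FPS: {:.2f}'.format(
    all_time, 1.0 / avg_time))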
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import copy -import time -from tqdm import tqdm - -import numpy as np -import typing -from PIL import Image, ImageOps, ImageFile - -ImageFile.LOAD_TRUNCATED_IMAGES = True - -import paddle -import paddle.nn as nn -import paddle.distributed as dist -from paddle.distributed import fleet -from paddle.static import InputSpec -from ppdet.optimizer import ModelEMA - -from ppdet.core.workspace import create -from ppdet.utils.checkpoint import load_weight, load_pretrain_weight -from ppdet.utils.visualizer import visualize_results, save_result -from ppdet.metrics import get_infer_results, KeyPointTopDownCOCOEval, KeyPointTopDownCOCOWholeBadyHandEval, KeyPointTopDownMPIIEval, Pose3DEval -from ppdet.metrics import Metric, COCOMetric, VOCMetric, WiderFaceMetric, RBoxMetric, JDEDetMetric, SNIPERCOCOMetric, CULaneMetric -from ppdet.data.source.sniper_coco import SniperCOCODataSet -from ppdet.data.source.category import get_categories -import ppdet.utils.stats as stats -from ppdet.utils.fuse_utils import fuse_conv_bn -from ppdet.utils import profiler -from ppdet.modeling.post_process import multiclass_nms -from ppdet.modeling.lane_utils import imshow_lanes - -from .callbacks import Callback, ComposeCallback, LogPrinter, Checkpointer, WiferFaceEval, VisualDLWriter, SniperProposalsGenerator, WandbCallback, SemiCheckpointer, SemiLogPrinter -from .export_utils import _dump_infer_config, _prune_input_spec, apply_to_static - -from paddle.distributed.fleet.utils.hybrid_parallel_util import fused_allreduce_gradients - -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -__all__ = ['Trainer'] - -MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack'] - - -class Trainer(object): - def __init__(self, cfg, mode='train'): - self.cfg = cfg.copy() - assert mode.lower() in ['train', 'eval', 'test'], \ - "mode should be 'train', 'eval' or 'test'" - self.mode = mode.lower() - self.optimizer = None - self.is_loaded_weights = False - self.use_amp = self.cfg.get('amp', False) - self.amp_level = self.cfg.get('amp_level', 'O1') - self.custom_white_list = self.cfg.get('custom_white_list', None) - self.custom_black_list = self.cfg.get('custom_black_list', None) - self.use_master_grad = self.cfg.get('master_grad', False) - if 'slim' in cfg and cfg['slim_type'] == 'PTQ': - self.cfg['TestDataset'] = create('TestDataset')() - - # build data loader - capital_mode = self.mode.capitalize() - if cfg.architecture in MOT_ARCH and self.mode in [ - 'eval', 'test' - ] and cfg.metric not in ['COCO', 'VOC']: - self.dataset = self.cfg['{}MOTDataset'.format( - capital_mode)] = create('{}MOTDataset'.format(capital_mode))() - else: - self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( - '{}Dataset'.format(capital_mode))() - - if cfg.architecture == 'DeepSORT' and self.mode == 'train': - logger.error('DeepSORT has no need of training on mot dataset.') - sys.exit(1) - - if cfg.architecture == 'FairMOT' and self.mode == 'eval': - images = self.parse_mot_images(cfg) - self.dataset.set_images(images) - - if self.mode == 'train': - self.loader = create('{}Reader'.format(capital_mode))( - self.dataset, cfg.worker_num) - - if cfg.architecture == 'JDE' and self.mode == 'train': - self.cfg['JDEEmbeddingHead'][ - 'num_identities'] = self.dataset.num_identities_dict[0] - # JDE only support single class MOT now. 
- - if cfg.architecture == 'FairMOT' and self.mode == 'train': - self.cfg['FairMOTEmbeddingHead'][ - 'num_identities_dict'] = self.dataset.num_identities_dict - # FairMOT support single class and multi-class MOT now. - - # build model - if 'model' not in self.cfg: - self.model = create(cfg.architecture) - else: - self.model = self.cfg.model - self.is_loaded_weights = True - - if cfg.architecture == 'YOLOX': - for k, m in self.model.named_sublayers(): - if isinstance(m, nn.BatchNorm2D): - m._epsilon = 1e-3 # for amp(fp16) - m._momentum = 0.97 # 0.03 in pytorch - - #normalize params for deploy - if 'slim' in cfg and cfg['slim_type'] == 'OFA': - self.model.model.load_meanstd(cfg['TestReader'][ - 'sample_transforms']) - elif 'slim' in cfg and cfg['slim_type'] == 'Distill': - self.model.student_model.load_meanstd(cfg['TestReader'][ - 'sample_transforms']) - elif 'slim' in cfg and cfg[ - 'slim_type'] == 'DistillPrune' and self.mode == 'train': - self.model.student_model.load_meanstd(cfg['TestReader'][ - 'sample_transforms']) - else: - self.model.load_meanstd(cfg['TestReader']['sample_transforms']) - - # EvalDataset build with BatchSampler to evaluate in single device - # TODO: multi-device evaluate - if self.mode == 'eval': - if cfg.architecture == 'FairMOT': - self.loader = create('EvalMOTReader')(self.dataset, 0) - elif cfg.architecture == "METRO_Body": - reader_name = '{}Reader'.format(self.mode.capitalize()) - self.loader = create(reader_name)(self.dataset, cfg.worker_num) - else: - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - reader_name = '{}Reader'.format(self.mode.capitalize()) - # If metric is VOC, need to be set collate_batch=False. - if cfg.metric == 'VOC': - self.cfg[reader_name]['collate_batch'] = False - self.loader = create(reader_name)(self.dataset, cfg.worker_num, - self._eval_batch_sampler) - # TestDataset build after user set images, skip loader creation here - - # get Params - print_params = self.cfg.get('print_params', False) - if print_params: - params = sum([ - p.numel() for n, p in self.model.named_parameters() - if all([x not in n for x in ['_mean', '_variance', 'aux_']]) - ]) # exclude BatchNorm running status - logger.info('Model Params : {} M.'.format((params / 1e6).numpy()[ - 0])) - - # build optimizer in train mode - if self.mode == 'train': - steps_per_epoch = len(self.loader) - if steps_per_epoch < 1: - logger.warning( - "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." - ) - self.lr = create('LearningRate')(steps_per_epoch) - self.optimizer = create('OptimizerBuilder')(self.lr, self.model) - - # Unstructured pruner is only enabled in the train mode. 
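The print_params branch above counts model size while skipping BatchNorm running statistics and auxiliary-head tensors, whose parameter names carry _mean, _variance or aux_. A sketch of that filter — the int() around numel() hedges against Paddle returning a 0-d Tensor there:

import paddle.nn as nn

def count_params_m(model, skip=('_mean', '_variance', 'aux_')):
    # Sum parameter sizes whose names match none of the skip markers.
    total = sum(int(p.numel()) for n, p in model.named_parameters()
                if not any(x in n for x in skip))
    return total / 1e6  # in millions, matching the removed log line

print('Model Params : {} M.'.format(count_params_m(nn.Linear(128, 64))))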
- if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) - if self.use_amp and self.amp_level == 'O2': - paddle_version = paddle.__version__[:3] - # paddle version >= 2.5.0 or develop - if paddle_version in ["2.5", "0.0"]: - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level, - master_grad=self.use_master_grad) - else: - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level) - self.use_ema = ('use_ema' in cfg and cfg['use_ema']) - if self.use_ema: - ema_decay = self.cfg.get('ema_decay', 0.9998) - ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') - cycle_epoch = self.cfg.get('cycle_epoch', -1) - ema_black_list = self.cfg.get('ema_black_list', None) - ema_filter_no_grad = self.cfg.get('ema_filter_no_grad', False) - self.ema = ModelEMA( - self.model, - decay=ema_decay, - ema_decay_type=ema_decay_type, - cycle_epoch=cycle_epoch, - ema_black_list=ema_black_list, - ema_filter_no_grad=ema_filter_no_grad) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - self.start_epoch = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def _init_callbacks(self): - if self.mode == 'train': - if self.cfg.get('ssod_method', - False) and self.cfg['ssod_method'] == 'Semi_RTDETR': - self._callbacks = [SemiLogPrinter(self), SemiCheckpointer(self)] - else: - self._callbacks = [LogPrinter(self), Checkpointer(self)] - if self.cfg.get('use_vdl', False): - self._callbacks.append(VisualDLWriter(self)) - if self.cfg.get('save_proposals', False): - self._callbacks.append(SniperProposalsGenerator(self)) - if self.cfg.get('use_wandb', False) or 'wandb' in self.cfg: - self._callbacks.append(WandbCallback(self)) - self._compose_callback = ComposeCallback(self._callbacks) - elif self.mode == 'eval': - self._callbacks = [LogPrinter(self)] - if self.cfg.metric == 'WiderFace': - self._callbacks.append(WiferFaceEval(self)) - self._compose_callback = ComposeCallback(self._callbacks) - elif self.mode == 'test' and self.cfg.get('use_vdl', False): - self._callbacks = [VisualDLWriter(self)] - self._compose_callback = ComposeCallback(self._callbacks) - else: - self._callbacks = [] - self._compose_callback = None - - def _init_metrics(self, validate=False): - if self.mode == 'test' or (self.mode == 'train' and not validate): - self._metrics = [] - return - classwise = self.cfg['classwise'] if 'classwise' in self.cfg else False - if self.cfg.metric == 'COCO' or self.cfg.metric == "SNIPERCOCO": - # TODO: bias should be unified - bias = 1 if self.cfg.get('bias', False) else 0 - output_eval = self.cfg['output_eval'] \ - if 'output_eval' in self.cfg else None - save_prediction_only = self.cfg.get('save_prediction_only', False) - - # pass clsid2catid info to metric instance to avoid multiple loading - # annotation file - clsid2catid = {v: k for k, v in self.dataset.catid2clsid.items()} \ - if self.mode == 'eval' else None - - # when do validation in train, annotation file should be get from - # EvalReader instead of self.dataset(which is TrainReader) - if self.mode == 'train' and validate: - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - dataset = eval_dataset - else: 
- dataset = self.dataset - anno_file = dataset.get_anno() - - IouType = self.cfg['IouType'] if 'IouType' in self.cfg else 'bbox' - if self.cfg.metric == "COCO": - self._metrics = [ - COCOMetric( - anno_file=anno_file, - clsid2catid=clsid2catid, - classwise=classwise, - output_eval=output_eval, - bias=bias, - IouType=IouType, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == "SNIPERCOCO": # sniper - self._metrics = [ - SNIPERCOCOMetric( - anno_file=anno_file, - dataset=dataset, - clsid2catid=clsid2catid, - classwise=classwise, - output_eval=output_eval, - bias=bias, - IouType=IouType, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'RBOX': - # TODO: bias should be unified - bias = self.cfg['bias'] if 'bias' in self.cfg else 0 - output_eval = self.cfg['output_eval'] \ - if 'output_eval' in self.cfg else None - save_prediction_only = self.cfg.get('save_prediction_only', False) - imid2path = self.cfg.get('imid2path', None) - - # when do validation in train, annotation file should be get from - # EvalReader instead of self.dataset(which is TrainReader) - anno_file = self.dataset.get_anno() - if self.mode == 'train' and validate: - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - - self._metrics = [ - RBoxMetric( - anno_file=anno_file, - classwise=classwise, - output_eval=output_eval, - bias=bias, - save_prediction_only=save_prediction_only, - imid2path=imid2path) - ] - elif self.cfg.metric == 'VOC': - output_eval = self.cfg['output_eval'] \ - if 'output_eval' in self.cfg else None - save_prediction_only = self.cfg.get('save_prediction_only', False) - - self._metrics = [ - VOCMetric( - label_list=self.dataset.get_label_list(), - class_num=self.cfg.num_classes, - map_type=self.cfg.map_type, - classwise=classwise, - output_eval=output_eval, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'WiderFace': - multi_scale = self.cfg.multi_scale_eval if 'multi_scale_eval' in self.cfg else True - self._metrics = [ - WiderFaceMetric( - image_dir=os.path.join(self.dataset.dataset_dir, - self.dataset.image_dir), - anno_file=self.dataset.get_anno(), - multi_scale=multi_scale) - ] - elif self.cfg.metric == 'KeyPointTopDownCOCOEval': - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - KeyPointTopDownCOCOEval( - anno_file, - len(eval_dataset), - self.cfg.num_joints, - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'KeyPointTopDownCOCOWholeBadyHandEval': - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - KeyPointTopDownCOCOWholeBadyHandEval( - anno_file, - len(eval_dataset), - self.cfg.num_joints, - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'KeyPointTopDownMPIIEval': - eval_dataset = self.cfg['EvalDataset'] - eval_dataset.check_or_download_dataset() - anno_file = eval_dataset.get_anno() - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - KeyPointTopDownMPIIEval( - anno_file, - len(eval_dataset), - self.cfg.num_joints, - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 
'Pose3DEval': - save_prediction_only = self.cfg.get('save_prediction_only', False) - self._metrics = [ - Pose3DEval( - self.cfg.save_dir, - save_prediction_only=save_prediction_only) - ] - elif self.cfg.metric == 'MOTDet': - self._metrics = [JDEDetMetric(), ] - elif self.cfg.metric == 'CULaneMetric': - output_eval = self.cfg.get('output_eval', None) - self._metrics = [ - CULaneMetric( - cfg=self.cfg, - output_eval=output_eval, - split=self.dataset.split, - dataset_dir=self.cfg.dataset_dir) - ] - else: - logger.warning("Metric not support for metric type {}".format( - self.cfg.metric)) - self._metrics = [] - - def _reset_metrics(self): - for metric in self._metrics: - metric.reset() - - def register_callbacks(self, callbacks): - callbacks = [c for c in list(callbacks) if c is not None] - for c in callbacks: - assert isinstance(c, Callback), \ - "metrics shoule be instances of subclass of Metric" - self._callbacks.extend(callbacks) - self._compose_callback = ComposeCallback(self._callbacks) - - def register_metrics(self, metrics): - metrics = [m for m in list(metrics) if m is not None] - for m in metrics: - assert isinstance(m, Metric), \ - "metrics shoule be instances of subclass of Metric" - self._metrics.extend(metrics) - - def load_weights(self, weights, ARSL_eval=False): - if self.is_loaded_weights: - return - self.start_epoch = 0 - load_pretrain_weight(self.model, weights, ARSL_eval) - logger.debug("Load weights {} to start training".format(weights)) - - def load_weights_sde(self, det_weights, reid_weights): - if self.model.detector: - load_weight(self.model.detector, det_weights) - if self.model.reid: - load_weight(self.model.reid, reid_weights) - else: - load_weight(self.model.reid, reid_weights) - - def resume_weights(self, weights): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer) - else: - self.start_epoch = load_weight(self.model, weights, self.optimizer, - self.ema if self.use_ema else None) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - - def train(self, validate=False): - assert self.mode == 'train', "Model not in 'train' mode" - Init_mark = False - if validate: - self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( - "EvalDataset")() - - model = self.model - if self.cfg.get('to_static', False): - model = apply_to_static(self.cfg, model) - sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and - (self.cfg.use_gpu or self.cfg.use_mlu) and self._nranks > 1) - if sync_bn: - model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) - - # enabel auto mixed precision mode - if self.use_amp: - scaler = paddle.amp.GradScaler( - enable=self.cfg.use_gpu or self.cfg.use_npu or self.cfg.use_mlu, - init_loss_scaling=self.cfg.get('init_loss_scaling', 1024)) - # get distributed model - if self.cfg.get('fleet', False): - model = fleet.distributed_model(model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - model = paddle.DataParallel( - model, find_unused_parameters=find_unused_parameters) - - self.status.update({ - 'epoch_id': self.start_epoch, - 'step_id': 0, - 'steps_per_epoch': len(self.loader) - }) - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - 
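Note: the epoch/step loop that follows is dominated by Paddle AMP plumbing. Distilled into a standalone sketch, one AMP step looks like this; `amp_train_step` is a hypothetical helper, `model`, `optimizer` and `data` are placeholders, and `scaler` stands for the `paddle.amp.GradScaler` created at the top of `train()`:

    import paddle

    def amp_train_step(model, optimizer, scaler, data):
        # Forward under auto_cast so eligible ops run in fp16;
        # scale the loss before backward to avoid fp16 underflow.
        with paddle.amp.auto_cast(level='O1'):
            loss = model(data)['loss']
        scaled = scaler.scale(loss)
        scaled.backward()
        # In dygraph mode, scaler.minimize is equivalent to unscaling
        # followed by optimizer.step (per the comment in the loop below).
        scaler.minimize(optimizer, scaled)
        optimizer.clear_grad()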
self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) - - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num) - self._flops(flops_loader) - profiler_options = self.cfg.get('profiler_options', None) - - self._compose_callback.on_train_begin(self.status) - - use_fused_allreduce_gradients = self.cfg[ - 'use_fused_allreduce_gradients'] if 'use_fused_allreduce_gradients' in self.cfg else False - - for epoch_id in range(self.start_epoch, self.cfg.epoch): - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset.set_epoch(epoch_id) - model.train() - iter_tic = time.time() - for step_id, data in enumerate(self.loader): - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - data['epoch_id'] = epoch_id - if self.cfg.get('to_static', - False) and 'image_file' in data.keys(): - data.pop('image_file') - - if self.use_amp: - if isinstance( - model, paddle. - DataParallel) and use_fused_allreduce_gradients: - with model.no_sync(): - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or - self.cfg.use_npu or self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - scaled_loss = scaler.scale(loss) - scaled_loss.backward() - fused_allreduce_gradients( - list(model.parameters()), None) - else: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_npu or - self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - scaled_loss = scaler.scale(loss) - scaled_loss.backward() - # in dygraph mode, optimizer.minimize is equal to optimizer.step - scaler.minimize(self.optimizer, scaled_loss) - else: - if isinstance( - model, paddle. 
- DataParallel) and use_fused_allreduce_gradients: - with model.no_sync(): - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() - fused_allreduce_gradients( - list(model.parameters()), None) - else: - # model forward - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() - self.optimizer.step() - curr_lr = self.optimizer.get_lr() - self.lr.step() - if self.cfg.get('unstructured_prune'): - self.pruner.step() - self.optimizer.clear_grad() - self.status['learning_rate'] = curr_lr - - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(outputs) - - self.status['batch_time'].update(time.time() - iter_tic) - self._compose_callback.on_step_end(self.status) - if self.use_ema: - self.ema.update() - iter_tic = time.time() - - if self.cfg.get('unstructured_prune'): - self.pruner.update_params() - - is_snapshot = (self._nranks < 2 or (self._local_rank == 0 or self.cfg.metric == "Pose3DEval")) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) - if is_snapshot and self.use_ema: - # apply ema weight on model - weight = copy.deepcopy(self.model.state_dict()) - self.model.set_dict(self.ema.apply()) - self.status['weight'] = weight - - self._compose_callback.on_epoch_end(self.status) - - if validate and is_snapshot: - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. - if self.cfg.metric == 'VOC': - self.cfg['EvalReader']['collate_batch'] = False - if self.cfg.metric == "Pose3DEval": - self._eval_loader = create('EvalReader')( - self._eval_dataset, self.cfg.worker_num) - else: - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - # if validation in training is enabled, metrics should be re-init - # Init_mark makes sure this code will only execute once - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - - with paddle.no_grad(): - self.status['save_best_model'] = True - self._eval_with_loader(self._eval_loader) - - if is_snapshot and self.use_ema: - # reset original weight - self.model.set_dict(weight) - self.status.pop('weight') - - self._compose_callback.on_train_end(self.status) - - def _eval_with_loader(self, loader): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - if self.use_amp: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_npu or - self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - outs = self.model(data) - else: - outs = self.model(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - 
sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def evaluate(self): - # get distributed model - if self.cfg.get('fleet', False): - self.model = fleet.distributed_model(self.model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - with paddle.no_grad(): - self._eval_with_loader(self.loader) - - def _eval_with_loader_slice(self, - loader, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method='nms', - match_threshold=0.6, - match_metric='iou'): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - - merged_bboxs = [] - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - if self.use_amp: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_npu or - self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - outs = self.model(data) - else: - outs = self.model(data) - - shift_amount = data['st_pix'] - outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount - outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount - merged_bboxs.append(outs['bbox']) - - if data['is_last'] > 0: - # merge matching predictions - merged_results = {'bbox': []} - if combine_method == 'nms': - final_boxes = multiclass_nms( - np.concatenate(merged_bboxs), self.cfg.num_classes, - match_threshold, match_metric) - merged_results['bbox'] = np.concatenate(final_boxes) - elif combine_method == 'concat': - merged_results['bbox'] = np.concatenate(merged_bboxs) - else: - raise ValueError( - "Now only support 'nms' or 'concat' to fuse detection results." 
- ) - merged_results['im_id'] = np.array([[0]]) - merged_results['bbox_num'] = np.array( - [len(merged_results['bbox'])]) - - merged_bboxs = [] - data['im_id'] = data['ori_im_id'] - # update metrics - for metric in self._metrics: - metric.update(data, merged_results) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def evaluate_slice(self, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method='nms', - match_threshold=0.6, - match_metric='iou'): - with paddle.no_grad(): - self._eval_with_loader_slice(self.loader, slice_size, overlap_ratio, - combine_method, match_threshold, - match_metric) - - def slice_predict(self, - images, - slice_size=[640, 640], - overlap_ratio=[0.25, 0.25], - combine_method='nms', - match_threshold=0.6, - match_metric='iou', - draw_threshold=0.5, - output_dir='output', - save_results=False, - visualize=True): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - self.dataset.set_slice_images(images, slice_size, overlap_ratio) - loader = create('TestReader')(self.dataset, 0) - imid2path = self.dataset.get_imid2path() - - def setup_metrics_for_loader(): - # mem - metrics = copy.deepcopy(self._metrics) - mode = self.mode - save_prediction_only = self.cfg[ - 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None - output_eval = self.cfg[ - 'output_eval'] if 'output_eval' in self.cfg else None - - # modify - self.mode = '_test' - self.cfg['save_prediction_only'] = True - self.cfg['output_eval'] = output_dir - self.cfg['imid2path'] = imid2path - self._init_metrics() - - # restore - self.mode = mode - self.cfg.pop('save_prediction_only') - if save_prediction_only is not None: - self.cfg['save_prediction_only'] = save_prediction_only - - self.cfg.pop('output_eval') - if output_eval is not None: - self.cfg['output_eval'] = output_eval - - self.cfg.pop('imid2path') - - _metrics = copy.deepcopy(self._metrics) - self._metrics = metrics - - return _metrics - - if save_results: - metrics = setup_metrics_for_loader() - else: - metrics = [] - - anno_file = self.dataset.get_anno() - clsid2catid, catid2name = get_categories( - self.cfg.metric, anno_file=anno_file) - - # Run Infer - self.status['mode'] = 'test' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('TestReader')(self.dataset, 0) - self._flops(flops_loader) - - results = [] # all images - merged_bboxs = [] # single image - for step_id, data in enumerate(tqdm(loader)): - self.status['step_id'] = step_id - # forward - outs = self.model(data) - - outs['bbox'] = outs['bbox'].numpy() # only in test mode - shift_amount = data['st_pix'] - outs['bbox'][:, 2:4] = outs['bbox'][:, 2:4] + shift_amount.numpy() - outs['bbox'][:, 4:6] = outs['bbox'][:, 4:6] + shift_amount.numpy() - merged_bboxs.append(outs['bbox']) - - if data['is_last'] > 0: - # merge matching predictions - merged_results = {'bbox': []} - if combine_method == 'nms': - final_boxes = multiclass_nms( - np.concatenate(merged_bboxs), self.cfg.num_classes, - 
match_threshold, match_metric) - merged_results['bbox'] = np.concatenate(final_boxes) - elif combine_method == 'concat': - merged_results['bbox'] = np.concatenate(merged_bboxs) - else: - raise ValueError( - "Now only support 'nms' or 'concat' to fuse detection results." - ) - merged_results['im_id'] = np.array([[0]]) - merged_results['bbox_num'] = np.array( - [len(merged_results['bbox'])]) - - merged_bboxs = [] - data['im_id'] = data['ori_im_id'] - - for _m in metrics: - _m.update(data, merged_results) - - for key in ['im_shape', 'scale_factor', 'im_id']: - if isinstance(data, typing.Sequence): - merged_results[key] = data[0][key] - else: - merged_results[key] = data[key] - for key, value in merged_results.items(): - if hasattr(value, 'numpy'): - merged_results[key] = value.numpy() - results.append(merged_results) - - for _m in metrics: - _m.accumulate() - _m.reset() - - if visualize: - for outs in results: - batch_res = get_infer_results(outs, clsid2catid) - bbox_num = outs['bbox_num'] - - start = 0 - for i, im_id in enumerate(outs['im_id']): - image_path = imid2path[int(im_id)] - image = Image.open(image_path).convert('RGB') - image = ImageOps.exif_transpose(image) - self.status['original_image'] = np.array(image.copy()) - - end = start + bbox_num[i] - bbox_res = batch_res['bbox'][start:end] \ - if 'bbox' in batch_res else None - mask_res = batch_res['mask'][start:end] \ - if 'mask' in batch_res else None - segm_res = batch_res['segm'][start:end] \ - if 'segm' in batch_res else None - keypoint_res = batch_res['keypoint'][start:end] \ - if 'keypoint' in batch_res else None - pose3d_res = batch_res['pose3d'][start:end] \ - if 'pose3d' in batch_res else None - image = visualize_results( - image, bbox_res, mask_res, segm_res, keypoint_res, - pose3d_res, int(im_id), catid2name, draw_threshold) - self.status['result_image'] = np.array(image.copy()) - if self._compose_callback: - self._compose_callback.on_step_end(self.status) - # save image with detection - save_name = self._get_save_image_name(output_dir, - image_path) - logger.info("Detection bbox results save in {}".format( - save_name)) - image.save(save_name, quality=95) - - start = end - - def predict(self, - images): - - self.dataset.set_images(images) - loader = create('TestReader')(self.dataset, 0) - - # Run Infer - self.model.eval() - results = [] - for step_id, data in enumerate(loader): - # forward - if hasattr(self.model, 'modelTeacher'): - outs = self.model.modelTeacher(data) - else: - outs = self.model(data) - - for key in ['im_shape', 'scale_factor', 'im_id']: - if isinstance(data, typing.Sequence): - outs[key] = data[0][key] - else: - outs[key] = data[key] - for key, value in outs.items(): - if hasattr(value, 'numpy'): - outs[key] = value.numpy() - results.append(outs) - - return results - - def _get_save_image_name(self, output_dir, image_path): - """ - Get save image name from source image path. 
- """ - image_name = os.path.split(image_path)[-1] - name, ext = os.path.splitext(image_name) - return os.path.join(output_dir, "{}".format(name)) + ext - - def _get_infer_cfg_and_input_spec(self, - save_dir, - prune_input=True, - kl_quant=False, - yaml_name=None): - if yaml_name is None: - yaml_name = 'infer_cfg.yml' - image_shape = None - im_shape = [None, 2] - scale_factor = [None, 2] - if self.cfg.architecture in MOT_ARCH: - test_reader_name = 'TestMOTReader' - else: - test_reader_name = 'TestReader' - if 'inputs_def' in self.cfg[test_reader_name]: - inputs_def = self.cfg[test_reader_name]['inputs_def'] - image_shape = inputs_def.get('image_shape', None) - # set image_shape=[None, 3, -1, -1] as default - if image_shape is None: - image_shape = [None, 3, -1, -1] - - if len(image_shape) == 3: - image_shape = [None] + image_shape - else: - im_shape = [image_shape[0], 2] - scale_factor = [image_shape[0], 2] - - if hasattr(self.model, 'deploy'): - self.model.deploy = True - - if 'slim' not in self.cfg: - for layer in self.model.sublayers(): - if hasattr(layer, 'convert_to_deploy'): - layer.convert_to_deploy() - - if hasattr(self.cfg, 'export') and 'fuse_conv_bn' in self.cfg[ - 'export'] and self.cfg['export']['fuse_conv_bn']: - self.model = fuse_conv_bn(self.model) - - export_post_process = self.cfg['export'].get( - 'post_process', False) if hasattr(self.cfg, 'export') else True - export_nms = self.cfg['export'].get('nms', False) if hasattr( - self.cfg, 'export') else True - export_benchmark = self.cfg['export'].get( - 'benchmark', False) if hasattr(self.cfg, 'export') else False - if hasattr(self.model, 'fuse_norm'): - self.model.fuse_norm = self.cfg['TestReader'].get('fuse_normalize', - False) - if hasattr(self.model, 'export_post_process'): - self.model.export_post_process = export_post_process if not export_benchmark else False - if hasattr(self.model, 'export_nms'): - self.model.export_nms = export_nms if not export_benchmark else False - if export_post_process and not export_benchmark: - image_shape = [None] + image_shape[1:] - - # Save infer cfg - _dump_infer_config(self.cfg, - os.path.join(save_dir, yaml_name), image_shape, - self.model) - - input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image'), - "im_shape": InputSpec( - shape=im_shape, name='im_shape'), - "scale_factor": InputSpec( - shape=scale_factor, name='scale_factor') - }] - if self.cfg.architecture == 'DeepSORT': - input_spec[0].update({ - "crops": InputSpec( - shape=[None, 3, 192, 64], name='crops') - }) - - if self.cfg.architecture == 'CLRNet': - input_spec[0].update({ - "full_img_path": str, - "img_name": str, - }) - if prune_input: - static_model = paddle.jit.to_static( - self.model, input_spec=input_spec) - # NOTE: dy2st do not pruned program, but jit.save will prune program - # input spec, prune input spec here and save with pruned input spec - pruned_input_spec = _prune_input_spec( - input_spec, static_model.forward.main_program, - static_model.forward.outputs) - else: - static_model = None - pruned_input_spec = input_spec - - # TODO: Hard code, delete it when support prune input_spec. 
- if self.cfg.architecture == 'PicoDet' and not export_post_process: - pruned_input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image') - }] - if kl_quant: - if self.cfg.architecture == 'PicoDet' or 'ppyoloe' in self.cfg.weights: - pruned_input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image'), - "scale_factor": InputSpec( - shape=scale_factor, name='scale_factor') - }] - elif 'tinypose' in self.cfg.weights: - pruned_input_spec = [{ - "image": InputSpec( - shape=image_shape, name='image') - }] - - return static_model, pruned_input_spec - - def export(self, output_dir='output_inference', for_fd=False): - if hasattr(self.model, 'aux_neck'): - self.model.__delattr__('aux_neck') - if hasattr(self.model, 'aux_head'): - self.model.__delattr__('aux_head') - self.model.eval() - - model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] - if for_fd: - save_dir = output_dir - save_name = 'inference' - yaml_name = 'inference.yml' - else: - save_dir = os.path.join(output_dir, model_name) - save_name = 'model' - yaml_name = None - - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - static_model, pruned_input_spec = self._get_infer_cfg_and_input_spec( - save_dir, yaml_name=yaml_name) - - # dy2st and save model - if 'slim' not in self.cfg or 'QAT' not in self.cfg['slim_type']: - paddle.jit.save( - static_model, - os.path.join(save_dir, save_name), - input_spec=pruned_input_spec) - else: - self.cfg.slim.save_quantized_model( - self.model, - os.path.join(save_dir, save_name), - input_spec=pruned_input_spec) - logger.info("Export model and saved in {}".format(save_dir)) - - def post_quant(self, output_dir='output_inference'): - model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] - save_dir = os.path.join(output_dir, model_name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - for idx, data in enumerate(self.loader): - self.model(data) - if idx == int(self.cfg.get('quant_batch_num', 10)): - break - - # TODO: support prune input_spec - kl_quant = True if hasattr(self.cfg.slim, 'ptq') else False - _, pruned_input_spec = self._get_infer_cfg_and_input_spec( - save_dir, prune_input=False, kl_quant=kl_quant) - - self.cfg.slim.save_quantized_model( - self.model, - os.path.join(save_dir, 'model'), - input_spec=pruned_input_spec) - logger.info("Export Post-Quant model and saved in {}".format(save_dir)) - - def _flops(self, loader): - if hasattr(self.model, 'aux_neck'): - self.model.__delattr__('aux_neck') - if hasattr(self.model, 'aux_head'): - self.model.__delattr__('aux_head') - self.model.eval() - try: - import paddleslim - except Exception as e: - logger.warning( - 'Unable to calculate flops, please install paddleslim, for example: `pip install paddleslim`' - ) - return - - from paddleslim.analysis import dygraph_flops as flops - input_data = None - for data in loader: - input_data = data - break - - input_spec = [{ - "image": input_data['image'][0].unsqueeze(0), - "im_shape": input_data['im_shape'][0].unsqueeze(0), - "scale_factor": input_data['scale_factor'][0].unsqueeze(0) - }] - flops = flops(self.model, input_spec) / (1000**3) - logger.info(" Model FLOPs : {:.6f}G. 
(image shape is {})".format( - flops, input_data['image'][0].unsqueeze(0).shape)) - - def parse_mot_images(self, cfg): - import glob - # for quant - dataset_dir = cfg['EvalMOTDataset'].dataset_dir - data_root = cfg['EvalMOTDataset'].data_root - data_root = '{}/{}'.format(dataset_dir, data_root) - seqs = os.listdir(data_root) - seqs.sort() - all_images = [] - for seq in seqs: - infer_dir = os.path.join(data_root, seq) - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - images = set() - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - images.sort() - assert len(images) > 0, "no image found in {}".format(infer_dir) - all_images.extend(images) - logger.info("Found {} inference images in total.".format( - len(images))) - return all_images - - def predict_culane(self, - images, - output_dir='output', - save_results=False, - visualize=True): - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - self.dataset.set_images(images) - loader = create('TestReader')(self.dataset, 0) - - imid2path = self.dataset.get_imid2path() - - def setup_metrics_for_loader(): - # mem - metrics = copy.deepcopy(self._metrics) - mode = self.mode - save_prediction_only = self.cfg[ - 'save_prediction_only'] if 'save_prediction_only' in self.cfg else None - output_eval = self.cfg[ - 'output_eval'] if 'output_eval' in self.cfg else None - - # modify - self.mode = '_test' - self.cfg['save_prediction_only'] = True - self.cfg['output_eval'] = output_dir - self.cfg['imid2path'] = imid2path - self._init_metrics() - - # restore - self.mode = mode - self.cfg.pop('save_prediction_only') - if save_prediction_only is not None: - self.cfg['save_prediction_only'] = save_prediction_only - - self.cfg.pop('output_eval') - if output_eval is not None: - self.cfg['output_eval'] = output_eval - - self.cfg.pop('imid2path') - - _metrics = copy.deepcopy(self._metrics) - self._metrics = metrics - - return _metrics - - if save_results: - metrics = setup_metrics_for_loader() - else: - metrics = [] - - # Run Infer - self.status['mode'] = 'test' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('TestReader')(self.dataset, 0) - self._flops(flops_loader) - results = [] - for step_id, data in enumerate(tqdm(loader)): - self.status['step_id'] = step_id - # forward - outs = self.model(data) - - for _m in metrics: - _m.update(data, outs) - - for key in ['im_shape', 'scale_factor', 'im_id']: - if isinstance(data, typing.Sequence): - outs[key] = data[0][key] - else: - outs[key] = data[key] - for key, value in outs.items(): - if hasattr(value, 'numpy'): - outs[key] = value.numpy() - results.append(outs) - - for _m in metrics: - _m.accumulate() - _m.reset() - - if visualize: - import cv2 - - for outs in results: - for i in range(len(outs['img_path'])): - lanes = outs['lanes'][i] - img_path = outs['img_path'][i] - img = cv2.imread(img_path) - out_file = os.path.join(output_dir, - os.path.basename(img_path)) - lanes = [ - lane.to_array( - sample_y_range=[ - self.cfg['sample_y']['start'], - self.cfg['sample_y']['end'], - self.cfg['sample_y']['step'] - ], - img_w=self.cfg.ori_img_w, - img_h=self.cfg.ori_img_h) for lane in lanes - ] - imshow_lanes(img, lanes, out_file=out_file) - - return results diff --git a/pdfdet/models/Paddle/ppdet/engine/trainer_cot.py b/pdfdet/models/Paddle/ppdet/engine/trainer_cot.py deleted file mode 100644 index 
38d95fa..0000000
--- a/pdfdet/models/Paddle/ppdet/engine/trainer_cot.py
+++ /dev/null
@@ -1,42 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from ppdet.core.workspace import create
-from ppdet.utils.logger import setup_logger
-logger = setup_logger('ppdet.engine')
-
-from . import Trainer
-__all__ = ['TrainerCot']
-
-class TrainerCot(Trainer):
-    """
-    Trainer for label-cotuning
-    calculate the relationship between base_classes and novel_classes
-    """
-    def __init__(self, cfg, mode='train'):
-        super(TrainerCot, self).__init__(cfg, mode)
-        self.cotuning_init()
-
-    def cotuning_init(self):
-        num_classes_novel = self.cfg['num_classes']
-
-        self.load_weights(self.cfg.pretrain_weights)
-
-        self.model.eval()
-        relationship = self.model.relationship_learning(self.loader, num_classes_novel)
-
-        self.model.init_cot_head(relationship)
-        self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
-
-
diff --git a/pdfdet/models/Paddle/ppdet/engine/trainer_ssod.py b/pdfdet/models/Paddle/ppdet/engine/trainer_ssod.py
deleted file mode 100644
index ab4a100..0000000
--- a/pdfdet/models/Paddle/ppdet/engine/trainer_ssod.py
+++ /dev/null
@@ -1,1192 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
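Note: both SSOD trainers deleted below maintain the teacher model as an exponential moving average of the student (via `ModelEMA` / `SimpleModelEMA`; the default decay of 0.9996 appears in the code that follows). A functional sketch of that update, assuming plain `paddle.nn.Layer` models; `ema_update` is a hypothetical helper, and calling it with `decay=0` copies the student outright, which is how the teacher is bootstrapped when semi-supervised training starts:

    import paddle

    def ema_update(teacher, student, decay=0.9996):
        # teacher <- decay * teacher + (1 - decay) * student,
        # applied parameter-wise to floating-point tensors only.
        t_state = teacher.state_dict()
        s_state = student.state_dict()
        for k in t_state:
            if paddle.is_floating_point(t_state[k]):
                t_state[k] = decay * t_state[k] + (1.0 - decay) * s_state[k]
        teacher.set_state_dict(t_state)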
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import copy
-import time
-import typing
-import numpy as np
-
-import paddle
-import paddle.nn as nn
-import paddle.distributed as dist
-from paddle.distributed import fleet
-from ppdet.optimizer import ModelEMA, SimpleModelEMA
-from ppdet.core.workspace import create
-from ppdet.utils.checkpoint import load_weight, load_pretrain_weight, save_model
-import ppdet.utils.stats as stats
-from ppdet.utils import profiler
-from ppdet.modeling.ssod.utils import align_weak_strong_shape
-from .trainer import Trainer
-from ppdet.utils.logger import setup_logger
-from paddle.static import InputSpec
-from ppdet.engine.export_utils import _dump_infer_config, _prune_input_spec
-MOT_ARCH = ['JDE', 'FairMOT', 'DeepSORT', 'ByteTrack', 'CenterTrack']
-
-logger = setup_logger('ppdet.engine')
-
-__all__ = ['Trainer_DenseTeacher', 'Trainer_ARSL', 'Trainer_Semi_RTDETR']
-
-
-class Trainer_DenseTeacher(Trainer):
-    def __init__(self, cfg, mode='train'):
-        self.cfg = cfg
-        assert mode.lower() in ['train', 'eval', 'test'], \
-            "mode should be 'train', 'eval' or 'test'"
-        self.mode = mode.lower()
-        self.optimizer = None
-        self.is_loaded_weights = False
-        self.use_amp = self.cfg.get('amp', False)
-        self.amp_level = self.cfg.get('amp_level', 'O1')
-        self.custom_white_list = self.cfg.get('custom_white_list', None)
-        self.custom_black_list = self.cfg.get('custom_black_list', None)
-
-        # build data loader
-        capital_mode = self.mode.capitalize()
-        self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create(
-            '{}Dataset'.format(capital_mode))()
-
-        if self.mode == 'train':
-            self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create(
-                'UnsupTrainDataset')
-            self.loader = create('SemiTrainReader')(
-                self.dataset, self.dataset_unlabel, cfg.worker_num)
-
-        # build model
-        if 'model' not in self.cfg:
-            self.model = create(cfg.architecture)
-        else:
-            self.model = self.cfg.model
-            self.is_loaded_weights = True
-
-        # EvalDataset build with BatchSampler to evaluate in single device
-        # TODO: multi-device evaluate
-        if self.mode == 'eval':
-            self._eval_batch_sampler = paddle.io.BatchSampler(
-                self.dataset, batch_size=self.cfg.EvalReader['batch_size'])
-            # If metric is VOC, need to be set collate_batch=False.
-            if cfg.metric == 'VOC':
-                cfg['EvalReader']['collate_batch'] = False
-            self.loader = create('EvalReader')(self.dataset, cfg.worker_num,
-                                               self._eval_batch_sampler)
-        # TestDataset build after user set images, skip loader creation here
-
-        # build optimizer in train mode
-        if self.mode == 'train':
-            steps_per_epoch = len(self.loader)
-            if steps_per_epoch < 1:
-                logger.warning(
-                    "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader."
-                )
-            self.lr = create('LearningRate')(steps_per_epoch)
-            self.optimizer = create('OptimizerBuilder')(self.lr, self.model)
-
-        # Unstructured pruner is only enabled in the train mode.
- if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) - if self.use_amp and self.amp_level == 'O2': - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level) - - self.use_ema = ('use_ema' in cfg and cfg['use_ema']) - if self.use_ema: - ema_decay = self.cfg.get('ema_decay', 0.9998) - ema_decay_type = self.cfg.get('ema_decay_type', 'threshold') - cycle_epoch = self.cfg.get('cycle_epoch', -1) - ema_black_list = self.cfg.get('ema_black_list', None) - self.ema = ModelEMA( - self.model, - decay=ema_decay, - ema_decay_type=ema_decay_type, - cycle_epoch=cycle_epoch, - ema_black_list=ema_black_list) - self.ema_start_iters = self.cfg.get('ema_start_iters', 0) - - # simple_ema for SSOD - self.use_simple_ema = ('use_simple_ema' in cfg and - cfg['use_simple_ema']) - if self.use_simple_ema: - self.use_ema = True - ema_decay = self.cfg.get('ema_decay', 0.9996) - self.ema = SimpleModelEMA(self.model, decay=ema_decay) - self.ema_start_iters = self.cfg.get('ema_start_iters', 0) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - self.start_epoch = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def load_weights(self, weights): - if self.is_loaded_weights: - return - self.start_epoch = 0 - load_pretrain_weight(self.model, weights) - load_pretrain_weight(self.ema.model, weights) - logger.info("Load weights {} to start training for teacher and student". - format(weights)) - - def resume_weights(self, weights, exchange=True): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer, exchange) - else: - self.start_epoch = load_weight(self.model, weights, self.optimizer, - self.ema - if self.use_ema else None, exchange) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - - def train(self, validate=False): - self.semi_start_iters = self.cfg.get('semi_start_iters', 5000) - Init_mark = False - if validate: - self.cfg['EvalDataset'] = self.cfg.EvalDataset = create( - "EvalDataset")() - - sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and - self.cfg.use_gpu and self._nranks > 1) - if sync_bn: - self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - self.model) - - if self.cfg.get('fleet', False): - self.model = fleet.distributed_model(self.model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - self.ema.model = paddle.DataParallel( - self.ema.model, find_unused_parameters=find_unused_parameters) - - self.status.update({ - 'epoch_id': self.start_epoch, - 'step_id': 0, - 'steps_per_epoch': len(self.loader), - 'exchange_save_model': True, - }) - # Note: exchange_save_model - # in DenseTeacher SSOD, the teacher model will be higher, so exchange when saving pdparams - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['training_staus'] = 
stats.TrainingStats(self.cfg.log_iter) - profiler_options = self.cfg.get('profiler_options', None) - self._compose_callback.on_train_begin(self.status) - - train_cfg = self.cfg.DenseTeacher['train_cfg'] - concat_sup_data = train_cfg.get('concat_sup_data', True) - - for param in self.ema.model.parameters(): - param.stop_gradient = True - - for epoch_id in range(self.start_epoch, self.cfg.epoch): - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset_label.set_epoch(epoch_id) - self.loader.dataset_unlabel.set_epoch(epoch_id) - iter_tic = time.time() - loss_dict = { - 'loss': paddle.to_tensor([0]), - 'loss_sup_sum': paddle.to_tensor([0]), - 'loss_unsup_sum': paddle.to_tensor([0]), - 'fg_sum': paddle.to_tensor([0]), - } - if self._nranks > 1: - for k in self.model._layers.get_loss_keys(): - loss_dict.update({k: paddle.to_tensor([0.])}) - for k in self.model._layers.get_loss_keys(): - loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) - else: - for k in self.model.get_loss_keys(): - loss_dict.update({k: paddle.to_tensor([0.])}) - for k in self.model.get_loss_keys(): - loss_dict.update({'distill_' + k: paddle.to_tensor([0.])}) - - # Note: for step_id, data in enumerate(self.loader): # enumerate bug - for step_id in range(len(self.loader)): - data = next(self.loader) - - self.model.train() - self.ema.model.eval() - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data - - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - - if data_sup_w['image'].shape != data_sup_s['image'].shape: - data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, - data_sup_s) - - data_sup_w['epoch_id'] = epoch_id - data_sup_s['epoch_id'] = epoch_id - if concat_sup_data: - for k, v in data_sup_s.items(): - if k in ['epoch_id']: - continue - data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) - loss_dict_sup = self.model(data_sup_s) - else: - loss_dict_sup_w = self.model(data_sup_w) - loss_dict_sup = self.model(data_sup_s) - for k, v in loss_dict_sup_w.items(): - loss_dict_sup[k] = (loss_dict_sup[k] + v) * 0.5 - - losses_sup = loss_dict_sup['loss'] * train_cfg['sup_weight'] - losses_sup.backward() - - losses = losses_sup.detach() - loss_dict.update(loss_dict_sup) - loss_dict.update({'loss_sup_sum': loss_dict['loss']}) - - curr_iter = len(self.loader) * epoch_id + step_id - st_iter = self.semi_start_iters - if curr_iter == st_iter: - logger.info("***" * 30) - logger.info('Semi starting ...') - logger.info("***" * 30) - if curr_iter > st_iter: - unsup_weight = train_cfg['unsup_weight'] - if train_cfg['suppress'] == 'linear': - tar_iter = st_iter * 2 - if curr_iter <= tar_iter: - unsup_weight *= (curr_iter - st_iter) / st_iter - elif train_cfg['suppress'] == 'exp': - tar_iter = st_iter + 2000 - if curr_iter <= tar_iter: - scale = np.exp((curr_iter - tar_iter) / 1000) - unsup_weight *= scale - elif train_cfg['suppress'] == 'step': - tar_iter = st_iter * 2 - if curr_iter <= tar_iter: - unsup_weight *= 0.25 - else: - raise ValueError - - if data_unsup_w['image'].shape != data_unsup_s[ - 'image'].shape: - data_unsup_w, data_unsup_s = align_weak_strong_shape( - data_unsup_w, data_unsup_s) - - data_unsup_w['epoch_id'] = epoch_id - data_unsup_s['epoch_id'] = epoch_id - - data_unsup_s['get_data'] = True - student_preds = self.model(data_unsup_s) - - with paddle.no_grad(): - 
data_unsup_w['is_teacher'] = True - teacher_preds = self.ema.model(data_unsup_w) - - train_cfg['curr_iter'] = curr_iter - train_cfg['st_iter'] = st_iter - if self._nranks > 1: - loss_dict_unsup = self.model._layers.get_ssod_loss( - student_preds, teacher_preds, train_cfg) - else: - loss_dict_unsup = self.model.get_ssod_loss( - student_preds, teacher_preds, train_cfg) - - fg_num = loss_dict_unsup["fg_sum"] - del loss_dict_unsup["fg_sum"] - distill_weights = train_cfg['loss_weight'] - loss_dict_unsup = { - k: v * distill_weights[k] - for k, v in loss_dict_unsup.items() - } - - losses_unsup = sum([ - metrics_value - for metrics_value in loss_dict_unsup.values() - ]) * unsup_weight - losses_unsup.backward() - - loss_dict.update(loss_dict_unsup) - loss_dict.update({'loss_unsup_sum': losses_unsup}) - losses += losses_unsup.detach() - loss_dict.update({"fg_sum": fg_num}) - loss_dict['loss'] = losses - - self.optimizer.step() - curr_lr = self.optimizer.get_lr() - self.lr.step() - self.optimizer.clear_grad() - self.status['learning_rate'] = curr_lr - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(loss_dict) - - self.status['batch_time'].update(time.time() - iter_tic) - self._compose_callback.on_step_end(self.status) - # Note: ema_start_iters - if self.use_ema and curr_iter == self.ema_start_iters: - logger.info("***" * 30) - logger.info('EMA starting ...') - logger.info("***" * 30) - self.ema.update(self.model, decay=0) - elif self.use_ema and curr_iter > self.ema_start_iters: - self.ema.update(self.model) - iter_tic = time.time() - - is_snapshot = (self._nranks < 2 or self._local_rank == 0) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 or epoch_id == self.end_epoch - 1) - if is_snapshot and self.use_ema: - # apply ema weight on model - weight = copy.deepcopy(self.ema.model.state_dict()) - for k, v in weight.items(): - if paddle.is_floating_point(v): - weight[k].stop_gradient = True - self.status['weight'] = weight - - self._compose_callback.on_epoch_end(self.status) - - if validate and is_snapshot: - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. 
- if self.cfg.metric == 'VOC': - self.cfg['EvalReader']['collate_batch'] = False - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - # if validation in training is enabled, metrics should be re-init - # Init_mark makes sure this code will only execute once - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - - with paddle.no_grad(): - self.status['save_best_model'] = True - self._eval_with_loader(self._eval_loader) - - if is_snapshot and self.use_ema: - self.status.pop('weight') - - self._compose_callback.on_train_end(self.status) - - def evaluate(self): - # get distributed model - if self.cfg.get('fleet', False): - self.model = fleet.distributed_model(self.model) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model = paddle.DataParallel( - self.model, find_unused_parameters=find_unused_parameters) - with paddle.no_grad(): - self._eval_with_loader(self.loader) - - def _eval_with_loader(self, loader): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - - test_cfg = self.cfg.DenseTeacher['test_cfg'] - if test_cfg['inference_on'] == 'teacher': - logger.info("***** teacher model evaluating *****") - eval_model = self.ema.model - else: - logger.info("***** student model evaluating *****") - eval_model = self.model - - eval_model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - if self.use_amp: - with paddle.amp.auto_cast( - enable=self.cfg.use_gpu or self.cfg.use_mlu, - custom_white_list=self.custom_white_list, - custom_black_list=self.custom_black_list, - level=self.amp_level): - outs = eval_model(data) - else: - outs = eval_model(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - self._reset_metrics() - - -class Trainer_ARSL(Trainer): - def __init__(self, cfg, mode='train'): - self.cfg = cfg - assert mode.lower() in ['train', 'eval', 'test'], \ - "mode should be 'train', 'eval' or 'test'" - self.mode = mode.lower() - self.optimizer = None - self.is_loaded_weights = False - capital_mode = self.mode.capitalize() - self.use_ema = False - self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( - '{}Dataset'.format(capital_mode))() - if self.mode == 'train': - self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( - 'UnsupTrainDataset') - self.loader = create('SemiTrainReader')( - self.dataset, self.dataset_unlabel, cfg.worker_num) - - # build model - if 'model' not in self.cfg: - 
self.student_model = create(cfg.architecture) - self.teacher_model = create(cfg.architecture) - self.model = EnsembleTSModel(self.teacher_model, self.student_model) - else: - self.model = self.cfg.model - self.is_loaded_weights = True - # save path for burn-in model - self.base_path = cfg.get('weights') - self.base_path = os.path.dirname(self.base_path) - - # EvalDataset build with BatchSampler to evaluate in single device - # TODO: multi-device evaluate - if self.mode == 'eval': - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - self.loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, cfg.worker_num, self._eval_batch_sampler) - # TestDataset build after user set images, skip loader creation here - - self.start_epoch = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - self.epoch_iter = self.cfg.epoch_iter # set fixed iter in each epoch to control checkpoint - - # build optimizer in train mode - if self.mode == 'train': - steps_per_epoch = self.epoch_iter - self.lr = create('LearningRate')(steps_per_epoch) - self.optimizer = create('OptimizerBuilder')(self.lr, - self.model.modelStudent) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - self.iter = 0 - - def resume_weights(self, weights): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer) - else: - self.start_epoch = load_weight(self.model, weights, self.optimizer) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - - def train(self, validate=False): - assert self.mode == 'train', "Model not in 'train' mode" - Init_mark = False - - # if validation in training is enabled, metrics should be re-init - if validate: - self._init_metrics(validate=validate) - self._reset_metrics() - - if self.cfg.get('fleet', False): - self.model.modelStudent = fleet.distributed_model( - self.model.modelStudent) - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - self.model.modelStudent = paddle.DataParallel( - self.model.modelStudent, - find_unused_parameters=find_unused_parameters) - - # set fixed iter in each epoch to control checkpoint - self.status.update({ - 'epoch_id': self.start_epoch, - 'step_id': 0, - 'steps_per_epoch': self.epoch_iter - }) - print('338 Len of DataLoader: {}'.format(len(self.loader))) - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) - - self._compose_callback.on_train_begin(self.status) - - epoch_id = self.start_epoch - self.iter = self.start_epoch * self.epoch_iter - # use iter rather than epoch to control training schedule - while self.iter < self.cfg.max_iter: - # epoch loop - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset_label.set_epoch(epoch_id) - self.loader.dataset_unlabel.set_epoch(epoch_id) - paddle.device.cuda.empty_cache() # clear GPU memory - # set model status - 
self.model.modelStudent.train() - self.model.modelTeacher.eval() - iter_tic = time.time() - - # iter loop in each epoch - for step_id in range(self.epoch_iter): - data = next(self.loader) - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - # profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - - # model forward and calculate loss - loss_dict = self.run_step_full_semisup(data) - - if (step_id + 1) % self.cfg.optimize_rate == 0: - self.optimizer.step() - self.optimizer.clear_grad() - curr_lr = self.optimizer.get_lr() - self.lr.step() - - # update log status - self.status['learning_rate'] = curr_lr - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(loss_dict) - self.status['batch_time'].update(time.time() - iter_tic) - self._compose_callback.on_step_end(self.status) - self.iter += 1 - iter_tic = time.time() - - self._compose_callback.on_epoch_end(self.status) - - if validate and (self._nranks < 2 or self._local_rank == 0) \ - and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \ - or epoch_id == self.end_epoch - 1): - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - with paddle.no_grad(): - self.status['save_best_model'] = True - # before the burn-in stage, eval the student; after it, eval the teacher - if self.iter <= self.cfg.SEMISUPNET['BURN_UP_STEP']: - print("start eval student model") - self._eval_with_loader( - self._eval_loader, mode="student") - else: - print("start eval teacher model") - self._eval_with_loader( - self._eval_loader, mode="teacher") - - epoch_id += 1 - - self._compose_callback.on_train_end(self.status) - - def merge_data(self, data1, data2): - data = copy.deepcopy(data1) - for k, v in data1.items(): - if type(v) is paddle.Tensor: - data[k] = paddle.concat(x=[data[k], data2[k]], axis=0) - elif type(v) is list: - data[k].extend(data2[k]) - return data - - def run_step_full_semisup(self, data): - label_data_k, label_data_q, unlabel_data_k, unlabel_data_q = data - data_merge = self.merge_data(label_data_k, label_data_q) - loss_sup_dict = self.model.modelStudent(data_merge, branch="supervised") - loss_dict = {} - for key in loss_sup_dict.keys(): - if key[:4] == "loss": - loss_dict[key] = loss_sup_dict[key] * 1 - losses_sup = paddle.add_n(list(loss_dict.values())) - # norm loss when using gradient accumulation - losses_sup = losses_sup / self.cfg.optimize_rate - losses_sup.backward() - - for key in loss_sup_dict.keys(): - loss_dict[key + "_pseudo"] = paddle.to_tensor([0]) - loss_dict["loss_tot"] = losses_sup - """ - semi-supervised training after burn-in stage - """ - if self.iter >= self.cfg.SEMISUPNET['BURN_UP_STEP']: - # init teacher model with burn-up weight - if self.iter == self.cfg.SEMISUPNET['BURN_UP_STEP']: - print( - 'Starting semi-supervised learning and loading the teacher model.'
- ) - self._update_teacher_model(keep_rate=0.00) - # save burn-in model - if dist.get_world_size() < 2 or dist.get_rank() == 0: - print('saving burn-in model.') - save_name = 'burnIn' - epoch_id = self.iter // self.epoch_iter - save_model(self.model, self.optimizer, self.base_path, - save_name, epoch_id) - # Update teacher model with EMA - elif (self.iter + 1) % self.cfg.optimize_rate == 0: - self._update_teacher_model( - keep_rate=self.cfg.SEMISUPNET['EMA_KEEP_RATE']) - - #warm-up weight for pseudo loss - pseudo_weight = self.cfg.SEMISUPNET['UNSUP_LOSS_WEIGHT'] - pseudo_warmup_iter = self.cfg.SEMISUPNET['PSEUDO_WARM_UP_STEPS'] - temp = self.iter - self.cfg.SEMISUPNET['BURN_UP_STEP'] - if temp <= pseudo_warmup_iter: - pseudo_weight *= (temp / pseudo_warmup_iter) - - # get teacher predictions on weak-augmented unlabeled data - with paddle.no_grad(): - teacher_pred = self.model.modelTeacher( - unlabel_data_k, branch='semi_supervised') - - # calculate unsupervised loss on strong-augmented unlabeled data - loss_unsup_dict = self.model.modelStudent( - unlabel_data_q, - branch="semi_supervised", - teacher_prediction=teacher_pred, ) - - for key in loss_unsup_dict.keys(): - if key[-6:] == "pseudo": - loss_unsup_dict[key] = loss_unsup_dict[key] * pseudo_weight - losses_unsup = paddle.add_n(list(loss_unsup_dict.values())) - # norm loss when using gradient accumulation - losses_unsup = losses_unsup / self.cfg.optimize_rate - losses_unsup.backward() - - loss_dict.update(loss_unsup_dict) - loss_dict["loss_tot"] += losses_unsup - return loss_dict - - def export(self, output_dir='output_inference'): - self.model.eval() - model_name = os.path.splitext(os.path.split(self.cfg.filename)[-1])[0] - save_dir = os.path.join(output_dir, model_name) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - image_shape = None - if self.cfg.architecture in MOT_ARCH: - test_reader_name = 'TestMOTReader' - else: - test_reader_name = 'TestReader' - if 'inputs_def' in self.cfg[test_reader_name]: - inputs_def = self.cfg[test_reader_name]['inputs_def'] - image_shape = inputs_def.get('image_shape', None) - # set image_shape=[3, -1, -1] as default - if image_shape is None: - image_shape = [3, -1, -1] - - self.model.modelTeacher.eval() - if hasattr(self.model.modelTeacher, 'deploy'): - self.model.modelTeacher.deploy = True - - # Save infer cfg - _dump_infer_config(self.cfg, - os.path.join(save_dir, 'infer_cfg.yml'), image_shape, - self.model.modelTeacher) - - input_spec = [{ - "image": InputSpec( - shape=[None] + image_shape, name='image'), - "im_shape": InputSpec( - shape=[None, 2], name='im_shape'), - "scale_factor": InputSpec( - shape=[None, 2], name='scale_factor') - }] - if self.cfg.architecture == 'DeepSORT': - input_spec[0].update({ - "crops": InputSpec( - shape=[None, 3, 192, 64], name='crops') - }) - - static_model = paddle.jit.to_static( - self.model.modelTeacher, input_spec=input_spec) - # NOTE: dy2st do not pruned program, but jit.save will prune program - # input spec, prune input spec here and save with pruned input spec - pruned_input_spec = _prune_input_spec(input_spec, - static_model.forward.main_program, - static_model.forward.outputs) - - # dy2st and save model - if 'slim' not in self.cfg or self.cfg['slim_type'] != 'QAT': - paddle.jit.save( - static_model, - os.path.join(save_dir, 'model'), - input_spec=pruned_input_spec) - else: - self.cfg.slim.save_quantized_model( - self.model.modelTeacher, - os.path.join(save_dir, 'model'), - input_spec=pruned_input_spec) - logger.info("Export model and saved in 
{}".format(save_dir)) - - def _eval_with_loader(self, loader, mode="teacher"): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - # self.model.eval() - self.model.modelTeacher.eval() - self.model.modelStudent.eval() - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - if mode == "teacher": - outs = self.model.modelTeacher(data) - else: - outs = self.model.modelStudent(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states for metric may performed multiple times - self._reset_metrics() - - def evaluate(self): - with paddle.no_grad(): - self._eval_with_loader(self.loader) - - @paddle.no_grad() - def _update_teacher_model(self, keep_rate=0.996): - student_model_dict = copy.deepcopy(self.model.modelStudent.state_dict()) - new_teacher_dict = dict() - for key, value in self.model.modelTeacher.state_dict().items(): - if key in student_model_dict.keys(): - v = student_model_dict[key] * (1 - keep_rate - ) + value * keep_rate - v.stop_gradient = True - new_teacher_dict[key] = v - else: - raise Exception("{} is not found in student model".format(key)) - - self.model.modelTeacher.set_dict(new_teacher_dict) - - -class EnsembleTSModel(nn.Layer): - def __init__(self, modelTeacher, modelStudent): - super(EnsembleTSModel, self).__init__() - self.modelTeacher = modelTeacher - self.modelStudent = modelStudent - - -class Trainer_Semi_RTDETR(Trainer): - def __init__(self, cfg, mode='train'): - self.cfg = cfg - assert mode.lower() in ['train', 'eval', 'test'], \ - "mode should be 'train', 'eval' or 'test'" - self.mode = mode.lower() - self.optimizer = None - self.is_loaded_weights = False - self.use_amp = self.cfg.get('amp', False) - self.amp_level = self.cfg.get('amp_level', 'O1') - self.custom_white_list = self.cfg.get('custom_white_list', None) - self.custom_black_list = self.cfg.get('custom_black_list', None) - - # build data loader - capital_mode = self.mode.capitalize() - self.dataset = self.cfg['{}Dataset'.format(capital_mode)] = create( - '{}Dataset'.format(capital_mode))() - - if self.mode == 'train': - self.dataset_unlabel = self.cfg['UnsupTrainDataset'] = create( - 'UnsupTrainDataset') - self.loader = create('SemiTrainReader')( - self.dataset, self.dataset_unlabel, cfg.worker_num) - - # build model - if 'model' not in self.cfg: - self.model = create(cfg.SSOD) - else: - self.model = self.cfg.model - self.is_loaded_weights = True - - # EvalDataset build with BatchSampler to evaluate in single device - # TODO: multi-device evaluate - if self.mode == 'eval': - self._eval_batch_sampler = paddle.io.BatchSampler( - self.dataset, batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. 
- if cfg.metric == 'VOC': - cfg['EvalReader']['collate_batch'] = False - self.loader = create('EvalReader')(self.dataset, cfg.worker_num, - self._eval_batch_sampler) - # TestDataset build after user set images, skip loader creation here - - # build optimizer in train mode - if self.mode == 'train': - steps_per_epoch = len(self.loader) - if steps_per_epoch < 1: - logger.warning( - "Samples in dataset are less than batch_size, please set smaller batch_size in TrainReader." - ) - self.lr = create('LearningRate')(steps_per_epoch) - self.optimizer = create('OptimizerBuilder')(self.lr, self.model) - - # Unstructured pruner is only enabled in the train mode. - if self.cfg.get('unstructured_prune'): - self.pruner = create('UnstructuredPruner')(self.model, - steps_per_epoch) - if self.use_amp and self.amp_level == 'O2': - self.model, self.optimizer = paddle.amp.decorate( - models=self.model, - optimizers=self.optimizer, - level=self.amp_level) - - self._nranks = dist.get_world_size() - self._local_rank = dist.get_rank() - - self.status = {} - - self.start_epoch = 0 - self.start_iter = 0 - self.end_epoch = 0 if 'epoch' not in cfg else cfg.epoch - - # initial default callbacks - self._init_callbacks() - - # initial default metrics - self._init_metrics() - self._reset_metrics() - - def load_semi_weights(self, t_weights, s_weights): - if self.is_loaded_weights: - return - self.start_epoch = 0 - load_pretrain_weight(self.model.teacher, t_weights) - load_pretrain_weight(self.model.student, s_weights) - logger.info("Load teacher weights {} to start training".format( - t_weights)) - logger.info("Load student weights {} to start training".format( - s_weights)) - - def resume_weights(self, weights, exchange=True): - # support Distill resume weights - if hasattr(self.model, 'student_model'): - self.start_epoch = load_weight(self.model.student_model, weights, - self.optimizer, exchange) - else: - self.start_iter, self.start_epoch = load_weight( - self.model, weights, self.optimizer, self.ema - if self.use_ema else None, exchange) - logger.debug("Resume weights of epoch {}".format(self.start_epoch)) - logger.debug("Resume weights of iter {}".format(self.start_iter)) - - def train(self, validate=False): - assert self.mode == 'train', "Model not in 'train' mode" - Init_mark = False - if validate: - self.cfg.EvalDataset = create("EvalDataset")() - - model = self.model - sync_bn = (getattr(self.cfg, 'norm_type', None) == 'sync_bn' and - self.cfg.use_gpu and self._nranks > 1) - if sync_bn: - # self.model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - # self.model) - model.teacher = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - model.teacher) - model.student = paddle.nn.SyncBatchNorm.convert_sync_batchnorm( - self.model.student) - - if self.cfg.get('fleet', False): - # model = fleet.distributed_model(model) - model = fleet.distributed_model(model) - - self.optimizer = fleet.distributed_optimizer(self.optimizer) - elif self._nranks > 1: - find_unused_parameters = self.cfg[ - 'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False - model = paddle.DataParallel( - model, find_unused_parameters=find_unused_parameters) - - if self.cfg.get('amp', False): - scaler = amp.GradScaler( - enable=self.cfg.use_gpu or self.cfg.use_npu, - init_loss_scaling=1024) - - self.status.update({ - 'epoch_id': self.start_epoch, - 'iter_id': self.start_iter, - # 'step_id': self.start_step, - 'steps_per_epoch': len(self.loader), - }) - - self.status['batch_time'] = stats.SmoothedValue( - self.cfg.log_iter, 
fmt='{avg:.4f}') - self.status['data_time'] = stats.SmoothedValue( - self.cfg.log_iter, fmt='{avg:.4f}') - self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter) - - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num) - self._flops(flops_loader) - profiler_options = self.cfg.get('profiler_options', None) - - self._compose_callback.on_train_begin(self.status) - iter_id = self.start_iter - self.status['iter_id'] = iter_id - self.status['eval_interval'] = self.cfg.eval_interval - self.status['save_interval'] = self.cfg.save_interval - for epoch_id in range(self.start_epoch, self.cfg.epoch): - self.status['mode'] = 'train' - self.status['epoch_id'] = epoch_id - self._compose_callback.on_epoch_begin(self.status) - self.loader.dataset_label.set_epoch(epoch_id) - self.loader.dataset_unlabel.set_epoch(epoch_id) - iter_tic = time.time() - if self._nranks > 1: - # print(model) - model._layers.teacher.eval() - model._layers.student.train() - else: - model.teacher.eval() - model.student.train() - iter_tic = time.time() - for step_id in range(len(self.loader)): - data = next(self.loader) - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s = data - data_sup_w['epoch_id'] = epoch_id - data_sup_s['epoch_id'] = epoch_id - data_unsup_w['epoch_id'] = epoch_id - data_unsup_s['epoch_id'] = epoch_id - data = [data_sup_w, data_sup_s, data_unsup_w, data_unsup_s] - iter_id += 1 - self.status['data_time'].update(time.time() - iter_tic) - self.status['step_id'] = step_id - self.status['iter_id'] = iter_id - data.append(iter_id) - profiler.add_profiler_step(profiler_options) - self._compose_callback.on_step_begin(self.status) - if self.cfg.get('amp', False): - with amp.auto_cast(enable=self.cfg.use_gpu): - # model forward - if self._nranks > 1: - outputs = model._layers(data) - else: - outputs = model(data) - loss = outputs['loss'] - - scaled_loss = scaler.scale(loss) - scaled_loss.backward() - scaler.minimize(self.optimizer, scaled_loss) - else: - outputs = model(data) - loss = outputs['loss'] - # model backward - loss.backward() - self.optimizer.step() - curr_lr = self.optimizer.get_lr() - self.lr.step() - if self.cfg.get('unstructured_prune'): - self.pruner.step() - self.optimizer.clear_grad() - # print(outputs) - # outputs=reduce_dict(outputs) - # if self.model.debug: - # check_gradient(model) - # self.check_gradient() - self.status['learning_rate'] = curr_lr - if self._nranks < 2 or self._local_rank == 0: - self.status['training_staus'].update(outputs) - - self.status['batch_time'].update(time.time() - iter_tic) - - if validate and (self._nranks < 2 or self._local_rank == 0) and \ - ((iter_id + 1) % self.cfg.eval_interval == 0): - if not hasattr(self, '_eval_loader'): - # build evaluation dataset and loader - self._eval_dataset = self.cfg.EvalDataset - self._eval_batch_sampler = \ - paddle.io.BatchSampler( - self._eval_dataset, - batch_size=self.cfg.EvalReader['batch_size']) - # If metric is VOC, need to be set collate_batch=False. 
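The AMP branch above follows Paddle's scaled-loss recipe: run the forward under `amp.auto_cast`, scale the loss before `backward()`, then let the scaler unscale the gradients and drive the optimizer. A minimal sketch of that sequence with a toy model (illustrative names; the `GradScaler` settings mirror the ones used above):

```python
import paddle
from paddle import amp

use_gpu = paddle.device.is_compiled_with_cuda()
model = paddle.nn.Linear(16, 4)
opt = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

x = paddle.randn([8, 16])
with amp.auto_cast(enable=use_gpu):
    loss = model(x).mean()       # forward runs in fp16 where it is safe
scaled = scaler.scale(loss)      # scale up to avoid fp16 gradient underflow
scaled.backward()
scaler.minimize(opt, scaled)     # unscale, step, and update the loss scaling
opt.clear_grad()
```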
- if self.cfg.metric == 'VOC': - self.cfg['EvalReader']['collate_batch'] = False - self._eval_loader = create('EvalReader')( - self._eval_dataset, - self.cfg.worker_num, - batch_sampler=self._eval_batch_sampler) - # if validation in training is enabled, metrics should be re-init - # Init_mark makes sure this code will only execute once - if validate and Init_mark == False: - Init_mark = True - self._init_metrics(validate=validate) - self._reset_metrics() - - with paddle.no_grad(): - self.status['save_best_model'] = True - self._eval_with_loader(self._eval_loader) - model._layers.student.train() - - self._compose_callback.on_step_end(self.status) - - iter_tic = time.time() - - if self.cfg.get('unstructured_prune'): - self.pruner.update_params() - self._compose_callback.on_epoch_end(self.status) - - self._compose_callback.on_train_end(self.status) - - def _eval_with_loader(self, loader): - sample_num = 0 - tic = time.time() - self._compose_callback.on_epoch_begin(self.status) - self.status['mode'] = 'eval' - self.model.eval() - if self.cfg.get('print_flops', False): - flops_loader = create('{}Reader'.format(self.mode.capitalize()))( - self.dataset, self.cfg.worker_num, self._eval_batch_sampler) - self._flops(flops_loader) - print("*****teacher evaluate*****") - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - outs = self.model.teacher(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - self._compose_callback.on_epoch_end(self.status) - # reset metric states, since a metric may be evaluated multiple times - self._reset_metrics() - - print("*****student evaluate*****") - for step_id, data in enumerate(loader): - self.status['step_id'] = step_id - self._compose_callback.on_step_begin(self.status) - # forward - outs = self.model.student(data) - - # update metrics - for metric in self._metrics: - metric.update(data, outs) - - # multi-scale inputs: all inputs have same im_id - if isinstance(data, typing.Sequence): - sample_num += data[0]['im_id'].numpy().shape[0] - else: - sample_num += data['im_id'].numpy().shape[0] - self._compose_callback.on_step_end(self.status) - - self.status['sample_num'] = sample_num - self.status['cost_time'] = time.time() - tic - - # accumulate metric to log out - for metric in self._metrics: - metric.accumulate() - metric.log() - # reset metric states, since a metric may be evaluated multiple times - self._reset_metrics() - self.status['mode'] = 'train' - - def evaluate(self): - with paddle.no_grad(): - self._eval_with_loader(self.loader) diff --git a/pdfdet/models/Paddle/ppdet/ext_op/README.md b/pdfdet/models/Paddle/ppdet/ext_op/README.md deleted file mode 100644 index 0d67062..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/README.md +++ /dev/null @@ -1,35 +0,0 @@ -# Compiling the custom OPs -The rotated-box IoU OPs follow the [custom external operator guide](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html). - -## 1. Requirements -- Paddle >= 2.0.1 -- gcc 8.2 - -## 2. Installation -``` -python setup.py install -``` - -Once compiled, the OPs can be used directly; the following is a usage example of `rbox_iou`: -``` -# import the custom op -import numpy as np -import paddle -from ext_op import rbox_iou - -paddle.set_device('gpu:0') -paddle.disable_static() - -rbox1 = np.random.rand(13000, 5) -rbox2 = np.random.rand(7, 5) - -pd_rbox1 = paddle.to_tensor(rbox1) -pd_rbox2 = paddle.to_tensor(rbox2) - -iou = rbox_iou(pd_rbox1, pd_rbox2) -print('iou', iou) -``` - -## 3. Unit tests -You can confirm that the custom operators work correctly by running the unit tests, for example: -``` -python unittest/test_matched_rbox_iou.py -``` diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc b/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc deleted file mode 100644 index b16e8c1..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -template <typename T> -void matched_rbox_iou_cpu_kernel(const int rbox_num, const T *rbox1_data_ptr, - const T *rbox2_data_ptr, T *output_data_ptr) { - - int i; - for (i = 0; i < rbox_num; i++) { - output_data_ptr[i] = - rbox_iou_single<T>(rbox1_data_ptr + i * 5, rbox2_data_ptr + i * 5); - } -} - -#define CHECK_INPUT_CPU(x) \ - PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") - -std::vector<paddle::Tensor> -MatchedRboxIouCPUForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_CPU(rbox1); - CHECK_INPUT_CPU(rbox2); - PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must have the same dim"); - - auto rbox_num = rbox1.shape()[0]; - auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::CPUPlace()); - - PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "matched_rbox_iou_cpu_kernel", ([&] { - matched_rbox_iou_cpu_kernel<data_t>( - rbox_num, rbox1.data<data_t>(), - rbox2.data<data_t>(), output.data<data_t>()); - })); - - return {output}; -} - -#ifdef PADDLE_WITH_CUDA -std::vector<paddle::Tensor> -MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2); -#endif - -#define CHECK_INPUT_SAME(x1, x2) \ - PD_CHECK(x1.place() == x2.place(), "inputs must be on the same place.") - -std::vector<paddle::Tensor> MatchedRboxIouForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_SAME(rbox1, rbox2); - if (rbox1.is_cpu()) { - return MatchedRboxIouCPUForward(rbox1, rbox2); -#ifdef PADDLE_WITH_CUDA - } else if (rbox1.is_gpu()) { - return MatchedRboxIouCUDAForward(rbox1, rbox2); -#endif - } -} - -std::vector<std::vector<int64_t>> -MatchedRboxIouInferShape(std::vector<int64_t> rbox1_shape, - std::vector<int64_t> rbox2_shape) { - return {{rbox1_shape[0]}}; -} - -std::vector<paddle::DataType> MatchedRboxIouInferDtype(paddle::DataType t1, - paddle::DataType t2) { - return {t1}; -} - -PD_BUILD_OP(matched_rbox_iou) - .Inputs({"RBOX1", "RBOX2"}) - .Outputs({"Output"}) - .SetKernelFn(PD_KERNEL(MatchedRboxIouForward)) -
.SetInferShapeFn(PD_INFER_SHAPE(MatchedRboxIouInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(MatchedRboxIouInferDtype)); diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu b/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu deleted file mode 100644 index 53454d1..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/matched_rbox_iou/matched_rbox_iou.cu +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -template -__global__ void -matched_rbox_iou_cuda_kernel(const int rbox_num, const T *rbox1_data_ptr, - const T *rbox2_data_ptr, T *output_data_ptr) { - for (int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < rbox_num; - tid += blockDim.x * gridDim.x) { - output_data_ptr[tid] = - rbox_iou_single(rbox1_data_ptr + tid * 5, rbox2_data_ptr + tid * 5); - } -} - -#define CHECK_INPUT_GPU(x) \ - PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") - -std::vector -MatchedRboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_GPU(rbox1); - CHECK_INPUT_GPU(rbox2); - PD_CHECK(rbox1.shape()[0] == rbox2.shape()[0], "inputs must be same dim"); - - auto rbox_num = rbox1.shape()[0]; - - auto output = paddle::empty({rbox_num}, rbox1.dtype(), paddle::GPUPlace()); - - const int thread_per_block = 512; - const int block_per_grid = CeilDiv(rbox_num, thread_per_block); - - PD_DISPATCH_FLOATING_TYPES( - rbox1.type(), "matched_rbox_iou_cuda_kernel", ([&] { - matched_rbox_iou_cuda_kernel< - data_t><<>>( - rbox_num, rbox1.data(), rbox2.data(), - output.data()); - })); - - return {output}; -} diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc b/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc deleted file mode 100644 index 44f4eb6..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
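A note on the two IoU operators in this extension: `rbox_iou` (defined later in this diff) produces the full pairwise matrix, while `matched_rbox_iou` above pairs row i of one tensor with row i of the other, which is why it checks that both inputs have the same first dimension. A hypothetical usage sketch, assuming `ext_op` has been built via `python setup.py install` as in the README (sizes are arbitrary):

```python
import numpy as np
import paddle
from ext_op import matched_rbox_iou, rbox_iou

a = paddle.to_tensor(np.random.rand(100, 5))  # rows are [x_ctr, y_ctr, w, h, angle]
b = paddle.to_tensor(np.random.rand(100, 5))

pairwise = rbox_iou(a, b)         # shape [100, 100]: IoU of every (i, j) pair
matched = matched_rbox_iou(a, b)  # shape [100]: IoU of row i with row i only
```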
- -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -template -void nms_rotated_cpu_kernel(const T *boxes_data, const float threshold, - const int64_t num_boxes, int64_t *num_keep_boxes, - int64_t *output_data) { - - int num_masks = CeilDiv(num_boxes, 64); - std::vector masks(num_masks, 0); - for (int64_t i = 0; i < num_boxes; ++i) { - if (masks[i / 64] & 1ULL << (i % 64)) - continue; - T box_1[5]; - for (int k = 0; k < 5; ++k) { - box_1[k] = boxes_data[i * 5 + k]; - } - for (int64_t j = i + 1; j < num_boxes; ++j) { - if (masks[j / 64] & 1ULL << (j % 64)) - continue; - T box_2[5]; - for (int k = 0; k < 5; ++k) { - box_2[k] = boxes_data[j * 5 + k]; - } - if (rbox_iou_single(box_1, box_2) > threshold) { - masks[j / 64] |= 1ULL << (j % 64); - } - } - } - int64_t output_data_idx = 0; - for (int64_t i = 0; i < num_boxes; ++i) { - if (masks[i / 64] & 1ULL << (i % 64)) - continue; - output_data[output_data_idx++] = i; - } - *num_keep_boxes = output_data_idx; - for (; output_data_idx < num_boxes; ++output_data_idx) { - output_data[output_data_idx] = 0; - } -} - -#define CHECK_INPUT_CPU(x) \ - PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") - -std::vector NMSRotatedCPUForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold) { - CHECK_INPUT_CPU(boxes); - CHECK_INPUT_CPU(scores); - - auto num_boxes = boxes.shape()[0]; - - auto order_t = - std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true)); - auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0); - - auto keep = - paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace()); - int64_t num_keep_boxes = 0; - - PD_DISPATCH_FLOATING_TYPES(boxes.type(), "nms_rotated_cpu_kernel", ([&] { - nms_rotated_cpu_kernel( - boxes_sorted.data(), threshold, - num_boxes, &num_keep_boxes, - keep.data()); - })); - - keep = keep.slice(0, num_keep_boxes); - return {paddle::gather(order_t, keep, /* axis=*/0)}; -} - -#ifdef PADDLE_WITH_CUDA -std::vector NMSRotatedCUDAForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold); -#endif - -std::vector NMSRotatedForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold) { - if (boxes.is_cpu()) { - return NMSRotatedCPUForward(boxes, scores, threshold); -#ifdef PADDLE_WITH_CUDA - } else if (boxes.is_gpu()) { - return NMSRotatedCUDAForward(boxes, scores, threshold); -#endif - } -} - -std::vector> -NMSRotatedInferShape(std::vector boxes_shape, - std::vector scores_shape) { - return {{-1}}; -} - -std::vector NMSRotatedInferDtype(paddle::DataType t1, - paddle::DataType t2) { - return {paddle::DataType::INT64}; -} - -PD_BUILD_OP(nms_rotated) - .Inputs({"Boxes", "Scores"}) - .Outputs({"Output"}) - .Attrs({"threshold: float"}) - .SetKernelFn(PD_KERNEL(NMSRotatedForward)) - .SetInferShapeFn(PD_INFER_SHAPE(NMSRotatedInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(NMSRotatedInferDtype)); \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu b/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu deleted file mode 100644 index d20dddb..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/nms_rotated/nms_rotated.cu +++ /dev/null @@ -1,96 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "../rbox_iou/rbox_iou_utils.h" -#include "paddle/extension.h" - -static const int64_t threadsPerBlock = sizeof(int64_t) * 8; - -template -__global__ void -nms_rotated_cuda_kernel(const T *boxes_data, const float threshold, - const int64_t num_boxes, int64_t *masks) { - auto raw_start = blockIdx.y; - auto col_start = blockIdx.x; - if (raw_start > col_start) - return; - const int raw_last_storage = - min(num_boxes - raw_start * threadsPerBlock, threadsPerBlock); - const int col_last_storage = - min(num_boxes - col_start * threadsPerBlock, threadsPerBlock); - if (threadIdx.x < raw_last_storage) { - int64_t mask = 0; - auto current_box_idx = raw_start * threadsPerBlock + threadIdx.x; - const T *current_box = boxes_data + current_box_idx * 5; - for (int i = 0; i < col_last_storage; ++i) { - const T *target_box = boxes_data + (col_start * threadsPerBlock + i) * 5; - if (rbox_iou_single(current_box, target_box) > threshold) { - mask |= 1ULL << i; - } - } - const int blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); - masks[current_box_idx * blocks_per_line + col_start] = mask; - } -} - -#define CHECK_INPUT_GPU(x) \ - PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") - -std::vector NMSRotatedCUDAForward(const paddle::Tensor &boxes, - const paddle::Tensor &scores, - float threshold) { - CHECK_INPUT_GPU(boxes); - CHECK_INPUT_GPU(scores); - - auto num_boxes = boxes.shape()[0]; - auto order_t = - std::get<1>(paddle::argsort(scores, /* axis=*/0, /* descending=*/true)); - auto boxes_sorted = paddle::gather(boxes, order_t, /* axis=*/0); - - const auto blocks_per_line = CeilDiv(num_boxes, threadsPerBlock); - dim3 block(threadsPerBlock); - dim3 grid(blocks_per_line, blocks_per_line); - auto mask_dev = paddle::empty({num_boxes * blocks_per_line}, - paddle::DataType::INT64, paddle::GPUPlace()); - - PD_DISPATCH_FLOATING_TYPES( - boxes.type(), "nms_rotated_cuda_kernel", ([&] { - nms_rotated_cuda_kernel<<>>( - boxes_sorted.data(), threshold, num_boxes, - mask_dev.data()); - })); - - auto mask_host = mask_dev.copy_to(paddle::CPUPlace(), true); - auto keep_host = - paddle::empty({num_boxes}, paddle::DataType::INT64, paddle::CPUPlace()); - int64_t *keep_host_ptr = keep_host.data(); - int64_t *mask_host_ptr = mask_host.data(); - std::vector remv(blocks_per_line); - int64_t last_box_num = 0; - for (int64_t i = 0; i < num_boxes; ++i) { - auto remv_element_id = i / threadsPerBlock; - auto remv_bit_id = i % threadsPerBlock; - if (!(remv[remv_element_id] & 1ULL << remv_bit_id)) { - keep_host_ptr[last_box_num++] = i; - int64_t *current_mask = mask_host_ptr + i * blocks_per_line; - for (auto j = remv_element_id; j < blocks_per_line; ++j) { - remv[j] |= current_mask[j]; - } - } - } - - keep_host = keep_host.slice(0, last_box_num); - auto keep_dev = keep_host.copy_to(paddle::GPUPlace(), true); - return {paddle::gather(order_t, keep_dev, /* axis=*/0)}; -} \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc b/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc deleted file mode 100644 index c8e7528..0000000 --- 
a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cc +++ /dev/null @@ -1,95 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "paddle/extension.h" -#include "rbox_iou_utils.h" - -template -void rbox_iou_cpu_kernel(const int rbox1_num, const int rbox2_num, - const T *rbox1_data_ptr, const T *rbox2_data_ptr, - T *output_data_ptr) { - - int i, j; - for (i = 0; i < rbox1_num; i++) { - for (j = 0; j < rbox2_num; j++) { - int offset = i * rbox2_num + j; - output_data_ptr[offset] = - rbox_iou_single(rbox1_data_ptr + i * 5, rbox2_data_ptr + j * 5); - } - } -} - -#define CHECK_INPUT_CPU(x) \ - PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") - -std::vector RboxIouCPUForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_CPU(rbox1); - CHECK_INPUT_CPU(rbox2); - - auto rbox1_num = rbox1.shape()[0]; - auto rbox2_num = rbox2.shape()[0]; - - auto output = - paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::CPUPlace()); - - PD_DISPATCH_FLOATING_TYPES(rbox1.type(), "rbox_iou_cpu_kernel", ([&] { - rbox_iou_cpu_kernel( - rbox1_num, rbox2_num, rbox1.data(), - rbox2.data(), output.data()); - })); - - return {output}; -} - -#ifdef PADDLE_WITH_CUDA -std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2); -#endif - -#define CHECK_INPUT_SAME(x1, x2) \ - PD_CHECK(x1.place() == x2.place(), "input must be smae pacle.") - -std::vector RboxIouForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_SAME(rbox1, rbox2); - if (rbox1.is_cpu()) { - return RboxIouCPUForward(rbox1, rbox2); -#ifdef PADDLE_WITH_CUDA - } else if (rbox1.is_gpu()) { - return RboxIouCUDAForward(rbox1, rbox2); -#endif - } -} - -std::vector> -RboxIouInferShape(std::vector rbox1_shape, - std::vector rbox2_shape) { - return {{rbox1_shape[0], rbox2_shape[0]}}; -} - -std::vector RboxIouInferDtype(paddle::DataType t1, - paddle::DataType t2) { - return {t1}; -} - -PD_BUILD_OP(rbox_iou) - .Inputs({"RBox1", "RBox2"}) - .Outputs({"Output"}) - .SetKernelFn(PD_KERNEL(RboxIouForward)) - .SetInferShapeFn(PD_INFER_SHAPE(RboxIouInferShape)) - .SetInferDtypeFn(PD_INFER_DTYPE(RboxIouInferDtype)); diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu b/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu deleted file mode 100644 index baedb6d..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou.cu +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#include "paddle/extension.h" -#include "rbox_iou_utils.h" - -// 2D block with 32 * 16 = 512 threads per block -const int BLOCK_DIM_X = 32; -const int BLOCK_DIM_Y = 16; - -template -__global__ void rbox_iou_cuda_kernel(const int rbox1_num, const int rbox2_num, - const T *rbox1_data_ptr, - const T *rbox2_data_ptr, - T *output_data_ptr) { - - // get row_start and col_start - const int rbox1_block_idx = blockIdx.x * blockDim.x; - const int rbox2_block_idx = blockIdx.y * blockDim.y; - - const int rbox1_thread_num = min(rbox1_num - rbox1_block_idx, blockDim.x); - const int rbox2_thread_num = min(rbox2_num - rbox2_block_idx, blockDim.y); - - __shared__ T block_boxes1[BLOCK_DIM_X * 5]; - __shared__ T block_boxes2[BLOCK_DIM_Y * 5]; - - // It's safe to copy using threadIdx.x since BLOCK_DIM_X >= BLOCK_DIM_Y - if (threadIdx.x < rbox1_thread_num && threadIdx.y == 0) { - block_boxes1[threadIdx.x * 5 + 0] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 0]; - block_boxes1[threadIdx.x * 5 + 1] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 1]; - block_boxes1[threadIdx.x * 5 + 2] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 2]; - block_boxes1[threadIdx.x * 5 + 3] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 3]; - block_boxes1[threadIdx.x * 5 + 4] = - rbox1_data_ptr[(rbox1_block_idx + threadIdx.x) * 5 + 4]; - } - - // threadIdx.x < BLOCK_DIM_Y=rbox2_thread_num, just use same condition as - // above: threadIdx.y == 0 - if (threadIdx.x < rbox2_thread_num && threadIdx.y == 0) { - block_boxes2[threadIdx.x * 5 + 0] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 0]; - block_boxes2[threadIdx.x * 5 + 1] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 1]; - block_boxes2[threadIdx.x * 5 + 2] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 2]; - block_boxes2[threadIdx.x * 5 + 3] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 3]; - block_boxes2[threadIdx.x * 5 + 4] = - rbox2_data_ptr[(rbox2_block_idx + threadIdx.x) * 5 + 4]; - } - - // sync - __syncthreads(); - - if (threadIdx.x < rbox1_thread_num && threadIdx.y < rbox2_thread_num) { - int offset = (rbox1_block_idx + threadIdx.x) * rbox2_num + rbox2_block_idx + - threadIdx.y; - output_data_ptr[offset] = rbox_iou_single( - block_boxes1 + threadIdx.x * 5, block_boxes2 + threadIdx.y * 5); - } -} - -#define CHECK_INPUT_GPU(x) \ - PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") - -std::vector RboxIouCUDAForward(const paddle::Tensor &rbox1, - const paddle::Tensor &rbox2) { - CHECK_INPUT_GPU(rbox1); - CHECK_INPUT_GPU(rbox2); - - auto rbox1_num = rbox1.shape()[0]; - auto rbox2_num = rbox2.shape()[0]; - - auto output = - paddle::empty({rbox1_num, rbox2_num}, rbox1.dtype(), paddle::GPUPlace()); - - const int blocks_x = CeilDiv(rbox1_num, BLOCK_DIM_X); - const int blocks_y = CeilDiv(rbox2_num, BLOCK_DIM_Y); - - dim3 blocks(blocks_x, blocks_y); - dim3 threads(BLOCK_DIM_X, BLOCK_DIM_Y); - - PD_DISPATCH_FLOATING_TYPES( - rbox1.type(), 
"rbox_iou_cuda_kernel", ([&] { - rbox_iou_cuda_kernel<<>>( - rbox1_num, rbox2_num, rbox1.data(), rbox2.data(), - output.data()); - })); - - return {output}; -} diff --git a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h b/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h deleted file mode 100644 index 6f275dd..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/csrc/rbox_iou/rbox_iou_utils.h +++ /dev/null @@ -1,356 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -// The code is based on -// https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/csrc/box_iou_rotated/ - -#pragma once - -#include -#include -#include - -#ifdef __CUDACC__ -// Designates functions callable from the host (CPU) and the device (GPU) -#define HOST_DEVICE __host__ __device__ -#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__ -#else -#include -#define HOST_DEVICE -#define HOST_DEVICE_INLINE HOST_DEVICE inline -#endif - -namespace { - -template struct RotatedBox { T x_ctr, y_ctr, w, h, a; }; - -template struct Point { - T x, y; - HOST_DEVICE_INLINE Point(const T &px = 0, const T &py = 0) : x(px), y(py) {} - HOST_DEVICE_INLINE Point operator+(const Point &p) const { - return Point(x + p.x, y + p.y); - } - HOST_DEVICE_INLINE Point &operator+=(const Point &p) { - x += p.x; - y += p.y; - return *this; - } - HOST_DEVICE_INLINE Point operator-(const Point &p) const { - return Point(x - p.x, y - p.y); - } - HOST_DEVICE_INLINE Point operator*(const T coeff) const { - return Point(x * coeff, y * coeff); - } -}; - -template -HOST_DEVICE_INLINE T dot_2d(const Point &A, const Point &B) { - return A.x * B.x + A.y * B.y; -} - -template -HOST_DEVICE_INLINE T cross_2d(const Point &A, const Point &B) { - return A.x * B.y - B.x * A.y; -} - -template -HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox &box, - Point (&pts)[4]) { - // M_PI / 180. 
== 0.01745329251 - // double theta = box.a * 0.01745329251; - // MODIFIED - double theta = box.a; - T cosTheta2 = (T)cos(theta) * 0.5f; - T sinTheta2 = (T)sin(theta) * 0.5f; - - // y: top --> down; x: left --> right - pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w; - pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w; - pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w; - pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w; - pts[2].x = 2 * box.x_ctr - pts[0].x; - pts[2].y = 2 * box.y_ctr - pts[0].y; - pts[3].x = 2 * box.x_ctr - pts[1].x; - pts[3].y = 2 * box.y_ctr - pts[1].y; -} - -template -HOST_DEVICE_INLINE int get_intersection_points(const Point (&pts1)[4], - const Point (&pts2)[4], - Point (&intersections)[24]) { - // Line vector - // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1] - Point vec1[4], vec2[4]; - for (int i = 0; i < 4; i++) { - vec1[i] = pts1[(i + 1) % 4] - pts1[i]; - vec2[i] = pts2[(i + 1) % 4] - pts2[i]; - } - - // Line test - test all line combos for intersection - int num = 0; // number of intersections - for (int i = 0; i < 4; i++) { - for (int j = 0; j < 4; j++) { - // Solve for 2x2 Ax=b - T det = cross_2d(vec2[j], vec1[i]); - - // This takes care of parallel lines - if (fabs(det) <= 1e-14) { - continue; - } - - auto vec12 = pts2[j] - pts1[i]; - - T t1 = cross_2d(vec2[j], vec12) / det; - T t2 = cross_2d(vec1[i], vec12) / det; - - if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) { - intersections[num++] = pts1[i] + vec1[i] * t1; - } - } - } - - // Check for vertices of rect1 inside rect2 - { - const auto &AB = vec2[0]; - const auto &DA = vec2[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - // assume ABCD is the rectangle, and P is the point to be judged - // P is inside ABCD iff. P's projection on AB lies within AB - // and P's projection on AD lies within AD - - auto AP = pts1[i] - pts2[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && - (APdotAD <= ADdotAD)) { - intersections[num++] = pts1[i]; - } - } - } - - // Reverse the check - check for vertices of rect2 inside rect1 - { - const auto &AB = vec1[0]; - const auto &DA = vec1[3]; - auto ABdotAB = dot_2d(AB, AB); - auto ADdotAD = dot_2d(DA, DA); - for (int i = 0; i < 4; i++) { - auto AP = pts2[i] - pts1[0]; - - auto APdotAB = dot_2d(AP, AB); - auto APdotAD = -dot_2d(AP, DA); - - if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) && - (APdotAD <= ADdotAD)) { - intersections[num++] = pts2[i]; - } - } - } - - return num; -} - -template -HOST_DEVICE_INLINE int convex_hull_graham(const Point (&p)[24], - const int &num_in, Point (&q)[24], - bool shift_to_zero = false) { - assert(num_in >= 2); - - // Step 1: - // Find point with minimum y - // if more than 1 points have the same minimum y, - // pick the one with the minimum x. 
- int t = 0; - for (int i = 1; i < num_in; i++) { - if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) { - t = i; - } - } - auto &start = p[t]; // starting point - - // Step 2: - // Subtract starting point from every points (for sorting in the next step) - for (int i = 0; i < num_in; i++) { - q[i] = p[i] - start; - } - - // Swap the starting point to position 0 - auto tmp = q[0]; - q[0] = q[t]; - q[t] = tmp; - - // Step 3: - // Sort point 1 ~ num_in according to their relative cross-product values - // (essentially sorting according to angles) - // If the angles are the same, sort according to their distance to origin - T dist[24]; - for (int i = 0; i < num_in; i++) { - dist[i] = dot_2d(q[i], q[i]); - } - -#ifdef __CUDACC__ - // CUDA version - // In the future, we can potentially use thrust - // for sorting here to improve speed (though not guaranteed) - for (int i = 1; i < num_in - 1; i++) { - for (int j = i + 1; j < num_in; j++) { - T crossProduct = cross_2d(q[i], q[j]); - if ((crossProduct < -1e-6) || - (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) { - auto q_tmp = q[i]; - q[i] = q[j]; - q[j] = q_tmp; - auto dist_tmp = dist[i]; - dist[i] = dist[j]; - dist[j] = dist_tmp; - } - } - } -#else - // CPU version - std::sort(q + 1, q + num_in, - [](const Point &A, const Point &B) -> bool { - T temp = cross_2d(A, B); - if (fabs(temp) < 1e-6) { - return dot_2d(A, A) < dot_2d(B, B); - } else { - return temp > 0; - } - }); -#endif - - // Step 4: - // Make sure there are at least 2 points (that don't overlap with each other) - // in the stack - int k; // index of the non-overlapped second point - for (k = 1; k < num_in; k++) { - if (dist[k] > 1e-8) { - break; - } - } - if (k == num_in) { - // We reach the end, which means the convex hull is just one point - q[0] = p[t]; - return 1; - } - q[1] = q[k]; - int m = 2; // 2 points in the stack - // Step 5: - // Finally we can start the scanning process. - // When a non-convex relationship between the 3 points is found - // (either concave shape or duplicated points), - // we pop the previous point from the stack - // until the 3-point relationship is convex again, or - // until the stack only contains two points - for (int i = k + 1; i < num_in; i++) { - while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) { - m--; - } - q[m++] = q[i]; - } - - // Step 6 (Optional): - // In general sense we need the original coordinates, so we - // need to shift the points back (reverting Step 2) - // But if we're only interested in getting the area/perimeter of the shape - // We can simply return. - if (!shift_to_zero) { - for (int i = 0; i < m; i++) { - q[i] += start; - } - } - - return m; -} - -template -HOST_DEVICE_INLINE T polygon_area(const Point (&q)[24], const int &m) { - if (m <= 2) { - return 0; - } - - T area = 0; - for (int i = 1; i < m - 1; i++) { - area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0])); - } - - return area / 2.0; -} - -template -HOST_DEVICE_INLINE T rboxes_intersection(const RotatedBox &box1, - const RotatedBox &box2) { - // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned - // from rotated_rect_intersection_pts - Point intersectPts[24], orderedPts[24]; - - Point pts1[4]; - Point pts2[4]; - get_rotated_vertices(box1, pts1); - get_rotated_vertices(box2, pts2); - - int num = get_intersection_points(pts1, pts2, intersectPts); - - if (num <= 2) { - return 0.0; - } - - // Convex Hull to order the intersection points in clockwise order and find - // the contour area. 
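`polygon_area` above evaluates the convex polygon's area by fanning triangles out from the first ordered vertex and summing the absolute cross products (the shoelace formula). The same computation in NumPy, as a quick cross-check (a sketch, not part of the repo):

```python
import numpy as np

def polygon_area(q: np.ndarray) -> float:
    # q: (m, 2) vertices ordered around the convex hull, as produced
    # by the Graham scan above; mirrors the C++ polygon_area().
    if len(q) <= 2:
        return 0.0
    v = q[1:] - q[0]  # fan of edge vectors rooted at q[0]
    cross = v[:-1, 0] * v[1:, 1] - v[1:, 0] * v[:-1, 1]
    return float(np.abs(cross).sum()) / 2.0
```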
- int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true); - return polygon_area(orderedPts, num_convex); -} - -} // namespace - -template -HOST_DEVICE_INLINE T rbox_iou_single(T const *const box1_raw, - T const *const box2_raw) { - // shift center to the middle point to achieve higher precision in result - RotatedBox box1, box2; - auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0; - auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0; - box1.x_ctr = box1_raw[0] - center_shift_x; - box1.y_ctr = box1_raw[1] - center_shift_y; - box1.w = box1_raw[2]; - box1.h = box1_raw[3]; - box1.a = box1_raw[4]; - box2.x_ctr = box2_raw[0] - center_shift_x; - box2.y_ctr = box2_raw[1] - center_shift_y; - box2.w = box2_raw[2]; - box2.h = box2_raw[3]; - box2.a = box2_raw[4]; - - if (box1.w < 1e-2 || box1.h < 1e-2 || box2.w < 1e-2 || box2.h < 1e-2) { - return 0.f; - } - const T area1 = box1.w * box1.h; - const T area2 = box2.w * box2.h; - - const T intersection = rboxes_intersection(box1, box2); - const T iou = intersection / (area1 + area2 - intersection); - return iou; -} - -/** - Computes ceil(a / b) -*/ - -HOST_DEVICE inline int CeilDiv(const int a, const int b) { - return (a + b - 1) / b; -} \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/ext_op/setup.py b/pdfdet/models/Paddle/ppdet/ext_op/setup.py deleted file mode 100644 index 5892f46..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/setup.py +++ /dev/null @@ -1,33 +0,0 @@ -import os -import glob -import paddle -from paddle.utils.cpp_extension import CppExtension, CUDAExtension, setup - - -def get_extensions(): - root_dir = os.path.dirname(os.path.abspath(__file__)) - ext_root_dir = os.path.join(root_dir, 'csrc') - sources = [] - for ext_name in os.listdir(ext_root_dir): - ext_dir = os.path.join(ext_root_dir, ext_name) - source = glob.glob(os.path.join(ext_dir, '*.cc')) - kwargs = dict() - if paddle.device.is_compiled_with_cuda(): - source += glob.glob(os.path.join(ext_dir, '*.cu')) - - if not source: - continue - - sources += source - - if paddle.device.is_compiled_with_cuda(): - extension = CUDAExtension( - sources, extra_compile_args={'cxx': ['-DPADDLE_WITH_CUDA']}) - else: - extension = CppExtension(sources) - - return extension - - -if __name__ == "__main__": - setup(name='ext_op', ext_modules=get_extensions()) diff --git a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_matched_rbox_iou.py b/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_matched_rbox_iou.py deleted file mode 100644 index af7b076..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_matched_rbox_iou.py +++ /dev/null @@ -1,149 +0,0 @@ -import numpy as np -import sys -import time -from shapely.geometry import Polygon -import paddle -import unittest - -from ext_op import matched_rbox_iou - - -def rbox2poly_single(rrect, get_best_begin_point=False): - """ - rrect:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - x_ctr, y_ctr, width, height, angle = rrect[:5] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - # rect 2x4 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - # poly - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) - return poly - - -def intersection(g, p): - """ - Intersection. 
- """ - - g = g[:8].reshape((4, 2)) - p = p[:8].reshape((4, 2)) - - a = g - b = p - - use_filter = True - if use_filter: - # step1: - inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) - inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) - inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) - inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) - if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: - return 0. - x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) - x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) - y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) - y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) - if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: - return 0. - - g = Polygon(g) - p = Polygon(p) - if not g.is_valid or not p.is_valid: - return 0 - - inter = Polygon(g).intersection(Polygon(p)).area - union = g.area + p.area - inter - if union == 0: - return 0 - else: - return inter / union - - -def matched_rbox_overlaps(anchors, gt_bboxes, use_cv2=False): - """ - - Args: - anchors: [M, 5] x1,y1,x2,y2,angle - gt_bboxes: [M, 5] x1,y1,x2,y2,angle - - Returns: - macthed_iou: [M] - """ - assert anchors.shape[1] == 5 - assert gt_bboxes.shape[1] == 5 - - gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] - anchors_ploy = [rbox2poly_single(e) for e in anchors] - - num = len(anchors_ploy) - iou = np.zeros((num, ), dtype=np.float64) - - start_time = time.time() - for i in range(num): - try: - iou[i] = intersection(gt_bboxes_ploy[i], anchors_ploy[i]) - except Exception as e: - print('cur gt_bboxes_ploy[i]', gt_bboxes_ploy[i], - 'anchors_ploy[j]', anchors_ploy[i], e) - return iou - - -def gen_sample(n): - rbox = np.random.rand(n, 5) - rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 - rbox[:, 4] = rbox[:, 4] - 0.5 - return rbox - - -class MatchedRBoxIoUTest(unittest.TestCase): - def setUp(self): - self.initTestCase() - self.rbox1 = gen_sample(self.n) - self.rbox2 = gen_sample(self.n) - - def initTestCase(self): - self.n = 1000 - - def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): - self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) - - def get_places(self): - places = [paddle.CPUPlace()] - if paddle.device.is_compiled_with_cuda(): - places.append(paddle.CUDAPlace(0)) - - return places - - def check_output(self, place): - paddle.disable_static() - pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) - pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) - actual_t = matched_rbox_iou(pd_rbox1, pd_rbox2).numpy() - poly_rbox1 = self.rbox1 - poly_rbox2 = self.rbox2 - poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 - poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 - expect_t = matched_rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) - self.assertAllClose( - actual_t, - expect_t, - msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( - str(place), str(expect_t), str(actual_t))) - - def test_output(self): - places = self.get_places() - for place in places: - self.check_output(place) - - -if __name__ == "__main__": - unittest.main() diff --git a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_rbox_iou.py b/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_rbox_iou.py deleted file mode 100644 index 8ef19ae..0000000 --- a/pdfdet/models/Paddle/ppdet/ext_op/unittest/test_rbox_iou.py +++ /dev/null @@ -1,151 +0,0 @@ -import numpy as np -import sys -import time -from shapely.geometry import Polygon -import paddle -import unittest - -from ext_op import rbox_iou - - -def rbox2poly_single(rrect, get_best_begin_point=False): - """ - 
rrect:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - x_ctr, y_ctr, width, height, angle = rrect[:5] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - # rect 2x4 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - # poly - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float64) - return poly - - -def intersection(g, p): - """ - Intersection. - """ - - g = g[:8].reshape((4, 2)) - p = p[:8].reshape((4, 2)) - - a = g - b = p - - use_filter = True - if use_filter: - # step1: - inter_x1 = np.maximum(np.min(a[:, 0]), np.min(b[:, 0])) - inter_x2 = np.minimum(np.max(a[:, 0]), np.max(b[:, 0])) - inter_y1 = np.maximum(np.min(a[:, 1]), np.min(b[:, 1])) - inter_y2 = np.minimum(np.max(a[:, 1]), np.max(b[:, 1])) - if inter_x1 >= inter_x2 or inter_y1 >= inter_y2: - return 0. - x1 = np.minimum(np.min(a[:, 0]), np.min(b[:, 0])) - x2 = np.maximum(np.max(a[:, 0]), np.max(b[:, 0])) - y1 = np.minimum(np.min(a[:, 1]), np.min(b[:, 1])) - y2 = np.maximum(np.max(a[:, 1]), np.max(b[:, 1])) - if x1 >= x2 or y1 >= y2 or (x2 - x1) < 2 or (y2 - y1) < 2: - return 0. - - g = Polygon(g) - p = Polygon(p) - if not g.is_valid or not p.is_valid: - return 0 - - inter = Polygon(g).intersection(Polygon(p)).area - union = g.area + p.area - inter - if union == 0: - return 0 - else: - return inter / union - - -def rbox_overlaps(anchors, gt_bboxes, use_cv2=False): - """ - - Args: - anchors: [NA, 5] x1,y1,x2,y2,angle - gt_bboxes: [M, 5] x1,y1,x2,y2,angle - - Returns: - iou: [NA, M] - """ - assert anchors.shape[1] == 5 - assert gt_bboxes.shape[1] == 5 - - gt_bboxes_ploy = [rbox2poly_single(e) for e in gt_bboxes] - anchors_ploy = [rbox2poly_single(e) for e in anchors] - - num_gt, num_anchors = len(gt_bboxes_ploy), len(anchors_ploy) - iou = np.zeros((num_anchors, num_gt), dtype=np.float64) - - start_time = time.time() - for i in range(num_anchors): - for j in range(num_gt): - try: - iou[i, j] = intersection(anchors_ploy[i], gt_bboxes_ploy[j]) - except Exception as e: - print('cur anchors_ploy[i]', anchors_ploy[i], - 'gt_bboxes_ploy[j]', gt_bboxes_ploy[j], e) - return iou - - -def gen_sample(n): - rbox = np.random.rand(n, 5) - rbox[:, 0:4] = rbox[:, 0:4] * 0.45 + 0.001 - rbox[:, 4] = rbox[:, 4] - 0.5 - return rbox - - -class RBoxIoUTest(unittest.TestCase): - def setUp(self): - self.initTestCase() - self.rbox1 = gen_sample(self.n) - self.rbox2 = gen_sample(self.m) - - def initTestCase(self): - self.n = 13000 - self.m = 7 - - def assertAllClose(self, x, y, msg, atol=5e-1, rtol=1e-2): - self.assertTrue(np.allclose(x, y, atol=atol, rtol=rtol), msg=msg) - - def get_places(self): - places = [paddle.CPUPlace()] - if paddle.device.is_compiled_with_cuda(): - places.append(paddle.CUDAPlace(0)) - - return places - - def check_output(self, place): - paddle.disable_static() - pd_rbox1 = paddle.to_tensor(self.rbox1, place=place) - pd_rbox2 = paddle.to_tensor(self.rbox2, place=place) - actual_t = rbox_iou(pd_rbox1, pd_rbox2).numpy() - poly_rbox1 = self.rbox1 - poly_rbox2 = self.rbox2 - poly_rbox1[:, 0:4] = self.rbox1[:, 0:4] * 1024 - poly_rbox2[:, 0:4] = self.rbox2[:, 0:4] * 1024 - expect_t = rbox_overlaps(poly_rbox1, poly_rbox2, use_cv2=False) - self.assertAllClose( - actual_t, - expect_t, - msg="rbox_iou has diff at {} \nExpect {}\nBut got {}".format( - str(place), str(expect_t), 
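The Shapely ground truth that both deleted tests compare against can be restated compactly; a sketch, assuming boxes are given as [x_ctr, y_ctr, w, h, angle] with the angle in radians:

    import numpy as np
    from shapely.geometry import Polygon

    def rbox_to_polygon(rbox):
        # [x_ctr, y_ctr, w, h, angle] -> Polygon over the four rotated corners
        x, y, w, h, a = rbox
        corners = np.array([[-w / 2, -h / 2], [w / 2, -h / 2],
                            [w / 2, h / 2], [-w / 2, h / 2]])
        rot = np.array([[np.cos(a), -np.sin(a)], [np.sin(a), np.cos(a)]])
        return Polygon(corners @ rot.T + [x, y])

    def rbox_iou_ref(b1, b2):
        p1, p2 = rbox_to_polygon(b1), rbox_to_polygon(b2)
        if not (p1.is_valid and p2.is_valid):
            return 0.0
        inter = p1.intersection(p2).area
        union = p1.area + p2.area - inter
        return inter / union if union > 0 else 0.0

    # two 2x2 squares at the origin, one rotated 45 degrees -> IoU ~= 0.7071
    print(rbox_iou_ref([0, 0, 2, 2, 0], [0, 0, 2, 2, np.pi / 4]))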
str(actual_t))) - - def test_output(self): - places = self.get_places() - for place in places: - self.check_output(place) - - -if __name__ == "__main__": - unittest.main() diff --git a/pdfdet/models/Paddle/ppdet/metrics/__init__.py b/pdfdet/models/Paddle/ppdet/metrics/__init__.py deleted file mode 100644 index 288f158..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/__init__.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import metrics -from . import keypoint_metrics - -from .metrics import * -from .keypoint_metrics import * -from .pose3d_metrics import * - -__all__ = metrics.__all__ + keypoint_metrics.__all__ - -from . import mot_metrics -from .mot_metrics import * -__all__ = metrics.__all__ + mot_metrics.__all__ - -from . import mcmot_metrics -from .mcmot_metrics import * -__all__ = metrics.__all__ + mcmot_metrics.__all__ - -from . import culane_metrics -from .culane_metrics import * -__all__ = metrics.__all__ + culane_metrics.__all__ \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/metrics/coco_utils.py b/pdfdet/models/Paddle/ppdet/metrics/coco_utils.py deleted file mode 100644 index b7a4d7e..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/coco_utils.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import numpy as np -import itertools - -from ppdet.metrics.json_results import get_det_res, get_det_poly_res, get_seg_res, get_solov2_segm_res, get_keypoint_res, get_pose3d_res -from ppdet.metrics.map_utils import draw_pr_curve - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -def get_infer_results(outs, catid, bias=0): - """ - Get result at the stage of inference. - The output format is dictionary containing bbox or mask result. - - For example, bbox result is a list and each element contains - image_id, category_id, bbox and score. - """ - if outs is None or len(outs) == 0: - raise ValueError( - 'The number of valid detection result if zero. Please use reasonable model and check input data.' 
- ) - - im_id = outs['im_id'] - - infer_res = {} - if 'bbox' in outs: - if len(outs['bbox']) > 0 and len(outs['bbox'][0]) > 6: - infer_res['bbox'] = get_det_poly_res( - outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) - else: - infer_res['bbox'] = get_det_res( - outs['bbox'], outs['bbox_num'], im_id, catid, bias=bias) - - if 'mask' in outs: - # mask post process - infer_res['mask'] = get_seg_res(outs['mask'], outs['bbox'], - outs['bbox_num'], im_id, catid) - - if 'segm' in outs: - infer_res['segm'] = get_solov2_segm_res(outs, im_id, catid) - - if 'keypoint' in outs: - infer_res['keypoint'] = get_keypoint_res(outs, im_id) - outs['bbox_num'] = [len(infer_res['keypoint'])] - - if 'pose3d' in outs: - infer_res['pose3d'] = get_pose3d_res(outs, im_id) - outs['bbox_num'] = [len(infer_res['pose3d'])] - - return infer_res - - -def cocoapi_eval(jsonfile, - style, - coco_gt=None, - anno_file=None, - max_dets=(100, 300, 1000), - classwise=False, - sigmas=None, - use_area=True): - """ - Args: - jsonfile (str): Evaluation json file, eg: bbox.json, mask.json. - style (str): COCOeval style, can be `bbox` , `segm` , `proposal`, `keypoints` and `keypoints_crowd`. - coco_gt (str): Whether to load COCOAPI through anno_file, - eg: coco_gt = COCO(anno_file) - anno_file (str): COCO annotations file. - max_dets (tuple): COCO evaluation maxDets. - classwise (bool): Whether per-category AP and draw P-R Curve or not. - sigmas (nparray): keypoint labelling sigmas. - use_area (bool): If gt annotations (eg. CrowdPose, AIC) - do not have 'area', please set use_area=False. - """ - assert coco_gt != None or anno_file != None - if style == 'keypoints_crowd': - #please install xtcocotools==1.6 - from xtcocotools.coco import COCO - from xtcocotools.cocoeval import COCOeval - else: - from pycocotools.coco import COCO - from pycocotools.cocoeval import COCOeval - - if coco_gt == None: - coco_gt = COCO(anno_file) - logger.info("Start evaluate...") - coco_dt = coco_gt.loadRes(jsonfile) - if style == 'proposal': - coco_eval = COCOeval(coco_gt, coco_dt, 'bbox') - coco_eval.params.useCats = 0 - coco_eval.params.maxDets = list(max_dets) - elif style == 'keypoints_crowd': - coco_eval = COCOeval(coco_gt, coco_dt, style, sigmas, use_area) - else: - coco_eval = COCOeval(coco_gt, coco_dt, style) - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - if classwise: - # Compute per-category AP and PR curve - try: - from terminaltables import AsciiTable - except Exception as e: - logger.error( - 'terminaltables not found, plaese install terminaltables. 
' - 'for example: `pip install terminaltables`.') - raise e - precisions = coco_eval.eval['precision'] - cat_ids = coco_gt.getCatIds() - # precision: (iou, recall, cls, area range, max dets) - assert len(cat_ids) == precisions.shape[2] - results_per_category = [] - for idx, catId in enumerate(cat_ids): - # area range index 0: all area ranges - # max dets index -1: typically 100 per image - nm = coco_gt.loadCats(catId)[0] - precision = precisions[:, :, idx, 0, -1] - precision = precision[precision > -1] - if precision.size: - ap = np.mean(precision) - else: - ap = float('nan') - results_per_category.append( - (str(nm["name"]), '{:0.3f}'.format(float(ap)))) - pr_array = precisions[0, :, idx, 0, 2] - recall_array = np.arange(0.0, 1.01, 0.01) - draw_pr_curve( - pr_array, - recall_array, - out_dir=style + '_pr_curve', - file_name='{}_precision_recall_curve.jpg'.format(nm["name"])) - - num_columns = min(6, len(results_per_category) * 2) - results_flatten = list(itertools.chain(*results_per_category)) - headers = ['category', 'AP'] * (num_columns // 2) - results_2d = itertools.zip_longest( - * [results_flatten[i::num_columns] for i in range(num_columns)]) - table_data = [headers] - table_data += [result for result in results_2d] - table = AsciiTable(table_data) - logger.info('Per-category of {} AP: \n{}'.format(style, table.table)) - logger.info("per-category PR curve has output to {} folder.".format( - style + '_pr_curve')) - # flush coco evaluation result - sys.stdout.flush() - return coco_eval.stats - - -def json_eval_results(metric, json_directory, dataset): - """ - cocoapi eval with already exists proposal.json, bbox.json or mask.json - """ - assert metric == 'COCO' - anno_file = dataset.get_anno() - json_file_list = ['proposal.json', 'bbox.json', 'mask.json'] - if json_directory: - assert os.path.exists( - json_directory), "The json directory:{} does not exist".format( - json_directory) - for k, v in enumerate(json_file_list): - json_file_list[k] = os.path.join(str(json_directory), v) - - coco_eval_style = ['proposal', 'bbox', 'segm'] - for i, v_json in enumerate(json_file_list): - if os.path.exists(v_json): - cocoapi_eval(v_json, coco_eval_style[i], anno_file=anno_file) - else: - logger.info("{} not exists!".format(v_json)) diff --git a/pdfdet/models/Paddle/ppdet/metrics/culane_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/culane_metrics.py deleted file mode 100644 index 848d2c1..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/culane_metrics.py +++ /dev/null @@ -1,327 +0,0 @@ -import os -import cv2 -import numpy as np -import os.path as osp -from functools import partial -from .metrics import Metric -from scipy.interpolate import splprep, splev -from scipy.optimize import linear_sum_assignment -from shapely.geometry import LineString, Polygon -from ppdet.utils.logger import setup_logger - -logger = setup_logger(__name__) - -__all__ = [ - 'draw_lane', 'discrete_cross_iou', 'continuous_cross_iou', 'interp', - 'culane_metric', 'load_culane_img_data', 'load_culane_data', - 'eval_predictions', "CULaneMetric" -] - -LIST_FILE = { - 'train': 'list/train_gt.txt', - 'val': 'list/val.txt', - 'test': 'list/test.txt', -} - -CATEGORYS = { - 'normal': 'list/test_split/test0_normal.txt', - 'crowd': 'list/test_split/test1_crowd.txt', - 'hlight': 'list/test_split/test2_hlight.txt', - 'shadow': 'list/test_split/test3_shadow.txt', - 'noline': 'list/test_split/test4_noline.txt', - 'arrow': 'list/test_split/test5_arrow.txt', - 'curve': 'list/test_split/test6_curve.txt', - 'cross': 
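With coco_utils.py now fully removed, its central flow is worth restating: cocoapi_eval handed a result JSON plus the ground-truth annotations to pycocotools. A hedged usage sketch (both file paths are placeholders):

    from pycocotools.coco import COCO
    from pycocotools.cocoeval import COCOeval

    coco_gt = COCO('annotations/instances_val.json')  # placeholder annotations
    coco_dt = coco_gt.loadRes('bbox.json')            # detections, COCO format
    coco_eval = COCOeval(coco_gt, coco_dt, 'bbox')
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()  # prints the standard COCO AP/AR table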
'list/test_split/test7_cross.txt', - 'night': 'list/test_split/test8_night.txt', -} - - -def draw_lane(lane, img=None, img_shape=None, width=30): - if img is None: - img = np.zeros(img_shape, dtype=np.uint8) - lane = lane.astype(np.int32) - for p1, p2 in zip(lane[:-1], lane[1:]): - cv2.line( - img, tuple(p1), tuple(p2), color=(255, 255, 255), thickness=width) - return img - - -def discrete_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): - xs = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in xs] - ys = [draw_lane(lane, img_shape=img_shape, width=width) > 0 for lane in ys] - - ious = np.zeros((len(xs), len(ys))) - for i, x in enumerate(xs): - for j, y in enumerate(ys): - ious[i, j] = (x & y).sum() / (x | y).sum() - return ious - - -def continuous_cross_iou(xs, ys, width=30, img_shape=(590, 1640, 3)): - h, w, _ = img_shape - image = Polygon([(0, 0), (0, h - 1), (w - 1, h - 1), (w - 1, 0)]) - xs = [ - LineString(lane).buffer( - distance=width / 2., cap_style=1, join_style=2).intersection(image) - for lane in xs - ] - ys = [ - LineString(lane).buffer( - distance=width / 2., cap_style=1, join_style=2).intersection(image) - for lane in ys - ] - - ious = np.zeros((len(xs), len(ys))) - for i, x in enumerate(xs): - for j, y in enumerate(ys): - ious[i, j] = x.intersection(y).area / x.union(y).area - - return ious - - -def interp(points, n=50): - x = [x for x, _ in points] - y = [y for _, y in points] - tck, u = splprep([x, y], s=0, t=n, k=min(3, len(points) - 1)) - - u = np.linspace(0., 1., num=(len(u) - 1) * n + 1) - return np.array(splev(u, tck)).T - - -def culane_metric(pred, - anno, - width=30, - iou_thresholds=[0.5], - official=True, - img_shape=(590, 1640, 3)): - _metric = {} - for thr in iou_thresholds: - tp = 0 - fp = 0 if len(anno) != 0 else len(pred) - fn = 0 if len(pred) != 0 else len(anno) - _metric[thr] = [tp, fp, fn] - - interp_pred = np.array( - [interp( - pred_lane, n=5) for pred_lane in pred], dtype=object) # (4, 50, 2) - interp_anno = np.array( - [interp( - anno_lane, n=5) for anno_lane in anno], dtype=object) # (4, 50, 2) - - if official: - ious = discrete_cross_iou( - interp_pred, interp_anno, width=width, img_shape=img_shape) - else: - ious = continuous_cross_iou( - interp_pred, interp_anno, width=width, img_shape=img_shape) - - row_ind, col_ind = linear_sum_assignment(1 - ious) - - _metric = {} - for thr in iou_thresholds: - tp = int((ious[row_ind, col_ind] > thr).sum()) - fp = len(pred) - tp - fn = len(anno) - tp - _metric[thr] = [tp, fp, fn] - return _metric - - -def load_culane_img_data(path): - with open(path, 'r') as data_file: - img_data = data_file.readlines() - img_data = [line.split() for line in img_data] - img_data = [list(map(float, lane)) for lane in img_data] - img_data = [[(lane[i], lane[i + 1]) for i in range(0, len(lane), 2)] - for lane in img_data] - img_data = [lane for lane in img_data if len(lane) >= 2] - - return img_data - - -def load_culane_data(data_dir, file_list_path): - with open(file_list_path, 'r') as file_list: - filepaths = [ - os.path.join(data_dir, - line[1 if line[0] == '/' else 0:].rstrip().replace( - '.jpg', '.lines.txt')) - for line in file_list.readlines() - ] - - data = [] - for path in filepaths: - img_data = load_culane_img_data(path) - data.append(img_data) - - return data - - -def eval_predictions(pred_dir, - anno_dir, - list_path, - iou_thresholds=[0.5], - width=30, - official=True, - sequential=False): - logger.info('Calculating metric for List: {}'.format(list_path)) - predictions = 
load_culane_data(pred_dir, list_path) - annotations = load_culane_data(anno_dir, list_path) - img_shape = (590, 1640, 3) - if sequential: - results = map(partial( - culane_metric, - width=width, - official=official, - iou_thresholds=iou_thresholds, - img_shape=img_shape), - predictions, - annotations) - else: - from multiprocessing import Pool, cpu_count - from itertools import repeat - with Pool(cpu_count()) as p: - results = p.starmap(culane_metric, - zip(predictions, annotations, - repeat(width), - repeat(iou_thresholds), - repeat(official), repeat(img_shape))) - - mean_f1, mean_prec, mean_recall, total_tp, total_fp, total_fn = 0, 0, 0, 0, 0, 0 - ret = {} - for thr in iou_thresholds: - tp = sum(m[thr][0] for m in results) - fp = sum(m[thr][1] for m in results) - fn = sum(m[thr][2] for m in results) - precision = float(tp) / (tp + fp) if tp != 0 else 0 - recall = float(tp) / (tp + fn) if tp != 0 else 0 - f1 = 2 * precision * recall / (precision + recall) if tp != 0 else 0 - logger.info('iou thr: {:.2f}, tp: {}, fp: {}, fn: {},' - 'precision: {}, recall: {}, f1: {}'.format( - thr, tp, fp, fn, precision, recall, f1)) - mean_f1 += f1 / len(iou_thresholds) - mean_prec += precision / len(iou_thresholds) - mean_recall += recall / len(iou_thresholds) - total_tp += tp - total_fp += fp - total_fn += fn - ret[thr] = { - 'TP': tp, - 'FP': fp, - 'FN': fn, - 'Precision': precision, - 'Recall': recall, - 'F1': f1 - } - if len(iou_thresholds) > 2: - logger.info( - 'mean result, total_tp: {}, total_fp: {}, total_fn: {},' - 'precision: {}, recall: {}, f1: {}'.format( - total_tp, total_fp, total_fn, mean_prec, mean_recall, mean_f1)) - ret['mean'] = { - 'TP': total_tp, - 'FP': total_fp, - 'FN': total_fn, - 'Precision': mean_prec, - 'Recall': mean_recall, - 'F1': mean_f1 - } - return ret - - -class CULaneMetric(Metric): - def __init__(self, - cfg, - output_eval=None, - split="test", - dataset_dir="dataset/CULane/"): - super(CULaneMetric, self).__init__() - self.output_eval = "evaluation" if output_eval is None else output_eval - self.dataset_dir = dataset_dir - self.split = split - self.list_path = osp.join(dataset_dir, LIST_FILE[split]) - self.predictions = [] - self.img_names = [] - self.lanes = [] - self.eval_results = {} - self.cfg = cfg - self.reset() - - def reset(self): - self.predictions = [] - self.img_names = [] - self.lanes = [] - self.eval_results = {} - - def get_prediction_string(self, pred): - ys = np.arange(270, 590, 8) / self.cfg.ori_img_h - out = [] - for lane in pred: - xs = lane(ys) - valid_mask = (xs >= 0) & (xs < 1) - xs = xs * self.cfg.ori_img_w - lane_xs = xs[valid_mask] - lane_ys = ys[valid_mask] * self.cfg.ori_img_h - lane_xs, lane_ys = lane_xs[::-1], lane_ys[::-1] - lane_str = ' '.join([ - '{:.5f} {:.5f}'.format(x, y) for x, y in zip(lane_xs, lane_ys) - ]) - if lane_str != '': - out.append(lane_str) - - return '\n'.join(out) - - def accumulate(self): - loss_lines = [[], [], [], []] - for idx, pred in enumerate(self.predictions): - output_dir = os.path.join(self.output_eval, - os.path.dirname(self.img_names[idx])) - output_filename = os.path.basename(self.img_names[ - idx])[:-3] + 'lines.txt' - os.makedirs(output_dir, exist_ok=True) - output = self.get_prediction_string(pred) - - # store loss lines - lanes = self.lanes[idx] - if len(lanes) - len(pred) in [1, 2, 3, 4]: - loss_lines[len(lanes) - len(pred) - 1].append(self.img_names[ - idx]) - - with open(os.path.join(output_dir, output_filename), - 'w') as out_file: - out_file.write(output) - - for i, names in enumerate(loss_lines): 
- with open( - os.path.join(output_dir, 'loss_{}_lines.txt'.format(i + 1)), - 'w') as f: - for name in names: - f.write(name + '\n') - - for cate, cate_file in CATEGORYS.items(): - result = eval_predictions( - self.output_eval, - self.dataset_dir, - os.path.join(self.dataset_dir, cate_file), - iou_thresholds=[0.5], - official=True) - - result = eval_predictions( - self.output_eval, - self.dataset_dir, - self.list_path, - iou_thresholds=np.linspace(0.5, 0.95, 10), - official=True) - self.eval_results['F1@50'] = result[0.5]['F1'] - self.eval_results['result'] = result - - def update(self, inputs, outputs): - assert len(inputs['img_name']) == len(outputs['lanes']) - self.predictions.extend(outputs['lanes']) - self.img_names.extend(inputs['img_name']) - self.lanes.extend(inputs['lane_line']) - - def log(self): - logger.info(self.eval_results) - - # abstract method for getting metric results - def get_results(self): - return self.eval_results diff --git a/pdfdet/models/Paddle/ppdet/metrics/json_results.py b/pdfdet/models/Paddle/ppdet/metrics/json_results.py deleted file mode 100644 index d2575af..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/json_results.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
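The deleted eval_predictions above reduces each IoU threshold to TP/FP/FN counts before averaging across thresholds; as a worked restatement of that final step:

    def f1_from_counts(tp, fp, fn):
        # precision/recall/F1 exactly as eval_predictions computes them
        precision = tp / (tp + fp) if tp else 0.0
        recall = tp / (tp + fn) if tp else 0.0
        f1 = 2 * precision * recall / (precision + recall) if tp else 0.0
        return precision, recall, f1

    # e.g. 800 matched lanes, 150 spurious, 120 missed:
    print(f1_from_counts(800, 150, 120))  # -> (~0.842, ~0.870, ~0.856)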
-import six -import numpy as np - - -def get_det_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): - det_res = [] - k = 0 - for i in range(len(bbox_nums)): - cur_image_id = int(image_id[i][0]) - det_nums = bbox_nums[i] - for j in range(det_nums): - dt = bboxes[k] - k = k + 1 - num_id, score, xmin, ymin, xmax, ymax = dt.tolist() - if int(num_id) < 0: - continue - category_id = label_to_cat_id_map[int(num_id)] - w = xmax - xmin + bias - h = ymax - ymin + bias - bbox = [xmin, ymin, w, h] - dt_res = { - 'image_id': cur_image_id, - 'category_id': category_id, - 'bbox': bbox, - 'score': score - } - det_res.append(dt_res) - return det_res - - -def get_det_poly_res(bboxes, bbox_nums, image_id, label_to_cat_id_map, bias=0): - det_res = [] - k = 0 - for i in range(len(bbox_nums)): - cur_image_id = int(image_id[i][0]) - det_nums = bbox_nums[i] - for j in range(det_nums): - dt = bboxes[k] - k = k + 1 - num_id, score, x1, y1, x2, y2, x3, y3, x4, y4 = dt.tolist() - if int(num_id) < 0: - continue - category_id = label_to_cat_id_map[int(num_id)] - rbox = [x1, y1, x2, y2, x3, y3, x4, y4] - dt_res = { - 'image_id': cur_image_id, - 'category_id': category_id, - 'bbox': rbox, - 'score': score - } - det_res.append(dt_res) - return det_res - - -def strip_mask(mask): - row = mask[0, 0, :] - col = mask[0, :, 0] - im_h = len(col) - np.count_nonzero(col == -1) - im_w = len(row) - np.count_nonzero(row == -1) - return mask[:, :im_h, :im_w] - - -def get_seg_res(masks, bboxes, mask_nums, image_id, label_to_cat_id_map): - import pycocotools.mask as mask_util - seg_res = [] - k = 0 - for i in range(len(mask_nums)): - cur_image_id = int(image_id[i][0]) - det_nums = mask_nums[i] - mask_i = masks[k:k + det_nums] - mask_i = strip_mask(mask_i) - for j in range(det_nums): - mask = mask_i[j].astype(np.uint8) - score = float(bboxes[k][1]) - label = int(bboxes[k][0]) - k = k + 1 - if label == -1: - continue - cat_id = label_to_cat_id_map[label] - rle = mask_util.encode( - np.array( - mask[:, :, None], order="F", dtype="uint8"))[0] - if six.PY3: - if 'counts' in rle: - rle['counts'] = rle['counts'].decode("utf8") - sg_res = { - 'image_id': cur_image_id, - 'category_id': cat_id, - 'segmentation': rle, - 'score': score - } - seg_res.append(sg_res) - return seg_res - - -def get_solov2_segm_res(results, image_id, num_id_to_cat_id_map): - import pycocotools.mask as mask_util - segm_res = [] - # for each batch - segms = results['segm'].astype(np.uint8) - clsid_labels = results['cate_label'] - clsid_scores = results['cate_score'] - lengths = segms.shape[0] - im_id = int(image_id[0][0]) - if lengths == 0 or segms is None: - return None - # for each sample - for i in range(lengths - 1): - clsid = int(clsid_labels[i]) - catid = num_id_to_cat_id_map[clsid] - score = float(clsid_scores[i]) - mask = segms[i] - segm = mask_util.encode(np.array(mask[:, :, np.newaxis], order='F'))[0] - segm['counts'] = segm['counts'].decode('utf8') - coco_res = { - 'image_id': im_id, - 'category_id': catid, - 'segmentation': segm, - 'score': score - } - segm_res.append(coco_res) - return segm_res - - -def get_keypoint_res(results, im_id): - anns = [] - preds = results['keypoint'] - for idx in range(im_id.shape[0]): - image_id = im_id[idx].item() - kpts, scores = preds[idx] - for kpt, score in zip(kpts, scores): - kpt = kpt.flatten() - ann = { - 'image_id': image_id, - 'category_id': 1, # XXX hard code - 'keypoints': kpt.tolist(), - 'score': float(score) - } - x = kpt[0::3] - y = kpt[1::3] - x0, x1, y0, y1 = np.min(x).item(), np.max(x).item(), 
np.min(y).item( - ), np.max(y).item() - ann['area'] = (x1 - x0) * (y1 - y0) - ann['bbox'] = [x0, y0, x1 - x0, y1 - y0] - anns.append(ann) - return anns - - -def get_pose3d_res(results, im_id): - anns = [] - preds = results['pose3d'] - for idx in range(im_id.shape[0]): - image_id = im_id[idx].item() - pose3d = preds[idx] - ann = { - 'image_id': image_id, - 'category_id': 1, # XXX hard code - 'pose3d': pose3d.tolist(), - 'score': float(1.) - } - anns.append(ann) - return anns diff --git a/pdfdet/models/Paddle/ppdet/metrics/keypoint_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/keypoint_metrics.py deleted file mode 100644 index 26e9ecb..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/keypoint_metrics.py +++ /dev/null @@ -1,571 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import json -from collections import defaultdict, OrderedDict -import numpy as np -import paddle -from pycocotools.coco import COCO -from pycocotools.cocoeval import COCOeval -from ..modeling.keypoint_utils import oks_nms, keypoint_pck_accuracy, keypoint_auc, keypoint_epe -from scipy.io import loadmat, savemat -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'KeyPointTopDownCOCOEval', 'KeyPointTopDownCOCOWholeBadyHandEval', - 'KeyPointTopDownMPIIEval' -] - - -class KeyPointTopDownCOCOEval(object): - """refer to - https://github.com/leoxiaobin/deep-high-resolution-net.pytorch - Copyright (c) Microsoft, under the MIT License. 
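get_keypoint_res above derives a COCO-style bbox and area from the flattened keypoint triples; a sketch of that slice-and-extent step, assuming kpt is laid out as [x0, y0, v0, x1, y1, v1, ...]:

    import numpy as np

    kpt = np.array([100., 50., 1., 140., 90., 1., 120., 200., 1.])
    x, y = kpt[0::3], kpt[1::3]        # every third value, as in the code above
    x0, x1 = x.min(), x.max()
    y0, y1 = y.min(), y.max()
    bbox = [x0, y0, x1 - x0, y1 - y0]  # [100.0, 50.0, 40.0, 150.0]
    area = (x1 - x0) * (y1 - y0)       # 6000.0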
- """ - - def __init__(self, - anno_file, - num_samples, - num_joints, - output_eval, - iou_type='keypoints', - in_vis_thre=0.2, - oks_thre=0.9, - save_prediction_only=False): - super(KeyPointTopDownCOCOEval, self).__init__() - self.coco = COCO(anno_file) - self.num_samples = num_samples - self.num_joints = num_joints - self.iou_type = iou_type - self.in_vis_thre = in_vis_thre - self.oks_thre = oks_thre - self.output_eval = output_eval - self.res_file = os.path.join(output_eval, "keypoints_results.json") - self.save_prediction_only = save_prediction_only - self.reset() - - def reset(self): - self.results = { - 'all_preds': np.zeros( - (self.num_samples, self.num_joints, 3), dtype=np.float32), - 'all_boxes': np.zeros((self.num_samples, 6)), - 'image_path': [] - } - self.eval_results = {} - self.idx = 0 - - def update(self, inputs, outputs): - kpts, _ = outputs['keypoint'][0] - - num_images = inputs['image'].shape[0] - self.results['all_preds'][self.idx:self.idx + num_images, :, 0: - 3] = kpts[:, :, 0:3] - self.results['all_boxes'][self.idx:self.idx + num_images, 0:2] = inputs[ - 'center'].numpy()[:, 0:2] if isinstance( - inputs['center'], paddle.Tensor) else inputs['center'][:, 0:2] - self.results['all_boxes'][self.idx:self.idx + num_images, 2:4] = inputs[ - 'scale'].numpy()[:, 0:2] if isinstance( - inputs['scale'], paddle.Tensor) else inputs['scale'][:, 0:2] - self.results['all_boxes'][self.idx:self.idx + num_images, 4] = np.prod( - inputs['scale'].numpy() * 200, - 1) if isinstance(inputs['scale'], paddle.Tensor) else np.prod( - inputs['scale'] * 200, 1) - self.results['all_boxes'][ - self.idx:self.idx + num_images, - 5] = np.squeeze(inputs['score'].numpy()) if isinstance( - inputs['score'], paddle.Tensor) else np.squeeze(inputs['score']) - if isinstance(inputs['im_id'], paddle.Tensor): - self.results['image_path'].extend(inputs['im_id'].numpy()) - else: - self.results['image_path'].extend(inputs['im_id']) - self.idx += num_images - - def _write_coco_keypoint_results(self, keypoints): - data_pack = [{ - 'cat_id': 1, - 'cls': 'person', - 'ann_type': 'keypoints', - 'keypoints': keypoints - }] - results = self._coco_keypoint_results_one_category_kernel(data_pack[0]) - if not os.path.exists(self.output_eval): - os.makedirs(self.output_eval) - with open(self.res_file, 'w') as f: - json.dump(results, f, sort_keys=True, indent=4) - logger.info(f'The keypoint result is saved to {self.res_file}.') - try: - json.load(open(self.res_file)) - except Exception: - content = [] - with open(self.res_file, 'r') as f: - for line in f: - content.append(line) - content[-1] = ']' - with open(self.res_file, 'w') as f: - for c in content: - f.write(c) - - def _coco_keypoint_results_one_category_kernel(self, data_pack): - cat_id = data_pack['cat_id'] - keypoints = data_pack['keypoints'] - cat_results = [] - - for img_kpts in keypoints: - if len(img_kpts) == 0: - continue - - _key_points = np.array( - [img_kpts[k]['keypoints'] for k in range(len(img_kpts))]) - _key_points = _key_points.reshape(_key_points.shape[0], -1) - - result = [{ - 'image_id': img_kpts[k]['image'], - 'category_id': cat_id, - 'keypoints': _key_points[k].tolist(), - 'score': img_kpts[k]['score'], - 'center': list(img_kpts[k]['center']), - 'scale': list(img_kpts[k]['scale']) - } for k in range(len(img_kpts))] - cat_results.extend(result) - - return cat_results - - def get_final_results(self, preds, all_boxes, img_path): - _kpts = [] - for idx, kpt in enumerate(preds): - _kpts.append({ - 'keypoints': kpt, - 'center': all_boxes[idx][0:2], - 
'scale': all_boxes[idx][2:4], - 'area': all_boxes[idx][4], - 'score': all_boxes[idx][5], - 'image': int(img_path[idx]) - }) - # image x person x (keypoints) - kpts = defaultdict(list) - for kpt in _kpts: - kpts[kpt['image']].append(kpt) - - # rescoring and oks nms - num_joints = preds.shape[1] - in_vis_thre = self.in_vis_thre - oks_thre = self.oks_thre - oks_nmsed_kpts = [] - for img in kpts.keys(): - img_kpts = kpts[img] - for n_p in img_kpts: - box_score = n_p['score'] - kpt_score = 0 - valid_num = 0 - for n_jt in range(0, num_joints): - t_s = n_p['keypoints'][n_jt][2] - if t_s > in_vis_thre: - kpt_score = kpt_score + t_s - valid_num = valid_num + 1 - if valid_num != 0: - kpt_score = kpt_score / valid_num - # rescoring - n_p['score'] = kpt_score * box_score - - keep = oks_nms([img_kpts[i] for i in range(len(img_kpts))], - oks_thre) - - if len(keep) == 0: - oks_nmsed_kpts.append(img_kpts) - else: - oks_nmsed_kpts.append([img_kpts[_keep] for _keep in keep]) - - self._write_coco_keypoint_results(oks_nmsed_kpts) - - def accumulate(self): - self.get_final_results(self.results['all_preds'], - self.results['all_boxes'], - self.results['image_path']) - if self.save_prediction_only: - logger.info(f'The keypoint result is saved to {self.res_file} ' - 'and do not evaluate the mAP.') - return - coco_dt = self.coco.loadRes(self.res_file) - coco_eval = COCOeval(self.coco, coco_dt, 'keypoints') - coco_eval.params.useSegm = None - coco_eval.evaluate() - coco_eval.accumulate() - coco_eval.summarize() - - keypoint_stats = [] - for ind in range(len(coco_eval.stats)): - keypoint_stats.append((coco_eval.stats[ind])) - self.eval_results['keypoint'] = keypoint_stats - - def log(self): - if self.save_prediction_only: - return - stats_names = [ - 'AP', 'Ap .5', 'AP .75', 'AP (M)', 'AP (L)', 'AR', 'AR .5', - 'AR .75', 'AR (M)', 'AR (L)' - ] - num_values = len(stats_names) - print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') - print('|---' * (num_values + 1) + '|') - - print(' '.join([ - '| {:.3f}'.format(value) for value in self.eval_results['keypoint'] - ]) + ' |') - - def get_results(self): - return self.eval_results - - -class KeyPointTopDownCOCOWholeBadyHandEval(object): - def __init__(self, - anno_file, - num_samples, - num_joints, - output_eval, - save_prediction_only=False): - super(KeyPointTopDownCOCOWholeBadyHandEval, self).__init__() - self.coco = COCO(anno_file) - self.num_samples = num_samples - self.num_joints = num_joints - self.output_eval = output_eval - self.res_file = os.path.join(output_eval, "keypoints_results.json") - self.save_prediction_only = save_prediction_only - self.parse_dataset() - self.reset() - - def parse_dataset(self): - gt_db = [] - num_joints = self.num_joints - coco = self.coco - img_ids = coco.getImgIds() - for img_id in img_ids: - ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False) - objs = coco.loadAnns(ann_ids) - - for obj in objs: - for type in ['left', 'right']: - if (obj[f'{type}hand_valid'] and - max(obj[f'{type}hand_kpts']) > 0): - - joints = np.zeros((num_joints, 3), dtype=np.float32) - joints_vis = np.zeros((num_joints, 3), dtype=np.float32) - - keypoints = np.array(obj[f'{type}hand_kpts']) - keypoints = keypoints.reshape(-1, 3) - joints[:, :2] = keypoints[:, :2] - joints_vis[:, :2] = np.minimum(1, keypoints[:, 2:3]) - - gt_db.append({ - 'bbox': obj[f'{type}hand_box'], - 'gt_joints': joints, - 'joints_vis': joints_vis, - }) - self.db = gt_db - - def reset(self): - self.results = { - 'preds': np.zeros( - (self.num_samples, self.num_joints, 3), 
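The rescoring step above replaces each detection's score with box_score times the mean confidence of its sufficiently visible joints, before OKS NMS is applied; a minimal restatement, with keypoints as a [K, 3] array of (x, y, confidence):

    import numpy as np

    def rescore(box_score, keypoints, in_vis_thre=0.2):
        # mean confidence over joints above the visibility threshold,
        # folded into the box score as in get_final_results above
        conf = keypoints[:, 2]
        visible = conf[conf > in_vis_thre]
        kpt_score = visible.mean() if visible.size else 0.0
        return float(kpt_score * box_score)

    pts = np.array([[10., 20., 0.8], [30., 40., 0.1], [50., 60., 0.6]])
    print(rescore(0.9, pts))  # only 0.8 and 0.6 pass 0.2 -> 0.9 * 0.7 = 0.63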
dtype=np.float32), - } - self.eval_results = {} - self.idx = 0 - - def update(self, inputs, outputs): - kpts, _ = outputs['keypoint'][0] - num_images = inputs['image'].shape[0] - self.results['preds'][self.idx:self.idx + num_images, :, 0: - 3] = kpts[:, :, 0:3] - self.idx += num_images - - def accumulate(self): - self.get_final_results(self.results['preds']) - if self.save_prediction_only: - logger.info(f'The keypoint result is saved to {self.res_file} ' - 'and do not evaluate the mAP.') - return - - self.eval_results = self.evaluate(self.res_file, ('PCK', 'AUC', 'EPE')) - - def get_final_results(self, preds): - kpts = [] - for idx, kpt in enumerate(preds): - kpts.append({'keypoints': kpt.tolist()}) - - self._write_keypoint_results(kpts) - - def _write_keypoint_results(self, keypoints): - if not os.path.exists(self.output_eval): - os.makedirs(self.output_eval) - with open(self.res_file, 'w') as f: - json.dump(keypoints, f, sort_keys=True, indent=4) - logger.info(f'The keypoint result is saved to {self.res_file}.') - try: - json.load(open(self.res_file)) - except Exception: - content = [] - with open(self.res_file, 'r') as f: - for line in f: - content.append(line) - content[-1] = ']' - with open(self.res_file, 'w') as f: - for c in content: - f.write(c) - - def log(self): - if self.save_prediction_only: - return - for item, value in self.eval_results.items(): - print("{} : {}".format(item, value)) - - def get_results(self): - return self.eval_results - - def evaluate(self, res_file, metrics, pck_thr=0.2, auc_nor=30): - """Keypoint evaluation. - - Args: - res_file (str): Json file stored prediction results. - metrics (str | list[str]): Metric to be performed. - Options: 'PCK', 'AUC', 'EPE'. - pck_thr (float): PCK threshold, default as 0.2. - auc_nor (float): AUC normalization factor, default as 30 pixel. - - Returns: - List: Evaluation results for evaluation metric. 
- """ - info_str = [] - - with open(res_file, 'r') as fin: - preds = json.load(fin) - assert len(preds) == len(self.db) - - outputs = [] - gts = [] - masks = [] - threshold_bbox = [] - - for pred, item in zip(preds, self.db): - outputs.append(np.array(pred['keypoints'])[:, :-1]) - gts.append(np.array(item['gt_joints'])[:, :-1]) - masks.append((np.array(item['joints_vis'])[:, 0]) > 0) - if 'PCK' in metrics: - bbox = np.array(item['bbox']) - bbox_thr = np.max(bbox[2:]) - threshold_bbox.append(np.array([bbox_thr, bbox_thr])) - - outputs = np.array(outputs) - gts = np.array(gts) - masks = np.array(masks) - threshold_bbox = np.array(threshold_bbox) - - if 'PCK' in metrics: - _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, - threshold_bbox) - info_str.append(('PCK', pck)) - - if 'AUC' in metrics: - info_str.append(('AUC', keypoint_auc(outputs, gts, masks, auc_nor))) - - if 'EPE' in metrics: - info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) - - name_value = OrderedDict(info_str) - - return name_value - - -class KeyPointTopDownMPIIEval(object): - def __init__(self, - anno_file, - num_samples, - num_joints, - output_eval, - oks_thre=0.9, - save_prediction_only=False): - super(KeyPointTopDownMPIIEval, self).__init__() - self.ann_file = anno_file - self.res_file = os.path.join(output_eval, "keypoints_results.json") - self.save_prediction_only = save_prediction_only - self.reset() - - def reset(self): - self.results = [] - self.eval_results = {} - self.idx = 0 - - def update(self, inputs, outputs): - kpts, _ = outputs['keypoint'][0] - - num_images = inputs['image'].shape[0] - results = {} - results['preds'] = kpts[:, :, 0:3] - results['boxes'] = np.zeros((num_images, 6)) - results['boxes'][:, 0:2] = inputs['center'].numpy()[:, 0:2] - results['boxes'][:, 2:4] = inputs['scale'].numpy()[:, 0:2] - results['boxes'][:, 4] = np.prod(inputs['scale'].numpy() * 200, 1) - results['boxes'][:, 5] = np.squeeze(inputs['score'].numpy()) - results['image_path'] = inputs['image_file'] - - self.results.append(results) - - def accumulate(self): - self._mpii_keypoint_results_save() - if self.save_prediction_only: - logger.info(f'The keypoint result is saved to {self.res_file} ' - 'and do not evaluate the mAP.') - return - - self.eval_results = self.evaluate(self.results) - - def _mpii_keypoint_results_save(self): - results = [] - for res in self.results: - if len(res) == 0: - continue - result = [{ - 'preds': res['preds'][k].tolist(), - 'boxes': res['boxes'][k].tolist(), - 'image_path': res['image_path'][k], - } for k in range(len(res))] - results.extend(result) - with open(self.res_file, 'w') as f: - json.dump(results, f, sort_keys=True, indent=4) - logger.info(f'The keypoint result is saved to {self.res_file}.') - - def log(self): - if self.save_prediction_only: - return - for item, value in self.eval_results.items(): - print("{} : {}".format(item, value)) - - def get_results(self): - return self.eval_results - - def evaluate(self, outputs, savepath=None): - """Evaluate PCKh for MPII dataset. refer to - https://github.com/leoxiaobin/deep-high-resolution-net.pytorch - Copyright (c) Microsoft, under the MIT License. - - Args: - outputs(list(preds, boxes)): - - * preds (np.ndarray[N,K,3]): The first two dimensions are - coordinates, score is the third dimension of the array. 
- * boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] - , scale[1],area, score] - - Returns: - dict: PCKh for each joint - """ - - kpts = [] - for output in outputs: - preds = output['preds'] - batch_size = preds.shape[0] - for i in range(batch_size): - kpts.append({'keypoints': preds[i]}) - - preds = np.stack([kpt['keypoints'] for kpt in kpts]) - - # convert 0-based index to 1-based index, - # and get the first two dimensions. - preds = preds[..., :2] + 1.0 - - if savepath is not None: - pred_file = os.path.join(savepath, 'pred.mat') - savemat(pred_file, mdict={'preds': preds}) - - SC_BIAS = 0.6 - threshold = 0.5 - - gt_file = os.path.join( - os.path.dirname(self.ann_file), 'mpii_gt_val.mat') - gt_dict = loadmat(gt_file) - dataset_joints = gt_dict['dataset_joints'] - jnt_missing = gt_dict['jnt_missing'] - pos_gt_src = gt_dict['pos_gt_src'] - headboxes_src = gt_dict['headboxes_src'] - - pos_pred_src = np.transpose(preds, [1, 2, 0]) - - head = np.where(dataset_joints == 'head')[1][0] - lsho = np.where(dataset_joints == 'lsho')[1][0] - lelb = np.where(dataset_joints == 'lelb')[1][0] - lwri = np.where(dataset_joints == 'lwri')[1][0] - lhip = np.where(dataset_joints == 'lhip')[1][0] - lkne = np.where(dataset_joints == 'lkne')[1][0] - lank = np.where(dataset_joints == 'lank')[1][0] - - rsho = np.where(dataset_joints == 'rsho')[1][0] - relb = np.where(dataset_joints == 'relb')[1][0] - rwri = np.where(dataset_joints == 'rwri')[1][0] - rkne = np.where(dataset_joints == 'rkne')[1][0] - rank = np.where(dataset_joints == 'rank')[1][0] - rhip = np.where(dataset_joints == 'rhip')[1][0] - - jnt_visible = 1 - jnt_missing - uv_error = pos_pred_src - pos_gt_src - uv_err = np.linalg.norm(uv_error, axis=1) - headsizes = headboxes_src[1, :, :] - headboxes_src[0, :, :] - headsizes = np.linalg.norm(headsizes, axis=0) - headsizes *= SC_BIAS - scale = headsizes * np.ones((len(uv_err), 1), dtype=np.float32) - scaled_uv_err = uv_err / scale - scaled_uv_err = scaled_uv_err * jnt_visible - jnt_count = np.sum(jnt_visible, axis=1) - less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible - PCKh = 100. * np.sum(less_than_threshold, axis=1) / jnt_count - - # save - rng = np.arange(0, 0.5 + 0.01, 0.01) - pckAll = np.zeros((len(rng), 16), dtype=np.float32) - - for r, threshold in enumerate(rng): - less_than_threshold = (scaled_uv_err <= threshold) * jnt_visible - pckAll[r, :] = 100. 
* np.sum(less_than_threshold, - axis=1) / jnt_count - - PCKh = np.ma.array(PCKh, mask=False) - PCKh.mask[6:8] = True - - jnt_count = np.ma.array(jnt_count, mask=False) - jnt_count.mask[6:8] = True - jnt_ratio = jnt_count / np.sum(jnt_count).astype(np.float64) - - name_value = [ #noqa - ('Head', PCKh[head]), - ('Shoulder', 0.5 * (PCKh[lsho] + PCKh[rsho])), - ('Elbow', 0.5 * (PCKh[lelb] + PCKh[relb])), - ('Wrist', 0.5 * (PCKh[lwri] + PCKh[rwri])), - ('Hip', 0.5 * (PCKh[lhip] + PCKh[rhip])), - ('Knee', 0.5 * (PCKh[lkne] + PCKh[rkne])), - ('Ankle', 0.5 * (PCKh[lank] + PCKh[rank])), - ('PCKh', np.sum(PCKh * jnt_ratio)), - ('PCKh@0.1', np.sum(pckAll[11, :] * jnt_ratio)) - ] - name_value = OrderedDict(name_value) - - return name_value - - def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): - """sort kpts and remove the repeated ones.""" - kpts = sorted(kpts, key=lambda x: x[key]) - num = len(kpts) - for i in range(num - 1, 0, -1): - if kpts[i][key] == kpts[i - 1][key]: - del kpts[i] - - return kpts diff --git a/pdfdet/models/Paddle/ppdet/metrics/map_utils.py b/pdfdet/models/Paddle/ppdet/metrics/map_utils.py deleted file mode 100644 index 57f12d9..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/map_utils.py +++ /dev/null @@ -1,436 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import sys -import numpy as np -import itertools -import paddle -from ppdet.modeling.rbox_utils import poly2rbox_np - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'draw_pr_curve', - 'bbox_area', - 'jaccard_overlap', - 'prune_zero_padding', - 'DetectionMAP', - 'ap_per_class', - 'compute_ap', -] - - -def draw_pr_curve(precision, - recall, - iou=0.5, - out_dir='pr_curve', - file_name='precision_recall_curve.jpg'): - if not os.path.exists(out_dir): - os.makedirs(out_dir) - output_path = os.path.join(out_dir, file_name) - try: - import matplotlib.pyplot as plt - except Exception as e: - logger.error('Matplotlib not found, plaese install matplotlib.' - 'for example: `pip install matplotlib`.') - raise e - plt.cla() - plt.figure('P-R Curve') - plt.title('Precision/Recall Curve(IoU={})'.format(iou)) - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.grid(True) - plt.plot(recall, precision) - plt.savefig(output_path) - - -def bbox_area(bbox, is_bbox_normalized): - """ - Calculate area of a bounding box - """ - norm = 1. - float(is_bbox_normalized) - width = bbox[2] - bbox[0] + norm - height = bbox[3] - bbox[1] + norm - return width * height - - -def jaccard_overlap(pred, gt, is_bbox_normalized=False): - """ - Calculate jaccard overlap ratio between two bounding box - """ - if pred[0] >= gt[2] or pred[2] <= gt[0] or \ - pred[1] >= gt[3] or pred[3] <= gt[1]: - return 0. 
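The MPII PCKh evaluation above normalizes each joint error by 0.6 times the head-box diagonal and counts errors at or below the threshold as correct; a vectorized sketch, assuming predictions and ground truth as [N, K, 2] arrays and a boolean [N, K] visibility mask (the deleted code stores the transposed [K, N] layout):

    import numpy as np

    def pckh(pred, gt, head_sizes, visible, thr=0.5, sc_bias=0.6):
        # per-joint PCKh in percent, mirroring the deleted evaluate()
        norm = head_sizes * sc_bias                 # [N] head-box diagonals
        err = np.linalg.norm(pred - gt, axis=2)     # [N, K] pixel errors
        scaled = err / norm[:, None]
        correct = (scaled <= thr) & visible
        return 100.0 * correct.sum(axis=0) / visible.sum(axis=0)

    # usage: pckh(pred, gt, head_sizes, visible)[j] -> percent correct, joint j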
- inter_xmin = max(pred[0], gt[0]) - inter_ymin = max(pred[1], gt[1]) - inter_xmax = min(pred[2], gt[2]) - inter_ymax = min(pred[3], gt[3]) - inter_size = bbox_area([inter_xmin, inter_ymin, inter_xmax, inter_ymax], - is_bbox_normalized) - pred_size = bbox_area(pred, is_bbox_normalized) - gt_size = bbox_area(gt, is_bbox_normalized) - overlap = float(inter_size) / (pred_size + gt_size - inter_size) - return overlap - - -def calc_rbox_iou(pred, gt_poly): - """ - calc iou between rotated bbox - """ - # calc iou of bounding box for speedup - pred = np.array(pred, np.float32).reshape(-1, 2) - gt_poly = np.array(gt_poly, np.float32).reshape(-1, 2) - pred_rect = [ - np.min(pred[:, 0]), np.min(pred[:, 1]), np.max(pred[:, 0]), - np.max(pred[:, 1]) - ] - gt_rect = [ - np.min(gt_poly[:, 0]), np.min(gt_poly[:, 1]), np.max(gt_poly[:, 0]), - np.max(gt_poly[:, 1]) - ] - iou = jaccard_overlap(pred_rect, gt_rect, False) - - if iou <= 0: - return iou - - # calc rbox iou - pred_rbox = poly2rbox_np(pred.reshape(-1, 8)).reshape(-1, 5) - gt_rbox = poly2rbox_np(gt_poly.reshape(-1, 8)).reshape(-1, 5) - try: - from ext_op import rbox_iou - except Exception as e: - print("import custom_ops error, try install ext_op " \ - "following ppdet/ext_op/README.md", e) - sys.stdout.flush() - sys.exit(-1) - pd_gt_rbox = paddle.to_tensor(gt_rbox, dtype='float32') - pd_pred_rbox = paddle.to_tensor(pred_rbox, dtype='float32') - iou = rbox_iou(pd_gt_rbox, pd_pred_rbox) - iou = iou.numpy() - return iou[0][0] - - -def prune_zero_padding(gt_box, gt_label, difficult=None): - valid_cnt = 0 - for i in range(len(gt_box)): - if (gt_box[i] == 0).all(): - break - valid_cnt += 1 - return (gt_box[:valid_cnt], gt_label[:valid_cnt], difficult[:valid_cnt] - if difficult is not None else None) - - -class DetectionMAP(object): - """ - Calculate detection mean average precision. - Currently support two types: 11point and integral - - Args: - class_num (int): The class number. - overlap_thresh (float): The threshold of overlap - ratio between prediction bounding box and - ground truth bounding box for deciding - true/false positive. Default 0.5. - map_type (str): Calculation method of mean average - precision, currently support '11point' and - 'integral'. Default '11point'. - is_bbox_normalized (bool): Whether bounding boxes - is normalized to range[0, 1]. Default False. - evaluate_difficult (bool): Whether to evaluate - difficult bounding boxes. Default False. - catid2name (dict): Mapping between category id and category name. - classwise (bool): Whether per-category AP and draw - P-R Curve or not. - """ - - def __init__(self, - class_num, - overlap_thresh=0.5, - map_type='11point', - is_bbox_normalized=False, - evaluate_difficult=False, - catid2name=None, - classwise=False): - self.class_num = class_num - self.overlap_thresh = overlap_thresh - assert map_type in ['11point', 'integral'], \ - "map_type currently only support '11point' "\ - "and 'integral'" - self.map_type = map_type - self.is_bbox_normalized = is_bbox_normalized - self.evaluate_difficult = evaluate_difficult - self.classwise = classwise - self.classes = [] - for cname in catid2name.values(): - self.classes.append(cname) - self.reset() - - def update(self, bbox, score, label, gt_box, gt_label, difficult=None): - """ - Update metric statics from given prediction and ground - truth infomations. 
- """ - if difficult is None: - difficult = np.zeros_like(gt_label) - - # record class gt count - for gtl, diff in zip(gt_label, difficult): - if self.evaluate_difficult or int(diff) == 0: - self.class_gt_counts[int(np.array(gtl))] += 1 - - # record class score positive - visited = [False] * len(gt_label) - for b, s, l in zip(bbox, score, label): - pred = b.tolist() if isinstance(b, np.ndarray) else b - max_idx = -1 - max_overlap = -1.0 - for i, gl in enumerate(gt_label): - if int(gl) == int(l): - if len(gt_box[i]) == 8: - overlap = calc_rbox_iou(pred, gt_box[i]) - else: - overlap = jaccard_overlap(pred, gt_box[i], - self.is_bbox_normalized) - if overlap > max_overlap: - max_overlap = overlap - max_idx = i - - if max_overlap > self.overlap_thresh: - if self.evaluate_difficult or \ - int(np.array(difficult[max_idx])) == 0: - if not visited[max_idx]: - self.class_score_poss[int(l)].append([s, 1.0]) - visited[max_idx] = True - else: - self.class_score_poss[int(l)].append([s, 0.0]) - else: - self.class_score_poss[int(l)].append([s, 0.0]) - - def reset(self): - """ - Reset metric statics - """ - self.class_score_poss = [[] for _ in range(self.class_num)] - self.class_gt_counts = [0] * self.class_num - self.mAP = 0.0 - - def accumulate(self): - """ - Accumulate metric results and calculate mAP - """ - mAP = 0. - valid_cnt = 0 - eval_results = [] - for score_pos, count in zip(self.class_score_poss, - self.class_gt_counts): - if count == 0: continue - if len(score_pos) == 0: - valid_cnt += 1 - continue - - accum_tp_list, accum_fp_list = \ - self._get_tp_fp_accum(score_pos) - precision = [] - recall = [] - for ac_tp, ac_fp in zip(accum_tp_list, accum_fp_list): - precision.append(float(ac_tp) / (ac_tp + ac_fp)) - recall.append(float(ac_tp) / count) - - one_class_ap = 0.0 - if self.map_type == '11point': - max_precisions = [0.] * 11 - start_idx = len(precision) - 1 - for j in range(10, -1, -1): - for i in range(start_idx, -1, -1): - if recall[i] < float(j) / 10.: - start_idx = i - if j > 0: - max_precisions[j - 1] = max_precisions[j] - break - else: - if max_precisions[j] < precision[i]: - max_precisions[j] = precision[i] - one_class_ap = sum(max_precisions) / 11. - mAP += one_class_ap - valid_cnt += 1 - elif self.map_type == 'integral': - import math - prev_recall = 0. - for i in range(len(precision)): - recall_gap = math.fabs(recall[i] - prev_recall) - if recall_gap > 1e-6: - one_class_ap += precision[i] * recall_gap - prev_recall = recall[i] - mAP += one_class_ap - valid_cnt += 1 - else: - logger.error("Unspported mAP type {}".format(self.map_type)) - sys.exit(1) - eval_results.append({ - 'class': self.classes[valid_cnt - 1], - 'ap': one_class_ap, - 'precision': precision, - 'recall': recall, - }) - self.eval_results = eval_results - self.mAP = mAP / float(valid_cnt) if valid_cnt > 0 else mAP - - def get_map(self): - """ - Get mAP result - """ - if self.mAP is None: - logger.error("mAP is not calculated.") - if self.classwise: - # Compute per-category AP and PR curve - try: - from terminaltables import AsciiTable - except Exception as e: - logger.error( - 'terminaltables not found, plaese install terminaltables. 
' - 'for example: `pip install terminaltables`.') - raise e - results_per_category = [] - for eval_result in self.eval_results: - results_per_category.append( - (str(eval_result['class']), - '{:0.3f}'.format(float(eval_result['ap'])))) - draw_pr_curve( - eval_result['precision'], - eval_result['recall'], - out_dir='voc_pr_curve', - file_name='{}_precision_recall_curve.jpg'.format( - eval_result['class'])) - - num_columns = min(6, len(results_per_category) * 2) - results_flatten = list(itertools.chain(*results_per_category)) - headers = ['category', 'AP'] * (num_columns // 2) - results_2d = itertools.zip_longest(* [ - results_flatten[i::num_columns] for i in range(num_columns) - ]) - table_data = [headers] - table_data += [result for result in results_2d] - table = AsciiTable(table_data) - logger.info('Per-category of VOC AP: \n{}'.format(table.table)) - logger.info( - "per-category PR curve has output to voc_pr_curve folder.") - return self.mAP - - def _get_tp_fp_accum(self, score_pos_list): - """ - Calculate accumulating true/false positive results from - [score, pos] records - """ - sorted_list = sorted(score_pos_list, key=lambda s: s[0], reverse=True) - accum_tp = 0 - accum_fp = 0 - accum_tp_list = [] - accum_fp_list = [] - for (score, pos) in sorted_list: - accum_tp += int(pos) - accum_tp_list.append(accum_tp) - accum_fp += 1 - int(pos) - accum_fp_list.append(accum_fp) - return accum_tp_list, accum_fp_list - - -def ap_per_class(tp, conf, pred_cls, target_cls): - """ - Computes the average precision, given the recall and precision curves. - Method originally from https://github.com/rafaelpadilla/Object-Detection-Metrics. - - Args: - tp (list): True positives. - conf (list): Objectness value from 0-1. - pred_cls (list): Predicted object classes. - target_cls (list): Target object classes. - """ - tp, conf, pred_cls, target_cls = np.array(tp), np.array(conf), np.array( - pred_cls), np.array(target_cls) - - # Sort by objectness - i = np.argsort(-conf) - tp, conf, pred_cls = tp[i], conf[i], pred_cls[i] - - # Find unique classes - unique_classes = np.unique(np.concatenate((pred_cls, target_cls), 0)) - - # Create Precision-Recall curve and compute AP for each class - ap, p, r = [], [], [] - for c in unique_classes: - i = pred_cls == c - n_gt = sum(target_cls == c) # Number of ground truth objects - n_p = sum(i) # Number of predicted objects - - if (n_p == 0) and (n_gt == 0): - continue - elif (n_p == 0) or (n_gt == 0): - ap.append(0) - r.append(0) - p.append(0) - else: - # Accumulate FPs and TPs - fpc = np.cumsum(1 - tp[i]) - tpc = np.cumsum(tp[i]) - - # Recall - recall_curve = tpc / (n_gt + 1e-16) - r.append(tpc[-1] / (n_gt + 1e-16)) - - # Precision - precision_curve = tpc / (tpc + fpc) - p.append(tpc[-1] / (tpc[-1] + fpc[-1])) - - # AP from recall-precision curve - ap.append(compute_ap(recall_curve, precision_curve)) - - return np.array(ap), unique_classes.astype('int32'), np.array(r), np.array( - p) - - -def compute_ap(recall, precision): - """ - Computes the average precision, given the recall and precision curves. - Code originally from https://github.com/rbgirshick/py-faster-rcnn. - - Args: - recall (list): The recall curve. - precision (list): The precision curve. - - Returns: - The average precision as computed in py-faster-rcnn. 
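The compute_ap body that follows is the standard "correct AP" computation: pad the curve with sentinels, take the running-maximum precision envelope, and sum precision times each recall step. A worked check under a toy curve (a sketch; voc_ap restates the same logic):

    import numpy as np

    def voc_ap(recall, precision):
        # same envelope-and-sum computation as the compute_ap below
        mrec = np.concatenate(([0.], recall, [1.]))
        mpre = np.concatenate(([0.], precision, [0.]))
        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
        idx = np.where(mrec[1:] != mrec[:-1])[0]
        return np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])

    # 0.1*1.0 + 0.3*0.8 + 0.5*0.5 + 0.1*0.0 = 0.59
    print(voc_ap([0.1, 0.4, 0.4, 0.9], [1.0, 0.8, 0.6, 0.5]))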
- """ - # correct AP calculation - # first append sentinel values at the end - mrec = np.concatenate(([0.], recall, [1.])) - mpre = np.concatenate(([0.], precision, [0.])) - - # compute the precision envelope - for i in range(mpre.size - 1, 0, -1): - mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) - - # to calculate area under PR curve, look for points - # where X axis (recall) changes value - i = np.where(mrec[1:] != mrec[:-1])[0] - - # and sum (\Delta recall) * prec - ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) - return ap diff --git a/pdfdet/models/Paddle/ppdet/metrics/mcmot_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/mcmot_metrics.py deleted file mode 100644 index bf74d32..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/mcmot_metrics.py +++ /dev/null @@ -1,470 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import copy -import sys -import math -from collections import defaultdict - -import numpy as np -import pandas as pd - -from .metrics import Metric -try: - import motmetrics as mm - from motmetrics.math_util import quiet_divide - metrics = mm.metrics.motchallenge_metrics - mh = mm.metrics.create() -except: - pass -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['MCMOTEvaluator', 'MCMOTMetric'] - -METRICS_LIST = [ - 'num_frames', 'num_matches', 'num_switches', 'num_transfer', 'num_ascend', - 'num_migrate', 'num_false_positives', 'num_misses', 'num_detections', - 'num_objects', 'num_predictions', 'num_unique_objects', 'mostly_tracked', - 'partially_tracked', 'mostly_lost', 'num_fragmentations', 'motp', 'mota', - 'precision', 'recall', 'idfp', 'idfn', 'idtp', 'idp', 'idr', 'idf1' -] - -NAME_MAP = { - 'num_frames': 'num_frames', - 'num_matches': 'num_matches', - 'num_switches': 'IDs', - 'num_transfer': 'IDt', - 'num_ascend': 'IDa', - 'num_migrate': 'IDm', - 'num_false_positives': 'FP', - 'num_misses': 'FN', - 'num_detections': 'num_detections', - 'num_objects': 'num_objects', - 'num_predictions': 'num_predictions', - 'num_unique_objects': 'GT', - 'mostly_tracked': 'MT', - 'partially_tracked': 'partially_tracked', - 'mostly_lost': 'ML', - 'num_fragmentations': 'FM', - 'motp': 'MOTP', - 'mota': 'MOTA', - 'precision': 'Prcn', - 'recall': 'Rcll', - 'idfp': 'idfp', - 'idfn': 'idfn', - 'idtp': 'idtp', - 'idp': 'IDP', - 'idr': 'IDR', - 'idf1': 'IDF1' -} - - -def parse_accs_metrics(seq_acc, index_name, verbose=False): - """ - Parse the evaluation indicators of multiple MOTAccumulator - """ - mh = mm.metrics.create() - summary = MCMOTEvaluator.get_summary(seq_acc, index_name, METRICS_LIST) - summary.loc['OVERALL', 'motp'] = (summary['motp'] * summary['num_detections']).sum() / \ - summary.loc['OVERALL', 'num_detections'] - if verbose: - strsummary = mm.io.render_summary( - summary, formatters=mh.formatters, namemap=NAME_MAP) - print(strsummary) - - 
return summary - - -def seqs_overall_metrics(summary_df, verbose=False): - """ - Calculate overall metrics for multiple sequences - """ - add_col = [ - 'num_frames', 'num_matches', 'num_switches', 'num_transfer', - 'num_ascend', 'num_migrate', 'num_false_positives', 'num_misses', - 'num_detections', 'num_objects', 'num_predictions', - 'num_unique_objects', 'mostly_tracked', 'partially_tracked', - 'mostly_lost', 'num_fragmentations', 'idfp', 'idfn', 'idtp' - ] - calc_col = ['motp', 'mota', 'precision', 'recall', 'idp', 'idr', 'idf1'] - calc_df = summary_df.copy() - - overall_dic = {} - for col in add_col: - overall_dic[col] = calc_df[col].sum() - - for col in calc_col: - overall_dic[col] = getattr(MCMOTMetricOverall, col + '_overall')( - calc_df, overall_dic) - - overall_df = pd.DataFrame(overall_dic, index=['overall_calc']) - calc_df = pd.concat([calc_df, overall_df]) - - if verbose: - mh = mm.metrics.create() - str_calc_df = mm.io.render_summary( - calc_df, formatters=mh.formatters, namemap=NAME_MAP) - print(str_calc_df) - - return calc_df - - -class MCMOTMetricOverall(object): - def motp_overall(summary_df, overall_dic): - motp = quiet_divide((summary_df['motp'] * - summary_df['num_detections']).sum(), - overall_dic['num_detections']) - return motp - - def mota_overall(summary_df, overall_dic): - del summary_df - mota = 1. - quiet_divide( - (overall_dic['num_misses'] + overall_dic['num_switches'] + - overall_dic['num_false_positives']), overall_dic['num_objects']) - return mota - - def precision_overall(summary_df, overall_dic): - del summary_df - precision = quiet_divide(overall_dic['num_detections'], ( - overall_dic['num_false_positives'] + overall_dic['num_detections'])) - return precision - - def recall_overall(summary_df, overall_dic): - del summary_df - recall = quiet_divide(overall_dic['num_detections'], - overall_dic['num_objects']) - return recall - - def idp_overall(summary_df, overall_dic): - del summary_df - idp = quiet_divide(overall_dic['idtp'], - (overall_dic['idtp'] + overall_dic['idfp'])) - return idp - - def idr_overall(summary_df, overall_dic): - del summary_df - idr = quiet_divide(overall_dic['idtp'], - (overall_dic['idtp'] + overall_dic['idfn'])) - return idr - - def idf1_overall(summary_df, overall_dic): - del summary_df - idf1 = quiet_divide(2. 
* overall_dic['idtp'], ( - overall_dic['num_objects'] + overall_dic['num_predictions'])) - return idf1 - - -def read_mcmot_results_union(filename, is_gt, is_ignore): - results_dict = dict() - if os.path.isfile(filename): - all_result = np.loadtxt(filename, delimiter=',') - if all_result.shape[0] == 0 or all_result.shape[1] < 7: - return results_dict - if is_ignore: - return results_dict - if is_gt: - # only for test use - all_result = all_result[all_result[:, 7] != 0] - all_result[:, 7] = all_result[:, 7] - 1 - - if all_result.shape[0] == 0: - return results_dict - - class_unique = np.unique(all_result[:, 7]) - - last_max_id = 0 - result_cls_list = [] - for cls in class_unique: - result_cls_split = all_result[all_result[:, 7] == cls] - result_cls_split[:, 1] = result_cls_split[:, 1] + last_max_id - # make sure track id different between every category - last_max_id = max(np.unique(result_cls_split[:, 1])) + 1 - result_cls_list.append(result_cls_split) - - results_con = np.concatenate(result_cls_list) - - for line in range(len(results_con)): - linelist = results_con[line] - fid = int(linelist[0]) - if fid < 1: - continue - results_dict.setdefault(fid, list()) - - if is_gt: - score = 1 - else: - score = float(linelist[6]) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - cls = int(linelist[7]) - - results_dict[fid].append((tlwh, target_id, cls, score)) - - return results_dict - - -def read_mcmot_results(filename, is_gt, is_ignore): - results_dict = dict() - if os.path.isfile(filename): - with open(filename, 'r') as f: - for line in f.readlines(): - linelist = line.strip().split(',') - if len(linelist) < 7: - continue - fid = int(linelist[0]) - if fid < 1: - continue - cid = int(linelist[7]) - if is_gt: - score = 1 - # only for test use - cid -= 1 - else: - score = float(linelist[6]) - - cls_result_dict = results_dict.setdefault(cid, dict()) - cls_result_dict.setdefault(fid, list()) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - cls_result_dict[fid].append((tlwh, target_id, score)) - return results_dict - - -def read_results(filename, - data_type, - is_gt=False, - is_ignore=False, - multi_class=False, - union=False): - if data_type in ['mcmot', 'lab']: - if multi_class: - if union: - # The results are evaluated by union all the categories. - # Track IDs between different categories cannot be duplicate. - read_fun = read_mcmot_results_union - else: - # The results are evaluated separately by category. - read_fun = read_mcmot_results - else: - raise ValueError('multi_class: {}, MCMOT should have cls_id.'. 
- format(multi_class)) - else: - raise ValueError('Unknown data type: {}'.format(data_type)) - - return read_fun(filename, is_gt, is_ignore) - - -def unzip_objs(objs): - if len(objs) > 0: - tlwhs, ids, scores = zip(*objs) - else: - tlwhs, ids, scores = [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - return tlwhs, ids, scores - - -def unzip_objs_cls(objs): - if len(objs) > 0: - tlwhs, ids, cls, scores = zip(*objs) - else: - tlwhs, ids, cls, scores = [], [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - ids = np.array(ids) - cls = np.array(cls) - scores = np.array(scores) - return tlwhs, ids, cls, scores - - -class MCMOTEvaluator(object): - def __init__(self, data_root, seq_name, data_type, num_classes): - self.data_root = data_root - self.seq_name = seq_name - self.data_type = data_type - self.num_classes = num_classes - - self.load_annotations() - try: - import motmetrics as mm - mm.lap.default_solver = 'lap' - except Exception as e: - raise RuntimeError( - 'Unable to use MCMOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' - ) - self.reset_accumulator() - - self.class_accs = [] - - def load_annotations(self): - assert self.data_type == 'mcmot' - self.gt_filename = os.path.join(self.data_root, '../', 'sequences', - '{}.txt'.format(self.seq_name)) - if not os.path.exists(self.gt_filename): - logger.warning( - "gt_filename '{}' of MCMOTEvaluator is not exist, so the MOTA will be -INF." - ) - - def reset_accumulator(self): - self.acc = mm.MOTAccumulator(auto_id=True) - - def eval_frame_dict(self, trk_objs, gt_objs, rtn_events=False, union=False): - if union: - trk_tlwhs, trk_ids, trk_cls = unzip_objs_cls(trk_objs)[:3] - gt_tlwhs, gt_ids, gt_cls = unzip_objs_cls(gt_objs)[:3] - - # get distance matrix - iou_distance = mm.distances.iou_matrix( - gt_tlwhs, trk_tlwhs, max_iou=0.5) - - # Set the distance between objects of different categories to nan - gt_cls_len = len(gt_cls) - trk_cls_len = len(trk_cls) - # When the number of GT or Trk is 0, iou_distance dimension is (0,0) - if gt_cls_len != 0 and trk_cls_len != 0: - gt_cls = gt_cls.reshape(gt_cls_len, 1) - gt_cls = np.repeat(gt_cls, trk_cls_len, axis=1) - trk_cls = trk_cls.reshape(1, trk_cls_len) - trk_cls = np.repeat(trk_cls, gt_cls_len, axis=0) - iou_distance = np.where(gt_cls == trk_cls, iou_distance, np.nan) - - else: - trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] - gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] - - # get distance matrix - iou_distance = mm.distances.iou_matrix( - gt_tlwhs, trk_tlwhs, max_iou=0.5) - - self.acc.update(gt_ids, trk_ids, iou_distance) - - if rtn_events and iou_distance.size > 0 and hasattr(self.acc, - 'mot_events'): - events = self.acc.mot_events # only supported by https://github.com/longcw/py-motmetrics - else: - events = None - return events - - def eval_file(self, result_filename): - # evaluation of each category - gt_frame_dict = read_results( - self.gt_filename, - self.data_type, - is_gt=True, - multi_class=True, - union=False) - result_frame_dict = read_results( - result_filename, - self.data_type, - is_gt=False, - multi_class=True, - union=False) - - for cid in range(self.num_classes): - self.reset_accumulator() - cls_result_frame_dict = result_frame_dict.setdefault(cid, dict()) - cls_gt_frame_dict = gt_frame_dict.setdefault(cid, dict()) - - # only labeled frames will be evaluated - frames = sorted(list(set(cls_gt_frame_dict.keys()))) - - for frame_id in frames: - trk_objs = 
cls_result_frame_dict.get(frame_id, []) - gt_objs = cls_gt_frame_dict.get(frame_id, []) - self.eval_frame_dict(trk_objs, gt_objs, rtn_events=False) - - self.class_accs.append(self.acc) - - return self.class_accs - - @staticmethod - def get_summary(accs, - names, - metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', - 'precision', 'recall')): - names = copy.deepcopy(names) - if metrics is None: - metrics = mm.metrics.motchallenge_metrics - metrics = copy.deepcopy(metrics) - - mh = mm.metrics.create() - summary = mh.compute_many( - accs, metrics=metrics, names=names, generate_overall=True) - - return summary - - @staticmethod - def save_summary(summary, filename): - import pandas as pd - writer = pd.ExcelWriter(filename) - summary.to_excel(writer) - writer.save() - - -class MCMOTMetric(Metric): - def __init__(self, num_classes, save_summary=False): - self.num_classes = num_classes - self.save_summary = save_summary - self.MCMOTEvaluator = MCMOTEvaluator - self.result_root = None - self.reset() - - self.seqs_overall = defaultdict(list) - - def reset(self): - self.accs = [] - self.seqs = [] - - def update(self, data_root, seq, data_type, result_root, result_filename): - evaluator = self.MCMOTEvaluator(data_root, seq, data_type, - self.num_classes) - seq_acc = evaluator.eval_file(result_filename) - self.accs.append(seq_acc) - self.seqs.append(seq) - self.result_root = result_root - - cls_index_name = [ - '{}_{}'.format(seq, i) for i in range(self.num_classes) - ] - summary = parse_accs_metrics(seq_acc, cls_index_name) - summary.rename( - index={'OVERALL': '{}_OVERALL'.format(seq)}, inplace=True) - for row in range(len(summary)): - self.seqs_overall[row].append(summary.iloc[row:row + 1]) - - def accumulate(self): - self.cls_summary_list = [] - for row in range(self.num_classes): - seqs_cls_df = pd.concat(self.seqs_overall[row]) - seqs_cls_summary = seqs_overall_metrics(seqs_cls_df) - cls_summary_overall = seqs_cls_summary.iloc[-1:].copy() - cls_summary_overall.rename( - index={'overall_calc': 'overall_calc_{}'.format(row)}, - inplace=True) - self.cls_summary_list.append(cls_summary_overall) - - def log(self): - seqs_summary = seqs_overall_metrics( - pd.concat(self.seqs_overall[self.num_classes]), verbose=True) - class_summary = seqs_overall_metrics( - pd.concat(self.cls_summary_list), verbose=True) - - def get_results(self): - return 1 diff --git a/pdfdet/models/Paddle/ppdet/metrics/metrics.py b/pdfdet/models/Paddle/ppdet/metrics/metrics.py deleted file mode 100644 index b473509..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/metrics.py +++ /dev/null @@ -1,505 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
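The MCMOT evaluator deleted above drives everything through the motmetrics accumulator API (the same `pip install motmetrics` dependency it checks for). A minimal sketch of that flow with a single fabricated frame:

import numpy as np
import motmetrics as mm

acc = mm.MOTAccumulator(auto_id=True)
# one frame: gt ids [1, 2], tracker ids ['a', 'b'], pairwise IoU distances
acc.update([1, 2], ['a', 'b'], [[0.1, np.nan], [np.nan, 0.3]])
mh = mm.metrics.create()
summary = mh.compute_many([acc], metrics=['mota', 'idf1'],
                          names=['seq0'], generate_overall=True)
print(summary)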
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import sys -import json -import paddle -import numpy as np -import typing -from collections import defaultdict -from pathlib import Path - -from .map_utils import prune_zero_padding, DetectionMAP -from .coco_utils import get_infer_results, cocoapi_eval -from .widerface_utils import face_eval_run -from ppdet.data.source.category import get_categories -from ppdet.modeling.rbox_utils import poly2rbox_np - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'Metric', 'COCOMetric', 'VOCMetric', 'WiderFaceMetric', 'get_infer_results', - 'RBoxMetric', 'SNIPERCOCOMetric' -] - -COCO_SIGMAS = np.array([ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, - .89, .89 -]) / 10.0 -CROWD_SIGMAS = np.array( - [.79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, .79, - .79]) / 10.0 - - -class Metric(paddle.metric.Metric): - def name(self): - return self.__class__.__name__ - - def reset(self): - pass - - def accumulate(self): - pass - - # paddle.metric.Metric defined :metch:`update`, :meth:`accumulate` - # :metch:`reset`, in ppdet, we also need following 2 methods: - - # abstract method for logging metric results - def log(self): - pass - - # abstract method for getting metric results - def get_results(self): - pass - - -class COCOMetric(Metric): - def __init__(self, anno_file, **kwargs): - self.anno_file = anno_file - self.clsid2catid = kwargs.get('clsid2catid', None) - if self.clsid2catid is None: - self.clsid2catid, _ = get_categories('COCO', anno_file) - self.classwise = kwargs.get('classwise', False) - self.output_eval = kwargs.get('output_eval', None) - # TODO: bias should be unified - self.bias = kwargs.get('bias', 0) - self.save_prediction_only = kwargs.get('save_prediction_only', False) - self.iou_type = kwargs.get('IouType', 'bbox') - - if not self.save_prediction_only: - assert os.path.isfile(anno_file), \ - "anno_file {} not a file".format(anno_file) - - if self.output_eval is not None: - Path(self.output_eval).mkdir(exist_ok=True) - - self.reset() - - def reset(self): - # only bbox and mask evaluation support currently - self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} - self.eval_results = {} - - def update(self, inputs, outputs): - outs = {} - # outputs Tensor -> numpy.ndarray - for k, v in outputs.items(): - outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v - - # multi-scale inputs: all inputs have same im_id - if isinstance(inputs, typing.Sequence): - im_id = inputs[0]['im_id'] - else: - im_id = inputs['im_id'] - outs['im_id'] = im_id.numpy() if isinstance(im_id, - paddle.Tensor) else im_id - - infer_results = get_infer_results( - outs, self.clsid2catid, bias=self.bias) - self.results['bbox'] += infer_results[ - 'bbox'] if 'bbox' in infer_results else [] - self.results['mask'] += infer_results[ - 'mask'] if 'mask' in infer_results else [] - self.results['segm'] += infer_results[ - 'segm'] if 'segm' in infer_results else [] - self.results['keypoint'] += infer_results[ - 'keypoint'] if 'keypoint' in infer_results else [] - - def accumulate(self): - if len(self.results['bbox']) > 0: - output = "bbox.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['bbox'], f) - logger.info('The bbox result is saved to bbox.json.') - - if self.save_prediction_only: - logger.info('The bbox result 
is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - bbox_stats = cocoapi_eval( - output, - 'bbox', - anno_file=self.anno_file, - classwise=self.classwise) - self.eval_results['bbox'] = bbox_stats - sys.stdout.flush() - - if len(self.results['mask']) > 0: - output = "mask.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['mask'], f) - logger.info('The mask result is saved to mask.json.') - - if self.save_prediction_only: - logger.info('The mask result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - seg_stats = cocoapi_eval( - output, - 'segm', - anno_file=self.anno_file, - classwise=self.classwise) - self.eval_results['mask'] = seg_stats - sys.stdout.flush() - - if len(self.results['segm']) > 0: - output = "segm.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['segm'], f) - logger.info('The segm result is saved to segm.json.') - - if self.save_prediction_only: - logger.info('The segm result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - seg_stats = cocoapi_eval( - output, - 'segm', - anno_file=self.anno_file, - classwise=self.classwise) - self.eval_results['mask'] = seg_stats - sys.stdout.flush() - - if len(self.results['keypoint']) > 0: - output = "keypoint.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results['keypoint'], f) - logger.info('The keypoint result is saved to keypoint.json.') - - if self.save_prediction_only: - logger.info('The keypoint result is saved to {} and do not ' - 'evaluate the mAP.'.format(output)) - else: - style = 'keypoints' - use_area = True - sigmas = COCO_SIGMAS - if self.iou_type == 'keypoints_crowd': - style = 'keypoints_crowd' - use_area = False - sigmas = CROWD_SIGMAS - keypoint_stats = cocoapi_eval( - output, - style, - anno_file=self.anno_file, - classwise=self.classwise, - sigmas=sigmas, - use_area=use_area) - self.eval_results['keypoint'] = keypoint_stats - sys.stdout.flush() - - def log(self): - pass - - def get_results(self): - return self.eval_results - - -class VOCMetric(Metric): - def __init__(self, - label_list, - class_num=20, - overlap_thresh=0.5, - map_type='11point', - is_bbox_normalized=False, - evaluate_difficult=False, - classwise=False, - output_eval=None, - save_prediction_only=False): - assert os.path.isfile(label_list), \ - "label_list {} not a file".format(label_list) - self.clsid2catid, self.catid2name = get_categories('VOC', label_list) - - self.overlap_thresh = overlap_thresh - self.map_type = map_type - self.evaluate_difficult = evaluate_difficult - self.output_eval = output_eval - self.save_prediction_only = save_prediction_only - self.detection_map = DetectionMAP( - class_num=class_num, - overlap_thresh=overlap_thresh, - map_type=map_type, - is_bbox_normalized=is_bbox_normalized, - evaluate_difficult=evaluate_difficult, - catid2name=self.catid2name, - classwise=classwise) - - self.reset() - - def reset(self): - self.results = {'bbox': [], 'score': [], 'label': []} - self.detection_map.reset() - - def update(self, inputs, outputs): - bbox_np = outputs['bbox'].numpy() if isinstance( - outputs['bbox'], paddle.Tensor) else outputs['bbox'] - bboxes = bbox_np[:, 2:] - scores = bbox_np[:, 1] - labels = bbox_np[:, 0] - bbox_lengths = outputs['bbox_num'].numpy() if isinstance( - outputs['bbox_num'], 
paddle.Tensor) else outputs['bbox_num'] - - self.results['bbox'].append(bboxes.tolist()) - self.results['score'].append(scores.tolist()) - self.results['label'].append(labels.tolist()) - - if bboxes is None or bboxes.shape == (1, 1): - return - if self.save_prediction_only: - return - - gt_boxes = inputs['gt_bbox'] - gt_labels = inputs['gt_class'] - difficults = inputs['difficult'] if not self.evaluate_difficult \ - else None - - if 'scale_factor' in inputs: - scale_factor = inputs['scale_factor'].numpy() if isinstance( - inputs['scale_factor'], - paddle.Tensor) else inputs['scale_factor'] - else: - scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') - - bbox_idx = 0 - for i in range(len(gt_boxes)): - gt_box = gt_boxes[i].numpy() if isinstance( - gt_boxes[i], paddle.Tensor) else gt_boxes[i] - h, w = scale_factor[i] - gt_box = gt_box / np.array([w, h, w, h]) - gt_label = gt_labels[i].numpy() if isinstance( - gt_labels[i], paddle.Tensor) else gt_labels[i] - if difficults is not None: - difficult = difficults[i].numpy() if isinstance( - difficults[i], paddle.Tensor) else difficults[i] - else: - difficult = None - bbox_num = bbox_lengths[i] - bbox = bboxes[bbox_idx:bbox_idx + bbox_num] - score = scores[bbox_idx:bbox_idx + bbox_num] - label = labels[bbox_idx:bbox_idx + bbox_num] - gt_box, gt_label, difficult = prune_zero_padding(gt_box, gt_label, - difficult) - self.detection_map.update(bbox, score, label, gt_box, gt_label, - difficult) - bbox_idx += bbox_num - - def accumulate(self): - output = "bbox.json" - if self.output_eval: - output = os.path.join(self.output_eval, output) - with open(output, 'w') as f: - json.dump(self.results, f) - logger.info('The bbox result is saved to bbox.json.') - if self.save_prediction_only: - return - - logger.info("Accumulating evaluation results...") - self.detection_map.accumulate() - - def log(self): - map_stat = 100.
* self.detection_map.get_map() - logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, - self.map_type, map_stat)) - - def get_results(self): - return {'bbox': [self.detection_map.get_map()]} - - -class WiderFaceMetric(Metric): - def __init__(self, image_dir, anno_file, multi_scale=True): - self.image_dir = image_dir - self.anno_file = anno_file - self.multi_scale = multi_scale - self.clsid2catid, self.catid2name = get_categories('widerface') - - def update(self, model): - - face_eval_run( - model, - self.image_dir, - self.anno_file, - pred_dir='output/pred', - eval_mode='widerface', - multi_scale=self.multi_scale) - - -class RBoxMetric(Metric): - def __init__(self, anno_file, **kwargs): - self.anno_file = anno_file - self.clsid2catid, self.catid2name = get_categories('RBOX', anno_file) - self.catid2clsid = {v: k for k, v in self.clsid2catid.items()} - self.classwise = kwargs.get('classwise', False) - self.output_eval = kwargs.get('output_eval', None) - self.save_prediction_only = kwargs.get('save_prediction_only', False) - self.overlap_thresh = kwargs.get('overlap_thresh', 0.5) - self.map_type = kwargs.get('map_type', '11point') - self.evaluate_difficult = kwargs.get('evaluate_difficult', False) - self.imid2path = kwargs.get('imid2path', None) - class_num = len(self.catid2name) - self.detection_map = DetectionMAP( - class_num=class_num, - overlap_thresh=self.overlap_thresh, - map_type=self.map_type, - is_bbox_normalized=False, - evaluate_difficult=self.evaluate_difficult, - catid2name=self.catid2name, - classwise=self.classwise) - - self.reset() - - def reset(self): - self.results = [] - self.detection_map.reset() - - def update(self, inputs, outputs): - outs = {} - # outputs Tensor -> numpy.ndarray - for k, v in outputs.items(): - outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v - - im_id = inputs['im_id'] - im_id = im_id.numpy() if isinstance(im_id, paddle.Tensor) else im_id - outs['im_id'] = im_id - - infer_results = get_infer_results(outs, self.clsid2catid) - infer_results = infer_results['bbox'] if 'bbox' in infer_results else [] - self.results += infer_results - if self.save_prediction_only: - return - - gt_boxes = inputs['gt_poly'] - gt_labels = inputs['gt_class'] - - if 'scale_factor' in inputs: - scale_factor = inputs['scale_factor'].numpy() if isinstance( - inputs['scale_factor'], - paddle.Tensor) else inputs['scale_factor'] - else: - scale_factor = np.ones((gt_boxes.shape[0], 2)).astype('float32') - - for i in range(len(gt_boxes)): - gt_box = gt_boxes[i].numpy() if isinstance( - gt_boxes[i], paddle.Tensor) else gt_boxes[i] - h, w = scale_factor[i] - gt_box = gt_box / np.array([w, h, w, h, w, h, w, h]) - gt_label = gt_labels[i].numpy() if isinstance( - gt_labels[i], paddle.Tensor) else gt_labels[i] - gt_box, gt_label, _ = prune_zero_padding(gt_box, gt_label) - bbox = [ - res['bbox'] for res in infer_results - if int(res['image_id']) == int(im_id[i]) - ] - score = [ - res['score'] for res in infer_results - if int(res['image_id']) == int(im_id[i]) - ] - label = [ - self.catid2clsid[int(res['category_id'])] - for res in infer_results - if int(res['image_id']) == int(im_id[i]) - ] - self.detection_map.update(bbox, score, label, gt_box, gt_label) - - def save_results(self, results, output_dir, imid2path): - if imid2path: - data_dicts = defaultdict(list) - for result in results: - image_id = result['image_id'] - data_dicts[image_id].append(result) - - for image_id, image_path in imid2path.items(): - basename = os.path.splitext(os.path.split(image_path)[-1])[0] 
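VOCMetric and RBoxMetric above both divide ground-truth boxes by the per-image scale_factor so that predictions and ground truth are compared at the same resolution. A toy numpy illustration of that convention, with fabricated values:

import numpy as np

scale_factor = np.array([2.0, 1.5])            # (h_scale, w_scale) from preprocessing
gt_box = np.array([[150., 100., 300., 200.]])  # xyxy at network-input resolution
h, w = scale_factor
print(gt_box / np.array([w, h, w, h]))         # [[100.  50. 200. 100.]]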
- output = os.path.join(output_dir, "{}.txt".format(basename)) - dets = data_dicts.get(image_id, []) - with open(output, 'w') as f: - for det in dets: - catid, bbox, score = det['category_id'], det[ - 'bbox'], det['score'] - bbox_pred = '{} {} '.format(self.catid2name[catid], - score) + ' '.join( - [str(e) for e in bbox]) - f.write(bbox_pred + '\n') - - logger.info('The bbox result is saved to {}.'.format(output_dir)) - else: - output = os.path.join(output_dir, "bbox.json") - with open(output, 'w') as f: - json.dump(results, f) - - logger.info('The bbox result is saved to {}.'.format(output)) - - def accumulate(self): - if self.output_eval: - self.save_results(self.results, self.output_eval, self.imid2path) - - if not self.save_prediction_only: - logger.info("Accumulating evaluatation results...") - self.detection_map.accumulate() - - def log(self): - map_stat = 100. * self.detection_map.get_map() - logger.info("mAP({:.2f}, {}) = {:.2f}%".format(self.overlap_thresh, - self.map_type, map_stat)) - - def get_results(self): - return {'bbox': [self.detection_map.get_map()]} - - -class SNIPERCOCOMetric(COCOMetric): - def __init__(self, anno_file, **kwargs): - super(SNIPERCOCOMetric, self).__init__(anno_file, **kwargs) - self.dataset = kwargs["dataset"] - self.chip_results = [] - - def reset(self): - # only bbox and mask evaluation support currently - self.results = {'bbox': [], 'mask': [], 'segm': [], 'keypoint': []} - self.eval_results = {} - self.chip_results = [] - - def update(self, inputs, outputs): - outs = {} - # outputs Tensor -> numpy.ndarray - for k, v in outputs.items(): - outs[k] = v.numpy() if isinstance(v, paddle.Tensor) else v - - im_id = inputs['im_id'] - outs['im_id'] = im_id.numpy() if isinstance(im_id, - paddle.Tensor) else im_id - - self.chip_results.append(outs) - - def accumulate(self): - results = self.dataset.anno_cropper.aggregate_chips_detections( - self.chip_results) - for outs in results: - infer_results = get_infer_results( - outs, self.clsid2catid, bias=self.bias) - self.results['bbox'] += infer_results[ - 'bbox'] if 'bbox' in infer_results else [] - - super(SNIPERCOCOMetric, self).accumulate() diff --git a/pdfdet/models/Paddle/ppdet/metrics/mot_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/mot_metrics.py deleted file mode 100644 index f61ae9c..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/mot_metrics.py +++ /dev/null @@ -1,1243 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
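The Metric subclasses deleted above (COCOMetric, VOCMetric, RBoxMetric, SNIPERCOCOMetric) all follow the same reset/update/accumulate/log/get_results lifecycle. A hedged sketch of the driver loop; the annotation path and the eval_batches iterable are hypothetical, not a verbatim ppdet API:

metric = COCOMetric(anno_file='annotations/instances_val.json')  # hypothetical path
metric.reset()
for inputs, outputs in eval_batches:  # hypothetical iterable of (inputs, model outputs)
    metric.update(inputs, outputs)
metric.accumulate()   # e.g. writes bbox.json and runs the COCO evaluation
metric.log()
results = metric.get_results()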
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import copy -import sys -import math -from collections import defaultdict -import numpy as np - -from ppdet.modeling.bbox_utils import bbox_iou_np_expand -from .map_utils import ap_per_class -from .metrics import Metric -from .munkres import Munkres - -try: - import motmetrics as mm - mm.lap.default_solver = 'lap' -except: - pass - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['MOTEvaluator', 'MOTMetric', 'JDEDetMetric', 'KITTIMOTMetric'] - - -def read_mot_results(filename, is_gt=False, is_ignore=False): - valid_label = [1] - ignore_labels = [2, 7, 8, 12] # only in motchallenge datasets like 'MOT16' - if is_gt: - logger.info( - "In MOT16/17 dataset the valid_label of ground truth is '{}', " - "in other dataset it should be '0' for single classs MOT.".format( - valid_label[0])) - results_dict = dict() - if os.path.isfile(filename): - with open(filename, 'r') as f: - for line in f.readlines(): - linelist = line.split(',') - if len(linelist) < 7: - continue - fid = int(linelist[0]) - if fid < 1: - continue - results_dict.setdefault(fid, list()) - - if is_gt: - label = int(float(linelist[7])) - mark = int(float(linelist[6])) - if mark == 0 or label not in valid_label: - continue - score = 1 - elif is_ignore: - if 'MOT16-' in filename or 'MOT17-' in filename or 'MOT15-' in filename or 'MOT20-' in filename: - label = int(float(linelist[7])) - vis_ratio = float(linelist[8]) - if label not in ignore_labels and vis_ratio >= 0: - continue - else: - continue - score = 1 - else: - score = float(linelist[6]) - - tlwh = tuple(map(float, linelist[2:6])) - target_id = int(linelist[1]) - - results_dict[fid].append((tlwh, target_id, score)) - return results_dict - - -""" -MOT dataset label list, see in https://motchallenge.net -labels={'ped', ... % 1 - 'person_on_vhcl', ... % 2 - 'car', ... % 3 - 'bicycle', ... % 4 - 'mbike', ... % 5 - 'non_mot_vhcl', ... % 6 - 'static_person', ... % 7 - 'distractor', ... % 8 - 'occluder', ... % 9 - 'occluder_on_grnd', ... % 10 - 'occluder_full', ... % 11 - 'reflection', ... % 12 - 'crowd' ... % 13 -}; -""" - - -def unzip_objs(objs): - if len(objs) > 0: - tlwhs, ids, scores = zip(*objs) - else: - tlwhs, ids, scores = [], [], [] - tlwhs = np.asarray(tlwhs, dtype=float).reshape(-1, 4) - return tlwhs, ids, scores - - -class MOTEvaluator(object): - def __init__(self, data_root, seq_name, data_type): - self.data_root = data_root - self.seq_name = seq_name - self.data_type = data_type - - self.load_annotations() - try: - import motmetrics as mm - mm.lap.default_solver = 'lap' - except Exception as e: - raise RuntimeError( - 'Unable to use MOT metric, please install motmetrics, for example: `pip install motmetrics`, see https://github.com/longcw/py-motmetrics' - ) - self.reset_accumulator() - - def load_annotations(self): - assert self.data_type == 'mot' - gt_filename = os.path.join(self.data_root, self.seq_name, 'gt', - 'gt.txt') - if not os.path.exists(gt_filename): - logger.warning( - "gt_filename '{}' of MOTEvaluator is not exist, so the MOTA will be -INF." 
- ) - self.gt_frame_dict = read_mot_results(gt_filename, is_gt=True) - self.gt_ignore_frame_dict = read_mot_results( - gt_filename, is_ignore=True) - - def reset_accumulator(self): - self.acc = mm.MOTAccumulator(auto_id=True) - - def eval_frame(self, frame_id, trk_tlwhs, trk_ids, rtn_events=False): - # results - trk_tlwhs = np.copy(trk_tlwhs) - trk_ids = np.copy(trk_ids) - - # gts - gt_objs = self.gt_frame_dict.get(frame_id, []) - gt_tlwhs, gt_ids = unzip_objs(gt_objs)[:2] - - # ignore boxes - ignore_objs = self.gt_ignore_frame_dict.get(frame_id, []) - ignore_tlwhs = unzip_objs(ignore_objs)[0] - - # remove ignored results - keep = np.ones(len(trk_tlwhs), dtype=bool) - iou_distance = mm.distances.iou_matrix( - ignore_tlwhs, trk_tlwhs, max_iou=0.5) - if len(iou_distance) > 0: - match_is, match_js = mm.lap.linear_sum_assignment(iou_distance) - match_is, match_js = map(lambda a: np.asarray(a, dtype=int), [match_is, match_js]) - match_ious = iou_distance[match_is, match_js] - - match_js = np.asarray(match_js, dtype=int) - match_js = match_js[np.logical_not(np.isnan(match_ious))] - keep[match_js] = False - trk_tlwhs = trk_tlwhs[keep] - trk_ids = trk_ids[keep] - - # get distance matrix - iou_distance = mm.distances.iou_matrix(gt_tlwhs, trk_tlwhs, max_iou=0.5) - - # acc - self.acc.update(gt_ids, trk_ids, iou_distance) - - if rtn_events and iou_distance.size > 0 and hasattr(self.acc, - 'last_mot_events'): - events = self.acc.last_mot_events # only supported by https://github.com/longcw/py-motmetrics - else: - events = None - return events - - def eval_file(self, filename): - self.reset_accumulator() - - result_frame_dict = read_mot_results(filename, is_gt=False) - frames = sorted(list(set(result_frame_dict.keys()))) - for frame_id in frames: - trk_objs = result_frame_dict.get(frame_id, []) - trk_tlwhs, trk_ids = unzip_objs(trk_objs)[:2] - self.eval_frame(frame_id, trk_tlwhs, trk_ids, rtn_events=False) - - return self.acc - - @staticmethod - def get_summary(accs, - names, - metrics=('mota', 'num_switches', 'idp', 'idr', 'idf1', - 'precision', 'recall')): - names = copy.deepcopy(names) - if metrics is None: - metrics = mm.metrics.motchallenge_metrics - metrics = copy.deepcopy(metrics) - - mh = mm.metrics.create() - summary = mh.compute_many( - accs, metrics=metrics, names=names, generate_overall=True) - return summary - - @staticmethod - def save_summary(summary, filename): - import pandas as pd - writer = pd.ExcelWriter(filename) - summary.to_excel(writer) - writer.save() - - -class MOTMetric(Metric): - def __init__(self, save_summary=False): - self.save_summary = save_summary - self.MOTEvaluator = MOTEvaluator - self.result_root = None - self.reset() - - def reset(self): - self.accs = [] - self.seqs = [] - - def update(self, data_root, seq, data_type, result_root, result_filename): - evaluator = self.MOTEvaluator(data_root, seq, data_type) - self.accs.append(evaluator.eval_file(result_filename)) - self.seqs.append(seq) - self.result_root = result_root - - def accumulate(self): - metrics = mm.metrics.motchallenge_metrics - mh = mm.metrics.create() - summary = self.MOTEvaluator.get_summary(self.accs, self.seqs, metrics) - self.strsummary = mm.io.render_summary( - summary, - formatters=mh.formatters, - namemap=mm.io.motchallenge_metric_names) - if self.save_summary: - self.MOTEvaluator.save_summary( - summary, os.path.join(self.result_root, 'summary.xlsx')) - - def log(self): - print(self.strsummary) - - def get_results(self): - return self.strsummary - - -class JDEDetMetric(Metric): - # Note this 
detection AP metric is different from COCOMetric or VOCMetric, - # and the bbox coordinates are not scaled to the original image - def __init__(self, overlap_thresh=0.5): - self.overlap_thresh = overlap_thresh - self.reset() - - def reset(self): - self.AP_accum = np.zeros(1) - self.AP_accum_count = np.zeros(1) - - def update(self, inputs, outputs): - bboxes = outputs['bbox'][:, 2:].numpy() - scores = outputs['bbox'][:, 1].numpy() - labels = outputs['bbox'][:, 0].numpy() - bbox_lengths = outputs['bbox_num'].numpy() - if bboxes.shape[0] == 1 and bboxes.sum() == 0.0: - return - - gt_boxes = inputs['gt_bbox'].numpy()[0] - gt_labels = inputs['gt_class'].numpy()[0] - if gt_labels.shape[0] == 0: - return - - correct = [] - detected = [] - for i in range(bboxes.shape[0]): - obj_pred = 0 - pred_bbox = bboxes[i].reshape(1, 4) - # Compute iou with target boxes - iou = bbox_iou_np_expand(pred_bbox, gt_boxes, x1y1x2y2=True)[0] - # Extract index of largest overlap - best_i = np.argmax(iou) - # If overlap exceeds threshold and classification is correct mark as correct - if iou[best_i] > self.overlap_thresh and obj_pred == gt_labels[ - best_i] and best_i not in detected: - correct.append(1) - detected.append(best_i) - else: - correct.append(0) - - # Compute Average Precision (AP) per class - target_cls = list(gt_labels.T[0]) - AP, AP_class, R, P = ap_per_class( - tp=correct, - conf=scores, - pred_cls=np.zeros_like(scores), - target_cls=target_cls) - self.AP_accum_count += np.bincount(AP_class, minlength=1) - self.AP_accum += np.bincount(AP_class, minlength=1, weights=AP) - - def accumulate(self): - logger.info("Accumulating evaluation results...") - self.map_stat = self.AP_accum[0] / (self.AP_accum_count[0] + 1E-16) - - def log(self): - map_stat = 100. * self.map_stat - logger.info("mAP({:.2f}) = {:.2f}%".format(self.overlap_thresh, - map_stat)) - - def get_results(self): - return self.map_stat - - -""" -The following code is borrowed from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/evaluate_tracking.py -""" - - -class tData: - """ - Utility class to load data. - """ - def __init__(self,frame=-1,obj_type="unset",truncation=-1,occlusion=-1,\ - obs_angle=-10,x1=-1,y1=-1,x2=-1,y2=-1,w=-1,h=-1,l=-1,\ - X=-1000,Y=-1000,Z=-1000,yaw=-10,score=-1000,track_id=-1): - """ - Constructor, initializes the object given the parameters.
- """ - self.frame = frame - self.track_id = track_id - self.obj_type = obj_type - self.truncation = truncation - self.occlusion = occlusion - self.obs_angle = obs_angle - self.x1 = x1 - self.y1 = y1 - self.x2 = x2 - self.y2 = y2 - self.w = w - self.h = h - self.l = l - self.X = X - self.Y = Y - self.Z = Z - self.yaw = yaw - self.score = score - self.ignored = False - self.valid = False - self.tracker = -1 - - def __str__(self): - attrs = vars(self) - return '\n'.join("%s: %s" % item for item in attrs.items()) - - -class KITTIEvaluation(object): - """ KITTI tracking statistics (CLEAR MOT, id-switches, fragments, ML/PT/MT, precision/recall) - MOTA - Multi-object tracking accuracy in [0,100] - MOTP - Multi-object tracking precision in [0,100] (3D) / [td,100] (2D) - MOTAL - Multi-object tracking accuracy in [0,100] with log10(id-switches) - - id-switches - number of id switches - fragments - number of fragmentations - - MT, PT, ML - number of mostly tracked, partially tracked and mostly lost trajectories - - recall - recall = percentage of detected targets - precision - precision = percentage of correctly detected targets - FAR - number of false alarms per frame - falsepositives - number of false positives (FP) - missed - number of missed targets (FN) - """ - def __init__(self, result_path, gt_path, min_overlap=0.5, max_truncation = 0,\ - min_height = 25, max_occlusion = 2, cls="car",\ - n_frames=[], seqs=[], n_sequences=0): - # get number of sequences and - # get number of frames per sequence from test mapping - # (created while extracting the benchmark) - self.gt_path = os.path.join(gt_path, "../labels") - self.n_frames = n_frames - self.sequence_name = seqs - self.n_sequences = n_sequences - - self.cls = cls # class to evaluate, i.e. pedestrian or car - - self.result_path = result_path - - # statistics and numbers for evaluation - self.n_gt = 0 # number of ground truth detections minus ignored false negatives and true positives - self.n_igt = 0 # number of ignored ground truth detections - self.n_gts = [ - ] # number of ground truth detections minus ignored false negatives and true positives PER SEQUENCE - self.n_igts = [ - ] # number of ground ignored truth detections PER SEQUENCE - self.n_gt_trajectories = 0 - self.n_gt_seq = [] - self.n_tr = 0 # number of tracker detections minus ignored tracker detections - self.n_trs = [ - ] # number of tracker detections minus ignored tracker detections PER SEQUENCE - self.n_itr = 0 # number of ignored tracker detections - self.n_itrs = [] # number of ignored tracker detections PER SEQUENCE - self.n_igttr = 0 # number of ignored ground truth detections where the corresponding associated tracker detection is also ignored - self.n_tr_trajectories = 0 - self.n_tr_seq = [] - self.MOTA = 0 - self.MOTP = 0 - self.MOTAL = 0 - self.MODA = 0 - self.MODP = 0 - self.MODP_t = [] - self.recall = 0 - self.precision = 0 - self.F1 = 0 - self.FAR = 0 - self.total_cost = 0 - self.itp = 0 # number of ignored true positives - self.itps = [] # number of ignored true positives PER SEQUENCE - self.tp = 0 # number of true positives including ignored true positives! 
- self.tps = [ - ] # number of true positives including ignored true positives PER SEQUENCE - self.fn = 0 # number of false negatives WITHOUT ignored false negatives - self.fns = [ - ] # number of false negatives WITHOUT ignored false negatives PER SEQUENCE - self.ifn = 0 # number of ignored false negatives - self.ifns = [] # number of ignored false negatives PER SEQUENCE - self.fp = 0 # number of false positives - # a bit tricky, the number of ignored false negatives and ignored true positives - # is subtracted, but if both tracker detection and ground truth detection - # are ignored this number is added again to avoid double counting - self.fps = [] # above PER SEQUENCE - self.mme = 0 - self.fragments = 0 - self.id_switches = 0 - self.MT = 0 - self.PT = 0 - self.ML = 0 - - self.min_overlap = min_overlap # minimum bounding box overlap for 3rd party metrics - self.max_truncation = max_truncation # maximum truncation of an object for evaluation - self.max_occlusion = max_occlusion # maximum occlusion of an object for evaluation - self.min_height = min_height # minimum height of an object for evaluation - self.n_sample_points = 500 - - # this should be enough to hold all groundtruth trajectories - # is expanded if necessary and reduced in any case - self.gt_trajectories = [[] for x in range(self.n_sequences)] - self.ign_trajectories = [[] for x in range(self.n_sequences)] - - def loadGroundtruth(self): - try: - self._loadData(self.gt_path, cls=self.cls, loading_groundtruth=True) - except IOError: - return False - return True - - def loadTracker(self): - try: - if not self._loadData( - self.result_path, cls=self.cls, loading_groundtruth=False): - return False - except IOError: - return False - return True - - def _loadData(self, - root_dir, - cls, - min_score=-1000, - loading_groundtruth=False): - """ - Generic loader for ground truth and tracking data. - Use loadGroundtruth() or loadTracker() to load this data. - Loads detections in KITTI format from textfiles. - """ - # construct objectDetections object to hold detection data - t_data = tData() - data = [] - eval_2d = True - eval_3d = True - - seq_data = [] - n_trajectories = 0 - n_trajectories_seq = [] - for seq, s_name in enumerate(self.sequence_name): - i = 0 - filename = os.path.join(root_dir, "%s.txt" % s_name) - f = open(filename, "r") - - f_data = [ - [] for x in range(self.n_frames[seq]) - ] # current set has only 1059 entries, sufficient length is checked anyway - ids = [] - n_in_seq = 0 - id_frame_cache = [] - for line in f: - # KITTI tracking benchmark data format: - # (frame,tracklet_id,objectType,truncation,occlusion,alpha,x1,y1,x2,y2,h,w,l,X,Y,Z,ry) - line = line.strip() - fields = line.split(" ") - # classes that should be loaded (ignored neighboring classes) - if "car" in cls.lower(): - classes = ["car", "van"] - elif "pedestrian" in cls.lower(): - classes = ["pedestrian", "person_sitting"] - else: - classes = [cls.lower()] - classes += ["dontcare"] - if not any([s for s in classes if s in fields[2].lower()]): - continue - # get fields from table - t_data.frame = int(float(fields[0])) # frame - t_data.track_id = int(float(fields[1])) # id - t_data.obj_type = fields[ - 2].lower() # object type [car, pedestrian, cyclist, ...] 
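The field parsing above follows the KITTI tracking benchmark layout: frame, track id, object type, truncation, occlusion, alpha, 2D box (x1, y1, x2, y2), 3D dimensions (h, w, l), location (X, Y, Z), yaw, and an optional trailing score. A minimal sketch on a fabricated line:

line = "0 2 car 0 0 -1.57 599.41 156.40 629.75 189.25 1.52 1.64 3.69 2.57 1.57 9.71 -1.56"
fields = line.strip().split(" ")
frame, track_id = int(float(fields[0])), int(float(fields[1]))
obj_type = fields[2].lower()
x1, y1, x2, y2 = map(float, fields[6:10])   # 2D box [px]
h, w, l = map(float, fields[10:13])         # 3D dimensions [m]
print(frame, track_id, obj_type, (x1, y1, x2, y2), (h, w, l))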
- t_data.truncation = int( - float(fields[3])) # truncation [-1,0,1,2] - t_data.occlusion = int( - float(fields[4])) # occlusion [-1,0,1,2] - t_data.obs_angle = float(fields[5]) # observation angle [rad] - t_data.x1 = float(fields[6]) # left [px] - t_data.y1 = float(fields[7]) # top [px] - t_data.x2 = float(fields[8]) # right [px] - t_data.y2 = float(fields[9]) # bottom [px] - t_data.h = float(fields[10]) # height [m] - t_data.w = float(fields[11]) # width [m] - t_data.l = float(fields[12]) # length [m] - t_data.X = float(fields[13]) # X [m] - t_data.Y = float(fields[14]) # Y [m] - t_data.Z = float(fields[15]) # Z [m] - t_data.yaw = float(fields[16]) # yaw angle [rad] - if not loading_groundtruth: - if len(fields) == 17: - t_data.score = -1 - elif len(fields) == 18: - t_data.score = float(fields[17]) # detection score - else: - logger.info("file is not in KITTI format") - return - - # do not consider objects marked as invalid - if t_data.track_id is -1 and t_data.obj_type != "dontcare": - continue - - idx = t_data.frame - # check if length for frame data is sufficient - if idx >= len(f_data): - print("extend f_data", idx, len(f_data)) - f_data += [[] for x in range(max(500, idx - len(f_data)))] - try: - id_frame = (t_data.frame, t_data.track_id) - if id_frame in id_frame_cache and not loading_groundtruth: - logger.info( - "track ids are not unique for sequence %d: frame %d" - % (seq, t_data.frame)) - logger.info( - "track id %d occurred at least twice for this frame" - % t_data.track_id) - logger.info("Exiting...") - #continue # this allows to evaluate non-unique result files - return False - id_frame_cache.append(id_frame) - f_data[t_data.frame].append(copy.copy(t_data)) - except: - print(len(f_data), idx) - raise - - if t_data.track_id not in ids and t_data.obj_type != "dontcare": - ids.append(t_data.track_id) - n_trajectories += 1 - n_in_seq += 1 - - # check if uploaded data provides information for 2D and 3D evaluation - if not loading_groundtruth and eval_2d is True and ( - t_data.x1 == -1 or t_data.x2 == -1 or t_data.y1 == -1 or - t_data.y2 == -1): - eval_2d = False - if not loading_groundtruth and eval_3d is True and ( - t_data.X == -1000 or t_data.Y == -1000 or - t_data.Z == -1000): - eval_3d = False - - # only add existing frames - n_trajectories_seq.append(n_in_seq) - seq_data.append(f_data) - f.close() - - if not loading_groundtruth: - self.tracker = seq_data - self.n_tr_trajectories = n_trajectories - self.eval_2d = eval_2d - self.eval_3d = eval_3d - self.n_tr_seq = n_trajectories_seq - if self.n_tr_trajectories == 0: - return False - else: - # split ground truth and DontCare areas - self.dcareas = [] - self.groundtruth = [] - for seq_idx in range(len(seq_data)): - seq_gt = seq_data[seq_idx] - s_g, s_dc = [], [] - for f in range(len(seq_gt)): - all_gt = seq_gt[f] - g, dc = [], [] - for gg in all_gt: - if gg.obj_type == "dontcare": - dc.append(gg) - else: - g.append(gg) - s_g.append(g) - s_dc.append(dc) - self.dcareas.append(s_dc) - self.groundtruth.append(s_g) - self.n_gt_seq = n_trajectories_seq - self.n_gt_trajectories = n_trajectories - return True - - def boxoverlap(self, a, b, criterion="union"): - """ - boxoverlap computes intersection over union for bbox a and b in KITTI format. - If the criterion is 'union', overlap = (a inter b) / a union b). - If the criterion is 'a', overlap = (a inter b) / a, where b should be a dontcare area. 
- """ - x1 = max(a.x1, b.x1) - y1 = max(a.y1, b.y1) - x2 = min(a.x2, b.x2) - y2 = min(a.y2, b.y2) - - w = x2 - x1 - h = y2 - y1 - - if w <= 0. or h <= 0.: - return 0. - inter = w * h - aarea = (a.x2 - a.x1) * (a.y2 - a.y1) - barea = (b.x2 - b.x1) * (b.y2 - b.y1) - # intersection over union overlap - if criterion.lower() == "union": - o = inter / float(aarea + barea - inter) - elif criterion.lower() == "a": - o = float(inter) / float(aarea) - else: - raise TypeError("Unkown type for criterion") - return o - - def compute3rdPartyMetrics(self): - """ - Computes the metrics defined in - - Stiefelhagen 2008: Evaluating Multiple Object Tracking Performance: The CLEAR MOT Metrics - MOTA, MOTAL, MOTP - - Nevatia 2008: Global Data Association for Multi-Object Tracking Using Network Flows - MT/PT/ML - """ - # construct Munkres object for Hungarian Method association - hm = Munkres() - max_cost = 1e9 - - # go through all frames and associate ground truth and tracker results - # groundtruth and tracker contain lists for every single frame containing lists of KITTI format detections - fr, ids = 0, 0 - for seq_idx in range(len(self.groundtruth)): - seq_gt = self.groundtruth[seq_idx] - seq_dc = self.dcareas[seq_idx] # don't care areas - seq_tracker = self.tracker[seq_idx] - seq_trajectories = defaultdict(list) - seq_ignored = defaultdict(list) - - # statistics over the current sequence, check the corresponding - # variable comments in __init__ to get their meaning - seqtp = 0 - seqitp = 0 - seqfn = 0 - seqifn = 0 - seqfp = 0 - seqigt = 0 - seqitr = 0 - - last_ids = [[], []] - n_gts = 0 - n_trs = 0 - - for f in range(len(seq_gt)): - g = seq_gt[f] - dc = seq_dc[f] - - t = seq_tracker[f] - # counting total number of ground truth and tracker objects - self.n_gt += len(g) - self.n_tr += len(t) - - n_gts += len(g) - n_trs += len(t) - - # use hungarian method to associate, using boxoverlap 0..1 as cost - # build cost matrix - cost_matrix = [] - this_ids = [[], []] - for gg in g: - # save current ids - this_ids[0].append(gg.track_id) - this_ids[1].append(-1) - gg.tracker = -1 - gg.id_switch = 0 - gg.fragmentation = 0 - cost_row = [] - for tt in t: - # overlap == 1 is cost ==0 - c = 1 - self.boxoverlap(gg, tt) - # gating for boxoverlap - if c <= self.min_overlap: - cost_row.append(c) - else: - cost_row.append(max_cost) # = 1e9 - cost_matrix.append(cost_row) - # all ground truth trajectories are initially not associated - # extend groundtruth trajectories lists (merge lists) - seq_trajectories[gg.track_id].append(-1) - seq_ignored[gg.track_id].append(False) - - if len(g) is 0: - cost_matrix = [[]] - # associate - association_matrix = hm.compute(cost_matrix) - - # tmp variables for sanity checks and MODP computation - tmptp = 0 - tmpfp = 0 - tmpfn = 0 - tmpc = 0 # this will sum up the overlaps for all true positives - tmpcs = [0] * len( - g) # this will save the overlaps for all true positives - # the reason is that some true positives might be ignored - # later such that the corrsponding overlaps can - # be subtracted from tmpc for MODP computation - - # mapping for tracker ids and ground truth ids - for row, col in association_matrix: - # apply gating on boxoverlap - c = cost_matrix[row][col] - if c < max_cost: - g[row].tracker = t[col].track_id - this_ids[1][row] = t[col].track_id - t[col].valid = True - g[row].distance = c - self.total_cost += 1 - c - tmpc += 1 - c - tmpcs[row] = 1 - c - seq_trajectories[g[row].track_id][-1] = t[col].track_id - - # true positives are only valid associations - self.tp += 1 - 
tmptp += 1 - else: - g[row].tracker = -1 - self.fn += 1 - tmpfn += 1 - - # associate tracker and DontCare areas - # ignore tracker in neighboring classes - nignoredtracker = 0 # number of ignored tracker detections - ignoredtrackers = dict() # will associate the track_id with -1 - # if it is not ignored and 1 if it is - # ignored; - # this is used to avoid double counting ignored - # cases, see the next loop - - for tt in t: - ignoredtrackers[tt.track_id] = -1 - # ignore detection if it belongs to a neighboring class or is - # smaller or equal to the minimum height - - tt_height = abs(tt.y1 - tt.y2) - if ((self.cls == "car" and tt.obj_type == "van") or - (self.cls == "pedestrian" and - tt.obj_type == "person_sitting") or - tt_height <= self.min_height) and not tt.valid: - nignoredtracker += 1 - tt.ignored = True - ignoredtrackers[tt.track_id] = 1 - continue - for d in dc: - overlap = self.boxoverlap(tt, d, "a") - if overlap > 0.5 and not tt.valid: - tt.ignored = True - nignoredtracker += 1 - ignoredtrackers[tt.track_id] = 1 - break - - # check for ignored FN/TP (truncation or neighboring object class) - ignoredfn = 0 # the number of ignored false negatives - nignoredtp = 0 # the number of ignored true positives - nignoredpairs = 0 # the number of ignored pairs, i.e. a true positive - # which is ignored but where the associated tracker - # detection has already been ignored - - gi = 0 - for gg in g: - if gg.tracker < 0: - if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\ - or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"): - seq_ignored[gg.track_id][-1] = True - gg.ignored = True - ignoredfn += 1 - - elif gg.tracker >= 0: - if gg.occlusion>self.max_occlusion or gg.truncation>self.max_truncation\ - or (self.cls=="car" and gg.obj_type=="van") or (self.cls=="pedestrian" and gg.obj_type=="person_sitting"): - - seq_ignored[gg.track_id][-1] = True - gg.ignored = True - nignoredtp += 1 - - # if the associated tracker detection is already ignored, - # we want to avoid double counting ignored detections - if ignoredtrackers[gg.tracker] > 0: - nignoredpairs += 1 - - # for computing MODP, the overlaps from ignored detections - # are subtracted - tmpc -= tmpcs[gi] - gi += 1 - - # the below might be confusion, check the comments in __init__ - # to see what the individual statistics represent - - # correct TP by number of ignored TP due to truncation - # ignored TP are shown as tracked in visualization - tmptp -= nignoredtp - - # count the number of ignored true positives - self.itp += nignoredtp - - # adjust the number of ground truth objects considered - self.n_gt -= (ignoredfn + nignoredtp) - - # count the number of ignored ground truth objects - self.n_igt += ignoredfn + nignoredtp - - # count the number of ignored tracker objects - self.n_itr += nignoredtracker - - # count the number of ignored pairs, i.e. 
associated tracker and - # ground truth objects that are both ignored - self.n_igttr += nignoredpairs - - # false negatives = associated gt bboxes exceding association threshold + non-associated gt bboxes - tmpfn += len(g) - len(association_matrix) - ignoredfn - self.fn += len(g) - len(association_matrix) - ignoredfn - self.ifn += ignoredfn - - # false positives = tracker bboxes - associated tracker bboxes - # mismatches (mme_t) - tmpfp += len( - t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs - self.fp += len( - t) - tmptp - nignoredtracker - nignoredtp + nignoredpairs - - # update sequence data - seqtp += tmptp - seqitp += nignoredtp - seqfp += tmpfp - seqfn += tmpfn - seqifn += ignoredfn - seqigt += ignoredfn + nignoredtp - seqitr += nignoredtracker - - # sanity checks - # - the number of true positives minues ignored true positives - # should be greater or equal to 0 - # - the number of false negatives should be greater or equal to 0 - # - the number of false positives needs to be greater or equal to 0 - # otherwise ignored detections might be counted double - # - the number of counted true positives (plus ignored ones) - # and the number of counted false negatives (plus ignored ones) - # should match the total number of ground truth objects - # - the number of counted true positives (plus ignored ones) - # and the number of counted false positives - # plus the number of ignored tracker detections should - # match the total number of tracker detections; note that - # nignoredpairs is subtracted here to avoid double counting - # of ignored detection sin nignoredtp and nignoredtracker - if tmptp < 0: - print(tmptp, nignoredtp) - raise NameError("Something went wrong! TP is negative") - if tmpfn < 0: - print(tmpfn, - len(g), - len(association_matrix), ignoredfn, nignoredpairs) - raise NameError("Something went wrong! FN is negative") - if tmpfp < 0: - print(tmpfp, - len(t), tmptp, nignoredtracker, nignoredtp, - nignoredpairs) - raise NameError("Something went wrong! FP is negative") - if tmptp + tmpfn is not len(g) - ignoredfn - nignoredtp: - print("seqidx", seq_idx) - print("frame ", f) - print("TP ", tmptp) - print("FN ", tmpfn) - print("FP ", tmpfp) - print("nGT ", len(g)) - print("nAss ", len(association_matrix)) - print("ign GT", ignoredfn) - print("ign TP", nignoredtp) - raise NameError( - "Something went wrong! nGroundtruth is not TP+FN") - if tmptp + tmpfp + nignoredtp + nignoredtracker - nignoredpairs is not len( - t): - print(seq_idx, f, len(t), tmptp, tmpfp) - print(len(association_matrix), association_matrix) - raise NameError( - "Something went wrong! nTracker is not TP+FP") - - # check for id switches or fragmentations - for i, tt in enumerate(this_ids[0]): - if tt in last_ids[0]: - idx = last_ids[0].index(tt) - tid = this_ids[1][i] - lid = last_ids[1][idx] - if tid != lid and lid != -1 and tid != -1: - if g[i].truncation < self.max_truncation: - g[i].id_switch = 1 - ids += 1 - if tid != lid and lid != -1: - if g[i].truncation < self.max_truncation: - g[i].fragmentation = 1 - fr += 1 - - # save current index - last_ids = this_ids - # compute MOTP_t - MODP_t = 1 - if tmptp != 0: - MODP_t = tmpc / float(tmptp) - self.MODP_t.append(MODP_t) - - # remove empty lists for current gt trajectories - self.gt_trajectories[seq_idx] = seq_trajectories - self.ign_trajectories[seq_idx] = seq_ignored - - # gather statistics for "per sequence" statistics. 
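The id-switch bookkeeping above compares each ground-truth target's matched tracker id against its last non-ignored assignment. A simplified toy version of that counting; the real loop additionally checks truncation and the previous frame's assignment:

g = [5, 5, -1, 5, 7, 7]   # tracker id matched to one gt target per frame; -1 = missed
last_id, switches = g[0], 0
for f in range(1, len(g)):
    if g[f] != -1 and last_id != -1 and g[f] != last_id:
        switches += 1
    if g[f] != -1:
        last_id = g[f]
print(switches)  # 1: the 5 -> 7 change; the miss at frame 2 is not a switch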
- self.n_gts.append(n_gts) - self.n_trs.append(n_trs) - self.tps.append(seqtp) - self.itps.append(seqitp) - self.fps.append(seqfp) - self.fns.append(seqfn) - self.ifns.append(seqifn) - self.n_igts.append(seqigt) - self.n_itrs.append(seqitr) - - # compute MT/PT/ML, fragments, idswitches for all groundtruth trajectories - n_ignored_tr_total = 0 - for seq_idx, ( - seq_trajectories, seq_ignored - ) in enumerate(zip(self.gt_trajectories, self.ign_trajectories)): - if len(seq_trajectories) == 0: - continue - tmpMT, tmpML, tmpPT, tmpId_switches, tmpFragments = [0] * 5 - n_ignored_tr = 0 - for g, ign_g in zip(seq_trajectories.values(), - seq_ignored.values()): - # all frames of this gt trajectory are ignored - if all(ign_g): - n_ignored_tr += 1 - n_ignored_tr_total += 1 - continue - # all frames of this gt trajectory are not assigned to any detections - if all([this == -1 for this in g]): - tmpML += 1 - self.ML += 1 - continue - # compute tracked frames in trajectory - last_id = g[0] - # first detection (necessary to be in gt_trajectories) is always tracked - tracked = 1 if g[0] >= 0 else 0 - lgt = 0 if ign_g[0] else 1 - for f in range(1, len(g)): - if ign_g[f]: - last_id = -1 - continue - lgt += 1 - if last_id != g[f] and last_id != -1 and g[f] != -1 and g[ - f - 1] != -1: - tmpId_switches += 1 - self.id_switches += 1 - if f < len(g) - 1 and g[f - 1] != g[ - f] and last_id != -1 and g[f] != -1 and g[f + - 1] != -1: - tmpFragments += 1 - self.fragments += 1 - if g[f] != -1: - tracked += 1 - last_id = g[f] - # handle last frame; tracked state is handled in for loop (g[f]!=-1) - if len(g) > 1 and g[f - 1] != g[f] and last_id != -1 and g[ - f] != -1 and not ign_g[f]: - tmpFragments += 1 - self.fragments += 1 - - # compute MT/PT/ML - tracking_ratio = tracked / float(len(g) - sum(ign_g)) - if tracking_ratio > 0.8: - tmpMT += 1 - self.MT += 1 - elif tracking_ratio < 0.2: - tmpML += 1 - self.ML += 1 - else: # 0.2 <= tracking_ratio <= 0.8 - tmpPT += 1 - self.PT += 1 - - if (self.n_gt_trajectories - n_ignored_tr_total) == 0: - self.MT = 0. - self.PT = 0. - self.ML = 0. - else: - self.MT /= float(self.n_gt_trajectories - n_ignored_tr_total) - self.PT /= float(self.n_gt_trajectories - n_ignored_tr_total) - self.ML /= float(self.n_gt_trajectories - n_ignored_tr_total) - - # precision/recall etc. - if (self.fp + self.tp) == 0 or (self.tp + self.fn) == 0: - self.recall = 0. - self.precision = 0. - else: - self.recall = self.tp / float(self.tp + self.fn) - self.precision = self.tp / float(self.fp + self.tp) - if (self.recall + self.precision) == 0: - self.F1 = 0. - else: - self.F1 = 2. 
* (self.precision * self.recall) / ( - self.precision + self.recall) - if sum(self.n_frames) == 0: - self.FAR = "n/a" - else: - self.FAR = self.fp / float(sum(self.n_frames)) - - # compute CLEARMOT - if self.n_gt == 0: - self.MOTA = -float("inf") - self.MODA = -float("inf") - else: - self.MOTA = 1 - (self.fn + self.fp + self.id_switches - ) / float(self.n_gt) - self.MODA = 1 - (self.fn + self.fp) / float(self.n_gt) - if self.tp == 0: - self.MOTP = float("inf") - else: - self.MOTP = self.total_cost / float(self.tp) - if self.n_gt != 0: - if self.id_switches == 0: - self.MOTAL = 1 - (self.fn + self.fp + self.id_switches - ) / float(self.n_gt) - else: - self.MOTAL = 1 - (self.fn + self.fp + - math.log10(self.id_switches) - ) / float(self.n_gt) - else: - self.MOTAL = -float("inf") - if sum(self.n_frames) == 0: - self.MODP = "n/a" - else: - self.MODP = sum(self.MODP_t) / float(sum(self.n_frames)) - return True - - def createSummary(self): - summary = "" - summary += "tracking evaluation summary".center(80, "=") + "\n" - summary += self.printEntry("Multiple Object Tracking Accuracy (MOTA)", - self.MOTA) + "\n" - summary += self.printEntry("Multiple Object Tracking Precision (MOTP)", - self.MOTP) + "\n" - summary += self.printEntry("Multiple Object Tracking Accuracy (MOTAL)", - self.MOTAL) + "\n" - summary += self.printEntry("Multiple Object Detection Accuracy (MODA)", - self.MODA) + "\n" - summary += self.printEntry("Multiple Object Detection Precision (MODP)", - self.MODP) + "\n" - summary += "\n" - summary += self.printEntry("Recall", self.recall) + "\n" - summary += self.printEntry("Precision", self.precision) + "\n" - summary += self.printEntry("F1", self.F1) + "\n" - summary += self.printEntry("False Alarm Rate", self.FAR) + "\n" - summary += "\n" - summary += self.printEntry("Mostly Tracked", self.MT) + "\n" - summary += self.printEntry("Partly Tracked", self.PT) + "\n" - summary += self.printEntry("Mostly Lost", self.ML) + "\n" - summary += "\n" - summary += self.printEntry("True Positives", self.tp) + "\n" - #summary += self.printEntry("True Positives per Sequence", self.tps) + "\n" - summary += self.printEntry("Ignored True Positives", self.itp) + "\n" - #summary += self.printEntry("Ignored True Positives per Sequence", self.itps) + "\n" - - summary += self.printEntry("False Positives", self.fp) + "\n" - #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" - summary += self.printEntry("False Negatives", self.fn) + "\n" - #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" - summary += self.printEntry("ID-switches", self.id_switches) + "\n" - self.fp = self.fp / self.n_gt - self.fn = self.fn / self.n_gt - self.id_switches = self.id_switches / self.n_gt - summary += self.printEntry("False Positives Ratio", self.fp) + "\n" - #summary += self.printEntry("False Positives per Sequence", self.fps) + "\n" - summary += self.printEntry("False Negatives Ratio", self.fn) + "\n" - #summary += self.printEntry("False Negatives per Sequence", self.fns) + "\n" - summary += self.printEntry("Ignored False Negatives Ratio", - self.ifn) + "\n" - - #summary += self.printEntry("Ignored False Negatives per Sequence", self.ifns) + "\n" - summary += self.printEntry("Missed Targets", self.fn) + "\n" - summary += self.printEntry("ID-switches", self.id_switches) + "\n" - summary += self.printEntry("Fragmentations", self.fragments) + "\n" - summary += "\n" - summary += self.printEntry("Ground Truth Objects (Total)", self.n_gt + - self.n_igt) + "\n" - #summary += 
self.printEntry("Ground Truth Objects (Total) per Sequence", self.n_gts) + "\n"
-        summary += self.printEntry("Ignored Ground Truth Objects",
-                                   self.n_igt) + "\n"
-        #summary += self.printEntry("Ignored Ground Truth Objects per Sequence", self.n_igts) + "\n"
-        summary += self.printEntry("Ground Truth Trajectories",
-                                   self.n_gt_trajectories) + "\n"
-        summary += "\n"
-        summary += self.printEntry("Tracker Objects (Total)", self.n_tr) + "\n"
-        #summary += self.printEntry("Tracker Objects (Total) per Sequence", self.n_trs) + "\n"
-        summary += self.printEntry("Ignored Tracker Objects", self.n_itr) + "\n"
-        #summary += self.printEntry("Ignored Tracker Objects per Sequence", self.n_itrs) + "\n"
-        summary += self.printEntry("Tracker Trajectories",
-                                   self.n_tr_trajectories) + "\n"
-        #summary += "\n"
-        #summary += self.printEntry("Ignored Tracker Objects with Associated Ignored Ground Truth Objects", self.n_igttr) + "\n"
-        summary += "=" * 80
-        return summary
-
-    def printEntry(self, key, val, width=(70, 10)):
-        """
-            Pretty-print an entry in a table fashion.
-        """
-        s_out = key.ljust(width[0])
-        if type(val) == int:
-            s = "%%%dd" % width[1]
-            s_out += s % val
-        elif type(val) == float:
-            s = "%%%df" % (width[1])
-            s_out += s % val
-        else:
-            s_out += ("%s" % val).rjust(width[1])
-        return s_out
-
-    def saveToStats(self, save_summary):
-        """
-            Save the statistics in a whitespace-separated file.
-        """
-        summary = self.createSummary()
-        if save_summary:
-            filename = os.path.join(self.result_path,
-                                    "summary_%s.txt" % self.cls)
-            dump = open(filename, "w+")
-            dump.write(summary)
-            dump.close()
-        return summary
-
-
-class KITTIMOTMetric(Metric):
-    def __init__(self, save_summary=True):
-        self.save_summary = save_summary
-        self.MOTEvaluator = KITTIEvaluation
-        self.result_root = None
-        self.reset()
-
-    def reset(self):
-        self.seqs = []
-        self.n_sequences = 0
-        self.n_frames = []
-        self.strsummary = ''
-
-    def update(self, data_root, seq, data_type, result_root, result_filename):
-        assert data_type == 'kitti', "data_type should be 'kitti'"
-        self.result_root = result_root
-        self.gt_path = data_root
-        gt_path = '{}/../labels/{}.txt'.format(data_root, seq)
-        gt = open(gt_path, "r")
-        max_frame = 0
-        for line in gt:
-            line = line.strip()
-            line_list = line.split(" ")
-            if int(line_list[0]) > max_frame:
-                max_frame = int(line_list[0])
-        rs = open(result_filename, "r")
-        for line in rs:
-            line = line.strip()
-            line_list = line.split(" ")
-            if int(line_list[0]) > max_frame:
-                max_frame = int(line_list[0])
-        gt.close()
-        rs.close()
-        self.n_frames.append(max_frame + 1)
-        self.seqs.append(seq)
-        self.n_sequences += 1
-
-    def accumulate(self):
-        logger.info("Processing Result for KITTI Tracking Benchmark")
-        e = self.MOTEvaluator(result_path=self.result_root, gt_path=self.gt_path,\
-            n_frames=self.n_frames, seqs=self.seqs, n_sequences=self.n_sequences)
-        try:
-            if not e.loadTracker():
-                return
-            logger.info("Loading Results - Success")
-            logger.info("Evaluate Object Class: %s" % e.cls.upper())
-        except Exception:
-            logger.info("Caught exception while loading result data.")
-        if not e.loadGroundtruth():
-            raise ValueError("Ground truth not found.")
-        logger.info("Loading Groundtruth - Success")
-        # sanity checks
-        if len(e.groundtruth) != len(e.tracker):
-            logger.info(
-                "The uploaded data does not provide results for every sequence.")
-            return False
-        logger.info("Loaded %d Sequences."
% len(e.groundtruth)) - logger.info("Start Evaluation...") - - if e.compute3rdPartyMetrics(): - self.strsummary = e.saveToStats(self.save_summary) - else: - logger.info( - "There seem to be no true positives or false positives at all in the submitted data." - ) - - def log(self): - print(self.strsummary) - - def get_results(self): - return self.strsummary diff --git a/pdfdet/models/Paddle/ppdet/metrics/munkres.py b/pdfdet/models/Paddle/ppdet/metrics/munkres.py deleted file mode 100644 index fbd4a92..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/munkres.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is borrow from https://github.com/xingyizhou/CenterTrack/blob/master/src/tools/eval_kitti_track/munkres.py -""" - -import sys - -__all__ = ['Munkres', 'make_cost_matrix'] - - -class Munkres: - """ - Calculate the Munkres solution to the classical assignment problem. - See the module documentation for usage. - """ - - def __init__(self): - """Create a new instance""" - self.C = None - self.row_covered = [] - self.col_covered = [] - self.n = 0 - self.Z0_r = 0 - self.Z0_c = 0 - self.marked = None - self.path = None - - def make_cost_matrix(profit_matrix, inversion_function): - """ - **DEPRECATED** - - Please use the module function ``make_cost_matrix()``. - """ - import munkres - return munkres.make_cost_matrix(profit_matrix, inversion_function) - - make_cost_matrix = staticmethod(make_cost_matrix) - - def pad_matrix(self, matrix, pad_value=0): - """ - Pad a possibly non-square matrix to make it square. - - :Parameters: - matrix : list of lists - matrix to pad - - pad_value : int - value to use to pad the matrix - - :rtype: list of lists - :return: a new, possibly padded, matrix - """ - max_columns = 0 - total_rows = len(matrix) - - for row in matrix: - max_columns = max(max_columns, len(row)) - - total_rows = max(max_columns, total_rows) - - new_matrix = [] - for row in matrix: - row_len = len(row) - new_row = row[:] - if total_rows > row_len: - # Row too short. Pad it. - new_row += [0] * (total_rows - row_len) - new_matrix += [new_row] - - while len(new_matrix) < total_rows: - new_matrix += [[0] * total_rows] - - return new_matrix - - def compute(self, cost_matrix): - """ - Compute the indexes for the lowest-cost pairings between rows and - columns in the database. Returns a list of (row, column) tuples - that can be used to traverse the matrix. - - :Parameters: - cost_matrix : list of lists - The cost matrix. If this cost matrix is not square, it - will be padded with zeros, via a call to ``pad_matrix()``. - (This method does *not* modify the caller's matrix. It - operates on a copy of the matrix.) - - **WARNING**: This code handles square and rectangular - matrices. It does *not* handle irregular matrices. 
- - :rtype: list - :return: A list of ``(row, column)`` tuples that describe the lowest - cost path through the matrix - - """ - self.C = self.pad_matrix(cost_matrix) - self.n = len(self.C) - self.original_length = len(cost_matrix) - self.original_width = len(cost_matrix[0]) - self.row_covered = [False for i in range(self.n)] - self.col_covered = [False for i in range(self.n)] - self.Z0_r = 0 - self.Z0_c = 0 - self.path = self.__make_matrix(self.n * 2, 0) - self.marked = self.__make_matrix(self.n, 0) - - done = False - step = 1 - - steps = { - 1: self.__step1, - 2: self.__step2, - 3: self.__step3, - 4: self.__step4, - 5: self.__step5, - 6: self.__step6 - } - - while not done: - try: - func = steps[step] - step = func() - except KeyError: - done = True - - # Look for the starred columns - results = [] - for i in range(self.original_length): - for j in range(self.original_width): - if self.marked[i][j] == 1: - results += [(i, j)] - - return results - - def __copy_matrix(self, matrix): - """Return an exact copy of the supplied matrix""" - return copy.deepcopy(matrix) - - def __make_matrix(self, n, val): - """Create an *n*x*n* matrix, populating it with the specific value.""" - matrix = [] - for i in range(n): - matrix += [[val for j in range(n)]] - return matrix - - def __step1(self): - """ - For each row of the matrix, find the smallest element and - subtract it from every element in its row. Go to Step 2. - """ - C = self.C - n = self.n - for i in range(n): - minval = min(self.C[i]) - # Find the minimum value for this row and subtract that minimum - # from every element in the row. - for j in range(n): - self.C[i][j] -= minval - - return 2 - - def __step2(self): - """ - Find a zero (Z) in the resulting matrix. If there is no starred - zero in its row or column, star Z. Repeat for each element in the - matrix. Go to Step 3. - """ - n = self.n - for i in range(n): - for j in range(n): - if (self.C[i][j] == 0) and \ - (not self.col_covered[j]) and \ - (not self.row_covered[i]): - self.marked[i][j] = 1 - self.col_covered[j] = True - self.row_covered[i] = True - - self.__clear_covers() - return 3 - - def __step3(self): - """ - Cover each column containing a starred zero. If K columns are - covered, the starred zeros describe a complete set of unique - assignments. In this case, Go to DONE, otherwise, Go to Step 4. - """ - n = self.n - count = 0 - for i in range(n): - for j in range(n): - if self.marked[i][j] == 1: - self.col_covered[j] = True - count += 1 - - if count >= n: - step = 7 # done - else: - step = 4 - - return step - - def __step4(self): - """ - Find a noncovered zero and prime it. If there is no starred zero - in the row containing this primed zero, Go to Step 5. Otherwise, - cover this row and uncover the column containing the starred - zero. Continue in this manner until there are no uncovered zeros - left. Save the smallest uncovered value and Go to Step 6. - """ - step = 0 - done = False - row = -1 - col = -1 - star_col = -1 - while not done: - (row, col) = self.__find_a_zero() - if row < 0: - done = True - step = 6 - else: - self.marked[row][col] = 2 - star_col = self.__find_star_in_row(row) - if star_col >= 0: - col = star_col - self.row_covered[row] = True - self.col_covered[col] = False - else: - done = True - self.Z0_r = row - self.Z0_c = col - step = 5 - - return step - - def __step5(self): - """ - Construct a series of alternating primed and starred zeros as - follows. Let Z0 represent the uncovered primed zero found in Step 4. 
- Let Z1 denote the starred zero in the column of Z0 (if any). - Let Z2 denote the primed zero in the row of Z1 (there will always - be one). Continue until the series terminates at a primed zero - that has no starred zero in its column. Unstar each starred zero - of the series, star each primed zero of the series, erase all - primes and uncover every line in the matrix. Return to Step 3 - """ - count = 0 - path = self.path - path[count][0] = self.Z0_r - path[count][1] = self.Z0_c - done = False - while not done: - row = self.__find_star_in_col(path[count][1]) - if row >= 0: - count += 1 - path[count][0] = row - path[count][1] = path[count - 1][1] - else: - done = True - - if not done: - col = self.__find_prime_in_row(path[count][0]) - count += 1 - path[count][0] = path[count - 1][0] - path[count][1] = col - - self.__convert_path(path, count) - self.__clear_covers() - self.__erase_primes() - return 3 - - def __step6(self): - """ - Add the value found in Step 4 to every element of each covered - row, and subtract it from every element of each uncovered column. - Return to Step 4 without altering any stars, primes, or covered - lines. - """ - minval = self.__find_smallest() - for i in range(self.n): - for j in range(self.n): - if self.row_covered[i]: - self.C[i][j] += minval - if not self.col_covered[j]: - self.C[i][j] -= minval - return 4 - - def __find_smallest(self): - """Find the smallest uncovered value in the matrix.""" - minval = 2e9 # sys.maxint - for i in range(self.n): - for j in range(self.n): - if (not self.row_covered[i]) and (not self.col_covered[j]): - if minval > self.C[i][j]: - minval = self.C[i][j] - return minval - - def __find_a_zero(self): - """Find the first uncovered element with value 0""" - row = -1 - col = -1 - i = 0 - n = self.n - done = False - - while not done: - j = 0 - while True: - if (self.C[i][j] == 0) and \ - (not self.row_covered[i]) and \ - (not self.col_covered[j]): - row = i - col = j - done = True - j += 1 - if j >= n: - break - i += 1 - if i >= n: - done = True - - return (row, col) - - def __find_star_in_row(self, row): - """ - Find the first starred element in the specified row. Returns - the column index, or -1 if no starred element was found. - """ - col = -1 - for j in range(self.n): - if self.marked[row][j] == 1: - col = j - break - - return col - - def __find_star_in_col(self, col): - """ - Find the first starred element in the specified row. Returns - the row index, or -1 if no starred element was found. - """ - row = -1 - for i in range(self.n): - if self.marked[i][col] == 1: - row = i - break - - return row - - def __find_prime_in_row(self, row): - """ - Find the first prime element in the specified row. Returns - the column index, or -1 if no starred element was found. 
- """ - col = -1 - for j in range(self.n): - if self.marked[row][j] == 2: - col = j - break - - return col - - def __convert_path(self, path, count): - for i in range(count + 1): - if self.marked[path[i][0]][path[i][1]] == 1: - self.marked[path[i][0]][path[i][1]] = 0 - else: - self.marked[path[i][0]][path[i][1]] = 1 - - def __clear_covers(self): - """Clear all covered matrix cells""" - for i in range(self.n): - self.row_covered[i] = False - self.col_covered[i] = False - - def __erase_primes(self): - """Erase all prime markings""" - for i in range(self.n): - for j in range(self.n): - if self.marked[i][j] == 2: - self.marked[i][j] = 0 - - -def make_cost_matrix(profit_matrix, inversion_function): - """ - Create a cost matrix from a profit matrix by calling - 'inversion_function' to invert each value. The inversion - function must take one numeric argument (of any type) and return - another numeric argument which is presumed to be the cost inverse - of the original profit. - - This is a static method. Call it like this: - - .. python:: - - cost_matrix = Munkres.make_cost_matrix(matrix, inversion_func) - - For example: - - .. python:: - - cost_matrix = Munkres.make_cost_matrix(matrix, lambda x : sys.maxint - x) - - :Parameters: - profit_matrix : list of lists - The matrix to convert from a profit to a cost matrix - - inversion_function : function - The function to use to invert each entry in the profit matrix - - :rtype: list of lists - :return: The converted matrix - """ - cost_matrix = [] - for row in profit_matrix: - cost_matrix.append([inversion_function(value) for value in row]) - return cost_matrix diff --git a/pdfdet/models/Paddle/ppdet/metrics/pose3d_metrics.py b/pdfdet/models/Paddle/ppdet/metrics/pose3d_metrics.py deleted file mode 100644 index ea21de9..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/pose3d_metrics.py +++ /dev/null @@ -1,200 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
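Before the deleted pose3d metrics continue below: the Munkres solver removed above is easiest to review with a usage sketch in mind. A minimal example with hypothetical cost values (the tuple order of the result may vary between equally optimal assignments):

    # Hungarian assignment with the deleted Munkres class: rows could be
    # tracks, columns detections, entries association costs.
    m = Munkres()
    cost = [[4, 1, 3],
            [2, 0, 5],
            [3, 2, 2]]
    indexes = m.compute(cost)                      # e.g. [(0, 1), (1, 0), (2, 2)]
    total = sum(cost[r][c] for r, c in indexes)    # minimum total cost: 5

    # make_cost_matrix() converts a profit matrix into a cost matrix:
    cost2 = make_cost_matrix([[5, 9], [10, 3]], lambda p: 10 - p)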
- -import paddle -from paddle.distributed import ParallelEnv -import os -import json -from collections import defaultdict, OrderedDict -import numpy as np -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['Pose3DEval'] - - -class AverageMeter(object): - def __init__(self): - self.reset() - - def reset(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -def mean_per_joint_position_error(pred, gt, has_3d_joints): - """ - Compute mPJPE - """ - gt = gt[has_3d_joints == 1] - gt = gt[:, :, :3] - pred = pred[has_3d_joints == 1] - - with paddle.no_grad(): - gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2 - gt = gt - gt_pelvis[:, None, :] - pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2 - pred = pred - pred_pelvis[:, None, :] - error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean(axis=-1).numpy() - return error - - -def compute_similarity_transform(S1, S2): - """Computes a similarity transform (sR, t) that takes - a set of 3D points S1 (3 x N) closest to a set of 3D points S2, - where R is an 3x3 rotation matrix, t 3x1 translation, s scale. - i.e. solves the orthogonal Procrutes problem. - """ - transposed = False - if S1.shape[0] != 3 and S1.shape[0] != 2: - S1 = S1.T - S2 = S2.T - transposed = True - assert (S2.shape[1] == S1.shape[1]) - - # 1. Remove mean. - mu1 = S1.mean(axis=1, keepdims=True) - mu2 = S2.mean(axis=1, keepdims=True) - X1 = S1 - mu1 - X2 = S2 - mu2 - - # 2. Compute variance of X1 used for scale. - var1 = np.sum(X1**2) - - # 3. The outer product of X1 and X2. - K = X1.dot(X2.T) - - # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are - # singular vectors of K. - U, s, Vh = np.linalg.svd(K) - V = Vh.T - # Construct Z that fixes the orientation of R to get det(R)=1. - Z = np.eye(U.shape[0]) - Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T))) - # Construct R. - R = V.dot(Z.dot(U.T)) - - # 5. Recover scale. - scale = np.trace(R.dot(K)) / var1 - - # 6. Recover translation. - t = mu2 - scale * (R.dot(mu1)) - - # 7. 
Error: - S1_hat = scale * R.dot(S1) + t - - if transposed: - S1_hat = S1_hat.T - - return S1_hat - - -def compute_similarity_transform_batch(S1, S2): - """Batched version of compute_similarity_transform.""" - S1_hat = np.zeros_like(S1) - for i in range(S1.shape[0]): - S1_hat[i] = compute_similarity_transform(S1[i], S2[i]) - return S1_hat - - -def reconstruction_error(S1, S2, reduction='mean'): - """Do Procrustes alignment and compute reconstruction error.""" - S1_hat = compute_similarity_transform_batch(S1, S2) - re = np.sqrt(((S1_hat - S2)**2).sum(axis=-1)).mean(axis=-1) - if reduction == 'mean': - re = re.mean() - elif reduction == 'sum': - re = re.sum() - return re - - -def all_gather(data): - if paddle.distributed.get_world_size() == 1: - return data - vlist = [] - paddle.distributed.all_gather(vlist, data) - data = paddle.concat(vlist, 0) - return data - - -class Pose3DEval(object): - def __init__(self, output_eval, save_prediction_only=False): - super(Pose3DEval, self).__init__() - self.output_eval = output_eval - self.res_file = os.path.join(output_eval, "pose3d_results.json") - self.save_prediction_only = save_prediction_only - self.reset() - - def reset(self): - self.PAmPJPE = AverageMeter() - self.mPJPE = AverageMeter() - self.eval_results = {} - - def get_human36m_joints(self, input): - J24_TO_J14 = paddle.to_tensor( - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]) - J24_TO_J17 = paddle.to_tensor( - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19]) - return paddle.index_select(input, J24_TO_J14, axis=1) - - def update(self, inputs, outputs): - gt_3d_joints = all_gather(inputs['joints_3d'].cuda(ParallelEnv() - .local_rank)) - has_3d_joints = all_gather(inputs['has_3d_joints'].cuda(ParallelEnv() - .local_rank)) - pred_3d_joints = all_gather(outputs['pose3d']) - if gt_3d_joints.shape[1] == 24: - gt_3d_joints = self.get_human36m_joints(gt_3d_joints) - if pred_3d_joints.shape[1] == 24: - pred_3d_joints = self.get_human36m_joints(pred_3d_joints) - mPJPE_val = mean_per_joint_position_error(pred_3d_joints, gt_3d_joints, - has_3d_joints).mean() - PAmPJPE_val = reconstruction_error( - pred_3d_joints.numpy(), - gt_3d_joints[:, :, :3].numpy(), - reduction=None).mean() - count = int(np.sum(has_3d_joints.numpy())) - self.PAmPJPE.update(PAmPJPE_val * 1000., count) - self.mPJPE.update(mPJPE_val * 1000., count) - - def accumulate(self): - if self.save_prediction_only: - logger.info(f'The pose3d result is saved to {self.res_file} ' - 'and do not evaluate the model.') - return - self.eval_results['pose3d'] = [-self.mPJPE.avg, -self.PAmPJPE.avg] - - def log(self): - if self.save_prediction_only: - return - stats_names = ['mPJPE', 'PAmPJPE'] - num_values = len(stats_names) - print(' '.join(['| {}'.format(name) for name in stats_names]) + ' |') - print('|---' * (num_values + 1) + '|') - - print(' '.join([ - '| {:.3f}'.format(abs(value)) - for value in self.eval_results['pose3d'] - ]) + ' |') - - def get_results(self): - return self.eval_results diff --git a/pdfdet/models/Paddle/ppdet/metrics/widerface_utils.py b/pdfdet/models/Paddle/ppdet/metrics/widerface_utils.py deleted file mode 100644 index 2f64bf6..0000000 --- a/pdfdet/models/Paddle/ppdet/metrics/widerface_utils.py +++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import cv2 -import numpy as np -from collections import OrderedDict - -import paddle - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['face_eval_run', 'lmk2out'] - - -def face_eval_run(model, - image_dir, - gt_file, - pred_dir='output/pred', - eval_mode='widerface', - multi_scale=False): - # load ground truth files - with open(gt_file, 'r') as f: - gt_lines = f.readlines() - imid2path = [] - pos_gt = 0 - while pos_gt < len(gt_lines): - name_gt = gt_lines[pos_gt].strip('\n\t').split()[0] - imid2path.append(name_gt) - pos_gt += 1 - n_gt = int(gt_lines[pos_gt].strip('\n\t').split()[0]) - pos_gt += 1 + n_gt - logger.info('The ground truth file load {} images'.format(len(imid2path))) - - dets_dist = OrderedDict() - for iter_id, im_path in enumerate(imid2path): - image_path = os.path.join(image_dir, im_path) - if eval_mode == 'fddb': - image_path += '.jpg' - assert os.path.exists(image_path) - image = cv2.imread(image_path) - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - if multi_scale: - shrink, max_shrink = get_shrink(image.shape[0], image.shape[1]) - det0 = detect_face(model, image, shrink) - det1 = flip_test(model, image, shrink) - [det2, det3] = multi_scale_test(model, image, max_shrink) - det4 = multi_scale_test_pyramid(model, image, max_shrink) - det = np.row_stack((det0, det1, det2, det3, det4)) - dets = bbox_vote(det) - else: - dets = detect_face(model, image, 1) - if eval_mode == 'widerface': - save_widerface_bboxes(image_path, dets, pred_dir) - else: - dets_dist[im_path] = dets - if iter_id % 100 == 0: - logger.info('Test iter {}'.format(iter_id)) - if eval_mode == 'fddb': - save_fddb_bboxes(dets_dist, pred_dir) - logger.info("Finish evaluation.") - - -def detect_face(model, image, shrink): - image_shape = [image.shape[0], image.shape[1]] - if shrink != 1: - h, w = int(image_shape[0] * shrink), int(image_shape[1] * shrink) - image = cv2.resize(image, (w, h)) - image_shape = [h, w] - - img = face_img_process(image) - image_shape = np.asarray([image_shape]) - scale_factor = np.asarray([[shrink, shrink]]) - data = { - "image": paddle.to_tensor( - img, dtype='float32'), - "im_shape": paddle.to_tensor( - image_shape, dtype='float32'), - "scale_factor": paddle.to_tensor( - scale_factor, dtype='float32') - } - model.eval() - detection = model(data) - detection = detection['bbox'].numpy() - # layout: xmin, ymin, xmax. 
ymax, score - if np.prod(detection.shape) == 1: - logger.info("No face detected") - return np.array([[0, 0, 0, 0, 0]]) - det_conf = detection[:, 1] - det_xmin = detection[:, 2] - det_ymin = detection[:, 3] - det_xmax = detection[:, 4] - det_ymax = detection[:, 5] - - det = np.column_stack((det_xmin, det_ymin, det_xmax, det_ymax, det_conf)) - return det - - -def flip_test(model, image, shrink): - img = cv2.flip(image, 1) - det_f = detect_face(model, img, shrink) - det_t = np.zeros(det_f.shape) - img_width = image.shape[1] - det_t[:, 0] = img_width - det_f[:, 2] - det_t[:, 1] = det_f[:, 1] - det_t[:, 2] = img_width - det_f[:, 0] - det_t[:, 3] = det_f[:, 3] - det_t[:, 4] = det_f[:, 4] - return det_t - - -def multi_scale_test(model, image, max_shrink): - # Shrink detecting is only used to detect big faces - st = 0.5 if max_shrink >= 0.75 else 0.5 * max_shrink - det_s = detect_face(model, image, st) - index = np.where( - np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) - > 30)[0] - det_s = det_s[index, :] - # Enlarge one times - bt = min(2, max_shrink) if max_shrink > 1 else (st + max_shrink) / 2 - det_b = detect_face(model, image, bt) - - # Enlarge small image x times for small faces - if max_shrink > 2: - bt *= 2 - while bt < max_shrink: - det_b = np.row_stack((det_b, detect_face(model, image, bt))) - bt *= 2 - det_b = np.row_stack((det_b, detect_face(model, image, max_shrink))) - - # Enlarged images are only used to detect small faces. - if bt > 1: - index = np.where( - np.minimum(det_b[:, 2] - det_b[:, 0] + 1, - det_b[:, 3] - det_b[:, 1] + 1) < 100)[0] - det_b = det_b[index, :] - # Shrinked images are only used to detect big faces. - else: - index = np.where( - np.maximum(det_b[:, 2] - det_b[:, 0] + 1, - det_b[:, 3] - det_b[:, 1] + 1) > 30)[0] - det_b = det_b[index, :] - return det_s, det_b - - -def multi_scale_test_pyramid(model, image, max_shrink): - # Use image pyramids to detect faces - det_b = detect_face(model, image, 0.25) - index = np.where( - np.maximum(det_b[:, 2] - det_b[:, 0] + 1, det_b[:, 3] - det_b[:, 1] + 1) - > 30)[0] - det_b = det_b[index, :] - - st = [0.75, 1.25, 1.5, 1.75] - for i in range(len(st)): - if st[i] <= max_shrink: - det_temp = detect_face(model, image, st[i]) - # Enlarged images are only used to detect small faces. - if st[i] > 1: - index = np.where( - np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, - det_temp[:, 3] - det_temp[:, 1] + 1) < 100)[0] - det_temp = det_temp[index, :] - # Shrinked images are only used to detect big faces. - else: - index = np.where( - np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, - det_temp[:, 3] - det_temp[:, 1] + 1) > 30)[0] - det_temp = det_temp[index, :] - det_b = np.row_stack((det_b, det_temp)) - return det_b - - -def to_chw(image): - """ - Transpose image from HWC to CHW. - Args: - image (np.array): an image with HWC layout. - """ - # HWC to CHW - if len(image.shape) == 3: - image = np.swapaxes(image, 1, 2) - image = np.swapaxes(image, 1, 0) - return image - - -def face_img_process(image, - mean=[104., 117., 123.], - std=[127.502231, 127.502231, 127.502231]): - img = np.array(image) - img = to_chw(img) - img = img.astype('float32') - img -= np.array(mean)[:, np.newaxis, np.newaxis].astype('float32') - img /= np.array(std)[:, np.newaxis, np.newaxis].astype('float32') - img = [img] - img = np.array(img) - return img - - -def get_shrink(height, width): - """ - Args: - height (int): image height. - width (int): image width. 
-    """
-    # avoid out of memory
-    max_shrink_v1 = (0x7fffffff / 577.0 / (height * width))**0.5
-    max_shrink_v2 = ((678 * 1024 * 2.0 * 2.0) / (height * width))**0.5
-
-    def get_round(x, loc):
-        str_x = str(x)
-        if '.' in str_x:
-            str_before, str_after = str_x.split('.')
-            len_after = len(str_after)
-            if len_after >= 3:
-                str_final = str_before + '.' + str_after[0:loc]
-                return float(str_final)
-            else:
-                return x
-        return x
-
-    max_shrink = get_round(min(max_shrink_v1, max_shrink_v2), 2) - 0.3
-    if max_shrink >= 1.5 and max_shrink < 2:
-        max_shrink = max_shrink - 0.1
-    elif max_shrink >= 2 and max_shrink < 3:
-        max_shrink = max_shrink - 0.2
-    elif max_shrink >= 3 and max_shrink < 4:
-        max_shrink = max_shrink - 0.3
-    elif max_shrink >= 4 and max_shrink < 5:
-        max_shrink = max_shrink - 0.4
-    elif max_shrink >= 5:
-        max_shrink = max_shrink - 0.5
-    elif max_shrink <= 0.1:
-        max_shrink = 0.1
-
-    shrink = max_shrink if max_shrink < 1 else 1
-    return shrink, max_shrink
-
-
-def bbox_vote(det):
-    order = det[:, 4].ravel().argsort()[::-1]
-    det = det[order, :]
-    if det.shape[0] == 0:
-        dets = np.array([[10, 10, 20, 20, 0.002]])
-        det = np.empty(shape=[0, 5])
-    while det.shape[0] > 0:
-        # IOU
-        area = (det[:, 2] - det[:, 0] + 1) * (det[:, 3] - det[:, 1] + 1)
-        xx1 = np.maximum(det[0, 0], det[:, 0])
-        yy1 = np.maximum(det[0, 1], det[:, 1])
-        xx2 = np.minimum(det[0, 2], det[:, 2])
-        yy2 = np.minimum(det[0, 3], det[:, 3])
-        w = np.maximum(0.0, xx2 - xx1 + 1)
-        h = np.maximum(0.0, yy2 - yy1 + 1)
-        inter = w * h
-        o = inter / (area[0] + area[:] - inter)
-
-        # nms
-        merge_index = np.where(o >= 0.3)[0]
-        det_accu = det[merge_index, :]
-        det = np.delete(det, merge_index, 0)
-        if merge_index.shape[0] <= 1:
-            if det.shape[0] == 0:
-                try:
-                    dets = np.row_stack((dets, det_accu))
-                except:
-                    dets = det_accu
-            continue
-        det_accu[:, 0:4] = det_accu[:, 0:4] * np.tile(det_accu[:, -1:], (1, 4))
-        max_score = np.max(det_accu[:, 4])
-        det_accu_sum = np.zeros((1, 5))
-        det_accu_sum[:, 0:4] = np.sum(det_accu[:, 0:4],
-                                      axis=0) / np.sum(det_accu[:, -1:])
-        det_accu_sum[:, 4] = max_score
-        try:
-            dets = np.row_stack((dets, det_accu_sum))
-        except:
-            dets = det_accu_sum
-    dets = dets[0:750, :]
-    keep_index = np.where(dets[:, 4] >= 0.01)[0]
-    dets = dets[keep_index, :]
-    return dets
-
-
-def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
-    image_name = image_path.split('/')[-1]
-    image_class = image_path.split('/')[-2]
-    odir = os.path.join(output_dir, image_class)
-    if not os.path.exists(odir):
-        os.makedirs(odir)
-
-    ofname = os.path.join(odir, '%s.txt' % (image_name[:-4]))
-    f = open(ofname, 'w')
-    f.write('{:s}\n'.format(image_class + '/' + image_name))
-    f.write('{:d}\n'.format(bboxes_scores.shape[0]))
-    for box_score in bboxes_scores:
-        xmin, ymin, xmax, ymax, score = box_score
-        f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'.format(xmin, ymin, (
-            xmax - xmin + 1), (ymax - ymin + 1), score))
-    f.close()
-    logger.info("The predicted result is saved as {}".format(ofname))
-
-
-def save_fddb_bboxes(bboxes_scores,
-                     output_dir,
-                     output_fname='pred_fddb_res.txt'):
-    if not os.path.exists(output_dir):
-        os.makedirs(output_dir)
-    predict_file = os.path.join(output_dir, output_fname)
-    f = open(predict_file, 'w')
-    for image_path, dets in bboxes_scores.items():
-        f.write('{:s}\n'.format(image_path))
-        f.write('{:d}\n'.format(dets.shape[0]))
-        for box_score in dets:
-            xmin, ymin, xmax, ymax, score = box_score
-            width, height = xmax - xmin, ymax - ymin
-            f.write('{:.1f} {:.1f} {:.1f} {:.1f} {:.3f}\n'
-                    .format(xmin, ymin,
width, height, score)) - logger.info("The predicted result is saved as {}".format(predict_file)) - return predict_file - - -def lmk2out(results, is_bbox_normalized=False): - """ - Args: - results: request a dict, should include: `landmark`, `im_id`, - if is_bbox_normalized=True, also need `im_shape`. - is_bbox_normalized: whether or not landmark is normalized. - """ - xywh_res = [] - for t in results: - bboxes = t['bbox'][0] - lengths = t['bbox'][1][0] - im_ids = np.array(t['im_id'][0]).flatten() - if bboxes.shape == (1, 1) or bboxes is None: - continue - face_index = t['face_index'][0] - prior_box = t['prior_boxes'][0] - predict_lmk = t['landmark'][0] - prior = np.reshape(prior_box, (-1, 4)) - predictlmk = np.reshape(predict_lmk, (-1, 10)) - - k = 0 - for a in range(len(lengths)): - num = lengths[a] - im_id = int(im_ids[a]) - for i in range(num): - score = bboxes[k][1] - theindex = face_index[i][0] - me_prior = prior[theindex, :] - lmk_pred = predictlmk[theindex, :] - prior_w = me_prior[2] - me_prior[0] - prior_h = me_prior[3] - me_prior[1] - prior_w_center = (me_prior[2] + me_prior[0]) / 2 - prior_h_center = (me_prior[3] + me_prior[1]) / 2 - lmk_decode = np.zeros((10)) - for j in [0, 2, 4, 6, 8]: - lmk_decode[j] = lmk_pred[j] * 0.1 * prior_w + prior_w_center - for j in [1, 3, 5, 7, 9]: - lmk_decode[j] = lmk_pred[j] * 0.1 * prior_h + prior_h_center - im_shape = t['im_shape'][0][a].tolist() - image_h, image_w = int(im_shape[0]), int(im_shape[1]) - if is_bbox_normalized: - lmk_decode = lmk_decode * np.array([ - image_w, image_h, image_w, image_h, image_w, image_h, - image_w, image_h, image_w, image_h - ]) - lmk_res = { - 'image_id': im_id, - 'landmark': lmk_decode, - 'score': score, - } - xywh_res.append(lmk_res) - k += 1 - return xywh_res diff --git a/pdfdet/models/Paddle/ppdet/model_zoo/.gitignore b/pdfdet/models/Paddle/ppdet/model_zoo/.gitignore deleted file mode 100644 index f296851..0000000 --- a/pdfdet/models/Paddle/ppdet/model_zoo/.gitignore +++ /dev/null @@ -1 +0,0 @@ -MODEL_ZOO diff --git a/pdfdet/models/Paddle/ppdet/model_zoo/__init__.py b/pdfdet/models/Paddle/ppdet/model_zoo/__init__.py deleted file mode 100644 index 6db6eb6..0000000 --- a/pdfdet/models/Paddle/ppdet/model_zoo/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import model_zoo -from .model_zoo import * - -__all__ = model_zoo.__all__ diff --git a/pdfdet/models/Paddle/ppdet/model_zoo/model_zoo.py b/pdfdet/models/Paddle/ppdet/model_zoo/model_zoo.py deleted file mode 100644 index 27581ef..0000000 --- a/pdfdet/models/Paddle/ppdet/model_zoo/model_zoo.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os.path as osp -import pkg_resources - -try: - from collections.abc import Sequence -except: - from collections import Sequence - -from ppdet.core.workspace import load_config, create -from ppdet.utils.checkpoint import load_weight -from ppdet.utils.download import get_config_path - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'list_model', 'get_config_file', 'get_weights_url', 'get_model', - 'MODEL_ZOO_FILENAME' -] - -MODEL_ZOO_FILENAME = 'MODEL_ZOO' - - -def list_model(filters=[]): - model_zoo_file = pkg_resources.resource_filename('ppdet.model_zoo', - MODEL_ZOO_FILENAME) - with open(model_zoo_file) as f: - model_names = f.read().splitlines() - - # filter model_name - def filt(name): - for f in filters: - if name.find(f) < 0: - return False - return True - - if isinstance(filters, str) or not isinstance(filters, Sequence): - filters = [filters] - model_names = [name for name in model_names if filt(name)] - if len(model_names) == 0 and len(filters) > 0: - raise ValueError("no model found, please check filters seeting, " - "filters can be set as following kinds:\n" - "\tDataset: coco, voc ...\n" - "\tArchitecture: yolo, rcnn, ssd ...\n" - "\tBackbone: resnet, vgg, darknet ...\n") - - model_str = "Available Models:\n" - for model_name in model_names: - model_str += "\t{}\n".format(model_name) - logger.info(model_str) - - -# models and configs save on bcebos under dygraph directory -def get_config_file(model_name): - return get_config_path("ppdet://configs/{}.yml".format(model_name)) - - -def get_weights_url(model_name): - return "ppdet://models/{}.pdparams".format(osp.split(model_name)[-1]) - - -def get_model(model_name, pretrained=True): - cfg_file = get_config_file(model_name) - cfg = load_config(cfg_file) - model = create(cfg.architecture) - - if pretrained: - load_weight(model, get_weights_url(model_name)) - - return model diff --git a/pdfdet/models/Paddle/ppdet/modeling/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/__init__.py deleted file mode 100644 index fc7caf4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/__init__.py +++ /dev/null @@ -1,49 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings -warnings.filterwarnings( - action='ignore', category=DeprecationWarning, module='ops') - -from . import ops -from . import backbones -from . import necks -from . import proposal_generator -from . import heads -from . import losses -from . import architectures -from . import post_process -from . import layers -from . import reid -from . 
import mot -from . import transformers -from . import assigners -from . import rbox_utils -from . import ssod - -from .ops import * -from .backbones import * -from .necks import * -from .proposal_generator import * -from .heads import * -from .losses import * -from .architectures import * -from .post_process import * -from .layers import * -from .reid import * -from .mot import * -from .transformers import * -from .assigners import * -from .rbox_utils import * -from .ssod import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/__init__.py deleted file mode 100644 index d22df32..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/__init__.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import meta_arch -from . import faster_rcnn -from . import mask_rcnn -from . import yolo -from . import ppyoloe -from . import cascade_rcnn -from . import ssd -from . import fcos -from . import solov2 -from . import ttfnet -from . import s2anet -from . import keypoint_hrhrnet -from . import keypoint_hrnet -from . import keypoint_vitpose -from . import jde -from . import deepsort -from . import fairmot -from . import centernet -from . import gfl -from . import picodet -from . import detr -from . import sparse_rcnn -from . import tood -from . import retinanet -from . import bytetrack -from . import yolox -from . import yolof -from . import pose3d_metro -from . import centertrack -from . import queryinst -from . import detr_ssod -from . import multi_stream_detector -from . import clrnet - -from .meta_arch import * -from .faster_rcnn import * -from .mask_rcnn import * -from .yolo import * -from .ppyoloe import * -from .cascade_rcnn import * -from .ssd import * -from .fcos import * -from .solov2 import * -from .ttfnet import * -from .s2anet import * -from .keypoint_hrhrnet import * -from .keypoint_hrnet import * -from .keypoint_vitpose import * -from .jde import * -from .deepsort import * -from .fairmot import * -from .centernet import * -from .blazeface import * -from .gfl import * -from .picodet import * -from .detr import * -from .sparse_rcnn import * -from .tood import * -from .retinanet import * -from .bytetrack import * -from .yolox import * -from .yolof import * -from .pose3d_metro import * -from .centertrack import * -from .queryinst import * -from .keypoint_petr import * -from .detr_ssod import * -from .multi_stream_detector import * -from .clrnet import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/blazeface.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/blazeface.py deleted file mode 100644 index 477732d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/blazeface.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import paddle -import paddle.nn.functional as F - -__all__ = ['BlazeFace'] - - -@register -class BlazeFace(BaseArch): - """ - BlazeFace: Sub-millisecond Neural Face Detection on Mobile GPUs, - see https://arxiv.org/abs/1907.05047 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - blaze_head (nn.Layer): `blazeHead` instance - post_process (object): `BBoxPostProcess` instance - """ - - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, backbone, blaze_head, neck, post_process): - super(BlazeFace, self).__init__() - self.backbone = backbone - self.neck = neck - self.blaze_head = blaze_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - # head - kwargs = {'input_shape': neck.out_shape} - blaze_head = create(cfg['blaze_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'blaze_head': blaze_head, - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs) - # neck - neck_feats = self.neck(body_feats) - # blaze Head - if self.training: - return self.blaze_head(neck_feats, self.inputs['image'], - self.inputs['gt_bbox'], - self.inputs['gt_class']) - else: - preds, anchors = self.blaze_head(neck_feats, self.inputs['image']) - bbox, bbox_num, nms_keep_idx = self.post_process( - preds, anchors, self.inputs['im_shape'], - self.inputs['scale_factor']) - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - preds_logits = preds[1] # [[1xNumBBoxNumClass]] - extra_data['scores'] = F.softmax(paddle.concat( - preds_logits, axis=1)).transpose([0, 2, 1]) - extra_data['logits'] = paddle.concat( - preds_logits, axis=1).transpose([0, 2, 1]) - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return bbox, bbox_num, extra_data - else: - return bbox, bbox_num - - def get_loss(self, ): - return {"loss": self._forward()} - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, extra_data = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - "extra_data": extra_data - } - else: - bbox_pred, bbox_num = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - } - - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/bytetrack.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/bytetrack.py deleted file mode 100644 index 1f3d0d1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/bytetrack.py +++ /dev/null 
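Most of the architecture files removed in this range wire their components the same way in `from_config`: each part advertises an `out_shape`, which is handed to the next part as `input_shape` through `create(...)`. A schematic sketch of that chaining with hypothetical stand-in classes (not the real ppdet registry):

    # out_shape -> input_shape chaining, as in the deleted from_config methods.
    class StubNeck:
        def __init__(self, input_shape):
            self.out_shape = input_shape         # a real neck transforms the shapes

    class StubHead:
        def __init__(self, input_shape):
            self.input_shape = input_shape

    backbone_out_shape = [24, 48, 96]            # stand-in for backbone.out_shape
    neck = StubNeck(input_shape=backbone_out_shape)
    head = StubHead(input_shape=neck.out_shape)  # mirrors create(cfg['head'], **kwargs)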
@@ -1,83 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['ByteTrack'] - - -@register -class ByteTrack(BaseArch): - """ - ByteTrack network, see https://arxiv.org/abs/2110.06864 - - Args: - detector (object): detector model instance - reid (object): reid model instance, default None - tracker (object): tracker instance - """ - __category__ = 'architecture' - - def __init__(self, - detector='YOLOX', - reid=None, - tracker='JDETracker'): - super(ByteTrack, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - - if cfg['reid'] != 'None': - reid = create(cfg['reid']) - else: - reid = None - - tracker = create(cfg['tracker']) - - return { - "detector": detector, - "reid": reid, - "tracker": tracker, - } - - def _forward(self): - det_outs = self.detector(self.inputs) - - if self.training: - return det_outs - else: - if self.reid is not None: - assert 'crops' in self.inputs - crops = self.inputs['crops'] - pred_embs = self.reid(crops) - else: - pred_embs = None - det_outs['embeddings'] = pred_embs - return det_outs - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/cascade_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/cascade_rcnn.py deleted file mode 100644 index c5d454f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/cascade_rcnn.py +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
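The ByteTrack architecture deleted just above keeps the tracker itself out of the forward pass: at inference, the detector's output dict simply gains an `embeddings` entry, which stays `None` unless a ReID model is configured. A standalone sketch of that flow (function and argument names here are illustrative):

    # Inference-time composition from the deleted ByteTrack._forward().
    def bytetrack_infer(detector, reid, inputs):
        det_outs = detector(inputs)          # dict with 'bbox', 'bbox_num', ...
        if reid is not None:
            det_outs['embeddings'] = reid(inputs['crops'])   # appearance features
        else:
            det_outs['embeddings'] = None
        return det_outs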
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['CascadeRCNN'] - - -@register -class CascadeRCNN(BaseArch): - """ - Cascade R-CNN network, see https://arxiv.org/abs/1712.00726 - - Args: - backbone (object): backbone instance - rpn_head (object): `RPNHead` instance - bbox_head (object): `BBoxHead` instance - bbox_post_process (object): `BBoxPostProcess` instance - neck (object): 'FPN' instance - mask_head (object): `MaskHead` instance - mask_post_process (object): `MaskPostProcess` instance - """ - __category__ = 'architecture' - __inject__ = [ - 'bbox_post_process', - 'mask_post_process', - ] - - def __init__(self, - backbone, - rpn_head, - bbox_head, - bbox_post_process, - neck=None, - mask_head=None, - mask_post_process=None): - super(CascadeRCNN, self).__init__() - self.backbone = backbone - self.rpn_head = rpn_head - self.bbox_head = bbox_head - self.bbox_post_process = bbox_post_process - self.neck = neck - self.mask_head = mask_head - self.mask_post_process = mask_post_process - self.with_mask = mask_head is not None - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - bbox_head = create(cfg['bbox_head'], **kwargs) - - out_shape = neck and out_shape or bbox_head.get_head().out_shape - kwargs = {'input_shape': out_shape} - mask_head = cfg['mask_head'] and create(cfg['mask_head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "rpn_head": rpn_head, - "bbox_head": bbox_head, - "mask_head": mask_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - - if self.training: - rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) - bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, - self.inputs) - rois, rois_num = self.bbox_head.get_assigned_rois() - bbox_targets = self.bbox_head.get_assigned_targets() - if self.with_mask: - mask_loss = self.mask_head(body_feats, rois, rois_num, - self.inputs, bbox_targets, bbox_feat) - return rpn_loss, bbox_loss, mask_loss - else: - return rpn_loss, bbox_loss, {} - else: - rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) - preds, _ = self.bbox_head(body_feats, rois, rois_num, self.inputs) - refined_rois = self.bbox_head.get_refined_rois() - - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - - bbox, bbox_num, nms_keep_idx = self.bbox_post_process( - preds, (refined_rois, rois_num), im_shape, scale_factor) - # rescale the prediction back to origin image - bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( - bbox, bbox_num, im_shape, scale_factor) - if not self.with_mask: - return bbox_pred, bbox_num, None - mask_out = self.mask_head(body_feats, bbox, bbox_num, self.inputs) - origin_shape = self.bbox_post_process.get_origin_shape() - mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, - origin_shape) - return bbox_pred, bbox_num, mask_pred - - def get_loss(self, ): - rpn_loss, bbox_loss, mask_loss = self._forward() - loss = {} - loss.update(rpn_loss) - loss.update(bbox_loss) - if self.with_mask: - 
loss.update(mask_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - bbox_pred, bbox_num, mask_pred = self._forward() - output = { - 'bbox': bbox_pred, - 'bbox_num': bbox_num, - } - if self.with_mask: - output.update({'mask': mask_pred}) - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/centernet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/centernet.py deleted file mode 100644 index 439e5f8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/centernet.py +++ /dev/null @@ -1,103 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['CenterNet'] - - -@register -class CenterNet(BaseArch): - """ - CenterNet network, see http://arxiv.org/abs/1904.07850 - - Args: - backbone (object): backbone instance - neck (object): FPN instance, default use 'CenterNetDLAFPN' - head (object): 'CenterNetHead' instance - post_process (object): 'CenterNetPostProcess' instance - for_mot (bool): whether return other features used in tracking model - - """ - __category__ = 'architecture' - __inject__ = ['post_process'] - __shared__ = ['for_mot'] - - def __init__(self, - backbone, - neck='CenterNetDLAFPN', - head='CenterNetHead', - post_process='CenterNetPostProcess', - for_mot=False): - super(CenterNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.post_process = post_process - self.for_mot = for_mot - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - head = create(cfg['head'], **kwargs) - - return {'backbone': backbone, 'neck': neck, "head": head} - - def _forward(self): - neck_feat = self.backbone(self.inputs) - if self.neck is not None: - neck_feat = self.neck(neck_feat) - head_out = self.head(neck_feat, self.inputs) - if self.for_mot: - head_out.update({'neck_feat': neck_feat}) - elif self.training: - head_out['loss'] = head_out.pop('det_loss') - return head_out - - def get_pred(self): - head_out = self._forward() - bbox, bbox_num, bbox_inds, topk_clses, topk_ys, topk_xs = self.post_process( - head_out['heatmap'], - head_out['size'], - head_out['offset'], - im_shape=self.inputs['im_shape'], - scale_factor=self.inputs['scale_factor']) - - if self.for_mot: - output = { - "bbox": bbox, - "bbox_num": bbox_num, - "bbox_inds": bbox_inds, - "topk_clses": topk_clses, - "topk_ys": topk_ys, - "topk_xs": topk_xs, - "neck_feat": head_out['neck_feat'] - } - else: - output = {"bbox": bbox, "bbox_num": bbox_num} - return output - - 
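The deleted `get_loss()` implementations (Cascade R-CNN above, CenterTrack further below) share one pattern: per-component loss dicts are merged, then summed into a single scalar for the backward pass. A minimal sketch with illustrative values:

    # Merge per-head loss dicts, then reduce to one total with paddle.add_n.
    import paddle

    rpn_loss = {'loss_rpn_cls': paddle.to_tensor([0.3])}    # illustrative values
    bbox_loss = {'loss_bbox_reg': paddle.to_tensor([0.2])}
    loss = {}
    loss.update(rpn_loss)
    loss.update(bbox_loss)
    loss['loss'] = paddle.add_n(list(loss.values()))        # used for backward()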
def get_loss(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/centertrack.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/centertrack.py deleted file mode 100644 index b9880db..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/centertrack.py +++ /dev/null @@ -1,176 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import math -import numpy as np -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -from ..keypoint_utils import affine_transform -from ppdet.data.transform.op_helper import gaussian_radius, gaussian2D, draw_umich_gaussian - -__all__ = ['CenterTrack'] - - -@register -class CenterTrack(BaseArch): - """ - CenterTrack network, see http://arxiv.org/abs/2004.01177 - - Args: - detector (object): 'CenterNet' instance - plugin_head (object): 'CenterTrackHead' instance - tracker (object): 'CenterTracker' instance - """ - __category__ = 'architecture' - __shared__ = ['mot_metric'] - - def __init__(self, - detector='CenterNet', - plugin_head='CenterTrackHead', - tracker='CenterTracker', - mot_metric=False): - super(CenterTrack, self).__init__() - self.detector = detector - self.plugin_head = plugin_head - self.tracker = tracker - self.mot_metric = mot_metric - self.pre_image = None - self.deploy = False - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape - - kwargs = {'input_shape': detector_out_shape} - plugin_head = create(cfg['plugin_head'], **kwargs) - tracker = create(cfg['tracker']) - - return { - 'detector': detector, - 'plugin_head': plugin_head, - 'tracker': tracker, - } - - def _forward(self): - if self.training: - det_outs = self.detector(self.inputs) - neck_feat = det_outs['neck_feat'] - - losses = {} - for k, v in det_outs.items(): - if 'loss' not in k: continue - losses.update({k: v}) - - plugin_outs = self.plugin_head(neck_feat, self.inputs) - for k, v in plugin_outs.items(): - if 'loss' not in k: continue - losses.update({k: v}) - - losses['loss'] = det_outs['det_loss'] + plugin_outs['plugin_loss'] - return losses - - else: - if not self.mot_metric: - # detection, support bs>=1 - det_outs = self.detector(self.inputs) - return { - 'bbox': det_outs['bbox'], - 'bbox_num': det_outs['bbox_num'] - } - - else: - # MOT, only support bs=1 - if not self.deploy: - if self.pre_image is None: - self.pre_image = self.inputs['image'] - # initializing tracker for the first frame - self.tracker.init_track([]) - self.inputs['pre_image'] = self.pre_image - self.pre_image = self.inputs[ - 'image'] # Note: update for next image - - # render input heatmap from tracker status - pre_hm = self.get_additional_inputs( - self.tracker.tracks, 
self.inputs, with_hm=True) - self.inputs['pre_hm'] = paddle.to_tensor(pre_hm) - - # model inference - det_outs = self.detector(self.inputs) - neck_feat = det_outs['neck_feat'] - result = self.plugin_head( - neck_feat, self.inputs, det_outs['bbox'], - det_outs['bbox_inds'], det_outs['topk_clses'], - det_outs['topk_ys'], det_outs['topk_xs']) - - if not self.deploy: - # convert the cropped and 4x downsampled output coordinate system - # back to the input image coordinate system - result = self.plugin_head.centertrack_post_process( - result, self.inputs, self.tracker.out_thresh) - return result - - def get_pred(self): - return self._forward() - - def get_loss(self): - return self._forward() - - def reset_tracking(self): - self.tracker.reset() - self.pre_image = None - - def get_additional_inputs(self, dets, meta, with_hm=True): - # Render input heatmap from previous trackings. - trans_input = meta['trans_input'][0].numpy() - inp_width, inp_height = int(meta['inp_width'][0]), int(meta[ - 'inp_height'][0]) - input_hm = np.zeros((1, inp_height, inp_width), dtype=np.float32) - - for det in dets: - if det['score'] < self.tracker.pre_thresh: - continue - bbox = affine_transform_bbox(det['bbox'], trans_input, inp_width, - inp_height) - h, w = bbox[3] - bbox[1], bbox[2] - bbox[0] - if (h > 0 and w > 0): - radius = gaussian_radius( - (math.ceil(h), math.ceil(w)), min_overlap=0.7) - radius = max(0, int(radius)) - ct = np.array( - [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2], - dtype=np.float32) - ct_int = ct.astype(np.int32) - if with_hm: - input_hm[0] = draw_umich_gaussian(input_hm[0], ct_int, - radius) - if with_hm: - input_hm = input_hm[np.newaxis] - return input_hm - - -def affine_transform_bbox(bbox, trans, width, height): - bbox = np.array(copy.deepcopy(bbox), dtype=np.float32) - bbox[:2] = affine_transform(bbox[:2], trans) - bbox[2:] = affine_transform(bbox[2:], trans) - bbox[[0, 2]] = np.clip(bbox[[0, 2]], 0, width - 1) - bbox[[1, 3]] = np.clip(bbox[[1, 3]], 0, height - 1) - return bbox diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/clrnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/clrnet.py deleted file mode 100644 index 8336fd8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/clrnet.py +++ /dev/null @@ -1,67 +0,0 @@ -from .meta_arch import BaseArch -from ppdet.core.workspace import register, create -from paddle import in_dynamic_mode - -__all__ = ['CLRNet'] - - -@register -class CLRNet(BaseArch): - __category__ = 'architecture' - - def __init__(self, - backbone="CLRResNet", - neck="CLRFPN", - clr_head="CLRHead", - post_process=None): - super(CLRNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.heads = clr_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - # head - kwargs = {'input_shape': neck.out_shape} - clr_head = create(cfg['clr_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'clr_head': clr_head, - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs['image']) - # neck - neck_feats = self.neck(body_feats) - # CLR Head - - if self.training: - output = self.heads(neck_feats, self.inputs) - else: - output = self.heads(neck_feats) - output = {'lanes': output} - # TODO: hard-coded fix for the as_lanes=False problem in clrnet_head.py's "get_lanes" function in static mode - if
in_dynamic_mode(): - output = self.heads.get_lanes(output['lanes']) - output = { - "lanes": output, - "img_path": self.inputs['full_img_path'], - "img_name": self.inputs['img_name'] - } - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/deepsort.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/deepsort.py deleted file mode 100644 index 164c279..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/deepsort.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from ppdet.modeling.mot.utils import Detection, get_crops, scale_coords, clip_box - -__all__ = ['DeepSORT'] - - -@register -class DeepSORT(BaseArch): - """ - DeepSORT network, see https://arxiv.org/abs/1703.07402 - - Args: - detector (object): detector model instance - reid (object): reid model instance - tracker (object): tracker instance - """ - __category__ = 'architecture' - - def __init__(self, - detector='YOLOv3', - reid='PCBPyramid', - tracker='DeepSORTTracker'): - super(DeepSORT, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - if cfg['detector'] != 'None': - detector = create(cfg['detector']) - else: - detector = None - reid = create(cfg['reid']) - tracker = create(cfg['tracker']) - - return { - "detector": detector, - "reid": reid, - "tracker": tracker, - } - - def _forward(self): - crops = self.inputs['crops'] - outs = {} - outs['embeddings'] = self.reid(crops) - return outs - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/detr.py deleted file mode 100644 index 085f63f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
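Every architecture removed in this diff wires its stages the same way: from_config builds the backbone, passes backbone.out_shape to the neck as input_shape, and passes the neck's out_shape (or the backbone's, when there is no neck) on to the head. A minimal, framework-free sketch of that convention follows; the Toy* classes are hypothetical stand-ins, not ppdet components.

    # Sketch of the backbone -> neck -> head wiring convention shared by the
    # architectures in this diff. ToyBackbone/ToyNeck/ToyHead are made up.
    class ToyBackbone:
        out_shape = [64, 128, 256]  # per-level channel counts

        def __call__(self, inputs):
            return ["C3", "C4", "C5"]  # placeholder multi-scale features

    class ToyNeck:
        def __init__(self, input_shape):
            self.out_shape = [96] * len(input_shape)

        def __call__(self, feats):
            return ["P3", "P4", "P5"]

    class ToyHead:
        def __init__(self, input_shape):
            self.input_shape = input_shape

    def from_config(cfg):
        # Each stage consumes the previous stage's out_shape as its
        # input_shape, mirroring the classmethods in this diff.
        backbone = cfg["backbone"]()
        neck = cfg["neck"](input_shape=backbone.out_shape) if cfg.get("neck") else None
        out_shape = neck.out_shape if neck else backbone.out_shape
        head = cfg["head"](input_shape=out_shape)
        return {"backbone": backbone, "neck": neck, "head": head}

    parts = from_config({"backbone": ToyBackbone, "neck": ToyNeck, "head": ToyHead})
    print(parts["head"].input_shape)  # [96, 96, 96]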
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from .meta_arch import BaseArch -from ppdet.core.workspace import register, create - -__all__ = ['DETR'] -# Deformable DETR, DINO use the same architecture as DETR - - -@register -class DETR(BaseArch): - __category__ = 'architecture' - __inject__ = ['post_process', 'post_process_semi'] - __shared__ = ['with_mask', 'exclude_post_process'] - - def __init__(self, - backbone, - transformer='DETRTransformer', - detr_head='DETRHead', - neck=None, - post_process='DETRPostProcess', - post_process_semi=None, - with_mask=False, - exclude_post_process=False): - super(DETR, self).__init__() - self.backbone = backbone - self.transformer = transformer - self.detr_head = detr_head - self.neck = neck - self.post_process = post_process - self.with_mask = with_mask - self.exclude_post_process = exclude_post_process - self.post_process_semi = post_process_semi - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # neck - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) if cfg['neck'] else None - - # transformer - if neck is not None: - kwargs = {'input_shape': neck.out_shape} - transformer = create(cfg['transformer'], **kwargs) - # head - kwargs = { - 'hidden_dim': transformer.hidden_dim, - 'nhead': transformer.nhead, - 'input_shape': backbone.out_shape - } - detr_head = create(cfg['detr_head'], **kwargs) - - return { - 'backbone': backbone, - 'transformer': transformer, - "detr_head": detr_head, - "neck": neck - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs) - - # Neck - if self.neck is not None: - body_feats = self.neck(body_feats) - - # Transformer - pad_mask = self.inputs.get('pad_mask', None) - out_transformer = self.transformer(body_feats, pad_mask, self.inputs) - - # DETR Head - if self.training: - detr_losses = self.detr_head(out_transformer, body_feats, - self.inputs) - detr_losses.update({ - 'loss': paddle.add_n( - [v for k, v in detr_losses.items() if 'log' not in k]) - }) - return detr_losses - else: - preds = self.detr_head(out_transformer, body_feats) - if self.exclude_post_process: - bbox, bbox_num, mask = preds - else: - bbox, bbox_num, mask = self.post_process( - preds, self.inputs['im_shape'], self.inputs['scale_factor'], - paddle.shape(self.inputs['image'])[2:]) - - output = {'bbox': bbox, 'bbox_num': bbox_num} - if self.with_mask: - output['mask'] = mask - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr_ssod.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/detr_ssod.py deleted file mode 100644 index 567c234..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/detr_ssod.py +++ /dev/null @@ -1,341 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from ppdet.core.workspace import register, create, merge_config -import paddle - -import numpy as np -import paddle -import paddle.nn.functional as F -from ppdet.core.workspace import register, create -from ppdet.utils.logger import setup_logger -from ppdet.modeling.ssod.utils import filter_invalid -from .multi_stream_detector import MultiSteamDetector -logger = setup_logger(__name__) - -__all__ = ['DETR_SSOD'] -__shared__ = ['num_classes'] - - -@register -class DETR_SSOD(MultiSteamDetector): - def __init__(self, - teacher, - student, - train_cfg=None, - test_cfg=None, - RTDETRTransformer=None, - num_classes=80): - super(DETR_SSOD, self).__init__( - dict( - teacher=teacher, student=student), - train_cfg=train_cfg, - test_cfg=test_cfg, ) - self.ema_start_iters = train_cfg['ema_start_iters'] - self.momentum = 0.9996 - self.cls_thr = None - self.cls_thr_ig = None - self.num_classes = num_classes - if train_cfg is not None: - self.freeze("teacher") - self.unsup_weight = self.train_cfg['unsup_weight'] - self.sup_weight = self.train_cfg['sup_weight'] - self._teacher = None - self._student = None - self._transformer = None - - @classmethod - def from_config(cls, cfg): - teacher = create(cfg['teacher']) - merge_config(cfg) - student = create(cfg['student']) - train_cfg = cfg['train_cfg'] - test_cfg = cfg['test_cfg'] - RTDETRTransformer = cfg['RTDETRTransformer'] - return { - 'teacher': teacher, - 'student': student, - 'train_cfg': train_cfg, - 'test_cfg': test_cfg, - 'RTDETRTransformer': RTDETRTransformer - } - - def forward_train(self, inputs, **kwargs): - if isinstance(inputs, dict): - iter_id = inputs['iter_id'] - elif isinstance(inputs, list): - iter_id = inputs[-1] - if iter_id == self.ema_start_iters: - self.update_ema_model(momentum=0) - elif iter_id > self.ema_start_iters: - self.update_ema_model(momentum=self.momentum) - if iter_id > self.ema_start_iters: - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs - - if data_sup_w['image'].shape != data_sup_s['image'].shape: - data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, - data_sup_s) - - if 'gt_bbox' in data_unsup_s.keys(): - del data_unsup_s['gt_bbox'] - if 'gt_class' in data_unsup_s.keys(): - del data_unsup_s['gt_class'] - if 'gt_class' in data_unsup_w.keys(): - del data_unsup_w['gt_class'] - if 'gt_bbox' in data_unsup_w.keys(): - del data_unsup_w['gt_bbox'] - for k, v in data_sup_s.items(): - if k in ['epoch_id']: - continue - elif k in ['gt_class', 'gt_bbox', 'is_crowd']: - data_sup_s[k].extend(data_sup_w[k]) - else: - data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) - - loss = {} - body_feats = self.student.backbone(data_sup_s) - if self.student.neck is not None: - body_feats = self.student.neck(body_feats) - out_transformer = self.student.transformer(body_feats, None, - data_sup_s) - sup_loss = self.student.detr_head(out_transformer, body_feats, - data_sup_s) - sup_loss.update({ - 'loss': paddle.add_n( - [v for k, v in sup_loss.items() if 'log' not in k]) - }) - sup_loss = {"sup_" + k: v for k, v in sup_loss.items()} - - loss.update(**sup_loss) - unsup_loss = self.foward_unsup_train(data_unsup_w, data_unsup_s) - unsup_loss.update({ - 'loss': paddle.add_n( - [v for k, v in unsup_loss.items() if 'log' not in k]) - }) - unsup_loss = {"unsup_" + k: v for k, v in unsup_loss.items()} - 
unsup_loss.update({ - 'loss': paddle.add_n( - [v for k, v in unsup_loss.items() if 'log' not in k]) - }) - loss.update(**unsup_loss) - loss.update({'loss': loss['sup_loss'] + loss['unsup_loss']}) - else: - if iter_id == self.ema_start_iters: - logger.info("start semi_supervised_training") - data_sup_w, data_sup_s, data_unsup_w, data_unsup_s, _ = inputs - - if data_sup_w['image'].shape != data_sup_s['image'].shape: - data_sup_w, data_sup_s = align_weak_strong_shape(data_sup_w, - data_sup_s) - for k, v in data_sup_s.items(): - if k in ['epoch_id']: - continue - elif k in ['gt_class', 'gt_bbox', 'is_crowd']: - data_sup_s[k].extend(data_sup_w[k]) - else: - data_sup_s[k] = paddle.concat([v, data_sup_w[k]]) - loss = {} - sup_loss = self.student(data_sup_s) - unsup_loss = { - "unsup_" + k: v * paddle.to_tensor(0) - for k, v in sup_loss.items() - } - sup_loss = {"sup_" + k: v for k, v in sup_loss.items()} - loss.update(**sup_loss) - unsup_loss.update({ - 'loss': paddle.add_n( - [v * 0 for k, v in sup_loss.items() if 'log' not in k]) - }) - unsup_loss = {"unsup_" + k: v * 0 for k, v in unsup_loss.items()} - loss.update(**unsup_loss) - loss.update({'loss': loss['sup_loss']}) - return loss - - def foward_unsup_train(self, data_unsup_w, data_unsup_s): - - with paddle.no_grad(): - body_feats = self.teacher.backbone(data_unsup_w) - if self.teacher.neck is not None: - body_feats = self.teacher.neck(body_feats, is_teacher=True) - out_transformer = self.teacher.transformer( - body_feats, None, data_unsup_w, is_teacher=True) - preds = self.teacher.detr_head(out_transformer, body_feats) - bbox, bbox_num = self.teacher.post_process_semi(preds) - self.place = body_feats[0].place - - proposal_bbox_list = bbox[:, -4:] - proposal_bbox_list = proposal_bbox_list.split( - tuple(np.array(bbox_num)), 0) - - proposal_label_list = paddle.cast(bbox[:, :1], np.float32) - proposal_label_list = proposal_label_list.split( - tuple(np.array(bbox_num)), 0) - proposal_score_list = paddle.cast(bbox[:, 1:self.num_classes + 1], - np.float32) - proposal_score_list = proposal_score_list.split( - tuple(np.array(bbox_num)), 0) - proposal_bbox_list = [ - paddle.to_tensor( - p, place=self.place) for p in proposal_bbox_list - ] - proposal_label_list = [ - paddle.to_tensor( - p, place=self.place) for p in proposal_label_list - ] - # filter invalid box roughly - if isinstance(self.train_cfg['pseudo_label_initial_score_thr'], float): - thr = self.train_cfg['pseudo_label_initial_score_thr'] - else: - # TODO: use dynamic threshold - raise NotImplementedError( - "Dynamic Threshold is not implemented yet.") - proposal_bbox_list, proposal_label_list, proposal_score_list = list( - zip(* [ - filter_invalid( - proposal[:, :4], - proposal_label, - proposal_score, - thr=thr, - min_size=self.train_cfg['min_pseduo_box_size'], ) - for proposal, proposal_label, proposal_score in - zip(proposal_bbox_list, proposal_label_list, - proposal_score_list) - ])) - - teacher_bboxes = list(proposal_bbox_list) - teacher_labels = proposal_label_list - teacher_info = [teacher_bboxes, teacher_labels] - student_unsup = data_unsup_s - return self.compute_pseudo_label_loss(student_unsup, teacher_info, - proposal_score_list) - - def compute_pseudo_label_loss(self, student_unsup, teacher_info, - proposal_score_list): - - pseudo_bboxes = list(teacher_info[0]) - pseudo_labels = list(teacher_info[1]) - losses = dict() - for i in range(len(pseudo_bboxes)): - if pseudo_labels[i].shape[0] == 0: - pseudo_bboxes[i] = paddle.zeros([0, 4]).numpy() - pseudo_labels[i] = paddle.zeros([0,
1]).numpy() - else: - pseudo_bboxes[i] = pseudo_bboxes[i][:, :4].numpy() - pseudo_labels[i] = pseudo_labels[i].numpy() - for i in range(len(pseudo_bboxes)): - pseudo_labels[i] = paddle.to_tensor( - pseudo_labels[i], dtype=paddle.int32, place=self.place) - pseudo_bboxes[i] = paddle.to_tensor( - pseudo_bboxes[i], dtype=paddle.float32, place=self.place) - student_unsup.update({ - 'gt_bbox': pseudo_bboxes, - 'gt_class': pseudo_labels - }) - pseudo_sum = 0 - for i in range(len(pseudo_bboxes)): - pseudo_sum += pseudo_bboxes[i].sum() - if pseudo_sum == 0: #input fake data when there are no pseudo labels - pseudo_bboxes[0] = paddle.ones([1, 4]) - 0.5 - pseudo_labels[0] = paddle.ones([1, 1]).astype('int32') - student_unsup.update({ - 'gt_bbox': pseudo_bboxes, - 'gt_class': pseudo_labels - }) - body_feats = self.student.backbone(student_unsup) - if self.student.neck is not None: - body_feats = self.student.neck(body_feats) - out_transformer = self.student.transformer(body_feats, None, - student_unsup) - losses = self.student.detr_head(out_transformer, body_feats, - student_unsup) - for n, v in losses.items(): - losses[n] = v * 0 - else: - gt_bbox = [] - gt_class = [] - images = [] - proposal_score = [] - for i in range(len(pseudo_bboxes)): - if pseudo_labels[i].shape[0] == 0: - continue - else: - proposal_score.append(proposal_score_list[i].max(-1) - .unsqueeze(-1)) - gt_class.append(pseudo_labels[i]) - gt_bbox.append(pseudo_bboxes[i]) - images.append(student_unsup['image'][i]) - images = paddle.stack(images) - student_unsup.update({ - 'image': images, - 'gt_bbox': gt_bbox, - 'gt_class': gt_class - }) - body_feats = self.student.backbone(student_unsup) - if self.student.neck is not None: - body_feats = self.student.neck(body_feats) - out_transformer = self.student.transformer(body_feats, None, - student_unsup) - student_unsup.update({'gt_score': proposal_score}) - losses = self.student.detr_head(out_transformer, body_feats, - student_unsup) - return losses - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] - return paddle.stack(b, axis=-1) - - -def box_xyxy_to_cxcywh(x): - x0, y0, x1, y1 = x.unbind(-1) - b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] - return paddle.stack(b, axis=-1) - - -def get_size_with_aspect_ratio(image_size, size, max_size=None): - w, h = image_size - if max_size is not None: - min_original_size = float(min((w, h))) - max_original_size = float(max((w, h))) - if max_original_size / min_original_size * size > max_size: - size = int(round(max_size * min_original_size / max_original_size)) - - if (w <= h and w == size) or (h <= w and h == size): - return (w, h) - - if w < h: - ow = size - oh = int(size * h / w) - else: - oh = size - ow = int(size * w / h) - - return (ow, oh) - - -def align_weak_strong_shape(data_weak, data_strong): - shape_x = data_strong['image'].shape[2] - shape_y = data_strong['image'].shape[3] - - target_size = [shape_x, shape_y] - data_weak['image'] = F.interpolate( - data_weak['image'], - size=target_size, - mode='bilinear', - align_corners=False) - return data_weak, data_strong diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/fairmot.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/fairmot.py deleted file mode 100644 index 2714508..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/fairmot.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['FairMOT'] - - -@register -class FairMOT(BaseArch): - """ - FairMOT network, see http://arxiv.org/abs/2004.01888 - - Args: - detector (object): 'CenterNet' instance - reid (object): 'FairMOTEmbeddingHead' instance - tracker (object): 'JDETracker' instance - loss (object): 'FairMOTLoss' instance - - """ - - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, - detector='CenterNet', - reid='FairMOTEmbeddingHead', - tracker='JDETracker', - loss='FairMOTLoss'): - super(FairMOT, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - self.loss = loss - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - detector_out_shape = detector.neck and detector.neck.out_shape or detector.backbone.out_shape - - kwargs = {'input_shape': detector_out_shape} - reid = create(cfg['reid'], **kwargs) - loss = create(cfg['loss']) - tracker = create(cfg['tracker']) - - return { - 'detector': detector, - 'reid': reid, - 'loss': loss, - 'tracker': tracker - } - - def _forward(self): - loss = dict() - # det_outs keys: - # train: neck_feat, det_loss, heatmap_loss, size_loss, offset_loss (optional: iou_loss) - # eval/infer: neck_feat, bbox, bbox_inds - det_outs = self.detector(self.inputs) - neck_feat = det_outs['neck_feat'] - if self.training: - reid_loss = self.reid(neck_feat, self.inputs) - - det_loss = det_outs['det_loss'] - loss = self.loss(det_loss, reid_loss) - for k, v in det_outs.items(): - if 'loss' not in k: - continue - loss.update({k: v}) - loss.update({'reid_loss': reid_loss}) - return loss - else: - pred_dets, pred_embs = self.reid( - neck_feat, self.inputs, det_outs['bbox'], det_outs['bbox_inds'], - det_outs['topk_clses']) - return pred_dets, pred_embs - - def get_pred(self): - output = self._forward() - return output - - def get_loss(self): - loss = self._forward() - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/faster_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/faster_rcnn.py deleted file mode 100644 index 93fd0f9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/faster_rcnn.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import numpy as np - -__all__ = ['FasterRCNN'] - - -@register -class FasterRCNN(BaseArch): - """ - Faster R-CNN network, see https://arxiv.org/abs/1506.01497 - - Args: - backbone (object): backbone instance - rpn_head (object): `RPNHead` instance - bbox_head (object): `BBoxHead` instance - bbox_post_process (object): `BBoxPostProcess` instance - neck (object): 'FPN' instance - """ - __category__ = 'architecture' - __inject__ = ['bbox_post_process'] - - def __init__(self, - backbone, - rpn_head, - bbox_head, - bbox_post_process, - neck=None): - super(FasterRCNN, self).__init__() - self.backbone = backbone - self.neck = neck - self.rpn_head = rpn_head - self.bbox_head = bbox_head - self.bbox_post_process = bbox_post_process - - def init_cot_head(self, relationship): - self.bbox_head.init_cot_head(relationship) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - bbox_head = create(cfg['bbox_head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "rpn_head": rpn_head, - "bbox_head": bbox_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - if self.training: - rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) - bbox_loss, _ = self.bbox_head(body_feats, rois, rois_num, - self.inputs) - return rpn_loss, bbox_loss - else: - rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) - preds, _ = self.bbox_head(body_feats, rois, rois_num, None) - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - bbox, bbox_num, nms_keep_idx = self.bbox_post_process( - preds, (rois, rois_num), im_shape, scale_factor) - - # rescale the prediction back to origin image - bboxes, bbox_pred, bbox_num = self.bbox_post_process.get_pred( - bbox, bbox_num, im_shape, scale_factor) - - if self.use_extra_data: - extra_data = { - } # record the bbox output before nms, such as scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = preds[1] # predict scores (probability) - # TODO: get logits output - extra_data[ - 'nms_keep_idx'] = nms_keep_idx # bbox index before nms - return bbox_pred, bbox_num, extra_data - else: - return bbox_pred, bbox_num - - def get_loss(self, ): - rpn_loss, bbox_loss = self._forward() - loss = {} - loss.update(rpn_loss) - loss.update(bbox_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, extra_data = self._forward() - output = { - 'bbox': bbox_pred, - 'bbox_num': bbox_num, - 'extra_data': extra_data - } - else: - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - - def target_bbox_forward(self, data): - body_feats = self.backbone(data) - if self.neck is not None: - body_feats
= self.neck(body_feats) - rois = [roi for roi in data['gt_bbox']] - rois_num = paddle.concat([paddle.shape(roi)[0:1] for roi in rois]) - - preds, _ = self.bbox_head(body_feats, rois, rois_num, None, cot=True) - return preds - - def relationship_learning(self, loader, num_classes_novel): - print('computing relationship') - train_labels_list = [] - label_list = [] - - for step_id, data in enumerate(loader): - _, bbox_prob = self.target_bbox_forward(data) - batch_size = data['im_id'].shape[0] - for i in range(batch_size): - num_bbox = data['gt_class'][i].shape[0] - train_labels = data['gt_class'][i] - train_labels_list.append(train_labels.numpy().squeeze(1)) - base_labels = bbox_prob.detach().numpy()[:, :-1] - label_list.append(base_labels) - - labels = np.concatenate(train_labels_list, 0) - probabilities = np.concatenate(label_list, 0) - N_t = np.max(labels) + 1 - conditional = [] - for i in range(N_t): - this_class = probabilities[labels == i] - average = np.mean(this_class, axis=0, keepdims=True) - conditional.append(average) - return np.concatenate(conditional) diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/fcos.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/fcos.py deleted file mode 100644 index 8c338ca..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/fcos.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
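The relationship_learning method above boils down to a per-class average: for each novel-class label it averages the base-class probability rows of the boxes carrying that label, then stacks the results. A toy numpy sketch of just that reduction, with made-up shapes and data rather than the ppdet tensors:

    import numpy as np

    labels = np.array([0, 1, 0, 2, 1])    # novel-class label per box
    probabilities = np.random.rand(5, 4)  # base-class probabilities per box
    probabilities /= probabilities.sum(axis=1, keepdims=True)

    # One averaged base-class distribution per novel class, then stacked.
    conditional = [
        probabilities[labels == i].mean(axis=0, keepdims=True)
        for i in range(labels.max() + 1)
    ]
    relationship = np.concatenate(conditional)
    print(relationship.shape)  # (3, 4): [num_novel_classes, num_base_classes]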
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['FCOS', 'ARSL_FCOS'] - - -@register -class FCOS(BaseArch): - """ - FCOS network, see https://arxiv.org/abs/1904.01355 - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - fcos_head (object): 'FCOSHead' instance - ssod_loss (object): 'SSODFCOSLoss' instance, only used for semi-det (SSOD) by DenseTeacher - """ - - __category__ = 'architecture' - __inject__ = ['ssod_loss'] - - def __init__(self, - backbone='ResNet', - neck='FPN', - fcos_head='FCOSHead', - ssod_loss='SSODFCOSLoss'): - super(FCOS, self).__init__() - self.backbone = backbone - self.neck = neck - self.fcos_head = fcos_head - - # for ssod, semi-det - self.is_teacher = False - self.ssod_loss = ssod_loss - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - fcos_head = create(cfg['fcos_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "fcos_head": fcos_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - - self.is_teacher = self.inputs.get('is_teacher', False) - if self.training or self.is_teacher: - losses = self.fcos_head(fpn_feats, self.inputs) - return losses - else: - fcos_head_outs = self.fcos_head(fpn_feats) - bbox_pred, bbox_num = self.fcos_head.post_process( - fcos_head_outs, self.inputs['scale_factor']) - return {'bbox': bbox_pred, 'bbox_num': bbox_num} - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - - def get_loss_keys(self): - return ['loss_cls', 'loss_box', 'loss_quality'] - - def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): - ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, - train_cfg) - return ssod_losses - - -@register -class ARSL_FCOS(BaseArch): - """ - FCOS ARSL network, see https://arxiv.org/abs/ - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - fcos_head (object): 'FCOSHead_ARSL' instance - fcos_cr_loss (object): 'FCOSLossCR' instance, only used for semi-det (SSOD) by ARSL - """ - - __category__ = 'architecture' - __inject__ = ['fcos_cr_loss'] - - def __init__(self, - backbone, - neck, - fcos_head='FCOSHead_ARSL', - fcos_cr_loss='FCOSLossCR'): - super(ARSL_FCOS, self).__init__() - self.backbone = backbone - self.neck = neck - self.fcos_head = fcos_head - self.fcos_cr_loss = fcos_cr_loss - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - fcos_head = create(cfg['fcos_head'], **kwargs) - - # consistency regularization loss - fcos_cr_loss = create(cfg['fcos_cr_loss']) - - return { - 'backbone': backbone, - 'neck': neck, - 'fcos_head': fcos_head, - 'fcos_cr_loss': fcos_cr_loss, - } - - def forward(self, inputs, branch="supervised", teacher_prediction=None): - assert branch in ['supervised', 'semi_supervised'], \ - 'In ARSL, branch must be supervised or semi_supervised.' - - if self.data_format == 'NHWC': - image = inputs['image'] - inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) -
self.inputs = inputs - - if self.training: - if branch == "supervised": - out = self.get_loss() - else: - out = self.get_pseudo_loss(teacher_prediction) - else: - # norm test - if branch == "supervised": - out = self.get_pred() - # predict pseudo labels - else: - out = self.get_pseudo_pred() - return out - - # model forward - def model_forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - fcos_head_outs = self.fcos_head(fpn_feats) - return fcos_head_outs - - # supervised loss for labeled data - def get_loss(self): - loss = {} - tag_labels, tag_bboxes, tag_centerness = [], [], [] - for i in range(len(self.fcos_head.fpn_stride)): - # labels, reg_target, centerness - k_lbl = 'labels{}'.format(i) - if k_lbl in self.inputs: - tag_labels.append(self.inputs[k_lbl]) - k_box = 'reg_target{}'.format(i) - if k_box in self.inputs: - tag_bboxes.append(self.inputs[k_box]) - k_ctn = 'centerness{}'.format(i) - if k_ctn in self.inputs: - tag_centerness.append(self.inputs[k_ctn]) - fcos_head_outs = self.model_forward() - loss_fcos = self.fcos_head.get_loss(fcos_head_outs, tag_labels, - tag_bboxes, tag_centerness) - loss.update(loss_fcos) - return loss - - # unsupervised loss for unlabeled data - def get_pseudo_loss(self, teacher_prediction): - loss = {} - fcos_head_outs = self.model_forward() - unsup_loss = self.fcos_cr_loss(fcos_head_outs, teacher_prediction) - for k in unsup_loss.keys(): - loss[k + '_pseudo'] = unsup_loss[k] - return loss - - # get detection results for test, decode and rescale the results to original size - def get_pred(self): - fcos_head_outs = self.model_forward() - scale_factor = self.inputs['scale_factor'] - bbox_pred, bbox_num = self.fcos_head.post_process(fcos_head_outs, - scale_factor) - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - - # generate pseudo labels to guide student - def get_pseudo_pred(self): - fcos_head_outs = self.model_forward() - pred_cls, pred_loc, pred_iou = fcos_head_outs[1:] # 0 is locations - for lvl, _ in enumerate(pred_loc): - pred_loc[lvl] = pred_loc[lvl] / self.fcos_head.fpn_stride[lvl] - - return [pred_cls, pred_loc, pred_iou, self.fcos_head.fpn_stride] diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/gfl.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/gfl.py deleted file mode 100644 index 91c1307..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/gfl.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
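GFL below aggregates its losses with the same dict convention used by FasterRCNN and CascadeRCNN above: each component loss is stored under its own key, and 'loss' is their sum via paddle.add_n. A minimal sketch of that convention with dummy one-element tensors:

    import paddle

    loss = {
        "loss_cls": paddle.to_tensor([0.7]),
        "loss_box": paddle.to_tensor([1.2]),
    }
    # Sum the component losses first, then record the total under 'loss'.
    loss["loss"] = paddle.add_n(list(loss.values()))
    print(float(loss["loss"]))  # 1.9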
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['GFL'] - - -@register -class GFL(BaseArch): - """ - Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - head (object): 'GFLHead' instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, neck, head='GFLHead'): - super(GFL, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats) - if not self.training: - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - bboxes, bbox_num = self.head.post_process(head_outs, im_shape, - scale_factor) - return bboxes, bbox_num - else: - return head_outs - - def get_loss(self, ): - loss = {} - - head_outs = self._forward() - loss_gfl = self.head.get_loss(head_outs, self.inputs) - loss.update(loss_gfl) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/jde.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/jde.py deleted file mode 100644 index 11b45c8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/jde.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['JDE'] - - -@register -class JDE(BaseArch): - __category__ = 'architecture' - __shared__ = ['metric'] - """ - JDE network, see https://arxiv.org/abs/1909.12605v1 - - Args: - detector (object): detector model instance - reid (object): reid model instance - tracker (object): tracker instance - metric (str): 'MOTDet' for training and detection evaluation, 'ReID' - for ReID embedding evaluation, or 'MOT' for multi object tracking - evaluation. 
- """ - - def __init__(self, - detector='YOLOv3', - reid='JDEEmbeddingHead', - tracker='JDETracker', - metric='MOT'): - super(JDE, self).__init__() - self.detector = detector - self.reid = reid - self.tracker = tracker - self.metric = metric - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - detector = create(cfg['detector']) - kwargs = {'input_shape': detector.neck.out_shape} - - reid = create(cfg['reid'], **kwargs) - - tracker = create(cfg['tracker']) - - return { - "detector": detector, - "reid": reid, - "tracker": tracker, - } - - def _forward(self): - det_outs = self.detector(self.inputs) - - if self.training: - emb_feats = det_outs['emb_feats'] - loss_confs = det_outs['det_losses']['loss_confs'] - loss_boxes = det_outs['det_losses']['loss_boxes'] - jde_losses = self.reid( - emb_feats, - self.inputs, - loss_confs=loss_confs, - loss_boxes=loss_boxes) - return jde_losses - else: - if self.metric == 'MOTDet': - det_results = { - 'bbox': det_outs['bbox'], - 'bbox_num': det_outs['bbox_num'], - } - return det_results - - elif self.metric == 'MOT': - emb_feats = det_outs['emb_feats'] - bboxes = det_outs['bbox'] - boxes_idx = det_outs['boxes_idx'] - nms_keep_idx = det_outs['nms_keep_idx'] - - pred_dets, pred_embs = self.reid( - emb_feats, - self.inputs, - bboxes=bboxes, - boxes_idx=boxes_idx, - nms_keep_idx=nms_keep_idx) - return pred_dets, pred_embs - - else: - raise ValueError("Unknown metric {} for multi object tracking.". - format(self.metric)) - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrhrnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrhrnet.py deleted file mode 100644 index 366e9e3..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrhrnet.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from scipy.optimize import linear_sum_assignment -from collections import abc, defaultdict -import numpy as np -import paddle - -from ppdet.core.workspace import register, create, serializable -from .meta_arch import BaseArch -from .. 
import layers as L -from ..keypoint_utils import transpred - -__all__ = ['HigherHRNet'] - - -@register -class HigherHRNet(BaseArch): - __category__ = 'architecture' - - def __init__(self, - backbone='HRNet', - hrhrnet_head='HrHRNetHead', - post_process='HrHRNetPostProcess', - eval_flip=True, - flip_perm=None, - max_num_people=30): - """ - HigherHRNet network, see https://arxiv.org/abs/1908.10357; - HigherHRNet+swahr, see https://arxiv.org/abs/2012.15175 - - Args: - backbone (nn.Layer): backbone instance - hrhrnet_head (nn.Layer): keypoint_head instance - bbox_post_process (object): `BBoxPostProcess` instance - """ - super(HigherHRNet, self).__init__() - self.backbone = backbone - self.hrhrnet_head = hrhrnet_head - self.post_process = post_process - self.flip = eval_flip - self.flip_perm = paddle.to_tensor(flip_perm) - self.deploy = False - self.interpolate = L.Upsample(2, mode='bilinear') - self.pool = L.MaxPool(5, 1, 2) - self.max_num_people = max_num_people - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - # head - kwargs = {'input_shape': backbone.out_shape} - hrhrnet_head = create(cfg['hrhrnet_head'], **kwargs) - post_process = create(cfg['post_process']) - - return { - 'backbone': backbone, - "hrhrnet_head": hrhrnet_head, - "post_process": post_process, - } - - def _forward(self): - if self.flip and not self.training and not self.deploy: - self.inputs['image'] = paddle.concat( - (self.inputs['image'], paddle.flip(self.inputs['image'], [3]))) - body_feats = self.backbone(self.inputs) - - if self.training: - return self.hrhrnet_head(body_feats, self.inputs) - else: - outputs = self.hrhrnet_head(body_feats) - - if self.flip and not self.deploy: - outputs = [paddle.split(o, 2) for o in outputs] - output_rflip = [ - paddle.flip(paddle.gather(o[1], self.flip_perm, 1), [3]) - for o in outputs - ] - output1 = [o[0] for o in outputs] - heatmap = (output1[0] + output_rflip[0]) / 2. 
- tagmaps = [output1[1], output_rflip[1]] - outputs = [heatmap] + tagmaps - outputs = self.get_topk(outputs) - - if self.deploy: - return outputs - - res_lst = [] - h = self.inputs['im_shape'][0, 0].numpy().item() - w = self.inputs['im_shape'][0, 1].numpy().item() - kpts, scores = self.post_process(*outputs, h, w) - res_lst.append([kpts, scores]) - return res_lst - - def get_loss(self): - return self._forward() - - def get_pred(self): - outputs = {} - res_lst = self._forward() - outputs['keypoint'] = res_lst - return outputs - - def get_topk(self, outputs): - # resize to image size - outputs = [self.interpolate(x) for x in outputs] - if len(outputs) == 3: - tagmap = paddle.concat( - (outputs[1].unsqueeze(4), outputs[2].unsqueeze(4)), axis=4) - else: - tagmap = outputs[1].unsqueeze(4) - - heatmap = outputs[0] - N, J = 1, self.hrhrnet_head.num_joints - heatmap_maxpool = self.pool(heatmap) - # topk - maxmap = heatmap * (heatmap == heatmap_maxpool) - maxmap = maxmap.reshape([N, J, -1]) - heat_k, inds_k = maxmap.topk(self.max_num_people, axis=2) - - outputs = [heatmap, tagmap, heat_k, inds_k] - return outputs - - -@register -@serializable -class HrHRNetPostProcess(object): - ''' - HrHRNet postprocess contains: - 1) get topk keypoints in the output heatmap - 2) sample the tagmap's value corresponding to each of the topk coordinates - 3) match joints of different types into people with the Hungarian algorithm - 4) adjust each coordinate by +-0.25 to decrease the error std - 5) salvage missing joints by checking positivity of heatmap - tagdiff_norm - Args: - max_num_people (int): max number of people supported in postprocess - heat_thresh (float): topk values below this threshold will be ignored - tag_thresh (float): coords whose tag values sampled from the tagmap are within this threshold belong to the same person at init - - inputs(list[heatmap]): the output list of the model, [heatmap, heatmap_maxpool, tagmap], heatmap_maxpool used to get topk - original_height, original_width (float): the original image size - ''' - - def __init__(self, max_num_people=30, heat_thresh=0.1, tag_thresh=1.): - self.max_num_people = max_num_people - self.heat_thresh = heat_thresh - self.tag_thresh = tag_thresh - - def lerp(self, j, y, x, heatmap): - H, W = heatmap.shape[-2:] - left = np.clip(x - 1, 0, W - 1) - right = np.clip(x + 1, 0, W - 1) - up = np.clip(y - 1, 0, H - 1) - down = np.clip(y + 1, 0, H - 1) - offset_y = np.where(heatmap[j, down, x] > heatmap[j, up, x], 0.25, - -0.25) - offset_x = np.where(heatmap[j, y, right] > heatmap[j, y, left], 0.25, - -0.25) - return offset_y + 0.5, offset_x + 0.5 - - def __call__(self, heatmap, tagmap, heat_k, inds_k, original_height, - original_width): - - N, J, H, W = heatmap.shape - assert N == 1, "only support batch size 1" - heatmap = heatmap[0].cpu().detach().numpy() - tagmap = tagmap[0].cpu().detach().numpy() - heats = heat_k[0].cpu().detach().numpy() - inds_np = inds_k[0].cpu().detach().numpy() - y = inds_np // W - x = inds_np % W - tags = tagmap[np.arange(J)[None, :].repeat(self.max_num_people), - y.flatten(), x.flatten()].reshape(J, -1, tagmap.shape[-1]) - coords = np.stack((y, x), axis=2) - # threshold - mask = heats > self.heat_thresh - # cluster - cluster = defaultdict(lambda: { - 'coords': np.zeros((J, 2), dtype=np.float32), - 'scores': np.zeros(J, dtype=np.float32), - 'tags': [] - }) - for jid, m in enumerate(mask): - num_valid = m.sum() - if num_valid == 0: - continue - valid_inds = np.where(m)[0] - valid_tags = tags[jid, m, :] - if len(cluster) == 0: # initialize - for i in valid_inds: - tag =
tags[jid, i] - key = tag[0] - cluster[key]['tags'].append(tag) - cluster[key]['scores'][jid] = heats[jid, i] - cluster[key]['coords'][jid] = coords[jid, i] - continue - candidates = list(cluster.keys())[:self.max_num_people] - centroids = [ - np.mean( - cluster[k]['tags'], axis=0) for k in candidates - ] - num_clusters = len(centroids) - # shape is (num_valid, num_clusters, tag_dim) - dist = valid_tags[:, None, :] - np.array(centroids)[None, ...] - l2_dist = np.linalg.norm(dist, ord=2, axis=2) - # modulate dist with heat value, see `use_detection_val` - cost = np.round(l2_dist) * 100 - heats[jid, m, None] - # pad the cost matrix, otherwise new pose are ignored - if num_valid > num_clusters: - cost = np.pad(cost, ((0, 0), (0, num_valid - num_clusters)), - 'constant', - constant_values=((0, 0), (0, 1e-10))) - rows, cols = linear_sum_assignment(cost) - for y, x in zip(rows, cols): - tag = tags[jid, y] - if y < num_valid and x < num_clusters and \ - l2_dist[y, x] < self.tag_thresh: - key = candidates[x] # merge to cluster - else: - key = tag[0] # initialize new cluster - cluster[key]['tags'].append(tag) - cluster[key]['scores'][jid] = heats[jid, y] - cluster[key]['coords'][jid] = coords[jid, y] - - # shape is [k, J, 2] and [k, J] - pose_tags = np.array([cluster[k]['tags'] for k in cluster]) - pose_coords = np.array([cluster[k]['coords'] for k in cluster]) - pose_scores = np.array([cluster[k]['scores'] for k in cluster]) - valid = pose_scores > 0 - - pose_kpts = np.zeros((pose_scores.shape[0], J, 3), dtype=np.float32) - if valid.sum() == 0: - return pose_kpts, pose_kpts - - # refine coords - valid_coords = pose_coords[valid].astype(np.int32) - y = valid_coords[..., 0].flatten() - x = valid_coords[..., 1].flatten() - _, j = np.nonzero(valid) - offsets = self.lerp(j, y, x, heatmap) - pose_coords[valid, 0] += offsets[0] - pose_coords[valid, 1] += offsets[1] - - # mean score before salvage - mean_score = pose_scores.mean(axis=1) - pose_kpts[valid, 2] = pose_scores[valid] - - # salvage missing joints - if True: - for pid, coords in enumerate(pose_coords): - tag_mean = np.array(pose_tags[pid]).mean(axis=0) - norm = np.sum((tagmap - tag_mean)**2, axis=3)**0.5 - score = heatmap - np.round(norm) # (J, H, W) - flat_score = score.reshape(J, -1) - max_inds = np.argmax(flat_score, axis=1) - max_scores = np.max(flat_score, axis=1) - salvage_joints = (pose_scores[pid] == 0) & (max_scores > 0) - if salvage_joints.sum() == 0: - continue - y = max_inds[salvage_joints] // W - x = max_inds[salvage_joints] % W - offsets = self.lerp(salvage_joints.nonzero()[0], y, x, heatmap) - y = y.astype(np.float32) + offsets[0] - x = x.astype(np.float32) + offsets[1] - pose_coords[pid][salvage_joints, 0] = y - pose_coords[pid][salvage_joints, 1] = x - pose_kpts[pid][salvage_joints, 2] = max_scores[salvage_joints] - pose_kpts[..., :2] = transpred(pose_coords[..., :2][..., ::-1], - original_height, original_width, - min(H, W)) - return pose_kpts, mean_score diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrnet.py deleted file mode 100644 index 8d50502..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_hrnet.py +++ /dev/null @@ -1,468 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import numpy as np -import math -import cv2 -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from ..keypoint_utils import transform_preds -from .. import layers as L -from paddle.nn import functional as F - -__all__ = ['TopDownHRNet', 'TinyPose3DHRNet', 'TinyPose3DHRHeatmapNet'] - - -@register -class TopDownHRNet(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, - width, - num_joints, - backbone='HRNet', - loss='KeyPointMSELoss', - post_process='HRNetPostProcess', - flip_perm=None, - flip=True, - shift_heatmap=True, - use_dark=True): - """ - HRNet network, see https://arxiv.org/abs/1902.09212 - - Args: - backbone (nn.Layer): backbone instance - post_process (object): `HRNetPostProcess` instance - flip_perm (list): The left-right joints exchange order list - use_dark(bool): Whether to use DARK in post processing - """ - super(TopDownHRNet, self).__init__() - self.backbone = backbone - self.post_process = HRNetPostProcess(use_dark) - self.loss = loss - self.flip_perm = flip_perm - self.flip = flip - self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) - self.shift_heatmap = shift_heatmap - self.deploy = False - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - return {'backbone': backbone, } - - def _forward(self): - feats = self.backbone(self.inputs) - hrnet_outputs = self.final_conv(feats[0]) - - if self.training: - return self.loss(hrnet_outputs, self.inputs) - elif self.deploy: - outshape = hrnet_outputs.shape - max_idx = paddle.argmax( - hrnet_outputs.reshape( - (outshape[0], outshape[1], outshape[2] * outshape[3])), - axis=-1) - return hrnet_outputs, max_idx - else: - if self.flip: - self.inputs['image'] = self.inputs['image'].flip([3]) - feats = self.backbone(self.inputs) - output_flipped = self.final_conv(feats[0]) - output_flipped = self.flip_back(output_flipped.numpy(), - self.flip_perm) - output_flipped = paddle.to_tensor(output_flipped.copy()) - if self.shift_heatmap: - output_flipped[:, :, :, 1:] = output_flipped.clone( - )[:, :, :, 0:-1] - hrnet_outputs = (hrnet_outputs + output_flipped) * 0.5 - imshape = (self.inputs['im_shape'].numpy() - )[:, ::-1] if 'im_shape' in self.inputs else None - center = self.inputs['center'].numpy( - ) if 'center' in self.inputs else np.round(imshape / 2.) - scale = self.inputs['scale'].numpy( - ) if 'scale' in self.inputs else imshape / 200. 
- outputs = self.post_process(hrnet_outputs, center, scale) - return outputs - - def get_loss(self): - return self._forward() - - def get_pred(self): - res_lst = self._forward() - outputs = {'keypoint': res_lst} - return outputs - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4,\ - 'output_flipped should be [batch_size, num_joints, height, width]' - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped - - -class HRNetPostProcess(object): - def __init__(self, use_dark=True): - self.use_dark = use_dark - - def get_max_preds(self, heatmaps): - '''get predictions from score maps - - Args: - heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - ''' - assert isinstance(heatmaps, - np.ndarray), 'heatmaps should be numpy.ndarray' - assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' - - batch_size = heatmaps.shape[0] - num_joints = heatmaps.shape[1] - width = heatmaps.shape[3] - heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) - idx = np.argmax(heatmaps_reshaped, 2) - maxvals = np.amax(heatmaps_reshaped, 2) - - maxvals = maxvals.reshape((batch_size, num_joints, 1)) - idx = idx.reshape((batch_size, num_joints, 1)) - - preds = np.tile(idx, (1, 1, 2)).astype(np.float32) - - preds[:, :, 0] = (preds[:, :, 0]) % width - preds[:, :, 1] = np.floor((preds[:, :, 1]) / width) - - pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) - pred_mask = pred_mask.astype(np.float32) - - preds *= pred_mask - - return preds, maxvals - - def gaussian_blur(self, heatmap, kernel): - border = (kernel - 1) // 2 - batch_size = heatmap.shape[0] - num_joints = heatmap.shape[1] - height = heatmap.shape[2] - width = heatmap.shape[3] - for i in range(batch_size): - for j in range(num_joints): - origin_max = np.max(heatmap[i, j]) - dr = np.zeros((height + 2 * border, width + 2 * border)) - dr[border:-border, border:-border] = heatmap[i, j].copy() - dr = cv2.GaussianBlur(dr, (kernel, kernel), 0) - heatmap[i, j] = dr[border:-border, border:-border].copy() - heatmap[i, j] *= origin_max / np.max(heatmap[i, j]) - return heatmap - - def dark_parse(self, hm, coord): - heatmap_height = hm.shape[0] - heatmap_width = hm.shape[1] - px = int(coord[0]) - py = int(coord[1]) - if 1 < px < heatmap_width - 2 and 1 < py < heatmap_height - 2: - dx = 0.5 * (hm[py][px + 1] - hm[py][px - 1]) - dy = 0.5 * (hm[py + 1][px] - hm[py - 1][px]) - dxx = 0.25 * (hm[py][px + 2] - 2 * hm[py][px] + hm[py][px - 2]) - dxy = 0.25 * (hm[py+1][px+1] - hm[py-1][px+1] - hm[py+1][px-1] \ - + hm[py-1][px-1]) - dyy = 0.25 * ( - hm[py + 2 * 1][px] - 2 * hm[py][px] + hm[py - 2 * 1][px]) - derivative = np.matrix([[dx], [dy]]) - hessian = np.matrix([[dxx, dxy], [dxy, dyy]]) - if dxx * dyy - dxy**2 != 0: - hessianinv = hessian.I - offset = -hessianinv * derivative - offset = np.squeeze(np.array(offset.T), axis=0) - coord += offset - return coord - - def dark_postprocess(self, hm, coords, kernelsize): - '''DARK postprocessing, Zhang et al. Distribution-Aware Coordinate - Representation for Human Pose Estimation (CVPR 2020).
- ''' - - hm = self.gaussian_blur(hm, kernelsize) - hm = np.maximum(hm, 1e-10) - hm = np.log(hm) - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - coords[n, p] = self.dark_parse(hm[n][p], coords[n][p]) - return coords - - def get_final_preds(self, heatmaps, center, scale, kernelsize=3): - """the highest heatvalue location with a quarter offset in the - direction from the highest response to the second highest response. - - Args: - heatmaps (numpy.ndarray): The predicted heatmaps - center (numpy.ndarray): The boxes center - scale (numpy.ndarray): The scale factor - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - """ - coords, maxvals = self.get_max_preds(heatmaps) - - heatmap_height = heatmaps.shape[2] - heatmap_width = heatmaps.shape[3] - - if self.use_dark: - coords = self.dark_postprocess(heatmaps, coords, kernelsize) - else: - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - hm = heatmaps[n][p] - px = int(math.floor(coords[n][p][0] + 0.5)) - py = int(math.floor(coords[n][p][1] + 0.5)) - if 1 < px < heatmap_width - 1 and 1 < py < heatmap_height - 1: - diff = np.array([ - hm[py][px + 1] - hm[py][px - 1], - hm[py + 1][px] - hm[py - 1][px] - ]) - coords[n][p] += np.sign(diff) * .25 - preds = coords.copy() - - # Transform back - for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], - [heatmap_width, heatmap_height]) - - return preds, maxvals - - def __call__(self, output, center, scale): - preds, maxvals = self.get_final_preds(output.numpy(), center, scale) - outputs = [[ - np.concatenate( - (preds, maxvals), axis=-1), np.mean( - maxvals, axis=1) - ]] - return outputs - - -class TinyPose3DPostProcess(object): - def __init__(self): - pass - - def __call__(self, output, center, scale): - """ - Args: - output (numpy.ndarray): numpy.ndarray([batch_size, num_joints, 3]), keypoints coords - scale (numpy.ndarray): The scale factor - Returns: - preds: numpy.ndarray([batch_size, num_joints, 3]), keypoints coords - """ - - preds = output.numpy().copy() - - # Transform back - for i in range(output.shape[0]): # batch_size - preds[i][:, 0] = preds[i][:, 0] * scale[i][0] - preds[i][:, 1] = preds[i][:, 1] * scale[i][1] - - return preds - - -def soft_argmax(heatmaps, joint_num): - dims = heatmaps.shape - depth_dim = (int)(dims[1] / joint_num) - heatmaps = heatmaps.reshape((-1, joint_num, depth_dim * dims[2] * dims[3])) - heatmaps = F.softmax(heatmaps, 2) - heatmaps = heatmaps.reshape((-1, joint_num, depth_dim, dims[2], dims[3])) - - accu_x = heatmaps.sum(axis=(2, 3)) - accu_y = heatmaps.sum(axis=(2, 4)) - accu_z = heatmaps.sum(axis=(3, 4)) - - accu_x = accu_x * paddle.arange(1, 33) - accu_y = accu_y * paddle.arange(1, 33) - accu_z = accu_z * paddle.arange(1, 33) - - accu_x = accu_x.sum(axis=2, keepdim=True) - 1 - accu_y = accu_y.sum(axis=2, keepdim=True) - 1 - accu_z = accu_z.sum(axis=2, keepdim=True) - 1 - - coord_out = paddle.concat( - (accu_x, accu_y, accu_z), axis=2) # [batch_size, joint_num, 3] - - return coord_out - - -@register -class TinyPose3DHRHeatmapNet(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__( - self, - width, # 40, the number of output channels of the backbone - num_joints, - backbone='HRNet', - loss='KeyPointRegressionMSELoss', - post_process=TinyPose3DPostProcess): - """ - Args: - backbone (nn.Layer): backbone instance - post_process (object): post process instance
- """ - super(TinyPose3DHRHeatmapNet, self).__init__() - - self.backbone = backbone - self.post_process = TinyPose3DPostProcess() - self.loss = loss - self.deploy = False - self.num_joints = num_joints - - self.final_conv = L.Conv2d(width, num_joints * 32, 1, 1, 0, bias=True) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - return {'backbone': backbone, } - - def _forward(self): - feats = self.backbone(self.inputs) # feats:[[batch_size, 40, 32, 24]] - - hrnet_outputs = self.final_conv(feats[0]) - res = soft_argmax(hrnet_outputs, self.num_joints) - return res - - def get_loss(self): - pose3d = self._forward() - loss = self.loss(pose3d, None, self.inputs) - outputs = {'loss': loss} - return outputs - - def get_pred(self): - res_lst = self._forward() - outputs = {'pose3d': res_lst} - return outputs - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4,\ - 'output_flipped should be [batch_size, num_joints, height, width]' - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped - - -@register -class TinyPose3DHRNet(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, - width, - num_joints, - fc_channel=768, - backbone='HRNet', - loss='KeyPointRegressionMSELoss', - post_process=TinyPose3DPostProcess): - """ - Args: - backbone (nn.Layer): backbone instance - post_process (object): post process instance - """ - super(TinyPose3DHRNet, self).__init__() - self.backbone = backbone - self.post_process = TinyPose3DPostProcess() - self.loss = loss - self.deploy = False - self.num_joints = num_joints - - self.final_conv = L.Conv2d(width, num_joints, 1, 1, 0, bias=True) - - self.flatten = paddle.nn.Flatten(start_axis=2, stop_axis=3) - self.fc1 = paddle.nn.Linear(fc_channel, 256) - self.act1 = paddle.nn.ReLU() - self.fc2 = paddle.nn.Linear(256, 64) - self.act2 = paddle.nn.ReLU() - self.fc3 = paddle.nn.Linear(64, 3) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - return {'backbone': backbone, } - - def _forward(self): - ''' - self.inputs is a dict - ''' - feats = self.backbone( - self.inputs) # feats:[[batch_size, 40, width/4, height/4]] - - hrnet_outputs = self.final_conv( - feats[0]) # hrnet_outputs: [batch_size, num_joints*32,32,32] - - flatten_res = self.flatten( - hrnet_outputs) # [batch_size,num_joints*32,32*32] - - res = self.fc1(flatten_res) - res = self.act1(res) - res = self.fc2(res) - res = self.act2(res) - res = self.fc3(res) - - if self.training: - return self.loss(res, self.inputs) - else: # export model need - return res - - def get_loss(self): - return self._forward() - - def get_pred(self): - res_lst = self._forward() - outputs = {'pose3d': res_lst} - return outputs - - def flip_back(self, output_flipped, matched_parts): - assert output_flipped.ndim == 4,\ - 'output_flipped should be [batch_size, num_joints, height, width]' - - output_flipped = output_flipped[:, :, :, ::-1] - - for pair in matched_parts: - tmp = output_flipped[:, pair[0], :, :].copy() - output_flipped[:, pair[0], :, :] = output_flipped[:, pair[1], :, :] - output_flipped[:, pair[1], :, :] = tmp - - return output_flipped diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_petr.py 
b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_petr.py deleted file mode 100644 index b587c1f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_petr.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is based on https://github.com/hikvision-research/opera/blob/main/opera/models/detectors/petr.py -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register -from .meta_arch import BaseArch -from .. import layers as L - -__all__ = ['PETR'] - - -@register -class PETR(BaseArch): - __category__ = 'architecture' - __inject__ = ['backbone', 'neck', 'bbox_head'] - - def __init__(self, - backbone='ResNet', - neck='ChannelMapper', - bbox_head='PETRHead'): - """ - PETR, see https://openaccess.thecvf.com/content/CVPR2022/papers/Shi_End-to-End_Multi-Person_Pose_Estimation_With_Transformers_CVPR_2022_paper.pdf - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck between backbone and head - bbox_head (nn.Layer): model output and loss - """ - super(PETR, self).__init__() - self.backbone = backbone - if neck is not None: - self.with_neck = True - self.neck = neck - self.bbox_head = bbox_head - self.deploy = False - - def extract_feat(self, img): - """Directly extract features from the backbone+neck.""" - x = self.backbone(img) - if self.with_neck: - x = self.neck(x) - return x - - def get_inputs(self): - img_metas = [] - gt_bboxes = [] - gt_labels = [] - gt_keypoints = [] - gt_areas = [] - pad_gt_mask = self.inputs['pad_gt_mask'].astype("bool").squeeze(-1) - for idx, im_shape in enumerate(self.inputs['im_shape']): - img_meta = { - 'img_shape': im_shape.astype("int32").tolist() + [1, ], - 'batch_input_shape': self.inputs['image'].shape[-2:], - 'image_name': self.inputs['image_file'][idx] - } - img_metas.append(img_meta) - if (not pad_gt_mask[idx].any()): - gt_keypoints.append(self.inputs['gt_joints'][idx][:1]) - gt_labels.append(self.inputs['gt_class'][idx][:1]) - gt_bboxes.append(self.inputs['gt_bbox'][idx][:1]) - gt_areas.append(self.inputs['gt_areas'][idx][:1]) - continue - - gt_keypoints.append(self.inputs['gt_joints'][idx][pad_gt_mask[idx]]) - gt_labels.append(self.inputs['gt_class'][idx][pad_gt_mask[idx]]) - gt_bboxes.append(self.inputs['gt_bbox'][idx][pad_gt_mask[idx]]) - gt_areas.append(self.inputs['gt_areas'][idx][pad_gt_mask[idx]]) - - return img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas - - def get_loss(self): - """ - Args: - img (Tensor): Input images of shape (N, C, H, W). - Typically these should be mean centered and std scaled. - img_metas (list[dict]): A list of image info dicts where each dict - has: 'img_shape', 'scale_factor', 'flip', and may also contain - 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
- For details on the values of these keys see - :class:`mmdet.datasets.pipelines.Collect`. - gt_bboxes (list[Tensor]): Each item holds the ground-truth boxes for each - image in [tl_x, tl_y, br_x, br_y] format. - gt_labels (list[Tensor]): Class indices corresponding to each box. - gt_keypoints (list[Tensor]): Each item holds the ground-truth keypoints for - each image in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, - p^{K}_y, p^{K}_v] format. - gt_areas (list[Tensor]): mask areas corresponding to each box. - gt_bboxes_ignore (None | list[Tensor]): Specify which bounding - boxes can be ignored when computing the loss. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - - img_metas, gt_bboxes, gt_labels, gt_keypoints, gt_areas = self.get_inputs( - ) - gt_bboxes_ignore = getattr(self.inputs, 'gt_bboxes_ignore', None) - - x = self.extract_feat(self.inputs) - losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, - gt_labels, gt_keypoints, gt_areas, - gt_bboxes_ignore) - loss = 0 - for k, v in losses.items(): - loss += v - losses['loss'] = loss - - return losses - - def get_pred_numpy(self): - """Used for computing network flops. - """ - - img = self.inputs['image'] - batch_size, _, height, width = img.shape - dummy_img_metas = [ - dict( - batch_input_shape=(height, width), - img_shape=(height, width, 3), - scale_factor=(1., 1., 1., 1.)) for _ in range(batch_size) - ] - x = self.extract_feat(img) - outs = self.bbox_head(x, img_metas=dummy_img_metas) - bbox_list = self.bbox_head.get_bboxes( - *outs, dummy_img_metas, rescale=True) - return bbox_list - - def get_pred(self): - """ - """ - img = self.inputs['image'] - batch_size, _, height, width = img.shape - img_metas = [ - dict( - batch_input_shape=(height, width), - img_shape=(height, width, 3), - scale_factor=self.inputs['scale_factor'][i]) - for i in range(batch_size) - ] - kptpred = self.simple_test( - self.inputs, img_metas=img_metas, rescale=True) - keypoints = kptpred[0][1][0] - bboxs = kptpred[0][0][0] - keypoints[..., 2] = bboxs[:, None, 4] - res_lst = [[keypoints, bboxs[:, 4]]] - outputs = {'keypoint': res_lst} - return outputs - - def simple_test(self, inputs, img_metas, rescale=False): - """Test function without test time augmentation. - - Args: - inputs (list[paddle.Tensor]): List of multiple images. - img_metas (list[dict]): List of image information. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[list[np.ndarray]]: BBox and keypoint results of each image - and classes. The outer list corresponds to each image. - The inner list corresponds to each class. - """ - batch_size = len(img_metas) - assert batch_size == 1, 'Currently only batch_size 1 for inference ' \ - f'mode is supported. Found batch_size {batch_size}.' - feat = self.extract_feat(inputs) - results_list = self.bbox_head.simple_test( - feat, img_metas, rescale=rescale) - - bbox_kpt_results = [ - self.bbox_kpt2result(det_bboxes, det_labels, det_kpts, - self.bbox_head.num_classes) - for det_bboxes, det_labels, det_kpts in results_list - ] - return bbox_kpt_results - - def bbox_kpt2result(self, bboxes, labels, kpts, num_classes): - """Convert detection results to a list of numpy arrays. - - Args: - bboxes (paddle.Tensor | np.ndarray): shape (n, 5). - labels (paddle.Tensor | np.ndarray): shape (n, ). - kpts (paddle.Tensor | np.ndarray): shape (n, K, 3). - num_classes (int): class number, including background class. - - Returns: - list(ndarray): bbox and keypoint results of each class.
- """ - if bboxes.shape[0] == 0: - return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)], \ - [np.zeros((0, kpts.size(1), 3), dtype=np.float32) - for i in range(num_classes)] - else: - if isinstance(bboxes, paddle.Tensor): - bboxes = bboxes.numpy() - labels = labels.numpy() - kpts = kpts.numpy() - return [bboxes[labels == i, :] for i in range(num_classes)], \ - [kpts[labels == i, :, :] for i in range(num_classes)] diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_vitpose.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_vitpose.py deleted file mode 100644 index b00226a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/keypoint_vitpose.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import numpy as np -import math -import cv2 -from ppdet.core.workspace import register, create, serializable -from .meta_arch import BaseArch -from ..keypoint_utils import transform_preds -from .. import layers as L - -__all__ = ['VitPose_TopDown', 'VitPosePostProcess'] - - -@register -class VitPose_TopDown(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__(self, backbone, head, loss, post_process, flip_test): - """ - VitPose network, see https://arxiv.org/pdf/2204.12484v2.pdf - - Args: - backbone (nn.Layer): backbone instance - post_process (object): `HRNetPostProcess` instance - - """ - super(VitPose_TopDown, self).__init__() - self.backbone = backbone - self.head = head - self.loss = loss - self.post_process = post_process - self.flip_test = flip_test - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - #head - head = create(cfg['head']) - #post_process - post_process = create(cfg['post_process']) - - return { - 'backbone': backbone, - 'head': head, - 'post_process': post_process - } - - def _forward_train(self): - - feats = self.backbone.forward_features(self.inputs['image']) - vitpost_output = self.head(feats) - return self.loss(vitpost_output, self.inputs) - - def _forward_test(self): - - feats = self.backbone.forward_features(self.inputs['image']) - output_heatmap = self.head(feats) - - if self.flip_test: - img_flipped = self.inputs['image'].flip(3) - features_flipped = self.backbone.forward_features(img_flipped) - output_flipped_heatmap = self.head.inference_model(features_flipped, - self.flip_test) - - output_heatmap = (output_heatmap + output_flipped_heatmap) * 0.5 - - imshape = (self.inputs['im_shape'].numpy() - )[:, ::-1] if 'im_shape' in self.inputs else None - center = self.inputs['center'].numpy( - ) if 'center' in self.inputs else np.round(imshape / 2.) - scale = self.inputs['scale'].numpy( - ) if 'scale' in self.inputs else imshape / 200. 
- - result = self.post_process(output_heatmap.cpu().numpy(), center, scale) - - return result - - def get_loss(self): - return self._forward_train() - - def get_pred(self): - res_lst = self._forward_test() - outputs = {'keypoint': res_lst} - return outputs - - -@register -@serializable -class VitPosePostProcess(object): - def __init__(self, use_dark=False): - self.use_dark = use_dark - - def get_max_preds(self, heatmaps): - '''get predictions from score maps - - Args: - heatmaps: numpy.ndarray([batch_size, num_joints, height, width]) - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - ''' - assert isinstance(heatmaps, - np.ndarray), 'heatmaps should be numpy.ndarray' - assert heatmaps.ndim == 4, 'batch_images should be 4-ndim' - - batch_size = heatmaps.shape[0] - num_joints = heatmaps.shape[1] - width = heatmaps.shape[3] - heatmaps_reshaped = heatmaps.reshape((batch_size, num_joints, -1)) - idx = np.argmax(heatmaps_reshaped, 2) - maxvals = np.amax(heatmaps_reshaped, 2) - - maxvals = maxvals.reshape((batch_size, num_joints, 1)) - idx = idx.reshape((batch_size, num_joints, 1)) - - preds = np.tile(idx, (1, 1, 2)).astype(np.float32) - - preds[:, :, 0] = (preds[:, :, 0]) % width - preds[:, :, 1] = np.floor((preds[:, :, 1]) // width) - - pred_mask = np.tile(np.greater(maxvals, 0.0), (1, 1, 2)) - pred_mask = pred_mask.astype(np.float32) - - preds *= pred_mask - - return preds, maxvals - - def post_datk_udp(self, coords, batch_heatmaps, kernel=3): - """DARK post-processing, implemented by UDP. Paper ref: Huang et al. The - Devil is in the Details: Delving into Unbiased Data Processing for Human - Pose Estimation (CVPR 2020). Zhang et al. Distribution-Aware Coordinate - Representation for Human Pose Estimation (CVPR 2020). - - Note: - - batch size: B - - num keypoints: K - - num persons: N - - height of heatmaps: H - - width of heatmaps: W - - B=1 for bottom_up paradigm where all persons share the same heatmap. - B=N for top_down paradigm where each person has its own heatmaps. - - Args: - coords (np.ndarray[N, K, 2]): Initial coordinates of human pose. - batch_heatmaps (np.ndarray[B, K, H, W]): batch_heatmaps - kernel (int): Gaussian kernel size (K) for modulation. - - Returns: - np.ndarray([N, K, 2]): Refined coordinates.
- """ - if not isinstance(batch_heatmaps, np.ndarray): - batch_heatmaps = batch_heatmaps.cpu().numpy() - B, K, H, W = batch_heatmaps.shape - N = coords.shape[0] - assert (B == 1 or B == N) - for heatmaps in batch_heatmaps: - for heatmap in heatmaps: - cv2.GaussianBlur(heatmap, (kernel, kernel), 0, heatmap) - np.clip(batch_heatmaps, 0.001, 50, batch_heatmaps) - np.log(batch_heatmaps, batch_heatmaps) - - batch_heatmaps_pad = np.pad(batch_heatmaps, ((0, 0), (0, 0), (1, 1), - (1, 1)), - mode='edge').flatten() - - index = coords[..., 0] + 1 + (coords[..., 1] + 1) * (W + 2) - index += (W + 2) * (H + 2) * np.arange(0, B * K).reshape(-1, K) - index = index.astype(int).reshape(-1, 1) - i_ = batch_heatmaps_pad[index] - ix1 = batch_heatmaps_pad[index + 1] - iy1 = batch_heatmaps_pad[index + W + 2] - ix1y1 = batch_heatmaps_pad[index + W + 3] - ix1_y1_ = batch_heatmaps_pad[index - W - 3] - ix1_ = batch_heatmaps_pad[index - 1] - iy1_ = batch_heatmaps_pad[index - 2 - W] - - dx = 0.5 * (ix1 - ix1_) - dy = 0.5 * (iy1 - iy1_) - derivative = np.concatenate([dx, dy], axis=1) - derivative = derivative.reshape(N, K, 2, 1) - dxx = ix1 - 2 * i_ + ix1_ - dyy = iy1 - 2 * i_ + iy1_ - dxy = 0.5 * (ix1y1 - ix1 - iy1 + i_ + i_ - ix1_ - iy1_ + ix1_y1_) - hessian = np.concatenate([dxx, dxy, dxy, dyy], axis=1) - hessian = hessian.reshape(N, K, 2, 2) - hessian = np.linalg.inv(hessian + np.finfo(np.float32).eps * np.eye(2)) - coords -= np.einsum('ijmn,ijnk->ijmk', hessian, derivative).squeeze() - return coords - - def transform_preds_udp(self, - coords, - center, - scale, - output_size, - use_udp=True): - """Get final keypoint predictions from heatmaps and apply scaling and - translation to map them back to the image. - - Note: - num_keypoints: K - - Args: - coords (np.ndarray[K, ndims]): - - * If ndims=2, corrds are predicted keypoint location. - * If ndims=4, corrds are composed of (x, y, scores, tags) - * If ndims=5, corrds are composed of (x, y, scores, tags, - flipped_tags) - - center (np.ndarray[2, ]): Center of the bounding box (x, y). - scale (np.ndarray[2, ]): Scale of the bounding box - wrt [width, height]. - output_size (np.ndarray[2, ] | list(2,)): Size of the - destination heatmaps. - use_udp (bool): Use unbiased data processing - - Returns: - np.ndarray: Predicted coordinates in the images. - """ - - assert coords.shape[1] in (2, 4, 5) - assert len(center) == 2 - assert len(scale) == 2 - assert len(output_size) == 2 - - # Recover the scale which is normalized by a factor of 200. - scale = scale * 200.0 - - if use_udp: - scale_x = scale[0] / (output_size[0] - 1.0) - scale_y = scale[1] / (output_size[1] - 1.0) - else: - scale_x = scale[0] / output_size[0] - scale_y = scale[1] / output_size[1] - - target_coords = np.ones_like(coords) - target_coords[:, 0] = coords[:, 0] * scale_x + center[0] - scale[ - 0] * 0.5 - target_coords[:, 1] = coords[:, 1] * scale_y + center[1] - scale[ - 1] * 0.5 - - return target_coords - - def get_final_preds(self, heatmaps, center, scale, kernelsize=11): - """the highest heatvalue location with a quarter offset in the - direction from the highest response to the second highest response. 
- - Args: - heatmaps (numpy.ndarray): The predicted heatmaps - center (numpy.ndarray): The boxes center - scale (numpy.ndarray): The scale factor - - Returns: - preds: numpy.ndarray([batch_size, num_joints, 2]), keypoints coords - maxvals: numpy.ndarray([batch_size, num_joints, 1]), the maximum confidence of the keypoints - """ - coords, maxvals = self.get_max_preds(heatmaps) - - N, K, H, W = heatmaps.shape - - if self.use_dark: - coords = self.post_datk_udp(coords, heatmaps, kernelsize) - preds = coords.copy() - # Transform back to the image - for i in range(N): - preds[i] = self.transform_preds_udp(preds[i], center[i], - scale[i], [W, H]) - else: - for n in range(coords.shape[0]): - for p in range(coords.shape[1]): - hm = heatmaps[n][p] - px = int(math.floor(coords[n][p][0] + 0.5)) - py = int(math.floor(coords[n][p][1] + 0.5)) - if 1 < px < W - 1 and 1 < py < H - 1: - diff = np.array([ - hm[py][px + 1] - hm[py][px - 1], - hm[py + 1][px] - hm[py - 1][px] - ]) - coords[n][p] += np.sign(diff) * .25 - preds = coords.copy() - - # Transform back - for i in range(coords.shape[0]): - preds[i] = transform_preds(coords[i], center[i], scale[i], - [W, H]) - - return preds, maxvals - - def __call__(self, output, center, scale): - preds, maxvals = self.get_final_preds(output, center, scale) - outputs = [[ - np.concatenate( - (preds, maxvals), axis=-1), np.mean( - maxvals, axis=1) - ]] - return outputs \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/mask_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/mask_rcnn.py deleted file mode 100644 index 4f6a9ce..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/mask_rcnn.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['MaskRCNN'] - - -@register -class MaskRCNN(BaseArch): - """ - Mask R-CNN network, see https://arxiv.org/abs/1703.06870 - - Args: - backbone (object): backbone instance - rpn_head (object): `RPNHead` instance - bbox_head (object): `BBoxHead` instance - mask_head (object): `MaskHead` instance - bbox_post_process (object): `BBoxPostProcess` instance - mask_post_process (object): `MaskPostProcess` instance - neck (object): 'FPN' instance - """ - - __category__ = 'architecture' - __inject__ = [ - 'bbox_post_process', - 'mask_post_process', - ] - - def __init__(self, - backbone, - rpn_head, - bbox_head, - mask_head, - bbox_post_process, - mask_post_process, - neck=None): - super(MaskRCNN, self).__init__() - self.backbone = backbone - self.neck = neck - self.rpn_head = rpn_head - self.bbox_head = bbox_head - self.mask_head = mask_head - - self.bbox_post_process = bbox_post_process - self.mask_post_process = mask_post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - bbox_head = create(cfg['bbox_head'], **kwargs) - - out_shape = neck and out_shape or bbox_head.get_head().out_shape - kwargs = {'input_shape': out_shape} - mask_head = create(cfg['mask_head'], **kwargs) - return { - 'backbone': backbone, - 'neck': neck, - "rpn_head": rpn_head, - "bbox_head": bbox_head, - "mask_head": mask_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - - if self.training: - rois, rois_num, rpn_loss = self.rpn_head(body_feats, self.inputs) - bbox_loss, bbox_feat = self.bbox_head(body_feats, rois, rois_num, - self.inputs) - rois, rois_num = self.bbox_head.get_assigned_rois() - bbox_targets = self.bbox_head.get_assigned_targets() - # Mask Head needs bbox_feat in Mask RCNN - mask_loss = self.mask_head(body_feats, rois, rois_num, self.inputs, - bbox_targets, bbox_feat) - return rpn_loss, bbox_loss, mask_loss - else: - rois, rois_num, _ = self.rpn_head(body_feats, self.inputs) - preds, feat_func = self.bbox_head(body_feats, rois, rois_num, None) - - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - - bbox, bbox_num, nms_keep_idx = self.bbox_post_process( - preds, (rois, rois_num), im_shape, scale_factor) - mask_out = self.mask_head( - body_feats, bbox, bbox_num, self.inputs, feat_func=feat_func) - - # rescale the prediction back to origin image - bbox, bbox_pred, bbox_num = self.bbox_post_process.get_pred( - bbox, bbox_num, im_shape, scale_factor) - origin_shape = self.bbox_post_process.get_origin_shape() - mask_pred = self.mask_post_process(mask_out, bbox_pred, bbox_num, - origin_shape) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = preds[1] # predict scores (probability) - # Todo: get logits output - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return 
bbox_pred, bbox_num, mask_pred, extra_data - else: - return bbox_pred, bbox_num, mask_pred - - def get_loss(self, ): - bbox_loss, mask_loss, rpn_loss = self._forward() - loss = {} - loss.update(rpn_loss) - loss.update(bbox_loss) - loss.update(mask_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, mask_pred, extra_data = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred, 'extra_data': extra_data} - else: - bbox_pred, bbox_num, mask_pred = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/meta_arch.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/meta_arch.py deleted file mode 100644 index 370b2b1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/meta_arch.py +++ /dev/null @@ -1,132 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn as nn -import typing - -from ppdet.core.workspace import register -from ppdet.modeling.post_process import nms - -__all__ = ['BaseArch'] - - -@register -class BaseArch(nn.Layer): - def __init__(self, data_format='NCHW', use_extra_data=False): - super(BaseArch, self).__init__() - self.data_format = data_format - self.inputs = {} - self.fuse_norm = False - self.use_extra_data = use_extra_data - - def load_meanstd(self, cfg_transform): - scale = 1. - mean = np.array([0.485, 0.456, 0.406], dtype=np.float32) - std = np.array([0.229, 0.224, 0.225], dtype=np.float32) - for item in cfg_transform: - if 'NormalizeImage' in item: - mean = np.array( - item['NormalizeImage']['mean'], dtype=np.float32) - std = np.array(item['NormalizeImage']['std'], dtype=np.float32) - if item['NormalizeImage'].get('is_scale', True): - scale = 1. / 255. 
- break - if self.data_format == 'NHWC': - self.scale = paddle.to_tensor(scale / std).reshape((1, 1, 1, 3)) - self.bias = paddle.to_tensor(-mean / std).reshape((1, 1, 1, 3)) - else: - self.scale = paddle.to_tensor(scale / std).reshape((1, 3, 1, 1)) - self.bias = paddle.to_tensor(-mean / std).reshape((1, 3, 1, 1)) - - def forward(self, inputs): - if self.data_format == 'NHWC': - image = inputs['image'] - inputs['image'] = paddle.transpose(image, [0, 2, 3, 1]) - - if self.fuse_norm: - image = inputs['image'] - self.inputs['image'] = image * self.scale + self.bias - self.inputs['im_shape'] = inputs['im_shape'] - self.inputs['scale_factor'] = inputs['scale_factor'] - else: - self.inputs = inputs - - self.model_arch() - - if self.training: - out = self.get_loss() - else: - inputs_list = [] - # multi-scale input - if not isinstance(inputs, typing.Sequence): - inputs_list.append(inputs) - else: - inputs_list.extend(inputs) - outs = [] - for inp in inputs_list: - if self.fuse_norm: - self.inputs['image'] = inp['image'] * self.scale + self.bias - self.inputs['im_shape'] = inp['im_shape'] - self.inputs['scale_factor'] = inp['scale_factor'] - else: - self.inputs = inp - outs.append(self.get_pred()) - - # multi-scale test - if len(outs) > 1: - out = self.merge_multi_scale_predictions(outs) - else: - out = outs[0] - return out - - def merge_multi_scale_predictions(self, outs): - # default values for architectures not included in following list - num_classes = 80 - nms_threshold = 0.5 - keep_top_k = 100 - - if self.__class__.__name__ in ('CascadeRCNN', 'FasterRCNN', 'MaskRCNN'): - num_classes = self.bbox_head.num_classes - keep_top_k = self.bbox_post_process.nms.keep_top_k - nms_threshold = self.bbox_post_process.nms.nms_threshold - else: - raise Exception( - "Multi scale test only supports CascadeRCNN, FasterRCNN and MaskRCNN for now" - ) - - final_boxes = [] - all_scale_outs = paddle.concat([o['bbox'] for o in outs]).numpy() - for c in range(num_classes): - idxs = all_scale_outs[:, 0] == c - if np.count_nonzero(idxs) == 0: - continue - r = nms(all_scale_outs[idxs, 1:], nms_threshold) - final_boxes.append( - np.concatenate([np.full((r.shape[0], 1), c), r], 1)) - out = np.concatenate(final_boxes) - out = np.concatenate(sorted( - out, key=lambda e: e[1])[-keep_top_k:]).reshape((-1, 6)) - out = { - 'bbox': paddle.to_tensor(out), - 'bbox_num': paddle.to_tensor(np.array([out.shape[0], ])) - } - - return out - - def build_inputs(self, data, input_def): - inputs = {} - for i, k in enumerate(input_def): - inputs[k] = data[i] - return inputs - - def model_arch(self, ): - pass - - def get_loss(self, ): - raise NotImplementedError("Should implement get_loss method!") - - def get_pred(self, ): - raise NotImplementedError("Should implement get_pred method!") diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/multi_stream_detector.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/multi_stream_detector.py deleted file mode 100644 index 58c4fe0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/multi_stream_detector.py +++ /dev/null @@ -1,69 +0,0 @@ -from typing import Dict -from collections import OrderedDict -from ppdet.modeling.architectures.meta_arch import BaseArch - - -class MultiSteamDetector(BaseArch): - def __init__(self, - model: Dict[str, BaseArch], - train_cfg=None, - test_cfg=None): - super(MultiSteamDetector, self).__init__() - self.submodules = list(model.keys()) - for k, v in model.items(): - setattr(self, k, v) - - self.train_cfg = train_cfg - self.test_cfg = test_cfg - 
self.inference_on = self.test_cfg.get("inference_on", - self.submodules[0]) - self.first_load = True - - def forward(self, inputs, return_loss=True, **kwargs): - """Calls either :func:`forward_train` or :func:`forward_test` depending - on whether ``return_loss`` is ``True``. - - Note this setting will change the expected inputs. When - ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor - and List[dict]), and when ``return_loss=False``, img and img_meta - should be double nested (i.e. List[Tensor], List[List[dict]]), with - the outer list indicating test time augmentations. - """ - if return_loss: - return self.forward_train(inputs, **kwargs) - else: - return self.forward_test(inputs, **kwargs) - - def get_loss(self, **kwargs): - # losses = self(**data) - - return self.forward_train(self, **kwargs) - - def model(self, **kwargs) -> BaseArch: - if "submodule" in kwargs: - assert (kwargs["submodule"] in self.submodules - ), "Detector does not contain submodule {}".format(kwargs[ - "submodule"]) - model: BaseArch = getattr(self, kwargs["submodule"]) - else: - model: BaseArch = getattr(self, self.inference_on) - return model - - def freeze(self, model_ref: str): - assert model_ref in self.submodules - model = getattr(self, model_ref) - model.eval() - for param in model.parameters(): - param.stop_gradient = True - - def update_ema_model(self, momentum=0.9996): - # print(momentum) - model_dict = self.student.state_dict() - new_dict = OrderedDict() - for key, value in self.teacher.state_dict().items(): - if key in model_dict.keys(): - new_dict[key] = (model_dict[key] * - (1 - momentum) + value * momentum) - else: - raise Exception("{} is not found in student model".format(key)) - self.teacher.set_dict(new_dict) diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/picodet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/picodet.py deleted file mode 100644 index b6f4447..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/picodet.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['PicoDet'] - - -@register -class PicoDet(BaseArch): - """ - Generalized Focal Loss network, see https://arxiv.org/abs/2006.04388 - - Args: - backbone (object): backbone instance - neck (object): 'FPN' instance - head (object): 'PicoHead' instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, neck, head='PicoHead', nms_cpu=False): - super(PicoDet, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.export_post_process = True - self.export_nms = True - self.nms_cpu = nms_cpu - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats, self.export_post_process) - if self.training or not self.export_post_process: - return head_outs, None - else: - scale_factor = self.inputs['scale_factor'] - bboxes, bbox_num = self.head.post_process( - head_outs, - scale_factor, - export_nms=self.export_nms, - nms_cpu=self.nms_cpu) - return bboxes, bbox_num - - def get_loss(self, ): - loss = {} - - head_outs, _ = self._forward() - loss_gfl = self.head.get_loss(head_outs, self.inputs) - loss.update(loss_gfl) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - if not self.export_post_process: - return {'picodet': self._forward()[0]} - elif self.export_nms: - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - else: - bboxes, mlvl_scores = self._forward() - output = {'bbox': bboxes, 'scores': mlvl_scores} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/pose3d_metro.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/pose3d_metro.py deleted file mode 100644 index 4275154..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/pose3d_metro.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from .. 
import layers as L - -__all__ = ['METRO_Body'] - - -def orthographic_projection(X, camera): - """Perform orthographic projection of 3D points X using the camera parameters - Args: - X: size = [B, N, 3] - camera: size = [B, 3] - Returns: - Projected 2D points -- size = [B, N, 2] - """ - camera = camera.reshape((-1, 1, 3)) - X_trans = X[:, :, :2] + camera[:, :, 1:] - shape = paddle.shape(X_trans) - X_2d = (camera[:, :, 0] * X_trans.reshape((shape[0], -1))).reshape(shape) - return X_2d - - -@register -class METRO_Body(BaseArch): - __category__ = 'architecture' - __inject__ = ['loss'] - - def __init__( - self, - num_joints, - backbone='HRNet', - trans_encoder='', - loss='Pose3DLoss', ): - """ - Modified from METRO network, see https://arxiv.org/abs/2012.09760 - - Args: - backbone (nn.Layer): backbone instance - """ - super(METRO_Body, self).__init__() - self.num_joints = num_joints - self.backbone = backbone - self.loss = loss - self.deploy = False - - self.trans_encoder = trans_encoder - self.conv_learn_tokens = paddle.nn.Conv1D(49, num_joints + 10, 1) - self.cam_param_fc = paddle.nn.Linear(3, 2) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - trans_encoder = create(cfg['trans_encoder']) - - return {'backbone': backbone, 'trans_encoder': trans_encoder} - - def _forward(self): - batch_size = self.inputs['image'].shape[0] - - image_feat = self.backbone(self.inputs) - image_feat_flatten = image_feat.reshape((batch_size, 2048, 49)) - image_feat_flatten = image_feat_flatten.transpose(perm=(0, 2, 1)) - # and apply a conv layer to learn image token for each 3d joint/vertex position - features = self.conv_learn_tokens(image_feat_flatten) # (B, J, C) - - if self.training: - # apply mask vertex/joint modeling - # meta_masks is a tensor of all the masks, randomly generated in dataloader - # we pre-define a [MASK] token, which is a floating-value vector with 0.01s - meta_masks = self.inputs['mjm_mask'].expand((-1, -1, 2048)) - constant_tensor = paddle.ones_like(features) * 0.01 - features = features * meta_masks + constant_tensor * (1 - meta_masks - ) - pred_out = self.trans_encoder(features) - - pred_3d_joints = pred_out[:, :self.num_joints, :] - cam_features = pred_out[:, self.num_joints:, :] - - # learn camera parameters - pred_2d_joints = self.cam_param_fc(cam_features) - return pred_3d_joints, pred_2d_joints - - def get_loss(self): - preds_3d, preds_2d = self._forward() - loss = self.loss(preds_3d, preds_2d, self.inputs) - output = {'loss': loss} - return output - - def get_pred(self): - preds_3d, preds_2d = self._forward() - outputs = {'pose3d': preds_3d, 'pose2d': preds_2d} - return outputs diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/ppyoloe.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/ppyoloe.py deleted file mode 100644 index 330542b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/ppyoloe.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['PPYOLOE', 'PPYOLOEWithAuxHead'] -# PP-YOLOE and PP-YOLOE+ are recommended to use this architecture, especially when use distillation or aux head -# PP-YOLOE and PP-YOLOE+ can also use the same architecture of YOLOv3 in yolo.py when not use distillation or aux head - - -@register -class PPYOLOE(BaseArch): - """ - PPYOLOE network, see https://arxiv.org/abs/2203.16250 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - yolo_head (nn.Layer): anchor_head instance - post_process (object): `BBoxPostProcess` instance - ssod_loss (object): 'SSODPPYOLOELoss' instance, only used for semi-det(ssod) - for_distill (bool): whether for distillation - feat_distill_place (str): distill which feature for distillation - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - - __category__ = 'architecture' - __shared__ = ['for_distill'] - __inject__ = ['post_process', 'ssod_loss'] - - def __init__(self, - backbone='CSPResNet', - neck='CustomCSPPAN', - yolo_head='PPYOLOEHead', - post_process='BBoxPostProcess', - ssod_loss='SSODPPYOLOELoss', - for_distill=False, - feat_distill_place='neck_feats', - for_mot=False): - super(PPYOLOE, self).__init__() - self.backbone = backbone - self.neck = neck - self.yolo_head = yolo_head - self.post_process = post_process - self.for_mot = for_mot - - # for ssod, semi-det - self.is_teacher = False - self.ssod_loss = ssod_loss - - # distill - self.for_distill = for_distill - self.feat_distill_place = feat_distill_place - if for_distill: - assert feat_distill_place in ['backbone_feats', 'neck_feats'] - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - yolo_head = create(cfg['yolo_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "yolo_head": yolo_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - self.is_teacher = self.inputs.get('is_teacher', False) # for semi-det - if self.training or self.is_teacher: - yolo_losses = self.yolo_head(neck_feats, self.inputs) - - if self.for_distill: - if self.feat_distill_place == 'backbone_feats': - self.yolo_head.distill_pairs['backbone_feats'] = body_feats - elif self.feat_distill_place == 'neck_feats': - self.yolo_head.distill_pairs['neck_feats'] = neck_feats - else: - raise ValueError - return yolo_losses - else: - - yolo_head_outs = self.yolo_head(neck_feats) - - if self.post_process is not None: - bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - - else: - bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( - yolo_head_outs, self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = 
yolo_head_outs[0] # predict scores (probability) - extra_data['nms_keep_idx'] = nms_keep_idx - output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} - else: - output = {'bbox': bbox, 'bbox_num': bbox_num} - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - - def get_loss_keys(self): - return ['loss_cls', 'loss_iou', 'loss_dfl', 'loss_contrast'] - - def get_ssod_loss(self, student_head_outs, teacher_head_outs, train_cfg): - ssod_losses = self.ssod_loss(student_head_outs, teacher_head_outs, - train_cfg) - return ssod_losses - - -@register -class PPYOLOEWithAuxHead(BaseArch): - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, - backbone='CSPResNet', - neck='CustomCSPPAN', - yolo_head='PPYOLOEHead', - aux_head='SimpleConvHead', - post_process='BBoxPostProcess', - for_mot=False, - detach_epoch=5): - """ - PPYOLOE network, see https://arxiv.org/abs/2203.16250 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - yolo_head (nn.Layer): anchor_head instance - post_process (object): `BBoxPostProcess` instance - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - super(PPYOLOEWithAuxHead, self).__init__() - self.backbone = backbone - self.neck = neck - self.aux_neck = copy.deepcopy(self.neck) - - self.yolo_head = yolo_head - self.aux_head = aux_head - self.post_process = post_process - self.for_mot = for_mot - self.detach_epoch = detach_epoch - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - aux_neck = copy.deepcopy(neck) - - # head - kwargs = {'input_shape': neck.out_shape} - yolo_head = create(cfg['yolo_head'], **kwargs) - aux_head = create(cfg['aux_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "yolo_head": yolo_head, - 'aux_head': aux_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - if self.training: - if self.inputs['epoch_id'] >= self.detach_epoch: - aux_neck_feats = self.aux_neck([f.detach() for f in body_feats]) - dual_neck_feats = (paddle.concat( - [f.detach(), aux_f], axis=1) for f, aux_f in - zip(neck_feats, aux_neck_feats)) - else: - aux_neck_feats = self.aux_neck(body_feats) - dual_neck_feats = (paddle.concat( - [f, aux_f], axis=1) for f, aux_f in - zip(neck_feats, aux_neck_feats)) - aux_cls_scores, aux_bbox_preds = self.aux_head(dual_neck_feats) - loss = self.yolo_head( - neck_feats, - self.inputs, - aux_pred=[aux_cls_scores, aux_bbox_preds]) - return loss - else: - yolo_head_outs = self.yolo_head(neck_feats) - - if self.post_process is not None: - bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - else: - bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( - yolo_head_outs, self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) - # Todo: get logits output - extra_data['nms_keep_idx'] = nms_keep_idx - output = {'bbox': 
bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} - else: - output = {'bbox': bbox, 'bbox_num': bbox_num} - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/queryinst.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/queryinst.py deleted file mode 100644 index 76a65ed..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/queryinst.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['QueryInst'] - - -@register -class QueryInst(BaseArch): - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, - backbone, - neck, - rpn_head, - roi_head, - post_process='SparsePostProcess'): - super(QueryInst, self).__init__() - self.backbone = backbone - self.neck = neck - self.rpn_head = rpn_head - self.roi_head = roi_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - rpn_head = create(cfg['rpn_head'], **kwargs) - roi_head = create(cfg['roi_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'rpn_head': rpn_head, - "roi_head": roi_head - } - - def _forward(self, targets=None): - features = self.backbone(self.inputs) - features = self.neck(features) - - proposal_bboxes, proposal_features = self.rpn_head(self.inputs[ - 'img_whwh']) - outputs = self.roi_head(features, proposal_bboxes, proposal_features, - targets) - - if self.training: - return outputs - else: - bbox_pred, bbox_num, mask_pred = self.post_process( - outputs['class_logits'], outputs['bbox_pred'], - self.inputs['scale_factor_whwh'], self.inputs['ori_shape'], - outputs['mask_logits']) - return bbox_pred, bbox_num, mask_pred - - def get_loss(self): - targets = [] - for i in range(len(self.inputs['img_whwh'])): - boxes = self.inputs['gt_bbox'][i] - labels = self.inputs['gt_class'][i].squeeze(-1) - img_whwh = self.inputs['img_whwh'][i] - if boxes.shape[0] != 0: - img_whwh_tgt = img_whwh.unsqueeze(0).tile([boxes.shape[0], 1]) - else: - img_whwh_tgt = paddle.zeros_like(boxes) - gt_segm = self.inputs['gt_segm'][i].astype('float32') - targets.append({ - 'boxes': boxes, - 'labels': labels, - 'img_whwh': img_whwh, - 'img_whwh_tgt': img_whwh_tgt, - 'gt_segm': gt_segm - }) - losses = self._forward(targets) - losses.update({'loss': sum(losses.values())}) - return losses - - def get_pred(self): - bbox_pred, bbox_num, mask_pred = self._forward() - return {'bbox': bbox_pred, 'bbox_num': bbox_num, 'mask': mask_pred} diff --git 
a/pdfdet/models/Paddle/ppdet/modeling/architectures/retinanet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/retinanet.py deleted file mode 100644 index fc49f0e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/retinanet.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import paddle -import paddle.nn.functional as F - -__all__ = ['RetinaNet'] - - -@register -class RetinaNet(BaseArch): - __category__ = 'architecture' - - def __init__(self, backbone, neck, head): - super(RetinaNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'head': head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats) - - if self.training: - return self.head(neck_feats, self.inputs) - else: - head_outs = self.head(neck_feats) - bbox, bbox_num, nms_keep_idx = self.head.post_process( - head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - preds_logits = self.head.decode_cls_logits(head_outs[0]) - preds_scores = F.sigmoid(preds_logits) - extra_data['logits'] = preds_logits - extra_data['scores'] = preds_scores - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return {'bbox': bbox, 'bbox_num': bbox_num, "extra_data": extra_data} - else: - return {'bbox': bbox, 'bbox_num': bbox_num} - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/s2anet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/s2anet.py deleted file mode 100644 index 8fb71e2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/s2anet.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
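
RetinaNet's use_extra_data branch above stores raw logits, sigmoid scores, and nms_keep_idx alongside the usual batch-flattened detections. A minimal consumer-side sketch of that output contract (not part of the patch; the 6-column [label, score, x1, y1, x2, y2] row layout of 'bbox' is an assumption about ppdet's post-processed output):

def split_by_image(output):
    # 'bbox' holds all detections for the whole batch; 'bbox_num' gives the
    # per-image row counts, so per-image results are recovered by slicing.
    boxes, counts = output['bbox'], output['bbox_num']
    per_image, start = [], 0
    for n in counts.numpy().tolist():  # paddle Tensor -> python ints
        per_image.append(boxes[start:start + n])
        start += n
    return per_image
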
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['S2ANet'] - - -@register -class S2ANet(BaseArch): - __category__ = 'architecture' - __inject__ = ['head'] - - def __init__(self, backbone, neck, head): - """ - S2ANet, see https://arxiv.org/pdf/2008.09397.pdf - - Args: - backbone (object): backbone instance - neck (object): `FPN` instance - head (object): `Head` instance - """ - super(S2ANet, self).__init__() - self.backbone = backbone - self.neck = neck - self.s2anet_head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - kwargs = {'input_shape': backbone.out_shape} - neck = cfg['neck'] and create(cfg['neck'], **kwargs) - - out_shape = neck and neck.out_shape or backbone.out_shape - kwargs = {'input_shape': out_shape} - head = create(cfg['head'], **kwargs) - - return {'backbone': backbone, 'neck': neck, "head": head} - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.neck is not None: - body_feats = self.neck(body_feats) - if self.training: - loss = self.s2anet_head(body_feats, self.inputs) - return loss - else: - head_outs = self.s2anet_head(body_feats) - # post_process - bboxes, bbox_num = self.s2anet_head.get_bboxes(head_outs) - # rescale the prediction back to origin image - im_shape = self.inputs['im_shape'] - scale_factor = self.inputs['scale_factor'] - bboxes = self.s2anet_head.get_pred(bboxes, bbox_num, im_shape, - scale_factor) - # output - output = {'bbox': bboxes, 'bbox_num': bbox_num} - return output - - def get_loss(self, ): - loss = self._forward() - return loss - - def get_pred(self): - output = self._forward() - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/solov2.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/solov2.py deleted file mode 100644 index 4e5fc21..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/solov2.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
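
S2ANet.from_config above is the only from_config in these hunks where the neck is optional, and the `and`/`or` chaining is easy to misread. A distilled sketch of the idiom (not part of the patch; `create` stands in for ppdet.core.workspace.create):

def build_with_optional_neck(cfg, create):
    backbone = create(cfg['backbone'])
    # A falsy cfg['neck'] (e.g. None) short-circuits to no neck at all.
    neck = cfg['neck'] and create(cfg['neck'], input_shape=backbone.out_shape)
    # Head input falls back to the backbone's feature shapes when neck is None.
    out_shape = neck.out_shape if neck else backbone.out_shape
    head = create(cfg['head'], input_shape=out_shape)
    return {'backbone': backbone, 'neck': neck, 'head': head}

The other architectures deleted here (RetinaNet, SOLOv2, TOOD, TTFNet, ...) use the unconditional form of the same chain: each component is created with the out_shape of the one before it.
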
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['SOLOv2'] - - -@register -class SOLOv2(BaseArch): - """ - SOLOv2 network, see https://arxiv.org/abs/2003.10152 - - Args: - backbone (object): an backbone instance - solov2_head (object): an `SOLOv2Head` instance - mask_head (object): an `SOLOv2MaskHead` instance - neck (object): neck of network, such as feature pyramid network instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, solov2_head, mask_head, neck=None): - super(SOLOv2, self).__init__() - self.backbone = backbone - self.neck = neck - self.solov2_head = solov2_head - self.mask_head = mask_head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - solov2_head = create(cfg['solov2_head'], **kwargs) - mask_head = create(cfg['mask_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - 'solov2_head': solov2_head, - 'mask_head': mask_head, - } - - def model_arch(self): - body_feats = self.backbone(self.inputs) - - body_feats = self.neck(body_feats) - - self.seg_pred = self.mask_head(body_feats) - - self.cate_pred_list, self.kernel_pred_list = self.solov2_head( - body_feats) - - def get_loss(self, ): - loss = {} - # get gt_ins_labels, gt_cate_labels, etc. - gt_ins_labels, gt_cate_labels, gt_grid_orders = [], [], [] - fg_num = self.inputs['fg_num'] - for i in range(len(self.solov2_head.seg_num_grids)): - ins_label = 'ins_label{}'.format(i) - if ins_label in self.inputs: - gt_ins_labels.append(self.inputs[ins_label]) - cate_label = 'cate_label{}'.format(i) - if cate_label in self.inputs: - gt_cate_labels.append(self.inputs[cate_label]) - grid_order = 'grid_order{}'.format(i) - if grid_order in self.inputs: - gt_grid_orders.append(self.inputs[grid_order]) - - loss_solov2 = self.solov2_head.get_loss( - self.cate_pred_list, self.kernel_pred_list, self.seg_pred, - gt_ins_labels, gt_cate_labels, gt_grid_orders, fg_num) - loss.update(loss_solov2) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - seg_masks, cate_labels, cate_scores, bbox_num = self.solov2_head.get_prediction( - self.cate_pred_list, self.kernel_pred_list, self.seg_pred, - self.inputs['im_shape'], self.inputs['scale_factor']) - outs = { - "segm": seg_masks, - "bbox_num": bbox_num, - 'cate_label': cate_labels, - 'cate_score': cate_scores - } - return outs diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/sparse_rcnn.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/sparse_rcnn.py deleted file mode 100644 index 2cbc853..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/sparse_rcnn.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ["SparseRCNN"] - - -@register -class SparseRCNN(BaseArch): - __category__ = 'architecture' - __inject__ = ["postprocess"] - - def __init__(self, - backbone, - neck, - head="SparsercnnHead", - postprocess="SparsePostProcess"): - super(SparseRCNN, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.postprocess = postprocess - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'roi_input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats, self.inputs["img_whwh"]) - - if not self.training: - bbox_pred, bbox_num = self.postprocess( - head_outs["pred_logits"], head_outs["pred_boxes"], - self.inputs["scale_factor_whwh"], self.inputs["ori_shape"]) - return bbox_pred, bbox_num - else: - return head_outs - - def get_loss(self): - batch_gt_class = self.inputs["gt_class"] - batch_gt_box = self.inputs["gt_bbox"] - batch_whwh = self.inputs["img_whwh"] - targets = [] - - for i in range(len(batch_gt_class)): - boxes = batch_gt_box[i] - labels = batch_gt_class[i].squeeze(-1) - img_whwh = batch_whwh[i] - img_whwh_tgt = img_whwh.unsqueeze(0).tile([int(boxes.shape[0]), 1]) - targets.append({ - "boxes": boxes, - "labels": labels, - "img_whwh": img_whwh, - "img_whwh_tgt": img_whwh_tgt - }) - - outputs = self._forward() - loss_dict = self.head.get_loss(outputs, targets) - acc = loss_dict["acc"] - loss_dict.pop("acc") - total_loss = sum(loss_dict.values()) - loss_dict.update({"loss": total_loss, "acc": acc}) - return loss_dict - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/ssd.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/ssd.py deleted file mode 100644 index b8669b7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/ssd.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
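
SparseRCNN.get_loss above builds one target dict per image, tiling that image's [w, h, w, h] vector once per ground-truth box. The loop in isolation (a sketch, not part of the patch); note that, unlike QueryInst.get_loss earlier in this file set, it does not special-case images with zero boxes:

def build_sparse_rcnn_targets(gt_class, gt_bbox, img_whwh):
    targets = []
    for labels, boxes, whwh in zip(gt_class, gt_bbox, img_whwh):
        targets.append({
            'boxes': boxes,                           # (num_gt, 4)
            'labels': labels.squeeze(-1),             # (num_gt,)
            'img_whwh': whwh,                         # (4,)
            'img_whwh_tgt': whwh.unsqueeze(0).tile(   # (num_gt, 4)
                [int(boxes.shape[0]), 1]),
        })
    return targets
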
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -import paddle -import paddle.nn.functional as F - -__all__ = ['SSD'] - - -@register -class SSD(BaseArch): - """ - Single Shot MultiBox Detector, see https://arxiv.org/abs/1512.02325 - - Args: - backbone (nn.Layer): backbone instance - ssd_head (nn.Layer): `SSDHead` instance - post_process (object): `BBoxPostProcess` instance - """ - - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, backbone, ssd_head, post_process, r34_backbone=False): - super(SSD, self).__init__() - self.backbone = backbone - self.ssd_head = ssd_head - self.post_process = post_process - self.r34_backbone = r34_backbone - if self.r34_backbone: - from ppdet.modeling.backbones.resnet import ResNet - assert isinstance(self.backbone, ResNet) and \ - self.backbone.depth == 34, \ - "If you set r34_backbone=True, please use ResNet-34 as backbone." - self.backbone.res_layers[2].blocks[0].branch2a.conv._stride = [1, 1] - self.backbone.res_layers[2].blocks[0].short.conv._stride = [1, 1] - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # head - kwargs = {'input_shape': backbone.out_shape} - ssd_head = create(cfg['ssd_head'], **kwargs) - - return { - 'backbone': backbone, - "ssd_head": ssd_head, - } - - def _forward(self): - # Backbone - body_feats = self.backbone(self.inputs) - - # SSD Head - if self.training: - return self.ssd_head(body_feats, self.inputs['image'], - self.inputs['gt_bbox'], - self.inputs['gt_class']) - else: - preds, anchors = self.ssd_head(body_feats, self.inputs['image']) - bbox, bbox_num, nms_keep_idx = self.post_process( - preds, anchors, self.inputs['im_shape'], - self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ - preds_logits = preds[1] # [[1xNumBBoxNumClass]] - extra_data['scores'] = F.softmax(paddle.concat( - preds_logits, axis=1)).transpose([0, 2, 1]) - extra_data['logits'] = paddle.concat( - preds_logits, axis=1).transpose([0, 2, 1]) - extra_data['nms_keep_idx'] = nms_keep_idx # bbox index before nms - return bbox, bbox_num, extra_data - else: - return bbox, bbox_num - - def get_loss(self, ): - return {"loss": self._forward()} - - def get_pred(self): - if self.use_extra_data: - bbox_pred, bbox_num, extra_data = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - "extra_data": extra_data - } - else: - bbox_pred, bbox_num = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - } - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/tood.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/tood.py deleted file mode 100644 index 157ec6f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/tood.py +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['TOOD'] - - -@register -class TOOD(BaseArch): - """ - TOOD: Task-aligned One-stage Object Detection, see https://arxiv.org/abs/2108.07755 - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): 'FPN' instance - head (nn.Layer): 'TOODHead' instance - """ - - __category__ = 'architecture' - - def __init__(self, backbone, neck, head): - super(TOOD, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - fpn_feats = self.neck(body_feats) - head_outs = self.head(fpn_feats) - if not self.training: - bboxes, bbox_num = self.head.post_process( - head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) - return bboxes, bbox_num - else: - loss = self.head.get_loss(head_outs, self.inputs) - return loss - - def get_loss(self): - return self._forward() - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/ttfnet.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/ttfnet.py deleted file mode 100644 index c3eb61c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/ttfnet.py +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
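
The use_extra_data branch of SSD._forward above is the one place in these hunks where scores come from a softmax rather than a sigmoid, and the axis juggling deserves a note. A sketch of that conversion (not part of the patch), assuming each element of preds_logits has shape [B, N_l, num_classes]:

import paddle
import paddle.nn.functional as F

def ssd_scores_from_logits(preds_logits):
    logits = paddle.concat(preds_logits, axis=1)  # [B, N, C], N = sum of N_l
    scores = F.softmax(logits, axis=-1)           # per-anchor class probabilities
    return scores.transpose([0, 2, 1])            # [B, C, N], as stored in extra_data
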
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['TTFNet'] - - -@register -class TTFNet(BaseArch): - """ - TTFNet network, see https://arxiv.org/abs/1909.00700 - - Args: - backbone (object): backbone instance - neck (object): 'TTFFPN' instance - ttf_head (object): 'TTFHead' instance - post_process (object): 'BBoxPostProcess' instance - """ - - __category__ = 'architecture' - __inject__ = ['post_process'] - - def __init__(self, - backbone='DarkNet', - neck='TTFFPN', - ttf_head='TTFHead', - post_process='BBoxPostProcess'): - super(TTFNet, self).__init__() - self.backbone = backbone - self.neck = neck - self.ttf_head = ttf_head - self.post_process = post_process - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - backbone = create(cfg['backbone']) - - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - kwargs = {'input_shape': neck.out_shape} - ttf_head = create(cfg['ttf_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "ttf_head": ttf_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - body_feats = self.neck(body_feats) - hm, wh = self.ttf_head(body_feats) - if self.training: - return hm, wh - else: - bbox, bbox_num = self.post_process(hm, wh, self.inputs['im_shape'], - self.inputs['scale_factor']) - return bbox, bbox_num - - def get_loss(self, ): - loss = {} - heatmap = self.inputs['ttf_heatmap'] - box_target = self.inputs['ttf_box_target'] - reg_weight = self.inputs['ttf_reg_weight'] - hm, wh = self._forward() - head_loss = self.ttf_head.get_loss(hm, wh, heatmap, box_target, - reg_weight) - loss.update(head_loss) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - return loss - - def get_pred(self): - bbox_pred, bbox_num = self._forward() - output = { - "bbox": bbox_pred, - "bbox_num": bbox_num, - } - return output diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolo.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/yolo.py deleted file mode 100644 index b004935..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolo.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
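
TTFNet.get_loss above (like SOLOv2 earlier) follows the ppdet convention of collecting named losses in a dict and appending their sum under the key 'loss'. The aggregation step in isolation (a sketch, not part of the patch):

import paddle

def finalize_loss(loss_dict):
    # The trainer steps on loss_dict['loss']; the named entries stay
    # available for per-component logging.
    loss_dict.update({'loss': paddle.add_n(list(loss_dict.values()))})
    return loss_dict
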
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch -from ..post_process import JDEBBoxPostProcess - -__all__ = ['YOLOv3'] -# YOLOv3,PP-YOLO,PP-YOLOv2,PP-YOLOE,PP-YOLOE+ use the same architecture as YOLOv3 -# PP-YOLOE and PP-YOLOE+ are recommended to use PPYOLOE architecture in ppyoloe.py, especially when use distillation or aux head - - -@register -class YOLOv3(BaseArch): - __category__ = 'architecture' - __shared__ = ['data_format'] - __inject__ = ['post_process'] - - def __init__(self, - backbone='DarkNet', - neck='YOLOv3FPN', - yolo_head='YOLOv3Head', - post_process='BBoxPostProcess', - data_format='NCHW', - for_mot=False): - """ - YOLOv3 network, see https://arxiv.org/abs/1804.02767 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - yolo_head (nn.Layer): anchor_head instance - bbox_post_process (object): `BBoxPostProcess` instance - data_format (str): data format, NCHW or NHWC - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - super(YOLOv3, self).__init__(data_format=data_format) - self.backbone = backbone - self.neck = neck - self.yolo_head = yolo_head - self.post_process = post_process - self.for_mot = for_mot - self.return_idx = isinstance(post_process, JDEBBoxPostProcess) - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - # head - kwargs = {'input_shape': neck.out_shape} - yolo_head = create(cfg['yolo_head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "yolo_head": yolo_head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - if self.for_mot: - neck_feats = self.neck(body_feats, self.for_mot) - else: - neck_feats = self.neck(body_feats) - - if isinstance(neck_feats, dict): - assert self.for_mot == True - emb_feats = neck_feats['emb_feats'] - neck_feats = neck_feats['yolo_feats'] - - if self.training: - yolo_losses = self.yolo_head(neck_feats, self.inputs) - - if self.for_mot: - return {'det_losses': yolo_losses, 'emb_feats': emb_feats} - else: - return yolo_losses - - else: - yolo_head_outs = self.yolo_head(neck_feats) - - if self.for_mot: - # the detection part of JDE MOT model - boxes_idx, bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors) - output = { - 'bbox': bbox, - 'bbox_num': bbox_num, - 'boxes_idx': boxes_idx, - 'nms_keep_idx': nms_keep_idx, - 'emb_feats': emb_feats, - } - else: - if self.return_idx: - # the detection part of JDE MOT model - _, bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors) - elif self.post_process is not None: - # anchor based YOLOs: YOLOv3,PP-YOLO,PP-YOLOv2 use mask_anchors - bbox, bbox_num, nms_keep_idx = self.post_process( - yolo_head_outs, self.yolo_head.mask_anchors, - self.inputs['im_shape'], self.inputs['scale_factor']) - else: - # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ - bbox, bbox_num, nms_keep_idx = self.yolo_head.post_process( - yolo_head_outs, self.inputs['scale_factor']) - - if self.use_extra_data: - extra_data = {} # record the bbox output before nms, such like scores and nms_keep_idx - """extra_data:{ - 'scores': predict scores, - 'nms_keep_idx': bbox index before nms, - } - """ 
- extra_data['scores'] = yolo_head_outs[0] # predict scores (probability) - # Todo: get logits output - extra_data['nms_keep_idx'] = nms_keep_idx - # Todo support for mask_anchors yolo - output = {'bbox': bbox, 'bbox_num': bbox_num, 'extra_data': extra_data} - else: - output = {'bbox': bbox, 'bbox_num': bbox_num} - - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolof.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/yolof.py deleted file mode 100644 index b6a2920..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolof.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -__all__ = ['YOLOF'] - - -@register -class YOLOF(BaseArch): - __category__ = 'architecture' - - def __init__(self, - backbone='ResNet', - neck='DilatedEncoder', - head='YOLOFHead', - for_mot=False): - """ - YOLOF network, see https://arxiv.org/abs/2103.09460 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): DilatedEncoder instance - head (nn.Layer): YOLOFHead instance - for_mot (bool): whether return other features for multi-object tracking - models, default False in pure object detection models. - """ - super(YOLOF, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.for_mot = for_mot - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - # head - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - if self.training: - yolo_losses = self.head(neck_feats, self.inputs) - return yolo_losses - else: - yolo_head_outs = self.head(neck_feats) - bbox, bbox_num = self.head.post_process(yolo_head_outs, - self.inputs['im_shape'], - self.inputs['scale_factor']) - output = {'bbox': bbox, 'bbox_num': bbox_num} - return output - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() diff --git a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolox.py b/pdfdet/models/Paddle/ppdet/modeling/architectures/yolox.py deleted file mode 100644 index 8e02e9e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/architectures/yolox.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register, create -from .meta_arch import BaseArch - -import random -import paddle -import paddle.nn.functional as F -import paddle.distributed as dist - -__all__ = ['YOLOX'] - - -@register -class YOLOX(BaseArch): - """ - YOLOX network, see https://arxiv.org/abs/2107.08430 - - Args: - backbone (nn.Layer): backbone instance - neck (nn.Layer): neck instance - head (nn.Layer): head instance - for_mot (bool): whether used for MOT or not - input_size (list[int]): initial scale, will be reset by self._preprocess() - size_stride (int): stride of the size range - size_range (list[int]): multi-scale range for training - random_interval (int): interval of iter to change self._input_size - """ - __category__ = 'architecture' - - def __init__(self, - backbone='CSPDarkNet', - neck='YOLOCSPPAN', - head='YOLOXHead', - for_mot=False, - input_size=[640, 640], - size_stride=32, - size_range=[15, 25], - random_interval=10): - super(YOLOX, self).__init__() - self.backbone = backbone - self.neck = neck - self.head = head - self.for_mot = for_mot - - self.input_size = input_size - self._input_size = paddle.to_tensor(input_size) - self.size_stride = size_stride - self.size_range = size_range - self.random_interval = random_interval - self._step = 0 - - @classmethod - def from_config(cls, cfg, *args, **kwargs): - # backbone - backbone = create(cfg['backbone']) - - # fpn - kwargs = {'input_shape': backbone.out_shape} - neck = create(cfg['neck'], **kwargs) - - # head - kwargs = {'input_shape': neck.out_shape} - head = create(cfg['head'], **kwargs) - - return { - 'backbone': backbone, - 'neck': neck, - "head": head, - } - - def _forward(self): - if self.training: - self._preprocess() - body_feats = self.backbone(self.inputs) - neck_feats = self.neck(body_feats, self.for_mot) - - if self.training: - yolox_losses = self.head(neck_feats, self.inputs) - yolox_losses.update({'size': self._input_size[0]}) - return yolox_losses - else: - head_outs = self.head(neck_feats) - bbox, bbox_num = self.head.post_process( - head_outs, self.inputs['im_shape'], self.inputs['scale_factor']) - return {'bbox': bbox, 'bbox_num': bbox_num} - - def get_loss(self): - return self._forward() - - def get_pred(self): - return self._forward() - - def _preprocess(self): - # YOLOX multi-scale training, interpolate resize before inputs of the network. 
- self._get_size() - scale_y = self._input_size[0] / self.input_size[0] - scale_x = self._input_size[1] / self.input_size[1] - if scale_x != 1 or scale_y != 1: - self.inputs['image'] = F.interpolate( - self.inputs['image'], - size=self._input_size, - mode='bilinear', - align_corners=False) - gt_bboxes = self.inputs['gt_bbox'] - for i in range(len(gt_bboxes)): - if len(gt_bboxes[i]) > 0: - gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x - gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y - self.inputs['gt_bbox'] = gt_bboxes - - def _get_size(self): - # random_interval = 10 as default, every 10 iters to change self._input_size - image_ratio = self.input_size[1] * 1.0 / self.input_size[0] - if self._step % self.random_interval == 0: - size_factor = random.randint(*self.size_range) - size = [ - self.size_stride * size_factor, - self.size_stride * int(size_factor * image_ratio) - ] - self._input_size = paddle.to_tensor(size) - self._step += 1 diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/__init__.py deleted file mode 100644 index f462a9f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/__init__.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import utils -from . import task_aligned_assigner -from . import atss_assigner -from . import simota_assigner -from . import max_iou_assigner -from . import fcosr_assigner -from . import rotated_task_aligned_assigner -from . import task_aligned_assigner_cr -from . import uniform_assigner - -from .utils import * -from .task_aligned_assigner import * -from .atss_assigner import * -from .simota_assigner import * -from .max_iou_assigner import * -from .fcosr_assigner import * -from .rotated_task_aligned_assigner import * -from .task_aligned_assigner_cr import * -from .uniform_assigner import * -from .hungarian_assigner import * -from .pose_utils import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/atss_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/atss_assigner.py deleted file mode 100644 index f1aae2b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/atss_assigner.py +++ /dev/null @@ -1,225 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
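
Stepping back to YOLOX for a moment: _get_size above re-draws the training resolution every random_interval steps. A worked sketch of a single draw (not part of the patch):

import random

def draw_yolox_size(input_size=(640, 640), size_stride=32, size_range=(15, 25)):
    # Keep the aspect ratio of the base size and snap both sides to
    # multiples of the stride.
    image_ratio = input_size[1] / input_size[0]
    size_factor = random.randint(*size_range)
    return [size_stride * size_factor,
            size_stride * int(size_factor * image_ratio)]

With the defaults this yields square sizes from 480 to 800 in steps of 32, which is what _preprocess then interpolates both the images and the ground-truth boxes to.
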
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..bbox_utils import iou_similarity, batch_iou_similarity -from ..bbox_utils import bbox_center -from .utils import (check_points_inside_bboxes, compute_max_iou_anchor, - compute_max_iou_gt) - -__all__ = ['ATSSAssigner'] - - -@register -class ATSSAssigner(nn.Layer): - """Bridging the Gap Between Anchor-based and Anchor-free Detection - via Adaptive Training Sample Selection - """ - __shared__ = ['num_classes'] - - def __init__(self, - topk=9, - num_classes=80, - force_gt_matching=False, - eps=1e-9, - sm_use=False): - super(ATSSAssigner, self).__init__() - self.topk = topk - self.num_classes = num_classes - self.force_gt_matching = force_gt_matching - self.eps = eps - self.sm_use = sm_use - - def _gather_topk_pyramid(self, gt2anchor_distances, num_anchors_list, - pad_gt_mask): - gt2anchor_distances_list = paddle.split( - gt2anchor_distances, num_anchors_list, axis=-1) - num_anchors_index = np.cumsum(num_anchors_list).tolist() - num_anchors_index = [0, ] + num_anchors_index[:-1] - is_in_topk_list = [] - topk_idxs_list = [] - for distances, anchors_index in zip(gt2anchor_distances_list, - num_anchors_index): - num_anchors = distances.shape[-1] - _, topk_idxs = paddle.topk( - distances, self.topk, axis=-1, largest=False) - topk_idxs_list.append(topk_idxs + anchors_index) - is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( - axis=-2).astype(gt2anchor_distances.dtype) - is_in_topk_list.append(is_in_topk * pad_gt_mask) - is_in_topk_list = paddle.concat(is_in_topk_list, axis=-1) - topk_idxs_list = paddle.concat(topk_idxs_list, axis=-1) - return is_in_topk_list, topk_idxs_list - - @paddle.no_grad() - def forward(self, - anchor_bboxes, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None, - pred_bboxes=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/atss_assigner.py - - The assignment is done in following steps - 1. compute iou between all bbox (bbox of all pyramid levels) and gt - 2. compute center distance between all bbox and gt - 3. on each pyramid level, for each gt, select k bbox whose center - are closest to the gt center, so we total select k*l bbox as - candidates for each gt - 4. get corresponding iou for the these candidates, and compute the - mean and std, set mean + std as the iou threshold - 5. select these candidates whose iou are greater than or equal to - the threshold as positive - 6. limit the positive sample's center in gt - 7. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. 
- Args: - anchor_bboxes (Tensor, float32): pre-defined anchors, shape(L, 4), - "xmin, xmax, ymin, ymax" format - num_anchors_list (List): num of anchors in each level - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, - shape(B, n, 1), if None, then it will initialize with one_hot label - pred_bboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 4) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C), if pred_bboxes is not None, then output ious - """ - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - num_anchors, _ = anchor_bboxes.shape - batch_size, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype='int32') - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, self.num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # 1. compute iou between gt and anchor bbox, [B, n, L] - ious = iou_similarity(gt_bboxes.reshape([-1, 4]), anchor_bboxes) - ious = ious.reshape([batch_size, -1, num_anchors]) - - # 2. compute center distance between all anchors and gt, [B, n, L] - gt_centers = bbox_center(gt_bboxes.reshape([-1, 4])).unsqueeze(1) - anchor_centers = bbox_center(anchor_bboxes) - gt2anchor_distances = (gt_centers - anchor_centers.unsqueeze(0)) \ - .norm(2, axis=-1).reshape([batch_size, -1, num_anchors]) - - # 3. on each pyramid level, selecting topk closest candidates - # based on the center distance, [B, n, L] - is_in_topk, topk_idxs = self._gather_topk_pyramid( - gt2anchor_distances, num_anchors_list, pad_gt_mask) - - # 4. get corresponding iou for the these candidates, and compute the - # mean and std, 5. set mean + std as the iou threshold - iou_candidates = ious * is_in_topk - iou_threshold = paddle.index_sample( - iou_candidates.flatten(stop_axis=-2), - topk_idxs.flatten(stop_axis=-2)) - iou_threshold = iou_threshold.reshape([batch_size, num_max_boxes, -1]) - iou_threshold = iou_threshold.mean(axis=-1, keepdim=True) + \ - iou_threshold.std(axis=-1, keepdim=True) - is_in_topk = paddle.where(iou_candidates > iou_threshold, is_in_topk, - paddle.zeros_like(is_in_topk)) - - # 6. check the positive sample's center in gt, [B, n, L] - if self.sm_use: - is_in_gts = check_points_inside_bboxes( - anchor_centers, gt_bboxes, sm_use=True) - else: - is_in_gts = check_points_inside_bboxes(anchor_centers, gt_bboxes) - - # select positive sample, [B, n, L] - mask_positive = is_in_topk * is_in_gts * pad_gt_mask - - # 7. if an anchor box is assigned to multiple gts, - # the one with the highest iou will be selected. - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = ( - mask_positive_sum.unsqueeze(1) > 1).astype('int32').tile( - [1, num_max_boxes, 1]).astype('bool') - if self.sm_use: - is_max_iou = compute_max_iou_anchor(ious * mask_positive) - else: - is_max_iou = compute_max_iou_anchor(ious) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - # 8. 
make sure every gt_bbox matches the anchor - if self.force_gt_matching: - is_max_iou = compute_max_iou_gt(ious) * pad_gt_mask - mask_max_iou = (is_max_iou.sum(-2, keepdim=True) == 1).tile( - [1, num_max_boxes, 1]) - mask_positive = paddle.where(mask_max_iou, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - - assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) - ind = list(range(self.num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - if pred_bboxes is not None: - # assigned iou - ious = batch_iou_similarity(gt_bboxes, pred_bboxes) * mask_positive - ious = ious.max(axis=-2).unsqueeze(-1) - assigned_scores *= ious - elif gt_scores is not None: - gather_scores = paddle.gather( - gt_scores.flatten(), assigned_gt_index.flatten(), axis=0) - gather_scores = gather_scores.reshape([batch_size, num_anchors]) - gather_scores = paddle.where(mask_positive_sum > 0, gather_scores, - paddle.zeros_like(gather_scores)) - assigned_scores *= gather_scores.unsqueeze(-1) - - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/clrnet_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/clrnet_assigner.py deleted file mode 100644 index 59c94a0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/clrnet_assigner.py +++ /dev/null @@ -1,147 +0,0 @@ -import paddle -import paddle.nn.functional as F -from ppdet.modeling.losses.clrnet_line_iou_loss import line_iou - - -def distance_cost(predictions, targets, img_w): - """ - repeat predictions and targets to generate all combinations - use the abs distance as the new distance cost - """ - num_priors = predictions.shape[0] - num_targets = targets.shape[0] - predictions = paddle.repeat_interleave( - predictions, num_targets, axis=0)[..., 6:] - targets = paddle.concat(x=num_priors * [targets])[..., 6:] - invalid_masks = (targets < 0) | (targets >= img_w) - lengths = (~invalid_masks).sum(axis=1) - distances = paddle.abs(x=targets - predictions) - distances[invalid_masks] = 0.0 - distances = distances.sum(axis=1) / (lengths.cast("float32") + 1e-09) - distances = distances.reshape([num_priors, num_targets]) - return distances - - -def focal_cost(cls_pred, gt_labels, alpha=0.25, gamma=2, eps=1e-12): - """ - Args: - cls_pred (Tensor): Predicted classification logits, shape - [num_query, num_class]. - gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). 
- - Returns: - torch.Tensor: cls_cost value - """ - cls_pred = F.sigmoid(cls_pred) - neg_cost = -(1 - cls_pred + eps).log() * (1 - alpha) * cls_pred.pow(gamma) - pos_cost = -(cls_pred + eps).log() * alpha * (1 - cls_pred).pow(gamma) - cls_cost = pos_cost.index_select( - gt_labels, axis=1) - neg_cost.index_select( - gt_labels, axis=1) - return cls_cost - - -def dynamic_k_assign(cost, pair_wise_ious): - """ - Assign grouth truths with priors dynamically. - - Args: - cost: the assign cost. - pair_wise_ious: iou of grouth truth and priors. - - Returns: - prior_idx: the index of assigned prior. - gt_idx: the corresponding ground truth index. - """ - matching_matrix = paddle.zeros_like(cost) - ious_matrix = pair_wise_ious - ious_matrix[ious_matrix < 0] = 0.0 - n_candidate_k = 4 - topk_ious, _ = paddle.topk(ious_matrix, n_candidate_k, axis=0) - dynamic_ks = paddle.clip(x=topk_ious.sum(0).cast("int32"), min=1) - num_gt = cost.shape[1] - - for gt_idx in range(num_gt): - _, pos_idx = paddle.topk( - x=cost[:, gt_idx], k=dynamic_ks[gt_idx].item(), largest=False) - matching_matrix[pos_idx, gt_idx] = 1.0 - del topk_ious, dynamic_ks, pos_idx - matched_gt = matching_matrix.sum(axis=1) - - if (matched_gt > 1).sum() > 0: - matched_gt_indices = paddle.nonzero(matched_gt > 1)[:, 0] - cost_argmin = paddle.argmin( - cost.index_select(matched_gt_indices), axis=1) - matching_matrix[matched_gt_indices][0] *= 0.0 - matching_matrix[matched_gt_indices, cost_argmin] = 1.0 - - prior_idx = matching_matrix.sum(axis=1).nonzero() - gt_idx = matching_matrix[prior_idx].argmax(axis=-1) - return prior_idx.flatten(), gt_idx.flatten() - - -def cdist_paddle(x1, x2, p=2): - assert x1.shape[1] == x2.shape[1] - B, M = x1.shape - # if p == np.inf: - # dist = np.max(np.abs(x1[:, np.newaxis, :] - x2[np.newaxis, :, :]), axis=-1) - if p == 1: - dist = paddle.sum( - paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), axis=-1) - else: - dist = paddle.pow(paddle.sum(paddle.pow( - paddle.abs(x1.unsqueeze(axis=1) - x2.unsqueeze(axis=0)), p), - axis=-1), - 1 / p) - return dist - - -def assign(predictions, - targets, - img_w, - img_h, - distance_cost_weight=3.0, - cls_cost_weight=1.0): - """ - computes dynamicly matching based on the cost, including cls cost and lane similarity cost - Args: - predictions (Tensor): predictions predicted by each stage, shape: (num_priors, 78) - targets (Tensor): lane targets, shape: (num_targets, 78) - return: - matched_row_inds (Tensor): matched predictions, shape: (num_targets) - matched_col_inds (Tensor): matched targets, shape: (num_targets) - """ - predictions = predictions.detach().clone() - predictions[:, 3] *= img_w - 1 - predictions[:, 6:] *= img_w - 1 - - targets = targets.detach().clone() - distances_score = distance_cost(predictions, targets, img_w) - distances_score = 1 - distances_score / paddle.max(x=distances_score) + 0.01 - - cls_score = focal_cost(predictions[:, :2], targets[:, 1].cast('int64')) - - num_priors = predictions.shape[0] - num_targets = targets.shape[0] - target_start_xys = targets[:, 2:4] - target_start_xys[..., 0] *= (img_h - 1) - prediction_start_xys = predictions[:, 2:4] - prediction_start_xys[..., 0] *= (img_h - 1) - start_xys_score = cdist_paddle( - prediction_start_xys, target_start_xys, - p=2).reshape([num_priors, num_targets]) - - start_xys_score = 1 - start_xys_score / paddle.max(x=start_xys_score) + 0.01 - - target_thetas = targets[:, 4].unsqueeze(axis=-1) - theta_score = cdist_paddle( - predictions[:, 4].unsqueeze(axis=-1), target_thetas, - p=1).reshape([num_priors, 
num_targets]) * 180 - theta_score = 1 - theta_score / paddle.max(x=theta_score) + 0.01 - - cost = -(distances_score * start_xys_score * theta_score - )**2 * distance_cost_weight + cls_score * cls_cost_weight - iou = line_iou(predictions[..., 6:], targets[..., 6:], img_w, aligned=False) - - matched_row_inds, matched_col_inds = dynamic_k_assign(cost, iou) - return matched_row_inds, matched_col_inds diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/fcosr_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/fcosr_assigner.py deleted file mode 100644 index 46b743e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/fcosr_assigner.py +++ /dev/null @@ -1,227 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.rbox_utils import box2corners, check_points_in_polys, paddle_gather - -__all__ = ['FCOSRAssigner'] - -EPS = 1e-9 - - -@register -class FCOSRAssigner(nn.Layer): - """ FCOSR Assigner, refer to https://arxiv.org/abs/2111.10780 for details - - 1. compute normalized gaussian distribution score and refined gaussian distribution score - 2. refer to ellipse center sampling, sample points whose normalized gaussian distribution score is greater than threshold - 3. refer to multi-level sampling, assign ground truth to feature map which follows two conditions. - i). first, the ratio between the short edge of the target and the stride of the feature map is less than 2. - ii). second, the long edge of minimum bounding rectangle of the target is larger than the acceptance range of feature map - 4. 
refer to fuzzy sample label assignment, the points satisfying 2 and 3 will be assigned to the ground truth according to gaussian distribution score - """ - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - factor=12, - threshold=0.23, - boundary=[[-1, 128], [128, 320], [320, 10000]], - score_type='iou'): - super(FCOSRAssigner, self).__init__() - self.num_classes = num_classes - self.factor = factor - self.threshold = threshold - self.boundary = [ - paddle.to_tensor( - l, dtype=paddle.float32).reshape([1, 1, 2]) for l in boundary - ] - self.score_type = score_type - - def get_gaussian_distribution_score(self, points, gt_rboxes, gt_polys): - # projecting points to coordinate system defined by each rbox - # [B, N, 4, 2] -> 4 * [B, N, 1, 2] - a, b, c, d = gt_polys.split(4, axis=2) - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - ab = b - a - ad = d - a - # [B, N, 5] -> [B, N, 2], [B, N, 2], [B, N, 1] - xy, wh, angle = gt_rboxes.split([2, 2, 1], axis=-1) - # [B, N, 2] -> [B, N, 1, 2] - xy = xy.unsqueeze(2) - # vector of points to center [B, N, L, 2] - vec = points - xy - # = |ab| * |vec| * cos(theta) [B, N, L] - vec_dot_ab = paddle.sum(vec * ab, axis=-1) - # = |ad| * |vec| * cos(theta) [B, N, L] - vec_dot_ad = paddle.sum(vec * ad, axis=-1) - # norm_ab [B, N, L] - norm_ab = paddle.sum(ab * ab, axis=-1).sqrt() - # norm_ad [B, N, L] - norm_ad = paddle.sum(ad * ad, axis=-1).sqrt() - # min(h, w), [B, N, 1] - min_edge = paddle.min(wh, axis=-1, keepdim=True) - # delta_x, delta_y [B, N, L] - delta_x = vec_dot_ab.pow(2) / (norm_ab.pow(3) * min_edge + EPS) - delta_y = vec_dot_ad.pow(2) / (norm_ad.pow(3) * min_edge + EPS) - # score [B, N, L] - norm_score = paddle.exp(-0.5 * self.factor * (delta_x + delta_y)) - - # simplified calculation - sigma = min_edge / self.factor - refined_score = norm_score / (2 * np.pi * sigma + EPS) - return norm_score, refined_score - - def get_rotated_inside_mask(self, points, gt_polys, scores): - inside_mask = check_points_in_polys(points, gt_polys) - center_mask = scores >= self.threshold - return (inside_mask & center_mask).cast(paddle.float32) - - def get_inside_range_mask(self, points, gt_bboxes, gt_rboxes, stride_tensor, - regress_range): - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - # [B, n, 4] -> [B, n, 1, 4] - x1y1, x2y2 = gt_bboxes.unsqueeze(2).split(2, axis=-1) - # [B, n, L, 2] - lt = points - x1y1 - rb = x2y2 - points - # [B, n, L, 4] - ltrb = paddle.concat([lt, rb], axis=-1) - # [B, n, L, 4] -> [B, n, L] - inside_mask = paddle.min(ltrb, axis=-1) > EPS - # regress_range [1, L, 2] -> [1, 1, L, 2] - regress_range = regress_range.unsqueeze(0) - # stride_tensor [1, L, 1] -> [1, 1, L] - stride_tensor = stride_tensor.transpose((0, 2, 1)) - # fcos range - # [B, n, L, 4] -> [B, n, L] - ltrb_max = paddle.max(ltrb, axis=-1) - # [1, 1, L, 2] -> [1, 1, L] - low, high = regress_range[..., 0], regress_range[..., 1] - # [B, n, L] - regress_mask = (ltrb_max >= low) & (ltrb_max <= high) - # mask for rotated - # [B, n, 1] - min_edge = paddle.min(gt_rboxes[..., 2:4], axis=-1, keepdim=True) - # [B, n , L] - rotated_mask = ((min_edge / stride_tensor) < 2.0) & (ltrb_max > high) - mask = inside_mask & (regress_mask | rotated_mask) - return mask.cast(paddle.float32) - - @paddle.no_grad() - def forward(self, - anchor_points, - stride_tensor, - num_anchors_list, - gt_labels, - gt_bboxes, - gt_rboxes, - pad_gt_mask, - bg_index, - pred_rboxes=None): - r""" - - Args: - anchor_points (Tensor, float32): pre-defined anchor points, shape(1, L, 
2), - "x, y" format - stride_tensor (Tensor, float32): stride tensor, shape (1, L, 1) - num_anchors_list (List): num of anchors in each level - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - gt_rboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - pred_rboxes (Tensor, float32, optional): predicted bounding boxes, shape(B, L, 5) - Returns: - assigned_labels (Tensor): (B, L) - assigned_rboxes (Tensor): (B, L, 5) - assigned_scores (Tensor): (B, L, C), if pred_rboxes is not None, then output ious - """ - - _, num_anchors, _ = anchor_points.shape - batch_size, num_max_boxes, _ = gt_rboxes.shape - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) - assigned_rboxes = paddle.zeros([batch_size, num_anchors, 5]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, self.num_classes]) - return assigned_labels, assigned_rboxes, assigned_scores - - # get normalized gaussian distribution score and refined distribution score - gt_polys = box2corners(gt_rboxes) - score, refined_score = self.get_gaussian_distribution_score( - anchor_points, gt_rboxes, gt_polys) - inside_mask = self.get_rotated_inside_mask(anchor_points, gt_polys, - score) - regress_ranges = [] - for num, bound in zip(num_anchors_list, self.boundary): - regress_ranges.append(bound.tile((1, num, 1))) - regress_ranges = paddle.concat(regress_ranges, axis=1) - regress_mask = self.get_inside_range_mask( - anchor_points, gt_bboxes, gt_rboxes, stride_tensor, regress_ranges) - # [B, n, L] - mask_positive = inside_mask * regress_mask * pad_gt_mask - refined_score = refined_score * mask_positive - (1. 
- mask_positive) - - argmax_refined_score = refined_score.argmax(axis=-2) - max_refined_score = refined_score.max(axis=-2) - assigned_gt_index = argmax_refined_score - - # assigned target - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - max_refined_score > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_rboxes = paddle.gather( - gt_rboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) - assigned_rboxes = assigned_rboxes.reshape([batch_size, num_anchors, 5]) - - assigned_scores = F.one_hot(assigned_labels, self.num_classes + 1) - ind = list(range(self.num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - - if self.score_type == 'gaussian': - selected_scores = paddle_gather( - score, 1, argmax_refined_score.unsqueeze(-2)).squeeze(-2) - assigned_scores = assigned_scores * selected_scores.unsqueeze(-1) - elif self.score_type == 'iou': - assert pred_rboxes is not None, 'If score type is iou, pred_rboxes should not be None' - from ext_op import matched_rbox_iou - b, l = pred_rboxes.shape[:2] - iou_score = matched_rbox_iou( - pred_rboxes.reshape((-1, 5)), assigned_rboxes.reshape( - (-1, 5))).reshape((b, l, 1)) - assigned_scores = assigned_scores * iou_score - - return assigned_labels, assigned_rboxes, assigned_scores \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/hungarian_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/hungarian_assigner.py deleted file mode 100644 index 154c27c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/hungarian_assigner.py +++ /dev/null @@ -1,316 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -try: - from scipy.optimize import linear_sum_assignment -except ImportError: - linear_sum_assignment = None - -import paddle - -from ppdet.core.workspace import register - -__all__ = ['PoseHungarianAssigner', 'PseudoSampler'] - - -class AssignResult: - """Stores assignments between predicted and truth boxes. - - Attributes: - num_gts (int): the number of truth boxes considered when computing this - assignment - - gt_inds (LongTensor): for each predicted box indicates the 1-based - index of the assigned truth box. 0 means unassigned and -1 means - ignore. - - max_overlaps (FloatTensor): the iou between the predicted box and its - assigned truth box. - - labels (None | LongTensor): If specified, for each predicted box - indicates the category label of the assigned truth box. 
- """ - - def __init__(self, num_gts, gt_inds, max_overlaps, labels=None): - self.num_gts = num_gts - self.gt_inds = gt_inds - self.max_overlaps = max_overlaps - self.labels = labels - # Interface for possible user-defined properties - self._extra_properties = {} - - @property - def num_preds(self): - """int: the number of predictions in this assignment""" - return len(self.gt_inds) - - def set_extra_property(self, key, value): - """Set user-defined new property.""" - assert key not in self.info - self._extra_properties[key] = value - - def get_extra_property(self, key): - """Get user-defined property.""" - return self._extra_properties.get(key, None) - - @property - def info(self): - """dict: a dictionary of info about the object""" - basic_info = { - 'num_gts': self.num_gts, - 'num_preds': self.num_preds, - 'gt_inds': self.gt_inds, - 'max_overlaps': self.max_overlaps, - 'labels': self.labels, - } - basic_info.update(self._extra_properties) - return basic_info - - -@register -class PoseHungarianAssigner: - """Computes one-to-one matching between predictions and ground truth. - - This class computes an assignment between the targets and the predictions - based on the costs. The costs are weighted sum of three components: - classification cost, regression L1 cost and regression oks cost. The - targets don't include the no_object, so generally there are more - predictions than targets. After the one-to-one matching, the un-matched - are treated as backgrounds. Thus each query prediction will be assigned - with `0` or a positive integer indicating the ground truth index: - - - 0: negative sample, no assigned gt. - - positive integer: positive sample, index (1-based) of assigned gt. - - Args: - cls_weight (int | float, optional): The scale factor for classification - cost. Default 1.0. - kpt_weight (int | float, optional): The scale factor for regression - L1 cost. Default 1.0. - oks_weight (int | float, optional): The scale factor for regression - oks cost. Default 1.0. - """ - __inject__ = ['cls_cost', 'kpt_cost', 'oks_cost'] - - def __init__(self, - cls_cost='ClassificationCost', - kpt_cost='KptL1Cost', - oks_cost='OksCost'): - self.cls_cost = cls_cost - self.kpt_cost = kpt_cost - self.oks_cost = oks_cost - - def assign(self, - cls_pred, - kpt_pred, - gt_labels, - gt_keypoints, - gt_areas, - img_meta, - eps=1e-7): - """Computes one-to-one matching based on the weighted costs. - - This method assign each query prediction to a ground truth or - background. The `assigned_gt_inds` with -1 means don't care, - 0 means negative sample, and positive number is the index (1-based) - of assigned gt. - The assignment is done in the following steps, the order matters. - - 1. assign every prediction to -1 - 2. compute the weighted costs - 3. do Hungarian matching on CPU based on the costs - 4. assign all to 0 (background) first, then for each matched pair - between predictions and gts, treat this prediction as foreground - and assign the corresponding gt index (plus 1) to it. - - Args: - cls_pred (Tensor): Predicted classification logits, shape - [num_query, num_class]. - kpt_pred (Tensor): Predicted keypoints with normalized coordinates - (x_{i}, y_{i}), which are all in range [0, 1]. Shape - [num_query, K*2]. - gt_labels (Tensor): Label of `gt_keypoints`, shape (num_gt,). - gt_keypoints (Tensor): Ground truth keypoints with unnormalized - coordinates [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ - p^{K}_x, p^{K}_y, p^{K}_v]. Shape [num_gt, K*3]. - gt_areas (Tensor): Ground truth mask areas, shape (num_gt,). 
- img_meta (dict): Meta information for current image. - eps (int | float, optional): A value added to the denominator for - numerical stability. Default 1e-7. - - Returns: - :obj:`AssignResult`: The assigned result. - """ - num_gts, num_kpts = gt_keypoints.shape[0], kpt_pred.shape[0] - if not gt_keypoints.astype('bool').any(): - num_gts = 0 - - # 1. assign -1 by default - assigned_gt_inds = paddle.full((num_kpts, ), -1, dtype="int64") - assigned_labels = paddle.full((num_kpts, ), -1, dtype="int64") - if num_gts == 0 or num_kpts == 0: - # No ground truth or keypoints, return empty assignment - if num_gts == 0: - # No ground truth, assign all to background - assigned_gt_inds[:] = 0 - return AssignResult( - num_gts, assigned_gt_inds, None, labels=assigned_labels) - img_h, img_w, _ = img_meta['img_shape'] - factor = paddle.to_tensor( - [img_w, img_h, img_w, img_h], dtype=gt_keypoints.dtype).reshape( - (1, -1)) - - # 2. compute the weighted costs - # classification cost - cls_cost = self.cls_cost(cls_pred, gt_labels) - - # keypoint regression L1 cost - gt_keypoints_reshape = gt_keypoints.reshape((gt_keypoints.shape[0], -1, - 3)) - valid_kpt_flag = gt_keypoints_reshape[..., -1] - kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, - 2)) - normalize_gt_keypoints = gt_keypoints_reshape[ - ..., :2] / factor[:, :2].unsqueeze(0) - kpt_cost = self.kpt_cost(kpt_pred_tmp, normalize_gt_keypoints, - valid_kpt_flag) - # keypoint OKS cost - kpt_pred_tmp = kpt_pred.clone().detach().reshape((kpt_pred.shape[0], -1, - 2)) - kpt_pred_tmp = kpt_pred_tmp * factor[:, :2].unsqueeze(0) - oks_cost = self.oks_cost(kpt_pred_tmp, gt_keypoints_reshape[..., :2], - valid_kpt_flag, gt_areas) - # weighted sum of above three costs - cost = cls_cost + kpt_cost + oks_cost - - # 3. do Hungarian matching on CPU using linear_sum_assignment - cost = cost.detach().cpu() - if linear_sum_assignment is None: - raise ImportError('Please run "pip install scipy" ' - 'to install scipy first.') - matched_row_inds, matched_col_inds = linear_sum_assignment(cost) - matched_row_inds = paddle.to_tensor(matched_row_inds) - matched_col_inds = paddle.to_tensor(matched_col_inds) - - # 4. assign backgrounds and foregrounds - # assign all indices to backgrounds first - assigned_gt_inds[:] = 0 - # assign foregrounds based on matching results - assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 - assigned_labels[matched_row_inds] = gt_labels[matched_col_inds][ - ..., 0].astype("int64") - return AssignResult( - num_gts, assigned_gt_inds, None, labels=assigned_labels) - - -class SamplingResult: - """Bbox sampling result. 
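A toy illustration of step 3 above (Hungarian matching on a CPU cost matrix), assuming scipy is installed; the cost values are made up:

import paddle
from scipy.optimize import linear_sum_assignment

cost = paddle.to_tensor([[0.2, 0.9], [0.8, 0.1], [0.5, 0.6]])  # [num_query, num_gt], toy values
matched_row_inds, matched_col_inds = linear_sum_assignment(cost.numpy())
# matched_row_inds -> query indices, matched_col_inds -> 0-based gt indices;
# the assigner stores matched_col_inds + 1 so that 0 can denote background.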
- """ - - def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, - gt_flags): - self.pos_inds = pos_inds - self.neg_inds = neg_inds - if pos_inds.size > 0: - self.pos_bboxes = bboxes[pos_inds] - self.neg_bboxes = bboxes[neg_inds] - self.pos_is_gt = gt_flags[pos_inds] - - self.num_gts = gt_bboxes.shape[0] - self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1 - - if gt_bboxes.numel() == 0: - # hack for index error case - assert self.pos_assigned_gt_inds.numel() == 0 - self.pos_gt_bboxes = paddle.zeros( - gt_bboxes.shape, dtype=gt_bboxes.dtype).reshape((-1, 4)) - else: - if len(gt_bboxes.shape) < 2: - gt_bboxes = gt_bboxes.reshape((-1, 4)) - - self.pos_gt_bboxes = paddle.index_select( - gt_bboxes, - self.pos_assigned_gt_inds.astype('int64'), - axis=0) - - if assign_result.labels is not None: - self.pos_gt_labels = assign_result.labels[pos_inds] - else: - self.pos_gt_labels = None - - @property - def bboxes(self): - """paddle.Tensor: concatenated positive and negative boxes""" - return paddle.concat([self.pos_bboxes, self.neg_bboxes]) - - def __nice__(self): - data = self.info.copy() - data['pos_bboxes'] = data.pop('pos_bboxes').shape - data['neg_bboxes'] = data.pop('neg_bboxes').shape - parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())] - body = ' ' + ',\n '.join(parts) - return '{\n' + body + '\n}' - - @property - def info(self): - """Returns a dictionary of info about the object.""" - return { - 'pos_inds': self.pos_inds, - 'neg_inds': self.neg_inds, - 'pos_bboxes': self.pos_bboxes, - 'neg_bboxes': self.neg_bboxes, - 'pos_is_gt': self.pos_is_gt, - 'num_gts': self.num_gts, - 'pos_assigned_gt_inds': self.pos_assigned_gt_inds, - } - - -@register -class PseudoSampler: - """A pseudo sampler that does not do sampling actually.""" - - def __init__(self, **kwargs): - pass - - def _sample_pos(self, **kwargs): - """Sample positive samples.""" - raise NotImplementedError - - def _sample_neg(self, **kwargs): - """Sample negative samples.""" - raise NotImplementedError - - def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs): - """Directly returns the positive and negative indices of samples. - - Args: - assign_result (:obj:`AssignResult`): Assigned results - bboxes (paddle.Tensor): Bounding boxes - gt_bboxes (paddle.Tensor): Ground truth boxes - - Returns: - :obj:`SamplingResult`: sampler results - """ - pos_inds = paddle.nonzero( - assign_result.gt_inds > 0, as_tuple=False).squeeze(-1) - neg_inds = paddle.nonzero( - assign_result.gt_inds == 0, as_tuple=False).squeeze(-1) - gt_flags = paddle.zeros([bboxes.shape[0]], dtype='int32') - sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, - assign_result, gt_flags) - return sampling_result diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/max_iou_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/max_iou_assigner.py deleted file mode 100644 index 98a4fdf..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/max_iou_assigner.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from ppdet.core.workspace import register -from ppdet.modeling.proposal_generator.target import label_box - -__all__ = ['MaxIoUAssigner'] - -@register -class MaxIoUAssigner(object): - """A standard bbox assigner based on max IoU, using ppdet's label_box - as the backend. - Args: - positive_overlap (float): threshold for defining positive samples - negative_overlap (float): threshold for defining negative samples - allow_low_quality (bool): whether to lower IoU thr if a GT poorly - overlaps with candidate bboxes - """ - def __init__(self, - positive_overlap, - negative_overlap, - allow_low_quality=True): - self.positive_overlap = positive_overlap - self.negative_overlap = negative_overlap - self.allow_low_quality = allow_low_quality - - def __call__(self, bboxes, gt_bboxes): - matches, match_labels = label_box( - bboxes, - gt_bboxes, - positive_overlap=self.positive_overlap, - negative_overlap=self.negative_overlap, - allow_low_quality=self.allow_low_quality, - ignore_thresh=-1, - is_crowd=None, - assign_on_cpu=False) - return matches, match_labels diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/pose_utils.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/pose_utils.py deleted file mode 100644 index 313215a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/pose_utils.py +++ /dev/null @@ -1,275 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import paddle -import paddle.nn.functional as F - -from ppdet.core.workspace import register - -__all__ = ['KptL1Cost', 'OksCost', 'ClassificationCost'] - - -def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -@register -class KptL1Cost(object): - """KptL1Cost. - - this function is based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py - - Args: - weight (int | float, optional): loss_weight. - """ - - def __init__(self, weight=1.0): - self.weight = weight - - def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag): - """ - Args: - kpt_pred (Tensor): Predicted keypoints with normalized coordinates - (x_{i}, y_{i}), which are all in range [0, 1]. Shape - [num_query, K, 2]. - gt_keypoints (Tensor): Ground truth keypoints with normalized - coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2]. - valid_kpt_flag (Tensor): valid flag of ground truth keypoints. - Shape [num_gt, K]. - - Returns: - paddle.Tensor: kpt_cost value with weight.
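The masking-and-normalization recipe above can be reproduced in a few lines; a minimal sketch with toy shapes (3 queries, 2 gts, 4 keypoints), not the original implementation:

import paddle

kpt_pred = paddle.rand([3, 4, 2])    # normalized (x, y) predictions
gt_kpts = paddle.rand([2, 4, 2])     # normalized ground-truth keypoints
valid = paddle.to_tensor([[1., 1., 0., 1.], [1., 0., 0., 1.]])  # visibility flags

costs = []
for i in range(gt_kpts.shape[0]):
    mask = valid[i].unsqueeze(-1)                        # zero out invisible joints
    diff = (kpt_pred - gt_kpts[i].unsqueeze(0)) * mask
    avg_factor = paddle.clip(mask.sum() * 2, min=1.0)    # 2 coords per visible joint
    costs.append((diff.abs().sum(axis=[1, 2]) / avg_factor).unsqueeze(-1))
cost_matrix = paddle.concat(costs, axis=1)               # [num_query, num_gt] L1 cost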
- """ - kpt_cost = [] - for i in range(len(gt_keypoints)): - if gt_keypoints[i].size == 0: - kpt_cost.append(kpt_pred.sum() * 0) - kpt_pred_tmp = kpt_pred.clone() - valid_flag = valid_kpt_flag[i] > 0 - valid_flag_expand = valid_flag.unsqueeze(0).unsqueeze(-1).expand_as( - kpt_pred_tmp) - if not valid_flag_expand.all(): - kpt_pred_tmp = masked_fill(kpt_pred_tmp, ~valid_flag_expand, 0) - cost = F.pairwise_distance( - kpt_pred_tmp.reshape((kpt_pred_tmp.shape[0], -1)), - gt_keypoints[i].reshape((-1, )).unsqueeze(0), - p=1, - keepdim=True) - avg_factor = paddle.clip( - valid_flag.astype('float32').sum() * 2, 1.0) - cost = cost / avg_factor - kpt_cost.append(cost) - kpt_cost = paddle.concat(kpt_cost, axis=1) - return kpt_cost * self.weight - - -@register -class OksCost(object): - """OksCost. - - this function based on: https://github.com/hikvision-research/opera/blob/main/opera/core/bbox/match_costs/match_cost.py - - Args: - num_keypoints (int): number of keypoints - weight (int | float, optional): loss_weight. - """ - - def __init__(self, num_keypoints=17, weight=1.0): - self.weight = weight - if num_keypoints == 17: - self.sigmas = np.array( - [ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, - 1.07, .87, .87, .89, .89 - ], - dtype=np.float32) / 10.0 - elif num_keypoints == 14: - self.sigmas = np.array( - [ - .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, - .89, .79, .79 - ], - dtype=np.float32) / 10.0 - else: - raise ValueError(f'Unsupported keypoints number {num_keypoints}') - - def __call__(self, kpt_pred, gt_keypoints, valid_kpt_flag, gt_areas): - """ - Args: - kpt_pred (Tensor): Predicted keypoints with unnormalized - coordinates (x_{i}, y_{i}). Shape [num_query, K, 2]. - gt_keypoints (Tensor): Ground truth keypoints with unnormalized - coordinates (x_{i}, y_{i}). Shape [num_gt, K, 2]. - valid_kpt_flag (Tensor): valid flag of ground truth keypoints. - Shape [num_gt, K]. - gt_areas (Tensor): Ground truth mask areas. Shape [num_gt,]. - - Returns: - paddle.Tensor: oks_cost value with weight. - """ - sigmas = paddle.to_tensor(self.sigmas) - variances = (sigmas * 2)**2 - - oks_cost = [] - assert len(gt_keypoints) == len(gt_areas) - for i in range(len(gt_keypoints)): - if gt_keypoints[i].size == 0: - oks_cost.append(kpt_pred.sum() * 0) - squared_distance = \ - (kpt_pred[:, :, 0] - gt_keypoints[i, :, 0].unsqueeze(0)) ** 2 + \ - (kpt_pred[:, :, 1] - gt_keypoints[i, :, 1].unsqueeze(0)) ** 2 - vis_flag = (valid_kpt_flag[i] > 0).astype('int') - vis_ind = vis_flag.nonzero(as_tuple=False)[:, 0] - num_vis_kpt = vis_ind.shape[0] - # assert num_vis_kpt > 0 - if num_vis_kpt == 0: - oks_cost.append(paddle.zeros((squared_distance.shape[0], 1))) - continue - area = gt_areas[i] - - squared_distance0 = squared_distance / (area * variances * 2) - squared_distance0 = paddle.index_select( - squared_distance0, vis_ind, axis=1) - squared_distance1 = paddle.exp(-squared_distance0).sum(axis=1, - keepdim=True) - oks = squared_distance1 / num_vis_kpt - # The 1 is a constant that doesn't change the matching, so omitted. - oks_cost.append(-oks) - oks_cost = paddle.concat(oks_cost, axis=1) - return oks_cost * self.weight - - -@register -class ClassificationCost: - """ClsSoftmaxCost. - - Args: - weight (int | float, optional): loss_weight - """ - - def __init__(self, weight=1.): - self.weight = weight - - def __call__(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classification logits, shape - (num_query, num_class). 
- gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). - - Returns: - paddle.Tensor: cls_cost value with weight - """ - # Following the official DETR repo, contrary to the loss that - # NLL is used, we approximate it in 1 - cls_score[gt_label]. - # The 1 is a constant that doesn't change the matching, - # so it can be omitted. - cls_score = cls_pred.softmax(-1) - cls_cost = -cls_score[:, gt_labels] - return cls_cost * self.weight - - -@register -class FocalLossCost: - """FocalLossCost. - - Args: - weight (int | float, optional): loss_weight - alpha (int | float, optional): focal_loss alpha - gamma (int | float, optional): focal_loss gamma - eps (float, optional): default 1e-12 - binary_input (bool, optional): Whether the input is binary, - default False. - """ - - def __init__(self, - weight=1., - alpha=0.25, - gamma=2, - eps=1e-12, - binary_input=False): - self.weight = weight - self.alpha = alpha - self.gamma = gamma - self.eps = eps - self.binary_input = binary_input - - def _focal_loss_cost(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classification logits, shape - (num_query, num_class). - gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). - - Returns: - paddle.Tensor: cls_cost value with weight - """ - if gt_labels.size == 0: - return cls_pred.sum() * 0 - cls_pred = F.sigmoid(cls_pred) - neg_cost = -(1 - cls_pred + self.eps).log() * ( - 1 - self.alpha) * cls_pred.pow(self.gamma) - pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( - 1 - cls_pred).pow(self.gamma) - - cls_cost = paddle.index_select( - pos_cost, gt_labels, axis=1) - paddle.index_select( - neg_cost, gt_labels, axis=1) - return cls_cost * self.weight - - def _mask_focal_loss_cost(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classfication logits - in shape (num_query, d1, ..., dn), dtype=paddle.float32. - gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn), - dtype=paddle.long. Labels should be binary. - - Returns: - Tensor: Focal cost matrix with weight in shape\ - (num_query, num_gt). - """ - cls_pred = cls_pred.flatten(1) - gt_labels = gt_labels.flatten(1).float() - n = cls_pred.shape[1] - cls_pred = F.sigmoid(cls_pred) - neg_cost = -(1 - cls_pred + self.eps).log() * ( - 1 - self.alpha) * cls_pred.pow(self.gamma) - pos_cost = -(cls_pred + self.eps).log() * self.alpha * ( - 1 - cls_pred).pow(self.gamma) - - cls_cost = paddle.einsum('nc,mc->nm', pos_cost, gt_labels) + \ - paddle.einsum('nc,mc->nm', neg_cost, (1 - gt_labels)) - return cls_cost / n * self.weight - - def __call__(self, cls_pred, gt_labels): - """ - Args: - cls_pred (Tensor): Predicted classfication logits. - gt_labels (Tensor)): Labels. - - Returns: - Tensor: Focal cost matrix with weight in shape\ - (num_query, num_gt). - """ - if self.binary_input: - return self._mask_focal_loss_cost(cls_pred, gt_labels) - else: - return self._focal_loss_cost(cls_pred, gt_labels) diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/rotated_task_aligned_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/rotated_task_aligned_assigner.py deleted file mode 100644 index eeb9a68..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/rotated_task_aligned_assigner.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..rbox_utils import rotated_iou_similarity, check_points_in_rotated_boxes -from .utils import gather_topk_anchors, compute_max_iou_anchor - -__all__ = ['RotatedTaskAlignedAssigner'] - - -@register -class RotatedTaskAlignedAssigner(nn.Layer): - """TOOD: Task-aligned One-stage Object Detection - """ - - def __init__(self, topk=13, alpha=1.0, beta=6.0, eps=1e-9): - super(RotatedTaskAlignedAssigner, self).__init__() - self.topk = topk - self.alpha = alpha - self.beta = beta - self.eps = eps - - @paddle.no_grad() - def forward(self, - pred_scores, - pred_bboxes, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py - - The assignment is done in following steps - 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt - 2. select top-k bbox as candidates for each gt - 3. limit the positive sample's center in gt (because the anchor-free detector - only can predict positive distance) - 4. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. 
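Step 1 above boils down to the task-alignment metric t = s**alpha * u**beta; a toy sketch with made-up shapes (batch 1, 2 gts, 5 anchors):

import paddle

alpha, beta = 1.0, 6.0
cls_score = paddle.rand([1, 2, 5])   # s: predicted probability of each gt's class
ious = paddle.rand([1, 2, 5])        # u: IoU between predicted rbox and gt
align_metric = cls_score.pow(alpha) * ious.pow(beta)
# step 2: top-k candidate anchors per gt along the anchor axis
topk_metric, topk_idx = paddle.topk(align_metric, k=3, axis=-1)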
- Args: - pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) - pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 5) - anchor_points (Tensor, float32): pre-defined anchors, shape(1, L, 2), "cxcy" format - num_anchors_list (List): num of anchors in each level, shape(L) - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 5) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 5) - assigned_scores (Tensor): (B, L, C) - """ - assert pred_scores.ndim == pred_bboxes.ndim - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - batch_size, num_anchors, num_classes = pred_scores.shape - _, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype=gt_labels.dtype) - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 5]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # compute iou between gt and pred bbox, [B, n, L] - ious = rotated_iou_similarity(gt_bboxes, pred_bboxes) - ious = paddle.where(ious > 1 + self.eps, paddle.zeros_like(ious), ious) - ious.stop_gradient = True - # gather pred bboxes class score - pred_scores = pred_scores.transpose([0, 2, 1]) - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - gt_labels_ind = paddle.stack( - [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], - axis=-1) - bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) - # compute alignment metrics, [B, n, L] - alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( - self.beta) - - # check the positive sample's center in gt, [B, n, L] - is_in_gts = check_points_in_rotated_boxes(anchor_points, gt_bboxes) - - # select topk largest alignment metrics pred bbox as candidates - # for each gt, [B, n, L] - is_in_topk = gather_topk_anchors( - alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) - - # select positive sample, [B, n, L] - mask_positive = is_in_topk * is_in_gts * pad_gt_mask - - # if an anchor box is assigned to multiple gts, - # the one with the highest iou will be selected, [B, n, L] - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( - [1, num_max_boxes, 1]) - is_max_iou = compute_max_iou_anchor(ious) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 5]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 5]) - - assigned_scores = F.one_hot(assigned_labels, num_classes + 1) - ind = 
list(range(num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - # rescale alignment metrics - alignment_metrics *= mask_positive - max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) - max_ious_per_instance = (ious * mask_positive).max(axis=-1, - keepdim=True) - alignment_metrics = alignment_metrics / ( - max_metrics_per_instance + self.eps) * max_ious_per_instance - alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) - assigned_scores = assigned_scores * alignment_metrics - - assigned_bboxes.stop_gradient = True - assigned_scores.stop_gradient = True - assigned_labels.stop_gradient = True - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/simota_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/simota_assigner.py deleted file mode 100644 index 4ec87cb..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/simota_assigner.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/bbox/assigners/sim_ota_assigner.py - -import paddle -import numpy as np -import paddle.nn.functional as F - -from ppdet.modeling.losses.varifocal_loss import varifocal_loss -from ppdet.modeling.bbox_utils import batch_bbox_overlaps -from ppdet.core.workspace import register - - -@register -class SimOTAAssigner(object): - """Computes matching between predictions and ground truth. - Args: - center_radius (int | float, optional): Ground truth center size - to judge whether a prior is in center. Default 2.5. - candidate_topk (int, optional): The candidate top-k which used to - get top-k ious to calculate dynamic-k. Default 10. - iou_weight (int | float, optional): The scale factor for regression - iou cost. Default 3.0. - cls_weight (int | float, optional): The scale factor for classification - cost. Default 1.0. - num_classes (int): The num_classes of dataset. - use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix. 
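A compact sketch of the dynamic-k rule this assigner applies below (each gt's k is the clipped integer sum of its top candidate IoUs); shapes and values are illustrative:

import paddle

candidate_topk = 10
pairwise_ious = paddle.rand([100, 3])                    # [num_candidates, num_gt], toy
topk_ious, _ = paddle.topk(pairwise_ious, k=candidate_topk, axis=0)
dynamic_ks = paddle.clip(topk_ious.sum(axis=0).cast('int32'), min=1)
# each gt g then keeps its dynamic_ks[g] lowest-cost candidates as positives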
- """ - __shared__ = ['num_classes'] - - def __init__(self, - center_radius=2.5, - candidate_topk=10, - iou_weight=3.0, - cls_weight=1.0, - num_classes=80, - use_vfl=True): - self.center_radius = center_radius - self.candidate_topk = candidate_topk - self.iou_weight = iou_weight - self.cls_weight = cls_weight - self.num_classes = num_classes - self.use_vfl = use_vfl - - def get_in_gt_and_in_center_info(self, flatten_center_and_stride, - gt_bboxes): - num_gt = gt_bboxes.shape[0] - - flatten_x = flatten_center_and_stride[:, 0].unsqueeze(1).tile( - [1, num_gt]) - flatten_y = flatten_center_and_stride[:, 1].unsqueeze(1).tile( - [1, num_gt]) - flatten_stride_x = flatten_center_and_stride[:, 2].unsqueeze(1).tile( - [1, num_gt]) - flatten_stride_y = flatten_center_and_stride[:, 3].unsqueeze(1).tile( - [1, num_gt]) - - # is prior centers in gt bboxes, shape: [n_center, n_gt] - l_ = flatten_x - gt_bboxes[:, 0] - t_ = flatten_y - gt_bboxes[:, 1] - r_ = gt_bboxes[:, 2] - flatten_x - b_ = gt_bboxes[:, 3] - flatten_y - - deltas = paddle.stack([l_, t_, r_, b_], axis=1) - is_in_gts = deltas.min(axis=1) > 0 - is_in_gts_all = is_in_gts.sum(axis=1) > 0 - - # is prior centers in gt centers - gt_center_xs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0 - gt_center_ys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0 - ct_bound_l = gt_center_xs - self.center_radius * flatten_stride_x - ct_bound_t = gt_center_ys - self.center_radius * flatten_stride_y - ct_bound_r = gt_center_xs + self.center_radius * flatten_stride_x - ct_bound_b = gt_center_ys + self.center_radius * flatten_stride_y - - cl_ = flatten_x - ct_bound_l - ct_ = flatten_y - ct_bound_t - cr_ = ct_bound_r - flatten_x - cb_ = ct_bound_b - flatten_y - - ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1) - is_in_cts = ct_deltas.min(axis=1) > 0 - is_in_cts_all = is_in_cts.sum(axis=1) > 0 - - # in any of gts or gt centers, shape: [n_center] - is_in_gts_or_centers_all = paddle.logical_or(is_in_gts_all, - is_in_cts_all) - - is_in_gts_or_centers_all_inds = paddle.nonzero( - is_in_gts_or_centers_all).squeeze(1) - - # both in gts and gt centers, shape: [num_fg, num_gt] - is_in_gts_and_centers = paddle.logical_and( - paddle.gather( - is_in_gts.cast('int'), is_in_gts_or_centers_all_inds, - axis=0).cast('bool'), - paddle.gather( - is_in_cts.cast('int'), is_in_gts_or_centers_all_inds, - axis=0).cast('bool')) - return is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_gts_and_centers - - def dynamic_k_matching(self, cost_matrix, pairwise_ious, num_gt): - match_matrix = np.zeros_like(cost_matrix.numpy()) - # select candidate topk ious for dynamic-k calculation - topk_ious, _ = paddle.topk( - pairwise_ious, - min(self.candidate_topk, pairwise_ious.shape[0]), - axis=0) - # calculate dynamic k for each gt - dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1) - for gt_idx in range(num_gt): - _, pos_idx = paddle.topk( - cost_matrix[:, gt_idx], k=dynamic_ks[gt_idx], largest=False) - match_matrix[:, gt_idx][pos_idx.numpy()] = 1.0 - - del topk_ious, dynamic_ks, pos_idx - - # match points more than two gts - extra_match_gts_mask = match_matrix.sum(1) > 1 - if extra_match_gts_mask.sum() > 0: - cost_matrix = cost_matrix.numpy() - cost_argmin = np.argmin( - cost_matrix[extra_match_gts_mask, :], axis=1) - match_matrix[extra_match_gts_mask, :] *= 0.0 - match_matrix[extra_match_gts_mask, cost_argmin] = 1.0 - # get foreground mask - match_fg_mask_inmatrix = match_matrix.sum(1) > 0 - match_gt_inds_to_fg = match_matrix[match_fg_mask_inmatrix, :].argmax(1) - - return 
match_gt_inds_to_fg, match_fg_mask_inmatrix - - def get_sample(self, assign_gt_inds, gt_bboxes): - pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0]) - neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0]) - pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1 - - if gt_bboxes.size == 0: - # hack for index error case - assert pos_assigned_gt_inds.size == 0 - pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4) - else: - if len(gt_bboxes.shape) < 2: - gt_bboxes = gt_bboxes.resize(-1, 4) - pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :] - return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds - - def __call__(self, - flatten_cls_pred_scores, - flatten_center_and_stride, - flatten_bboxes, - gt_bboxes, - gt_labels, - eps=1e-7): - """Assign gt to priors using SimOTA. - TODO: add comment. - Returns: - assign_result: The assigned result. - """ - num_gt = gt_bboxes.shape[0] - num_bboxes = flatten_bboxes.shape[0] - - if num_gt == 0 or num_bboxes == 0: - # No ground truth or boxes - label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes - label_weight = np.ones([num_bboxes], dtype=np.float32) - bbox_target = np.zeros_like(flatten_center_and_stride) - return 0, label, label_weight, bbox_target - - is_in_gts_or_centers_all, is_in_gts_or_centers_all_inds, is_in_boxes_and_center = self.get_in_gt_and_in_center_info( - flatten_center_and_stride, gt_bboxes) - - # bboxes and scores to calculate matrix - valid_flatten_bboxes = flatten_bboxes[is_in_gts_or_centers_all_inds] - valid_cls_pred_scores = flatten_cls_pred_scores[ - is_in_gts_or_centers_all_inds] - num_valid_bboxes = valid_flatten_bboxes.shape[0] - - pairwise_ious = batch_bbox_overlaps(valid_flatten_bboxes, - gt_bboxes) # [num_points,num_gts] - if self.use_vfl: - gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile( - [num_valid_bboxes, 1]).reshape([-1]) - valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( - [1, num_gt, 1]).reshape([-1, self.num_classes]) - vfl_score = np.zeros(valid_pred_scores.shape) - vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy( - )] = pairwise_ious.reshape([-1]) - vfl_score = paddle.to_tensor(vfl_score) - losses_vfl = varifocal_loss( - valid_pred_scores, vfl_score, - use_sigmoid=False).reshape([num_valid_bboxes, num_gt]) - losses_giou = batch_bbox_overlaps( - valid_flatten_bboxes, gt_bboxes, mode='giou') - cost_matrix = ( - losses_vfl * self.cls_weight + losses_giou * self.iou_weight + - paddle.logical_not(is_in_boxes_and_center).cast('float32') * - 100000000) - else: - iou_cost = -paddle.log(pairwise_ious + eps) - gt_onehot_label = (F.one_hot( - gt_labels.squeeze(-1).cast(paddle.int64), - flatten_cls_pred_scores.shape[-1]).cast('float32').unsqueeze(0) - .tile([num_valid_bboxes, 1, 1])) - - valid_pred_scores = valid_cls_pred_scores.unsqueeze(1).tile( - [1, num_gt, 1]) - cls_cost = F.binary_cross_entropy( - valid_pred_scores, gt_onehot_label, reduction='none').sum(-1) - - cost_matrix = ( - cls_cost * self.cls_weight + iou_cost * self.iou_weight + - paddle.logical_not(is_in_boxes_and_center).cast('float32') * - 100000000) - - match_gt_inds_to_fg, match_fg_mask_inmatrix = \ - self.dynamic_k_matching( - cost_matrix, pairwise_ious, num_gt) - - # sample and assign results - assigned_gt_inds = np.zeros([num_bboxes], dtype=np.int64) - match_fg_mask_inall = np.zeros_like(assigned_gt_inds) - match_fg_mask_inall[is_in_gts_or_centers_all.numpy( - )] = match_fg_mask_inmatrix - - assigned_gt_inds[match_fg_mask_inall.astype( - np.bool_)] = match_gt_inds_to_fg + 1 - - pos_inds, 
neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \ - = self.get_sample(assigned_gt_inds, gt_bboxes.numpy()) - - bbox_target = np.zeros(flatten_bboxes.shape, paddle.common_ops_import.convert_dtype(flatten_bboxes.dtype)) - bbox_weight = np.zeros_like(bbox_target) - label = np.ones([num_bboxes], dtype=np.int64) * self.num_classes - label_weight = np.zeros([num_bboxes], dtype=np.float32) - - if len(pos_inds) > 0: - gt_labels = gt_labels.numpy() - pos_bbox_targets = pos_gt_bboxes - bbox_target[pos_inds, :] = pos_bbox_targets - bbox_weight[pos_inds, :] = 1.0 - if not np.any(gt_labels): - label[pos_inds] = 0 - else: - label[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds] - - label_weight[pos_inds] = 1.0 - if len(neg_inds) > 0: - label_weight[neg_inds] = 1.0 - - pos_num = max(pos_inds.size, 1) - - return pos_num, label, label_weight, bbox_target diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner.py deleted file mode 100644 index 23af794..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..bbox_utils import batch_iou_similarity -from .utils import (gather_topk_anchors, check_points_inside_bboxes, - compute_max_iou_anchor) - -__all__ = ['TaskAlignedAssigner'] - - -def is_close_gt(anchor, gt, stride_lst, max_dist=2.0, alpha=2.): - """Calculate distance ratio of box1 and box2 in batch for larger stride - anchors dist/stride to promote the survive of large distance match - Args: - anchor (Tensor): box with the shape [L, 2] - gt (Tensor): box with the shape [N, M2, 4] - Return: - dist (Tensor): dist ratio between box1 and box2 with the shape [N, M1, M2] - """ - center1 = anchor.unsqueeze(0) - center2 = (gt[..., :2] + gt[..., -2:]) / 2. - center1 = center1.unsqueeze(1) # [N, M1, 2] -> [N, 1, M1, 2] - center2 = center2.unsqueeze(2) # [N, M2, 2] -> [N, M2, 1, 2] - - stride = paddle.concat([ - paddle.full([x], 32 / pow(2, idx)) for idx, x in enumerate(stride_lst) - ]).unsqueeze(0).unsqueeze(0) - dist = paddle.linalg.norm(center1 - center2, p=2, axis=-1) / stride - dist_ratio = dist - dist_ratio[dist < max_dist] = 1. - dist_ratio[dist >= max_dist] = 0. 
- return dist_ratio - - -@register -class TaskAlignedAssigner(nn.Layer): - """TOOD: Task-aligned One-stage Object Detection - """ - - def __init__(self, - topk=13, - alpha=1.0, - beta=6.0, - eps=1e-9, - is_close_gt=False): - super(TaskAlignedAssigner, self).__init__() - self.topk = topk - self.alpha = alpha - self.beta = beta - self.eps = eps - self.is_close_gt = is_close_gt - - @paddle.no_grad() - def forward(self, - pred_scores, - pred_bboxes, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py - - The assignment is done in following steps - 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt - 2. select top-k bbox as candidates for each gt - 3. limit the positive sample's center in gt (because the anchor-free detector - only can predict positive distance) - 4. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. - Args: - pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) - pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) - anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format - num_anchors_list (List): num of anchors in each level, shape(L) - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C) - """ - assert pred_scores.ndim == pred_bboxes.ndim - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - batch_size, num_anchors, num_classes = pred_scores.shape - _, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype='int32') - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # compute iou between gt and pred bbox, [B, n, L] - ious = batch_iou_similarity(gt_bboxes, pred_bboxes) - # gather pred bboxes class score - pred_scores = pred_scores.transpose([0, 2, 1]) - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - gt_labels_ind = paddle.stack( - [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], - axis=-1) - bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) - # compute alignment metrics, [B, n, L] - alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( - self.beta) - - # check the positive sample's center in gt, [B, n, L] - if self.is_close_gt: - is_in_gts = is_close_gt(anchor_points, gt_bboxes, num_anchors_list) - else: - is_in_gts = check_points_inside_bboxes(anchor_points, gt_bboxes) - - # select topk largest alignment metrics pred bbox as candidates - # for each gt, [B, n, L] - is_in_topk = gather_topk_anchors( - alignment_metrics * is_in_gts, self.topk, topk_mask=pad_gt_mask) - - # select positive sample, [B, n, L] - mask_positive = is_in_topk * is_in_gts * pad_gt_mask - - # if an anchor box is assigned to multiple gts, - # the one with the highest iou 
will be selected, [B, n, L] - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( - [1, num_max_boxes, 1]) - is_max_iou = compute_max_iou_anchor(ious) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - - assigned_scores = F.one_hot(assigned_labels, num_classes + 1) - ind = list(range(num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - # rescale alignment metrics - alignment_metrics *= mask_positive - max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) - max_ious_per_instance = (ious * mask_positive).max(axis=-1, - keepdim=True) - alignment_metrics = alignment_metrics / ( - max_metrics_per_instance + self.eps) * max_ious_per_instance - alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) - assigned_scores = assigned_scores * alignment_metrics - - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner_cr.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner_cr.py deleted file mode 100644 index 5c50976..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/task_aligned_assigner_cr.py +++ /dev/null @@ -1,181 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
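Both task-aligned assigners above build soft classification targets the same way: one-hot over num_classes + 1, then drop the background column. A minimal sketch, assuming background is the appended last index:

import paddle
import paddle.nn.functional as F

num_classes, bg_index = 4, 4
assigned_labels = paddle.to_tensor([[0, 4, 2]])           # 4 == background here
scores = F.one_hot(assigned_labels, num_classes + 1)      # [1, 3, 5]
keep = [i for i in range(num_classes + 1) if i != bg_index]
scores = paddle.index_select(scores, paddle.to_tensor(keep), axis=-1)  # [1, 3, 4]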
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..bbox_utils import batch_iou_similarity -from .utils import (gather_topk_anchors, check_points_inside_bboxes, - compute_max_iou_anchor) - -__all__ = ['TaskAlignedAssigner_CR'] - - -@register -class TaskAlignedAssigner_CR(nn.Layer): - """TOOD: Task-aligned One-stage Object Detection with Center R - """ - - def __init__(self, - topk=13, - alpha=1.0, - beta=6.0, - center_radius=None, - eps=1e-9): - super(TaskAlignedAssigner_CR, self).__init__() - self.topk = topk - self.alpha = alpha - self.beta = beta - self.center_radius = center_radius - self.eps = eps - - @paddle.no_grad() - def forward(self, - pred_scores, - pred_bboxes, - anchor_points, - stride_tensor, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index, - gt_scores=None): - r"""This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/core/bbox/assigners/task_aligned_assigner.py - - The assignment is done in following steps - 1. compute alignment metric between all bbox (bbox of all pyramid levels) and gt - 2. select top-k bbox as candidates for each gt - 3. limit the positive sample's center in gt (because the anchor-free detector - only can predict positive distance) - 4. if an anchor box is assigned to multiple gts, the one with the - highest iou will be selected. - Args: - pred_scores (Tensor, float32): predicted class probability, shape(B, L, C) - pred_bboxes (Tensor, float32): predicted bounding boxes, shape(B, L, 4) - anchor_points (Tensor, float32): pre-defined anchors, shape(L, 2), "cxcy" format - stride_tensor (Tensor, float32): stride of feature map, shape(L, 1) - gt_labels (Tensor, int64|int32): Label of gt_bboxes, shape(B, n, 1) - gt_bboxes (Tensor, float32): Ground truth bboxes, shape(B, n, 4) - pad_gt_mask (Tensor, float32): 1 means bbox, 0 means no bbox, shape(B, n, 1) - bg_index (int): background index - gt_scores (Tensor|None, float32) Score of gt_bboxes, shape(B, n, 1) - Returns: - assigned_labels (Tensor): (B, L) - assigned_bboxes (Tensor): (B, L, 4) - assigned_scores (Tensor): (B, L, C) - """ - assert pred_scores.ndim == pred_bboxes.ndim - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - - batch_size, num_anchors, num_classes = pred_scores.shape - _, num_max_boxes, _ = gt_bboxes.shape - - # negative batch - if num_max_boxes == 0: - assigned_labels = paddle.full( - [batch_size, num_anchors], bg_index, dtype='int32') - assigned_bboxes = paddle.zeros([batch_size, num_anchors, 4]) - assigned_scores = paddle.zeros( - [batch_size, num_anchors, num_classes]) - return assigned_labels, assigned_bboxes, assigned_scores - - # compute iou between gt and pred bbox, [B, n, L] - ious = batch_iou_similarity(gt_bboxes, pred_bboxes) - # gather pred bboxes class score - pred_scores = pred_scores.transpose([0, 2, 1]) - batch_ind = paddle.arange( - end=batch_size, dtype=gt_labels.dtype).unsqueeze(-1) - gt_labels_ind = paddle.stack( - [batch_ind.tile([1, num_max_boxes]), gt_labels.squeeze(-1)], - axis=-1) - bbox_cls_scores = paddle.gather_nd(pred_scores, gt_labels_ind) - # compute alignment metrics, [B, n, L] - alignment_metrics = bbox_cls_scores.pow(self.alpha) * ious.pow( - self.beta) * pad_gt_mask - - # select positive sample, [B, n, L] - if self.center_radius is None: - # check the positive sample's center in gt, [B, n, L] - is_in_gts = 
check_points_inside_bboxes( - anchor_points, gt_bboxes, sm_use=True) - # select topk largest alignment metrics pred bbox as candidates - # for each gt, [B, n, L] - mask_positive = gather_topk_anchors( - alignment_metrics, self.topk, topk_mask=pad_gt_mask) * is_in_gts - else: - is_in_gts, is_in_center = check_points_inside_bboxes( - anchor_points, - gt_bboxes, - stride_tensor * self.center_radius, - sm_use=True) - is_in_gts *= pad_gt_mask - is_in_center *= pad_gt_mask - candidate_metrics = paddle.where( - is_in_gts.sum(-1, keepdim=True) == 0, - alignment_metrics + is_in_center, - alignment_metrics) - mask_positive = gather_topk_anchors( - candidate_metrics, self.topk, - topk_mask=pad_gt_mask) * paddle.cast((is_in_center > 0) | - (is_in_gts > 0), 'float32') - - # if an anchor box is assigned to multiple gts, - # the one with the highest iou will be selected, [B, n, L] - mask_positive_sum = mask_positive.sum(axis=-2) - if mask_positive_sum.max() > 1: - mask_multiple_gts = (mask_positive_sum.unsqueeze(1) > 1).tile( - [1, num_max_boxes, 1]) - is_max_iou = compute_max_iou_anchor(ious * mask_positive) - mask_positive = paddle.where(mask_multiple_gts, is_max_iou, - mask_positive) - mask_positive_sum = mask_positive.sum(axis=-2) - assigned_gt_index = mask_positive.argmax(axis=-2) - - # assigned target - assigned_gt_index = assigned_gt_index + batch_ind * num_max_boxes - assigned_labels = paddle.gather( - gt_labels.flatten(), assigned_gt_index.flatten(), axis=0) - assigned_labels = assigned_labels.reshape([batch_size, num_anchors]) - assigned_labels = paddle.where( - mask_positive_sum > 0, assigned_labels, - paddle.full_like(assigned_labels, bg_index)) - - assigned_bboxes = paddle.gather( - gt_bboxes.reshape([-1, 4]), assigned_gt_index.flatten(), axis=0) - assigned_bboxes = assigned_bboxes.reshape([batch_size, num_anchors, 4]) - - assigned_scores = F.one_hot(assigned_labels, num_classes + 1) - ind = list(range(num_classes + 1)) - ind.remove(bg_index) - assigned_scores = paddle.index_select( - assigned_scores, paddle.to_tensor(ind), axis=-1) - # rescale alignment metrics - alignment_metrics *= mask_positive - max_metrics_per_instance = alignment_metrics.max(axis=-1, keepdim=True) - max_ious_per_instance = (ious * mask_positive).max(axis=-1, - keepdim=True) - alignment_metrics = alignment_metrics / ( - max_metrics_per_instance + self.eps) * max_ious_per_instance - alignment_metrics = alignment_metrics.max(-2).unsqueeze(-1) - assigned_scores = assigned_scores * alignment_metrics - - return assigned_labels, assigned_bboxes, assigned_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/uniform_assigner.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/uniform_assigner.py deleted file mode 100644 index 1c14805..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/uniform_assigner.py +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
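The center_radius branch above widens the candidate set to anchor points lying within radius * stride of a gt center; a toy sketch of that test with made-up boxes and strides:

import paddle

points = paddle.to_tensor([[10., 10.], [40., 40.]])   # [L, 2] anchor points
gt = paddle.to_tensor([[[5., 5., 30., 30.]]])         # [B, n, 4] "xmin, ymin, xmax, ymax"
stride = paddle.to_tensor([8., 16.])                  # [L] stride per point
radius = 2.5
cx = (gt[..., 0] + gt[..., 2]) / 2                    # [B, n] gt centers
cy = (gt[..., 1] + gt[..., 3]) / 2
dx = (points[:, 0] - cx.unsqueeze(-1)).abs()          # [B, n, L]
dy = (points[:, 1] - cy.unsqueeze(-1)).abs()
is_in_center = paddle.logical_and(dx < radius * stride, dy < radius * stride)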
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ppdet.modeling.bbox_utils import batch_bbox_overlaps -from ppdet.modeling.transformers import bbox_xyxy_to_cxcywh - -__all__ = ['UniformAssigner'] - - -def batch_p_dist(x, y, p=2): - """ - calculate pairwise p_dist, the first index of x and y are batch - return [x.shape[0], y.shape[0]] - """ - x = x.unsqueeze(1) - diff = x - y - return paddle.norm(diff, p=p, axis=list(range(2, diff.dim()))) - - -@register -class UniformAssigner(nn.Layer): - def __init__(self, pos_ignore_thr, neg_ignore_thr, match_times=4): - super(UniformAssigner, self).__init__() - self.pos_ignore_thr = pos_ignore_thr - self.neg_ignore_thr = neg_ignore_thr - self.match_times = match_times - - def forward(self, bbox_pred, anchor, gt_bboxes, gt_labels=None): - num_bboxes = bbox_pred.shape[0] - num_gts = gt_bboxes.shape[0] - match_labels = paddle.full([num_bboxes], -1, dtype=paddle.int32) - - pred_ious = batch_bbox_overlaps(bbox_pred, gt_bboxes) - pred_max_iou = pred_ious.max(axis=1) - neg_ignore = pred_max_iou > self.neg_ignore_thr - # exclude potential ignored neg samples first, deal with pos samples later - #match_labels: -2(ignore), -1(neg) or >=0(pos_inds) - match_labels = paddle.where(neg_ignore, - paddle.full_like(match_labels, -2), - match_labels) - - bbox_pred_c = bbox_xyxy_to_cxcywh(bbox_pred) - anchor_c = bbox_xyxy_to_cxcywh(anchor) - gt_bboxes_c = bbox_xyxy_to_cxcywh(gt_bboxes) - bbox_pred_dist = batch_p_dist(bbox_pred_c, gt_bboxes_c, p=1) - anchor_dist = batch_p_dist(anchor_c, gt_bboxes_c, p=1) - - top_pred = bbox_pred_dist.topk( - k=self.match_times, axis=0, largest=False)[1] - top_anchor = anchor_dist.topk( - k=self.match_times, axis=0, largest=False)[1] - - tar_pred = paddle.arange(num_gts).expand([self.match_times, num_gts]) - tar_anchor = paddle.arange(num_gts).expand([self.match_times, num_gts]) - pos_places = paddle.concat([top_pred, top_anchor]).reshape([-1]) - pos_inds = paddle.concat([tar_pred, tar_anchor]).reshape([-1]) - - pos_anchor = anchor[pos_places] - pos_tar_bbox = gt_bboxes[pos_inds] - pos_ious = batch_bbox_overlaps( - pos_anchor, pos_tar_bbox, is_aligned=True) - pos_ignore = pos_ious < self.pos_ignore_thr - pos_inds = paddle.where(pos_ignore, - paddle.full_like(pos_inds, -2), pos_inds) - match_labels[pos_places] = pos_inds - match_labels.stop_gradient = True - pos_keep = ~pos_ignore - - if pos_keep.sum() > 0: - pos_places_keep = pos_places[pos_keep] - pos_bbox_pred = bbox_pred[pos_places_keep].reshape([-1, 4]) - pos_bbox_tar = pos_tar_bbox[pos_keep].reshape([-1, 4]).detach() - else: - pos_bbox_pred = None - pos_bbox_tar = None - - return match_labels, pos_bbox_pred, pos_bbox_tar diff --git a/pdfdet/models/Paddle/ppdet/modeling/assigners/utils.py b/pdfdet/models/Paddle/ppdet/modeling/assigners/utils.py deleted file mode 100644 index 8fe7c93..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/assigners/utils.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn.functional as F - -__all__ = [ - 'pad_gt', 'gather_topk_anchors', 'check_points_inside_bboxes', - 'compute_max_iou_anchor', 'compute_max_iou_gt', - 'generate_anchors_for_grid_cell' -] - - -def pad_gt(gt_labels, gt_bboxes, gt_scores=None): - r""" Pad 0 in gt_labels and gt_bboxes. - Args: - gt_labels (Tensor|List[Tensor], int64): Label of gt_bboxes, - shape is [B, n, 1] or [[n_1, 1], [n_2, 1], ...], here n = sum(n_i) - gt_bboxes (Tensor|List[Tensor], float32): Ground truth bboxes, - shape is [B, n, 4] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) - gt_scores (Tensor|List[Tensor]|None, float32): Score of gt_bboxes, - shape is [B, n, 1] or [[n_1, 4], [n_2, 4], ...], here n = sum(n_i) - Returns: - pad_gt_labels (Tensor, int64): shape[B, n, 1] - pad_gt_bboxes (Tensor, float32): shape[B, n, 4] - pad_gt_scores (Tensor, float32): shape[B, n, 1] - pad_gt_mask (Tensor, float32): shape[B, n, 1], 1 means bbox, 0 means no bbox - """ - if isinstance(gt_labels, paddle.Tensor) and isinstance(gt_bboxes, - paddle.Tensor): - assert gt_labels.ndim == gt_bboxes.ndim and \ - gt_bboxes.ndim == 3 - pad_gt_mask = ( - gt_bboxes.sum(axis=-1, keepdim=True) > 0).astype(gt_bboxes.dtype) - if gt_scores is None: - gt_scores = pad_gt_mask.clone() - assert gt_labels.ndim == gt_scores.ndim - - return gt_labels, gt_bboxes, gt_scores, pad_gt_mask - elif isinstance(gt_labels, list) and isinstance(gt_bboxes, list): - assert len(gt_labels) == len(gt_bboxes), \ - 'The number of `gt_labels` and `gt_bboxes` is not equal. ' - num_max_boxes = max([len(a) for a in gt_bboxes]) - batch_size = len(gt_bboxes) - # pad label and bbox - pad_gt_labels = paddle.zeros( - [batch_size, num_max_boxes, 1], dtype=gt_labels[0].dtype) - pad_gt_bboxes = paddle.zeros( - [batch_size, num_max_boxes, 4], dtype=gt_bboxes[0].dtype) - pad_gt_scores = paddle.zeros( - [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) - pad_gt_mask = paddle.zeros( - [batch_size, num_max_boxes, 1], dtype=gt_bboxes[0].dtype) - for i, (label, bbox) in enumerate(zip(gt_labels, gt_bboxes)): - if len(label) > 0 and len(bbox) > 0: - pad_gt_labels[i, :len(label)] = label - pad_gt_bboxes[i, :len(bbox)] = bbox - pad_gt_mask[i, :len(bbox)] = 1. - if gt_scores is not None: - pad_gt_scores[i, :len(gt_scores[i])] = gt_scores[i] - if gt_scores is None: - pad_gt_scores = pad_gt_mask.clone() - return pad_gt_labels, pad_gt_bboxes, pad_gt_scores, pad_gt_mask - else: - raise ValueError('The input `gt_labels` or `gt_bboxes` is invalid! ') - - -def gather_topk_anchors(metrics, topk, largest=True, topk_mask=None, eps=1e-9): - r""" - Args: - metrics (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors - topk (int): The number of top elements to look for along the axis. - largest (bool) : largest is a flag, if set to true, - algorithm will sort by descending order, otherwise sort by - ascending order. Default: True - topk_mask (Tensor, float32): shape[B, n, 1], ignore bbox mask, - Default: None - eps (float): Default: 1e-9 - Returns: - is_in_topk (Tensor, float32): shape[B, n, L], value=1. 
means selected - """ - num_anchors = metrics.shape[-1] - topk_metrics, topk_idxs = paddle.topk( - metrics, topk, axis=-1, largest=largest) - if topk_mask is None: - topk_mask = ( - topk_metrics.max(axis=-1, keepdim=True) > eps).astype(metrics.dtype) - is_in_topk = F.one_hot(topk_idxs, num_anchors).sum( - axis=-2).astype(metrics.dtype) - return is_in_topk * topk_mask - - -def check_points_inside_bboxes(points, - bboxes, - center_radius_tensor=None, - eps=1e-9, - sm_use=False): - r""" - Args: - points (Tensor, float32): shape[L, 2], "xy" format, L: num_anchors - bboxes (Tensor, float32): shape[B, n, 4], "xmin, ymin, xmax, ymax" format - center_radius_tensor (Tensor, float32): shape [L, 1]. Default: None. - eps (float): Default: 1e-9 - Returns: - is_in_bboxes (Tensor, float32): shape[B, n, L], value=1. means selected - """ - points = points.unsqueeze([0, 1]) - x, y = points.chunk(2, axis=-1) - xmin, ymin, xmax, ymax = bboxes.unsqueeze(2).chunk(4, axis=-1) - # check whether `points` is in `bboxes` - l = x - xmin - t = y - ymin - r = xmax - x - b = ymax - y - delta_ltrb = paddle.concat([l, t, r, b], axis=-1) - is_in_bboxes = (delta_ltrb.min(axis=-1) > eps) - if center_radius_tensor is not None: - # check whether `points` is in `center_radius` - center_radius_tensor = center_radius_tensor.unsqueeze([0, 1]) - cx = (xmin + xmax) * 0.5 - cy = (ymin + ymax) * 0.5 - l = x - (cx - center_radius_tensor) - t = y - (cy - center_radius_tensor) - r = (cx + center_radius_tensor) - x - b = (cy + center_radius_tensor) - y - delta_ltrb_c = paddle.concat([l, t, r, b], axis=-1) - is_in_center = (delta_ltrb_c.min(axis=-1) > eps) - if sm_use: - return is_in_bboxes.astype(bboxes.dtype), is_in_center.astype( - bboxes.dtype) - else: - return (paddle.logical_and(is_in_bboxes, is_in_center), - paddle.logical_or(is_in_bboxes, is_in_center)) - - return is_in_bboxes.astype(bboxes.dtype) - - -def compute_max_iou_anchor(ious): - r""" - For each anchor, find the GT with the largest IOU. - Args: - ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors - Returns: - is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected - """ - num_max_boxes = ious.shape[-2] - max_iou_index = ious.argmax(axis=-2) - is_max_iou = F.one_hot(max_iou_index, num_max_boxes).transpose([0, 2, 1]) - return is_max_iou.astype(ious.dtype) - - -def compute_max_iou_gt(ious): - r""" - For each GT, find the anchor with the largest IOU. - Args: - ious (Tensor, float32): shape[B, n, L], n: num_gts, L: num_anchors - Returns: - is_max_iou (Tensor, float32): shape[B, n, L], value=1. means selected - """ - num_anchors = ious.shape[-1] - max_iou_index = ious.argmax(axis=-1) - is_max_iou = F.one_hot(max_iou_index, num_anchors) - return is_max_iou.astype(ious.dtype) - - -def generate_anchors_for_grid_cell(feats, - fpn_strides, - grid_cell_size=5.0, - grid_cell_offset=0.5, - dtype='float32'): - r""" - Like ATSS, generate anchors based on grid size. - Args: - feats (List[Tensor]): shape[s, (b, c, h, w)] - fpn_strides (tuple|list): shape[s], stride for each scale feature - grid_cell_size (float): anchor size - grid_cell_offset (float): The range is between 0 and 1. - Returns: - anchors (Tensor): shape[l, 4], "xmin, ymin, xmax, ymax" format. - anchor_points (Tensor): shape[l, 2], "x, y" format. - num_anchors_list (List[int]): shape[s], contains [s_1, s_2, ...]. - stride_tensor (Tensor): shape[l, 1], contains the stride for each scale. 
- """ - assert len(feats) == len(fpn_strides) - anchors = [] - anchor_points = [] - num_anchors_list = [] - stride_tensor = [] - for feat, stride in zip(feats, fpn_strides): - _, _, h, w = feat.shape - cell_half_size = grid_cell_size * stride * 0.5 - shift_x = (paddle.arange(end=w) + grid_cell_offset) * stride - shift_y = (paddle.arange(end=h) + grid_cell_offset) * stride - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor = paddle.stack( - [ - shift_x - cell_half_size, shift_y - cell_half_size, - shift_x + cell_half_size, shift_y + cell_half_size - ], - axis=-1).astype(dtype) - anchor_point = paddle.stack([shift_x, shift_y], axis=-1).astype(dtype) - - anchors.append(anchor.reshape([-1, 4])) - anchor_points.append(anchor_point.reshape([-1, 2])) - num_anchors_list.append(len(anchors[-1])) - stride_tensor.append( - paddle.full( - [num_anchors_list[-1], 1], stride, dtype=dtype)) - anchors = paddle.concat(anchors) - anchors.stop_gradient = True - anchor_points = paddle.concat(anchor_points) - anchor_points.stop_gradient = True - stride_tensor = paddle.concat(stride_tensor) - stride_tensor.stop_gradient = True - return anchors, anchor_points, num_anchors_list, stride_tensor diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/__init__.py deleted file mode 100644 index bc000c7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/__init__.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import vgg -from . import resnet -from . import darknet -from . import mobilenet_v1 -from . import mobilenet_v3 -from . import hrnet -from . import lite_hrnet -from . import blazenet -from . import ghostnet -from . import senet -from . import res2net -from . import dla -from . import shufflenet_v2 -from . import swin_transformer -from . import lcnet -from . import hardnet -from . import esnet -from . import cspresnet -from . import csp_darknet -from . import convnext -from . import vision_transformer -from . import mobileone -from . import trans_encoder -from . import focalnet -from . import vit_mae -from . import hgnet_v2 -from . 
import clrnet_resnet - -from .vgg import * -from .resnet import * -from .darknet import * -from .mobilenet_v1 import * -from .mobilenet_v3 import * -from .hrnet import * -from .lite_hrnet import * -from .blazenet import * -from .ghostnet import * -from .senet import * -from .res2net import * -from .dla import * -from .shufflenet_v2 import * -from .swin_transformer import * -from .lcnet import * -from .hardnet import * -from .esnet import * -from .cspresnet import * -from .csp_darknet import * -from .convnext import * -from .vision_transformer import * -from .mobileone import * -from .trans_encoder import * -from .focalnet import * -from .vitpose import * -from .vit_mae import * -from .hgnet_v2 import * -from .clrnet_resnet import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/blazenet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/blazenet.py deleted file mode 100644 index fbfdcec..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/blazenet.py +++ /dev/null @@ -1,319 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import KaimingNormal -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['BlazeNet'] - - -def hard_swish(x): - return x * F.relu6(x + 3) / 6. 
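hard_swish above is the standard hard-swish activation x * relu6(x + 3) / 6. A quick sanity sketch against the built-in activation (assuming a Paddle 2.x runtime where F.hardswish is available):

import paddle
import paddle.nn.functional as F

x = paddle.linspace(-5., 5., 11)
manual = x * F.relu6(x + 3) / 6.
print(paddle.allclose(manual, F.hardswish(x)))  # True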
- - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - num_groups=1, - act='relu', - conv_lr=0.1, - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=conv_lr, initializer=KaimingNormal()), - bias_attr=False) - - if norm_type in ['bn', 'sync_bn']: - self._batch_norm = nn.BatchNorm2D(out_channels) - - def forward(self, x): - x = self._conv(x) - x = self._batch_norm(x) - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - elif self.act == 'leaky': - x = F.leaky_relu(x) - elif self.act == 'hard_swish': - x = hard_swish(x) - return x - - -class BlazeBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels1, - out_channels2, - double_channels=None, - stride=1, - use_5x5kernel=True, - act='relu', - name=None): - super(BlazeBlock, self).__init__() - assert stride in [1, 2] - self.use_pool = not stride == 1 - self.use_double_block = double_channels is not None - self.conv_dw = [] - if use_5x5kernel: - self.conv_dw.append( - self.add_sublayer( - name + "1_dw", - ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels1, - kernel_size=5, - stride=stride, - padding=2, - num_groups=out_channels1, - name=name + "1_dw"))) - else: - self.conv_dw.append( - self.add_sublayer( - name + "1_dw_1", - ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels1, - kernel_size=3, - stride=1, - padding=1, - num_groups=out_channels1, - name=name + "1_dw_1"))) - self.conv_dw.append( - self.add_sublayer( - name + "1_dw_2", - ConvBNLayer( - in_channels=out_channels1, - out_channels=out_channels1, - kernel_size=3, - stride=stride, - padding=1, - num_groups=out_channels1, - name=name + "1_dw_2"))) - self.act = act if self.use_double_block else None - self.conv_pw = ConvBNLayer( - in_channels=out_channels1, - out_channels=out_channels2, - kernel_size=1, - stride=1, - padding=0, - act=self.act, - name=name + "1_sep") - if self.use_double_block: - self.conv_dw2 = [] - if use_5x5kernel: - self.conv_dw2.append( - self.add_sublayer( - name + "2_dw", - ConvBNLayer( - in_channels=out_channels2, - out_channels=out_channels2, - kernel_size=5, - stride=1, - padding=2, - num_groups=out_channels2, - name=name + "2_dw"))) - else: - self.conv_dw2.append( - self.add_sublayer( - name + "2_dw_1", - ConvBNLayer( - in_channels=out_channels2, - out_channels=out_channels2, - kernel_size=3, - stride=1, - padding=1, - num_groups=out_channels2, - name=name + "1_dw_1"))) - self.conv_dw2.append( - self.add_sublayer( - name + "2_dw_2", - ConvBNLayer( - in_channels=out_channels2, - out_channels=out_channels2, - kernel_size=3, - stride=1, - padding=1, - num_groups=out_channels2, - name=name + "2_dw_2"))) - self.conv_pw2 = ConvBNLayer( - in_channels=out_channels2, - out_channels=double_channels, - kernel_size=1, - stride=1, - padding=0, - name=name + "2_sep") - # shortcut - if self.use_pool: - shortcut_channel = double_channels or out_channels2 - self._shortcut = [] - self._shortcut.append( - self.add_sublayer( - name + '_shortcut_pool', - nn.MaxPool2D( - kernel_size=stride, stride=stride, ceil_mode=True))) - self._shortcut.append( - self.add_sublayer( - name + '_shortcut_conv', - ConvBNLayer( - in_channels=in_channels, - out_channels=shortcut_channel, - kernel_size=1, - 
stride=1, - padding=0, - name="shortcut" + name))) - - def forward(self, x): - y = x - for conv_dw_block in self.conv_dw: - y = conv_dw_block(y) - y = self.conv_pw(y) - if self.use_double_block: - for conv_dw2_block in self.conv_dw2: - y = conv_dw2_block(y) - y = self.conv_pw2(y) - if self.use_pool: - for shortcut in self._shortcut: - x = shortcut(x) - return F.relu(paddle.add(x, y)) - - -@register -@serializable -class BlazeNet(nn.Layer): - """ - BlazeFace, see https://arxiv.org/abs/1907.05047 - - Args: - blaze_filters (list): number of filter for each blaze block. - double_blaze_filters (list): number of filter for each double_blaze block. - use_5x5kernel (bool): whether or not filter size is 5x5 in depth-wise conv. - """ - - def __init__( - self, - blaze_filters=[[24, 24], [24, 24], [24, 48, 2], [48, 48], [48, 48]], - double_blaze_filters=[[48, 24, 96, 2], [96, 24, 96], [96, 24, 96], - [96, 24, 96, 2], [96, 24, 96], [96, 24, 96]], - use_5x5kernel=True, - act=None): - super(BlazeNet, self).__init__() - conv1_num_filters = blaze_filters[0][0] - self.conv1 = ConvBNLayer( - in_channels=3, - out_channels=conv1_num_filters, - kernel_size=3, - stride=2, - padding=1, - name="conv1") - in_channels = conv1_num_filters - self.blaze_block = [] - self._out_channels = [] - for k, v in enumerate(blaze_filters): - assert len(v) in [2, 3], \ - "blaze_filters {} not in [2, 3]" - if len(v) == 2: - self.blaze_block.append( - self.add_sublayer( - 'blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - use_5x5kernel=use_5x5kernel, - act=act, - name='blaze_{}'.format(k)))) - elif len(v) == 3: - self.blaze_block.append( - self.add_sublayer( - 'blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - stride=v[2], - use_5x5kernel=use_5x5kernel, - act=act, - name='blaze_{}'.format(k)))) - in_channels = v[1] - - for k, v in enumerate(double_blaze_filters): - assert len(v) in [3, 4], \ - "blaze_filters {} not in [3, 4]" - if len(v) == 3: - self.blaze_block.append( - self.add_sublayer( - 'double_blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - double_channels=v[2], - use_5x5kernel=use_5x5kernel, - act=act, - name='double_blaze_{}'.format(k)))) - elif len(v) == 4: - self.blaze_block.append( - self.add_sublayer( - 'double_blaze_{}'.format(k), - BlazeBlock( - in_channels, - v[0], - v[1], - double_channels=v[2], - stride=v[3], - use_5x5kernel=use_5x5kernel, - act=act, - name='double_blaze_{}'.format(k)))) - in_channels = v[2] - self._out_channels.append(in_channels) - - def forward(self, inputs): - outs = [] - y = self.conv1(inputs['image']) - for block in self.blaze_block: - y = block(y) - outs.append(y) - return [outs[-4], outs[-1]] - - @property - def out_shape(self): - return [ - ShapeSpec(channels=c) - for c in [self._out_channels[-4], self._out_channels[-1]] - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/clrnet_resnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/clrnet_resnet.py deleted file mode 100644 index 00758df..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/clrnet_resnet.py +++ /dev/null @@ -1,697 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
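BlazeNet's forward returns exactly two scales, outs[-4] and outs[-1]. With the default blaze_filters/double_blaze_filters there are stride-2 blocks at overall indices 2, 5 and 8 on top of the stride-2 stem conv, so those two outputs land at 1/8 and 1/16 of the input resolution, both 96 channels wide. A sketch of that contract, using the import path as it existed before this deletion:

import paddle
from ppdet.modeling.backbones.blazenet import BlazeNet

net = BlazeNet()
feats = net({'image': paddle.rand([1, 3, 640, 640])})
print([list(f.shape) for f in feats])  # [[1, 96, 80, 80], [1, 96, 40, 40]]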
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn - -from paddle.utils.download import get_weights_path_from_url -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['CLRResNet'] - -model_urls = { - 'resnet18': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet18-pt.pdparams', - 'resnet34': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet34-pt.pdparams', - 'resnet50': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet50-pt.pdparams', - 'resnet101': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet101-pt.pdparams', - 'resnet152': - 'https://x2paddle.bj.bcebos.com/vision/models/resnet152-pt.pdparams', - 'resnext50_32x4d': - 'https://x2paddle.bj.bcebos.com/vision/models/resnext50_32x4d-pt.pdparams', - 'resnext101_32x8d': - 'https://x2paddle.bj.bcebos.com/vision/models/resnext101_32x8d-pt.pdparams', - 'wide_resnet50_2': - 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet50_2-pt.pdparams', - 'wide_resnet101_2': - 'https://x2paddle.bj.bcebos.com/vision/models/wide_resnet101_2-pt.pdparams', -} - - -class BasicBlock(nn.Layer): - expansion = 1 - - def __init__(self, - inplanes, - planes, - stride=1, - downsample=None, - groups=1, - base_width=64, - dilation=1, - norm_layer=None): - super(BasicBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2D - - if dilation > 1: - raise NotImplementedError( - "Dilation > 1 not supported in BasicBlock") - - self.conv1 = nn.Conv2D( - inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) - self.bn1 = norm_layer(planes) - self.relu = nn.ReLU() - self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) - self.bn2 = norm_layer(planes) - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class BottleneckBlock(nn.Layer): - - expansion = 4 - - def __init__(self, - inplanes, - planes, - stride=1, - downsample=None, - groups=1, - base_width=64, - dilation=1, - norm_layer=None): - super(BottleneckBlock, self).__init__() - if norm_layer is None: - norm_layer = nn.BatchNorm2D - width = int(planes * (base_width / 64.)) * groups - - self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False) - self.bn1 = norm_layer(width) - - self.conv2 = nn.Conv2D( - width, - width, - 3, - padding=dilation, - stride=stride, - groups=groups, - dilation=dilation, - bias_attr=False) - self.bn2 = norm_layer(width) - - self.conv3 = nn.Conv2D( - width, planes * self.expansion, 1, bias_attr=False) - self.bn3 = norm_layer(planes * self.expansion) - self.relu = nn.ReLU() - self.downsample = downsample - self.stride = stride - - def forward(self, x): - identity = x - - out = self.conv1(x) - out = self.bn1(out) - out = self.relu(out) - - out = self.conv2(out) - out = self.bn2(out) - out = self.relu(out) - - out = 
self.conv3(out) - out = self.bn3(out) - - if self.downsample is not None: - identity = self.downsample(x) - - out += identity - out = self.relu(out) - - return out - - -class ResNet(nn.Layer): - """ResNet model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - Block (BasicBlock|BottleneckBlock): Block module of model. - depth (int, optional): Layers of ResNet, Default: 50. - width (int, optional): Base width per convolution group for each convolution block, Default: 64. - num_classes (int, optional): Output dim of last fc layer. If num_classes <= 0, last fc layer - will not be defined. Default: 1000. - with_pool (bool, optional): Use pool before the last fc layer or not. Default: True. - groups (int, optional): Number of groups for each convolution block, Default: 1. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import ResNet - from paddle.vision.models.resnet import BottleneckBlock, BasicBlock - # build ResNet with 18 layers - resnet18 = ResNet(BasicBlock, 18) - # build ResNet with 50 layers - resnet50 = ResNet(BottleneckBlock, 50) - # build Wide ResNet model - wide_resnet50_2 = ResNet(BottleneckBlock, 50, width=64*2) - # build ResNeXt model - resnext50_32x4d = ResNet(BottleneckBlock, 50, width=4, groups=32) - x = paddle.rand([1, 3, 224, 224]) - out = resnet18(x) - print(out.shape) - # [1, 1000] - """ - - def __init__(self, block, depth=50, width=64, with_pool=True, groups=1): - super(ResNet, self).__init__() - layer_cfg = { - 18: [2, 2, 2, 2], - 34: [3, 4, 6, 3], - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3] - } - - layers = layer_cfg[depth] - self.groups = groups - self.base_width = width - self.with_pool = with_pool - self._norm_layer = nn.BatchNorm2D - - self.inplanes = 64 - self.dilation = 1 - - self.conv1 = nn.Conv2D( - 3, - self.inplanes, - kernel_size=7, - stride=2, - padding=3, - bias_attr=False) - self.bn1 = self._norm_layer(self.inplanes) - self.relu = nn.ReLU() - self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) - self.layer1 = self._make_layer(block, 64, layers[0]) - self.layer2 = self._make_layer(block, 128, layers[1], stride=2) - self.layer3 = self._make_layer(block, 256, layers[2], stride=2) - self.layer4 = self._make_layer(block, 512, layers[3], stride=2) - if with_pool: - self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) - - ch_out_list = [64, 128, 256, 512] - block = BottleneckBlock if depth >= 50 else BasicBlock - - self._out_channels = [block.expansion * v for v in ch_out_list] - self._out_strides = [4, 8, 16, 32] - self.return_idx = [0, 1, 2, 3] - - def _make_layer(self, block, planes, blocks, stride=1, dilate=False): - norm_layer = self._norm_layer - downsample = None - previous_dilation = self.dilation - if dilate: - self.dilation *= stride - stride = 1 - if stride != 1 or self.inplanes != planes * block.expansion: - downsample = nn.Sequential( - nn.Conv2D( - self.inplanes, - planes * block.expansion, - 1, - stride=stride, - bias_attr=False), - norm_layer(planes * block.expansion), ) - - layers = [] - layers.append( - block(self.inplanes, planes, stride, downsample, self.groups, - self.base_width, previous_dilation, norm_layer)) - self.inplanes = planes * block.expansion - for _ in range(1, blocks): - layers.append( - block( - self.inplanes, - planes, - groups=self.groups, - base_width=self.base_width, - norm_layer=norm_layer)) - - return nn.Sequential(*layers) - - @property - def out_shape(self): - return [ - 
ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] - - def forward(self, x): - x = self.conv1(x) - x = self.bn1(x) - x = self.relu(x) - x = self.maxpool(x) - - out_layers = [] - x = self.layer1(x) - out_layers.append(x) - x = self.layer2(x) - out_layers.append(x) - x = self.layer3(x) - out_layers.append(x) - x = self.layer4(x) - out_layers.append(x) - - if self.with_pool: - x = self.avgpool(x) - - return out_layers - - -@register -@serializable -class CLRResNet(nn.Layer): - def __init__(self, - resnet='resnet18', - pretrained=True, - out_conv=False, - fea_stride=8, - out_channel=128, - in_channels=[64, 128, 256, 512], - cfg=None): - super(CLRResNet, self).__init__() - self.cfg = cfg - self.in_channels = in_channels - - self.model = eval(resnet)(pretrained=pretrained) - self.out = None - if out_conv: - out_channel = 512 - for chan in reversed(self.in_channels): - if chan < 0: continue - out_channel = chan - break - self.out = nn.Conv2D( - out_channel * self.model.expansion, - cfg.featuremap_out_channel, - kernel_size=1, - bias_attr=False) - - @property - def out_shape(self): - return self.model.out_shape - - def forward(self, x): - x = self.model(x) - if self.out: - x[-1] = self.out(x[-1]) - return x - - -def _resnet(arch, Block, depth, pretrained, **kwargs): - model = ResNet(Block, depth, **kwargs) - if pretrained: - assert arch in model_urls, "{} model do not have a pretrained model now, you should set pretrained=False".format( - arch) - weight_path = get_weights_path_from_url(model_urls[arch]) - - param = paddle.load(weight_path) - model.set_dict(param) - - return model - - -def resnet18(pretrained=False, **kwargs): - """ResNet 18-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 18-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet18 - # build model - model = resnet18() - # build model and load imagenet pretrained weight - # model = resnet18(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet18', BasicBlock, 18, pretrained, **kwargs) - - -def resnet34(pretrained=False, **kwargs): - """ResNet 34-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 34-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet34 - # build model - model = resnet34() - # build model and load imagenet pretrained weight - # model = resnet34(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet34', BasicBlock, 34, pretrained, **kwargs) - - -def resnet50(pretrained=False, **kwargs): - """ResNet 50-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. 
If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 50-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet50 - # build model - model = resnet50() - # build model and load imagenet pretrained weight - # model = resnet50(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet50', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnet101(pretrained=False, **kwargs): - """ResNet 101-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 101-layer. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet101 - # build model - model = resnet101() - # build model and load imagenet pretrained weight - # model = resnet101(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet101', BottleneckBlock, 101, pretrained, **kwargs) - - -def resnet152(pretrained=False, **kwargs): - """ResNet 152-layer model from - `"Deep Residual Learning for Image Recognition" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNet 152-layer model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnet152 - # build model - model = resnet152() - # build model and load imagenet pretrained weight - # model = resnet152(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - return _resnet('resnet152', BottleneckBlock, 152, pretrained, **kwargs) - - -def resnext50_32x4d(pretrained=False, **kwargs): - """ResNeXt-50 32x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 32x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext50_32x4d - # build model - model = resnext50_32x4d() - # build model and load imagenet pretrained weight - # model = resnext50_32x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 32 - kwargs['width'] = 4 - return _resnet('resnext50_32x4d', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnext50_64x4d(pretrained=False, **kwargs): - """ResNeXt-50 64x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. 
If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-50 64x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext50_64x4d - # build model - model = resnext50_64x4d() - # build model and load imagenet pretrained weight - # model = resnext50_64x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 64 - kwargs['width'] = 4 - return _resnet('resnext50_64x4d', BottleneckBlock, 50, pretrained, **kwargs) - - -def resnext101_32x4d(pretrained=False, **kwargs): - """ResNeXt-101 32x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 32x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext101_32x4d - # build model - model = resnext101_32x4d() - # build model and load imagenet pretrained weight - # model = resnext101_32x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 32 - kwargs['width'] = 4 - return _resnet('resnext101_32x4d', BottleneckBlock, 101, pretrained, - **kwargs) - - -def resnext101_64x4d(pretrained=False, **kwargs): - """ResNeXt-101 64x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-101 64x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext101_64x4d - # build model - model = resnext101_64x4d() - # build model and load imagenet pretrained weight - # model = resnext101_64x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 64 - kwargs['width'] = 4 - return _resnet('resnext101_64x4d', BottleneckBlock, 101, pretrained, - **kwargs) - - -def resnext152_32x4d(pretrained=False, **kwargs): - """ResNeXt-152 32x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 32x4d model. - Examples: - .. 
code-block:: python - import paddle - from paddle.vision.models import resnext152_32x4d - # build model - model = resnext152_32x4d() - # build model and load imagenet pretrained weight - # model = resnext152_32x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 32 - kwargs['width'] = 4 - return _resnet('resnext152_32x4d', BottleneckBlock, 152, pretrained, - **kwargs) - - -def resnext152_64x4d(pretrained=False, **kwargs): - """ResNeXt-152 64x4d model from - `"Aggregated Residual Transformations for Deep Neural Networks" `_. - - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of ResNeXt-152 64x4d model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import resnext152_64x4d - # build model - model = resnext152_64x4d() - # build model and load imagenet pretrained weight - # model = resnext152_64x4d(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['groups'] = 64 - kwargs['width'] = 4 - return _resnet('resnext152_64x4d', BottleneckBlock, 152, pretrained, - **kwargs) - - -def wide_resnet50_2(pretrained=False, **kwargs): - """Wide ResNet-50-2 model from - `"Wide Residual Networks" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-50-2 model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import wide_resnet50_2 - # build model - model = wide_resnet50_2() - # build model and load imagenet pretrained weight - # model = wide_resnet50_2(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['width'] = 64 * 2 - return _resnet('wide_resnet50_2', BottleneckBlock, 50, pretrained, **kwargs) - - -def wide_resnet101_2(pretrained=False, **kwargs): - """Wide ResNet-101-2 model from - `"Wide Residual Networks" `_. - Args: - pretrained (bool, optional): Whether to load pre-trained weights. If True, returns a model pre-trained - on ImageNet. Default: False. - **kwargs (optional): Additional keyword arguments. For details, please refer to :ref:`ResNet `. - Returns: - :ref:`api_paddle_nn_Layer`. An instance of Wide ResNet-101-2 model. - Examples: - .. code-block:: python - import paddle - from paddle.vision.models import wide_resnet101_2 - # build model - model = wide_resnet101_2() - # build model and load imagenet pretrained weight - # model = wide_resnet101_2(pretrained=True) - x = paddle.rand([1, 3, 224, 224]) - out = model(x) - print(out.shape) - # [1, 1000] - """ - kwargs['width'] = 64 * 2 - return _resnet('wide_resnet101_2', BottleneckBlock, 101, pretrained, - **kwargs) diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/convnext.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/convnext.py deleted file mode 100644 index 476e12b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/convnext.py +++ /dev/null @@ -1,245 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
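Every factory above funnels into the single ResNet constructor: depth picks the layer configuration, while groups and width are the only knobs separating plain ResNet, ResNeXt and Wide-ResNet. Note that the docstrings are inherited from paddle.vision and still advertise a [1, 1000] logits output, but this CLR variant's forward returns the four stage feature maps instead. A sketch, assuming the ResNet and BottleneckBlock classes from the deleted clrnet_resnet.py are in scope:

import paddle

resnet50 = ResNet(BottleneckBlock, 50)                       # plain ResNet-50
resnext50 = ResNet(BottleneckBlock, 50, width=4, groups=32)  # ResNeXt-50 32x4d
wide50 = ResNet(BottleneckBlock, 50, width=64 * 2)           # Wide ResNet-50-2

feats = resnet50(paddle.rand([1, 3, 224, 224]))
print([list(f.shape) for f in feats])
# [[1, 256, 56, 56], [1, 512, 28, 28], [1, 1024, 14, 14], [1, 2048, 7, 7]]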
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -''' -Modified from https://github.com/facebookresearch/ConvNeXt -Copyright (c) Meta Platforms, Inc. and affiliates. -All rights reserved. -This source code is licensed under the license found in the -LICENSE file in the root directory of this source tree. -''' - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant - -import numpy as np - -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec -from .transformer_utils import DropPath, trunc_normal_, zeros_ - -__all__ = ['ConvNeXt'] - - -class Block(nn.Layer): - r""" ConvNeXt Block. There are two equivalent implementations: - (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) - (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back - We use (2) as we find it slightly faster in Pypaddle - - Args: - dim (int): Number of input channels. - drop_path (float): Stochastic depth rate. Default: 0.0 - layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. - """ - - def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6): - super().__init__() - self.dwconv = nn.Conv2D( - dim, dim, kernel_size=7, padding=3, groups=dim) # depthwise conv - self.norm = LayerNorm(dim, eps=1e-6) - self.pwconv1 = nn.Linear( - dim, 4 * dim) # pointwise/1x1 convs, implemented with linear layers - self.act = nn.GELU() - self.pwconv2 = nn.Linear(4 * dim, dim) - - if layer_scale_init_value > 0: - self.gamma = self.create_parameter( - shape=(dim, ), - attr=ParamAttr(initializer=Constant(layer_scale_init_value))) - else: - self.gamma = None - - self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity( - ) - - def forward(self, x): - input = x - x = self.dwconv(x) - x = x.transpose([0, 2, 3, 1]) - x = self.norm(x) - x = self.pwconv1(x) - x = self.act(x) - x = self.pwconv2(x) - if self.gamma is not None: - x = self.gamma * x - x = x.transpose([0, 3, 1, 2]) - x = input + self.drop_path(x) - return x - - -class LayerNorm(nn.Layer): - r""" LayerNorm that supports two data formats: channels_last (default) or channels_first. - The ordering of the dimensions in the inputs. channels_last corresponds to inputs with - shape (batch_size, height, width, channels) while channels_first corresponds to inputs - with shape (batch_size, channels, height, width). 
- """ - - def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"): - super().__init__() - - self.weight = self.create_parameter( - shape=(normalized_shape, ), - attr=ParamAttr(initializer=Constant(1.))) - self.bias = self.create_parameter( - shape=(normalized_shape, ), - attr=ParamAttr(initializer=Constant(0.))) - - self.eps = eps - self.data_format = data_format - if self.data_format not in ["channels_last", "channels_first"]: - raise NotImplementedError - self.normalized_shape = (normalized_shape, ) - - def forward(self, x): - if self.data_format == "channels_last": - return F.layer_norm(x, self.normalized_shape, self.weight, - self.bias, self.eps) - elif self.data_format == "channels_first": - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / paddle.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -@register -@serializable -class ConvNeXt(nn.Layer): - r""" ConvNeXt - A Pypaddle impl of : `A ConvNet for the 2020s` - - https://arxiv.org/pdf/2201.03545.pdf - - Args: - in_chans (int): Number of input image channels. Default: 3 - depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] - dims (int): Feature dimension at each stage. Default: [96, 192, 384, 768] - drop_path_rate (float): Stochastic depth rate. Default: 0. - layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. - """ - - arch_settings = { - 'tiny': { - 'depths': [3, 3, 9, 3], - 'dims': [96, 192, 384, 768] - }, - 'small': { - 'depths': [3, 3, 27, 3], - 'dims': [96, 192, 384, 768] - }, - 'base': { - 'depths': [3, 3, 27, 3], - 'dims': [128, 256, 512, 1024] - }, - 'large': { - 'depths': [3, 3, 27, 3], - 'dims': [192, 384, 768, 1536] - }, - 'xlarge': { - 'depths': [3, 3, 27, 3], - 'dims': [256, 512, 1024, 2048] - }, - } - - def __init__( - self, - arch='tiny', - in_chans=3, - drop_path_rate=0., - layer_scale_init_value=1e-6, - return_idx=[1, 2, 3], - norm_output=True, - pretrained=None, ): - super().__init__() - depths = self.arch_settings[arch]['depths'] - dims = self.arch_settings[arch]['dims'] - self.downsample_layers = nn.LayerList( - ) # stem and 3 intermediate downsampling conv layers - stem = nn.Sequential( - nn.Conv2D( - in_chans, dims[0], kernel_size=4, stride=4), - LayerNorm( - dims[0], eps=1e-6, data_format="channels_first")) - self.downsample_layers.append(stem) - for i in range(3): - downsample_layer = nn.Sequential( - LayerNorm( - dims[i], eps=1e-6, data_format="channels_first"), - nn.Conv2D( - dims[i], dims[i + 1], kernel_size=2, stride=2), ) - self.downsample_layers.append(downsample_layer) - - self.stages = nn.LayerList( - ) # 4 feature resolution stages, each consisting of multiple residual blocks - dp_rates = [x for x in np.linspace(0, drop_path_rate, sum(depths))] - cur = 0 - for i in range(4): - stage = nn.Sequential(* [ - Block( - dim=dims[i], - drop_path=dp_rates[cur + j], - layer_scale_init_value=layer_scale_init_value) - for j in range(depths[i]) - ]) - self.stages.append(stage) - cur += depths[i] - - self.return_idx = return_idx - self.dims = [dims[i] for i in return_idx] # [::-1] - - self.norm_output = norm_output - if norm_output: - self.norms = nn.LayerList([ - LayerNorm( - c, eps=1e-6, data_format="channels_first") - for c in self.dims - ]) - - self.apply(self._init_weights) - - if pretrained is not None: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - 
self.set_state_dict(paddle.load(path)) - - def _init_weights(self, m): - if isinstance(m, (nn.Conv2D, nn.Linear)): - trunc_normal_(m.weight) - zeros_(m.bias) - - def forward_features(self, x): - output = [] - for i in range(4): - x = self.downsample_layers[i](x) - x = self.stages[i](x) - output.append(x) - - outputs = [output[i] for i in self.return_idx] - if self.norm_output: - outputs = [self.norms[i](out) for i, out in enumerate(outputs)] - - return outputs - - def forward(self, x): - x = self.forward_features(x['image']) - return x - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self.dims] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/csp_darknet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/csp_darknet.py deleted file mode 100644 index 4c225d1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/csp_darknet.py +++ /dev/null @@ -1,404 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable -from ppdet.modeling.initializer import conv_init_ -from ..shape_spec import ShapeSpec - -__all__ = [ - 'CSPDarkNet', 'BaseConv', 'DWConv', 'BottleNeck', 'SPPLayer', 'SPPFLayer' -] - - -class BaseConv(nn.Layer): - def __init__(self, - in_channels, - out_channels, - ksize, - stride, - groups=1, - bias=False, - act="silu"): - super(BaseConv, self).__init__() - self.conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=ksize, - stride=stride, - padding=(ksize - 1) // 2, - groups=groups, - bias_attr=bias) - self.bn = nn.BatchNorm2D( - out_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - self._init_weights() - - def _init_weights(self): - conv_init_(self.conv) - - def forward(self, x): - # use 'x * F.sigmoid(x)' replace 'silu' - x = self.bn(self.conv(x)) - y = x * F.sigmoid(x) - return y - - -class DWConv(nn.Layer): - """Depthwise Conv""" - - def __init__(self, - in_channels, - out_channels, - ksize, - stride=1, - bias=False, - act="silu"): - super(DWConv, self).__init__() - self.dw_conv = BaseConv( - in_channels, - in_channels, - ksize=ksize, - stride=stride, - groups=in_channels, - bias=bias, - act=act) - self.pw_conv = BaseConv( - in_channels, - out_channels, - ksize=1, - stride=1, - groups=1, - bias=bias, - act=act) - - def forward(self, x): - return self.pw_conv(self.dw_conv(x)) - - -class Focus(nn.Layer): - """Focus width and height information into channel space, used in YOLOX.""" - - def __init__(self, - in_channels, - out_channels, - ksize=3, - stride=1, - bias=False, - act="silu"): - super(Focus, self).__init__() - self.conv = BaseConv( - in_channels * 4, - out_channels, - ksize=ksize, - stride=stride, - bias=bias, - act=act) - - def forward(self, inputs): - # inputs [bs, C, H, W] -> outputs [bs, 4C, W/2, H/2] 
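# The four strided slices below pick the top-left / bottom-left / top-right /
# bottom-right pixel of every 2x2 neighborhood: a space-to-depth
# (pixel-unshuffle) step that halves H and W, quadruples C, and discards
# nothing before the stride-1 convolution that follows.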
- top_left = inputs[:, :, 0::2, 0::2] - top_right = inputs[:, :, 0::2, 1::2] - bottom_left = inputs[:, :, 1::2, 0::2] - bottom_right = inputs[:, :, 1::2, 1::2] - outputs = paddle.concat( - [top_left, bottom_left, top_right, bottom_right], 1) - return self.conv(outputs) - - -class BottleNeck(nn.Layer): - def __init__(self, - in_channels, - out_channels, - shortcut=True, - expansion=0.5, - depthwise=False, - bias=False, - act="silu"): - super(BottleNeck, self).__init__() - hidden_channels = int(out_channels * expansion) - Conv = DWConv if depthwise else BaseConv - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.conv2 = Conv( - hidden_channels, - out_channels, - ksize=3, - stride=1, - bias=bias, - act=act) - self.add_shortcut = shortcut and in_channels == out_channels - - def forward(self, x): - y = self.conv2(self.conv1(x)) - if self.add_shortcut: - y = y + x - return y - - -class SPPLayer(nn.Layer): - """Spatial Pyramid Pooling (SPP) layer used in YOLOv3-SPP and YOLOX""" - - def __init__(self, - in_channels, - out_channels, - kernel_sizes=(5, 9, 13), - bias=False, - act="silu"): - super(SPPLayer, self).__init__() - hidden_channels = in_channels // 2 - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.maxpoolings = nn.LayerList([ - nn.MaxPool2D( - kernel_size=ks, stride=1, padding=ks // 2) - for ks in kernel_sizes - ]) - conv2_channels = hidden_channels * (len(kernel_sizes) + 1) - self.conv2 = BaseConv( - conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) - - def forward(self, x): - x = self.conv1(x) - x = paddle.concat([x] + [mp(x) for mp in self.maxpoolings], axis=1) - x = self.conv2(x) - return x - - -class SPPFLayer(nn.Layer): - """ Spatial Pyramid Pooling - Fast (SPPF) layer used in YOLOv5 by Glenn Jocher, - equivalent to SPP(k=(5, 9, 13)) - """ - - def __init__(self, - in_channels, - out_channels, - ksize=5, - bias=False, - act='silu'): - super(SPPFLayer, self).__init__() - hidden_channels = in_channels // 2 - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.maxpooling = nn.MaxPool2D( - kernel_size=ksize, stride=1, padding=ksize // 2) - conv2_channels = hidden_channels * 4 - self.conv2 = BaseConv( - conv2_channels, out_channels, ksize=1, stride=1, bias=bias, act=act) - - def forward(self, x): - x = self.conv1(x) - y1 = self.maxpooling(x) - y2 = self.maxpooling(y1) - y3 = self.maxpooling(y2) - concats = paddle.concat([x, y1, y2, y3], axis=1) - out = self.conv2(concats) - return out - - -class CSPLayer(nn.Layer): - """CSP (Cross Stage Partial) layer with 3 convs, named C3 in YOLOv5""" - - def __init__(self, - in_channels, - out_channels, - num_blocks=1, - shortcut=True, - expansion=0.5, - depthwise=False, - bias=False, - act="silu"): - super(CSPLayer, self).__init__() - hidden_channels = int(out_channels * expansion) - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.conv2 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.bottlenecks = nn.Sequential(* [ - BottleNeck( - hidden_channels, - hidden_channels, - shortcut=shortcut, - expansion=1.0, - depthwise=depthwise, - bias=bias, - act=act) for _ in range(num_blocks) - ]) - self.conv3 = BaseConv( - hidden_channels * 2, - out_channels, - ksize=1, - stride=1, - bias=bias, - act=act) - - def forward(self, x): - x_1 = self.conv1(x) - x_1 = self.bottlenecks(x_1) - x_2 = self.conv2(x) 
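# Cross Stage Partial merge: x_1 carries the half that went through the
# bottleneck stack, x_2 is a plain 1x1 projection of the same input; the two
# halves are concatenated and fused by conv3, so only half the channels pay
# for the bottlenecks while the other half keeps a short gradient path.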
- x = paddle.concat([x_1, x_2], axis=1) - x = self.conv3(x) - return x - - -@register -@serializable -class CSPDarkNet(nn.Layer): - """ - CSPDarkNet backbone. - Args: - arch (str): Architecture of CSPDarkNet, from {P5, P6, X}, default as X, - and 'X' means used in YOLOX, 'P5/P6' means used in YOLOv5. - depth_mult (float): Depth multiplier, multiply number of channels in - each layer, default as 1.0. - width_mult (float): Width multiplier, multiply number of blocks in - CSPLayer, default as 1.0. - depthwise (bool): Whether to use depth-wise conv layer. - act (str): Activation function type, default as 'silu'. - return_idx (list): Index of stages whose feature maps are returned. - """ - - __shared__ = ['depth_mult', 'width_mult', 'act', 'trt'] - - # in_channels, out_channels, num_blocks, add_shortcut, use_spp(use_sppf) - # 'X' means setting used in YOLOX, 'P5/P6' means setting used in YOLOv5. - arch_settings = { - 'X': [[64, 128, 3, True, False], [128, 256, 9, True, False], - [256, 512, 9, True, False], [512, 1024, 3, False, True]], - 'P5': [[64, 128, 3, True, False], [128, 256, 6, True, False], - [256, 512, 9, True, False], [512, 1024, 3, True, True]], - 'P6': [[64, 128, 3, True, False], [128, 256, 6, True, False], - [256, 512, 9, True, False], [512, 768, 3, True, False], - [768, 1024, 3, True, True]], - } - - def __init__(self, - arch='X', - depth_mult=1.0, - width_mult=1.0, - depthwise=False, - act='silu', - trt=False, - return_idx=[2, 3, 4]): - super(CSPDarkNet, self).__init__() - self.arch = arch - self.return_idx = return_idx - Conv = DWConv if depthwise else BaseConv - arch_setting = self.arch_settings[arch] - base_channels = int(arch_setting[0][0] * width_mult) - - # Note: differences between the latest YOLOv5 and the original YOLOX - # 1. self.stem, use SPPF(in YOLOv5) or SPP(in YOLOX) - # 2. use SPPF(in YOLOv5) or SPP(in YOLOX) - # 3. put SPPF before(YOLOv5) or SPP after(YOLOX) the last cspdark block's CSPLayer - # 4. 
whether SPPF(SPP)'CSPLayer add shortcut, True in YOLOv5, False in YOLOX - if arch in ['P5', 'P6']: - # in the latest YOLOv5, use Conv stem, and SPPF (fast, only single spp kernal size) - self.stem = Conv( - 3, base_channels, ksize=6, stride=2, bias=False, act=act) - spp_kernal_sizes = 5 - elif arch in ['X']: - # in the original YOLOX, use Focus stem, and SPP (three spp kernal sizes) - self.stem = Focus( - 3, base_channels, ksize=3, stride=1, bias=False, act=act) - spp_kernal_sizes = (5, 9, 13) - else: - raise AttributeError("Unsupported arch type: {}".format(arch)) - - _out_channels = [base_channels] - layers_num = 1 - self.csp_dark_blocks = [] - - for i, (in_channels, out_channels, num_blocks, shortcut, - use_spp) in enumerate(arch_setting): - in_channels = int(in_channels * width_mult) - out_channels = int(out_channels * width_mult) - _out_channels.append(out_channels) - num_blocks = max(round(num_blocks * depth_mult), 1) - stage = [] - - conv_layer = self.add_sublayer( - 'layers{}.stage{}.conv_layer'.format(layers_num, i + 1), - Conv( - in_channels, out_channels, 3, 2, bias=False, act=act)) - stage.append(conv_layer) - layers_num += 1 - - if use_spp and arch in ['X']: - # in YOLOX use SPPLayer - spp_layer = self.add_sublayer( - 'layers{}.stage{}.spp_layer'.format(layers_num, i + 1), - SPPLayer( - out_channels, - out_channels, - kernel_sizes=spp_kernal_sizes, - bias=False, - act=act)) - stage.append(spp_layer) - layers_num += 1 - - csp_layer = self.add_sublayer( - 'layers{}.stage{}.csp_layer'.format(layers_num, i + 1), - CSPLayer( - out_channels, - out_channels, - num_blocks=num_blocks, - shortcut=shortcut, - depthwise=depthwise, - bias=False, - act=act)) - stage.append(csp_layer) - layers_num += 1 - - if use_spp and arch in ['P5', 'P6']: - # in latest YOLOv5 use SPPFLayer instead of SPPLayer - sppf_layer = self.add_sublayer( - 'layers{}.stage{}.sppf_layer'.format(layers_num, i + 1), - SPPFLayer( - out_channels, - out_channels, - ksize=5, - bias=False, - act=act)) - stage.append(sppf_layer) - layers_num += 1 - - self.csp_dark_blocks.append(nn.Sequential(*stage)) - - self._out_channels = [_out_channels[i] for i in self.return_idx] - self.strides = [[2, 4, 8, 16, 32, 64][i] for i in self.return_idx] - - def forward(self, inputs): - x = inputs['image'] - outputs = [] - x = self.stem(x) - for i, layer in enumerate(self.csp_dark_blocks): - x = layer(x) - if i + 1 in self.return_idx: - outputs.append(x) - return outputs - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=c, stride=s) - for c, s in zip(self._out_channels, self.strides) - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/cspresnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/cspresnet.py deleted file mode 100644 index 5268ec8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/cspresnet.py +++ /dev/null @@ -1,321 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
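Taken together, arch_settings plus depth_mult/width_mult fully determine the CSPDarkNet above, and return_idx selects which of the stride-8/16/32 stages are handed to the neck. A sketch of the YOLOX-s sizing, using the import path as it existed before this deletion:

import paddle
from ppdet.modeling.backbones.csp_darknet import CSPDarkNet

net = CSPDarkNet(arch='X', depth_mult=0.33, width_mult=0.50, return_idx=[2, 3, 4])
feats = net({'image': paddle.rand([1, 3, 640, 640])})
print([list(f.shape) for f in feats])
# [[1, 128, 80, 80], [1, 256, 40, 40], [1, 512, 20, 20]]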
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Constant - -from ppdet.modeling.ops import get_act_fn -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['CSPResNet', 'BasicBlock', 'EffectiveSELayer', 'ConvBNLayer'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=1, - groups=1, - padding=0, - act=None): - super(ConvBNLayer, self).__init__() - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - bias_attr=False) - - self.bn = nn.BatchNorm2D( - ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.act(x) - - return x - - -class RepVggBlock(nn.Layer): - def __init__(self, ch_in, ch_out, act='relu', alpha=False): - super(RepVggBlock, self).__init__() - self.ch_in = ch_in - self.ch_out = ch_out - self.conv1 = ConvBNLayer( - ch_in, ch_out, 3, stride=1, padding=1, act=None) - self.conv2 = ConvBNLayer( - ch_in, ch_out, 1, stride=1, padding=0, act=None) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - if alpha: - self.alpha = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=1.)), - dtype="float32") - else: - self.alpha = None - - def forward(self, x): - if hasattr(self, 'conv'): - y = self.conv(x) - else: - if self.alpha: - y = self.conv1(x) + self.alpha * self.conv2(x) - else: - y = self.conv1(x) + self.conv2(x) - y = self.act(y) - return y - - def convert_to_deploy(self): - if not hasattr(self, 'conv'): - self.conv = nn.Conv2D( - in_channels=self.ch_in, - out_channels=self.ch_out, - kernel_size=3, - stride=1, - padding=1, - groups=1) - kernel, bias = self.get_equivalent_kernel_bias() - self.conv.weight.set_value(kernel) - self.conv.bias.set_value(bias) - self.__delattr__('conv1') - self.__delattr__('conv2') - - def get_equivalent_kernel_bias(self): - kernel3x3, bias3x3 = self._fuse_bn_tensor(self.conv1) - kernel1x1, bias1x1 = self._fuse_bn_tensor(self.conv2) - if self.alpha: - return kernel3x3 + self.alpha * self._pad_1x1_to_3x3_tensor( - kernel1x1), bias3x3 + self.alpha * bias1x1 - else: - return kernel3x3 + self._pad_1x1_to_3x3_tensor( - kernel1x1), bias3x3 + bias1x1 - - def _pad_1x1_to_3x3_tensor(self, kernel1x1): - if kernel1x1 is None: - return 0 - else: - return nn.functional.pad(kernel1x1, [1, 1, 1, 1]) - - def _fuse_bn_tensor(self, branch): - if branch is None: - return 0, 0 - kernel = branch.conv.weight - running_mean = branch.bn._mean - running_var = branch.bn._variance - gamma = branch.bn.weight - beta = branch.bn.bias - eps = branch.bn._epsilon - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - return kernel * t, beta - running_mean * gamma / std - - -class BasicBlock(nn.Layer): - def __init__(self, - ch_in, - ch_out, - act='relu', - shortcut=True, - use_alpha=False): - super(BasicBlock, self).__init__() - assert ch_in == ch_out - self.conv1 = ConvBNLayer(ch_in, ch_out, 3, stride=1, padding=1, act=act) - self.conv2 = 
RepVggBlock(ch_out, ch_out, act=act, alpha=use_alpha) - self.shortcut = shortcut - - def forward(self, x): - y = self.conv1(x) - y = self.conv2(y) - if self.shortcut: - return paddle.add(x, y) - else: - return y - - -class EffectiveSELayer(nn.Layer): - """ Effective Squeeze-Excitation - From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 - """ - - def __init__(self, channels, act='hardsigmoid'): - super(EffectiveSELayer, self).__init__() - self.fc = nn.Conv2D(channels, channels, kernel_size=1, padding=0) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - x_se = x.mean((2, 3), keepdim=True) - x_se = self.fc(x_se) - return x * self.act(x_se) - - -class CSPResStage(nn.Layer): - def __init__(self, - block_fn, - ch_in, - ch_out, - n, - stride, - act='relu', - attn='eca', - use_alpha=False): - super(CSPResStage, self).__init__() - - ch_mid = (ch_in + ch_out) // 2 - if stride == 2: - self.conv_down = ConvBNLayer( - ch_in, ch_mid, 3, stride=2, padding=1, act=act) - else: - self.conv_down = None - self.conv1 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) - self.conv2 = ConvBNLayer(ch_mid, ch_mid // 2, 1, act=act) - self.blocks = nn.Sequential(*[ - block_fn( - ch_mid // 2, - ch_mid // 2, - act=act, - shortcut=True, - use_alpha=use_alpha) for i in range(n) - ]) - if attn: - self.attn = EffectiveSELayer(ch_mid, act='hardsigmoid') - else: - self.attn = None - - self.conv3 = ConvBNLayer(ch_mid, ch_out, 1, act=act) - - def forward(self, x): - if self.conv_down is not None: - x = self.conv_down(x) - y1 = self.conv1(x) - y2 = self.blocks(self.conv2(x)) - y = paddle.concat([y1, y2], axis=1) - if self.attn is not None: - y = self.attn(y) - y = self.conv3(y) - return y - - -@register -@serializable -class CSPResNet(nn.Layer): - __shared__ = ['width_mult', 'depth_mult', 'trt'] - - def __init__(self, - layers=[3, 6, 6, 3], - channels=[64, 128, 256, 512, 1024], - act='swish', - return_idx=[1, 2, 3], - depth_wise=False, - use_large_stem=False, - width_mult=1.0, - depth_mult=1.0, - trt=False, - use_checkpoint=False, - use_alpha=False, - **args): - super(CSPResNet, self).__init__() - self.use_checkpoint = use_checkpoint - channels = [max(round(c * width_mult), 1) for c in channels] - layers = [max(round(l * depth_mult), 1) for l in layers] - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - - if use_large_stem: - self.stem = nn.Sequential( - ('conv1', ConvBNLayer( - 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), - ('conv2', ConvBNLayer( - channels[0] // 2, - channels[0] // 2, - 3, - stride=1, - padding=1, - act=act)), ('conv3', ConvBNLayer( - channels[0] // 2, - channels[0], - 3, - stride=1, - padding=1, - act=act))) - else: - self.stem = nn.Sequential( - ('conv1', ConvBNLayer( - 3, channels[0] // 2, 3, stride=2, padding=1, act=act)), - ('conv2', ConvBNLayer( - channels[0] // 2, - channels[0], - 3, - stride=1, - padding=1, - act=act))) - - n = len(channels) - 1 - self.stages = nn.Sequential(*[(str(i), CSPResStage( - BasicBlock, - channels[i], - channels[i + 1], - layers[i], - 2, - act=act, - use_alpha=use_alpha)) for i in range(n)]) - - self._out_channels = channels[1:] - self._out_strides = [4 * 2**i for i in range(n)] - self.return_idx = return_idx - if use_checkpoint: - paddle.seed(0) - - def forward(self, inputs): - x = inputs['image'] - x = self.stem(x) - outs = [] - for idx, stage in enumerate(self.stages): - if self.use_checkpoint and 
self.training: - x = paddle.distributed.fleet.utils.recompute( - stage, x, **{"preserve_rng_state": True}) - else: - x = stage(x) - if idx in self.return_idx: - outs.append(x) - - return outs - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/darknet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/darknet.py deleted file mode 100644 index c68c650..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/darknet.py +++ /dev/null @@ -1,345 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.ops import batch_norm, mish -from ..shape_spec import ShapeSpec - -__all__ = ['DarkNet', 'ConvBNLayer'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=1, - groups=1, - padding=0, - norm_type='bn', - norm_decay=0., - act="leaky", - freeze_norm=False, - data_format='NCHW', - name=''): - """ - conv + bn + activation layer - - Args: - ch_in (int): input channel - ch_out (int): output channel - filter_size (int): filter size, default 3 - stride (int): stride, default 1 - groups (int): number of groups of conv layer, default 1 - padding (int): padding size, default 0 - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. - act (str): activation function type, default 'leaky', which means leaky_relu - freeze_norm (bool): whether to freeze norm, default False - data_format (str): data format, NCHW or NHWC - """ - super(ConvBNLayer, self).__init__() - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - data_format=data_format, - bias_attr=False) - self.batch_norm = batch_norm( - ch_out, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.act = act - - def forward(self, inputs): - out = self.conv(inputs) - out = self.batch_norm(out) - if self.act == 'leaky': - out = F.leaky_relu(out, 0.1) - else: - out = getattr(F, self.act)(out) - return out - - -class DownSample(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=2, - padding=1, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - data_format='NCHW'): - """ - downsample layer - - Args: - ch_in (int): input channel - ch_out (int): output channel - filter_size (int): filter size, default 3 - stride (int): stride, default 2 - padding (int): padding size, default 1 - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
- freeze_norm (bool): whether to freeze norm, default False - data_format (str): data format, NCHW or NHWC - """ - - super(DownSample, self).__init__() - - self.conv_bn_layer = ConvBNLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=filter_size, - stride=stride, - padding=padding, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.ch_out = ch_out - - def forward(self, inputs): - out = self.conv_bn_layer(inputs) - return out - - class BasicBlock(nn.Layer): - def __init__(self, - ch_in, - ch_out, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - data_format='NCHW'): - """ - BasicBlock layer of DarkNet - - Args: - ch_in (int): input channel - ch_out (int): output channel - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. - freeze_norm (bool): whether to freeze norm, default False - data_format (str): data format, NCHW or NHWC - """ - - super(BasicBlock, self).__init__() - - assert ch_in == ch_out and (ch_in % 2) == 0, \ - f"ch_in and ch_out should be the same even int, but got ch_in={ch_in}, ch_out={ch_out}" - # example: - # --------------{conv1} --> {conv2} - # channel route: 10-->5 --> 5-->10 - self.conv1 = ConvBNLayer( - ch_in=ch_in, - ch_out=int(ch_out / 2), - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.conv2 = ConvBNLayer( - ch_in=int(ch_out / 2), - ch_out=ch_out, - filter_size=3, - stride=1, - padding=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - - def forward(self, inputs): - conv1 = self.conv1(inputs) - conv2 = self.conv2(conv1) - out = paddle.add(x=inputs, y=conv2) - return out - - class Blocks(nn.Layer): - def __init__(self, - ch_in, - ch_out, - count, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None, - data_format='NCHW'): - """ - Blocks layer, which consists of several BasicBlock layers - - Args: - ch_in (int): input channel - ch_out (int): output channel - count (int): number of BasicBlock layers - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. 
- freeze_norm (bool): whether to freeze norm, default False - name (str): layer name - data_format (str): data format, NCHW or NHWC - """ - super(Blocks, self).__init__() - - self.basicblock0 = BasicBlock( - ch_in, - ch_out, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - self.res_out_list = [] - for i in range(1, count): - block_name = '{}.{}'.format(name, i) - res_out = self.add_sublayer( - block_name, - BasicBlock( - ch_out, - ch_out, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format)) - self.res_out_list.append(res_out) - self.ch_out = ch_out - - def forward(self, inputs): - y = self.basicblock0(inputs) - for basic_block_i in self.res_out_list: - y = basic_block_i(y) - return y - - -DarkNet_cfg = {53: ([1, 2, 8, 8, 4])} - - -@register -@serializable -class DarkNet(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - depth=53, - freeze_at=-1, - return_idx=[2, 3, 4], - num_stages=5, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - data_format='NCHW'): - """ - Darknet, see https://pjreddie.com/darknet/yolo/ - - Args: - depth (int): depth of network - freeze_at (int): freeze the backbone at which stage - filter_size (int): filter size, default 3 - return_idx (list): index of stages whose feature maps are returned - norm_type (str): batch norm type, default bn - norm_decay (str): decay for weight and bias of batch norm layer, default 0. - data_format (str): data format, NCHW or NHWC - """ - super(DarkNet, self).__init__() - self.depth = depth - self.freeze_at = freeze_at - self.return_idx = return_idx - self.num_stages = num_stages - self.stages = DarkNet_cfg[self.depth][0:num_stages] - - self.conv0 = ConvBNLayer( - ch_in=3, - ch_out=32, - filter_size=3, - stride=1, - padding=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - - self.downsample0 = DownSample( - ch_in=32, - ch_out=32 * 2, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format) - - self._out_channels = [] - self.darknet_conv_block_list = [] - self.downsample_list = [] - ch_in = [64, 128, 256, 512, 1024] - for i, stage in enumerate(self.stages): - name = 'stage.{}'.format(i) - conv_block = self.add_sublayer( - name, - Blocks( - int(ch_in[i]), - int(ch_in[i]), - stage, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.darknet_conv_block_list.append(conv_block) - if i in return_idx: - self._out_channels.append(int(ch_in[i])) - for i in range(num_stages - 1): - down_name = 'stage.{}.downsample'.format(i) - downsample = self.add_sublayer( - down_name, - DownSample( - ch_in=int(ch_in[i]), - ch_out=int(ch_in[i + 1]), - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - data_format=data_format)) - self.downsample_list.append(downsample) - - def forward(self, inputs): - x = inputs['image'] - - out = self.conv0(x) - out = self.downsample0(out) - blocks = [] - for i, conv_block_i in enumerate(self.darknet_conv_block_list): - out = conv_block_i(out) - if i == self.freeze_at: - out.stop_gradient = True - if i in self.return_idx: - blocks.append(out) - if i < self.num_stages - 1: - out = self.downsample_list[i](out) - return blocks - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/dla.py 
b/pdfdet/models/Paddle/ppdet/modeling/backbones/dla.py deleted file mode 100644 index 51d1f07..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/dla.py +++ /dev/null @@ -1,283 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ..shape_spec import ShapeSpec - -DLA_cfg = {34: ([1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512]), } - - -class BasicBlock(nn.Layer): - def __init__(self, ch_in, ch_out, stride=1): - super(BasicBlock, self).__init__() - self.conv1 = ConvNormLayer( - ch_in, - ch_out, - filter_size=3, - stride=stride, - bias_on=False, - norm_decay=None) - self.conv2 = ConvNormLayer( - ch_out, - ch_out, - filter_size=3, - stride=1, - bias_on=False, - norm_decay=None) - - def forward(self, inputs, residual=None): - if residual is None: - residual = inputs - - out = self.conv1(inputs) - out = F.relu(out) - - out = self.conv2(out) - - out = paddle.add(x=out, y=residual) - out = F.relu(out) - - return out - - -class Root(nn.Layer): - def __init__(self, ch_in, ch_out, kernel_size, residual): - super(Root, self).__init__() - self.conv = ConvNormLayer( - ch_in, - ch_out, - filter_size=1, - stride=1, - bias_on=False, - norm_decay=None) - self.residual = residual - - def forward(self, inputs): - children = inputs - out = self.conv(paddle.concat(inputs, axis=1)) - if self.residual: - out = paddle.add(x=out, y=children[0]) - out = F.relu(out) - - return out - - -class Tree(nn.Layer): - def __init__(self, - level, - block, - ch_in, - ch_out, - stride=1, - level_root=False, - root_dim=0, - root_kernel_size=1, - root_residual=False): - super(Tree, self).__init__() - if root_dim == 0: - root_dim = 2 * ch_out - if level_root: - root_dim += ch_in - if level == 1: - self.tree1 = block(ch_in, ch_out, stride) - self.tree2 = block(ch_out, ch_out, 1) - else: - self.tree1 = Tree( - level - 1, - block, - ch_in, - ch_out, - stride, - root_dim=0, - root_kernel_size=root_kernel_size, - root_residual=root_residual) - self.tree2 = Tree( - level - 1, - block, - ch_out, - ch_out, - 1, - root_dim=root_dim + ch_out, - root_kernel_size=root_kernel_size, - root_residual=root_residual) - - if level == 1: - self.root = Root(root_dim, ch_out, root_kernel_size, root_residual) - self.level_root = level_root - self.root_dim = root_dim - self.downsample = None - self.project = None - self.level = level - if stride > 1: - self.downsample = nn.MaxPool2D(stride, stride=stride) - if ch_in != ch_out: - self.project = ConvNormLayer( - ch_in, - ch_out, - filter_size=1, - stride=1, - bias_on=False, - norm_decay=None) - - def forward(self, x, residual=None, children=None): - children = [] if children is None else children - bottom = self.downsample(x) if self.downsample else x - residual = self.project(bottom) if self.project else bottom - if self.level_root: - 
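        # Note on the control flow below: level_root stages prepend the
        # downsampled input (`bottom`) to `children`, so at level 1 the Root
        # layer concatenates [x2, x1] plus any accumulated children along the
        # channel axis before its 1x1 conv; deeper levels instead push x1 into
        # `children` for the recursive tree2 call.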
children.append(bottom) - x1 = self.tree1(x, residual) - if self.level == 1: - x2 = self.tree2(x1) - x = self.root([x2, x1] + children) - else: - children.append(x1) - x = self.tree2(x1, children=children) - return x - - -@register -@serializable -class DLA(nn.Layer): - """ - DLA, see https://arxiv.org/pdf/1707.06484.pdf - - Args: - depth (int): DLA depth, only supports 34 now. - residual_root (bool): whether to use a residual layer in the root block - pre_img (bool): add pre_img, only used in CenterTrack - pre_hm (bool): add pre_hm, only used in CenterTrack - """ - - def __init__(self, - depth=34, - residual_root=False, - pre_img=False, - pre_hm=False): - super(DLA, self).__init__() - assert depth == 34, 'Only support DLA with depth of 34 now.' - if depth == 34: - block = BasicBlock - levels, channels = DLA_cfg[depth] - self.channels = channels - self.num_levels = len(levels) - - self.base_layer = nn.Sequential( - ConvNormLayer( - 3, - channels[0], - filter_size=7, - stride=1, - bias_on=False, - norm_decay=None), - nn.ReLU()) - self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) - self.level1 = self._make_conv_level( - channels[0], channels[1], levels[1], stride=2) - self.level2 = Tree( - levels[2], - block, - channels[1], - channels[2], - 2, - level_root=False, - root_residual=residual_root) - self.level3 = Tree( - levels[3], - block, - channels[2], - channels[3], - 2, - level_root=True, - root_residual=residual_root) - self.level4 = Tree( - levels[4], - block, - channels[3], - channels[4], - 2, - level_root=True, - root_residual=residual_root) - self.level5 = Tree( - levels[5], - block, - channels[4], - channels[5], - 2, - level_root=True, - root_residual=residual_root) - - if pre_img: - self.pre_img_layer = nn.Sequential( - ConvNormLayer( - 3, - channels[0], - filter_size=7, - stride=1, - bias_on=False, - norm_decay=None), - nn.ReLU()) - if pre_hm: - self.pre_hm_layer = nn.Sequential( - ConvNormLayer( - 1, - channels[0], - filter_size=7, - stride=1, - bias_on=False, - norm_decay=None), - nn.ReLU()) - self.pre_img = pre_img - self.pre_hm = pre_hm - - def _make_conv_level(self, ch_in, ch_out, conv_num, stride=1): - modules = [] - for i in range(conv_num): - modules.extend([ - ConvNormLayer( - ch_in, - ch_out, - filter_size=3, - stride=stride if i == 0 else 1, - bias_on=False, - norm_decay=None), nn.ReLU() - ]) - ch_in = ch_out - return nn.Sequential(*modules) - - @property - def out_shape(self): - return [ - ShapeSpec(channels=self.channels[i]) for i in range(self.num_levels) - ] - - def forward(self, inputs): - outs = [] - feats = self.base_layer(inputs['image']) - - if self.pre_img and 'pre_image' in inputs and inputs[ - 'pre_image'] is not None: - feats = feats + self.pre_img_layer(inputs['pre_image']) - - if self.pre_hm and 'pre_hm' in inputs and inputs['pre_hm'] is not None: - feats = feats + self.pre_hm_layer(inputs['pre_hm']) - - for i in range(self.num_levels): - feats = getattr(self, 'level{}'.format(i))(feats) - outs.append(feats) - - return outs diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/esnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/esnet.py deleted file mode 100644 index 2b3f3c5..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/esnet.py +++ /dev/null @@ -1,290 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm -from paddle.nn.initializer import KaimingNormal -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec -from ppdet.modeling.ops import channel_shuffle -from ppdet.modeling.backbones.shufflenet_v2 import ConvBNLayer - -__all__ = ['ESNet'] - - -def make_divisible(v, divisor=16, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class SEModule(nn.Layer): - def __init__(self, channel, reduction=4): - super(SEModule, self).__init__() - self.avg_pool = AdaptiveAvgPool2D(1) - self.conv1 = Conv2D( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(), - bias_attr=ParamAttr()) - self.conv2 = Conv2D( - in_channels=channel // reduction, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(), - bias_attr=ParamAttr()) - - def forward(self, inputs): - outputs = self.avg_pool(inputs) - outputs = self.conv1(outputs) - outputs = F.relu(outputs) - outputs = self.conv2(outputs) - outputs = F.hardsigmoid(outputs) - return paddle.multiply(x=inputs, y=outputs) - - -class InvertedResidual(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - stride, - act="relu"): - super(InvertedResidual, self).__init__() - self._conv_pw = ConvBNLayer( - in_channels=in_channels // 2, - out_channels=mid_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw = ConvBNLayer( - in_channels=mid_channels // 2, - out_channels=mid_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=mid_channels // 2, - act=None) - self._se = SEModule(mid_channels) - - self._conv_linear = ConvBNLayer( - in_channels=mid_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - - def forward(self, inputs): - x1, x2 = paddle.split( - inputs, - num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], - axis=1) - x2 = self._conv_pw(x2) - x3 = self._conv_dw(x2) - x3 = paddle.concat([x2, x3], axis=1) - x3 = self._se(x3) - x3 = self._conv_linear(x3) - out = paddle.concat([x1, x3], axis=1) - return channel_shuffle(out, 2) - - -class InvertedResidualDS(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - stride, - act="relu"): - super(InvertedResidualDS, self).__init__() - - # branch1 - self._conv_dw_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=3, - stride=stride, - padding=1, - groups=in_channels, - act=None) - self._conv_linear_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels // 2, 
- kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - # branch2 - self._conv_pw_2 = ConvBNLayer( - in_channels=in_channels, - out_channels=mid_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw_2 = ConvBNLayer( - in_channels=mid_channels // 2, - out_channels=mid_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=mid_channels // 2, - act=None) - self._se = SEModule(mid_channels // 2) - self._conv_linear_2 = ConvBNLayer( - in_channels=mid_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw_mv1 = ConvBNLayer( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=3, - stride=1, - padding=1, - groups=out_channels, - act="hard_swish") - self._conv_pw_mv1 = ConvBNLayer( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act="hard_swish") - - def forward(self, inputs): - x1 = self._conv_dw_1(inputs) - x1 = self._conv_linear_1(x1) - x2 = self._conv_pw_2(inputs) - x2 = self._conv_dw_2(x2) - x2 = self._se(x2) - x2 = self._conv_linear_2(x2) - out = paddle.concat([x1, x2], axis=1) - out = self._conv_dw_mv1(out) - out = self._conv_pw_mv1(out) - - return out - - -@register -@serializable -class ESNet(nn.Layer): - def __init__(self, - scale=1.0, - act="hard_swish", - feature_maps=[4, 11, 14], - channel_ratio=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]): - super(ESNet, self).__init__() - self.scale = scale - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - self.feature_maps = feature_maps - stage_repeats = [3, 7, 3] - - stage_out_channels = [ - -1, 24, make_divisible(128 * scale), make_divisible(256 * scale), - make_divisible(512 * scale), 1024 - ] - - self._out_channels = [] - self._feature_idx = 0 - # 1. conv1 - self._conv1 = ConvBNLayer( - in_channels=3, - out_channels=stage_out_channels[1], - kernel_size=3, - stride=2, - padding=1, - act=act) - self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) - self._feature_idx += 1 - - # 2. 
bottleneck sequences - self._block_list = [] - arch_idx = 0 - for stage_id, num_repeat in enumerate(stage_repeats): - for i in range(num_repeat): - channels_scales = channel_ratio[arch_idx] - mid_c = make_divisible( - int(stage_out_channels[stage_id + 2] * channels_scales), - divisor=8) - if i == 0: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidualDS( - in_channels=stage_out_channels[stage_id + 1], - mid_channels=mid_c, - out_channels=stage_out_channels[stage_id + 2], - stride=2, - act=act)) - else: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidual( - in_channels=stage_out_channels[stage_id + 2], - mid_channels=mid_c, - out_channels=stage_out_channels[stage_id + 2], - stride=1, - act=act)) - self._block_list.append(block) - arch_idx += 1 - self._feature_idx += 1 - self._update_out_channels(stage_out_channels[stage_id + 2], - self._feature_idx, self.feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - y = self._conv1(inputs['image']) - y = self._max_pool(y) - outs = [] - for i, inv in enumerate(self._block_list): - y = inv(y) - if i + 2 in self.feature_maps: - outs.append(y) - - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/focalnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/focalnet.py deleted file mode 100644 index 54c2877..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/focalnet.py +++ /dev/null @@ -1,720 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
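Before the FocalNet code, a concrete feel for the channel rounding that the ESNet definition above applies via make_divisible: it snaps a scaled width to a multiple of the divisor but refuses to shrink it by more than 10%. A small self-contained sketch mirroring the definition above (the printed values are worked examples, not library output):

# mirrors make_divisible as defined in esnet.py above
def make_divisible(v, divisor=16, min_value=None):
    if min_value is None:
        min_value = divisor
    # round to the nearest multiple of `divisor`, with a floor of `min_value`
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:  # never drop more than 10% below the request
        new_v += divisor
    return new_v

print(make_divisible(100))         # 96: nearest multiple of 16, within 10%
print(make_divisible(36))          # 48: 32 would lose >10%, so round up
print(make_divisible(512 * 0.75))  # 384: already a multiple of 16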
-""" -This code is based on https://github.com/microsoft/FocalNet/blob/main/classification/focalnet.py -""" -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable -from .transformer_utils import DropPath, Identity -from .transformer_utils import add_parameter, to_2tuple -from .transformer_utils import ones_, zeros_, trunc_normal_ -from .swin_transformer import Mlp - -__all__ = ['FocalNet'] - -MODEL_cfg = { - 'focalnet_T_224_1k_srf': dict( - embed_dim=96, - depths=[2, 2, 6, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.2, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_srf_pretrained.pdparams', - ), - 'focalnet_S_224_1k_srf': dict( - embed_dim=96, - depths=[2, 2, 18, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.3, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_srf_pretrained.pdparams', - ), - 'focalnet_B_224_1k_srf': dict( - embed_dim=128, - depths=[2, 2, 18, 2], - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_srf_pretrained.pdparams', - ), - 'focalnet_T_224_1k_lrf': dict( - embed_dim=96, - depths=[2, 2, 6, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.2, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_tiny_lrf_pretrained.pdparams', - ), - 'focalnet_S_224_1k_lrf': dict( - embed_dim=96, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.3, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_small_lrf_pretrained.pdparams', - ), - 'focalnet_B_224_1k_lrf': dict( - embed_dim=128, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=False, - use_postln=False, - use_postln_in_modulation=False, - use_layerscale=False, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_base_lrf_pretrained.pdparams', - ), - 'focalnet_L_384_22k_fl3': dict( - embed_dim=192, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[5, 5, 5, 5], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_pretrained.pdparams', - ), - 'focalnet_L_384_22k_fl4': dict( - embed_dim=192, - depths=[2, 2, 18, 2], - focal_levels=[4, 4, 4, 4], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - 
use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=True, # - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_large_lrf_384_fl4_pretrained.pdparams', - ), - 'focalnet_XL_384_22k_fl3': dict( - embed_dim=256, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[5, 5, 5, 5], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_pretrained.pdparams', - ), - 'focalnet_XL_384_22k_fl4': dict( - embed_dim=256, - depths=[2, 2, 18, 2], - focal_levels=[4, 4, 4, 4], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=False, - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_xlarge_lrf_384_fl4_pretrained.pdparams', - ), - 'focalnet_H_224_22k_fl3': dict( - embed_dim=352, - depths=[2, 2, 18, 2], - focal_levels=[3, 3, 3, 3], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=True, # - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_pretrained.pdparams', - ), - 'focalnet_H_224_22k_fl4': dict( - embed_dim=352, - depths=[2, 2, 18, 2], - focal_levels=[4, 4, 4, 4], - focal_windows=[3, 3, 3, 3], - drop_path_rate=0.5, - use_conv_embed=True, - use_postln=True, - use_postln_in_modulation=True, # - use_layerscale=True, - normalize_modulator=False, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/focalnet_huge_lrf_224_fl4_pretrained.pdparams', - ), -} - - -class FocalModulation(nn.Layer): - """ - Args: - dim (int): Number of input channels. - proj_drop (float, optional): Dropout ratio of output. Default: 0.0 - focal_level (int): Number of focal levels - focal_window (int): Focal window size at focal level 1 - focal_factor (int): Step to increase the focal window. Default: 2 - use_postln_in_modulation (bool): Whether use post-modulation layernorm - normalize_modulator (bool): Whether use normalize in modulator - """ - - def __init__(self, - dim, - proj_drop=0., - focal_level=2, - focal_window=7, - focal_factor=2, - use_postln_in_modulation=False, - normalize_modulator=False): - super().__init__() - self.dim = dim - - # specific args for focalv3 - self.focal_level = focal_level - self.focal_window = focal_window - self.focal_factor = focal_factor - self.use_postln_in_modulation = use_postln_in_modulation - self.normalize_modulator = normalize_modulator - - self.f = nn.Linear( - dim, 2 * dim + (self.focal_level + 1), bias_attr=True) - self.h = nn.Conv2D( - dim, - dim, - kernel_size=1, - stride=1, - padding=0, - groups=1, - bias_attr=True) - - self.act = nn.GELU() - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - self.focal_layers = nn.LayerList() - - if self.use_postln_in_modulation: - self.ln = nn.LayerNorm(dim) - - for k in range(self.focal_level): - kernel_size = self.focal_factor * k + self.focal_window - self.focal_layers.append( - nn.Sequential( - nn.Conv2D( - dim, - dim, - kernel_size=kernel_size, - stride=1, - groups=dim, - padding=kernel_size // 2, - bias_attr=False), - nn.GELU())) - - def forward(self, x): - """ Forward function. 
- Args: - x: input features with shape of (B, H, W, C) - """ - _, _, _, C = x.shape - x = self.f(x) - x = x.transpose([0, 3, 1, 2]) - q, ctx, gates = paddle.split(x, (C, C, self.focal_level + 1), 1) - - ctx_all = 0 - for l in range(self.focal_level): - ctx = self.focal_layers[l](ctx) - ctx_all = ctx_all + ctx * gates[:, l:l + 1] - ctx_global = self.act(ctx.mean(2, keepdim=True).mean(3, keepdim=True)) - ctx_all = ctx_all + ctx_global * gates[:, self.focal_level:] - if self.normalize_modulator: - ctx_all = ctx_all / (self.focal_level + 1) - - x_out = q * self.h(ctx_all) - x_out = x_out.transpose([0, 2, 3, 1]) - if self.use_postln_in_modulation: - x_out = self.ln(x_out) - x_out = self.proj(x_out) - x_out = self.proj_drop(x_out) - return x_out - - -class FocalModulationBlock(nn.Layer): - """ Focal Modulation Block. - Args: - dim (int): Number of input channels. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - drop (float, optional): Dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - focal_level (int): number of focal levels - focal_window (int): focal kernel size at level 1 - use_postln (bool): Whether use layernorm after modulation. Default: False. - use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. - normalize_modulator (bool): Whether use normalize in modulator - use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False - layerscale_value (float): Value for layer scale. Default: 1e-4 - """ - - def __init__(self, - dim, - mlp_ratio=4., - drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm, - focal_level=2, - focal_window=9, - use_postln=False, - use_postln_in_modulation=False, - normalize_modulator=False, - use_layerscale=False, - layerscale_value=1e-4): - super().__init__() - self.dim = dim - self.mlp_ratio = mlp_ratio - self.focal_window = focal_window - self.focal_level = focal_level - self.use_postln = use_postln - self.use_layerscale = use_layerscale - - self.norm1 = norm_layer(dim) - self.modulation = FocalModulation( - dim, - proj_drop=drop, - focal_level=self.focal_level, - focal_window=self.focal_window, - use_postln_in_modulation=use_postln_in_modulation, - normalize_modulator=normalize_modulator) - - self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - self.H = None - self.W = None - - self.gamma_1 = 1.0 - self.gamma_2 = 1.0 - if self.use_layerscale: - self.gamma_1 = add_parameter(self, - layerscale_value * paddle.ones([dim])) - self.gamma_2 = add_parameter(self, - layerscale_value * paddle.ones([dim])) - - def forward(self, x): - """ - Args: - x: Input feature, tensor size (B, H*W, C). 
- """ - B, L, C = x.shape - H, W = self.H, self.W - assert L == H * W, "input feature has wrong size" - - shortcut = x - if not self.use_postln: - x = self.norm1(x) - x = x.reshape([-1, H, W, C]) - - # FM - x = self.modulation(x).reshape([-1, H * W, C]) - if self.use_postln: - x = self.norm1(x) - - # FFN - x = shortcut + self.drop_path(self.gamma_1 * x) - - if self.use_postln: - x = x + self.drop_path(self.gamma_2 * self.norm2(self.mlp(x))) - else: - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class BasicLayer(nn.Layer): - """ A basic focal modulation layer for one stage. - Args: - dim (int): Number of feature channels - depth (int): Depths of this stage. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. - drop (float, optional): Dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None - focal_level (int): Number of focal levels - focal_window (int): Focal window size at focal level 1 - use_conv_embed (bool): Whether use overlapped convolution for patch embedding - use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False - layerscale_value (float): Value of layerscale - use_postln (bool): Whether use layernorm after modulation. Default: False. - use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. - normalize_modulator (bool): Whether use normalize in modulator - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. - """ - - def __init__(self, - dim, - depth, - mlp_ratio=4., - drop=0., - drop_path=0., - norm_layer=nn.LayerNorm, - downsample=None, - focal_level=2, - focal_window=9, - use_conv_embed=False, - use_layerscale=False, - layerscale_value=1e-4, - use_postln=False, - use_postln_in_modulation=False, - normalize_modulator=False, - use_checkpoint=False): - super().__init__() - self.depth = depth - self.use_checkpoint = use_checkpoint - - # build blocks - self.blocks = nn.LayerList([ - FocalModulationBlock( - dim=dim, - mlp_ratio=mlp_ratio, - drop=drop, - drop_path=drop_path[i] - if isinstance(drop_path, np.ndarray) else drop_path, - act_layer=nn.GELU, - norm_layer=norm_layer, - focal_level=focal_level, - focal_window=focal_window, - use_postln=use_postln, - use_postln_in_modulation=use_postln_in_modulation, - normalize_modulator=normalize_modulator, - use_layerscale=use_layerscale, - layerscale_value=layerscale_value) for i in range(depth) - ]) - - # patch merging layer - if downsample is not None: - self.downsample = downsample( - patch_size=2, - in_chans=dim, - embed_dim=2 * dim, - use_conv_embed=use_conv_embed, - norm_layer=norm_layer, - is_stem=False) - else: - self.downsample = None - - def forward(self, x, H, W): - """ - Args: - x: Input feature, tensor size (B, H*W, C). - """ - for blk in self.blocks: - blk.H, blk.W = H, W - x = blk(x) - - if self.downsample is not None: - x_reshaped = x.transpose([0, 2, 1]).reshape( - [x.shape[0], x.shape[-1], H, W]) - x_down = self.downsample(x_reshaped) - x_down = x_down.flatten(2).transpose([0, 2, 1]) - Wh, Ww = (H + 1) // 2, (W + 1) // 2 - return x, H, W, x_down, Wh, Ww - else: - return x, H, W, x, H, W - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - Args: - patch_size (int): Patch token size. Default: 4. - in_chans (int): Number of input image channels. 
Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Layer, optional): Normalization layer. Default: None - use_conv_embed (bool): Whether use overlapped convolution for patch embedding. Default: False - is_stem (bool): Is the stem block or not. - """ - - def __init__(self, - patch_size=4, - in_chans=3, - embed_dim=96, - norm_layer=None, - use_conv_embed=False, - is_stem=False): - super().__init__() - patch_size = to_2tuple(patch_size) - self.patch_size = patch_size - - self.in_chans = in_chans - self.embed_dim = embed_dim - - if use_conv_embed: - # if we choose to use conv embedding, then we treat the stem and non-stem differently - if is_stem: - kernel_size = 7 - padding = 2 - stride = 4 - else: - kernel_size = 3 - padding = 1 - stride = 2 - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=kernel_size, - stride=stride, - padding=padding) - else: - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - _, _, H, W = x.shape - - if W % self.patch_size[1] != 0: - # for 3D tensor: [pad_left, pad_right] - # for 4D tensor: [pad_left, pad_right, pad_top, pad_bottom] - x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) - W += W % self.patch_size[1] - if H % self.patch_size[0] != 0: - x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) - H += H % self.patch_size[0] - - x = self.proj(x) - if self.norm is not None: - _, _, Wh, Ww = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = self.norm(x) - x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) - - return x - - -@register -@serializable -class FocalNet(nn.Layer): - """ FocalNet backbone - Args: - arch (str): Architecture of FocalNet - out_indices (Sequence[int]): Output from which stages. - frozen_stages (int): Stages to be frozen (stop grad and set eval mode). - -1 means not freezing any parameters. - patch_size (int | tuple(int)): Patch size. Default: 4. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - depths (tuple[int]): Depths of each FocalNet Transformer stage. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. - drop_rate (float): Dropout rate. - drop_path_rate (float): Stochastic depth rate. Default: 0.2. - norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. - patch_norm (bool): If True, add normalization after patch embedding. Default: True. - focal_levels (Sequence[int]): Number of focal levels at four stages - focal_windows (Sequence[int]): Focal window sizes at first focal level at four stages - use_conv_embed (bool): Whether use overlapped convolution for patch embedding - use_layerscale (bool): Whether use layerscale proposed in CaiT. Default: False - layerscale_value (float): Value of layerscale - use_postln (bool): Whether use layernorm after modulation. Default: False. - use_postln_in_modulation (bool): Whether use post-modulation layernorm. Default: False. - normalize_modulator (bool): Whether use normalize in modulator - use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
- """ - - def __init__( - self, - arch='focalnet_T_224_1k_srf', - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - patch_size=4, - in_chans=3, - embed_dim=96, - depths=[2, 2, 6, 2], - mlp_ratio=4., - drop_rate=0., - drop_path_rate=0.2, # 0.5 better for large+ models - norm_layer=nn.LayerNorm, - patch_norm=True, - focal_levels=[2, 2, 2, 2], - focal_windows=[3, 3, 3, 3], - use_conv_embed=False, - use_layerscale=False, - layerscale_value=1e-4, - use_postln=False, - use_postln_in_modulation=False, - normalize_modulator=False, - use_checkpoint=False, - pretrained=None): - super(FocalNet, self).__init__() - assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) - - embed_dim = MODEL_cfg[arch]['embed_dim'] - depths = MODEL_cfg[arch]['depths'] - drop_path_rate = MODEL_cfg[arch]['drop_path_rate'] - focal_levels = MODEL_cfg[arch]['focal_levels'] - focal_windows = MODEL_cfg[arch]['focal_windows'] - use_conv_embed = MODEL_cfg[arch]['use_conv_embed'] - use_layerscale = MODEL_cfg[arch]['use_layerscale'] - use_postln = MODEL_cfg[arch]['use_postln'] - use_postln_in_modulation = MODEL_cfg[arch]['use_postln_in_modulation'] - normalize_modulator = MODEL_cfg[arch]['normalize_modulator'] - if pretrained is None: - pretrained = MODEL_cfg[arch]['pretrained'] - - self.out_indices = out_indices - self.frozen_stages = frozen_stages - self.num_layers = len(depths) - self.patch_norm = patch_norm - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None, - use_conv_embed=use_conv_embed, - is_stem=True) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth decay rule - dpr = np.linspace(0, drop_path_rate, sum(depths)) - - # build layers - self.layers = nn.LayerList() - for i_layer in range(self.num_layers): - layer = BasicLayer( - dim=int(embed_dim * 2**i_layer), - depth=depths[i_layer], - mlp_ratio=mlp_ratio, - drop=drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchEmbed - if (i_layer < self.num_layers - 1) else None, - focal_level=focal_levels[i_layer], - focal_window=focal_windows[i_layer], - use_conv_embed=use_conv_embed, - use_layerscale=use_layerscale, - layerscale_value=layerscale_value, - use_postln=use_postln, - use_postln_in_modulation=use_postln_in_modulation, - normalize_modulator=normalize_modulator, - use_checkpoint=use_checkpoint) - self.layers.append(layer) - - num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] - self.num_features = num_features - - # add a norm layer for each output - for i_layer in out_indices: - layer = norm_layer(num_features[i_layer]) - layer_name = f'norm{i_layer}' - self.add_sublayer(layer_name, layer) - - self.apply(self._init_weights) - self._freeze_stages() - if pretrained: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - self.set_state_dict(paddle.load(path)) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.stop_gradient = True - - if self.frozen_stages >= 2: - self.pos_drop.eval() - for i in range(0, self.frozen_stages - 1): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.stop_gradient = True - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight) - if isinstance(m, 
nn.Linear) and m.bias is not None: - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - ones_(m.weight) - - def forward(self, x): - x = self.patch_embed(x['image']) - B, _, Wh, Ww = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = self.pos_drop(x) - outs = [] - for i in range(self.num_layers): - layer = self.layers[i] - x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) - if i in self.out_indices: - norm_layer = getattr(self, f'norm{i}') - x_out = norm_layer(x_out) - out = x_out.reshape([-1, H, W, self.num_features[i]]).transpose( - (0, 3, 1, 2)) - outs.append(out) - - return outs - - @property - def out_shape(self): - out_strides = [4, 8, 16, 32] - return [ - ShapeSpec( - channels=self.num_features[i], stride=out_strides[i]) - for i in self.out_indices - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/ghostnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/ghostnet.py deleted file mode 100644 index cd333b4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/ghostnet.py +++ /dev/null @@ -1,470 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import AdaptiveAvgPool2D, Linear -from paddle.nn.initializer import Uniform - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec -from .mobilenet_v3 import make_divisible, ConvBNLayer - -__all__ = ['GhostNet'] - - -class ExtraBlockDW(nn.Layer): - def __init__(self, - in_c, - ch_1, - ch_2, - stride, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None): - super(ExtraBlockDW, self).__init__() - self.pointwise_conv = ConvBNLayer( - in_c=in_c, - out_c=ch_1, - filter_size=1, - stride=1, - padding=0, - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra1") - self.depthwise_conv = ConvBNLayer( - in_c=ch_1, - out_c=ch_2, - filter_size=3, - stride=stride, - padding=1, # - num_groups=int(ch_1), - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_dw") - self.normal_conv = ConvBNLayer( - in_c=ch_2, - out_c=ch_2, - filter_size=1, - stride=1, - padding=0, - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_sep") - - def forward(self, inputs): - x = self.pointwise_conv(inputs) - x = self.depthwise_conv(x) - x = self.normal_conv(x) - return x - - -class SEBlock(nn.Layer): - def __init__(self, num_channels, lr_mult, reduction_ratio=4, name=None): - super(SEBlock, self).__init__() - self.pool2d_gap = AdaptiveAvgPool2D(1) - self._num_channels = num_channels - stdv = 1.0 / math.sqrt(num_channels * 1.0) - med_ch = 
num_channels // reduction_ratio - self.squeeze = Linear( - num_channels, - med_ch, - weight_attr=ParamAttr( - learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), - bias_attr=ParamAttr(learning_rate=lr_mult)) - stdv = 1.0 / math.sqrt(med_ch * 1.0) - self.excitation = Linear( - med_ch, - num_channels, - weight_attr=ParamAttr( - learning_rate=lr_mult, initializer=Uniform(-stdv, stdv)), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - def forward(self, inputs): - pool = self.pool2d_gap(inputs) - pool = paddle.squeeze(pool, axis=[2, 3]) - squeeze = self.squeeze(pool) - squeeze = F.relu(squeeze) - excitation = self.excitation(squeeze) - excitation = paddle.clip(x=excitation, min=0, max=1) - excitation = paddle.unsqueeze(excitation, axis=[2, 3]) - out = paddle.multiply(inputs, excitation) - return out - - -class GhostModule(nn.Layer): - def __init__(self, - in_channels, - output_channels, - kernel_size=1, - ratio=2, - dw_size=3, - stride=1, - relu=True, - lr_mult=1., - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None): - super(GhostModule, self).__init__() - init_channels = int(math.ceil(output_channels / ratio)) - new_channels = int(init_channels * (ratio - 1)) - self.primary_conv = ConvBNLayer( - in_c=in_channels, - out_c=init_channels, - filter_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - num_groups=1, - act="relu" if relu else None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_primary_conv") - self.cheap_operation = ConvBNLayer( - in_c=init_channels, - out_c=new_channels, - filter_size=dw_size, - stride=1, - padding=int((dw_size - 1) // 2), - num_groups=init_channels, - act="relu" if relu else None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_cheap_operation") - - def forward(self, inputs): - x = self.primary_conv(inputs) - y = self.cheap_operation(x) - out = paddle.concat([x, y], axis=1) - return out - - -class GhostBottleneck(nn.Layer): - def __init__(self, - in_channels, - hidden_dim, - output_channels, - kernel_size, - stride, - use_se, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - return_list=False, - name=None): - super(GhostBottleneck, self).__init__() - self._stride = stride - self._use_se = use_se - self._num_channels = in_channels - self._output_channels = output_channels - self.return_list = return_list - - self.ghost_module_1 = GhostModule( - in_channels=in_channels, - output_channels=hidden_dim, - kernel_size=1, - stride=1, - relu=True, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_ghost_module_1") - if stride == 2: - self.depthwise_conv = ConvBNLayer( - in_c=hidden_dim, - out_c=hidden_dim, - filter_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - num_groups=hidden_dim, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + - "_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. 
- ) - if use_se: - self.se_block = SEBlock(hidden_dim, lr_mult, name=name + "_se") - self.ghost_module_2 = GhostModule( - in_channels=hidden_dim, - output_channels=output_channels, - kernel_size=1, - relu=False, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_ghost_module_2") - if stride != 1 or in_channels != output_channels: - self.shortcut_depthwise = ConvBNLayer( - in_c=in_channels, - out_c=in_channels, - filter_size=kernel_size, - stride=stride, - padding=int((kernel_size - 1) // 2), - num_groups=in_channels, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + - "_shortcut_depthwise_depthwise" # looks strange due to an old typo, will be fixed later. - ) - self.shortcut_conv = ConvBNLayer( - in_c=in_channels, - out_c=output_channels, - filter_size=1, - stride=1, - padding=0, - num_groups=1, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_shortcut_conv") - - def forward(self, inputs): - y = self.ghost_module_1(inputs) - x = y - if self._stride == 2: - x = self.depthwise_conv(x) - if self._use_se: - x = self.se_block(x) - x = self.ghost_module_2(x) - - if self._stride == 1 and self._num_channels == self._output_channels: - shortcut = inputs - else: - shortcut = self.shortcut_depthwise(inputs) - shortcut = self.shortcut_conv(shortcut) - x = paddle.add(x=x, y=shortcut) - - if self.return_list: - return [y, x] - else: - return x - - -@register -@serializable -class GhostNet(nn.Layer): - __shared__ = ['norm_type'] - - def __init__( - self, - scale=1.3, - feature_maps=[6, 12, 15], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - conv_decay=0., - norm_type='bn', - norm_decay=0.0, - freeze_norm=False): - super(GhostNet, self).__init__() - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - if norm_type == 'sync_bn' and freeze_norm: - raise ValueError( - "The norm_type should not be sync_bn when freeze_norm is True") - self.feature_maps = feature_maps - self.with_extra_blocks = with_extra_blocks - self.extra_block_filters = extra_block_filters - - inplanes = 16 - self.cfgs = [ - # k, t, c, SE, s - [3, 16, 16, 0, 1], - [3, 48, 24, 0, 2], - [3, 72, 24, 0, 1], - [5, 72, 40, 1, 2], - [5, 120, 40, 1, 1], - [3, 240, 80, 0, 2], - [3, 200, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 184, 80, 0, 1], - [3, 480, 112, 1, 1], - [3, 672, 112, 1, 1], - [5, 672, 160, 1, 2], # SSDLite output - [5, 960, 160, 0, 1], - [5, 960, 160, 1, 1], - [5, 960, 160, 0, 1], - [5, 960, 160, 1, 1] - ] - self.scale = scale - conv1_out_ch = int(make_divisible(inplanes * self.scale, 4)) - self.conv1 = ConvBNLayer( - in_c=3, - out_c=conv1_out_ch, - filter_size=3, - stride=2, - padding=1, - num_groups=1, - act="relu", - lr_mult=1., - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv1") - - # build inverted residual blocks - self._out_channels = [] - self.ghost_bottleneck_list = [] - idx = 0 - inplanes = conv1_out_ch - for k, exp_size, c, use_se, s in self.cfgs: - lr_idx = min(idx // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - # for SSD/SSDLite, first head input is after ResidualUnit expand_conv - return_list = self.with_extra_blocks and idx + 2 in self.feature_maps - - 
ghost_bottleneck = self.add_sublayer( - "_ghostbottleneck_" + str(idx), - sublayer=GhostBottleneck( - in_channels=inplanes, - hidden_dim=int(make_divisible(exp_size * self.scale, 4)), - output_channels=int(make_divisible(c * self.scale, 4)), - kernel_size=k, - stride=s, - use_se=use_se, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - return_list=return_list, - name="_ghostbottleneck_" + str(idx))) - self.ghost_bottleneck_list.append(ghost_bottleneck) - inplanes = int(make_divisible(c * self.scale, 4)) - idx += 1 - self._update_out_channels( - int(make_divisible(exp_size * self.scale, 4)) - if return_list else inplanes, idx + 1, feature_maps) - - if self.with_extra_blocks: - self.extra_block_list = [] - extra_out_c = int(make_divisible(self.scale * self.cfgs[-1][1], 4)) - lr_idx = min(idx // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - conv_extra = self.add_sublayer( - "conv" + str(idx + 2), - sublayer=ConvBNLayer( - in_c=inplanes, - out_c=extra_out_c, - filter_size=1, - stride=1, - padding=0, - num_groups=1, - act="relu6", - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv" + str(idx + 2))) - self.extra_block_list.append(conv_extra) - idx += 1 - self._update_out_channels(extra_out_c, idx + 1, feature_maps) - - for j, block_filter in enumerate(self.extra_block_filters): - in_c = extra_out_c if j == 0 else self.extra_block_filters[j - - 1][1] - conv_extra = self.add_sublayer( - "conv" + str(idx + 2), - sublayer=ExtraBlockDW( - in_c, - block_filter[0], - block_filter[1], - stride=2, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name='conv' + str(idx + 2))) - self.extra_block_list.append(conv_extra) - idx += 1 - self._update_out_channels(block_filter[1], idx + 1, - feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - x = self.conv1(inputs['image']) - outs = [] - for idx, ghost_bottleneck in enumerate(self.ghost_bottleneck_list): - x = ghost_bottleneck(x) - if idx + 2 in self.feature_maps: - if isinstance(x, list): - outs.append(x[0]) - x = x[1] - else: - outs.append(x) - - if not self.with_extra_blocks: - return outs - - for i, block in enumerate(self.extra_block_list): - idx = i + len(self.ghost_bottleneck_list) - x = block(x) - if idx + 2 in self.feature_maps: - outs.append(x) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/hardnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/hardnet.py deleted file mode 100644 index 8615fb6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/hardnet.py +++ /dev/null @@ -1,226 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['HarDNet'] - - -def ConvLayer(in_channels, - out_channels, - kernel_size=3, - stride=1, - bias_attr=False): - layer = nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=kernel_size // 2, - groups=1, - bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels)), - ('relu', nn.ReLU6())) - return layer - - -def DWConvLayer(in_channels, - out_channels, - kernel_size=3, - stride=1, - bias_attr=False): - layer = nn.Sequential( - ('dwconv', nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=1, - groups=out_channels, - bias_attr=bias_attr)), ('norm', nn.BatchNorm2D(out_channels))) - return layer - - -def CombConvLayer(in_channels, out_channels, kernel_size=1, stride=1): - layer = nn.Sequential( - ('layer1', ConvLayer( - in_channels, out_channels, kernel_size=kernel_size)), - ('layer2', DWConvLayer( - out_channels, out_channels, stride=stride))) - return layer - - -class HarDBlock(nn.Layer): - def __init__(self, - in_channels, - growth_rate, - grmul, - n_layers, - keepBase=False, - residual_out=False, - dwconv=False): - super().__init__() - self.keepBase = keepBase - self.links = [] - layers_ = [] - self.out_channels = 0 - for i in range(n_layers): - outch, inch, link = self.get_link(i + 1, in_channels, growth_rate, - grmul) - self.links.append(link) - if dwconv: - layers_.append(CombConvLayer(inch, outch)) - else: - layers_.append(ConvLayer(inch, outch)) - - if (i % 2 == 0) or (i == n_layers - 1): - self.out_channels += outch - self.layers = nn.LayerList(layers_) - - def get_out_ch(self): - return self.out_channels - - def get_link(self, layer, base_ch, growth_rate, grmul): - if layer == 0: - return base_ch, 0, [] - out_channels = growth_rate - - link = [] - for i in range(10): - dv = 2**i - if layer % dv == 0: - k = layer - dv - link.append(k) - if i > 0: - out_channels *= grmul - - out_channels = int(int(out_channels + 1) / 2) * 2 - in_channels = 0 - - for i in link: - ch, _, _ = self.get_link(i, base_ch, growth_rate, grmul) - in_channels += ch - - return out_channels, in_channels, link - - def forward(self, x): - layers_ = [x] - - for layer in range(len(self.layers)): - link = self.links[layer] - tin = [] - for i in link: - tin.append(layers_[i]) - if len(tin) > 1: - x = paddle.concat(tin, 1) - else: - x = tin[0] - out = self.layers[layer](x) - layers_.append(out) - - t = len(layers_) - out_ = [] - for i in range(t): - if (i == 0 and self.keepBase) or (i == t - 1) or (i % 2 == 1): - out_.append(layers_[i]) - out = paddle.concat(out_, 1) - - return out - - -@register -class HarDNet(nn.Layer): - def __init__(self, depth_wise=False, return_idx=[1, 3, 8, 13], arch=85): - super(HarDNet, self).__init__() - assert arch in [68, 85], "HarDNet-{} is not supported.".format(arch) - if arch == 85: - first_ch = [48, 96] - second_kernel = 3 - ch_list = [192, 256, 320, 480, 720] - grmul = 1.7 - gr = [24, 24, 28, 36, 48] - n_layers = [8, 16, 16, 16, 16] - elif arch == 68: - first_ch = [32, 64] - second_kernel = 3 - ch_list = [128, 256, 320, 640] - grmul = 1.7 - gr = [14, 16, 20, 40] - n_layers = [8, 16, 16, 16] - else: - raise ValueError("HarDNet-{} is not supported.".format(arch)) - - self.return_idx = return_idx - self._out_channels = [96, 214, 458, 784] - - 
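-        # Editor's note (hedged): the four hard-coded widths above correspond
-        # to the feature maps picked by the default return_idx=[1, 3, 8, 13]
-        # for HarDNet-85; they are not derived from ch_list/gr, so out_shape
-        # would misreport channels if arch or return_idx were changed.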
avg_pool = True - if depth_wise: - second_kernel = 1 - avg_pool = False - - blks = len(n_layers) - self.base = nn.LayerList([]) - - # First Layer: Standard Conv3x3, Stride=2 - self.base.append( - ConvLayer( - in_channels=3, - out_channels=first_ch[0], - kernel_size=3, - stride=2, - bias_attr=False)) - - # Second Layer - self.base.append( - ConvLayer( - first_ch[0], first_ch[1], kernel_size=second_kernel)) - - # Avgpooling or DWConv3x3 downsampling - if avg_pool: - self.base.append(nn.AvgPool2D(kernel_size=3, stride=2, padding=1)) - else: - self.base.append(DWConvLayer(first_ch[1], first_ch[1], stride=2)) - - # Build all HarDNet blocks - ch = first_ch[1] - for i in range(blks): - blk = HarDBlock(ch, gr[i], grmul, n_layers[i], dwconv=depth_wise) - ch = blk.out_channels - self.base.append(blk) - - if i != blks - 1: - self.base.append(ConvLayer(ch, ch_list[i], kernel_size=1)) - ch = ch_list[i] - if i == 0: - self.base.append( - nn.AvgPool2D( - kernel_size=2, stride=2, ceil_mode=True)) - elif i != blks - 1 and i != 1 and i != 3: - self.base.append(nn.AvgPool2D(kernel_size=2, stride=2)) - - def forward(self, inputs): - x = inputs['image'] - outs = [] - for i, layer in enumerate(self.base): - x = layer(x) - if i in self.return_idx: - outs.append(x) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=self._out_channels[i]) for i in range(4)] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/hgnet_v2.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/hgnet_v2.py deleted file mode 100644 index 88f989a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/hgnet_v2.py +++ /dev/null @@ -1,447 +0,0 @@ -# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import KaimingNormal, Constant -from paddle.nn import Conv2D, BatchNorm2D, ReLU, AdaptiveAvgPool2D, MaxPool2D -from paddle.regularizer import L2Decay -from paddle import ParamAttr - -import copy - -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['PPHGNetV2'] - -kaiming_normal_ = KaimingNormal() -zeros_ = Constant(value=0.) -ones_ = Constant(value=1.) 
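-
-# Editor's sketch (not in the original file): LearnableAffineBlock below
-# applies a scalar affine transform y = scale * x + bias with two learnable
-# parameters trained at a reduced rate (lr_mult * lab_lr). Minimal usage:
-#
-#     lab = LearnableAffineBlock(scale_value=1.0, bias_value=0.0)
-#     y = lab(paddle.ones([2, 4]))  # identity until the scalars are trained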
- - -class LearnableAffineBlock(nn.Layer): - def __init__(self, - scale_value=1.0, - bias_value=0.0, - lr_mult=1.0, - lab_lr=0.01): - super().__init__() - self.scale = self.create_parameter( - shape=[1, ], - default_initializer=Constant(value=scale_value), - attr=ParamAttr(learning_rate=lr_mult * lab_lr)) - self.add_parameter("scale", self.scale) - self.bias = self.create_parameter( - shape=[1, ], - default_initializer=Constant(value=bias_value), - attr=ParamAttr(learning_rate=lr_mult * lab_lr)) - self.add_parameter("bias", self.bias) - - def forward(self, x): - return self.scale * x + self.bias - - -class ConvBNAct(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=1, - groups=1, - use_act=True, - use_lab=False, - lr_mult=1.0): - super().__init__() - self.use_act = use_act - self.use_lab = use_lab - self.conv = Conv2D( - in_channels, - out_channels, - kernel_size, - stride, - padding=padding - if isinstance(padding, str) else (kernel_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=False) - self.bn = BatchNorm2D( - out_channels, - weight_attr=ParamAttr( - regularizer=L2Decay(0.0), learning_rate=lr_mult), - bias_attr=ParamAttr( - regularizer=L2Decay(0.0), learning_rate=lr_mult)) - if self.use_act: - self.act = ReLU() - if self.use_lab: - self.lab = LearnableAffineBlock(lr_mult=lr_mult) - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.use_act: - x = self.act(x) - if self.use_lab: - x = self.lab(x) - return x - - -class LightConvBNAct(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - groups=1, - use_lab=False, - lr_mult=1.0): - super().__init__() - self.conv1 = ConvBNAct( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=1, - use_act=False, - use_lab=use_lab, - lr_mult=lr_mult) - self.conv2 = ConvBNAct( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=kernel_size, - groups=out_channels, - use_act=True, - use_lab=use_lab, - lr_mult=lr_mult) - - def forward(self, x): - x = self.conv1(x) - x = self.conv2(x) - return x - - -class StemBlock(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - use_lab=False, - lr_mult=1.0): - super().__init__() - self.stem1 = ConvBNAct( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=3, - stride=2, - use_lab=use_lab, - lr_mult=lr_mult) - self.stem2a = ConvBNAct( - in_channels=mid_channels, - out_channels=mid_channels // 2, - kernel_size=2, - stride=1, - padding="SAME", - use_lab=use_lab, - lr_mult=lr_mult) - self.stem2b = ConvBNAct( - in_channels=mid_channels // 2, - out_channels=mid_channels, - kernel_size=2, - stride=1, - padding="SAME", - use_lab=use_lab, - lr_mult=lr_mult) - self.stem3 = ConvBNAct( - in_channels=mid_channels * 2, - out_channels=mid_channels, - kernel_size=3, - stride=2, - use_lab=use_lab, - lr_mult=lr_mult) - self.stem4 = ConvBNAct( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=1, - stride=1, - use_lab=use_lab, - lr_mult=lr_mult) - self.pool = nn.MaxPool2D( - kernel_size=2, stride=1, ceil_mode=True, padding="SAME") - - def forward(self, x): - x = self.stem1(x) - x2 = self.stem2a(x) - x2 = self.stem2b(x2) - x1 = self.pool(x) - x = paddle.concat([x1, x2], 1) - x = self.stem3(x) - x = self.stem4(x) - - return x - - -class HG_Block(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - kernel_size=3, - layer_num=6, - identity=False, - light_block=True, 
-                 use_lab=False,
-                 lr_mult=1.0):
-        super().__init__()
-        self.identity = identity
-
-        self.layers = nn.LayerList()
-        block_type = "LightConvBNAct" if light_block else "ConvBNAct"
-        for i in range(layer_num):
-            self.layers.append(
-                eval(block_type)(in_channels=in_channels
-                                 if i == 0 else mid_channels,
-                                 out_channels=mid_channels,
-                                 stride=1,
-                                 kernel_size=kernel_size,
-                                 use_lab=use_lab,
-                                 lr_mult=lr_mult))
-        # feature aggregation
-        total_channels = in_channels + layer_num * mid_channels
-        self.aggregation_squeeze_conv = ConvBNAct(
-            in_channels=total_channels,
-            out_channels=out_channels // 2,
-            kernel_size=1,
-            stride=1,
-            use_lab=use_lab,
-            lr_mult=lr_mult)
-        self.aggregation_excitation_conv = ConvBNAct(
-            in_channels=out_channels // 2,
-            out_channels=out_channels,
-            kernel_size=1,
-            stride=1,
-            use_lab=use_lab,
-            lr_mult=lr_mult)
-
-    def forward(self, x):
-        identity = x
-        output = []
-        output.append(x)
-        for layer in self.layers:
-            x = layer(x)
-            output.append(x)
-        x = paddle.concat(output, axis=1)
-        x = self.aggregation_squeeze_conv(x)
-        x = self.aggregation_excitation_conv(x)
-        if self.identity:
-            x += identity
-        return x
-
-
-class HG_Stage(nn.Layer):
-    def __init__(self,
-                 in_channels,
-                 mid_channels,
-                 out_channels,
-                 block_num,
-                 layer_num=6,
-                 downsample=True,
-                 light_block=True,
-                 kernel_size=3,
-                 use_lab=False,
-                 lr_mult=1.0):
-        super().__init__()
-        self.downsample = downsample
-        if downsample:
-            self.downsample = ConvBNAct(
-                in_channels=in_channels,
-                out_channels=in_channels,
-                kernel_size=3,
-                stride=2,
-                groups=in_channels,
-                use_act=False,
-                use_lab=use_lab,
-                lr_mult=lr_mult)
-
-        blocks_list = []
-        for i in range(block_num):
-            blocks_list.append(
-                HG_Block(
-                    in_channels=in_channels if i == 0 else out_channels,
-                    mid_channels=mid_channels,
-                    out_channels=out_channels,
-                    kernel_size=kernel_size,
-                    layer_num=layer_num,
-                    identity=False if i == 0 else True,
-                    light_block=light_block,
-                    use_lab=use_lab,
-                    lr_mult=lr_mult))
-        self.blocks = nn.Sequential(*blocks_list)
-
-    def forward(self, x):
-        if self.downsample:
-            x = self.downsample(x)
-        x = self.blocks(x)
-        return x
-
-
-def _freeze_norm(m: nn.BatchNorm2D):
-    param_attr = ParamAttr(
-        learning_rate=0., regularizer=L2Decay(0.), trainable=False)
-    bias_attr = ParamAttr(
-        learning_rate=0., regularizer=L2Decay(0.), trainable=False)
-    global_stats = True
-    norm = nn.BatchNorm2D(
-        m._num_features,
-        weight_attr=param_attr,
-        bias_attr=bias_attr,
-        use_global_stats=global_stats)
-    for param in norm.parameters():
-        param.stop_gradient = True
-    return norm
-
-
-def reset_bn(model: nn.Layer, reset_func=_freeze_norm):
-    if isinstance(model, nn.BatchNorm2D):
-        model = reset_func(model)
-    else:
-        for name, child in model.named_children():
-            _child = reset_bn(child, reset_func)
-            if _child is not child:
-                setattr(model, name, _child)
-    return model
-
-
-@register
-@serializable
-class PPHGNetV2(nn.Layer):
-    """
-    PPHGNetV2
-    Args:
-        arch: str. Architecture key, 'L' or 'X'; selects the stem channels and
-            stage configuration (channel widths, block counts, kernel sizes).
-        use_lab: boolean. Whether to use LearnableAffineBlock in the network.
-        lr_mult_list: list. Controls the learning rate of different stages.
-        return_idx: list. Indices of the stages whose outputs are returned.
-    Returns:
-        model: nn.Layer. A PPHGNetV2 instance configured by the args above.
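-    Example (editor's sketch, not from the original file; in PaddleDetection
-    this backbone is normally built from a YAML config rather than directly):
-        import paddle
-        backbone = PPHGNetV2(arch='L', return_idx=[1, 2, 3])
-        feats = backbone({'image': paddle.rand([1, 3, 640, 640])})
-        # -> list of 3 tensors at strides 8, 16, 32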
- """ - - arch_configs = { - 'L': { - 'stem_channels': [3, 32, 48], - 'stage_config': { - # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num - "stage1": [48, 48, 128, 1, False, False, 3, 6], - "stage2": [128, 96, 512, 1, True, False, 3, 6], - "stage3": [512, 192, 1024, 3, True, True, 5, 6], - "stage4": [1024, 384, 2048, 1, True, True, 5, 6], - } - }, - 'X': { - 'stem_channels': [3, 32, 64], - 'stage_config': { - # in_channels, mid_channels, out_channels, num_blocks, downsample, light_block, kernel_size, layer_num - "stage1": [64, 64, 128, 1, False, False, 3, 6], - "stage2": [128, 128, 512, 2, True, False, 3, 6], - "stage3": [512, 256, 1024, 5, True, True, 5, 6], - "stage4": [1024, 512, 2048, 2, True, True, 5, 6], - } - } - } - - def __init__(self, - arch, - use_lab=False, - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - return_idx=[1, 2, 3], - freeze_stem_only=True, - freeze_at=0, - freeze_norm=True): - super().__init__() - self.use_lab = use_lab - self.return_idx = return_idx - - stem_channels = self.arch_configs[arch]['stem_channels'] - stage_config = self.arch_configs[arch]['stage_config'] - - self._out_strides = [4, 8, 16, 32] - self._out_channels = [stage_config[k][2] for k in stage_config] - - # stem - self.stem = StemBlock( - in_channels=stem_channels[0], - mid_channels=stem_channels[1], - out_channels=stem_channels[2], - use_lab=use_lab, - lr_mult=lr_mult_list[0]) - - # stages - self.stages = nn.LayerList() - for i, k in enumerate(stage_config): - in_channels, mid_channels, out_channels, block_num, downsample, light_block, kernel_size, layer_num = stage_config[ - k] - self.stages.append( - HG_Stage( - in_channels, - mid_channels, - out_channels, - block_num, - layer_num, - downsample, - light_block, - kernel_size, - use_lab, - lr_mult=lr_mult_list[i + 1])) - - if freeze_at >= 0: - self._freeze_parameters(self.stem) - if not freeze_stem_only: - for i in range(min(freeze_at + 1, len(self.stages))): - self._freeze_parameters(self.stages[i]) - - if freeze_norm: - reset_bn(self, reset_func=_freeze_norm) - - self._init_weights() - - def _freeze_parameters(self, m): - for p in m.parameters(): - p.stop_gradient = True - - def _init_weights(self): - for m in self.sublayers(): - if isinstance(m, nn.Conv2D): - kaiming_normal_(m.weight) - elif isinstance(m, (nn.BatchNorm2D)): - ones_(m.weight) - zeros_(m.bias) - elif isinstance(m, nn.Linear): - zeros_(m.bias) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] - - def forward(self, inputs): - x = inputs['image'] - x = self.stem(x) - outs = [] - for idx, stage in enumerate(self.stages): - x = stage(x) - if idx in self.return_idx: - outs.append(x) - return outs diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/hrnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/hrnet.py deleted file mode 100644 index 977edd6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/hrnet.py +++ /dev/null @@ -1,869 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import AdaptiveAvgPool2D, Linear -from paddle.regularizer import L2Decay -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Uniform -from numbers import Integral -import math - -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['HRNet'] - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride=1, - norm_type='bn', - norm_groups=32, - use_dcn=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=False, - act=None, - name=None): - super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn'] - - self.act = act - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=False) - - norm_lr = 0. if freeze_norm else 1. - - param_attr = ParamAttr( - learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) - bias_attr = ParamAttr( - learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) - global_stats = True if freeze_norm else None - if norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D( - ch_out, - momentum=norm_momentum, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats) - elif norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=norm_groups, - num_channels=ch_out, - weight_attr=param_attr, - bias_attr=bias_attr) - norm_params = self.norm.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - def forward(self, inputs): - out = self.conv(inputs) - out = self.norm(out) - - if self.act == 'relu': - out = F.relu(out) - return out - - -class Layer1(nn.Layer): - def __init__(self, - num_channels, - has_se=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(Layer1, self).__init__() - - self.bottleneck_block_list = [] - - for i in range(4): - bottleneck_block = self.add_sublayer( - "block_{}_{}".format(name, i + 1), - BottleneckBlock( - num_channels=num_channels if i == 0 else 256, - num_filters=64, - has_se=has_se, - stride=1, - downsample=True if i == 0 else False, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_' + str(i + 1))) - self.bottleneck_block_list.append(bottleneck_block) - - def forward(self, input): - conv = input - for block_func in self.bottleneck_block_list: - conv = block_func(conv) - return conv - - -class TransitionLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(TransitionLayer, self).__init__() - - num_in = len(in_channels) - num_out = len(out_channels) - out = [] - self.conv_bn_func_list = [] - for i in range(num_out): - residual = None - if i < num_in: - if in_channels[i] != out_channels[i]: - residual = self.add_sublayer( - "transition_{}_layer_{}".format(name, i + 1), - ConvNormLayer( - ch_in=in_channels[i], 
- ch_out=out_channels[i], - filter_size=3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name=name + '_layer_' + str(i + 1))) - else: - residual = self.add_sublayer( - "transition_{}_layer_{}".format(name, i + 1), - ConvNormLayer( - ch_in=in_channels[-1], - ch_out=out_channels[i], - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name=name + '_layer_' + str(i + 1))) - self.conv_bn_func_list.append(residual) - - def forward(self, input): - outs = [] - for idx, conv_bn_func in enumerate(self.conv_bn_func_list): - if conv_bn_func is None: - outs.append(input[idx]) - else: - if idx < len(input): - outs.append(conv_bn_func(input[idx])) - else: - outs.append(conv_bn_func(input[-1])) - return outs - - -class Branches(nn.Layer): - def __init__(self, - block_num, - in_channels, - out_channels, - has_se=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(Branches, self).__init__() - - self.basic_block_list = [] - for i in range(len(out_channels)): - self.basic_block_list.append([]) - for j in range(block_num): - in_ch = in_channels[i] if j == 0 else out_channels[i] - basic_block_func = self.add_sublayer( - "bb_{}_branch_layer_{}_{}".format(name, i + 1, j + 1), - BasicBlock( - num_channels=in_ch, - num_filters=out_channels[i], - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_branch_layer_' + str(i + 1) + '_' + - str(j + 1))) - self.basic_block_list[i].append(basic_block_func) - - def forward(self, inputs): - outs = [] - for idx, input in enumerate(inputs): - conv = input - basic_block_list = self.basic_block_list[idx] - for basic_block_func in basic_block_list: - conv = basic_block_func(conv) - outs.append(conv) - return outs - - -class BottleneckBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - has_se, - stride=1, - downsample=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(BottleneckBlock, self).__init__() - - self.has_se = has_se - self.downsample = downsample - - self.conv1 = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act="relu", - name=name + "_conv1") - self.conv2 = ConvNormLayer( - ch_in=num_filters, - ch_out=num_filters, - filter_size=3, - stride=stride, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act="relu", - name=name + "_conv2") - self.conv3 = ConvNormLayer( - ch_in=num_filters, - ch_out=num_filters * 4, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + "_conv3") - - if self.downsample: - self.conv_down = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters * 4, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + "_downsample") - - if self.has_se: - self.se = SELayer( - num_channels=num_filters * 4, - num_filters=num_filters * 4, - reduction_ratio=16, - name='fc' + name) - - def forward(self, input): - residual = input - conv1 = self.conv1(input) - conv2 = self.conv2(conv1) - conv3 = self.conv3(conv2) - - if self.downsample: - residual = self.conv_down(input) - - if self.has_se: - conv3 = self.se(conv3) - - y = paddle.add(x=residual, y=conv3) - y = F.relu(y) - return y - - -class 
BasicBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride=1, - has_se=False, - downsample=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(BasicBlock, self).__init__() - - self.has_se = has_se - self.downsample = downsample - self.conv1 = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters, - filter_size=3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - stride=stride, - act="relu", - name=name + "_conv1") - self.conv2 = ConvNormLayer( - ch_in=num_filters, - ch_out=num_filters, - filter_size=3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - stride=1, - act=None, - name=name + "_conv2") - - if self.downsample: - self.conv_down = ConvNormLayer( - ch_in=num_channels, - ch_out=num_filters * 4, - filter_size=1, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + "_downsample") - - if self.has_se: - self.se = SELayer( - num_channels=num_filters, - num_filters=num_filters, - reduction_ratio=16, - name='fc' + name) - - def forward(self, input): - residual = input - conv1 = self.conv1(input) - conv2 = self.conv2(conv1) - - if self.downsample: - residual = self.conv_down(input) - - if self.has_se: - conv2 = self.se(conv2) - - y = paddle.add(x=residual, y=conv2) - y = F.relu(y) - return y - - -class SELayer(nn.Layer): - def __init__(self, num_channels, num_filters, reduction_ratio, name=None): - super(SELayer, self).__init__() - - self.pool2d_gap = AdaptiveAvgPool2D(1) - - self._num_channels = num_channels - - med_ch = int(num_channels / reduction_ratio) - stdv = 1.0 / math.sqrt(num_channels * 1.0) - self.squeeze = Linear( - num_channels, - med_ch, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) - - stdv = 1.0 / math.sqrt(med_ch * 1.0) - self.excitation = Linear( - med_ch, - num_filters, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv))) - - def forward(self, input): - pool = self.pool2d_gap(input) - pool = paddle.squeeze(pool, axis=[2, 3]) - squeeze = self.squeeze(pool) - squeeze = F.relu(squeeze) - excitation = self.excitation(squeeze) - excitation = F.sigmoid(excitation) - excitation = paddle.unsqueeze(excitation, axis=[2, 3]) - out = input * excitation - return out - - -class Stage(nn.Layer): - def __init__(self, - num_channels, - num_modules, - num_filters, - has_se=False, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - multi_scale_output=True, - name=None): - super(Stage, self).__init__() - - self._num_modules = num_modules - self.stage_func_list = [] - for i in range(num_modules): - if i == num_modules - 1 and not multi_scale_output: - stage_func = self.add_sublayer( - "stage_{}_{}".format(name, i + 1), - HighResolutionModule( - num_channels=num_channels, - num_filters=num_filters, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - multi_scale_output=False, - name=name + '_' + str(i + 1))) - else: - stage_func = self.add_sublayer( - "stage_{}_{}".format(name, i + 1), - HighResolutionModule( - num_channels=num_channels, - num_filters=num_filters, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_' + str(i + 1))) - - self.stage_func_list.append(stage_func) - - def forward(self, input): - out = input - for idx in range(self._num_modules): - out = self.stage_func_list[idx](out) - return out - - -class HighResolutionModule(nn.Layer): - def __init__(self, - 
num_channels, - num_filters, - has_se=False, - multi_scale_output=True, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(HighResolutionModule, self).__init__() - self.branches_func = Branches( - block_num=4, - in_channels=num_channels, - out_channels=num_filters, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name) - - self.fuse_func = FuseLayers( - in_channels=num_filters, - out_channels=num_filters, - multi_scale_output=multi_scale_output, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name) - - def forward(self, input): - out = self.branches_func(input) - out = self.fuse_func(out) - return out - - -class FuseLayers(nn.Layer): - def __init__(self, - in_channels, - out_channels, - multi_scale_output=True, - norm_momentum=0.9, - norm_decay=0., - freeze_norm=True, - name=None): - super(FuseLayers, self).__init__() - - self._actual_ch = len(in_channels) if multi_scale_output else 1 - self._in_channels = in_channels - - self.residual_func_list = [] - for i in range(self._actual_ch): - for j in range(len(in_channels)): - residual_func = None - if j > i: - residual_func = self.add_sublayer( - "residual_{}_layer_{}_{}".format(name, i + 1, j + 1), - ConvNormLayer( - ch_in=in_channels[j], - ch_out=out_channels[i], - filter_size=1, - stride=1, - act=None, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + '_layer_' + str(i + 1) + '_' + - str(j + 1))) - self.residual_func_list.append(residual_func) - elif j < i: - pre_num_filters = in_channels[j] - for k in range(i - j): - if k == i - j - 1: - residual_func = self.add_sublayer( - "residual_{}_layer_{}_{}_{}".format( - name, i + 1, j + 1, k + 1), - ConvNormLayer( - ch_in=pre_num_filters, - ch_out=out_channels[i], - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act=None, - name=name + '_layer_' + str(i + 1) + '_' + - str(j + 1) + '_' + str(k + 1))) - pre_num_filters = out_channels[i] - else: - residual_func = self.add_sublayer( - "residual_{}_layer_{}_{}_{}".format( - name, i + 1, j + 1, k + 1), - ConvNormLayer( - ch_in=pre_num_filters, - ch_out=out_channels[j], - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act="relu", - name=name + '_layer_' + str(i + 1) + '_' + - str(j + 1) + '_' + str(k + 1))) - pre_num_filters = out_channels[j] - self.residual_func_list.append(residual_func) - - def forward(self, input): - outs = [] - residual_func_idx = 0 - for i in range(self._actual_ch): - residual = input[i] - for j in range(len(self._in_channels)): - if j > i: - y = self.residual_func_list[residual_func_idx](input[j]) - residual_func_idx += 1 - y = F.interpolate(y, scale_factor=2**(j - i)) - residual = paddle.add(x=residual, y=y) - elif j < i: - y = input[j] - for k in range(i - j): - y = self.residual_func_list[residual_func_idx](y) - residual_func_idx += 1 - residual = paddle.add(x=residual, y=y) - residual = F.relu(residual) - outs.append(residual) - - return outs - - -@register -class HRNet(nn.Layer): - """ - HRNet, see https://arxiv.org/abs/1908.07919 - - Args: - width (int): the width of HRNet - has_se (bool): whether to add SE block for each stage - freeze_at (int): the stage to freeze - freeze_norm (bool): whether to freeze norm in HRNet - norm_momentum (float): momentum of BatchNorm - norm_decay (float): weight decay for normalization layer 
weights - return_idx (List): the stage to return - upsample (bool): whether to upsample and concat the backbone feats - """ - - def __init__(self, - width=18, - has_se=False, - freeze_at=0, - freeze_norm=True, - norm_momentum=0.9, - norm_decay=0., - return_idx=[0, 1, 2, 3], - upsample=False, - downsample=False): - super(HRNet, self).__init__() - - self.width = width - self.has_se = has_se - if isinstance(return_idx, Integral): - return_idx = [return_idx] - - assert len(return_idx) > 0, "need one or more return index" - self.freeze_at = freeze_at - self.return_idx = return_idx - self.upsample = upsample - self.downsample = downsample - - self.channels = { - 18: [[18, 36], [18, 36, 72], [18, 36, 72, 144]], - 30: [[30, 60], [30, 60, 120], [30, 60, 120, 240]], - 32: [[32, 64], [32, 64, 128], [32, 64, 128, 256]], - 40: [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - 44: [[44, 88], [44, 88, 176], [44, 88, 176, 352]], - 48: [[48, 96], [48, 96, 192], [48, 96, 192, 384]], - 60: [[60, 120], [60, 120, 240], [60, 120, 240, 480]], - 64: [[64, 128], [64, 128, 256], [64, 128, 256, 512]] - } - - channels_2, channels_3, channels_4 = self.channels[width] - num_modules_2, num_modules_3, num_modules_4 = 1, 4, 3 - self._out_channels = [sum(channels_4)] if self.upsample else channels_4 - self._out_strides = [4] if self.upsample else [4, 8, 16, 32] - - self.conv_layer1_1 = ConvNormLayer( - ch_in=3, - ch_out=64, - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name="layer1_1") - - self.conv_layer1_2 = ConvNormLayer( - ch_in=64, - ch_out=64, - filter_size=3, - stride=2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - act='relu', - name="layer1_2") - - self.la1 = Layer1( - num_channels=64, - has_se=has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="layer2") - - self.tr1 = TransitionLayer( - in_channels=[256], - out_channels=channels_2, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="tr1") - - self.st2 = Stage( - num_channels=channels_2, - num_modules=num_modules_2, - num_filters=channels_2, - has_se=self.has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="st2") - - self.tr2 = TransitionLayer( - in_channels=channels_2, - out_channels=channels_3, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="tr2") - - self.st3 = Stage( - num_channels=channels_3, - num_modules=num_modules_3, - num_filters=channels_3, - has_se=self.has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="st3") - - self.tr3 = TransitionLayer( - in_channels=channels_3, - out_channels=channels_4, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="tr3") - self.st4 = Stage( - num_channels=channels_4, - num_modules=num_modules_4, - num_filters=channels_4, - has_se=self.has_se, - norm_momentum=norm_momentum, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - multi_scale_output=len(return_idx) > 1, - name="st4") - - if self.downsample: - self.incre_modules, self.downsamp_modules, \ - self.final_layer = self._make_head(channels_4, norm_momentum=norm_momentum, has_se=self.has_se) - - def _make_layer(self, - block, - inplanes, - planes, - blocks, - stride=1, - norm_momentum=0.9, - has_se=False, - name=None): - downsample = None - if stride != 1 or inplanes != planes * 4: - downsample = 
True - - layers = [] - layers.append( - block( - inplanes, - planes, - has_se, - stride, - downsample, - norm_momentum=norm_momentum, - freeze_norm=False, - name=name + "_s0")) - inplanes = planes * 4 - for i in range(1, blocks): - layers.append( - block( - inplanes, - planes, - has_se, - norm_momentum=norm_momentum, - freeze_norm=False, - name=name + "_s" + str(i))) - - return nn.Sequential(*layers) - - def _make_head(self, pre_stage_channels, norm_momentum=0.9, has_se=False): - head_block = BottleneckBlock - head_channels = [32, 64, 128, 256] - - # Increasing the #channels on each resolution - # from C, 2C, 4C, 8C to 128, 256, 512, 1024 - incre_modules = [] - for i, channels in enumerate(pre_stage_channels): - incre_module = self._make_layer( - head_block, - channels, - head_channels[i], - 1, - stride=1, - norm_momentum=norm_momentum, - has_se=has_se, - name='incre' + str(i)) - incre_modules.append(incre_module) - incre_modules = nn.LayerList(incre_modules) - - # downsampling modules - downsamp_modules = [] - for i in range(len(pre_stage_channels) - 1): - in_channels = head_channels[i] * 4 - out_channels = head_channels[i + 1] * 4 - - downsamp_module = nn.Sequential( - nn.Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=3, - stride=2, - padding=1), - nn.BatchNorm2D( - out_channels, momentum=norm_momentum), - nn.ReLU()) - - downsamp_modules.append(downsamp_module) - downsamp_modules = nn.LayerList(downsamp_modules) - - final_layer = nn.Sequential( - nn.Conv2D( - in_channels=head_channels[3] * 4, - out_channels=2048, - kernel_size=1, - stride=1, - padding=0), - nn.BatchNorm2D( - 2048, momentum=norm_momentum), - nn.ReLU()) - - return incre_modules, downsamp_modules, final_layer - - def forward(self, inputs): - x = inputs['image'] - conv1 = self.conv_layer1_1(x) - conv2 = self.conv_layer1_2(conv1) - - la1 = self.la1(conv2) - tr1 = self.tr1([la1]) - st2 = self.st2(tr1) - tr2 = self.tr2(st2) - - st3 = self.st3(tr2) - tr3 = self.tr3(st3) - - st4 = self.st4(tr3) - - if self.upsample: - # Upsampling - x0_h, x0_w = st4[0].shape[2:4] - x1 = F.upsample(st4[1], size=(x0_h, x0_w), mode='bilinear') - x2 = F.upsample(st4[2], size=(x0_h, x0_w), mode='bilinear') - x3 = F.upsample(st4[3], size=(x0_h, x0_w), mode='bilinear') - x = paddle.concat([st4[0], x1, x2, x3], 1) - return x - - if self.downsample: - y = self.incre_modules[0](st4[0]) - for i in range(len(self.downsamp_modules)): - y = self.incre_modules[i+1](st4[i+1]) + \ - self.downsamp_modules[i](y) - y = self.final_layer(y) - return y - - res = [] - for i, layer in enumerate(st4): - if i == self.freeze_at: - layer.stop_gradient = True - if i in self.return_idx: - res.append(layer) - - return res - - @property - def out_shape(self): - if self.upsample: - self.return_idx = [0] - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/lcnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/lcnet.py deleted file mode 100644 index 76da139..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/lcnet.py +++ /dev/null @@ -1,271 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.nn import AdaptiveAvgPool2D, Conv2D -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingNormal - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec - -__all__ = ['LCNet'] - -NET_CONFIG = { - "blocks2": - #k, in_c, out_c, s, use_se - [[3, 16, 32, 1, False], ], - "blocks3": [ - [3, 32, 64, 2, False], - [3, 64, 64, 1, False], - ], - "blocks4": [ - [3, 64, 128, 2, False], - [3, 128, 128, 1, False], - ], - "blocks5": [ - [3, 128, 256, 2, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - [5, 256, 256, 1, False], - ], - "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] -} - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - num_groups=1, - act='hard_swish'): - super().__init__() - - self.conv = Conv2D( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=num_groups, - weight_attr=ParamAttr(initializer=KaimingNormal()), - bias_attr=False) - - self.bn = nn.BatchNorm2D( - num_filters, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - if act == 'hard_swish': - self.act = nn.Hardswish() - elif act == 'relu6': - self.act = nn.ReLU6() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.act(x) - return x - - -class DepthwiseSeparable(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - dw_size=3, - use_se=False, - act='hard_swish'): - super().__init__() - self.use_se = use_se - self.dw_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_channels, - filter_size=dw_size, - stride=stride, - num_groups=num_channels, - act=act) - if use_se: - self.se = SEModule(num_channels) - self.pw_conv = ConvBNLayer( - num_channels=num_channels, - filter_size=1, - num_filters=num_filters, - stride=1, - act=act) - - def forward(self, x): - x = self.dw_conv(x) - if self.use_se: - x = self.se(x) - x = self.pw_conv(x) - return x - - -class SEModule(nn.Layer): - def __init__(self, channel, reduction=4): - super().__init__() - self.avg_pool = AdaptiveAvgPool2D(1) - self.conv1 = Conv2D( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0) - self.relu = nn.ReLU() - self.conv2 = Conv2D( - in_channels=channel // reduction, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0) - self.hardsigmoid = nn.Hardsigmoid() - - def forward(self, x): - identity = x - x = self.avg_pool(x) - x = self.conv1(x) - x = self.relu(x) - x 
= self.conv2(x) - x = self.hardsigmoid(x) - x = paddle.multiply(x=identity, y=x) - return x - - -@register -@serializable -class LCNet(nn.Layer): - def __init__(self, scale=1.0, feature_maps=[3, 4, 5], act='hard_swish'): - super().__init__() - self.scale = scale - self.feature_maps = feature_maps - - out_channels = [] - - self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - num_filters=make_divisible(16 * scale), - stride=2, - act=act) - - self.blocks2 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) - ]) - - self.blocks3 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks3"][-1][2] * scale)) - - self.blocks4 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks4"][-1][2] * scale)) - - self.blocks5 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks5"][-1][2] * scale)) - - self.blocks6 = nn.Sequential(* [ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se, - act=act) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) - ]) - - out_channels.append( - make_divisible(NET_CONFIG["blocks6"][-1][2] * scale)) - self._out_channels = [ - ch for idx, ch in enumerate(out_channels) if idx + 2 in feature_maps - ] - - def forward(self, inputs): - x = inputs['image'] - outs = [] - - x = self.conv1(x) - x = self.blocks2(x) - x = self.blocks3(x) - outs.append(x) - x = self.blocks4(x) - outs.append(x) - x = self.blocks5(x) - outs.append(x) - x = self.blocks6(x) - outs.append(x) - outs = [o for i, o in enumerate(outs) if i + 2 in self.feature_maps] - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/lite_hrnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/lite_hrnet.py deleted file mode 100644 index 95e3a26..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/lite_hrnet.py +++ /dev/null @@ -1,891 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on -https://github.com/HRNet/Lite-HRNet/blob/hrnet/models/backbones/litehrnet.py -""" - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from numbers import Integral -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Normal, Constant -from ppdet.core.workspace import register -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.modeling.ops import channel_shuffle -from .. import layers as L - -__all__ = ['LiteHRNet'] - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride=1, - groups=1, - norm_type=None, - norm_groups=32, - norm_decay=0., - freeze_norm=False, - act=None): - super(ConvNormLayer, self).__init__() - self.act = act - norm_lr = 0. if freeze_norm else 1. - if norm_type is not None: - assert norm_type in ['bn', 'sync_bn', 'gn'], \ - "norm_type should be one of ['bn', 'sync_bn', 'gn'], but got {}".format(norm_type) - param_attr = ParamAttr( - initializer=Constant(1.0), - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), ) - bias_attr = ParamAttr( - learning_rate=norm_lr, regularizer=L2Decay(norm_decay)) - global_stats = True if freeze_norm else None - if norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D( - ch_out, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats, ) - elif norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=norm_groups, - num_channels=ch_out, - weight_attr=param_attr, - bias_attr=bias_attr) - norm_params = self.norm.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - conv_bias_attr = False - else: - conv_bias_attr = True - self.norm = None - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.001)), - bias_attr=conv_bias_attr) - - def forward(self, inputs): - out = self.conv(inputs) - if self.norm is not None: - out = self.norm(out) - - if self.act == 'relu': - out = F.relu(out) - elif self.act == 'sigmoid': - out = F.sigmoid(out) - return out - - -class DepthWiseSeparableConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride=1, - dw_norm_type=None, - pw_norm_type=None, - norm_decay=0., - freeze_norm=False, - dw_act=None, - pw_act=None): - super(DepthWiseSeparableConvNormLayer, self).__init__() - self.depthwise_conv = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_in, - filter_size=filter_size, - stride=stride, - groups=ch_in, - norm_type=dw_norm_type, - act=dw_act, - norm_decay=norm_decay, - freeze_norm=freeze_norm, ) - self.pointwise_conv = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - norm_type=pw_norm_type, - act=pw_act, - norm_decay=norm_decay, - freeze_norm=freeze_norm, ) - - def forward(self, x): - x = self.depthwise_conv(x) - x = self.pointwise_conv(x) - return x - - -class CrossResolutionWeightingModule(nn.Layer): - def __init__(self, - channels, - ratio=16, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(CrossResolutionWeightingModule, self).__init__() - self.channels = channels - total_channel = sum(channels) - self.conv1 = ConvNormLayer( - ch_in=total_channel, - ch_out=total_channel // ratio, - filter_size=1, - stride=1, - norm_type=norm_type, - 
act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.conv2 = ConvNormLayer( - ch_in=total_channel // ratio, - ch_out=total_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='sigmoid', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - def forward(self, x): - mini_size = x[-1].shape[-2:] - out = [F.adaptive_avg_pool2d(s, mini_size) for s in x[:-1]] + [x[-1]] - out = paddle.concat(out, 1) - out = self.conv1(out) - out = self.conv2(out) - out = paddle.split(out, self.channels, 1) - out = [ - s * F.interpolate( - a, s.shape[-2:], mode='nearest') for s, a in zip(x, out) - ] - return out - - -class SpatialWeightingModule(nn.Layer): - def __init__(self, in_channel, ratio=16, freeze_norm=False, norm_decay=0.): - super(SpatialWeightingModule, self).__init__() - self.global_avgpooling = nn.AdaptiveAvgPool2D(1) - self.conv1 = ConvNormLayer( - ch_in=in_channel, - ch_out=in_channel // ratio, - filter_size=1, - stride=1, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.conv2 = ConvNormLayer( - ch_in=in_channel // ratio, - ch_out=in_channel, - filter_size=1, - stride=1, - act='sigmoid', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - def forward(self, x): - out = self.global_avgpooling(x) - out = self.conv1(out) - out = self.conv2(out) - return x * out - - -class ConditionalChannelWeightingBlock(nn.Layer): - def __init__(self, - in_channels, - stride, - reduce_ratio, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(ConditionalChannelWeightingBlock, self).__init__() - assert stride in [1, 2] - branch_channels = [channel // 2 for channel in in_channels] - - self.cross_resolution_weighting = CrossResolutionWeightingModule( - branch_channels, - ratio=reduce_ratio, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.depthwise_convs = nn.LayerList([ - ConvNormLayer( - channel, - channel, - filter_size=3, - stride=stride, - groups=channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay) for channel in branch_channels - ]) - - self.spatial_weighting = nn.LayerList([ - SpatialWeightingModule( - channel, - ratio=4, - freeze_norm=freeze_norm, - norm_decay=norm_decay) for channel in branch_channels - ]) - - def forward(self, x): - x = [s.chunk(2, axis=1) for s in x] - x1 = [s[0] for s in x] - x2 = [s[1] for s in x] - - x2 = self.cross_resolution_weighting(x2) - x2 = [dw(s) for s, dw in zip(x2, self.depthwise_convs)] - x2 = [sw(s) for s, sw in zip(x2, self.spatial_weighting)] - - out = [paddle.concat([s1, s2], axis=1) for s1, s2 in zip(x1, x2)] - out = [channel_shuffle(s, groups=2) for s in out] - return out - - -class ShuffleUnit(nn.Layer): - def __init__(self, - in_channel, - out_channel, - stride, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(ShuffleUnit, self).__init__() - branch_channel = out_channel // 2 - self.stride = stride - if self.stride == 1: - assert in_channel == branch_channel * 2, \ - "when stride=1, in_channel {} should equal to branch_channel*2 {}".format(in_channel, branch_channel * 2) - if stride > 1: - self.branch1 = nn.Sequential( - ConvNormLayer( - ch_in=in_channel, - ch_out=in_channel, - filter_size=3, - stride=self.stride, - groups=in_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=in_channel, - ch_out=branch_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), ) - self.branch2 = nn.Sequential( - 
ConvNormLayer( - ch_in=branch_channel if stride == 1 else in_channel, - ch_out=branch_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=branch_channel, - ch_out=branch_channel, - filter_size=3, - stride=self.stride, - groups=branch_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=branch_channel, - ch_out=branch_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), ) - - def forward(self, x): - if self.stride > 1: - x1 = self.branch1(x) - x2 = self.branch2(x) - else: - x1, x2 = x.chunk(2, axis=1) - x2 = self.branch2(x2) - out = paddle.concat([x1, x2], axis=1) - out = channel_shuffle(out, groups=2) - return out - - -class IterativeHead(nn.Layer): - def __init__(self, - in_channels, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(IterativeHead, self).__init__() - num_branches = len(in_channels) - self.in_channels = in_channels[::-1] - - projects = [] - for i in range(num_branches): - if i != num_branches - 1: - projects.append( - DepthWiseSeparableConvNormLayer( - ch_in=self.in_channels[i], - ch_out=self.in_channels[i + 1], - filter_size=3, - stride=1, - dw_act=None, - pw_act='relu', - dw_norm_type=norm_type, - pw_norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - else: - projects.append( - DepthWiseSeparableConvNormLayer( - ch_in=self.in_channels[i], - ch_out=self.in_channels[i], - filter_size=3, - stride=1, - dw_act=None, - pw_act='relu', - dw_norm_type=norm_type, - pw_norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - self.projects = nn.LayerList(projects) - - def forward(self, x): - x = x[::-1] - y = [] - last_x = None - for i, s in enumerate(x): - if last_x is not None: - last_x = F.interpolate( - last_x, - size=s.shape[-2:], - mode='bilinear', - align_corners=True) - s = s + last_x - s = self.projects[i](s) - y.append(s) - last_x = s - - return y[::-1] - - -class Stem(nn.Layer): - def __init__(self, - in_channel, - stem_channel, - out_channel, - expand_ratio, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(Stem, self).__init__() - self.conv1 = ConvNormLayer( - in_channel, - stem_channel, - filter_size=3, - stride=2, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - mid_channel = int(round(stem_channel * expand_ratio)) - branch_channel = stem_channel // 2 - if stem_channel == out_channel: - inc_channel = out_channel - branch_channel - else: - inc_channel = out_channel - stem_channel - self.branch1 = nn.Sequential( - ConvNormLayer( - ch_in=branch_channel, - ch_out=branch_channel, - filter_size=3, - stride=2, - groups=branch_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay), - ConvNormLayer( - ch_in=branch_channel, - ch_out=inc_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay), ) - self.expand_conv = ConvNormLayer( - ch_in=branch_channel, - ch_out=mid_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.depthwise_conv = ConvNormLayer( - ch_in=mid_channel, - ch_out=mid_channel, - filter_size=3, - stride=2, - groups=mid_channel, - norm_type=norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - self.linear_conv = ConvNormLayer( - 
ch_in=mid_channel, - ch_out=branch_channel - if stem_channel == out_channel else stem_channel, - filter_size=1, - stride=1, - norm_type=norm_type, - act='relu', - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - def forward(self, x): - x = self.conv1(x) - x1, x2 = x.chunk(2, axis=1) - x1 = self.branch1(x1) - x2 = self.expand_conv(x2) - x2 = self.depthwise_conv(x2) - x2 = self.linear_conv(x2) - out = paddle.concat([x1, x2], axis=1) - out = channel_shuffle(out, groups=2) - - return out - - -class LiteHRNetModule(nn.Layer): - def __init__(self, - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=False, - with_fuse=True, - norm_type='bn', - freeze_norm=False, - norm_decay=0.): - super(LiteHRNetModule, self).__init__() - assert num_branches == len(in_channels),\ - "num_branches {} should equal to num_in_channels {}".format(num_branches, len(in_channels)) - assert module_type in [ - 'LITE', 'NAIVE' - ], "module_type should be one of ['LITE', 'NAIVE']" - self.num_branches = num_branches - self.in_channels = in_channels - self.multiscale_output = multiscale_output - self.with_fuse = with_fuse - self.norm_type = 'bn' - self.module_type = module_type - - if self.module_type == 'LITE': - self.layers = self._make_weighting_blocks( - num_blocks, - reduce_ratio, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - elif self.module_type == 'NAIVE': - self.layers = self._make_naive_branches( - num_branches, - num_blocks, - freeze_norm=freeze_norm, - norm_decay=norm_decay) - - if self.with_fuse: - self.fuse_layers = self._make_fuse_layers( - freeze_norm=freeze_norm, norm_decay=norm_decay) - self.relu = nn.ReLU() - - def _make_weighting_blocks(self, - num_blocks, - reduce_ratio, - stride=1, - freeze_norm=False, - norm_decay=0.): - layers = [] - for i in range(num_blocks): - layers.append( - ConditionalChannelWeightingBlock( - self.in_channels, - stride=stride, - reduce_ratio=reduce_ratio, - norm_type=self.norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - return nn.Sequential(*layers) - - def _make_naive_branches(self, - num_branches, - num_blocks, - freeze_norm=False, - norm_decay=0.): - branches = [] - for branch_idx in range(num_branches): - layers = [] - for i in range(num_blocks): - layers.append( - ShuffleUnit( - self.in_channels[branch_idx], - self.in_channels[branch_idx], - stride=1, - norm_type=self.norm_type, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - branches.append(nn.Sequential(*layers)) - return nn.LayerList(branches) - - def _make_fuse_layers(self, freeze_norm=False, norm_decay=0.): - if self.num_branches == 1: - return None - fuse_layers = [] - num_out_branches = self.num_branches if self.multiscale_output else 1 - for i in range(num_out_branches): - fuse_layer = [] - for j in range(self.num_branches): - if j > i: - fuse_layer.append( - nn.Sequential( - L.Conv2d( - self.in_channels[j], - self.in_channels[i], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(self.in_channels[i]), - nn.Upsample( - scale_factor=2**(j - i), mode='nearest'))) - elif j == i: - fuse_layer.append(None) - else: - conv_downsamples = [] - for k in range(i - j): - if k == i - j - 1: - conv_downsamples.append( - nn.Sequential( - L.Conv2d( - self.in_channels[j], - self.in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=self.in_channels[j], - bias=False, ), - nn.BatchNorm2D(self.in_channels[j]), - L.Conv2d( - self.in_channels[j], - self.in_channels[i], - kernel_size=1, - stride=1, - padding=0, - 
bias=False, ), - nn.BatchNorm2D(self.in_channels[i]))) - else: - conv_downsamples.append( - nn.Sequential( - L.Conv2d( - self.in_channels[j], - self.in_channels[j], - kernel_size=3, - stride=2, - padding=1, - groups=self.in_channels[j], - bias=False, ), - nn.BatchNorm2D(self.in_channels[j]), - L.Conv2d( - self.in_channels[j], - self.in_channels[j], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(self.in_channels[j]), - nn.ReLU())) - - fuse_layer.append(nn.Sequential(*conv_downsamples)) - fuse_layers.append(nn.LayerList(fuse_layer)) - - return nn.LayerList(fuse_layers) - - def forward(self, x): - if self.num_branches == 1: - return [self.layers[0](x[0])] - if self.module_type == 'LITE': - out = self.layers(x) - elif self.module_type == 'NAIVE': - for i in range(self.num_branches): - x[i] = self.layers[i](x[i]) - out = x - if self.with_fuse: - out_fuse = [] - for i in range(len(self.fuse_layers)): - y = out[0] if i == 0 else self.fuse_layers[i][0](out[0]) - for j in range(self.num_branches): - if j == 0: - y += y - elif i == j: - y += out[j] - else: - y += self.fuse_layers[i][j](out[j]) - if i == 0: - out[i] = y - out_fuse.append(self.relu(y)) - out = out_fuse - elif not self.multiscale_output: - out = [out[0]] - return out - - -@register -class LiteHRNet(nn.Layer): - """ - @inproceedings{Yulitehrnet21, - title={Lite-HRNet: A Lightweight High-Resolution Network}, - author={Yu, Changqian and Xiao, Bin and Gao, Changxin and Yuan, Lu and Zhang, Lei and Sang, Nong and Wang, Jingdong}, - booktitle={CVPR},year={2021} - } - Args: - network_type (str): the network_type should be one of ["lite_18", "lite_30", "naive", "wider_naive"], - "naive": Simply combining the shuffle block in ShuffleNet and the highresolution design pattern in HRNet. - "wider_naive": Naive network with wider channels in each block. - "lite_18": Lite-HRNet-18, which replaces the pointwise convolution in a shuffle block by conditional channel weighting. - "lite_30": Lite-HRNet-30, with more blocks compared with Lite-HRNet-18. 
- freeze_at (int): the stage to freeze - freeze_norm (bool): whether to freeze norm in HRNet - norm_decay (float): weight decay for normalization layer weights - return_idx (List): the stage to return - """ - - def __init__(self, - network_type, - freeze_at=0, - freeze_norm=True, - norm_decay=0., - return_idx=[0, 1, 2, 3]): - super(LiteHRNet, self).__init__() - if isinstance(return_idx, Integral): - return_idx = [return_idx] - assert network_type in ["lite_18", "lite_30", "naive", "wider_naive"], \ - "the network_type should be one of [lite_18, lite_30, naive, wider_naive]" - assert len(return_idx) > 0, "need one or more return index" - self.freeze_at = freeze_at - self.freeze_norm = freeze_norm - self.norm_decay = norm_decay - self.return_idx = return_idx - self.norm_type = 'bn' - - self.module_configs = { - "lite_18": { - "num_modules": [2, 4, 2], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["LITE", "LITE", "LITE"], - "reduce_ratios": [8, 8, 8], - "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - }, - "lite_30": { - "num_modules": [3, 8, 3], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["LITE", "LITE", "LITE"], - "reduce_ratios": [8, 8, 8], - "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - }, - "naive": { - "num_modules": [2, 4, 2], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["NAIVE", "NAIVE", "NAIVE"], - "reduce_ratios": [1, 1, 1], - "num_channels": [[30, 60], [30, 60, 120], [30, 60, 120, 240]], - }, - "wider_naive": { - "num_modules": [2, 4, 2], - "num_branches": [2, 3, 4], - "num_blocks": [2, 2, 2], - "module_type": ["NAIVE", "NAIVE", "NAIVE"], - "reduce_ratios": [1, 1, 1], - "num_channels": [[40, 80], [40, 80, 160], [40, 80, 160, 320]], - }, - } - - self.stages_config = self.module_configs[network_type] - - self.stem = Stem(3, 32, 32, 1) - num_channels_pre_layer = [32] - for stage_idx in range(3): - num_channels = self.stages_config["num_channels"][stage_idx] - setattr(self, 'transition{}'.format(stage_idx), - self._make_transition_layer(num_channels_pre_layer, - num_channels, self.freeze_norm, - self.norm_decay)) - stage, num_channels_pre_layer = self._make_stage( - self.stages_config, stage_idx, num_channels, True, - self.freeze_norm, self.norm_decay) - setattr(self, 'stage{}'.format(stage_idx), stage) - self.head_layer = IterativeHead(num_channels_pre_layer, 'bn', - self.freeze_norm, self.norm_decay) - - def _make_transition_layer(self, - num_channels_pre_layer, - num_channels_cur_layer, - freeze_norm=False, - norm_decay=0.): - num_branches_pre = len(num_channels_pre_layer) - num_branches_cur = len(num_channels_cur_layer) - transition_layers = [] - for i in range(num_branches_cur): - if i < num_branches_pre: - if num_channels_cur_layer[i] != num_channels_pre_layer[i]: - transition_layers.append( - nn.Sequential( - L.Conv2d( - num_channels_pre_layer[i], - num_channels_pre_layer[i], - kernel_size=3, - stride=1, - padding=1, - groups=num_channels_pre_layer[i], - bias=False), - nn.BatchNorm2D(num_channels_pre_layer[i]), - L.Conv2d( - num_channels_pre_layer[i], - num_channels_cur_layer[i], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(num_channels_cur_layer[i]), - nn.ReLU())) - else: - transition_layers.append(None) - else: - conv_downsamples = [] - for j in range(i + 1 - num_branches_pre): - conv_downsamples.append( - nn.Sequential( - L.Conv2d( - num_channels_pre_layer[-1], - num_channels_pre_layer[-1], - 
groups=num_channels_pre_layer[-1], - kernel_size=3, - stride=2, - padding=1, - bias=False, ), - nn.BatchNorm2D(num_channels_pre_layer[-1]), - L.Conv2d( - num_channels_pre_layer[-1], - num_channels_cur_layer[i] - if j == i - num_branches_pre else - num_channels_pre_layer[-1], - kernel_size=1, - stride=1, - padding=0, - bias=False, ), - nn.BatchNorm2D(num_channels_cur_layer[i] - if j == i - num_branches_pre else - num_channels_pre_layer[-1]), - nn.ReLU())) - transition_layers.append(nn.Sequential(*conv_downsamples)) - return nn.LayerList(transition_layers) - - def _make_stage(self, - stages_config, - stage_idx, - in_channels, - multiscale_output, - freeze_norm=False, - norm_decay=0.): - num_modules = stages_config["num_modules"][stage_idx] - num_branches = stages_config["num_branches"][stage_idx] - num_blocks = stages_config["num_blocks"][stage_idx] - reduce_ratio = stages_config['reduce_ratios'][stage_idx] - module_type = stages_config['module_type'][stage_idx] - - modules = [] - for i in range(num_modules): - if not multiscale_output and i == num_modules - 1: - reset_multiscale_output = False - else: - reset_multiscale_output = True - modules.append( - LiteHRNetModule( - num_branches, - num_blocks, - in_channels, - reduce_ratio, - module_type, - multiscale_output=reset_multiscale_output, - with_fuse=True, - freeze_norm=freeze_norm, - norm_decay=norm_decay)) - in_channels = modules[-1].in_channels - return nn.Sequential(*modules), in_channels - - def forward(self, inputs): - x = inputs['image'] - dims = x.shape - if len(dims) == 5: - x = paddle.reshape(x, (dims[0] * dims[1], dims[2], dims[3], - dims[4])) # [6, 3, 128, 96] - - x = self.stem(x) - y_list = [x] - for stage_idx in range(3): - x_list = [] - transition = getattr(self, 'transition{}'.format(stage_idx)) - for j in range(self.stages_config["num_branches"][stage_idx]): - if transition[j] is not None: - if j >= len(y_list): - x_list.append(transition[j](y_list[-1])) - else: - x_list.append(transition[j](y_list[j])) - else: - x_list.append(y_list[j]) - y_list = getattr(self, 'stage{}'.format(stage_idx))(x_list) - x = self.head_layer(y_list) - res = [] - for i, layer in enumerate(x): - if i == self.freeze_at: - layer.stop_gradient = True - if i in self.return_idx: - res.append(layer) - return res - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], stride=self._out_strides[i]) - for i in self.return_idx - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v1.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v1.py deleted file mode 100644 index a39435b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v1.py +++ /dev/null @@ -1,402 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
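The `ShuffleUnit`, `Stem`, and `ConditionalChannelWeightingBlock` layers removed from lite_hrnet.py above all finish by remixing their two concatenated branches with `channel_shuffle(out, groups=2)`. That helper is defined earlier in the deleted file; for reference, a minimal standalone sketch of the standard ShuffleNet-style formulation it follows (assuming paddle 2.x dygraph):

```python
import paddle

def channel_shuffle(x, groups):
    # [N, C, H, W] -> [N, groups, C // groups, H, W]
    n, c, h, w = x.shape
    x = paddle.reshape(x, [n, groups, c // groups, h, w])
    # swap the group axis with the per-group channel axis, then flatten back
    x = paddle.transpose(x, [0, 2, 1, 3, 4])
    return paddle.reshape(x, [n, c, h, w])

# Two concatenated 4-channel branches: shuffling with groups=2 interleaves
# them, so the next grouped/depthwise conv sees information from both.
x = paddle.concat(
    [paddle.zeros([1, 4, 8, 8]), paddle.ones([1, 4, 8, 8])], axis=1)
print(channel_shuffle(x, 2)[0, :, 0, 0].numpy())  # [0. 1. 0. 1. 0. 1. 0. 1.]
```

The shuffle costs no parameters and no FLOPs beyond a reshape/transpose, which is why every split-and-concat block in the file can afford to apply it.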
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingNormal -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec - -__all__ = ['MobileNet'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - num_groups=1, - act='relu', - conv_lr=1., - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=conv_lr, - initializer=KaimingNormal(), - regularizer=L2Decay(conv_decay)), - bias_attr=False) - - param_attr = ParamAttr(regularizer=L2Decay(norm_decay)) - bias_attr = ParamAttr(regularizer=L2Decay(norm_decay)) - if norm_type in ['sync_bn', 'bn']: - self._batch_norm = nn.BatchNorm2D( - out_channels, weight_attr=param_attr, bias_attr=bias_attr) - - def forward(self, x): - x = self._conv(x) - x = self._batch_norm(x) - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - return x - - -class DepthwiseSeparable(nn.Layer): - def __init__(self, - in_channels, - out_channels1, - out_channels2, - num_groups, - stride, - scale, - conv_lr=1., - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(DepthwiseSeparable, self).__init__() - - self._depthwise_conv = ConvBNLayer( - in_channels, - int(out_channels1 * scale), - kernel_size=3, - stride=stride, - padding=1, - num_groups=int(num_groups * scale), - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_dw") - - self._pointwise_conv = ConvBNLayer( - int(out_channels1 * scale), - int(out_channels2 * scale), - kernel_size=1, - stride=1, - padding=0, - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_sep") - - def forward(self, x): - x = self._depthwise_conv(x) - x = self._pointwise_conv(x) - return x - - -class ExtraBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels1, - out_channels2, - num_groups=1, - stride=2, - conv_lr=1., - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ExtraBlock, self).__init__() - - self.pointwise_conv = ConvBNLayer( - in_channels, - int(out_channels1), - kernel_size=1, - stride=1, - padding=0, - num_groups=int(num_groups), - act='relu6', - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_extra1") - - self.normal_conv = ConvBNLayer( - int(out_channels1), - int(out_channels2), - kernel_size=3, - stride=stride, - padding=1, - num_groups=int(num_groups), - act='relu6', - conv_lr=conv_lr, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name=name + "_extra2") - - def forward(self, x): - x = self.pointwise_conv(x) - x = self.normal_conv(x) - return x - - -@register -@serializable -class MobileNet(nn.Layer): - __shared__ = ['norm_type'] - - def __init__(self, - norm_type='bn', - norm_decay=0., - conv_decay=0., - scale=1, - conv_learning_rate=1.0, - feature_maps=[4, 6, 13], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], 
- [64, 128]]): - super(MobileNet, self).__init__() - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - self.feature_maps = feature_maps - self.with_extra_blocks = with_extra_blocks - self.extra_block_filters = extra_block_filters - - self._out_channels = [] - - self.conv1 = ConvBNLayer( - in_channels=3, - out_channels=int(32 * scale), - kernel_size=3, - stride=2, - padding=1, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv1") - - self.dwsl = [] - dws21 = self.add_sublayer( - "conv2_1", - sublayer=DepthwiseSeparable( - in_channels=int(32 * scale), - out_channels1=32, - out_channels2=64, - num_groups=32, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv2_1")) - self.dwsl.append(dws21) - self._update_out_channels(int(64 * scale), len(self.dwsl), feature_maps) - dws22 = self.add_sublayer( - "conv2_2", - sublayer=DepthwiseSeparable( - in_channels=int(64 * scale), - out_channels1=64, - out_channels2=128, - num_groups=64, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv2_2")) - self.dwsl.append(dws22) - self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) - # 1/4 - dws31 = self.add_sublayer( - "conv3_1", - sublayer=DepthwiseSeparable( - in_channels=int(128 * scale), - out_channels1=128, - out_channels2=128, - num_groups=128, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv3_1")) - self.dwsl.append(dws31) - self._update_out_channels(int(128 * scale), len(self.dwsl), feature_maps) - dws32 = self.add_sublayer( - "conv3_2", - sublayer=DepthwiseSeparable( - in_channels=int(128 * scale), - out_channels1=128, - out_channels2=256, - num_groups=128, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv3_2")) - self.dwsl.append(dws32) - self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) - # 1/8 - dws41 = self.add_sublayer( - "conv4_1", - sublayer=DepthwiseSeparable( - in_channels=int(256 * scale), - out_channels1=256, - out_channels2=256, - num_groups=256, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv4_1")) - self.dwsl.append(dws41) - self._update_out_channels(int(256 * scale), len(self.dwsl), feature_maps) - dws42 = self.add_sublayer( - "conv4_2", - sublayer=DepthwiseSeparable( - in_channels=int(256 * scale), - out_channels1=256, - out_channels2=512, - num_groups=256, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv4_2")) - self.dwsl.append(dws42) - self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) - # 1/16 - for i in range(5): - tmp = self.add_sublayer( - "conv5_" + str(i + 1), - sublayer=DepthwiseSeparable( - in_channels=int(512 * scale), - out_channels1=512, - out_channels2=512, - num_groups=512, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv5_" + str(i + 1))) - self.dwsl.append(tmp) - self._update_out_channels(int(512 * scale), len(self.dwsl), feature_maps) - dws56 = self.add_sublayer( - "conv5_6", - 
sublayer=DepthwiseSeparable( - in_channels=int(512 * scale), - out_channels1=512, - out_channels2=1024, - num_groups=512, - stride=2, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv5_6")) - self.dwsl.append(dws56) - self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) - # 1/32 - dws6 = self.add_sublayer( - "conv6", - sublayer=DepthwiseSeparable( - in_channels=int(1024 * scale), - out_channels1=1024, - out_channels2=1024, - num_groups=1024, - stride=1, - scale=scale, - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv6")) - self.dwsl.append(dws6) - self._update_out_channels(int(1024 * scale), len(self.dwsl), feature_maps) - - if self.with_extra_blocks: - self.extra_blocks = [] - for i, block_filter in enumerate(self.extra_block_filters): - in_c = 1024 if i == 0 else self.extra_block_filters[i - 1][1] - conv_extra = self.add_sublayer( - "conv7_" + str(i + 1), - sublayer=ExtraBlock( - in_c, - block_filter[0], - block_filter[1], - conv_lr=conv_learning_rate, - conv_decay=conv_decay, - norm_decay=norm_decay, - norm_type=norm_type, - name="conv7_" + str(i + 1))) - self.extra_blocks.append(conv_extra) - self._update_out_channels( - block_filter[1], - len(self.dwsl) + len(self.extra_blocks), feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - outs = [] - y = self.conv1(inputs['image']) - for i, block in enumerate(self.dwsl): - y = block(y) - if i + 1 in self.feature_maps: - outs.append(y) - - if not self.with_extra_blocks: - return outs - - y = outs[-1] - for i, block in enumerate(self.extra_blocks): - idx = i + len(self.dwsl) - y = block(y) - if idx + 1 in self.feature_maps: - outs.append(y) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v3.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v3.py deleted file mode 100644 index 2bd8856..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobilenet_v3.py +++ /dev/null @@ -1,478 +0,0 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
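The `DepthwiseSeparable` block deleted from mobilenet_v1.py above is the MobileNet v1 primitive: a 3x3 depthwise `ConvBNLayer` (groups equal to channels) followed by a 1x1 pointwise one. A standalone sketch of the parameter saving against a dense 3x3 convolution (`n_params` is an ad-hoc helper here, not part of the deleted code):

```python
import math
import paddle.nn as nn

cin, cout, k = 256, 512, 3

dense = nn.Conv2D(cin, cout, k, padding=1, bias_attr=False)
separable = nn.Sequential(
    nn.Conv2D(cin, cin, k, padding=1, groups=cin, bias_attr=False),  # depthwise
    nn.Conv2D(cin, cout, 1, bias_attr=False),                        # pointwise
)

def n_params(layer):
    return sum(math.prod(p.shape) for p in layer.parameters())

print(n_params(dense))      # 1179648 = 256 * 512 * 3 * 3
print(n_params(separable))  # 133376  = 256 * 3 * 3 + 256 * 512
```

At these widths the factorization is roughly a 9x reduction in weights (and similarly in FLOPs), which is what lets the deleted backbone stack thirteen such blocks from conv2_1 through conv6 and stay lightweight.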
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec - -__all__ = ['MobileNetV3'] - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_c, - out_c, - filter_size, - stride, - padding, - num_groups=1, - act=None, - lr_mult=1., - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=""): - super(ConvBNLayer, self).__init__() - self.act = act - self.conv = nn.Conv2D( - in_channels=in_c, - out_channels=out_c, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), - bias_attr=False) - - norm_lr = 0. if freeze_norm else lr_mult - param_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - global_stats = True if freeze_norm else None - if norm_type in ['sync_bn', 'bn']: - self.bn = nn.BatchNorm2D( - out_c, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats) - norm_params = self.bn.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - if self.act is not None: - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - elif self.act == "hard_swish": - x = F.hardswish(x) - else: - raise NotImplementedError( - "The activation function is selected incorrectly.") - return x - - -class ResidualUnit(nn.Layer): - def __init__(self, - in_c, - mid_c, - out_c, - filter_size, - stride, - use_se, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - act=None, - return_list=False, - name=''): - super(ResidualUnit, self).__init__() - self.if_shortcut = stride == 1 and in_c == out_c - self.use_se = use_se - self.return_list = return_list - - self.expand_conv = ConvBNLayer( - in_c=in_c, - out_c=mid_c, - filter_size=1, - stride=1, - padding=0, - act=act, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_expand") - self.bottleneck_conv = ConvBNLayer( - in_c=mid_c, - out_c=mid_c, - filter_size=filter_size, - stride=stride, - padding=int((filter_size - 1) // 2), - num_groups=mid_c, - act=act, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_depthwise") - if self.use_se: - self.mid_se = SEModule( - mid_c, lr_mult, conv_decay, name=name + "_se") - self.linear_conv = ConvBNLayer( - in_c=mid_c, - out_c=out_c, - filter_size=1, - stride=1, - padding=0, - act=None, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_linear") - - def forward(self, inputs): - y = self.expand_conv(inputs) - x = 
self.bottleneck_conv(y) - if self.use_se: - x = self.mid_se(x) - x = self.linear_conv(x) - if self.if_shortcut: - x = paddle.add(inputs, x) - if self.return_list: - return [y, x] - else: - return x - - -class SEModule(nn.Layer): - def __init__(self, channel, lr_mult, conv_decay, reduction=4, name=""): - super(SEModule, self).__init__() - self.avg_pool = nn.AdaptiveAvgPool2D(1) - mid_channels = int(channel // reduction) - self.conv1 = nn.Conv2D( - in_channels=channel, - out_channels=mid_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), - bias_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) - self.conv2 = nn.Conv2D( - in_channels=mid_channels, - out_channels=channel, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay)), - bias_attr=ParamAttr( - learning_rate=lr_mult, regularizer=L2Decay(conv_decay))) - - def forward(self, inputs): - outputs = self.avg_pool(inputs) - outputs = self.conv1(outputs) - outputs = F.relu(outputs) - outputs = self.conv2(outputs) - outputs = F.hardsigmoid(outputs, slope=0.2, offset=0.5) - return paddle.multiply(x=inputs, y=outputs) - - -class ExtraBlockDW(nn.Layer): - def __init__(self, - in_c, - ch_1, - ch_2, - stride, - lr_mult, - conv_decay=0., - norm_type='bn', - norm_decay=0., - freeze_norm=False, - name=None): - super(ExtraBlockDW, self).__init__() - self.pointwise_conv = ConvBNLayer( - in_c=in_c, - out_c=ch_1, - filter_size=1, - stride=1, - padding='SAME', - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra1") - self.depthwise_conv = ConvBNLayer( - in_c=ch_1, - out_c=ch_2, - filter_size=3, - stride=stride, - padding='SAME', - num_groups=int(ch_1), - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_dw") - self.normal_conv = ConvBNLayer( - in_c=ch_2, - out_c=ch_2, - filter_size=1, - stride=1, - padding='SAME', - act='relu6', - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name=name + "_extra2_sep") - - def forward(self, inputs): - x = self.pointwise_conv(inputs) - x = self.depthwise_conv(x) - x = self.normal_conv(x) - return x - - -@register -@serializable -class MobileNetV3(nn.Layer): - __shared__ = ['norm_type'] - - def __init__( - self, - scale=1.0, - model_name="large", - feature_maps=[6, 12, 15], - with_extra_blocks=False, - extra_block_filters=[[256, 512], [128, 256], [128, 256], [64, 128]], - lr_mult_list=[1.0, 1.0, 1.0, 1.0, 1.0], - conv_decay=0.0, - multiplier=1.0, - norm_type='bn', - norm_decay=0.0, - freeze_norm=False): - super(MobileNetV3, self).__init__() - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - if norm_type == 'sync_bn' and freeze_norm: - raise ValueError( - "The norm_type should not be sync_bn when freeze_norm is True") - self.feature_maps = feature_maps - self.with_extra_blocks = with_extra_blocks - self.extra_block_filters = extra_block_filters - - inplanes = 16 - if model_name == "large": - self.cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, False, "relu", 1], - [3, 64, 24, False, "relu", 2], - [3, 72, 24, False, "relu", 1], - [5, 72, 40, True, "relu", 2], # RCNN output - [5, 120, 40, True, "relu", 1], - [5, 120, 40, True, "relu", 1], # YOLOv3 output 
- [3, 240, 80, False, "hard_swish", 2], # RCNN output - [3, 200, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 184, 80, False, "hard_swish", 1], - [3, 480, 112, True, "hard_swish", 1], - [3, 672, 112, True, "hard_swish", 1], # YOLOv3 output - [5, 672, 160, True, "hard_swish", 2], # SSD/SSDLite/RCNN output - [5, 960, 160, True, "hard_swish", 1], - [5, 960, 160, True, "hard_swish", 1], # YOLOv3 output - ] - elif model_name == "small": - self.cfg = [ - # k, exp, c, se, nl, s, - [3, 16, 16, True, "relu", 2], - [3, 72, 24, False, "relu", 2], # RCNN output - [3, 88, 24, False, "relu", 1], # YOLOv3 output - [5, 96, 40, True, "hard_swish", 2], # RCNN output - [5, 240, 40, True, "hard_swish", 1], - [5, 240, 40, True, "hard_swish", 1], - [5, 120, 48, True, "hard_swish", 1], - [5, 144, 48, True, "hard_swish", 1], # YOLOv3 output - [5, 288, 96, True, "hard_swish", 2], # SSD/SSDLite/RCNN output - [5, 576, 96, True, "hard_swish", 1], - [5, 576, 96, True, "hard_swish", 1], # YOLOv3 output - ] - else: - raise NotImplementedError( - "mode[{}_model] is not implemented!".format(model_name)) - - if multiplier != 1.0: - self.cfg[-3][2] = int(self.cfg[-3][2] * multiplier) - self.cfg[-2][1] = int(self.cfg[-2][1] * multiplier) - self.cfg[-2][2] = int(self.cfg[-2][2] * multiplier) - self.cfg[-1][1] = int(self.cfg[-1][1] * multiplier) - self.cfg[-1][2] = int(self.cfg[-1][2] * multiplier) - - self.conv1 = ConvBNLayer( - in_c=3, - out_c=make_divisible(inplanes * scale), - filter_size=3, - stride=2, - padding=1, - num_groups=1, - act="hard_swish", - lr_mult=lr_mult_list[0], - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv1") - - self._out_channels = [] - self.block_list = [] - i = 0 - inplanes = make_divisible(inplanes * scale) - for (k, exp, c, se, nl, s) in self.cfg: - lr_idx = min(i // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - # for SSD/SSDLite, first head input is after ResidualUnit expand_conv - return_list = self.with_extra_blocks and i + 2 in self.feature_maps - - block = self.add_sublayer( - "conv" + str(i + 2), - sublayer=ResidualUnit( - in_c=inplanes, - mid_c=make_divisible(scale * exp), - out_c=make_divisible(scale * c), - filter_size=k, - stride=s, - use_se=se, - act=nl, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - return_list=return_list, - name="conv" + str(i + 2))) - self.block_list.append(block) - inplanes = make_divisible(scale * c) - i += 1 - self._update_out_channels( - make_divisible(scale * exp) - if return_list else inplanes, i + 1, feature_maps) - - if self.with_extra_blocks: - self.extra_block_list = [] - extra_out_c = make_divisible(scale * self.cfg[-1][1]) - lr_idx = min(i // 3, len(lr_mult_list) - 1) - lr_mult = lr_mult_list[lr_idx] - - conv_extra = self.add_sublayer( - "conv" + str(i + 2), - sublayer=ConvBNLayer( - in_c=inplanes, - out_c=extra_out_c, - filter_size=1, - stride=1, - padding=0, - num_groups=1, - act="hard_swish", - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name="conv" + str(i + 2))) - self.extra_block_list.append(conv_extra) - i += 1 - self._update_out_channels(extra_out_c, i + 1, feature_maps) - - for j, block_filter in enumerate(self.extra_block_filters): - in_c = extra_out_c if j == 0 else self.extra_block_filters[j - - 1][1] - conv_extra = self.add_sublayer( - "conv" + str(i + 2), - sublayer=ExtraBlockDW( - 
in_c, - block_filter[0], - block_filter[1], - stride=2, - lr_mult=lr_mult, - conv_decay=conv_decay, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - name='conv' + str(i + 2))) - self.extra_block_list.append(conv_extra) - i += 1 - self._update_out_channels(block_filter[1], i + 1, feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - x = self.conv1(inputs['image']) - outs = [] - for idx, block in enumerate(self.block_list): - x = block(x) - if idx + 2 in self.feature_maps: - if isinstance(x, list): - outs.append(x[0]) - x = x[1] - else: - outs.append(x) - - if not self.with_extra_blocks: - return outs - - for i, block in enumerate(self.extra_block_list): - idx = i + len(self.block_list) - x = block(x) - if idx + 2 in self.feature_maps: - outs.append(x) - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobileone.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/mobileone.py deleted file mode 100644 index e548bad..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/mobileone.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is the paddle implementation of MobileOne block, see: https://arxiv.org/pdf/2206.04040.pdf. 
-Some codes are based on https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py -Ths copyright of microsoft/Swin-Transformer is as follows: -MIT License [see LICENSE for details] -""" - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.ops import get_act_fn -from ppdet.modeling.layers import ConvNormLayer - - -class MobileOneBlock(nn.Layer): - def __init__( - self, - ch_in, - ch_out, - stride, - kernel_size, - conv_num=1, - norm_type='bn', - norm_decay=0., - norm_groups=32, - bias_on=False, - lr_scale=1., - freeze_norm=False, - initializer=Normal( - mean=0., std=0.01), - skip_quant=False, - act='relu', ): - super(MobileOneBlock, self).__init__() - - self.ch_in = ch_in - self.ch_out = ch_out - self.kernel_size = kernel_size - self.stride = stride - self.padding = (kernel_size - 1) // 2 - self.k = conv_num - - self.depth_conv = nn.LayerList() - self.point_conv = nn.LayerList() - for _ in range(self.k): - self.depth_conv.append( - ConvNormLayer( - ch_in, - ch_in, - kernel_size, - stride=stride, - groups=ch_in, - norm_type=norm_type, - norm_decay=norm_decay, - norm_groups=norm_groups, - bias_on=bias_on, - lr_scale=lr_scale, - freeze_norm=freeze_norm, - initializer=initializer, - skip_quant=skip_quant)) - self.point_conv.append( - ConvNormLayer( - ch_in, - ch_out, - 1, - stride=1, - groups=1, - norm_type=norm_type, - norm_decay=norm_decay, - norm_groups=norm_groups, - bias_on=bias_on, - lr_scale=lr_scale, - freeze_norm=freeze_norm, - initializer=initializer, - skip_quant=skip_quant)) - self.rbr_1x1 = ConvNormLayer( - ch_in, - ch_in, - 1, - stride=self.stride, - groups=ch_in, - norm_type=norm_type, - norm_decay=norm_decay, - norm_groups=norm_groups, - bias_on=bias_on, - lr_scale=lr_scale, - freeze_norm=freeze_norm, - initializer=initializer, - skip_quant=skip_quant) - self.rbr_identity_st1 = nn.BatchNorm2D( - num_features=ch_in, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay( - 0.0))) if ch_in == ch_out and self.stride == 1 else None - self.rbr_identity_st2 = nn.BatchNorm2D( - num_features=ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay( - 0.0))) if ch_in == ch_out and self.stride == 1 else None - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - if hasattr(self, "conv1") and hasattr(self, "conv2"): - y = self.act(self.conv2(self.act(self.conv1(x)))) - else: - if self.rbr_identity_st1 is None: - id_out_st1 = 0 - else: - id_out_st1 = self.rbr_identity_st1(x) - - x1_1 = 0 - for i in range(self.k): - x1_1 += self.depth_conv[i](x) - - x1_2 = self.rbr_1x1(x) - x1 = self.act(x1_1 + x1_2 + id_out_st1) - - if self.rbr_identity_st2 is None: - id_out_st2 = 0 - else: - id_out_st2 = self.rbr_identity_st2(x1) - - x2_1 = 0 - for i in range(self.k): - x2_1 += self.point_conv[i](x1) - y = self.act(x2_1 + id_out_st2) - - return y - - def convert_to_deploy(self): - if not hasattr(self, 'conv1'): - self.conv1 = nn.Conv2D( - in_channels=self.ch_in, - out_channels=self.ch_in, - kernel_size=self.kernel_size, - stride=self.stride, - padding=self.padding, - groups=self.ch_in, - bias_attr=ParamAttr( - initializer=Constant(value=0.), learning_rate=1.)) - if not hasattr(self, 'conv2'): - self.conv2 = nn.Conv2D( - in_channels=self.ch_in, - out_channels=self.ch_out, - kernel_size=1, - stride=1, - padding='SAME', - groups=1, - 
bias_attr=ParamAttr( - initializer=Constant(value=0.), learning_rate=1.)) - - conv1_kernel, conv1_bias, conv2_kernel, conv2_bias = self.get_equivalent_kernel_bias( - ) - self.conv1.weight.set_value(conv1_kernel) - self.conv1.bias.set_value(conv1_bias) - self.conv2.weight.set_value(conv2_kernel) - self.conv2.bias.set_value(conv2_bias) - self.__delattr__('depth_conv') - self.__delattr__('point_conv') - self.__delattr__('rbr_1x1') - if hasattr(self, 'rbr_identity_st1'): - self.__delattr__('rbr_identity_st1') - if hasattr(self, 'rbr_identity_st2'): - self.__delattr__('rbr_identity_st2') - - def get_equivalent_kernel_bias(self): - st1_kernel3x3, st1_bias3x3 = self._fuse_bn_tensor(self.depth_conv) - st1_kernel1x1, st1_bias1x1 = self._fuse_bn_tensor(self.rbr_1x1) - st1_kernelid, st1_biasid = self._fuse_bn_tensor( - self.rbr_identity_st1, kernel_size=self.kernel_size) - - st2_kernel1x1, st2_bias1x1 = self._fuse_bn_tensor(self.point_conv) - st2_kernelid, st2_biasid = self._fuse_bn_tensor( - self.rbr_identity_st2, kernel_size=1) - - conv1_kernel = st1_kernel3x3 + self._pad_1x1_to_3x3_tensor( - st1_kernel1x1) + st1_kernelid - - conv1_bias = st1_bias3x3 + st1_bias1x1 + st1_biasid - - conv2_kernel = st2_kernel1x1 + st2_kernelid - conv2_bias = st2_bias1x1 + st2_biasid - - return conv1_kernel, conv1_bias, conv2_kernel, conv2_bias - - def _pad_1x1_to_3x3_tensor(self, kernel1x1): - if kernel1x1 is None: - return 0 - else: - padding_size = (self.kernel_size - 1) // 2 - return nn.functional.pad( - kernel1x1, - [padding_size, padding_size, padding_size, padding_size]) - - def _fuse_bn_tensor(self, branch, kernel_size=3): - if branch is None: - return 0, 0 - - if isinstance(branch, nn.LayerList): - fused_kernels = [] - fused_bias = [] - for block in branch: - kernel = block.conv.weight - running_mean = block.norm._mean - running_var = block.norm._variance - gamma = block.norm.weight - beta = block.norm.bias - eps = block.norm._epsilon - - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - - fused_kernels.append(kernel * t) - fused_bias.append(beta - running_mean * gamma / std) - - return sum(fused_kernels), sum(fused_bias) - - elif isinstance(branch, ConvNormLayer): - kernel = branch.conv.weight - running_mean = branch.norm._mean - running_var = branch.norm._variance - gamma = branch.norm.weight - beta = branch.norm.bias - eps = branch.norm._epsilon - else: - assert isinstance(branch, nn.BatchNorm2D) - input_dim = self.ch_in if kernel_size == 1 else 1 - kernel_value = paddle.zeros( - shape=[self.ch_in, input_dim, kernel_size, kernel_size], - dtype='float32') - if kernel_size > 1: - for i in range(self.ch_in): - kernel_value[i, i % input_dim, (kernel_size - 1) // 2, ( - kernel_size - 1) // 2] = 1 - elif kernel_size == 1: - for i in range(self.ch_in): - kernel_value[i, i % input_dim, 0, 0] = 1 - else: - raise ValueError("Invalid kernel size recieved!") - kernel = paddle.to_tensor(kernel_value, place=branch.weight.place) - running_mean = branch._mean - running_var = branch._variance - gamma = branch.weight - beta = branch.bias - eps = branch._epsilon - - std = (running_var + eps).sqrt() - t = (gamma / std).reshape((-1, 1, 1, 1)) - - return kernel * t, beta - running_mean * gamma / std diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/name_adapter.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/name_adapter.py deleted file mode 100644 index 4afbb9b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/name_adapter.py +++ /dev/null @@ -1,69 +0,0 @@ -class 
NameAdapter(object): - """Fix the backbones variable names for pretrained weight""" - - def __init__(self, model): - super(NameAdapter, self).__init__() - self.model = model - - @property - def model_type(self): - return getattr(self.model, '_model_type', '') - - @property - def variant(self): - return getattr(self.model, 'variant', '') - - def fix_conv_norm_name(self, name): - if name == "conv1": - bn_name = "bn_" + name - else: - bn_name = "bn" + name[3:] - # the naming rule is same as pretrained weight - if self.model_type == 'SEResNeXt': - bn_name = name + "_bn" - return bn_name - - def fix_shortcut_name(self, name): - if self.model_type == 'SEResNeXt': - name = 'conv' + name + '_prj' - return name - - def fix_bottleneck_name(self, name): - if self.model_type == 'SEResNeXt': - conv_name1 = 'conv' + name + '_x1' - conv_name2 = 'conv' + name + '_x2' - conv_name3 = 'conv' + name + '_x3' - shortcut_name = name - else: - conv_name1 = name + "_branch2a" - conv_name2 = name + "_branch2b" - conv_name3 = name + "_branch2c" - shortcut_name = name + "_branch1" - return conv_name1, conv_name2, conv_name3, shortcut_name - - def fix_basicblock_name(self, name): - if self.model_type == 'SEResNeXt': - conv_name1 = 'conv' + name + '_x1' - conv_name2 = 'conv' + name + '_x2' - shortcut_name = name - else: - conv_name1 = name + "_branch2a" - conv_name2 = name + "_branch2b" - shortcut_name = name + "_branch1" - return conv_name1, conv_name2, shortcut_name - - def fix_layer_warp_name(self, stage_num, count, i): - name = 'res' + str(stage_num) - if count > 10 and stage_num == 4: - if i == 0: - conv_name = name + "a" - else: - conv_name = name + "b" + str(i) - else: - conv_name = name + chr(ord("a") + i) - if self.model_type == 'SEResNeXt': - conv_name = str(stage_num + 2) + '_' + str(i + 1) - return conv_name - - def fix_c1_stage_name(self): - return "res_conv1" if self.model_type == 'ResNeXt' else "conv1" diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/res2net.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/res2net.py deleted file mode 100644 index 9e76772..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/res2net.py +++ /dev/null @@ -1,357 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
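`MobileOneBlock.convert_to_deploy` in the mobileone.py diff above reparameterizes the block for inference: `_fuse_bn_tensor` folds each Conv+BN branch into a bare convolution using `W' = W * gamma / sqrt(var + eps)` and `b' = beta - mean * gamma / sqrt(var + eps)`, and the per-branch results are then summed into the deploy-time `conv1`/`conv2`. A standalone numeric check of that folding identity (same arithmetic as the deleted `_fuse_bn_tensor`, assuming paddle 2.x; the private `_mean`/`_variance`/`_epsilon` attributes are the same ones the deleted code reads):

```python
import paddle
import paddle.nn as nn

conv = nn.Conv2D(8, 16, 3, padding=1, bias_attr=False)
bn = nn.BatchNorm2D(16)
# give BN non-trivial running statistics and affine parameters
bn._mean.set_value(paddle.rand([16]))
bn._variance.set_value(paddle.rand([16]) + 0.5)
bn.weight.set_value(paddle.randn([16]))
bn.bias.set_value(paddle.randn([16]))
bn.eval()  # folding assumes inference-mode (running) statistics

# fold BN into the conv, exactly as _fuse_bn_tensor does
std = (bn._variance + bn._epsilon).sqrt()
t = (bn.weight / std).reshape((-1, 1, 1, 1))
fused = nn.Conv2D(8, 16, 3, padding=1)
fused.weight.set_value(conv.weight * t)
fused.bias.set_value(bn.bias - bn._mean * bn.weight / std)

x = paddle.randn([1, 8, 32, 32])
print(float((bn(conv(x)) - fused(x)).abs().max()))  # ~1e-6
```

The identity-branch case in the deleted code is the same fold applied to a synthetic identity kernel, which is why `_fuse_bn_tensor` materializes a one-hot `kernel_value` before fusing a bare `nn.BatchNorm2D`.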
- -from numbers import Integral - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec -from .resnet import ConvNormLayer - -__all__ = ['Res2Net', 'Res2NetC5'] - -Res2Net_cfg = { - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], - 200: [3, 12, 48, 3] -} - - -class BottleNeck(nn.Layer): - def __init__(self, - ch_in, - ch_out, - stride, - shortcut, - width, - scales=4, - variant='b', - groups=1, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False): - super(BottleNeck, self).__init__() - - self.shortcut = shortcut - self.scales = scales - self.stride = stride - if not shortcut: - if variant == 'd' and stride == 2: - self.branch1 = nn.Sequential() - self.branch1.add_sublayer( - 'pool', - nn.AvgPool2D( - kernel_size=2, stride=2, padding=0, ceil_mode=True)) - self.branch1.add_sublayer( - 'conv', - ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr)) - else: - self.branch1 = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=stride, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2a = ConvNormLayer( - ch_in=ch_in, - ch_out=width * scales, - filter_size=1, - stride=stride if variant == 'a' else 1, - groups=1, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2b = nn.LayerList([ - ConvNormLayer( - ch_in=width, - ch_out=width, - filter_size=3, - stride=1 if variant == 'a' else stride, - groups=groups, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr, - dcn_v2=dcn_v2) for _ in range(self.scales - 1) - ]) - - self.branch2c = ConvNormLayer( - ch_in=width * scales, - ch_out=ch_out, - filter_size=1, - stride=1, - groups=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - def forward(self, inputs): - - out = self.branch2a(inputs) - feature_split = paddle.split(out, self.scales, 1) - out_split = [] - for i in range(self.scales - 1): - if i == 0 or self.stride == 2: - out_split.append(self.branch2b[i](feature_split[i])) - else: - out_split.append(self.branch2b[i](paddle.add(feature_split[i], - out_split[-1]))) - if self.stride == 1: - out_split.append(feature_split[-1]) - else: - out_split.append(F.avg_pool2d(feature_split[-1], 3, self.stride, 1)) - out = self.branch2c(paddle.concat(out_split, 1)) - - if self.shortcut: - short = inputs - else: - short = self.branch1(inputs) - - out = paddle.add(out, short) - out = F.relu(out) - - return out - - -class Blocks(nn.Layer): - def __init__(self, - ch_in, - ch_out, - count, - stage_num, - width, - scales=4, - variant='b', - groups=1, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False): - super(Blocks, self).__init__() - - self.blocks = nn.Sequential() - for i in range(count): - self.blocks.add_sublayer( - str(i), - BottleNeck( - ch_in=ch_in if i == 0 else ch_out, - ch_out=ch_out, - stride=2 if i == 0 and stage_num != 2 else 1, - shortcut=False if i == 0 else True, - width=width * (2**(stage_num - 2)), - scales=scales, - variant=variant, - groups=groups, - lr=lr, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - dcn_v2=dcn_v2)) - - def forward(self, inputs): - return self.blocks(inputs) - - -@register -@serializable -class 
Res2Net(nn.Layer): - """ - Res2Net, see https://arxiv.org/abs/1904.01169 - Args: - depth (int): Res2Net depth, should be 50, 101, 152, 200. - width (int): Res2Net width - scales (int): Res2Net scale - variant (str): Res2Net variant, supports 'a', 'b', 'c', 'd' currently - lr_mult_list (list): learning rate ratio of different resnet stages(2,3,4,5), - lower learning rate ratio is need for pretrained model - got using distillation(default as [1.0, 1.0, 1.0, 1.0]). - groups (int): The groups number of the Conv Layer. - norm_type (str): normalization type, 'bn' or 'sync_bn' - norm_decay (float): weight decay for normalization layer weights - freeze_norm (bool): freeze normalization layers - freeze_at (int): freeze the backbone at which stage - return_idx (list): index of stages whose feature maps are returned, - index 0 stands for res2 - dcn_v2_stages (list): index of stages who select deformable conv v2 - num_stages (int): number of stages created - - """ - __shared__ = ['norm_type'] - - def __init__(self, - depth=50, - width=26, - scales=4, - variant='b', - lr_mult_list=[1.0, 1.0, 1.0, 1.0], - groups=1, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - freeze_at=0, - return_idx=[0, 1, 2, 3], - dcn_v2_stages=[-1], - num_stages=4): - super(Res2Net, self).__init__() - - self._model_type = 'Res2Net' if groups == 1 else 'Res2NeXt' - - assert depth in [50, 101, 152, 200], \ - "depth {} not in [50, 101, 152, 200]" - assert variant in ['a', 'b', 'c', 'd'], "invalid Res2Net variant" - assert num_stages >= 1 and num_stages <= 4 - - self.depth = depth - self.variant = variant - self.norm_type = norm_type - self.norm_decay = norm_decay - self.freeze_norm = freeze_norm - self.freeze_at = freeze_at - if isinstance(return_idx, Integral): - return_idx = [return_idx] - assert max(return_idx) < num_stages, \ - 'the maximum return index must smaller than num_stages, ' \ - 'but received maximum return index is {} and num_stages ' \ - 'is {}'.format(max(return_idx), num_stages) - self.return_idx = return_idx - self.num_stages = num_stages - assert len(lr_mult_list) == 4, \ - "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list)) - if isinstance(dcn_v2_stages, Integral): - dcn_v2_stages = [dcn_v2_stages] - assert max(dcn_v2_stages) < num_stages - self.dcn_v2_stages = dcn_v2_stages - - block_nums = Res2Net_cfg[depth] - - # C1 stage - if self.variant in ['c', 'd']: - conv_def = [ - [3, 32, 3, 2, "conv1_1"], - [32, 32, 3, 1, "conv1_2"], - [32, 64, 3, 1, "conv1_3"], - ] - else: - conv_def = [[3, 64, 7, 2, "conv1"]] - self.res1 = nn.Sequential() - for (c_in, c_out, k, s, _name) in conv_def: - self.res1.add_sublayer( - _name, - ConvNormLayer( - ch_in=c_in, - ch_out=c_out, - filter_size=k, - stride=s, - groups=1, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=1.0)) - - self._in_channels = [64, 256, 512, 1024] - self._out_channels = [256, 512, 1024, 2048] - self._out_strides = [4, 8, 16, 32] - - # C2-C5 stages - self.res_layers = [] - for i in range(num_stages): - lr_mult = lr_mult_list[i] - stage_num = i + 2 - self.res_layers.append( - self.add_sublayer( - "res{}".format(stage_num), - Blocks( - self._in_channels[i], - self._out_channels[i], - count=block_nums[i], - stage_num=stage_num, - width=width, - scales=scales, - groups=groups, - lr=lr_mult, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - dcn_v2=(i in self.dcn_v2_stages)))) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self._out_channels[i], 
stride=self._out_strides[i]) - for i in self.return_idx - ] - - def forward(self, inputs): - x = inputs['image'] - res1 = self.res1(x) - x = F.max_pool2d(res1, kernel_size=3, stride=2, padding=1) - outs = [] - for idx, stage in enumerate(self.res_layers): - x = stage(x) - if idx == self.freeze_at: - x.stop_gradient = True - if idx in self.return_idx: - outs.append(x) - return outs - - -@register -class Res2NetC5(nn.Layer): - def __init__(self, depth=50, width=26, scales=4, variant='b'): - super(Res2NetC5, self).__init__() - feat_in, feat_out = [1024, 2048] - self.res5 = Blocks( - feat_in, - feat_out, - count=3, - stage_num=5, - width=width, - scales=scales, - variant=variant) - self.feat_out = feat_out - - @property - def out_shape(self): - return [ShapeSpec( - channels=self.feat_out, - stride=32, )] - - def forward(self, roi_feat, stage=0): - y = self.res5(roi_feat) - return y diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/resnet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/resnet.py deleted file mode 100644 index a64f400..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/resnet.py +++ /dev/null @@ -1,611 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
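The Res2Net `BottleNeck` deleted above implements the paper's multi-scale residual: the 1x1 `branch2a` output is split into `scales` chunks, each chunk after the first is added to the previous branch's output before its own 3x3 conv, and with stride 1 the final chunk passes through unchanged. A functional sketch of that stride-1 cascade (standalone, mirroring the deleted `forward`; `res2net_split_forward` is an illustrative name, and `scales=4, width=26` match the class defaults):

```python
import paddle
import paddle.nn as nn

scales, width = 4, 26
convs = nn.LayerList(
    [nn.Conv2D(width, width, 3, padding=1) for _ in range(scales - 1)])

def res2net_split_forward(feat):
    # feat: [N, width * scales, H, W], i.e. the output of branch2a
    chunks = paddle.split(feat, scales, axis=1)
    outs = [convs[0](chunks[0])]
    for i in range(1, scales - 1):
        # each branch sees its own chunk plus the previous branch's output,
        # so the receptive field grows scale by scale within one block
        outs.append(convs[i](chunks[i] + outs[-1]))
    outs.append(chunks[-1])  # last chunk is passed through when stride == 1
    return paddle.concat(outs, axis=1)

y = res2net_split_forward(paddle.randn([2, width * scales, 16, 16]))
print(y.shape)  # [2, 104, 16, 16]
```

In the stride-2 path the deleted code instead convolves every chunk independently and average-pools the last one, since the spatial sizes would no longer match for the additive reuse.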
- -import math -from numbers import Integral - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Uniform -from paddle import ParamAttr -from paddle.nn.initializer import Constant -from paddle.vision.ops import DeformConv2D -from .name_adapter import NameAdapter -from ..shape_spec import ShapeSpec - -__all__ = ['ResNet', 'Res5Head', 'Blocks', 'BasicBlock', 'BottleNeck'] - -ResNet_cfg = { - 18: [2, 2, 2, 2], - 34: [3, 4, 6, 3], - 50: [3, 4, 6, 3], - 101: [3, 4, 23, 3], - 152: [3, 8, 36, 3], -} - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride, - groups=1, - act=None, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - lr=1.0, - dcn_v2=False): - super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn'] - self.norm_type = norm_type - self.act = act - self.dcn_v2 = dcn_v2 - - if not self.dcn_v2: - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(learning_rate=lr), - bias_attr=False) - else: - self.offset_channel = 2 * filter_size**2 - self.mask_channel = filter_size**2 - - self.conv_offset = nn.Conv2D( - in_channels=ch_in, - out_channels=3 * filter_size**2, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - weight_attr=ParamAttr(initializer=Constant(0.)), - bias_attr=ParamAttr(initializer=Constant(0.))) - self.conv = DeformConv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - dilation=1, - groups=groups, - weight_attr=ParamAttr(learning_rate=lr), - bias_attr=False) - - norm_lr = 0. 
if freeze_norm else lr - param_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - - global_stats = True if freeze_norm else None - if norm_type in ['sync_bn', 'bn']: - self.norm = nn.BatchNorm2D( - ch_out, - weight_attr=param_attr, - bias_attr=bias_attr, - use_global_stats=global_stats) - norm_params = self.norm.parameters() - - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - def forward(self, inputs): - if not self.dcn_v2: - out = self.conv(inputs) - else: - offset_mask = self.conv_offset(inputs) - offset, mask = paddle.split( - offset_mask, - num_or_sections=[self.offset_channel, self.mask_channel], - axis=1) - mask = F.sigmoid(mask) - out = self.conv(inputs, offset, mask=mask) - - if self.norm_type in ['bn', 'sync_bn']: - out = self.norm(out) - if self.act: - out = getattr(F, self.act)(out) - return out - - -class SELayer(nn.Layer): - def __init__(self, ch, reduction_ratio=16): - super(SELayer, self).__init__() - self.pool = nn.AdaptiveAvgPool2D(1) - stdv = 1.0 / math.sqrt(ch) - c_ = ch // reduction_ratio - self.squeeze = nn.Linear( - ch, - c_, - weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=True) - - stdv = 1.0 / math.sqrt(c_) - self.extract = nn.Linear( - c_, - ch, - weight_attr=paddle.ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=True) - - def forward(self, inputs): - out = self.pool(inputs) - out = paddle.squeeze(out, axis=[2, 3]) - out = self.squeeze(out) - out = F.relu(out) - out = self.extract(out) - out = F.sigmoid(out) - out = paddle.unsqueeze(out, axis=[2, 3]) - scale = out * inputs - return scale - - -class BasicBlock(nn.Layer): - - expansion = 1 - - def __init__(self, - ch_in, - ch_out, - stride, - shortcut, - variant='b', - groups=1, - base_width=64, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False, - std_senet=False): - super(BasicBlock, self).__init__() - assert groups == 1 and base_width == 64, 'BasicBlock only supports groups=1 and base_width=64' - - self.shortcut = shortcut - if not shortcut: - if variant == 'd' and stride == 2: - self.short = nn.Sequential() - self.short.add_sublayer( - 'pool', - nn.AvgPool2D( - kernel_size=2, stride=2, padding=0, ceil_mode=True)) - self.short.add_sublayer( - 'conv', - ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr)) - else: - self.short = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=1, - stride=stride, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2a = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=3, - stride=stride, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2b = ConvNormLayer( - ch_in=ch_out, - ch_out=ch_out, - filter_size=3, - stride=1, - act=None, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr, - dcn_v2=dcn_v2) - - self.std_senet = std_senet - if self.std_senet: - self.se = SELayer(ch_out) - - def forward(self, inputs): - out = self.branch2a(inputs) - out = self.branch2b(out) - if self.std_senet: - out = self.se(out) - - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - - out = paddle.add(x=out, y=short) - out = 
F.relu(out) - - return out - - -class BottleNeck(nn.Layer): - - expansion = 4 - - def __init__(self, - ch_in, - ch_out, - stride, - shortcut, - variant='b', - groups=1, - base_width=4, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False, - std_senet=False): - super(BottleNeck, self).__init__() - if variant == 'a': - stride1, stride2 = stride, 1 - else: - stride1, stride2 = 1, stride - - # ResNeXt - width = int(ch_out * (base_width / 64.)) * groups - - self.branch2a = ConvNormLayer( - ch_in=ch_in, - ch_out=width, - filter_size=1, - stride=stride1, - groups=1, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.branch2b = ConvNormLayer( - ch_in=width, - ch_out=width, - filter_size=3, - stride=stride2, - groups=groups, - act='relu', - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr, - dcn_v2=dcn_v2) - - self.branch2c = ConvNormLayer( - ch_in=width, - ch_out=ch_out * self.expansion, - filter_size=1, - stride=1, - groups=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.shortcut = shortcut - if not shortcut: - if variant == 'd' and stride == 2: - self.short = nn.Sequential() - self.short.add_sublayer( - 'pool', - nn.AvgPool2D( - kernel_size=2, stride=2, padding=0, ceil_mode=True)) - self.short.add_sublayer( - 'conv', - ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out * self.expansion, - filter_size=1, - stride=1, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr)) - else: - self.short = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out * self.expansion, - filter_size=1, - stride=stride, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - lr=lr) - - self.std_senet = std_senet - if self.std_senet: - self.se = SELayer(ch_out * self.expansion) - - def forward(self, inputs): - - out = self.branch2a(inputs) - out = self.branch2b(out) - out = self.branch2c(out) - - if self.std_senet: - out = self.se(out) - - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - - out = paddle.add(x=out, y=short) - out = F.relu(out) - - return out - - -class Blocks(nn.Layer): - def __init__(self, - block, - ch_in, - ch_out, - count, - name_adapter, - stage_num, - variant='b', - groups=1, - base_width=64, - lr=1.0, - norm_type='bn', - norm_decay=0., - freeze_norm=True, - dcn_v2=False, - std_senet=False): - super(Blocks, self).__init__() - - self.blocks = [] - for i in range(count): - conv_name = name_adapter.fix_layer_warp_name(stage_num, count, i) - layer = self.add_sublayer( - conv_name, - block( - ch_in=ch_in, - ch_out=ch_out, - stride=2 if i == 0 and stage_num != 2 else 1, - shortcut=False if i == 0 else True, - variant=variant, - groups=groups, - base_width=base_width, - lr=lr, - norm_type=norm_type, - norm_decay=norm_decay, - freeze_norm=freeze_norm, - dcn_v2=dcn_v2, - std_senet=std_senet)) - self.blocks.append(layer) - if i == 0: - ch_in = ch_out * block.expansion - - def forward(self, inputs): - block_out = inputs - for block in self.blocks: - block_out = block(block_out) - return block_out - - -@register -@serializable -class ResNet(nn.Layer): - __shared__ = ['norm_type'] - - def __init__(self, - depth=50, - ch_in=64, - variant='b', - lr_mult_list=[1.0, 1.0, 1.0, 1.0], - groups=1, - base_width=64, - norm_type='bn', - norm_decay=0, - freeze_norm=True, - freeze_at=0, - return_idx=[0, 1, 2, 3], - dcn_v2_stages=[-1], - num_stages=4, - std_senet=False, - freeze_stem_only=False): - """ - 
-        Residual Network, see https://arxiv.org/abs/1512.03385
-
-        Args:
-            depth (int): ResNet depth, should be 18, 34, 50, 101 or 152.
-            ch_in (int): output channel of the first stage, default 64
-            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
-            lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
-                a lower learning rate ratio is needed for pretrained models
-                obtained via distillation (default [1.0, 1.0, 1.0, 1.0]).
-            groups (int): group convolution cardinality
-            base_width (int): base width of each group convolution
-            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
-            norm_decay (float): weight decay for normalization layer weights
-            freeze_norm (bool): freeze normalization layers
-            freeze_at (int): freeze the backbone at which stage
-            return_idx (list): index of the stages whose feature maps are returned
-            dcn_v2_stages (list): indices of the stages that use deformable conv v2
-            num_stages (int): total number of stages
-            std_senet (bool): whether to use SENet blocks, default False
-            freeze_stem_only (bool): only freeze the stem when freeze_at >= 0, default False
-        """
-        super(ResNet, self).__init__()
-        self._model_type = 'ResNet' if groups == 1 else 'ResNeXt'
-        assert num_stages >= 1 and num_stages <= 4
-        self.depth = depth
-        self.variant = variant
-        self.groups = groups
-        self.base_width = base_width
-        self.norm_type = norm_type
-        self.norm_decay = norm_decay
-        self.freeze_norm = freeze_norm
-        self.freeze_at = freeze_at
-        if isinstance(return_idx, Integral):
-            return_idx = [return_idx]
-        assert max(return_idx) < num_stages, \
-            'the maximum return index must be smaller than num_stages, ' \
-            'but received maximum return index is {} and num_stages ' \
-            'is {}'.format(max(return_idx), num_stages)
-        self.return_idx = return_idx
-        self.num_stages = num_stages
-        assert len(lr_mult_list) == 4, \
-            "lr_mult_list length must be 4 but got {}".format(len(lr_mult_list))
-        if isinstance(dcn_v2_stages, Integral):
-            dcn_v2_stages = [dcn_v2_stages]
-        assert max(dcn_v2_stages) < num_stages
-        self.dcn_v2_stages = dcn_v2_stages
-
-        block_nums = ResNet_cfg[depth]
-        na = NameAdapter(self)
-
-        conv1_name = na.fix_c1_stage_name()
-        if variant in ['c', 'd']:
-            conv_def = [
-                [3, ch_in // 2, 3, 2, "conv1_1"],
-                [ch_in // 2, ch_in // 2, 3, 1, "conv1_2"],
-                [ch_in // 2, ch_in, 3, 1, "conv1_3"],
-            ]
-        else:
-            conv_def = [[3, ch_in, 7, 2, conv1_name]]
-        self.conv1 = nn.Sequential()
-        for (c_in, c_out, k, s, _name) in conv_def:
-            self.conv1.add_sublayer(
-                _name,
-                ConvNormLayer(
-                    ch_in=c_in,
-                    ch_out=c_out,
-                    filter_size=k,
-                    stride=s,
-                    groups=1,
-                    act='relu',
-                    norm_type=norm_type,
-                    norm_decay=norm_decay,
-                    freeze_norm=freeze_norm,
-                    lr=1.0))
-
-        self.ch_in = ch_in
-        ch_out_list = [64, 128, 256, 512]
-        block = BottleNeck if depth >= 50 else BasicBlock
-
-        self._out_channels = [block.expansion * v for v in ch_out_list]
-        self._out_strides = [4, 8, 16, 32]
-
-        self.res_layers = []
-        for i in range(num_stages):
-            lr_mult = lr_mult_list[i]
-            stage_num = i + 2
-            res_name = "res{}".format(stage_num)
-            res_layer = self.add_sublayer(
-                res_name,
-                Blocks(
-                    block,
-                    self.ch_in,
-                    ch_out_list[i],
-                    count=block_nums[i],
-                    name_adapter=na,
-                    stage_num=stage_num,
-                    variant=variant,
-                    groups=groups,
-                    base_width=base_width,
-                    lr=lr_mult,
-                    norm_type=norm_type,
-                    norm_decay=norm_decay,
-                    freeze_norm=freeze_norm,
-                    dcn_v2=(i in self.dcn_v2_stages),
-                    std_senet=std_senet))
-            self.res_layers.append(res_layer)
-            self.ch_in = self._out_channels[i]
-
-        if freeze_at >= 0:
-            self._freeze_parameters(self.conv1)
-            if not freeze_stem_only:
-                for i in range(min(freeze_at + 1, num_stages)):
-                    self._freeze_parameters(self.res_layers[i])
-
-    def _freeze_parameters(self, m):
-        for p in m.parameters():
-            p.stop_gradient = True
-
-    @property
-    def out_shape(self):
-        return [
-            ShapeSpec(
-                channels=self._out_channels[i], stride=self._out_strides[i])
-            for i in self.return_idx
-        ]
-
-    def forward(self, inputs):
-        x = inputs['image']
-        conv1 = self.conv1(x)
-        x = F.max_pool2d(conv1, kernel_size=3, stride=2, padding=1)
-        outs = []
-        for idx, stage in enumerate(self.res_layers):
-            x = stage(x)
-            if idx in self.return_idx:
-                outs.append(x)
-        return outs
-
-
-@register
-class Res5Head(nn.Layer):
-    def __init__(self, depth=50):
-        super(Res5Head, self).__init__()
-        feat_in, feat_out = [1024, 512]
-        if depth < 50:
-            feat_in = 256
-        na = NameAdapter(self)
-        block = BottleNeck if depth >= 50 else BasicBlock
-        self.res5 = Blocks(
-            block, feat_in, feat_out, count=3, name_adapter=na, stage_num=5)
-        self.feat_out = feat_out if depth < 50 else feat_out * 4
-
-    @property
-    def out_shape(self):
-        return [ShapeSpec(
-            channels=self.feat_out,
-            stride=16, )]
-
-    def forward(self, roi_feat, stage=0):
-        y = self.res5(roi_feat)
-        return y
diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/senet.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/senet.py
deleted file mode 100644
index db1e29b..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/backbones/senet.py
+++ /dev/null
@@ -1,141 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.nn as nn
-
-from ppdet.core.workspace import register, serializable
-from .resnet import ResNet, Blocks, BasicBlock, BottleNeck
-from ..shape_spec import ShapeSpec
-from .name_adapter import NameAdapter
-
-__all__ = ['SENet', 'SERes5Head']
-
-
-@register
-@serializable
-class SENet(ResNet):
-    __shared__ = ['norm_type']
-
-    def __init__(self,
-                 depth=50,
-                 variant='b',
-                 lr_mult_list=[1.0, 1.0, 1.0, 1.0],
-                 groups=1,
-                 base_width=64,
-                 norm_type='bn',
-                 norm_decay=0,
-                 freeze_norm=True,
-                 freeze_at=0,
-                 return_idx=[0, 1, 2, 3],
-                 dcn_v2_stages=[-1],
-                 std_senet=True,
-                 num_stages=4):
-        """
-        Squeeze-and-Excitation Networks, see https://arxiv.org/abs/1709.01507
-
-        Args:
-            depth (int): SENet depth, should be 50, 101 or 152
-            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
-            lr_mult_list (list): learning rate ratio of different resnet stages (2, 3, 4, 5);
-                a lower learning rate ratio is needed for pretrained models
-                obtained via distillation (default [1.0, 1.0, 1.0, 1.0]).
-            groups (int): group convolution cardinality
-            base_width (int): base width of each group convolution
-            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
-            norm_decay (float): weight decay for normalization layer weights
-            freeze_norm (bool): freeze normalization layers
-            freeze_at (int): freeze the backbone at which stage
-            return_idx (list): index of the stages whose feature maps are returned
-            dcn_v2_stages (list): indices of the stages that use deformable conv v2
-            std_senet (bool): whether to use SENet blocks, default True
-            num_stages (int): total number of stages
-        """
-
-        super(SENet, self).__init__(
-            depth=depth,
-            variant=variant,
-            lr_mult_list=lr_mult_list,
-            ch_in=128,
-            groups=groups,
-            base_width=base_width,
-            norm_type=norm_type,
-            norm_decay=norm_decay,
-            freeze_norm=freeze_norm,
-            freeze_at=freeze_at,
-            return_idx=return_idx,
-            dcn_v2_stages=dcn_v2_stages,
-            std_senet=std_senet,
-            num_stages=num_stages)
-
-
-@register
-class SERes5Head(nn.Layer):
-    def __init__(self,
-                 depth=50,
-                 variant='b',
-                 lr_mult=1.0,
-                 groups=1,
-                 base_width=64,
-                 norm_type='bn',
-                 norm_decay=0,
-                 dcn_v2=False,
-                 freeze_norm=False,
-                 std_senet=True):
-        """
-        SERes5Head layer
-
-        Args:
-            depth (int): SENet depth, should be 50, 101 or 152
-            variant (str): ResNet variant, supports 'a', 'b', 'c', 'd' currently
-            lr_mult (float): learning rate ratio of SERes5Head, default 1.0.
-            groups (int): group convolution cardinality
-            base_width (int): base width of each group convolution
-            norm_type (str): normalization type, 'bn', 'sync_bn' or 'affine_channel'
-            norm_decay (float): weight decay for normalization layer weights
-            dcn_v2 (bool): whether to use deformable conv v2, default False
-            freeze_norm (bool): freeze normalization layers, default False
-            std_senet (bool): whether to use SENet blocks, default True
-
-        """
-        super(SERes5Head, self).__init__()
-        ch_out = 512
-        ch_in = 256 if depth < 50 else 1024
-        na = NameAdapter(self)
-        block = BottleNeck if depth >= 50 else BasicBlock
-        self.res5 = Blocks(
-            block,
-            ch_in,
-            ch_out,
-            count=3,
-            name_adapter=na,
-            stage_num=5,
-            variant=variant,
-            groups=groups,
-            base_width=base_width,
-            lr=lr_mult,
-            norm_type=norm_type,
-            norm_decay=norm_decay,
-            freeze_norm=freeze_norm,
-            dcn_v2=dcn_v2,
-            std_senet=std_senet)
-        self.ch_out = ch_out * block.expansion
-
-    @property
-    def out_shape(self):
-        return [ShapeSpec(
-            channels=self.ch_out,
-            stride=16, )]
-
-    def forward(self, roi_feat):
-        y = self.res5(roi_feat)
-        return y
diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/shufflenet_v2.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/shufflenet_v2.py
deleted file mode 100644
index ca7ebb9..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/backbones/shufflenet_v2.py
+++ /dev/null
@@ -1,250 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
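The squeeze-and-excitation gating used by SELayer in resnet.py and by the SENet/SERes5Head classes above pools each channel to a scalar, passes it through a two-layer bottleneck, and rescales the feature map with the resulting sigmoid gate. A minimal standalone sketch of the idea (names and shapes are illustrative, not the ppdet implementation):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class SEGate(nn.Layer):
    # Squeeze: global average pool each channel to a scalar.
    # Excite: two-layer bottleneck producing a per-channel sigmoid gate.
    def __init__(self, ch, reduction_ratio=16):
        super().__init__()
        self.pool = nn.AdaptiveAvgPool2D(1)
        self.fc1 = nn.Linear(ch, ch // reduction_ratio)
        self.fc2 = nn.Linear(ch // reduction_ratio, ch)

    def forward(self, x):
        s = paddle.squeeze(self.pool(x), axis=[2, 3])    # (N, C)
        s = F.sigmoid(self.fc2(F.relu(self.fc1(s))))     # per-channel gate in (0, 1)
        return x * paddle.unsqueeze(s, axis=[2, 3])      # reweight the feature maps

# e.g. SEGate(64)(paddle.rand([2, 64, 32, 32])) keeps shape [2, 64, 32, 32]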
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -import paddle.nn.functional as F -from paddle.nn import Conv2D, MaxPool2D, AdaptiveAvgPool2D, BatchNorm2D -from paddle.nn.initializer import KaimingNormal -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, serializable -from numbers import Integral -from ..shape_spec import ShapeSpec -from ppdet.modeling.ops import channel_shuffle - -__all__ = ['ShuffleNetV2'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - groups=1, - act=None): - super(ConvBNLayer, self).__init__() - self._conv = Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=groups, - weight_attr=ParamAttr(initializer=KaimingNormal()), - bias_attr=False) - - self._batch_norm = BatchNorm2D( - out_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - if act == "hard_swish": - act = 'hardswish' - self.act = act - - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - if self.act: - y = getattr(F, self.act)(y) - return y - - -class InvertedResidual(nn.Layer): - def __init__(self, in_channels, out_channels, stride, act="relu"): - super(InvertedResidual, self).__init__() - self._conv_pw = ConvBNLayer( - in_channels=in_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=out_channels // 2, - act=None) - self._conv_linear = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - - def forward(self, inputs): - x1, x2 = paddle.split( - inputs, - num_or_sections=[inputs.shape[1] // 2, inputs.shape[1] // 2], - axis=1) - x2 = self._conv_pw(x2) - x2 = self._conv_dw(x2) - x2 = self._conv_linear(x2) - out = paddle.concat([x1, x2], axis=1) - return channel_shuffle(out, 2) - - -class InvertedResidualDS(nn.Layer): - def __init__(self, in_channels, out_channels, stride, act="relu"): - super(InvertedResidualDS, self).__init__() - - # branch1 - self._conv_dw_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=in_channels, - kernel_size=3, - stride=stride, - padding=1, - groups=in_channels, - act=None) - self._conv_linear_1 = ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - # branch2 - self._conv_pw_2 = ConvBNLayer( - in_channels=in_channels, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - self._conv_dw_2 = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=3, - stride=stride, - padding=1, - groups=out_channels // 2, - act=None) - self._conv_linear_2 = ConvBNLayer( - in_channels=out_channels // 2, - out_channels=out_channels // 2, - kernel_size=1, - stride=1, - padding=0, - groups=1, - act=act) - - def forward(self, inputs): - x1 = self._conv_dw_1(inputs) - x1 = self._conv_linear_1(x1) - x2 = self._conv_pw_2(inputs) - x2 = self._conv_dw_2(x2) - x2 = self._conv_linear_2(x2) - out = paddle.concat([x1, x2], 
axis=1) - - return channel_shuffle(out, 2) - - -@register -@serializable -class ShuffleNetV2(nn.Layer): - def __init__(self, scale=1.0, act="relu", feature_maps=[5, 13, 17]): - super(ShuffleNetV2, self).__init__() - self.scale = scale - if isinstance(feature_maps, Integral): - feature_maps = [feature_maps] - self.feature_maps = feature_maps - stage_repeats = [4, 8, 4] - - if scale == 0.25: - stage_out_channels = [-1, 24, 24, 48, 96, 512] - elif scale == 0.33: - stage_out_channels = [-1, 24, 32, 64, 128, 512] - elif scale == 0.5: - stage_out_channels = [-1, 24, 48, 96, 192, 1024] - elif scale == 1.0: - stage_out_channels = [-1, 24, 116, 232, 464, 1024] - elif scale == 1.5: - stage_out_channels = [-1, 24, 176, 352, 704, 1024] - elif scale == 2.0: - stage_out_channels = [-1, 24, 244, 488, 976, 2048] - else: - raise NotImplementedError("This scale size:[" + str(scale) + - "] is not implemented!") - self._out_channels = [] - self._feature_idx = 0 - # 1. conv1 - self._conv1 = ConvBNLayer( - in_channels=3, - out_channels=stage_out_channels[1], - kernel_size=3, - stride=2, - padding=1, - act=act) - self._max_pool = MaxPool2D(kernel_size=3, stride=2, padding=1) - self._feature_idx += 1 - - # 2. bottleneck sequences - self._block_list = [] - for stage_id, num_repeat in enumerate(stage_repeats): - for i in range(num_repeat): - if i == 0: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidualDS( - in_channels=stage_out_channels[stage_id + 1], - out_channels=stage_out_channels[stage_id + 2], - stride=2, - act=act)) - else: - block = self.add_sublayer( - name=str(stage_id + 2) + '_' + str(i + 1), - sublayer=InvertedResidual( - in_channels=stage_out_channels[stage_id + 2], - out_channels=stage_out_channels[stage_id + 2], - stride=1, - act=act)) - self._block_list.append(block) - self._feature_idx += 1 - self._update_out_channels(stage_out_channels[stage_id + 2], - self._feature_idx, self.feature_maps) - - def _update_out_channels(self, channel, feature_idx, feature_maps): - if feature_idx in feature_maps: - self._out_channels.append(channel) - - def forward(self, inputs): - y = self._conv1(inputs['image']) - y = self._max_pool(y) - outs = [] - for i, inv in enumerate(self._block_list): - y = inv(y) - if i + 2 in self.feature_maps: - outs.append(y) - - return outs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/swin_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/swin_transformer.py deleted file mode 100644 index 64aabab..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/swin_transformer.py +++ /dev/null @@ -1,752 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
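The ShuffleNetV2 blocks deleted above transform only half of the channels in each unit and rely on channel_shuffle (imported from ppdet.modeling.ops) to mix information between the two halves afterwards. A minimal sketch of what that op does, assuming NCHW layout (the function name here is illustrative, not the ppdet source):

import paddle

def channel_shuffle_sketch(x, groups=2):
    # (N, C, H, W) -> (N, groups, C // groups, H, W): view channels as a grid
    n, c, h, w = x.shape
    x = x.reshape([n, groups, c // groups, h, w])
    # transpose the grid so channels from different groups interleave
    x = x.transpose([0, 2, 1, 3, 4])
    return x.reshape([n, c, h, w])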
-""" -This code is based on https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py -Ths copyright of microsoft/Swin-Transformer is as follows: -MIT License [see LICENSE for details] -""" -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable -from .transformer_utils import DropPath, Identity -from .transformer_utils import add_parameter, to_2tuple -from .transformer_utils import ones_, zeros_, trunc_normal_ - -__all__ = ['SwinTransformer'] - -MODEL_cfg = { - # use 22kto1k finetune weights as default pretrained, can set by SwinTransformer.pretrained in config - 'swin_T_224': dict( - pretrain_img_size=224, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_tiny_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_S_224': dict( - pretrain_img_size=224, - embed_dim=96, - depths=[2, 2, 18, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_small_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_B_224': dict( - pretrain_img_size=224, - embed_dim=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_L_224': dict( - pretrain_img_size=224, - embed_dim=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=7, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window7_224_22kto1k_pretrained.pdparams', - ), - 'swin_B_384': dict( - pretrain_img_size=384, - embed_dim=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=12, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_base_patch4_window12_384_22kto1k_pretrained.pdparams', - ), - 'swin_L_384': dict( - pretrain_img_size=384, - embed_dim=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=12, - pretrained='https://bj.bcebos.com/v1/paddledet/models/pretrained/swin_large_patch4_window12_384_22kto1k_pretrained.pdparams', - ), -} - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -def window_partition(x, window_size): - """ - Args: - x: (B, H, W, C) - window_size (int): window size - Returns: - windows: (num_windows*B, window_size, window_size, C) - """ - B, H, W, C = x.shape - x = x.reshape( - [-1, H // window_size, window_size, W // window_size, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) - return windows - - -def window_reverse(windows, window_size, H, W): - """ - Args: - windows: (num_windows*B, window_size, window_size, C) - window_size (int): Window size - H (int): Height of image - W (int): Width of image - Returns: - x: (B, H, W, C) - """ - _, _, _, C = windows.shape - 
B = int(windows.shape[0] / (H * W / window_size / window_size)) - x = windows.reshape( - [-1, H // window_size, W // window_size, window_size, window_size, C]) - x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([-1, H, W, C]) - return x - - -class WindowAttention(nn.Layer): - """ Window based multi-head self attention (W-MSA) module with relative position bias. - It supports both of shifted and non-shifted window. - - Args: - dim (int): Number of input channels. - window_size (tuple[int]): The height and width of the window. - num_heads (int): Number of attention heads. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set - attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 - proj_drop (float, optional): Dropout ratio of output. Default: 0.0 - """ - - def __init__(self, - dim, - window_size, - num_heads, - qkv_bias=True, - qk_scale=None, - attn_drop=0., - proj_drop=0.): - - super().__init__() - self.dim = dim - self.window_size = window_size # Wh, Ww - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - # define a parameter table of relative position bias - self.relative_position_bias_table = add_parameter( - self, - paddle.zeros(((2 * window_size[0] - 1) * (2 * window_size[1] - 1), - num_heads))) # 2*Wh-1 * 2*Ww-1, nH - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(self.window_size[0]) - coords_w = paddle.arange(self.window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - coords_flatten_1 = coords_flatten.unsqueeze(axis=2) - coords_flatten_2 = coords_flatten.unsqueeze(axis=1) - relative_coords = coords_flatten_1 - coords_flatten_2 - relative_coords = relative_coords.transpose( - [1, 2, 0]) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += self.window_size[ - 0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += self.window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 - self.relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww - - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - trunc_normal_(self.relative_position_bias_table) - self.softmax = nn.Softmax(axis=-1) - - def forward(self, x, mask=None): - """ Forward function. 
- Args: - x: input features with shape of (num_windows*B, N, C) - mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None - """ - B_, N, C = x.shape - qkv = self.qkv(x).reshape( - [-1, N, 3, self.num_heads, C // self.num_heads]).transpose( - [2, 0, 3, 1, 4]) - q, k, v = qkv[0], qkv[1], qkv[2] - - q = q * self.scale - attn = paddle.mm(q, k.transpose([0, 1, 3, 2])) - - index = self.relative_position_index.flatten() - - relative_position_bias = paddle.index_select( - self.relative_position_bias_table, index) - relative_position_bias = relative_position_bias.reshape([ - self.window_size[0] * self.window_size[1], - self.window_size[0] * self.window_size[1], -1 - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - [2, 0, 1]) # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - - if mask is not None: - nW = mask.shape[0] - attn = attn.reshape([-1, nW, self.num_heads, N, N - ]) + mask.unsqueeze(1).unsqueeze(0) - attn = attn.reshape([-1, self.num_heads, N, N]) - attn = self.softmax(attn) - else: - attn = self.softmax(attn) - - attn = self.attn_drop(attn) - - # x = (attn @ v).transpose(1, 2).reshape([B_, N, C]) - x = paddle.mm(attn, v).transpose([0, 2, 1, 3]).reshape([-1, N, C]) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class SwinTransformerBlock(nn.Layer): - """ Swin Transformer Block. - Args: - dim (int): Number of input channels. - num_heads (int): Number of attention heads. - window_size (int): Window size. - shift_size (int): Shift size for SW-MSA. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float, optional): Stochastic depth rate. Default: 0.0 - act_layer (nn.Layer, optional): Activation layer. Default: nn.GELU - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, - dim, - num_heads, - window_size=7, - shift_size=0, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.num_heads = num_heads - self.window_size = window_size - self.shift_size = shift_size - self.mlp_ratio = mlp_ratio - assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" - - self.norm1 = norm_layer(dim) - self.attn = WindowAttention( - dim, - window_size=to_2tuple(self.window_size), - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop) - - self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() - self.norm2 = norm_layer(dim) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - - self.H = None - self.W = None - - def forward(self, x, mask_matrix): - """ Forward function. - Args: - x: Input feature, tensor size (B, H*W, C). - H, W: Spatial resolution of the input feature. - mask_matrix: Attention mask for cyclic shift. 
- """ - B, L, C = x.shape - H, W = self.H, self.W - assert L == H * W, "input feature has wrong size" - - shortcut = x - x = self.norm1(x) - x = x.reshape([-1, H, W, C]) - - # pad feature maps to multiples of window size - pad_l = pad_t = 0 - pad_r = (self.window_size - W % self.window_size) % self.window_size - pad_b = (self.window_size - H % self.window_size) % self.window_size - x = F.pad(x, [0, pad_l, 0, pad_b, 0, pad_r, 0, pad_t], - data_format='NHWC') - _, Hp, Wp, _ = x.shape - - # cyclic shift - if self.shift_size > 0: - shifted_x = paddle.roll( - x, shifts=(-self.shift_size, -self.shift_size), axis=(1, 2)) - attn_mask = mask_matrix - else: - shifted_x = x - attn_mask = None - - # partition windows - x_windows = window_partition( - shifted_x, self.window_size) # nW*B, window_size, window_size, C - x_windows = x_windows.reshape( - [x_windows.shape[0], self.window_size * self.window_size, - C]) # nW*B, window_size*window_size, C - - # W-MSA/SW-MSA - attn_windows = self.attn( - x_windows, mask=attn_mask) # nW*B, window_size*window_size, C - - # merge windows - attn_windows = attn_windows.reshape( - [x_windows.shape[0], self.window_size, self.window_size, C]) - shifted_x = window_reverse(attn_windows, self.window_size, Hp, - Wp) # B H' W' C - - # reverse cyclic shift - if self.shift_size > 0: - x = paddle.roll( - shifted_x, - shifts=(self.shift_size, self.shift_size), - axis=(1, 2)) - else: - x = shifted_x - - if pad_r > 0 or pad_b > 0: - x = x[:, :H, :W, :] - - x = x.reshape([-1, H * W, C]) - - # FFN - x = shortcut + self.drop_path(x) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - -class PatchMerging(nn.Layer): - r""" Patch Merging Layer. - Args: - dim (int): Number of input channels. - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - """ - - def __init__(self, dim, norm_layer=nn.LayerNorm): - super().__init__() - self.dim = dim - self.reduction = nn.Linear(4 * dim, 2 * dim, bias_attr=False) - self.norm = norm_layer(4 * dim) - - def forward(self, x, H, W): - """ Forward function. - Args: - x: Input feature, tensor size (B, H*W, C). - H, W: Spatial resolution of the input feature. - """ - B, L, C = x.shape - assert L == H * W, "input feature has wrong size" - - x = x.reshape([-1, H, W, C]) - - # padding - pad_input = (H % 2 == 1) or (W % 2 == 1) - if pad_input: - # paddle F.pad default data_format is 'NCHW' - x = F.pad(x, [0, 0, 0, H % 2, 0, W % 2, 0, 0], data_format='NHWC') - H += H % 2 - W += W % 2 - - x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C - x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C - x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C - x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C - x = paddle.concat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C - x = x.reshape([-1, H * W // 4, 4 * C]) # B H/2*W/2 4*C - - x = self.norm(x) - x = self.reduction(x) - - return x - - -class BasicLayer(nn.Layer): - """ A basic Swin Transformer layer for one stage. - Args: - dim (int): Number of input channels. - depth (int): Number of blocks. - num_heads (int): Number of attention heads. - window_size (int): Local window size. - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. - qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. - drop (float, optional): Dropout rate. Default: 0.0 - attn_drop (float, optional): Attention dropout rate. Default: 0.0 - drop_path (float | tuple[float], optional): Stochastic depth rate. 
Default: 0.0 - norm_layer (nn.Layer, optional): Normalization layer. Default: nn.LayerNorm - downsample (nn.Layer | None, optional): Downsample layer at the end of the layer. Default: None - """ - - def __init__(self, - dim, - depth, - num_heads, - window_size=7, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - norm_layer=nn.LayerNorm, - downsample=None): - super().__init__() - self.window_size = window_size - self.shift_size = window_size // 2 - self.depth = depth - - # build blocks - self.blocks = nn.LayerList([ - SwinTransformerBlock( - dim=dim, - num_heads=num_heads, - window_size=window_size, - shift_size=0 if (i % 2 == 0) else window_size // 2, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop, - attn_drop=attn_drop, - drop_path=drop_path[i] - if isinstance(drop_path, np.ndarray) else drop_path, - norm_layer=norm_layer) for i in range(depth) - ]) - - # patch merging layer - if downsample is not None: - self.downsample = downsample(dim=dim, norm_layer=norm_layer) - else: - self.downsample = None - - def forward(self, x, H, W): - """ Forward function. - Args: - x: Input feature, tensor size (B, H*W, C). - H, W: Spatial resolution of the input feature. - """ - - # calculate attention mask for SW-MSA - Hp = int(np.ceil(H / self.window_size)) * self.window_size - Wp = int(np.ceil(W / self.window_size)) * self.window_size - img_mask = paddle.zeros([1, Hp, Wp, 1], dtype='float32') # 1 Hp Wp 1 - h_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - w_slices = (slice(0, -self.window_size), - slice(-self.window_size, -self.shift_size), - slice(-self.shift_size, None)) - cnt = 0 - for h in h_slices: - for w in w_slices: - img_mask[:, h, w, :] = cnt - - cnt += 1 - - mask_windows = window_partition( - img_mask, self.window_size) # nW, window_size, window_size, 1 - mask_windows = mask_windows.reshape( - [-1, self.window_size * self.window_size]) - attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) - huns = -100.0 * paddle.ones_like(attn_mask) - attn_mask = huns * (attn_mask != 0).astype("float32") - - for blk in self.blocks: - blk.H, blk.W = H, W - x = blk(x, attn_mask) - if self.downsample is not None: - x_down = self.downsample(x, H, W) - Wh, Ww = (H + 1) // 2, (W + 1) // 2 - return x, H, W, x_down, Wh, Ww - else: - return x, H, W, x, H, W - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - Args: - patch_size (int): Patch token size. Default: 4. - in_chans (int): Number of input image channels. Default: 3. - embed_dim (int): Number of linear projection output channels. Default: 96. - norm_layer (nn.Layer, optional): Normalization layer. 
Default: None - """ - - def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): - super().__init__() - patch_size = to_2tuple(patch_size) - self.patch_size = patch_size - - self.in_chans = in_chans - self.embed_dim = embed_dim - - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - if norm_layer is not None: - self.norm = norm_layer(embed_dim) - else: - self.norm = None - - def forward(self, x): - # TODO # export dynamic shape - B, C, H, W = x.shape - # assert [H, W] == self.img_size[:2], "Input image size ({H}*{W}) doesn't match model ({}*{}).".format(H, W, self.img_size[0], self.img_size[1]) - if W % self.patch_size[1] != 0: - x = F.pad(x, [0, self.patch_size[1] - W % self.patch_size[1], 0, 0]) - if H % self.patch_size[0] != 0: - x = F.pad(x, [0, 0, 0, self.patch_size[0] - H % self.patch_size[0]]) - - x = self.proj(x) - if self.norm is not None: - _, _, Wh, Ww = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = self.norm(x) - x = x.transpose([0, 2, 1]).reshape([-1, self.embed_dim, Wh, Ww]) - - return x - - -@register -@serializable -class SwinTransformer(nn.Layer): - """ Swin Transformer backbone - Args: - arch (str): Architecture of FocalNet - pretrain_img_size (int | tuple(int)): Input image size. Default 224 - patch_size (int | tuple(int)): Patch size. Default: 4 - in_chans (int): Number of input image channels. Default: 3 - embed_dim (int): Patch embedding dimension. Default: 96 - depths (tuple(int)): Depth of each Swin Transformer layer. - num_heads (tuple(int)): Number of attention heads in different layers. - window_size (int): Window size. Default: 7 - mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 - qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True - qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None - drop_rate (float): Dropout rate. Default: 0 - attn_drop_rate (float): Attention dropout rate. Default: 0 - drop_path_rate (float): Stochastic depth rate. Default: 0.1 - norm_layer (nn.Layer): Normalization layer. Default: nn.LayerNorm. - ape (bool): If True, add absolute position embedding to the patch embedding. Default: False - patch_norm (bool): If True, add normalization after patch embedding. 
Default: True - """ - - def __init__(self, - arch='swin_T_224', - pretrain_img_size=224, - patch_size=4, - in_chans=3, - embed_dim=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4., - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.2, - norm_layer=nn.LayerNorm, - ape=False, - patch_norm=True, - out_indices=(0, 1, 2, 3), - frozen_stages=-1, - pretrained=None): - super(SwinTransformer, self).__init__() - assert arch in MODEL_cfg.keys(), "Unsupported arch: {}".format(arch) - - pretrain_img_size = MODEL_cfg[arch]['pretrain_img_size'] - embed_dim = MODEL_cfg[arch]['embed_dim'] - depths = MODEL_cfg[arch]['depths'] - num_heads = MODEL_cfg[arch]['num_heads'] - window_size = MODEL_cfg[arch]['window_size'] - if pretrained is None: - pretrained = MODEL_cfg[arch]['pretrained'] - - self.num_layers = len(depths) - self.ape = ape - self.patch_norm = patch_norm - self.out_indices = out_indices - self.frozen_stages = frozen_stages - - # split image into non-overlapping patches - self.patch_embed = PatchEmbed( - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - norm_layer=norm_layer if self.patch_norm else None) - - # absolute position embedding - if self.ape: - pretrain_img_size = to_2tuple(pretrain_img_size) - patch_size = to_2tuple(patch_size) - patches_resolution = [ - pretrain_img_size[0] // patch_size[0], - pretrain_img_size[1] // patch_size[1] - ] - - self.absolute_pos_embed = add_parameter( - self, - paddle.zeros((1, embed_dim, patches_resolution[0], - patches_resolution[1]))) - trunc_normal_(self.absolute_pos_embed) - - self.pos_drop = nn.Dropout(p=drop_rate) - - # stochastic depth - dpr = np.linspace(0, drop_path_rate, - sum(depths)) # stochastic depth decay rule - - # build layers - self.layers = nn.LayerList() - for i_layer in range(self.num_layers): - layer = BasicLayer( - dim=int(embed_dim * 2**i_layer), - depth=depths[i_layer], - num_heads=num_heads[i_layer], - window_size=window_size, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], - norm_layer=norm_layer, - downsample=PatchMerging - if (i_layer < self.num_layers - 1) else None) - self.layers.append(layer) - - num_features = [int(embed_dim * 2**i) for i in range(self.num_layers)] - self.num_features = num_features - - # add a norm layer for each output - for i_layer in out_indices: - layer = norm_layer(num_features[i_layer]) - layer_name = f'norm{i_layer}' - self.add_sublayer(layer_name, layer) - - self.apply(self._init_weights) - self._freeze_stages() - if pretrained: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - self.set_state_dict(paddle.load(path)) - - def _freeze_stages(self): - if self.frozen_stages >= 0: - self.patch_embed.eval() - for param in self.patch_embed.parameters(): - param.stop_gradient = True - - if self.frozen_stages >= 1 and self.ape: - self.absolute_pos_embed.stop_gradient = True - - if self.frozen_stages >= 2: - self.pos_drop.eval() - for i in range(0, self.frozen_stages - 1): - m = self.layers[i] - m.eval() - for param in m.parameters(): - param.stop_gradient = True - - def _init_weights(self, m): - if isinstance(m, nn.Linear): - trunc_normal_(m.weight) - if isinstance(m, nn.Linear) and m.bias is not None: - zeros_(m.bias) - elif isinstance(m, nn.LayerNorm): - zeros_(m.bias) - 
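# LayerNorm is initialized to the identity affine transform:
# bias zeroed above, scale set to one below.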
ones_(m.weight) - - def forward(self, x): - """Forward function.""" - x = self.patch_embed(x['image']) - B, _, Wh, Ww = x.shape - if self.ape: - # interpolate the position embedding to the corresponding size - absolute_pos_embed = F.interpolate( - self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic') - x = (x + absolute_pos_embed).flatten(2).transpose([0, 2, 1]) - else: - x = x.flatten(2).transpose([0, 2, 1]) - x = self.pos_drop(x) - outs = [] - for i in range(self.num_layers): - layer = self.layers[i] - x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) - if i in self.out_indices: - norm_layer = getattr(self, f'norm{i}') - x_out = norm_layer(x_out) - out = x_out.reshape((-1, H, W, self.num_features[i])).transpose( - (0, 3, 1, 2)) - outs.append(out) - - return outs - - @property - def out_shape(self): - out_strides = [4, 8, 16, 32] - return [ - ShapeSpec( - channels=self.num_features[i], stride=out_strides[i]) - for i in self.out_indices - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/trans_encoder.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/trans_encoder.py deleted file mode 100644 index 1a45e0f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/trans_encoder.py +++ /dev/null @@ -1,381 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
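The out_shape property above falls out of Swin's hierarchy: patch embedding downsamples by 4, and each PatchMerging halves the resolution while doubling the channel width. A short sketch of the stage geometry for the swin_T_224 configuration, using the values from MODEL_cfg above:

import numpy as np

embed_dim, depths = 96, [2, 2, 6, 2]                               # swin_T_224
num_features = [embed_dim * 2 ** i for i in range(len(depths))]    # [96, 192, 384, 768]
out_strides = [4 * 2 ** i for i in range(len(depths))]             # [4, 8, 16, 32]
drop_path = np.linspace(0, 0.2, sum(depths))  # per-block stochastic depth, linearly increasing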
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn import ReLU, Swish, GELU -import math - -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['TransEncoder'] - - -class BertEmbeddings(nn.Layer): - def __init__(self, word_size, position_embeddings_size, word_type_size, - hidden_size, dropout_prob): - super(BertEmbeddings, self).__init__() - self.word_embeddings = nn.Embedding( - word_size, hidden_size, padding_idx=0) - self.position_embeddings = nn.Embedding(position_embeddings_size, - hidden_size) - self.token_type_embeddings = nn.Embedding(word_type_size, hidden_size) - self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) - self.dropout = nn.Dropout(dropout_prob) - - def forward(self, x, token_type_ids=None, position_ids=None): - seq_len = paddle.shape(x)[1] - if position_ids is None: - position_ids = paddle.arange(seq_len).unsqueeze(0).expand_as(x) - if token_type_ids is None: - token_type_ids = paddle.zeros(paddle.shape(x)) - - word_embs = self.word_embeddings(x) - position_embs = self.position_embeddings(position_ids) - token_type_embs = self.token_type_embeddings(token_type_ids) - - embs_cmb = word_embs + position_embs + token_type_embs - embs_out = self.layernorm(embs_cmb) - embs_out = self.dropout(embs_out) - return embs_out - - -class BertSelfAttention(nn.Layer): - def __init__(self, - hidden_size, - num_attention_heads, - attention_probs_dropout_prob, - output_attentions=False): - super(BertSelfAttention, self).__init__() - if hidden_size % num_attention_heads != 0: - raise ValueError( - "The hidden_size must be a multiple of the number of attention " - "heads, but got {} % {} != 0" % - (hidden_size, num_attention_heads)) - - self.num_attention_heads = num_attention_heads - self.attention_head_size = int(hidden_size / num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.query = nn.Linear(hidden_size, self.all_head_size) - self.key = nn.Linear(hidden_size, self.all_head_size) - self.value = nn.Linear(hidden_size, self.all_head_size) - - self.dropout = nn.Dropout(attention_probs_dropout_prob) - self.output_attentions = output_attentions - - def forward(self, x, attention_mask, head_mask=None): - query = self.query(x) - key = self.key(x) - value = self.value(x) - - query_dim1, query_dim2 = paddle.shape(query)[:-1] - new_shape = [ - query_dim1, query_dim2, self.num_attention_heads, - self.attention_head_size - ] - query = query.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) - key = key.reshape(new_shape).transpose(perm=(0, 2, 3, 1)) - value = value.reshape(new_shape).transpose(perm=(0, 2, 1, 3)) - - attention = paddle.matmul(query, - key) / math.sqrt(self.attention_head_size) - attention = attention + attention_mask - attention_value = F.softmax(attention, axis=-1) - attention_value = self.dropout(attention_value) - - if head_mask is not None: - attention_value = attention_value * head_mask - - context = paddle.matmul(attention_value, value).transpose(perm=(0, 2, 1, - 3)) - ctx_dim1, ctx_dim2 = paddle.shape(context)[:-2] - new_context_shape = [ - ctx_dim1, - ctx_dim2, - self.all_head_size, - ] - context = context.reshape(new_context_shape) - - if self.output_attentions: - return (context, attention_value) - else: - return (context, ) - - -class BertAttention(nn.Layer): - def __init__(self, - hidden_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - output_attentions=False): - super(BertAttention, self).__init__() - 
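# Attention sublayer wiring: self-attention, then a linear projection with
# dropout, then residual add + layernorm (applied in forward below).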
self.bert_selfattention = BertSelfAttention( - hidden_size, num_attention_heads, attention_probs_dropout_prob, - output_attentions) - self.fc = nn.Linear(hidden_size, hidden_size) - self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) - self.dropout = nn.Dropout(fc_dropout_prob) - - def forward(self, x, attention_mask, head_mask=None): - attention_feats = self.bert_selfattention(x, attention_mask, head_mask) - features = self.fc(attention_feats[0]) - features = self.dropout(features) - features = self.layernorm(features + x) - if len(attention_feats) == 2: - return (features, attention_feats[1]) - else: - return (features, ) - - -class BertFeedForward(nn.Layer): - def __init__(self, - hidden_size, - intermediate_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False): - super(BertFeedForward, self).__init__() - self.fc1 = nn.Linear(hidden_size, intermediate_size) - self.act_fn = eval(act_fn) - self.fc2 = nn.Linear(intermediate_size, hidden_size) - self.layernorm = nn.LayerNorm(hidden_size, epsilon=1e-8) - self.dropout = nn.Dropout(fc_dropout_prob) - - def forward(self, x): - features = self.fc1(x) - features = self.act_fn(features) - features = self.fc2(features) - features = self.dropout(features) - features = self.layernorm(features + x) - return features - - -class BertLayer(nn.Layer): - def __init__(self, - hidden_size, - intermediate_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False): - super(BertLayer, self).__init__() - self.attention = BertAttention(hidden_size, num_attention_heads, - attention_probs_dropout_prob, - output_attentions) - self.feed_forward = BertFeedForward( - hidden_size, intermediate_size, num_attention_heads, - attention_probs_dropout_prob, fc_dropout_prob, act_fn, - output_attentions) - - def forward(self, x, attention_mask, head_mask=None): - attention_feats = self.attention(x, attention_mask, head_mask) - features = self.feed_forward(attention_feats[0]) - if len(attention_feats) == 2: - return (features, attention_feats[1]) - else: - return (features, ) - - -class BertEncoder(nn.Layer): - def __init__(self, - num_hidden_layers, - hidden_size, - intermediate_size, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False, - output_hidden_feats=False): - super(BertEncoder, self).__init__() - self.output_attentions = output_attentions - self.output_hidden_feats = output_hidden_feats - self.layers = nn.LayerList([ - BertLayer(hidden_size, intermediate_size, num_attention_heads, - attention_probs_dropout_prob, fc_dropout_prob, act_fn, - output_attentions) for _ in range(num_hidden_layers) - ]) - - def forward(self, x, attention_mask, head_mask=None): - all_features = (x, ) - all_attentions = () - - for i, layer in enumerate(self.layers): - mask = head_mask[i] if head_mask is not None else None - layer_out = layer(x, attention_mask, mask) - - if self.output_hidden_feats: - all_features = all_features + (x, ) - x = layer_out[0] - if self.output_attentions: - all_attentions = all_attentions + (layer_out[1], ) - - outputs = (x, ) - if self.output_hidden_feats: - outputs += (all_features, ) - if self.output_attentions: - outputs += (all_attentions, ) - return outputs - - -class BertPooler(nn.Layer): - def __init__(self, hidden_size): - super(BertPooler, self).__init__() - self.fc = nn.Linear(hidden_size, hidden_size) - self.act = nn.Tanh() - - def forward(self, x): - 
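# Pool the sequence by taking the hidden state of the first token
# (the [CLS] position in BERT-style models).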
first_token = x[:, 0] - pooled_output = self.fc(first_token) - pooled_output = self.act(pooled_output) - return pooled_output - - -class METROEncoder(nn.Layer): - def __init__(self, - vocab_size, - num_hidden_layers, - features_dims, - position_embeddings_size, - hidden_size, - intermediate_size, - output_feature_dim, - num_attention_heads, - attention_probs_dropout_prob, - fc_dropout_prob, - act_fn='ReLU', - output_attentions=False, - output_hidden_feats=False, - use_img_layernorm=False): - super(METROEncoder, self).__init__() - self.img_dims = features_dims - self.num_hidden_layers = num_hidden_layers - self.use_img_layernorm = use_img_layernorm - self.output_attentions = output_attentions - self.embedding = BertEmbeddings(vocab_size, position_embeddings_size, 2, - hidden_size, fc_dropout_prob) - self.encoder = BertEncoder( - num_hidden_layers, hidden_size, intermediate_size, - num_attention_heads, attention_probs_dropout_prob, fc_dropout_prob, - act_fn, output_attentions, output_hidden_feats) - self.pooler = BertPooler(hidden_size) - self.position_embeddings = nn.Embedding(position_embeddings_size, - hidden_size) - self.img_embedding = nn.Linear( - features_dims, hidden_size, bias_attr=True) - self.dropout = nn.Dropout(fc_dropout_prob) - self.cls_head = nn.Linear(hidden_size, output_feature_dim) - self.residual = nn.Linear(features_dims, output_feature_dim) - - self.apply(self.init_weights) - - def init_weights(self, module): - """ Initialize the weights. - """ - if isinstance(module, (nn.Linear, nn.Embedding)): - module.weight.set_value( - paddle.normal( - mean=0.0, std=0.02, shape=module.weight.shape)) - elif isinstance(module, nn.LayerNorm): - module.bias.set_value(paddle.zeros(shape=module.bias.shape)) - module.weight.set_value( - paddle.full( - shape=module.weight.shape, fill_value=1.0)) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.set_value(paddle.zeros(shape=module.bias.shape)) - - def forward(self, x): - batchsize, seq_len = paddle.shape(x)[:2] - input_ids = paddle.zeros((batchsize, seq_len), dtype="int64") - position_ids = paddle.arange( - seq_len, dtype="int64").unsqueeze(0).expand_as(input_ids) - - attention_mask = paddle.ones_like(input_ids).unsqueeze(1).unsqueeze(2) - head_mask = [None] * self.num_hidden_layers - - position_embs = self.position_embeddings(position_ids) - attention_mask = (1.0 - attention_mask) * -10000.0 - - img_features = self.img_embedding(x) - - # We empirically observe that adding an additional learnable position embedding leads to more stable training - embeddings = position_embs + img_features - if self.use_img_layernorm: - embeddings = self.layernorm(embeddings) - embeddings = self.dropout(embeddings) - - encoder_outputs = self.encoder( - embeddings, attention_mask, head_mask=head_mask) - - pred_score = self.cls_head(encoder_outputs[0]) - res_img_feats = self.residual(x) - pred_score = pred_score + res_img_feats - - if self.output_attentions and self.output_hidden_feats: - return pred_score, encoder_outputs[1], encoder_outputs[-1] - else: - return pred_score - - -def gelu(x): - """Implementation of the gelu activation function. 
- https://arxiv.org/abs/1606.08415 - """ - return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0))) - - -@register -class TransEncoder(nn.Layer): - def __init__(self, - vocab_size=30522, - num_hidden_layers=4, - num_attention_heads=4, - position_embeddings_size=512, - intermediate_size=3072, - input_feat_dim=[2048, 512, 128], - hidden_feat_dim=[1024, 256, 128], - attention_probs_dropout_prob=0.1, - fc_dropout_prob=0.1, - act_fn='gelu', - output_attentions=False, - output_hidden_feats=False): - super(TransEncoder, self).__init__() - output_feat_dim = input_feat_dim[1:] + [3] - trans_encoder = [] - for i in range(len(output_feat_dim)): - features_dims = input_feat_dim[i] - output_feature_dim = output_feat_dim[i] - hidden_size = hidden_feat_dim[i] - - # init a transformer encoder and append it to a list - assert hidden_size % num_attention_heads == 0 - model = METROEncoder(vocab_size, num_hidden_layers, features_dims, - position_embeddings_size, hidden_size, - intermediate_size, output_feature_dim, - num_attention_heads, - attention_probs_dropout_prob, fc_dropout_prob, - act_fn, output_attentions, output_hidden_feats) - trans_encoder.append(model) - self.trans_encoder = paddle.nn.Sequential(*trans_encoder) - - def forward(self, x): - out = self.trans_encoder(x) - return out diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/transformer_utils.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/transformer_utils.py deleted file mode 100644 index a0783e1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/transformer_utils.py +++ /dev/null @@ -1,124 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from paddle.nn.initializer import TruncatedNormal, Constant, Assign - -# Common initializations -ones_ = Constant(value=1.) -zeros_ = Constant(value=0.) -trunc_normal_ = TruncatedNormal(std=.02) - - -# Common Layers -def drop_path(x, drop_prob=0., training=False): - """ - Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... - """ - if drop_prob == 0. 
or not training: - return x - keep_prob = paddle.to_tensor(1 - drop_prob, dtype=x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape, dtype=x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Identity(nn.Layer): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -# common funcs - - -def to_2tuple(x): - if isinstance(x, (list, tuple)): - return x - return tuple([x] * 2) - - -def add_parameter(layer, datas, name=None): - parameter = layer.create_parameter( - shape=(datas.shape), default_initializer=Assign(datas)) - if name: - layer.add_parameter(name, parameter) - return parameter - - -def window_partition(x, window_size): - """ - Partition into non-overlapping windows with padding if needed. - Args: - x (tensor): input tokens with [B, H, W, C]. - window_size (int): window size. - Returns: - windows: windows after partition with [B * num_windows, window_size, window_size, C]. - (Hp, Wp): padded height and width before partition - """ - B, H, W, C = paddle.shape(x) - - pad_h = (window_size - H % window_size) % window_size - pad_w = (window_size - W % window_size) % window_size - x = F.pad(x.transpose([0, 3, 1, 2]), - paddle.to_tensor( - [0, int(pad_w), 0, int(pad_h)], - dtype='int32')).transpose([0, 2, 3, 1]) - Hp, Wp = H + pad_h, W + pad_w - - num_h, num_w = Hp // window_size, Wp // window_size - - x = x.reshape([B, num_h, window_size, num_w, window_size, C]) - windows = x.transpose([0, 1, 3, 2, 4, 5]).reshape( - [-1, window_size, window_size, C]) - return windows, (Hp, Wp), (num_h, num_w) - - -def window_unpartition(x, pad_hw, num_hw, hw): - """ - Window unpartition into original sequences and removing padding. - Args: - x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. - pad_hw (Tuple): padded height and width (Hp, Wp). - hw (Tuple): original height and width (H, W) before padding. - Returns: - x: unpartitioned sequences with [B, H, W, C]. 
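The drop_path above divides the surviving activations by keep_prob so the expected value is unchanged at train time. A small numpy sketch of that invariant (sample count and drop probability are illustrative):

import numpy as np

rng = np.random.default_rng(0)
drop_prob = 0.2
keep_prob = 1.0 - drop_prob
x = np.ones((10000, 1), dtype=np.float32)

# Per-sample binary mask, exactly as in drop_path: floor(keep_prob + U[0, 1)).
random_tensor = np.floor(keep_prob + rng.random(x.shape)).astype(np.float32)
out = (x / keep_prob) * random_tensor

print(random_tensor.mean())   # ~0.8: about 80% of residual branches survive
print(out.mean())             # ~1.0: the rescaling preserves the expectation of x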
- """ - Hp, Wp = pad_hw - num_h, num_w = num_hw - H, W = hw - B, window_size, _, C = paddle.shape(x) - B = B // (num_h * num_w) - x = x.reshape([B, num_h, num_w, window_size, window_size, C]) - x = x.transpose([0, 1, 3, 2, 4, 5]).reshape([B, Hp, Wp, C]) - - return x[:, :H, :W, :] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vgg.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vgg.py deleted file mode 100644 index e057532..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vgg.py +++ /dev/null @@ -1,210 +0,0 @@ -from __future__ import division - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn import Conv2D, MaxPool2D -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['VGG'] - -VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]} - - -class ConvBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels, - groups, - pool_size=2, - pool_stride=2, - pool_padding=0, - name=None): - super(ConvBlock, self).__init__() - - self.groups = groups - self.conv0 = nn.Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=3, - stride=1, - padding=1) - self.conv_out_list = [] - for i in range(1, groups): - conv_out = self.add_sublayer( - 'conv{}'.format(i), - Conv2D( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=3, - stride=1, - padding=1)) - self.conv_out_list.append(conv_out) - - self.pool = MaxPool2D( - kernel_size=pool_size, - stride=pool_stride, - padding=pool_padding, - ceil_mode=True) - - def forward(self, inputs): - out = self.conv0(inputs) - out = F.relu(out) - for conv_i in self.conv_out_list: - out = conv_i(out) - out = F.relu(out) - pool = self.pool(out) - return out, pool - - -class ExtraBlock(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - padding, - stride, - kernel_size, - name=None): - super(ExtraBlock, self).__init__() - - self.conv0 = Conv2D( - in_channels=in_channels, - out_channels=mid_channels, - kernel_size=1, - stride=1, - padding=0) - self.conv1 = Conv2D( - in_channels=mid_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding) - - def forward(self, inputs): - out = self.conv0(inputs) - out = F.relu(out) - out = self.conv1(out) - out = F.relu(out) - return out - - -class L2NormScale(nn.Layer): - def __init__(self, num_channels, scale=1.0): - super(L2NormScale, self).__init__() - self.scale = self.create_parameter( - attr=ParamAttr(initializer=paddle.nn.initializer.Constant(scale)), - shape=[num_channels]) - - def forward(self, inputs): - out = F.normalize(inputs, axis=1, epsilon=1e-10) - # out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( - # out) * out - out = self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3) * out - return out - - -@register -@serializable -class VGG(nn.Layer): - def __init__(self, - depth=16, - normalizations=[20., -1, -1, -1, -1, -1], - extra_block_filters=[[256, 512, 1, 2, 3], [128, 256, 1, 2, 3], - [128, 256, 0, 1, 3], - [128, 256, 0, 1, 3]]): - super(VGG, self).__init__() - - assert depth in [16, 19], \ - "depth as 16/19 supported currently, but got {}".format(depth) - self.depth = depth - self.groups = VGG_cfg[depth] - self.normalizations = normalizations - self.extra_block_filters = extra_block_filters - - self._out_channels = [] - - self.conv_block_0 = ConvBlock( - 3, 64, self.groups[0], 2, 2, 0, name="conv1_") - self.conv_block_1 = ConvBlock( - 64, 128, 
self.groups[1], 2, 2, 0, name="conv2_") - self.conv_block_2 = ConvBlock( - 128, 256, self.groups[2], 2, 2, 0, name="conv3_") - self.conv_block_3 = ConvBlock( - 256, 512, self.groups[3], 2, 2, 0, name="conv4_") - self.conv_block_4 = ConvBlock( - 512, 512, self.groups[4], 3, 1, 1, name="conv5_") - self._out_channels.append(512) - - self.fc6 = Conv2D( - in_channels=512, - out_channels=1024, - kernel_size=3, - stride=1, - padding=6, - dilation=6) - self.fc7 = Conv2D( - in_channels=1024, - out_channels=1024, - kernel_size=1, - stride=1, - padding=0) - self._out_channels.append(1024) - - # extra block - self.extra_convs = [] - last_channels = 1024 - for i, v in enumerate(self.extra_block_filters): - assert len(v) == 5, "extra_block_filters size not fix" - extra_conv = self.add_sublayer("conv{}".format(6 + i), - ExtraBlock(last_channels, v[0], v[1], - v[2], v[3], v[4])) - last_channels = v[1] - self.extra_convs.append(extra_conv) - self._out_channels.append(last_channels) - - self.norms = [] - for i, n in enumerate(self.normalizations): - if n != -1: - norm = self.add_sublayer("norm{}".format(i), - L2NormScale( - self.extra_block_filters[i][1], n)) - else: - norm = None - self.norms.append(norm) - - def forward(self, inputs): - outputs = [] - - conv, pool = self.conv_block_0(inputs['image']) - conv, pool = self.conv_block_1(pool) - conv, pool = self.conv_block_2(pool) - conv, pool = self.conv_block_3(pool) - outputs.append(conv) - - conv, pool = self.conv_block_4(pool) - out = self.fc6(pool) - out = F.relu(out) - out = self.fc7(out) - out = F.relu(out) - outputs.append(out) - - if not self.extra_block_filters: - return outputs - - # extra block - for extra_conv in self.extra_convs: - out = extra_conv(out) - outputs.append(out) - - for i, n in enumerate(self.normalizations): - if n != -1: - outputs[i] = self.norms[i](outputs[i]) - - return outputs - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vision_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vision_transformer.py deleted file mode 100644 index a21eefc..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vision_transformer.py +++ /dev/null @@ -1,652 +0,0 @@ -# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
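Before the transformer backbones, a quick arithmetic check on the removed vgg.py: VGG_cfg counts conv layers per ConvBlock, and the recorded output channels follow conv4, fc7, then each ExtraBlock's out_channels. A sketch using the defaults above:

# Values copied from the removed vgg.py defaults.
VGG_cfg = {16: [2, 2, 3, 3, 3], 19: [2, 2, 4, 4, 4]}
extra_block_filters = [[256, 512, 1, 2, 3], [128, 256, 1, 2, 3],
                       [128, 256, 0, 1, 3], [128, 256, 0, 1, 3]]

print(sum(VGG_cfg[16]))   # 13 conv layers across the five ConvBlocks (VGG-16)

# _out_channels: 512 from conv4, 1024 from fc7, then v[1] per ExtraBlock.
out_channels = [512, 1024] + [v[1] for v in extra_block_filters]
print(out_channels)       # [512, 1024, 512, 256, 256, 256]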
- -import math - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import numpy as np -from paddle.nn.initializer import Constant - -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable - -from .transformer_utils import zeros_, DropPath, Identity - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - x = self.drop(x) - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0., - window_size=None): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias_attr=False) - - if qkv_bias: - self.q_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - self.v_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - else: - self.q_bias = None - self.v_bias = None - if window_size: - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - shape=(self.num_relative_distance, num_heads), - default_initializer=zeros_) # 2*Wh-1 * 2*Ww-1, nH - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = paddle.flatten(coords, 1) # 2, Wh*Ww - coords_flatten_1 = paddle.unsqueeze(coords_flatten, 2) - coords_flatten_2 = paddle.unsqueeze(coords_flatten, 1) - relative_coords = coords_flatten_1.clone() - coords_flatten_2.clone( - ) - - #relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Wh - relative_coords = relative_coords.transpose( - (1, 2, 0)) #.contiguous() # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[ - 0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros(shape=(window_size[0] * window_size[1] + 1, ) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - - self.register_buffer("relative_position_index", - relative_position_index) - # trunc_normal_(self.relative_position_bias_table, std=.0) - else: - self.window_size = None - self.relative_position_bias_table = None - self.relative_position_index = None - - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x, rel_pos_bias=None): - x_shape = paddle.shape(x) - N, C = x_shape[1], x_shape[2] - - qkv_bias = None - if 
self.q_bias is not None: - qkv_bias = paddle.concat( - (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) - qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) - - qkv = qkv.reshape((-1, N, 3, self.num_heads, - C // self.num_heads)).transpose((2, 0, 3, 1, 4)) - q, k, v = qkv[0], qkv[1], qkv[2] - attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - - if self.relative_position_bias_table is not None: - relative_position_bias = self.relative_position_bias_table[ - self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1 - ]) # Wh*Ww,Wh*Ww,nH - relative_position_bias = relative_position_bias.transpose( - (2, 0, 1)) #.contiguous() # nH, Wh*Ww, Wh*Ww - attn = attn + relative_position_bias.unsqueeze(0) - if rel_pos_bias is not None: - attn = attn + rel_pos_bias - - attn = nn.functional.softmax(attn, axis=-1) - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) - x = self.proj(x) - x = self.proj_drop(x) - return x - - -class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - window_size=None, - init_values=None, - act_layer=nn.GELU, - norm_layer='nn.LayerNorm', - epsilon=1e-5): - super().__init__() - self.norm1 = nn.LayerNorm(dim, epsilon=1e-6) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - window_size=window_size) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() - self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - if init_values is not None: - self.gamma_1 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - self.gamma_2 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - else: - self.gamma_1, self.gamma_2 = None, None - - def forward(self, x, rel_pos_bias=None): - - if self.gamma_1 is None: - x = x + self.drop_path( - self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - x = x + self.drop_path(self.gamma_1 * self.attn( - self.norm1(x), rel_pos_bias=rel_pos_bias)) - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - return x - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ - - def __init__(self, - img_size=[224, 224], - patch_size=16, - in_chans=3, - embed_dim=768): - super().__init__() - self.num_patches_w = img_size[0] // patch_size - self.num_patches_h = img_size[1] // patch_size - - num_patches = self.num_patches_w * self.num_patches_h - self.patch_shape = (img_size[0] // patch_size, - img_size[1] // patch_size) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2D( - in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) - - @property - def num_patches_in_h(self): - return self.img_size[1] // self.patch_size - - @property - def num_patches_in_w(self): - return self.img_size[0] // self.patch_size - - def forward(self, x, mask=None): - B, C, H, W = x.shape - return self.proj(x) - - -class RelativePositionBias(nn.Layer): - def 
__init__(self, window_size, num_heads): - super().__init__() - self.window_size = window_size - self.num_relative_distance = (2 * window_size[0] - 1) * ( - 2 * window_size[1] - 1) + 3 - self.relative_position_bias_table = self.create_parameter( - shape=(self.num_relative_distance, num_heads), - default_initialize=zeros_) - # cls to token & token 2 cls & cls to cls - - # get pair-wise relative position index for each token inside the window - coords_h = paddle.arange(window_size[0]) - coords_w = paddle.arange(window_size[1]) - coords = paddle.stack(paddle.meshgrid( - [coords_h, coords_w])) # 2, Wh, Ww - coords_flatten = coords.flatten(1) # 2, Wh*Ww - - relative_coords = coords_flatten[:, :, - None] - coords_flatten[:, - None, :] # 2, Wh*Ww, Wh*Ww - relative_coords = relative_coords.transpos( - (1, 2, 0)) # Wh*Ww, Wh*Ww, 2 - relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 - relative_coords[:, :, 1] += window_size[1] - 1 - relative_coords[:, :, 0] *= 2 * window_size[1] - 1 - relative_position_index = \ - paddle.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) - relative_position_index[1:, 1:] = relative_coords.sum( - -1) # Wh*Ww, Wh*Ww - relative_position_index[0, 0:] = self.num_relative_distance - 3 - relative_position_index[0:, 0] = self.num_relative_distance - 2 - relative_position_index[0, 0] = self.num_relative_distance - 1 - self.register_buffer("relative_position_index", relative_position_index) - - def forward(self): - relative_position_bias = \ - self.relative_position_bias_table[self.relative_position_index.reshape([-1])].reshape([ - self.window_size[0] * self.window_size[1] + 1, - self.window_size[0] * self.window_size[1] + 1, -1]) # Wh*Ww,Wh*Ww,nH - return relative_position_bias.transpose((2, 0, 1)) # nH, Wh*Ww, Wh*Ww - - -def get_sinusoid_encoding_table(n_position, d_hid, token=False): - ''' Sinusoid position encoding table ''' - - def get_position_angle_vec(position): - return [ - position / np.power(10000, 2 * (hid_j // 2) / d_hid) - for hid_j in range(d_hid) - ] - - sinusoid_table = np.array( - [get_position_angle_vec(pos_i) for pos_i in range(n_position)]) - sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i - sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1 - if token: - sinusoid_table = np.concatenate( - [sinusoid_table, np.zeros([1, d_hid])], dim=0) - - return paddle.to_tensor(sinusoid_table, dtype=paddle.float32).unsqueeze(0) - - -@register -@serializable -class VisionTransformer(nn.Layer): - """ Vision Transformer with support for patch input - """ - - def __init__(self, - img_size=[672, 1092], - patch_size=16, - in_chans=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_layer='nn.LayerNorm', - init_values=None, - use_rel_pos_bias=False, - use_shared_rel_pos_bias=False, - epsilon=1e-5, - final_norm=False, - pretrained=None, - out_indices=[3, 5, 7, 11], - use_abs_pos_emb=False, - use_sincos_pos_emb=True, - with_fpn=True, - num_fpn_levels=4, - use_checkpoint=False, - **args): - super().__init__() - self.img_size = img_size - self.embed_dim = embed_dim - self.with_fpn = with_fpn - self.use_checkpoint = use_checkpoint - self.use_sincos_pos_emb = use_sincos_pos_emb - self.use_rel_pos_bias = use_rel_pos_bias - self.final_norm = final_norm - self.out_indices = out_indices - self.num_fpn_levels = num_fpn_levels - - if use_checkpoint: - paddle.seed(0) - - self.patch_embed = 
PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim) - - self.pos_w = self.patch_embed.num_patches_in_w - self.pos_h = self.patch_embed.num_patches_in_h - - self.cls_token = self.create_parameter( - shape=(1, 1, embed_dim), - default_initializer=paddle.nn.initializer.Constant(value=0.)) - - if use_abs_pos_emb: - self.pos_embed = self.create_parameter( - shape=(1, self.pos_w * self.pos_h + 1, embed_dim), - default_initializer=paddle.nn.initializer.TruncatedNormal( - std=.02)) - elif use_sincos_pos_emb: - pos_embed = self.build_2d_sincos_position_embedding(embed_dim) - - self.pos_embed = pos_embed - self.pos_embed = self.create_parameter(shape=pos_embed.shape) - self.pos_embed.set_value(pos_embed.numpy()) - self.pos_embed.stop_gradient = True - - else: - self.pos_embed = None - - self.pos_drop = nn.Dropout(p=drop_rate) - - if use_shared_rel_pos_bias: - self.rel_pos_bias = RelativePositionBias( - window_size=self.patch_embed.patch_shape, num_heads=num_heads) - else: - self.rel_pos_bias = None - - dpr = np.linspace(0, drop_path_rate, depth) - - self.blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - init_values=init_values, - window_size=self.patch_embed.patch_shape - if use_rel_pos_bias else None, - epsilon=epsilon) for i in range(depth) - ]) - - self.pretrained = pretrained - self.init_weight() - - assert len(out_indices) <= 4, '' - self.out_indices = out_indices - self.out_channels = [embed_dim for _ in range(num_fpn_levels)] - self.out_strides = [4, 8, 16, 32][-num_fpn_levels:] if with_fpn else [ - patch_size for _ in range(len(out_indices)) - ] - - self.norm = Identity() - - if self.with_fpn: - assert num_fpn_levels <= 4, '' - self.init_fpn( - embed_dim=embed_dim, - patch_size=patch_size, ) - - def init_weight(self): - pretrained = self.pretrained - - if pretrained: - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - - load_state_dict = paddle.load(path) - model_state_dict = self.state_dict() - pos_embed_name = "pos_embed" - - if pos_embed_name in load_state_dict.keys(): - load_pos_embed = paddle.to_tensor( - load_state_dict[pos_embed_name], dtype="float32") - if self.pos_embed.shape != load_pos_embed.shape: - pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) - model_state_dict[pos_embed_name] = self.resize_pos_embed( - load_pos_embed, (pos_size, pos_size), - (self.pos_h, self.pos_w)) - - # self.set_state_dict(model_state_dict) - load_state_dict[pos_embed_name] = model_state_dict[ - pos_embed_name] - - print("Load pos_embed and resize it from {} to {} .".format( - load_pos_embed.shape, self.pos_embed.shape)) - - self.set_state_dict(load_state_dict) - print("Load load_state_dict....") - - def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): - if patch_size == 16: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), - nn.BatchNorm2D(embed_dim), - nn.GELU(), - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn3 = Identity() - - self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) - elif patch_size == 8: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - 
embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = Identity() - - self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) - - self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) - - if not out_with_norm: - self.norm = Identity() - else: - self.norm = nn.LayerNorm(embed_dim, epsilon=1e-6) - - def interpolate_pos_encoding(self, x, w, h): - npatch = x.shape[1] - 1 - N = self.pos_embed.shape[1] - 1 - w0 = w // self.patch_embed.patch_size - h0 = h // self.patch_embed.patch_size - if npatch == N and w0 == self.patch_embed.num_patches_w and h0 == self.patch_embed.num_patches_h: - return self.pos_embed - class_pos_embed = self.pos_embed[:, 0] - patch_pos_embed = self.pos_embed[:, 1:] - dim = x.shape[-1] - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - # w0, h0 = w0 + 0.1, h0 + 0.1 - # patch_pos_embed = nn.functional.interpolate( - # patch_pos_embed.reshape([ - # 1, self.patch_embed.num_patches_w, - # self.patch_embed.num_patches_h, dim - # ]).transpose((0, 3, 1, 2)), - # scale_factor=(w0 / self.patch_embed.num_patches_w, - # h0 / self.patch_embed.num_patches_h), - # mode='bicubic', ) - - patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.reshape([ - 1, self.patch_embed.num_patches_w, - self.patch_embed.num_patches_h, dim - ]).transpose((0, 3, 1, 2)), - (w0, h0), - mode='bicubic', ) - - assert int(w0) == patch_pos_embed.shape[-2] and int( - h0) == patch_pos_embed.shape[-1] - patch_pos_embed = patch_pos_embed.transpose( - (0, 2, 3, 1)).reshape([1, -1, dim]) - return paddle.concat( - (class_pos_embed.unsqueeze(0), patch_pos_embed), axis=1) - - def resize_pos_embed(self, pos_embed, old_hw, new_hw): - """ - Resize pos_embed weight. - Args: - pos_embed (Tensor): the pos_embed weight - old_hw (list[int]): the height and width of old pos_embed - new_hw (list[int]): the height and width of new pos_embed - Returns: - Tensor: the resized pos_embed weight - """ - cls_pos_embed = pos_embed[:, :1, :] - pos_embed = pos_embed[:, 1:, :] - - pos_embed = pos_embed.transpose([0, 2, 1]) - pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) - pos_embed = F.interpolate( - pos_embed, new_hw, mode='bicubic', align_corners=False) - pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) - pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) - - return pos_embed - - def build_2d_sincos_position_embedding( - self, - embed_dim=768, - temperature=10000., ): - h, w = self.patch_embed.patch_shape - grid_w = paddle.arange(w, dtype=paddle.float32) - grid_h = paddle.arange(h, dtype=paddle.float32) - grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) - assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = 1. 
/ (temperature**omega) - - out_w = grid_w.flatten()[..., None] @omega[None] - out_h = grid_h.flatten()[..., None] @omega[None] - - pos_emb = paddle.concat( - [ - paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), - paddle.cos(out_h) - ], - axis=1)[None, :, :] - - pe_token = paddle.zeros([1, 1, embed_dim], dtype=paddle.float32) - pos_embed = paddle.concat([pe_token, pos_emb], axis=1) - # pos_embed.stop_gradient = True - - return pos_embed - - def forward(self, x): - x = x['image'] if isinstance(x, dict) else x - _, _, h, w = x.shape - - x = self.patch_embed(x) - - B, D, Hp, Wp = x.shape # b * c * h * w - - cls_tokens = self.cls_token.expand( - (B, self.cls_token.shape[-2], self.cls_token.shape[-1])) - x = x.flatten(2).transpose([0, 2, 1]) # b * hw * c - x = paddle.concat([cls_tokens, x], axis=1) - - if self.pos_embed is not None: - # x = x + self.interpolate_pos_encoding(x, w, h) - x = x + self.interpolate_pos_encoding(x, h, w) - - x = self.pos_drop(x) - - rel_pos_bias = self.rel_pos_bias( - ) if self.rel_pos_bias is not None else None - - feats = [] - for idx, blk in enumerate(self.blocks): - if self.use_checkpoint and self.training: - x = paddle.distributed.fleet.utils.recompute( - blk, x, rel_pos_bias, **{"preserve_rng_state": True}) - else: - x = blk(x, rel_pos_bias) - - if idx in self.out_indices: - xp = paddle.reshape( - paddle.transpose( - self.norm(x[:, 1:, :]), perm=[0, 2, 1]), - shape=[B, D, Hp, Wp]) - feats.append(xp) - - if self.with_fpn: - fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4][ - -self.num_fpn_levels:] - assert len(fpns) == len(feats) or len(feats) == 1, '' - outputs = [] - for i, m in enumerate(fpns): - outputs.append( - m(feats[i] if len(feats) == len(fpns) else feats[-1])) - - return outputs - - return feats - - @property - def num_layers(self): - return len(self.blocks) - - @property - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=c, stride=s) - for c, s in zip(self.out_channels, self.out_strides) - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vit_mae.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vit_mae.py deleted file mode 100644 index 8d00da7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vit_mae.py +++ /dev/null @@ -1,749 +0,0 @@ -# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
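The build_2d_sincos_position_embedding removed above splits embed_dim into four quarters: sin/cos over the w grid and sin/cos over the h grid. A numpy sketch with tiny, assumed dimensions:

import numpy as np

h, w, embed_dim, temperature = 2, 3, 8, 10000.0
assert embed_dim % 4 == 0
pos_dim = embed_dim // 4

# 'ij' indexing mirrors paddle.meshgrid(grid_w, grid_h).
gw, gh = np.meshgrid(np.arange(w, dtype=np.float32),
                     np.arange(h, dtype=np.float32), indexing='ij')
omega = 1.0 / temperature ** (np.arange(pos_dim, dtype=np.float32) / pos_dim)

out_w = gw.reshape(-1, 1) @ omega[None]   # [h*w, pos_dim]
out_h = gh.reshape(-1, 1) @ omega[None]
pos_emb = np.concatenate([np.sin(out_w), np.cos(out_w),
                          np.sin(out_h), np.cos(out_h)], axis=1)
print(pos_emb.shape)   # (6, 8): one embed_dim vector per patch; the cls slot is prepended separately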
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import numpy as np -import math -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Constant, TruncatedNormal - -from ppdet.modeling.shape_spec import ShapeSpec -from ppdet.core.workspace import register, serializable - -from .transformer_utils import (zeros_, DropPath, Identity, window_partition, - window_unpartition) -from ..initializer import linear_init_ - -__all__ = ['VisionTransformer2D', 'SimpleFeaturePyramid'] - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer='nn.GELU', - drop=0., - lr_factor=1.0): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear( - in_features, - hidden_features, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - self.act = eval(act_layer)() - self.fc2 = nn.Linear( - hidden_features, - out_features, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - self.drop = nn.Dropout(drop) - - self._init_weights() - - def _init_weights(self): - linear_init_(self.fc1) - linear_init_(self.fc2) - - def forward(self, x): - x = self.drop(self.act(self.fc1(x))) - x = self.drop(self.fc2(x)) - return x - - -class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - attn_bias=False, - attn_drop=0., - proj_drop=0., - use_rel_pos=False, - rel_pos_zero_init=True, - window_size=None, - input_size=None, - qk_scale=None, - lr_factor=1.0): - super().__init__() - self.num_heads = num_heads - self.head_dim = dim // num_heads - self.scale = qk_scale or self.head_dim**-0.5 - self.use_rel_pos = use_rel_pos - self.input_size = input_size - self.rel_pos_zero_init = rel_pos_zero_init - self.window_size = window_size - self.lr_factor = lr_factor - - self.qkv = nn.Linear( - dim, - dim * 3, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor) - if attn_bias else False) - if qkv_bias: - self.q_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - self.v_bias = self.create_parameter( - shape=([dim]), default_initializer=zeros_) - else: - self.q_bias = None - self.v_bias = None - self.proj = nn.Linear( - dim, - dim, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - self.attn_drop = nn.Dropout(attn_drop) - if window_size is None: - self.window_size = self.input_size[0] - - self._init_weights() - - def _init_weights(self): - linear_init_(self.qkv) - linear_init_(self.proj) - - if self.use_rel_pos: - self.rel_pos_h = self.create_parameter( - [2 * self.window_size - 1, self.head_dim], - attr=ParamAttr(learning_rate=self.lr_factor), - default_initializer=Constant(value=0.)) - self.rel_pos_w = self.create_parameter( - [2 * self.window_size - 1, self.head_dim], - attr=ParamAttr(learning_rate=self.lr_factor), - default_initializer=Constant(value=0.)) - - if not self.rel_pos_zero_init: - TruncatedNormal(self.rel_pos_h, std=0.02) - TruncatedNormal(self.rel_pos_w, std=0.02) - - def get_rel_pos(self, seq_size, rel_pos): - max_rel_dist = int(2 * seq_size - 1) - # Interpolate rel pos if needed. - if rel_pos.shape[0] != max_rel_dist: - # Interpolate rel pos. 
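The get_rel_pos being defined here indexes a (2S - 1)-row table with (i - j) + (S - 1), so every offset between query position i and key position j gets its own row. A tiny numpy sketch for S = 4:

import numpy as np

S = 4
coords = np.arange(S)
rel = coords[:, None] - coords[None, :] + (S - 1)
print(rel)
# [[3 2 1 0]
#  [4 3 2 1]
#  [5 4 3 2]
#  [6 5 4 3]]  -> valid rows 0..6 of a table with 2*4 - 1 = 7 entries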
- rel_pos = rel_pos.reshape([1, rel_pos.shape[0], -1]) - rel_pos = rel_pos.transpose([0, 2, 1]) - rel_pos_resized = F.interpolate( - rel_pos, - size=(max_rel_dist, ), - mode="linear", - data_format='NCW') - rel_pos_resized = rel_pos_resized.reshape([-1, max_rel_dist]) - rel_pos_resized = rel_pos_resized.transpose([1, 0]) - else: - rel_pos_resized = rel_pos - - coords = paddle.arange(seq_size, dtype='float32') - relative_coords = coords.unsqueeze(-1) - coords.unsqueeze(0) - relative_coords += (seq_size - 1) - relative_coords = relative_coords.astype('int64').flatten() - - return paddle.index_select(rel_pos_resized, relative_coords).reshape( - [seq_size, seq_size, self.head_dim]) - - def add_decomposed_rel_pos(self, attn, q, h, w): - """ - Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. - Args: - attn (Tensor): attention map. - q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). - Returns: - attn (Tensor): attention map with added relative positional embeddings. - """ - Rh = self.get_rel_pos(h, self.rel_pos_h) - Rw = self.get_rel_pos(w, self.rel_pos_w) - - B, _, dim = q.shape - r_q = q.reshape([B, h, w, dim]) - # bhwc, hch->bhwh1 - # bwhc, wcw->bhw1w - rel_h = paddle.einsum("bhwc,hkc->bhwk", r_q, Rh).unsqueeze(-1) - rel_w = paddle.einsum("bhwc,wkc->bhwk", r_q, Rw).unsqueeze(-2) - - attn = attn.reshape([B, h, w, h, w]) + rel_h + rel_w - return attn.reshape([B, h * w, h * w]) - - def forward(self, x): - B, H, W, C = paddle.shape(x) - - if self.q_bias is not None: - qkv_bias = paddle.concat( - (self.q_bias, paddle.zeros_like(self.v_bias), self.v_bias)) - qkv = F.linear(x, weight=self.qkv.weight, bias=qkv_bias) - else: - qkv = self.qkv(x).reshape( - [B, H * W, 3, self.num_heads, self.head_dim]).transpose( - [2, 0, 3, 1, 4]).reshape( - [3, B * self.num_heads, H * W, self.head_dim]) - - q, k, v = qkv[0], qkv[1], qkv[2] - attn = q.matmul(k.transpose([0, 2, 1])) * self.scale - - if self.use_rel_pos: - attn = self.add_decomposed_rel_pos(attn, q, H, W) - - attn = F.softmax(attn, axis=-1) - attn = self.attn_drop(attn) - x = attn.matmul(v).reshape( - [B, self.num_heads, H * W, self.head_dim]).transpose( - [0, 2, 1, 3]).reshape([B, H, W, C]) - x = self.proj(x) - return x - - -class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - attn_bias=False, - qk_scale=None, - init_values=None, - drop=0., - attn_drop=0., - drop_path=0., - use_rel_pos=True, - rel_pos_zero_init=True, - window_size=None, - input_size=None, - act_layer='nn.GELU', - norm_layer='nn.LayerNorm', - lr_factor=1.0, - epsilon=1e-5): - super().__init__() - self.window_size = window_size - - self.norm1 = eval(norm_layer)(dim, - weight_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - bias_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - epsilon=epsilon) - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - attn_bias=attn_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop, - use_rel_pos=use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, - window_size=window_size, - input_size=input_size, - lr_factor=lr_factor) - - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() - self.norm2 = eval(norm_layer)(dim, - weight_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - bias_attr=ParamAttr( - learning_rate=lr_factor, - regularizer=L2Decay(0.0)), - epsilon=epsilon) - self.mlp = Mlp(in_features=dim, - hidden_features=int(dim * mlp_ratio), - act_layer=act_layer, - drop=drop, - lr_factor=lr_factor) - if init_values is not None: - self.gamma_1 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - self.gamma_2 = self.create_parameter( - shape=([dim]), default_initializer=Constant(value=init_values)) - else: - self.gamma_1, self.gamma_2 = None, None - - def forward(self, x): - y = self.norm1(x) - if self.window_size is not None: - y, pad_hw, num_hw = window_partition(y, self.window_size) - y = self.attn(y) - if self.gamma_1 is not None: - y = self.gamma_1 * y - - if self.window_size is not None: - y = window_unpartition(y, pad_hw, num_hw, (x.shape[1], x.shape[2])) - x = x + self.drop_path(y) - if self.gamma_2 is None: - x = x + self.drop_path(self.mlp(self.norm2(x))) - else: - x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) - - return x - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ - - def __init__(self, - img_size=(224, 224), - patch_size=16, - in_chans=3, - embed_dim=768, - lr_factor=0.01): - super().__init__() - self.img_size = img_size - self.patch_size = patch_size - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=patch_size, - stride=patch_size, - weight_attr=ParamAttr(learning_rate=lr_factor), - bias_attr=ParamAttr(learning_rate=lr_factor)) - - @property - def num_patches_in_h(self): - return self.img_size[1] // self.patch_size - - @property - def num_patches_in_w(self): - return self.img_size[0] // self.patch_size - - def forward(self, x): - out = self.proj(x) - return out - - -@register -@serializable -class VisionTransformer2D(nn.Layer): - """ Vision Transformer with support for patch input - """ - - def __init__(self, - img_size=(1024, 1024), - patch_size=16, - in_chans=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=False, - attn_bias=False, - qk_scale=None, - init_values=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - act_layer='nn.GELU', - norm_layer='nn.LayerNorm', - lr_decay_rate=1.0, - global_attn_indexes=(2, 5, 8, 11), - use_abs_pos=False, - use_rel_pos=False, - use_abs_pos_emb=False, - use_sincos_pos_emb=False, - rel_pos_zero_init=True, - epsilon=1e-5, - final_norm=False, - pretrained=None, - window_size=None, - out_indices=(11, ), - with_fpn=False, - use_checkpoint=False, - *args, - **kwargs): - super().__init__() - self.img_size = img_size - self.patch_size = patch_size - self.embed_dim = embed_dim - self.num_heads = num_heads - self.depth = depth - self.global_attn_indexes = global_attn_indexes - self.epsilon = epsilon - self.with_fpn = with_fpn - self.use_checkpoint = use_checkpoint - - self.patch_h = img_size[0] // patch_size - self.patch_w = img_size[1] // patch_size - self.num_patches = self.patch_h * self.patch_w - self.use_abs_pos = use_abs_pos - self.use_abs_pos_emb = use_abs_pos_emb - - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim) - - dpr = np.linspace(0, drop_path_rate, depth) - if use_checkpoint: - paddle.seed(0) - - if use_abs_pos_emb: - self.pos_w = self.patch_embed.num_patches_in_w - self.pos_h = self.patch_embed.num_patches_in_h - self.pos_embed = self.create_parameter( - shape=(1, 
self.pos_w * self.pos_h + 1, embed_dim), - default_initializer=paddle.nn.initializer.TruncatedNormal( - std=.02)) - elif use_sincos_pos_emb: - pos_embed = self.get_2d_sincos_position_embedding(self.patch_h, - self.patch_w) - - self.pos_embed = pos_embed - self.pos_embed = self.create_parameter(shape=pos_embed.shape) - self.pos_embed.set_value(pos_embed.numpy()) - self.pos_embed.stop_gradient = True - else: - self.pos_embed = None - - self.blocks = nn.LayerList([ - Block( - embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - attn_bias=attn_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - use_rel_pos=use_rel_pos, - rel_pos_zero_init=rel_pos_zero_init, - window_size=None - if i in self.global_attn_indexes else window_size, - input_size=[self.patch_h, self.patch_w], - act_layer=act_layer, - lr_factor=self.get_vit_lr_decay_rate(i, lr_decay_rate), - norm_layer=norm_layer, - init_values=init_values, - epsilon=epsilon) for i in range(depth) - ]) - - assert len(out_indices) <= 4, 'out_indices out of bound' - self.out_indices = out_indices - self.pretrained = pretrained - self.init_weight() - - self.out_channels = [embed_dim for _ in range(len(out_indices))] - self.out_strides = [4, 8, 16, 32][-len(out_indices):] if with_fpn else [ - patch_size for _ in range(len(out_indices)) - ] - self.norm = Identity() - if self.with_fpn: - self.init_fpn( - embed_dim=embed_dim, - patch_size=patch_size, - out_with_norm=final_norm) - - def get_vit_lr_decay_rate(self, layer_id, lr_decay_rate): - return lr_decay_rate**(self.depth - layer_id) - - def init_weight(self): - pretrained = self.pretrained - if pretrained: - if 'http' in pretrained: - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: - path = pretrained - - load_state_dict = paddle.load(path) - model_state_dict = self.state_dict() - pos_embed_name = "pos_embed" - - if pos_embed_name in load_state_dict.keys( - ) and self.use_abs_pos_emb: - load_pos_embed = paddle.to_tensor( - load_state_dict[pos_embed_name], dtype="float32") - if self.pos_embed.shape != load_pos_embed.shape: - pos_size = int(math.sqrt(load_pos_embed.shape[1] - 1)) - model_state_dict[pos_embed_name] = self.resize_pos_embed( - load_pos_embed, (pos_size, pos_size), - (self.pos_h, self.pos_w)) - - # self.set_state_dict(model_state_dict) - load_state_dict[pos_embed_name] = model_state_dict[ - pos_embed_name] - - print("Load pos_embed and resize it from {} to {} .".format( - load_pos_embed.shape, self.pos_embed.shape)) - - self.set_state_dict(load_state_dict) - print("Load load_state_dict....") - - def init_fpn(self, embed_dim=768, patch_size=16, out_with_norm=False): - if patch_size == 16: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), - nn.BatchNorm2D(embed_dim), - nn.GELU(), - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn3 = Identity() - - self.fpn4 = nn.MaxPool2D(kernel_size=2, stride=2) - elif patch_size == 8: - self.fpn1 = nn.Sequential( - nn.Conv2DTranspose( - embed_dim, embed_dim, kernel_size=2, stride=2), ) - - self.fpn2 = Identity() - - self.fpn3 = nn.Sequential(nn.MaxPool2D(kernel_size=2, stride=2), ) - - self.fpn4 = nn.Sequential(nn.MaxPool2D(kernel_size=4, stride=4), ) - - if not out_with_norm: - self.norm = Identity() - else: - self.norm = nn.LayerNorm(embed_dim, epsilon=self.epsilon) - - 
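get_vit_lr_decay_rate above implements layer-wise learning-rate decay: the multiplier shrinks geometrically with distance from the last block. A quick sketch with an assumed depth of 12 and decay rate 0.9:

depth, lr_decay_rate = 12, 0.9
factors = [lr_decay_rate ** (depth - layer_id) for layer_id in range(depth)]
print([round(f, 3) for f in factors])
# [0.282, 0.314, 0.349, ..., 0.81, 0.9]: early blocks train with a much
# smaller effective lr than the blocks nearest the head.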
def resize_pos_embed(self, pos_embed, old_hw, new_hw): - """ - Resize pos_embed weight. - Args: - pos_embed (Tensor): the pos_embed weight - old_hw (list[int]): the height and width of old pos_embed - new_hw (list[int]): the height and width of new pos_embed - Returns: - Tensor: the resized pos_embed weight - """ - cls_pos_embed = pos_embed[:, :1, :] - pos_embed = pos_embed[:, 1:, :] - - pos_embed = pos_embed.transpose([0, 2, 1]) - pos_embed = pos_embed.reshape([1, -1, old_hw[0], old_hw[1]]) - pos_embed = F.interpolate( - pos_embed, new_hw, mode='bicubic', align_corners=False) - pos_embed = pos_embed.flatten(2).transpose([0, 2, 1]) - pos_embed = paddle.concat([cls_pos_embed, pos_embed], axis=1) - - return pos_embed - - def get_2d_sincos_position_embedding(self, h, w, temperature=10000.): - grid_y, grid_x = paddle.meshgrid( - paddle.arange( - h, dtype=paddle.float32), - paddle.arange( - w, dtype=paddle.float32)) - assert self.embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = self.embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = (1. / (temperature**omega)).unsqueeze(0) - - out_x = grid_x.reshape([-1, 1]).matmul(omega) - out_y = grid_y.reshape([-1, 1]).matmul(omega) - - pos_emb = paddle.concat( - [ - paddle.sin(out_y), paddle.cos(out_y), paddle.sin(out_x), - paddle.cos(out_x) - ], - axis=1) - - return pos_emb.reshape([1, h, w, self.embed_dim]) - - def forward(self, inputs): - x = self.patch_embed(inputs['image']).transpose([0, 2, 3, 1]) - B, Hp, Wp, _ = paddle.shape(x) - - if self.use_abs_pos: - x = x + self.get_2d_sincos_position_embedding(Hp, Wp) - - if self.use_abs_pos_emb: - x = x + self.resize_pos_embed(self.pos_embed, - (self.pos_h, self.pos_w), (Hp, Wp)) - - feats = [] - for idx, blk in enumerate(self.blocks): - if self.use_checkpoint and self.training: - x = paddle.distributed.fleet.utils.recompute( - blk, x, **{"preserve_rng_state": True}) - else: - x = blk(x) - if idx in self.out_indices: - feats.append(self.norm(x.transpose([0, 3, 1, 2]))) - - if self.with_fpn: - fpns = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] - for i in range(len(feats)): - feats[i] = fpns[i](feats[i]) - return feats - - @property - def num_layers(self): - return len(self.blocks) - - @property - def no_weight_decay(self): - return {'pos_embed', 'cls_token'} - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=c, stride=s) - for c, s in zip(self.out_channels, self.out_strides) - ] - - -class LayerNorm(nn.Layer): - """ - A LayerNorm variant, popularized by Transformers, that performs point-wise mean and - variance normalization over the channel dimension for inputs that have shape - (batch_size, channels, height, width). - Note that, the modified LayerNorm on used in ResBlock and SimpleFeaturePyramid. 
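The LayerNorm variant defined here normalizes over the channel axis of an NCHW tensor, unlike nn.LayerNorm, which normalizes the trailing dimensions. A numpy rendering of its forward pass (random data, assumed shapes):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((2, 8, 4, 4)).astype(np.float32)   # N, C, H, W
weight = np.ones(8, dtype=np.float32)
bias = np.zeros(8, dtype=np.float32)
eps = 1e-6

u = x.mean(axis=1, keepdims=True)                 # per-position mean over C
s = ((x - u) ** 2).mean(axis=1, keepdims=True)    # per-position variance over C
y = (x - u) / np.sqrt(s + eps)
y = weight[None, :, None, None] * y + bias[None, :, None, None]

print(np.abs(y.mean(axis=1)).max())   # ~0: each spatial position is normalized over channels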
- - In ViT, we use the nn.LayerNorm - """ - - def __init__(self, normalized_shape, eps=1e-6): - super().__init__() - self.weight = self.create_parameter([normalized_shape]) - self.bias = self.create_parameter([normalized_shape]) - self.eps = eps - self.normalized_shape = (normalized_shape, ) - - def forward(self, x): - u = x.mean(1, keepdim=True) - s = (x - u).pow(2).mean(1, keepdim=True) - x = (x - u) / paddle.sqrt(s + self.eps) - x = self.weight[:, None, None] * x + self.bias[:, None, None] - return x - - -@register -@serializable -class SimpleFeaturePyramid(nn.Layer): - def __init__(self, - in_channels, - out_channels, - spatial_scales, - num_levels=4, - use_bias=False): - """ - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config - out_channel (int): output channel of each level. - spatial_scales (list[float]): list of scaling factors to upsample or downsample - the input features for creating pyramid features which can be derived from - the output shape of backbone by from_config - num_levels (int): number of levels of output features. - use_bias (bool): whether use bias or not. - """ - super(SimpleFeaturePyramid, self).__init__() - - self.in_channels = in_channels[0] - self.out_channels = out_channels - self.num_levels = num_levels - - self.stages = [] - dim = self.in_channels - if num_levels == 4: - scale_factors = [2.0, 1.0, 0.5] - elif num_levels == 5: - scale_factors = [4.0, 2.0, 1.0, 0.5] - else: - raise NotImplementedError( - f"num_levels={num_levels} is not supported yet.") - - dim = in_channels[0] - for idx, scale in enumerate(scale_factors): - out_dim = dim - if scale == 4.0: - layers = [ - nn.Conv2DTranspose( - dim, dim // 2, kernel_size=2, stride=2), - nn.LayerNorm(dim // 2), - nn.GELU(), - nn.Conv2DTranspose( - dim // 2, dim // 4, kernel_size=2, stride=2), - ] - out_dim = dim // 4 - elif scale == 2.0: - layers = [ - nn.Conv2DTranspose( - dim, dim // 2, kernel_size=2, stride=2) - ] - out_dim = dim // 2 - elif scale == 1.0: - layers = [] - elif scale == 0.5: - layers = [nn.MaxPool2D(kernel_size=2, stride=2)] - - layers.extend([ - nn.Conv2D( - out_dim, - out_channels, - kernel_size=1, - bias_attr=use_bias, ), LayerNorm(out_channels), nn.Conv2D( - out_channels, - out_channels, - kernel_size=3, - padding=1, - bias_attr=use_bias, ), LayerNorm(out_channels) - ]) - layers = nn.Sequential(*layers) - - stage = -int(math.log2(spatial_scales[0] * scale_factors[idx])) - self.add_sublayer(f"simfp_{stage}", layers) - self.stages.append(layers) - - # top block output feature maps. - self.top_block = nn.Sequential( - nn.MaxPool2D( - kernel_size=1, stride=2, padding=0)) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'spatial_scales': [1.0 / i.stride for i in input_shape], - } - - @property - def out_shape(self): - return [ - ShapeSpec(channels=self.out_channels) - for _ in range(self.num_levels) - ] - - def forward(self, feats): - """ - Args: - x: Tensor of shape (N,C,H,W). 
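SimpleFeaturePyramid builds every level from one ViT feature map: with a stride-16 backbone and num_levels=4, the scale factors 2.0/1.0/0.5 yield strides 8/16/32, the top block adds stride 64, and the sublayer name comes from -log2(spatial_scale * scale). A quick check of that arithmetic:

import math

spatial_scale = 1.0 / 16   # stride-16 ViT feature, as produced by from_config
for scale in [2.0, 1.0, 0.5]:
    stage = -int(math.log2(spatial_scale * scale))
    stride = int(1 / (spatial_scale * scale))
    print(f"scale {scale} -> stride {stride}, sublayer simfp_{stage}")
# scale 2.0 -> stride 8, simfp_3; 1.0 -> stride 16, simfp_4; 0.5 -> stride 32, simfp_5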
- """ - features = feats[0] - results = [] - - for stage in self.stages: - results.append(stage(features)) - - top_block_in_feature = results[-1] - results.append(self.top_block(top_block_in_feature)) - assert self.num_levels == len(results) - - return results diff --git a/pdfdet/models/Paddle/ppdet/modeling/backbones/vitpose.py b/pdfdet/models/Paddle/ppdet/modeling/backbones/vitpose.py deleted file mode 100644 index 23e00be..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/backbones/vitpose.py +++ /dev/null @@ -1,320 +0,0 @@ -# copyright (c) 2023 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# Code was based on https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py -# reference: https://arxiv.org/abs/2010.11929 - -from collections.abc import Callable - -import numpy as np -import paddle -import paddle.nn as nn -from paddle.nn.initializer import TruncatedNormal, Constant, Normal -from ppdet.core.workspace import register, serializable - -trunc_normal_ = TruncatedNormal(std=.02) - - -def to_2tuple(x): - if isinstance(x, (list, tuple)): - return x - return tuple([x] * 2) - - -def drop_path(x, drop_prob=0., training=False): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... - See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... - """ - if drop_prob == 0. or not training: - return x - keep_prob = paddle.to_tensor(1.0 - drop_prob).astype(x.dtype) - shape = (paddle.shape(x)[0], ) + (1, ) * (x.ndim - 1) - random_tensor = keep_prob + paddle.rand(shape).astype(x.dtype) - random_tensor = paddle.floor(random_tensor) # binarize - output = x.divide(keep_prob) * random_tensor - return output - - -class DropPath(nn.Layer): - """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
- """ - - def __init__(self, drop_prob=None): - super(DropPath, self).__init__() - self.drop_prob = drop_prob - - def forward(self, x): - return drop_path(x, self.drop_prob, self.training) - - -class Identity(nn.Layer): - def __init__(self): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -class Mlp(nn.Layer): - def __init__(self, - in_features, - hidden_features=None, - out_features=None, - act_layer=nn.GELU, - drop=0.): - super().__init__() - out_features = out_features or in_features - hidden_features = hidden_features or in_features - self.fc1 = nn.Linear(in_features, hidden_features) - self.act = act_layer() - self.fc2 = nn.Linear(hidden_features, out_features) - self.drop = nn.Dropout(drop) - - def forward(self, x): - x = self.fc1(x) - x = self.act(x) - - x = self.fc2(x) - x = self.drop(x) - return x - - -class Attention(nn.Layer): - def __init__(self, - dim, - num_heads=8, - qkv_bias=False, - qk_scale=None, - attn_drop=0., - proj_drop=0.): - super().__init__() - self.num_heads = num_heads - head_dim = dim // num_heads - self.scale = qk_scale or head_dim**-0.5 - - self.qkv = nn.Linear(dim, dim * 3, bias_attr=qkv_bias) - - self.attn_drop = nn.Dropout(attn_drop) - self.proj = nn.Linear(dim, dim) - self.proj_drop = nn.Dropout(proj_drop) - - def forward(self, x): - - N, C = x.shape[1:] - qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C // - self.num_heads)).transpose((2, 0, 3, 1, 4)) - - q, k, v = qkv[0], qkv[1], qkv[2] - - attn = (q.matmul(k.transpose((0, 1, 3, 2)))) * self.scale - attn = nn.functional.softmax(attn, axis=-1) - attn = self.attn_drop(attn) - - x = (attn.matmul(v)).transpose((0, 2, 1, 3)).reshape((-1, N, C)) - x = self.proj(x) - - x = self.proj_drop(x) - return x - - -class Block(nn.Layer): - def __init__(self, - dim, - num_heads, - mlp_ratio=4., - qkv_bias=False, - qk_scale=None, - drop=0., - attn_drop=0., - drop_path=0., - act_layer=nn.GELU, - norm_layer='nn.LayerNorm', - epsilon=1e-5): - super().__init__() - if isinstance(norm_layer, str): - self.norm1 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm1 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - self.attn = Attention( - dim, - num_heads=num_heads, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - attn_drop=attn_drop, - proj_drop=drop) - # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here - self.drop_path = DropPath(drop_path) if drop_path > 0. 
else Identity() - if isinstance(norm_layer, str): - self.norm2 = eval(norm_layer)(dim, epsilon=epsilon) - elif isinstance(norm_layer, Callable): - self.norm2 = norm_layer(dim) - else: - raise TypeError( - "The norm_layer must be str or paddle.nn.layer.Layer class") - mlp_hidden_dim = int(dim * mlp_ratio) - self.mlp = Mlp(in_features=dim, - hidden_features=mlp_hidden_dim, - act_layer=act_layer, - drop=drop) - - def forward(self, x): - x = x + self.drop_path(self.attn(self.norm1(x))) - x = x + self.drop_path(self.mlp(self.norm2(x))) - - return x - - -class PatchEmbed(nn.Layer): - """ Image to Patch Embedding - """ - - def __init__(self, - img_size=224, - patch_size=16, - in_chans=3, - embed_dim=768, - ratio=1): - super().__init__() - img_size = to_2tuple(img_size) - patch_size = to_2tuple(patch_size) - - num_patches = (img_size[1] // patch_size[1]) * ( - img_size[0] // patch_size[0]) * (ratio**2) - self.img_size = img_size - self.patch_size = patch_size - self.num_patches = num_patches - - self.proj = nn.Conv2D( - in_chans, - embed_dim, - kernel_size=patch_size, - stride=(patch_size[0] // ratio), - padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1))) - - def forward(self, x): - B, C, H, W = x.shape - assert H == self.img_size[0] and W == self.img_size[1], \ - f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." - - x = self.proj(x) - return x - - -@register -@serializable -class ViT(nn.Layer): - """ Vision Transformer with support for patch input - - This module is different from ppdet's VisionTransformer (from ppdet/modeling/backbones/visio_transformer.py), - the main differences are: - 1.the module PatchEmbed.proj has padding set,padding=(4 + 2 * (ratio // 2 - 1), 4 + 2 * (ratio // 2 - 1), - VisionTransformer dose not - 2.Attention module qkv is standard.but VisionTransformer provide more options - 3.MLP module only one Dropout,and VisionTransformer twice; - 4.VisionTransformer provide fpn layer,but the module does not. 
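The ratio argument in the removed vitpose PatchEmbed shrinks the projection stride and adjusts padding, so the patch grid densifies by ratio**2. A sketch of that bookkeeping with an assumed 256x192 pose input (the class default is 224):

img_size, patch_size = (256, 192), (16, 16)   # assumed pose-style input
for ratio in (1, 2):
    stride = patch_size[0] // ratio
    padding = 4 + 2 * (ratio // 2 - 1)
    num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) * ratio ** 2
    print(ratio, stride, padding, num_patches)
# ratio=1 -> stride 16, padding 2, 192 patches
# ratio=2 -> stride 8,  padding 4, 768 patches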
- - """ - - def __init__(self, - img_size=224, - patch_size=16, - in_chans=3, - embed_dim=768, - depth=12, - num_heads=12, - mlp_ratio=4, - qkv_bias=False, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0., - norm_layer='nn.LayerNorm', - epsilon=1e-5, - ratio=1, - pretrained=None, - **kwargs): - super().__init__() - - self.pretrained = pretrained - self.num_features = self.embed_dim = embed_dim - - self.patch_embed = PatchEmbed( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embed_dim=embed_dim, - ratio=ratio) - num_patches = self.patch_embed.num_patches - - self.pos_embed = self.create_parameter( - shape=(1, num_patches + 1, embed_dim), - default_initializer=trunc_normal_) - self.add_parameter("pos_embed", self.pos_embed) - - dpr = np.linspace(0, drop_path_rate, depth, dtype='float32') - - self.blocks = nn.LayerList([ - Block( - dim=embed_dim, - num_heads=num_heads, - mlp_ratio=mlp_ratio, - qkv_bias=qkv_bias, - qk_scale=qk_scale, - drop=drop_rate, - attn_drop=attn_drop_rate, - drop_path=dpr[i], - norm_layer=norm_layer, - epsilon=epsilon) for i in range(depth) - ]) - - self.last_norm = eval(norm_layer)(embed_dim, epsilon=epsilon) - trunc_normal_(self.pos_embed) - self._init_weights() - - def _init_weights(self): - pretrained = self.pretrained - - if pretrained: - - if 'http' in pretrained: #URL - path = paddle.utils.download.get_weights_path_from_url( - pretrained) - else: #model in local path - path = pretrained - - load_state_dict = paddle.load(path) - self.set_state_dict(load_state_dict) - print("Load load_state_dict:", path) - - def forward_features(self, x): - - B = paddle.shape(x)[0] - x = self.patch_embed(x) - B, D, Hp, Wp = x.shape - x = x.flatten(2).transpose([0, 2, 1]) - x = x + self.pos_embed[:, 1:] + self.pos_embed[:, :1] - - for blk in self.blocks: - x = blk(x) - - x = self.last_norm(x) - xp = paddle.reshape( - paddle.transpose( - x, perm=[0, 2, 1]), shape=[B, -1, Hp, Wp]) - - return xp diff --git a/pdfdet/models/Paddle/ppdet/modeling/bbox_utils.py b/pdfdet/models/Paddle/ppdet/modeling/bbox_utils.py deleted file mode 100644 index 576cbbf..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/bbox_utils.py +++ /dev/null @@ -1,607 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import numpy as np - - -def bbox2delta(src_boxes, tgt_boxes, weights=[1.0, 1.0, 1.0, 1.0]): - """Encode bboxes to deltas. 
- """ - src_w = src_boxes[:, 2] - src_boxes[:, 0] - src_h = src_boxes[:, 3] - src_boxes[:, 1] - src_ctr_x = src_boxes[:, 0] + 0.5 * src_w - src_ctr_y = src_boxes[:, 1] + 0.5 * src_h - - tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] - tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] - tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w - tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h - - wx, wy, ww, wh = weights - dx = wx * (tgt_ctr_x - src_ctr_x) / src_w - dy = wy * (tgt_ctr_y - src_ctr_y) / src_h - dw = ww * paddle.log(tgt_w / src_w) - dh = wh * paddle.log(tgt_h / src_h) - - deltas = paddle.stack((dx, dy, dw, dh), axis=1) - return deltas - - -def delta2bbox(deltas, boxes, weights=[1.0, 1.0, 1.0, 1.0], max_shape=None): - """Decode deltas to boxes. Used in RCNNBox,CascadeHead,RCNNHead,RetinaHead. - Note: return tensor shape [n,1,4] - If you want to add a reshape, please add after the calling code instead of here. - """ - clip_scale = math.log(1000.0 / 16) - - widths = boxes[:, 2] - boxes[:, 0] - heights = boxes[:, 3] - boxes[:, 1] - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - wx, wy, ww, wh = weights - dx = deltas[:, 0::4] / wx - dy = deltas[:, 1::4] / wy - dw = deltas[:, 2::4] / ww - dh = deltas[:, 3::4] / wh - # Prevent sending too large values into paddle.exp() - dw = paddle.clip(dw, max=clip_scale) - dh = paddle.clip(dh, max=clip_scale) - - pred_ctr_x = dx * widths.unsqueeze(1) + ctr_x.unsqueeze(1) - pred_ctr_y = dy * heights.unsqueeze(1) + ctr_y.unsqueeze(1) - pred_w = paddle.exp(dw) * widths.unsqueeze(1) - pred_h = paddle.exp(dh) * heights.unsqueeze(1) - - pred_boxes = [] - pred_boxes.append(pred_ctr_x - 0.5 * pred_w) - pred_boxes.append(pred_ctr_y - 0.5 * pred_h) - pred_boxes.append(pred_ctr_x + 0.5 * pred_w) - pred_boxes.append(pred_ctr_y + 0.5 * pred_h) - pred_boxes = paddle.stack(pred_boxes, axis=-1) - - if max_shape is not None: - pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( - min=0, max=max_shape[1]) - pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( - min=0, max=max_shape[0]) - return pred_boxes - - -def bbox2delta_v2(src_boxes, - tgt_boxes, - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[1.0, 1.0, 1.0, 1.0]): - """Encode bboxes to deltas. - Modified from bbox2delta() which just use weight parameters to multiply deltas. - """ - src_w = src_boxes[:, 2] - src_boxes[:, 0] - src_h = src_boxes[:, 3] - src_boxes[:, 1] - src_ctr_x = src_boxes[:, 0] + 0.5 * src_w - src_ctr_y = src_boxes[:, 1] + 0.5 * src_h - - tgt_w = tgt_boxes[:, 2] - tgt_boxes[:, 0] - tgt_h = tgt_boxes[:, 3] - tgt_boxes[:, 1] - tgt_ctr_x = tgt_boxes[:, 0] + 0.5 * tgt_w - tgt_ctr_y = tgt_boxes[:, 1] + 0.5 * tgt_h - - dx = (tgt_ctr_x - src_ctr_x) / src_w - dy = (tgt_ctr_y - src_ctr_y) / src_h - dw = paddle.log(tgt_w / src_w) - dh = paddle.log(tgt_h / src_h) - - deltas = paddle.stack((dx, dy, dw, dh), axis=1) - deltas = ( - deltas - paddle.to_tensor(delta_mean)) / paddle.to_tensor(delta_std) - return deltas - - -def delta2bbox_v2(deltas, - boxes, - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[1.0, 1.0, 1.0, 1.0], - max_shape=None, - ctr_clip=32.0): - """Decode deltas to bboxes. - Modified from delta2bbox() which just use weight parameters to be divided by deltas. - Used in YOLOFHead. - Note: return tensor shape [n,1,4] - If you want to add a reshape, please add after the calling code instead of here. 
- """ - clip_scale = math.log(1000.0 / 16) - - widths = boxes[:, 2] - boxes[:, 0] - heights = boxes[:, 3] - boxes[:, 1] - ctr_x = boxes[:, 0] + 0.5 * widths - ctr_y = boxes[:, 1] + 0.5 * heights - - deltas = deltas * paddle.to_tensor(delta_std) + paddle.to_tensor(delta_mean) - dx = deltas[:, 0::4] - dy = deltas[:, 1::4] - dw = deltas[:, 2::4] - dh = deltas[:, 3::4] - - # Prevent sending too large values into paddle.exp() - dx = dx * widths.unsqueeze(1) - dy = dy * heights.unsqueeze(1) - if ctr_clip is not None: - dx = paddle.clip(dx, max=ctr_clip, min=-ctr_clip) - dy = paddle.clip(dy, max=ctr_clip, min=-ctr_clip) - dw = paddle.clip(dw, max=clip_scale) - dh = paddle.clip(dh, max=clip_scale) - else: - dw = dw.clip(min=-clip_scale, max=clip_scale) - dh = dh.clip(min=-clip_scale, max=clip_scale) - - pred_ctr_x = dx + ctr_x.unsqueeze(1) - pred_ctr_y = dy + ctr_y.unsqueeze(1) - pred_w = paddle.exp(dw) * widths.unsqueeze(1) - pred_h = paddle.exp(dh) * heights.unsqueeze(1) - - pred_boxes = [] - pred_boxes.append(pred_ctr_x - 0.5 * pred_w) - pred_boxes.append(pred_ctr_y - 0.5 * pred_h) - pred_boxes.append(pred_ctr_x + 0.5 * pred_w) - pred_boxes.append(pred_ctr_y + 0.5 * pred_h) - pred_boxes = paddle.stack(pred_boxes, axis=-1) - - if max_shape is not None: - pred_boxes[..., 0::2] = pred_boxes[..., 0::2].clip( - min=0, max=max_shape[1]) - pred_boxes[..., 1::2] = pred_boxes[..., 1::2].clip( - min=0, max=max_shape[0]) - return pred_boxes - - -def expand_bbox(bboxes, scale): - w_half = (bboxes[:, 2] - bboxes[:, 0]) * .5 - h_half = (bboxes[:, 3] - bboxes[:, 1]) * .5 - x_c = (bboxes[:, 2] + bboxes[:, 0]) * .5 - y_c = (bboxes[:, 3] + bboxes[:, 1]) * .5 - - w_half *= scale - h_half *= scale - - bboxes_exp = np.zeros(bboxes.shape, dtype=np.float32) - bboxes_exp[:, 0] = x_c - w_half - bboxes_exp[:, 2] = x_c + w_half - bboxes_exp[:, 1] = y_c - h_half - bboxes_exp[:, 3] = y_c + h_half - - return bboxes_exp - - -def clip_bbox(boxes, im_shape): - h, w = im_shape[0], im_shape[1] - x1 = boxes[:, 0].clip(0, w) - y1 = boxes[:, 1].clip(0, h) - x2 = boxes[:, 2].clip(0, w) - y2 = boxes[:, 3].clip(0, h) - return paddle.stack([x1, y1, x2, y2], axis=1) - - -def nonempty_bbox(boxes, min_size=0, return_mask=False): - w = boxes[:, 2] - boxes[:, 0] - h = boxes[:, 3] - boxes[:, 1] - mask = paddle.logical_and(h > min_size, w > min_size) - if return_mask: - return mask - keep = paddle.nonzero(mask).flatten() - return keep - - -def bbox_area(boxes): - return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) - - -def bbox_overlaps(boxes1, boxes2): - """ - Calculate overlaps between boxes1 and boxes2 - - Args: - boxes1 (Tensor): boxes with shape [M, 4] - boxes2 (Tensor): boxes with shape [N, 4] - - Return: - overlaps (Tensor): overlaps between boxes1 and boxes2 with shape [M, N] - """ - M = boxes1.shape[0] - N = boxes2.shape[0] - if M * N == 0: - return paddle.zeros([M, N], dtype='float32') - area1 = bbox_area(boxes1) - area2 = bbox_area(boxes2) - - xy_max = paddle.minimum( - paddle.unsqueeze(boxes1, 1)[:, :, 2:], boxes2[:, 2:]) - xy_min = paddle.maximum( - paddle.unsqueeze(boxes1, 1)[:, :, :2], boxes2[:, :2]) - width_height = xy_max - xy_min - width_height = width_height.clip(min=0) - inter = width_height.prod(axis=2) - - overlaps = paddle.where(inter > 0, inter / - (paddle.unsqueeze(area1, 1) + area2 - inter), - paddle.zeros_like(inter)) - return overlaps - - -def batch_bbox_overlaps(bboxes1, - bboxes2, - mode='iou', - is_aligned=False, - eps=1e-6): - """Calculate overlap between two set of bboxes. 
-    If ``is_aligned`` is ``False``, then calculate the overlaps between each
-    bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned
-    pair of bboxes1 and bboxes2.
-    Args:
-        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
-        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
-            B indicates the batch dim, in shape (B1, B2, ..., Bn).
-            If ``is_aligned`` is ``True``, then m and n must be equal.
-        mode (str): "iou" (intersection over union) or "iof" (intersection over
-            foreground).
-        is_aligned (bool, optional): If True, then m and n must be equal.
-            Default False.
-        eps (float, optional): A value added to the denominator for numerical
-            stability. Default 1e-6.
-    Returns:
-        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)
-    """
-    assert mode in ['iou', 'iof', 'giou'], 'Unsupported mode {}'.format(mode)
-    # Either the boxes are empty or the length of the boxes' last dimension is 4
-    assert (bboxes1.shape[-1] == 4 or bboxes1.shape[0] == 0)
-    assert (bboxes2.shape[-1] == 4 or bboxes2.shape[0] == 0)
-
-    # Batch dim must be the same
-    # Batch dim: (B1, B2, ... Bn)
-    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
-    batch_shape = bboxes1.shape[:-2]
-
-    rows = bboxes1.shape[-2] if bboxes1.shape[0] > 0 else 0
-    cols = bboxes2.shape[-2] if bboxes2.shape[0] > 0 else 0
-    if is_aligned:
-        assert rows == cols
-
-    if rows * cols == 0:
-        if is_aligned:
-            return paddle.full(batch_shape + (rows, ), 1)
-        else:
-            return paddle.full(batch_shape + (rows, cols), 1)
-
-    area1 = (bboxes1[:, 2] - bboxes1[:, 0]) * (bboxes1[:, 3] - bboxes1[:, 1])
-    area2 = (bboxes2[:, 2] - bboxes2[:, 0]) * (bboxes2[:, 3] - bboxes2[:, 1])
-
-    if is_aligned:
-        lt = paddle.maximum(bboxes1[:, :2], bboxes2[:, :2])  # [B, rows, 2]
-        rb = paddle.minimum(bboxes1[:, 2:], bboxes2[:, 2:])  # [B, rows, 2]
-
-        wh = (rb - lt).clip(min=0)  # [B, rows, 2]
-        overlap = wh[:, 0] * wh[:, 1]
-
-        if mode in ['iou', 'giou']:
-            union = area1 + area2 - overlap
-        else:
-            union = area1
-        if mode == 'giou':
-            enclosed_lt = paddle.minimum(bboxes1[:, :2], bboxes2[:, :2])
-            enclosed_rb = paddle.maximum(bboxes1[:, 2:], bboxes2[:, 2:])
-    else:
-        lt = paddle.maximum(bboxes1[:, :2].reshape([rows, 1, 2]),
-                            bboxes2[:, :2])  # [B, rows, cols, 2]
-        rb = paddle.minimum(bboxes1[:, 2:].reshape([rows, 1, 2]),
-                            bboxes2[:, 2:])  # [B, rows, cols, 2]
-
-        wh = (rb - lt).clip(min=0)  # [B, rows, cols, 2]
-        overlap = wh[:, :, 0] * wh[:, :, 1]
-
-        if mode in ['iou', 'giou']:
-            union = area1.reshape([rows,1]) \
-                + area2.reshape([1,cols]) - overlap
-        else:
-            union = area1[:, None]
-        if mode == 'giou':
-            enclosed_lt = paddle.minimum(bboxes1[:, :2].reshape([rows, 1, 2]),
-                                         bboxes2[:, :2])
-            enclosed_rb = paddle.maximum(bboxes1[:, 2:].reshape([rows, 1, 2]),
-                                         bboxes2[:, 2:])
-
-    eps = paddle.to_tensor([eps])
-    union = paddle.maximum(union, eps)
-    ious = overlap / union
-    if mode in ['iou', 'iof']:
-        return ious
-    # calculate gious
-    enclose_wh = (enclosed_rb - enclosed_lt).clip(min=0)
-    enclose_area = enclose_wh[:, :, 0] * enclose_wh[:, :, 1]
-    enclose_area = paddle.maximum(enclose_area, eps)
-    gious = ious - (enclose_area - union) / enclose_area
-    return 1 - gious
-
-
-def xywh2xyxy(box):
-    x, y, w, h = box
-    x1 = x - w * 0.5
-    y1 = y - h * 0.5
-    x2 = x + w * 0.5
-    y2 = y + h * 0.5
-    return [x1, y1, x2, y2]
-
-
-def make_grid(h, w, dtype):
-    yv, xv = paddle.meshgrid([paddle.arange(h), paddle.arange(w)])
-    return paddle.stack((xv, yv), 2).cast(dtype=dtype)
-
-
-def decode_yolo(box, anchor, downsample_ratio):
-    """decode yolo box
-
-    Args:
-        box (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        anchor (list): anchor with the shape [na, 2]
-        downsample_ratio (int): downsample ratio, default 32
-
-    Return:
-        box (list): decoded box, [x, y, w, h], all have the shape [b, na, h, w, 1]
-    """
-    x, y, w, h = box
-    na, grid_h, grid_w = x.shape[1:4]
-    grid = make_grid(grid_h, grid_w, x.dtype).reshape((1, 1, grid_h, grid_w, 2))
-    x1 = (x + grid[:, :, :, :, 0:1]) / grid_w
-    y1 = (y + grid[:, :, :, :, 1:2]) / grid_h
-
-    anchor = paddle.to_tensor(anchor, dtype=x.dtype)
-    anchor = anchor.reshape((1, na, 1, 1, 2))
-    w1 = paddle.exp(w) * anchor[:, :, :, :, 0:1] / (downsample_ratio * grid_w)
-    h1 = paddle.exp(h) * anchor[:, :, :, :, 1:2] / (downsample_ratio * grid_h)
-
-    return [x1, y1, w1, h1]
-
-
-def batch_iou_similarity(box1, box2, eps=1e-9):
-    """Calculate iou of box1 and box2 in batch
-
-    Args:
-        box1 (Tensor): box with the shape [N, M1, 4]
-        box2 (Tensor): box with the shape [N, M2, 4]
-
-    Return:
-        iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2]
-    """
-    box1 = box1.unsqueeze(2)  # [N, M1, 4] -> [N, M1, 1, 4]
-    box2 = box2.unsqueeze(1)  # [N, M2, 4] -> [N, 1, M2, 4]
-    px1y1, px2y2 = box1[:, :, :, 0:2], box1[:, :, :, 2:4]
-    gx1y1, gx2y2 = box2[:, :, :, 0:2], box2[:, :, :, 2:4]
-    x1y1 = paddle.maximum(px1y1, gx1y1)
-    x2y2 = paddle.minimum(px2y2, gx2y2)
-    overlap = (x2y2 - x1y1).clip(0).prod(-1)
-    area1 = (px2y2 - px1y1).clip(0).prod(-1)
-    area2 = (gx2y2 - gx1y1).clip(0).prod(-1)
-    union = area1 + area2 - overlap + eps
-    return overlap / union
-
-
-def bbox_iou(box1, box2, giou=False, diou=False, ciou=False, eps=1e-9):
-    """calculate the iou of box1 and box2
-
-    Args:
-        box1 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        box2 (list): [x, y, w, h], all have the shape [b, na, h, w, 1]
-        giou (bool): whether to use giou or not, default False
-        diou (bool): whether to use diou or not, default False
-        ciou (bool): whether to use ciou or not, default False
-        eps (float): epsilon to avoid divide by zero
-
-    Return:
-        iou (Tensor): iou of box1 and box2, with the shape [b, na, h, w, 1]
-    """
-    px1, py1, px2, py2 = box1
-    gx1, gy1, gx2, gy2 = box2
-    x1 = paddle.maximum(px1, gx1)
-    y1 = paddle.maximum(py1, gy1)
-    x2 = paddle.minimum(px2, gx2)
-    y2 = paddle.minimum(py2, gy2)
-
-    overlap = ((x2 - x1).clip(0)) * ((y2 - y1).clip(0))
-
-    area1 = (px2 - px1) * (py2 - py1)
-    area1 = area1.clip(0)
-
-    area2 = (gx2 - gx1) * (gy2 - gy1)
-    area2 = area2.clip(0)
-
-    union = area1 + area2 - overlap + eps
-    iou = overlap / union
-
-    if giou or ciou or diou:
-        # convex w, h
-        cw = paddle.maximum(px2, gx2) - paddle.minimum(px1, gx1)
-        ch = paddle.maximum(py2, gy2) - paddle.minimum(py1, gy1)
-        if giou:
-            c_area = cw * ch + eps
-            return iou - (c_area - union) / c_area
-        else:
-            # convex diagonal squared
-            c2 = cw**2 + ch**2 + eps
-            # center distance
-            rho2 = ((px1 + px2 - gx1 - gx2)**2 + (py1 + py2 - gy1 - gy2)**2) / 4
-            if diou:
-                return iou - rho2 / c2
-            else:
-                w1, h1 = px2 - px1, py2 - py1 + eps
-                w2, h2 = gx2 - gx1, gy2 - gy1 + eps
-                delta = paddle.atan(w1 / h1) - paddle.atan(w2 / h2)
-                v = (4 / math.pi**2) * paddle.pow(delta, 2)
-                alpha = v / (1 + eps - iou + v)
-                alpha.stop_gradient = True
-                return iou - (rho2 / c2 + v * alpha)
-    else:
-        return iou
-
-
-def bbox_iou_np_expand(box1, box2, x1y1x2y2=True, eps=1e-16):
-    """
-    Calculate the iou of box1 and box2 with numpy.
-
-    Args:
-        box1 (ndarray): [N, 4]
-        box2 (ndarray): [M, 4], usually N != M
-        x1y1x2y2 (bool): whether in x1y1x2y2 style, default True
-        eps (float): epsilon to avoid divide by zero
-    Return:
-        iou (ndarray): iou of box1 and box2, [N, M]
-    """
-    N, M = len(box1), len(box2)  # usually N != M
-    if x1y1x2y2:
-        b1_x1, b1_y1 = box1[:, 0], box1[:, 1]
-        b1_x2, b1_y2 = box1[:, 2], box1[:, 3]
-        b2_x1, b2_y1 = box2[:, 0], box2[:, 1]
-        b2_x2, b2_y2 = box2[:, 2], box2[:, 3]
-    else:
-        # cxcywh style
-        # Transform from center and width to exact coordinates
-        b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
-        b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
-        b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
-        b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
-
-    # get the coordinates of the intersection rectangle
-    inter_rect_x1 = np.zeros((N, M), dtype=np.float32)
-    inter_rect_y1 = np.zeros((N, M), dtype=np.float32)
-    inter_rect_x2 = np.zeros((N, M), dtype=np.float32)
-    inter_rect_y2 = np.zeros((N, M), dtype=np.float32)
-    for i in range(len(box2)):
-        inter_rect_x1[:, i] = np.maximum(b1_x1, b2_x1[i])
-        inter_rect_y1[:, i] = np.maximum(b1_y1, b2_y1[i])
-        inter_rect_x2[:, i] = np.minimum(b1_x2, b2_x2[i])
-        inter_rect_y2[:, i] = np.minimum(b1_y2, b2_y2[i])
-    # Intersection area
-    inter_area = np.maximum(inter_rect_x2 - inter_rect_x1, 0) * np.maximum(
-        inter_rect_y2 - inter_rect_y1, 0)
-    # Union Area
-    b1_area = np.repeat(
-        ((b1_x2 - b1_x1) * (b1_y2 - b1_y1)).reshape(-1, 1), M, axis=-1)
-    b2_area = np.repeat(
-        ((b2_x2 - b2_x1) * (b2_y2 - b2_y1)).reshape(1, -1), N, axis=0)
-
-    ious = inter_area / (b1_area + b2_area - inter_area + eps)
-    return ious
-
-
-def bbox2distance(points, bbox, max_dis=None, eps=0.1):
-    """Encode bounding boxes as distances from the given points to the
-    four box boundaries.
-    Args:
-        points (Tensor): Shape (n, 2), [x, y].
-        bbox (Tensor): Shape (n, 4), "xyxy" format
-        max_dis (float): Upper bound of the distance.
-        eps (float): a small value to ensure target < max_dis instead of <=
-    Returns:
-        Tensor: Encoded distances.
-    """
-    left = points[:, 0] - bbox[:, 0]
-    top = points[:, 1] - bbox[:, 1]
-    right = bbox[:, 2] - points[:, 0]
-    bottom = bbox[:, 3] - points[:, 1]
-    if max_dis is not None:
-        left = left.clip(min=0, max=max_dis - eps)
-        top = top.clip(min=0, max=max_dis - eps)
-        right = right.clip(min=0, max=max_dis - eps)
-        bottom = bottom.clip(min=0, max=max_dis - eps)
-    return paddle.stack([left, top, right, bottom], -1)
-
-
-def distance2bbox(points, distance, max_shape=None):
-    """Decode distance prediction to bounding box.
-    Args:
-        points (Tensor): Shape (n, 2), [x, y].
-        distance (Tensor): Distance from the given point to 4
-            boundaries (left, top, right, bottom).
-        max_shape (tuple): Shape of the image.
-    Returns:
-        Tensor: Decoded bboxes.
-    """
-    x1 = points[:, 0] - distance[:, 0]
-    y1 = points[:, 1] - distance[:, 1]
-    x2 = points[:, 0] + distance[:, 2]
-    y2 = points[:, 1] + distance[:, 3]
-    if max_shape is not None:
-        x1 = x1.clip(min=0, max=max_shape[1])
-        y1 = y1.clip(min=0, max=max_shape[0])
-        x2 = x2.clip(min=0, max=max_shape[1])
-        y2 = y2.clip(min=0, max=max_shape[0])
-    return paddle.stack([x1, y1, x2, y2], -1)
-
-
-def bbox_center(boxes):
-    """Get bbox centers from boxes.
-    Args:
-        boxes (Tensor): boxes with shape (..., 4), "xmin, ymin, xmax, ymax" format.
-    Returns:
-        Tensor: boxes centers with shape (..., 2), "cx, cy" format.
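# --- editor's sketch (not part of the original diff) -----------------------
# How the bbox2distance()/distance2bbox() pair above fits together (the
# point-to-boundary encoding used by GFL/FCOS-style heads). Assumes a
# checkout from before this change, since the diff deletes
# ppdet/modeling/bbox_utils.py; values are made up.
#
#   import paddle
#   from ppdet.modeling.bbox_utils import bbox2distance, distance2bbox
#
#   points = paddle.to_tensor([[6., 6.]])           # anchor point, xy
#   boxes = paddle.to_tensor([[2., 4., 10., 12.]])  # target box, xyxy
#   ltrb = bbox2distance(points, boxes)             # [[4., 2., 4., 6.]]
#   print(distance2bbox(points, ltrb))              # back to [[2., 4., 10., 12.]]
# ----------------------------------------------------------------------------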
- """ - boxes_cx = (boxes[..., 0] + boxes[..., 2]) / 2 - boxes_cy = (boxes[..., 1] + boxes[..., 3]) / 2 - return paddle.stack([boxes_cx, boxes_cy], axis=-1) - - -def batch_distance2bbox(points, distance, max_shapes=None): - """Decode distance prediction to bounding box for batch. - Args: - points (Tensor): [B, ..., 2], "xy" format - distance (Tensor): [B, ..., 4], "ltrb" format - max_shapes (Tensor): [B, 2], "h,w" format, Shape of the image. - Returns: - Tensor: Decoded bboxes, "x1y1x2y2" format. - """ - lt, rb = paddle.split(distance, 2, -1) - # while tensor add parameters, parameters should be better placed on the second place - x1y1 = -lt + points - x2y2 = rb + points - out_bbox = paddle.concat([x1y1, x2y2], -1) - if max_shapes is not None: - max_shapes = max_shapes.flip(-1).tile([1, 2]) - delta_dim = out_bbox.ndim - max_shapes.ndim - for _ in range(delta_dim): - max_shapes.unsqueeze_(1) - out_bbox = paddle.where(out_bbox < max_shapes, out_bbox, max_shapes) - out_bbox = paddle.where(out_bbox > 0, out_bbox, - paddle.zeros_like(out_bbox)) - return out_bbox - - -def iou_similarity(box1, box2, eps=1e-10): - """Calculate iou of box1 and box2 - - Args: - box1 (Tensor): box with the shape [M1, 4] - box2 (Tensor): box with the shape [M2, 4] - - Return: - iou (Tensor): iou between box1 and box2 with the shape [M1, M2] - """ - box1 = box1.unsqueeze(1) # [M1, 4] -> [M1, 1, 4] - box2 = box2.unsqueeze(0) # [M2, 4] -> [1, M2, 4] - px1y1, px2y2 = box1[:, :, 0:2], box1[:, :, 2:4] - gx1y1, gx2y2 = box2[:, :, 0:2], box2[:, :, 2:4] - x1y1 = paddle.maximum(px1y1, gx1y1) - x2y2 = paddle.minimum(px2y2, gx2y2) - overlap = (x2y2 - x1y1).clip(0).prod(-1) - area1 = (px2y2 - px1y1).clip(0).prod(-1) - area2 = (gx2y2 - gx1y1).clip(0).prod(-1) - union = area1 + area2 - overlap + eps - return overlap / union diff --git a/pdfdet/models/Paddle/ppdet/modeling/clrnet_utils.py b/pdfdet/models/Paddle/ppdet/modeling/clrnet_utils.py deleted file mode 100644 index 24ece5c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/clrnet_utils.py +++ /dev/null @@ -1,309 +0,0 @@ -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.modeling.initializer import constant_ -from paddle.nn.initializer import KaimingNormal - - -class ConvModule(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size=1, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=False, - norm_type='bn', - wtih_act=True): - super(ConvModule, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn', None] - self.with_norm = norm_type is not None - self.wtih_act = wtih_act - self.conv = nn.Conv2D( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias_attr=bias, - weight_attr=KaimingNormal()) - if self.with_norm: - if norm_type == 'bn': - self.bn = nn.BatchNorm2D(out_channels) - elif norm_type == 'gn': - self.bn = nn.GroupNorm(out_channels, out_channels) - - if self.wtih_act: - self.act = nn.ReLU() - - def forward(self, inputs): - x = self.conv(inputs) - if self.with_norm: - x = self.bn(x) - if self.wtih_act: - x = self.act(x) - return x - - -def LinearModule(hidden_dim): - return nn.LayerList( - [nn.Linear( - hidden_dim, hidden_dim, bias_attr=True), nn.ReLU()]) - - -class FeatureResize(nn.Layer): - def __init__(self, size=(10, 25)): - super(FeatureResize, self).__init__() - self.size = size - - def forward(self, x): - x = F.interpolate(x, self.size) - return x.flatten(2) - - -class 
ROIGather(nn.Layer):
    '''
-    ROIGather module for gathering global information
-    Args:
-        in_channels: prior feature channels
-        num_priors: prior numbers we predefined
-        sample_points: the number of sampled points when we extract feature from line
-        fc_hidden_dim: the fc output channel
-        refine_layers: the total number of layers to build refine
-    '''
-
-    def __init__(self,
-                 in_channels,
-                 num_priors,
-                 sample_points,
-                 fc_hidden_dim,
-                 refine_layers,
-                 mid_channels=48):
-        super(ROIGather, self).__init__()
-        self.in_channels = in_channels
-        self.num_priors = num_priors
-        self.f_key = ConvModule(
-            in_channels=self.in_channels,
-            out_channels=self.in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            norm_type='bn')
-
-        self.f_query = nn.Sequential(
-            nn.Conv1D(
-                in_channels=num_priors,
-                out_channels=num_priors,
-                kernel_size=1,
-                stride=1,
-                padding=0,
-                groups=num_priors),
-            nn.ReLU(), )
-        self.f_value = nn.Conv2D(
-            in_channels=self.in_channels,
-            out_channels=self.in_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0)
-        self.W = nn.Conv1D(
-            in_channels=num_priors,
-            out_channels=num_priors,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            groups=num_priors)
-
-        self.resize = FeatureResize()
-        constant_(self.W.weight, 0)
-        constant_(self.W.bias, 0)
-
-        self.convs = nn.LayerList()
-        self.catconv = nn.LayerList()
-        for i in range(refine_layers):
-            self.convs.append(
-                ConvModule(
-                    in_channels,
-                    mid_channels, (9, 1),
-                    padding=(4, 0),
-                    bias=False,
-                    norm_type='bn'))
-
-            self.catconv.append(
-                ConvModule(
-                    mid_channels * (i + 1),
-                    in_channels, (9, 1),
-                    padding=(4, 0),
-                    bias=False,
-                    norm_type='bn'))
-
-        self.fc = nn.Linear(
-            sample_points * fc_hidden_dim, fc_hidden_dim, bias_attr=True)
-
-        self.fc_norm = nn.LayerNorm(fc_hidden_dim)
-
-    def roi_fea(self, x, layer_index):
-        feats = []
-        for i, feature in enumerate(x):
-            feat_trans = self.convs[i](feature)
-            feats.append(feat_trans)
-        cat_feat = paddle.concat(feats, axis=1)
-        cat_feat = self.catconv[layer_index](cat_feat)
-        return cat_feat
-
-    def forward(self, roi_features, x, layer_index):
-        '''
-        Args:
-            roi_features: prior feature, shape: (Batch * num_priors, prior_feat_channel, sample_point, 1)
-            x: feature map
-            layer_index: currently on which layer to refine
-        Return:
-            roi: prior features with gathered global information, shape: (Batch, num_priors, fc_hidden_dim)
-        '''
-
-        roi = self.roi_fea(roi_features, layer_index)
-        bs = x.shape[0]
-        roi = roi.reshape([bs * self.num_priors, -1])
-        roi = self.fc(roi)
-        roi = F.relu(self.fc_norm(roi))
-        roi = roi.reshape([bs, self.num_priors, -1])
-        query = roi
-
-        value = self.resize(self.f_value(x))  # (B, C, N) global feature
-        query = self.f_query(
-            query)  # (B, N, 1) sample context feature from prior roi
-        key = self.f_key(x)
-        value = value.transpose(perm=[0, 2, 1])
-        key = self.resize(key)  # (B, C, N) global feature
-        sim_map = paddle.matmul(query, key)
-        sim_map = (self.in_channels**-.5) * sim_map
-        sim_map = F.softmax(sim_map, axis=-1)
-
-        context = paddle.matmul(sim_map, value)
-        context = self.W(context)
-
-        roi = roi + F.dropout(context, p=0.1, training=self.training)
-
-        return roi
-
-
-class SegDecoder(nn.Layer):
-    '''
-    Optional segmentation decoder
-    '''
-
-    def __init__(self,
-                 image_height,
-                 image_width,
-                 num_class,
-                 prior_feat_channels=64,
-                 refine_layers=3):
-        super().__init__()
-        self.dropout = nn.Dropout2D(0.1)
-        self.conv = nn.Conv2D(prior_feat_channels * refine_layers, num_class, 1)
-        self.image_height = image_height
-        self.image_width = image_width
-
-    def forward(self, x):
-        x = self.dropout(x)
-        x = self.conv(x)
-        x = F.interpolate(
-            x,
-            size=[self.image_height, self.image_width],
-            mode='bilinear',
-            align_corners=False)
-        return x
-
-
-def accuracy(pred, target, topk=1, thresh=None):
-    """Calculate accuracy according to the prediction and target.
-
-    Args:
-        pred (paddle.Tensor): The model prediction, shape (N, num_class)
-        target (paddle.Tensor): The target of each prediction, shape (N, )
-        topk (int | tuple[int], optional): If the predictions in ``topk``
-            match the target, the predictions will be regarded as
-            correct ones. Defaults to 1.
-        thresh (float, optional): If not None, predictions with scores under
-            this threshold are considered incorrect. Defaults to None.
-
-    Returns:
-        float | tuple[float]: If the input ``topk`` is a single integer,
-            the function will return a single float as accuracy. If
-            ``topk`` is a tuple containing multiple integers, the
-            function will return a tuple containing accuracies of
-            each ``topk`` number.
-    """
-    assert isinstance(topk, (int, tuple))
-    if isinstance(topk, int):
-        topk = (topk, )
-        return_single = True
-    else:
-        return_single = False
-
-    maxk = max(topk)
-    if pred.shape[0] == 0:
-        accu = [pred.new_tensor(0.) for i in range(len(topk))]
-        return accu[0] if return_single else accu
-    assert pred.ndim == 2 and target.ndim == 1
-    assert pred.shape[0] == target.shape[0]
-    assert maxk <= pred.shape[1], \
-        f'maxk {maxk} exceeds pred dimension {pred.shape[1]}'
-    pred_value, pred_label = pred.topk(maxk, axis=1)
-    pred_label = pred_label.t()  # transpose to shape (maxk, N)
-    correct = pred_label.equal(target.reshape([1, -1]).expand_as(pred_label))
-    if thresh is not None:
-        # Only prediction values larger than thresh are counted as correct
-        correct = correct & (pred_value > thresh).t()
-    res = []
-    for k in topk:
-        correct_k = correct[:k].reshape([-1]).cast("float32").sum(0,
-                                                                  keepdim=True)
-        correct_k = correct_k * (100.0 / pred.shape[0])
-        res.append(correct_k)
-    return res[0] if return_single else res
-
-
-class Accuracy(nn.Layer):
-    def __init__(self, topk=(1, ), thresh=None):
-        """Module to calculate the accuracy.
-
-        Args:
-            topk (tuple, optional): The criterion used to calculate the
-                accuracy. Defaults to (1,).
-            thresh (float, optional): If not None, predictions with scores
-                under this threshold are considered incorrect. Defaults to None.
-        """
-        super().__init__()
-        self.topk = topk
-        self.thresh = thresh
-
-    def forward(self, pred, target):
-        """Forward function to calculate accuracy.
-
-        Args:
-            pred (paddle.Tensor): Prediction of models.
-            target (paddle.Tensor): Target for each prediction.
-
-        Returns:
-            tuple[float]: The accuracies under different topk criterions.
-        """
-        return accuracy(pred, target, self.topk, self.thresh)
diff --git a/pdfdet/models/Paddle/ppdet/modeling/cls_utils.py b/pdfdet/models/Paddle/ppdet/modeling/cls_utils.py
deleted file mode 100644
index 3ae8d11..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/cls_utils.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
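# --- editor's sketch (not part of the original diff) -----------------------
# Behaviour of the accuracy() helper deleted just above (clrnet_utils.py),
# on a checkout from before this change; values are made up.
#
#   import paddle
#   from ppdet.modeling.clrnet_utils import accuracy
#
#   pred = paddle.to_tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
#   target = paddle.to_tensor([1, 0, 0])
#   # top-1: two of the three predictions match -> ~66.67 (a percentage)
#   print(accuracy(pred, target, topk=1))
# ----------------------------------------------------------------------------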
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -def _get_class_default_kwargs(cls, *args, **kwargs): - """ - Get default arguments of a class in dict format, if args and - kwargs is specified, it will replace default arguments - """ - varnames = cls.__init__.__code__.co_varnames - argcount = cls.__init__.__code__.co_argcount - keys = varnames[:argcount] - assert keys[0] == 'self' - keys = keys[1:] - - values = list(cls.__init__.__defaults__) - assert len(values) == len(keys) - - if len(args) > 0: - for i, arg in enumerate(args): - values[i] = arg - - default_kwargs = dict(zip(keys, values)) - - if len(kwargs) > 0: - for k, v in kwargs.items(): - default_kwargs[k] = v - - return default_kwargs diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/heads/__init__.py deleted file mode 100644 index 0d126c0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/__init__.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import bbox_head -from . import mask_head -from . import yolo_head -from . import roi_extractor -from . import ssd_head -from . import fcos_head -from . import solov2_head -from . import ttf_head -from . import cascade_head -from . import face_head -from . import s2anet_head -from . import keypoint_hrhrnet_head -from . import centernet_head -from . import gfl_head -from . import simota_head -from . import pico_head -from . import detr_head -from . import sparsercnn_head -from . import tood_head -from . import retina_head -from . import ppyoloe_head -from . import fcosr_head -from . import ppyoloe_r_head -from . import yolof_head -from . import ppyoloe_contrast_head -from . import centertrack_head -from . import sparse_roi_head -from . import vitpose_head -from . 
import clrnet_head - -from .bbox_head import * -from .mask_head import * -from .yolo_head import * -from .roi_extractor import * -from .ssd_head import * -from .fcos_head import * -from .solov2_head import * -from .ttf_head import * -from .cascade_head import * -from .face_head import * -from .s2anet_head import * -from .keypoint_hrhrnet_head import * -from .centernet_head import * -from .gfl_head import * -from .simota_head import * -from .pico_head import * -from .detr_head import * -from .sparsercnn_head import * -from .tood_head import * -from .retina_head import * -from .ppyoloe_head import * -from .fcosr_head import * -from .ppyoloe_r_head import * -from .yolof_head import * -from .ppyoloe_contrast_head import * -from .centertrack_head import * -from .sparse_roi_head import * -from .petr_head import * -from .vitpose_head import * -from .clrnet_head import * \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/bbox_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/bbox_head.py deleted file mode 100644 index 3ce4798..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/bbox_head.py +++ /dev/null @@ -1,443 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
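# --- editor's sketch (not part of the original diff) -----------------------
# What the _get_class_default_kwargs() helper deleted above (cls_utils.py)
# returns: the __init__ defaults of a class as a dict, with positional
# overrides applied. This is how BBoxHead below seeds its RoIAlign config
# (roi_extractor=_get_class_default_kwargs(RoIAlign)). The toy class here
# is hypothetical; a pre-change checkout is assumed.
#
#   from ppdet.modeling.cls_utils import _get_class_default_kwargs
#
#   class RoI:
#       def __init__(self, resolution=14, sampling_ratio=0, aligned=True):
#           pass
#
#   print(_get_class_default_kwargs(RoI))
#   # {'resolution': 14, 'sampling_ratio': 0, 'aligned': True}
#   print(_get_class_default_kwargs(RoI, 7))
#   # {'resolution': 7, 'sampling_ratio': 0, 'aligned': True}
# ----------------------------------------------------------------------------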
- -import numpy as np - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, XavierUniform, KaimingNormal -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, create -from .roi_extractor import RoIAlign -from ..shape_spec import ShapeSpec -from ..bbox_utils import bbox2delta -from ..cls_utils import _get_class_default_kwargs -from ppdet.modeling.layers import ConvNormLayer - -__all__ = ['TwoFCHead', 'XConvNormHead', 'BBoxHead'] - - -@register -class TwoFCHead(nn.Layer): - """ - RCNN bbox head with Two fc layers to extract feature - - Args: - in_channel (int): Input channel which can be derived by from_config - out_channel (int): Output channel - resolution (int): Resolution of input feature map, default 7 - """ - - def __init__(self, in_channel=256, out_channel=1024, resolution=7): - super(TwoFCHead, self).__init__() - self.in_channel = in_channel - self.out_channel = out_channel - fan = in_channel * resolution * resolution - self.fc6 = nn.Linear( - in_channel * resolution * resolution, - out_channel, - weight_attr=paddle.ParamAttr( - initializer=XavierUniform(fan_out=fan))) - self.fc6.skip_quant = True - - self.fc7 = nn.Linear( - out_channel, - out_channel, - weight_attr=paddle.ParamAttr(initializer=XavierUniform())) - self.fc7.skip_quant = True - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat): - rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) - fc6 = self.fc6(rois_feat) - fc6 = F.relu(fc6) - fc7 = self.fc7(fc6) - fc7 = F.relu(fc7) - return fc7 - - -@register -class XConvNormHead(nn.Layer): - __shared__ = ['norm_type', 'freeze_norm'] - """ - RCNN bbox head with serveral convolution layers - - Args: - in_channel (int): Input channels which can be derived by from_config - num_convs (int): The number of conv layers - conv_dim (int): The number of channels for the conv layers - out_channel (int): Output channels - resolution (int): Resolution of input feature map - norm_type (string): Norm type, bn, gn, sync_bn are available, - default `gn` - freeze_norm (bool): Whether to freeze the norm - stage_name (string): Prefix name for conv layer, '' by default - """ - - def __init__(self, - in_channel=256, - num_convs=4, - conv_dim=256, - out_channel=1024, - resolution=7, - norm_type='gn', - freeze_norm=False, - stage_name=''): - super(XConvNormHead, self).__init__() - self.in_channel = in_channel - self.num_convs = num_convs - self.conv_dim = conv_dim - self.out_channel = out_channel - self.norm_type = norm_type - self.freeze_norm = freeze_norm - - self.bbox_head_convs = [] - fan = conv_dim * 3 * 3 - initializer = KaimingNormal(fan_in=fan) - for i in range(self.num_convs): - in_c = in_channel if i == 0 else conv_dim - head_conv_name = stage_name + 'bbox_head_conv{}'.format(i) - head_conv = self.add_sublayer( - head_conv_name, - ConvNormLayer( - ch_in=in_c, - ch_out=conv_dim, - filter_size=3, - stride=1, - norm_type=self.norm_type, - freeze_norm=self.freeze_norm, - initializer=initializer)) - self.bbox_head_convs.append(head_conv) - - fan = conv_dim * resolution * resolution - self.fc6 = nn.Linear( - conv_dim * resolution * resolution, - out_channel, - weight_attr=paddle.ParamAttr( - initializer=XavierUniform(fan_out=fan)), - bias_attr=paddle.ParamAttr( - 
learning_rate=2., regularizer=L2Decay(0.))) - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat): - for i in range(self.num_convs): - rois_feat = F.relu(self.bbox_head_convs[i](rois_feat)) - rois_feat = paddle.flatten(rois_feat, start_axis=1, stop_axis=-1) - fc6 = F.relu(self.fc6(rois_feat)) - return fc6 - - -@register -class BBoxHead(nn.Layer): - __shared__ = ['num_classes', 'use_cot'] - __inject__ = ['bbox_assigner', 'bbox_loss', 'loss_cot'] - """ - RCNN bbox head - - Args: - head (nn.Layer): Extract feature in bbox head - in_channel (int): Input channel after RoI extractor - roi_extractor (object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the - box. - with_pool (bool): Whether to use pooling for the RoI feature. - num_classes (int): The number of classes - bbox_weight (List[float]): The weight to get the decode box - cot_classes (int): The number of base classes - loss_cot (object): The module of Label-cotuning - use_cot(bool): whether to use Label-cotuning - """ - - def __init__(self, - head, - in_channel, - roi_extractor=_get_class_default_kwargs(RoIAlign), - bbox_assigner='BboxAssigner', - with_pool=False, - num_classes=80, - bbox_weight=[10., 10., 5., 5.], - bbox_loss=None, - loss_normalize_pos=False, - cot_classes=None, - loss_cot='COTLoss', - use_cot=False): - super(BBoxHead, self).__init__() - self.head = head - self.roi_extractor = roi_extractor - if isinstance(roi_extractor, dict): - self.roi_extractor = RoIAlign(**roi_extractor) - self.bbox_assigner = bbox_assigner - - self.with_pool = with_pool - self.num_classes = num_classes - self.bbox_weight = bbox_weight - self.bbox_loss = bbox_loss - self.loss_normalize_pos = loss_normalize_pos - - self.loss_cot = loss_cot - self.cot_relation = None - self.cot_classes = cot_classes - self.use_cot = use_cot - if use_cot: - self.cot_bbox_score = nn.Linear( - in_channel, - self.num_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01))) - - self.bbox_score = nn.Linear( - in_channel, - self.cot_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01))) - self.cot_bbox_score.skip_quant = True - else: - self.bbox_score = nn.Linear( - in_channel, - self.num_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01))) - self.bbox_score.skip_quant = True - - self.bbox_delta = nn.Linear( - in_channel, - 4 * self.num_classes, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.001))) - self.bbox_delta.skip_quant = True - self.assigned_label = None - self.assigned_rois = None - - def init_cot_head(self, relationship): - self.cot_relation = relationship - - @classmethod - def from_config(cls, cfg, input_shape): - roi_pooler = cfg['roi_extractor'] - assert isinstance(roi_pooler, dict) - kwargs = RoIAlign.from_config(cfg, input_shape) - roi_pooler.update(kwargs) - kwargs = {'input_shape': input_shape} - head = create(cfg['head'], **kwargs) - return { - 'roi_extractor': roi_pooler, - 'head': head, - 'in_channel': head.out_shape[0].channels - } - - def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None, cot=False): - """ - body_feats (list[Tensor]): Feature maps from backbone - rois (list[Tensor]): RoIs generated from RPN module - rois_num (Tensor): 
The number of RoIs in each image - inputs (dict{Tensor}): The ground-truth of image - """ - if self.training: - rois, rois_num, targets = self.bbox_assigner(rois, rois_num, inputs) - self.assigned_rois = (rois, rois_num) - self.assigned_targets = targets - - rois_feat = self.roi_extractor(body_feats, rois, rois_num) - bbox_feat = self.head(rois_feat) - if self.with_pool: - feat = F.adaptive_avg_pool2d(bbox_feat, output_size=1) - feat = paddle.squeeze(feat, axis=[2, 3]) - else: - feat = bbox_feat - if self.use_cot: - scores = self.cot_bbox_score(feat) - cot_scores = self.bbox_score(feat) - else: - scores = self.bbox_score(feat) - deltas = self.bbox_delta(feat) - - if self.training: - loss = self.get_loss( - scores, - deltas, - targets, - rois, - self.bbox_weight, - loss_normalize_pos=self.loss_normalize_pos) - - if self.cot_relation is not None: - loss_cot = self.loss_cot(cot_scores, targets, self.cot_relation) - loss.update(loss_cot) - return loss, bbox_feat - else: - if cot: - pred = self.get_prediction(cot_scores, deltas) - else: - pred = self.get_prediction(scores, deltas) - return pred, self.head - - - def get_loss(self, - scores, - deltas, - targets, - rois, - bbox_weight, - loss_normalize_pos=False): - """ - scores (Tensor): scores from bbox head outputs - deltas (Tensor): deltas from bbox head outputs - targets (list[List[Tensor]]): bbox targets containing tgt_labels, tgt_bboxes and tgt_gt_inds - rois (List[Tensor]): RoIs generated in each batch - """ - cls_name = 'loss_bbox_cls' - reg_name = 'loss_bbox_reg' - loss_bbox = {} - - # TODO: better pass args - tgt_labels, tgt_bboxes, tgt_gt_inds = targets - - # bbox cls - tgt_labels = paddle.concat(tgt_labels) if len( - tgt_labels) > 1 else tgt_labels[0] - valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() - if valid_inds.shape[0] == 0: - loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') - else: - tgt_labels = tgt_labels.cast('int64') - tgt_labels.stop_gradient = True - - if not loss_normalize_pos: - loss_bbox_cls = F.cross_entropy( - input=scores, label=tgt_labels, reduction='mean') - else: - loss_bbox_cls = F.cross_entropy( - input=scores, label=tgt_labels, - reduction='none').sum() / (tgt_labels.shape[0] + 1e-7) - - loss_bbox[cls_name] = loss_bbox_cls - - # bbox reg - - cls_agnostic_bbox_reg = deltas.shape[1] == 4 - - fg_inds = paddle.nonzero( - paddle.logical_and(tgt_labels >= 0, tgt_labels < - self.num_classes)).flatten() - - if fg_inds.numel() == 0: - loss_bbox[reg_name] = paddle.zeros([1], dtype='float32') - return loss_bbox - - if cls_agnostic_bbox_reg: - reg_delta = paddle.gather(deltas, fg_inds) - else: - fg_gt_classes = paddle.gather(tgt_labels, fg_inds) - - reg_row_inds = paddle.arange(fg_gt_classes.shape[0]).unsqueeze(1) - reg_row_inds = paddle.tile(reg_row_inds, [1, 4]).reshape([-1, 1]) - - reg_col_inds = 4 * fg_gt_classes.unsqueeze(1) + paddle.arange(4) - - reg_col_inds = reg_col_inds.reshape([-1, 1]) - reg_inds = paddle.concat([reg_row_inds, reg_col_inds], axis=1) - - reg_delta = paddle.gather(deltas, fg_inds) - reg_delta = paddle.gather_nd(reg_delta, reg_inds).reshape([-1, 4]) - rois = paddle.concat(rois) if len(rois) > 1 else rois[0] - tgt_bboxes = paddle.concat(tgt_bboxes) if len( - tgt_bboxes) > 1 else tgt_bboxes[0] - - reg_target = bbox2delta(rois, tgt_bboxes, bbox_weight) - reg_target = paddle.gather(reg_target, fg_inds) - reg_target.stop_gradient = True - - if self.bbox_loss is not None: - reg_delta = self.bbox_transform(reg_delta) - reg_target = self.bbox_transform(reg_target) - - if not 
loss_normalize_pos: - loss_bbox_reg = self.bbox_loss( - reg_delta, reg_target).sum() / tgt_labels.shape[0] - loss_bbox_reg *= self.num_classes - - else: - loss_bbox_reg = self.bbox_loss( - reg_delta, reg_target).sum() / (tgt_labels.shape[0] + 1e-7) - - else: - loss_bbox_reg = paddle.abs(reg_delta - reg_target).sum( - ) / tgt_labels.shape[0] - - loss_bbox[reg_name] = loss_bbox_reg - - return loss_bbox - - def bbox_transform(self, deltas, weights=[0.1, 0.1, 0.2, 0.2]): - wx, wy, ww, wh = weights - - deltas = paddle.reshape(deltas, shape=(0, -1, 4)) - - dx = paddle.slice(deltas, axes=[2], starts=[0], ends=[1]) * wx - dy = paddle.slice(deltas, axes=[2], starts=[1], ends=[2]) * wy - dw = paddle.slice(deltas, axes=[2], starts=[2], ends=[3]) * ww - dh = paddle.slice(deltas, axes=[2], starts=[3], ends=[4]) * wh - - dw = paddle.clip(dw, -1.e10, np.log(1000. / 16)) - dh = paddle.clip(dh, -1.e10, np.log(1000. / 16)) - - pred_ctr_x = dx - pred_ctr_y = dy - pred_w = paddle.exp(dw) - pred_h = paddle.exp(dh) - - x1 = pred_ctr_x - 0.5 * pred_w - y1 = pred_ctr_y - 0.5 * pred_h - x2 = pred_ctr_x + 0.5 * pred_w - y2 = pred_ctr_y + 0.5 * pred_h - - x1 = paddle.reshape(x1, shape=(-1, )) - y1 = paddle.reshape(y1, shape=(-1, )) - x2 = paddle.reshape(x2, shape=(-1, )) - y2 = paddle.reshape(y2, shape=(-1, )) - - return paddle.concat([x1, y1, x2, y2]) - - def get_prediction(self, score, delta): - bbox_prob = F.softmax(score) - return delta, bbox_prob - - def get_head(self, ): - return self.head - - def get_assigned_targets(self, ): - return self.assigned_targets - - def get_assigned_rois(self, ): - return self.assigned_rois diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/cascade_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/cascade_head.py deleted file mode 100644 index d6f21d2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/cascade_head.py +++ /dev/null @@ -1,337 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
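# --- editor's sketch (not part of the original diff) -----------------------
# The test-time contract of the BBoxHead deleted above: get_prediction()
# returns the raw deltas plus softmax class probabilities, and the caller
# (ppdet's bbox post-processing) decodes the deltas against the RoIs with
# delta2bbox(). A hedged illustration on a pre-change checkout, with
# made-up values:
#
#   import paddle
#   import paddle.nn.functional as F
#   from ppdet.modeling.bbox_utils import delta2bbox
#
#   rois = paddle.to_tensor([[0., 0., 10., 10.]])
#   deltas = paddle.to_tensor([[0.1, 0.1, 0.0, 0.0]])  # one class, [n, 4]
#   scores = paddle.to_tensor([[2.0, 0.5]])            # fg/bg logits
#   probs = F.softmax(scores)                          # what get_prediction applies
#   boxes = delta2bbox(deltas, rois, weights=[10., 10., 5., 5.])
#   print(probs, boxes.reshape([-1, 4]))
# ----------------------------------------------------------------------------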
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal - -from ppdet.core.workspace import register -from .bbox_head import BBoxHead, TwoFCHead, XConvNormHead -from .roi_extractor import RoIAlign -from ..shape_spec import ShapeSpec -from ..bbox_utils import delta2bbox, clip_bbox, nonempty_bbox -from ..cls_utils import _get_class_default_kwargs - -__all__ = ['CascadeTwoFCHead', 'CascadeXConvNormHead', 'CascadeHead'] - - -@register -class CascadeTwoFCHead(nn.Layer): - __shared__ = ['num_cascade_stage'] - """ - Cascade RCNN bbox head with Two fc layers to extract feature - - Args: - in_channel (int): Input channel which can be derived by from_config - out_channel (int): Output channel - resolution (int): Resolution of input feature map, default 7 - num_cascade_stage (int): The number of cascade stage, default 3 - """ - - def __init__(self, - in_channel=256, - out_channel=1024, - resolution=7, - num_cascade_stage=3): - super(CascadeTwoFCHead, self).__init__() - - self.in_channel = in_channel - self.out_channel = out_channel - - self.head_list = [] - for stage in range(num_cascade_stage): - head_per_stage = self.add_sublayer( - str(stage), TwoFCHead(in_channel, out_channel, resolution)) - self.head_list.append(head_per_stage) - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat, stage=0): - out = self.head_list[stage](rois_feat) - return out - - -@register -class CascadeXConvNormHead(nn.Layer): - __shared__ = ['norm_type', 'freeze_norm', 'num_cascade_stage'] - """ - Cascade RCNN bbox head with serveral convolution layers - - Args: - in_channel (int): Input channels which can be derived by from_config - num_convs (int): The number of conv layers - conv_dim (int): The number of channels for the conv layers - out_channel (int): Output channels - resolution (int): Resolution of input feature map - norm_type (string): Norm type, bn, gn, sync_bn are available, - default `gn` - freeze_norm (bool): Whether to freeze the norm - num_cascade_stage (int): The number of cascade stage, default 3 - """ - - def __init__(self, - in_channel=256, - num_convs=4, - conv_dim=256, - out_channel=1024, - resolution=7, - norm_type='gn', - freeze_norm=False, - num_cascade_stage=3): - super(CascadeXConvNormHead, self).__init__() - self.in_channel = in_channel - self.out_channel = out_channel - - self.head_list = [] - for stage in range(num_cascade_stage): - head_per_stage = self.add_sublayer( - str(stage), - XConvNormHead( - in_channel, - num_convs, - conv_dim, - out_channel, - resolution, - norm_type, - freeze_norm, - stage_name='stage{}_'.format(stage))) - self.head_list.append(head_per_stage) - - @classmethod - def from_config(cls, cfg, input_shape): - s = input_shape - s = s[0] if isinstance(s, (list, tuple)) else s - return {'in_channel': s.channels} - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, )] - - def forward(self, rois_feat, stage=0): - out = self.head_list[stage](rois_feat) - return out - - -@register -class CascadeHead(BBoxHead): - __shared__ = ['num_classes', 'num_cascade_stages'] - __inject__ = ['bbox_assigner', 'bbox_loss'] - """ - Cascade RCNN bbox head - - Args: - head (nn.Layer): Extract feature in bbox head - in_channel (int): Input channel after RoI extractor - roi_extractor 
(object): The module of RoI Extractor - bbox_assigner (object): The module of Box Assigner, label and sample the - box. - num_classes (int): The number of classes - bbox_weight (List[List[float]]): The weight to get the decode box and the - length of weight is the number of cascade stage - num_cascade_stages (int): THe number of stage to refine the box - """ - - def __init__(self, - head, - in_channel, - roi_extractor=_get_class_default_kwargs(RoIAlign), - bbox_assigner='BboxAssigner', - num_classes=80, - bbox_weight=[[10., 10., 5., 5.], [20.0, 20.0, 10.0, 10.0], - [30.0, 30.0, 15.0, 15.0]], - num_cascade_stages=3, - bbox_loss=None, - reg_class_agnostic=True, - stage_loss_weights=None, - loss_normalize_pos=False, - add_gt_as_proposals=[True, False, False]): - - nn.Layer.__init__(self, ) - self.head = head - self.roi_extractor = roi_extractor - if isinstance(roi_extractor, dict): - self.roi_extractor = RoIAlign(**roi_extractor) - self.bbox_assigner = bbox_assigner - - self.num_classes = num_classes - self.bbox_weight = bbox_weight - self.num_cascade_stages = num_cascade_stages - self.bbox_loss = bbox_loss - self.stage_loss_weights = [ - 1. / num_cascade_stages for _ in range(num_cascade_stages) - ] if stage_loss_weights is None else stage_loss_weights - self.add_gt_as_proposals = add_gt_as_proposals - - assert len( - self.stage_loss_weights - ) == num_cascade_stages, f'stage_loss_weights({len(self.stage_loss_weights)}) do not equal to num_cascade_stages({num_cascade_stages})' - - self.reg_class_agnostic = reg_class_agnostic - num_bbox_delta = 4 if reg_class_agnostic else 4 * num_classes - self.loss_normalize_pos = loss_normalize_pos - - self.bbox_score_list = [] - self.bbox_delta_list = [] - for i in range(num_cascade_stages): - score_name = 'bbox_score_stage{}'.format(i) - delta_name = 'bbox_delta_stage{}'.format(i) - bbox_score = self.add_sublayer( - score_name, - nn.Linear( - in_channel, - self.num_classes + 1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.01)))) - - bbox_delta = self.add_sublayer( - delta_name, - nn.Linear( - in_channel, - num_bbox_delta, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0.0, std=0.001)))) - self.bbox_score_list.append(bbox_score) - self.bbox_delta_list.append(bbox_delta) - self.assigned_label = None - self.assigned_rois = None - - def forward(self, body_feats=None, rois=None, rois_num=None, inputs=None): - """ - body_feats (list[Tensor]): Feature maps from backbone - rois (Tensor): RoIs generated from RPN module - rois_num (Tensor): The number of RoIs in each image - inputs (dict{Tensor}): The ground-truth of image - """ - targets = [] - if self.training: - rois, rois_num, targets = self.bbox_assigner( - rois, - rois_num, - inputs, - add_gt_as_proposals=self.add_gt_as_proposals[0]) - targets_list = [targets] - self.assigned_rois = (rois, rois_num) - self.assigned_targets = targets - - pred_bbox = None - head_out_list = [] - for i in range(self.num_cascade_stages): - if i > 0: - rois, rois_num = self._get_rois_from_boxes(pred_bbox, - inputs['im_shape']) - if self.training: - rois, rois_num, targets = self.bbox_assigner( - rois, - rois_num, - inputs, - i, - is_cascade=True, - add_gt_as_proposals=self.add_gt_as_proposals[i]) - targets_list.append(targets) - - rois_feat = self.roi_extractor(body_feats, rois, rois_num) - bbox_feat = self.head(rois_feat, i) - scores = self.bbox_score_list[i](bbox_feat) - deltas = self.bbox_delta_list[i](bbox_feat) - - # TODO (lyuwenyu) Is it correct for only one class ? 
-            if not self.reg_class_agnostic and i < self.num_cascade_stages - 1:
-                deltas = deltas.reshape([deltas.shape[0], self.num_classes, 4])
-                labels = scores[:, :-1].argmax(axis=-1)
-
-                if self.training:
-                    deltas = deltas[paddle.arange(deltas.shape[0]), labels]
-                else:
-                    deltas = deltas[((deltas + 10000) * F.one_hot(
-                        labels, num_classes=self.num_classes).unsqueeze(-1) != 0
-                                     ).nonzero(as_tuple=True)].reshape(
-                                         [deltas.shape[0], 4])
-
-            head_out_list.append([scores, deltas, rois])
-            pred_bbox = self._get_pred_bbox(deltas, rois, self.bbox_weight[i])
-
-        if self.training:
-            loss = {}
-            for stage, value in enumerate(zip(head_out_list, targets_list)):
-                (scores, deltas, rois), targets = value
-                loss_stage = self.get_loss(
-                    scores,
-                    deltas,
-                    targets,
-                    rois,
-                    self.bbox_weight[stage],
-                    loss_normalize_pos=self.loss_normalize_pos)
-                for k, v in loss_stage.items():
-                    loss[k + "_stage{}".format(
-                        stage)] = v * self.stage_loss_weights[stage]
-
-            return loss, bbox_feat
-        else:
-            scores, deltas, self.refined_rois = self.get_prediction(
-                head_out_list)
-            return (deltas, scores), self.head
-
-    def _get_rois_from_boxes(self, boxes, im_shape):
-        rois = []
-        for i, boxes_per_image in enumerate(boxes):
-            clip_box = clip_bbox(boxes_per_image, im_shape[i])
-            if self.training:
-                keep = nonempty_bbox(clip_box)
-                if keep.shape[0] == 0:
-                    keep = paddle.zeros([1], dtype='int32')
-                clip_box = paddle.gather(clip_box, keep)
-            rois.append(clip_box)
-        rois_num = paddle.concat([paddle.shape(r)[0:1] for r in rois])
-        return rois, rois_num
-
-    def _get_pred_bbox(self, deltas, proposals, weights):
-        pred_proposals = paddle.concat(proposals) if len(
-            proposals) > 1 else proposals[0]
-        pred_bbox = delta2bbox(deltas, pred_proposals, weights)
-        pred_bbox = paddle.reshape(pred_bbox, [-1, deltas.shape[-1]])
-        num_prop = []
-        for p in proposals:
-            num_prop.append(p.shape[0])
-
-        # NOTE(dev): num_prop will be tagged as LoDTensorArray because it
-        # depends on batch_size under @to_static. However the argument
-        # num_or_sections in paddle.split does not support LoDTensorArray,
-        # so we use [-1] to replace it if num_prop is not a list. This
-        # ensures the correctness of both dynamic and static graphs.
-        if not isinstance(num_prop, list):
-            num_prop = [-1]
-        return pred_bbox.split(num_prop)
-
-    def get_prediction(self, head_out_list):
-        """
-        head_out_list(List[Tensor]): scores, deltas, rois
-        """
-        pred_list = []
-        scores_list = [F.softmax(head[0]) for head in head_out_list]
-        scores = paddle.add_n(scores_list) / self.num_cascade_stages
-        # Get deltas and rois from the last stage
-        _, deltas, rois = head_out_list[-1]
-        return scores, deltas, rois
-
-    def get_refined_rois(self, ):
-        return self.refined_rois
diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/centernet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/centernet_head.py
deleted file mode 100644
index 7657774..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/heads/centernet_head.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
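# --- editor's sketch (not part of the original diff) -----------------------
# CascadeHead.get_prediction() above averages the per-stage softmax scores
# and keeps the last stage's deltas/rois. The averaging step in isolation,
# with made-up logits for the three cascade stages:
#
#   import paddle
#   import paddle.nn.functional as F
#
#   stage_logits = [paddle.to_tensor([[2.0, 0.1]]),
#                   paddle.to_tensor([[1.5, 0.3]]),
#                   paddle.to_tensor([[1.8, 0.2]])]
#   scores_list = [F.softmax(s) for s in stage_logits]
#   scores = paddle.add_n(scores_list) / len(stage_logits)
#   print(scores)  # the ensemble of the three stages
# ----------------------------------------------------------------------------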
-# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Constant, Uniform -from ppdet.core.workspace import register -from ppdet.modeling.losses import CTFocalLoss, GIoULoss - - -class ConvLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=False): - super(ConvLayer, self).__init__() - bias_attr = False - fan_in = ch_in * kernel_size**2 - bound = 1 / math.sqrt(fan_in) - param_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) - if bias: - bias_attr = paddle.ParamAttr(initializer=Constant(0.)) - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - weight_attr=param_attr, - bias_attr=bias_attr) - - def forward(self, inputs): - out = self.conv(inputs) - return out - - -@register -class CenterNetHead(nn.Layer): - """ - Args: - in_channels (int): the channel number of input to CenterNetHead. - num_classes (int): the number of classes, 80 (COCO dataset) by default. - head_planes (int): the channel number in all head, 256 by default. - prior_bias (float): prior bias in heatmap head, -2.19 by default, -4.6 in CenterTrack - regress_ltrb (bool): whether to regress left/top/right/bottom or - width/height for a box, True by default. - size_loss (str): the type of size regression loss, 'L1' by default, can be 'giou'. - loss_weight (dict): the weight of each loss. - add_iou (bool): whether to add iou branch, False by default. - """ - - __shared__ = ['num_classes'] - - def __init__(self, - in_channels, - num_classes=80, - head_planes=256, - prior_bias=-2.19, - regress_ltrb=True, - size_loss='L1', - loss_weight={ - 'heatmap': 1.0, - 'size': 0.1, - 'offset': 1.0, - 'iou': 0.0, - }, - add_iou=False): - super(CenterNetHead, self).__init__() - self.regress_ltrb = regress_ltrb - self.loss_weight = loss_weight - self.add_iou = add_iou - - # heatmap head - self.heatmap = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - num_classes, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - with paddle.no_grad(): - self.heatmap[2].conv.bias[:] = prior_bias - - # size(ltrb or wh) head - self.size = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - 4 if regress_ltrb else 2, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - self.size_loss = size_loss - - # offset head - self.offset = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) - - # iou head (optinal) - if self.add_iou and 'iou' in self.loss_weight: - self.iou = nn.Sequential( - ConvLayer( - in_channels, - head_planes, - kernel_size=3, - padding=1, - bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - 4 if regress_ltrb else 2, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels} - - def forward(self, feat, inputs): - heatmap = F.sigmoid(self.heatmap(feat)) - size = self.size(feat) - offset = 
self.offset(feat) - head_outs = {'heatmap': heatmap, 'size': size, 'offset': offset} - if self.add_iou and 'iou' in self.loss_weight: - iou = self.iou(feat) - head_outs.update({'iou': iou}) - - if self.training: - losses = self.get_loss(inputs, self.loss_weight, head_outs) - return losses - else: - return head_outs - - def get_loss(self, inputs, weights, head_outs): - # 1.heatmap(hm) head loss: CTFocalLoss - heatmap = head_outs['heatmap'] - heatmap_target = inputs['heatmap'] - heatmap = paddle.clip(heatmap, 1e-4, 1 - 1e-4) - ctfocal_loss = CTFocalLoss() - heatmap_loss = ctfocal_loss(heatmap, heatmap_target) - - # 2.size(wh) head loss: L1 loss or GIoU loss - size = head_outs['size'] - index = inputs['index'] - mask = inputs['index_mask'] - size = paddle.transpose(size, perm=[0, 2, 3, 1]) - size_n, _, _, size_c = size.shape - size = paddle.reshape(size, shape=[size_n, -1, size_c]) - index = paddle.unsqueeze(index, 2) - batch_inds = list() - for i in range(size_n): - batch_ind = paddle.full( - shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - pos_size = paddle.gather_nd(size, index=index) - mask = paddle.unsqueeze(mask, axis=2) - size_mask = paddle.expand_as(mask, pos_size) - size_mask = paddle.cast(size_mask, dtype=pos_size.dtype) - pos_num = size_mask.sum() - size_mask.stop_gradient = True - if self.size_loss == 'L1': - if self.regress_ltrb: - size_target = inputs['size'] - # shape: [bs, max_per_img, 4] - else: - if inputs['size'].shape[-1] == 2: - # inputs['size'] is wh, and regress as wh - # shape: [bs, max_per_img, 2] - size_target = inputs['size'] - else: - # inputs['size'] is ltrb, but regress as wh - # shape: [bs, max_per_img, 4] - size_target = inputs['size'][:, :, 0:2] + inputs[ - 'size'][:, :, 2:] - - size_target.stop_gradient = True - size_loss = F.l1_loss( - pos_size * size_mask, size_target * size_mask, reduction='sum') - size_loss = size_loss / (pos_num + 1e-4) - elif self.size_loss == 'giou': - size_target = inputs['bbox_xys'] - size_target.stop_gradient = True - centers_x = (size_target[:, :, 0:1] + size_target[:, :, 2:3]) / 2.0 - centers_y = (size_target[:, :, 1:2] + size_target[:, :, 3:4]) / 2.0 - x1 = centers_x - pos_size[:, :, 0:1] - y1 = centers_y - pos_size[:, :, 1:2] - x2 = centers_x + pos_size[:, :, 2:3] - y2 = centers_y + pos_size[:, :, 3:4] - pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) - giou_loss = GIoULoss(reduction='sum') - size_loss = giou_loss( - pred_boxes * size_mask, - size_target * size_mask, - iou_weight=size_mask, - loc_reweight=None) - size_loss = size_loss / (pos_num + 1e-4) - - # 3.offset(reg) head loss: L1 loss - offset = head_outs['offset'] - offset_target = inputs['offset'] - offset = paddle.transpose(offset, perm=[0, 2, 3, 1]) - offset_n, _, _, offset_c = offset.shape - offset = paddle.reshape(offset, shape=[offset_n, -1, offset_c]) - pos_offset = paddle.gather_nd(offset, index=index) - offset_mask = paddle.expand_as(mask, pos_offset) - offset_mask = paddle.cast(offset_mask, dtype=pos_offset.dtype) - pos_num = offset_mask.sum() - offset_mask.stop_gradient = True - offset_target.stop_gradient = True - offset_loss = F.l1_loss( - pos_offset * offset_mask, - offset_target * offset_mask, - reduction='sum') - offset_loss = offset_loss / (pos_num + 1e-4) - - # 4.iou head loss: GIoU loss (optinal) - if self.add_iou and 'iou' in self.loss_weight: - iou = head_outs['iou'] - iou = paddle.transpose(iou, 
perm=[0, 2, 3, 1]) - iou_n, _, _, iou_c = iou.shape - iou = paddle.reshape(iou, shape=[iou_n, -1, iou_c]) - pos_iou = paddle.gather_nd(iou, index=index) - iou_mask = paddle.expand_as(mask, pos_iou) - iou_mask = paddle.cast(iou_mask, dtype=pos_iou.dtype) - pos_num = iou_mask.sum() - iou_mask.stop_gradient = True - gt_bbox_xys = inputs['bbox_xys'] - gt_bbox_xys.stop_gradient = True - centers_x = (gt_bbox_xys[:, :, 0:1] + gt_bbox_xys[:, :, 2:3]) / 2.0 - centers_y = (gt_bbox_xys[:, :, 1:2] + gt_bbox_xys[:, :, 3:4]) / 2.0 - x1 = centers_x - pos_size[:, :, 0:1] - y1 = centers_y - pos_size[:, :, 1:2] - x2 = centers_x + pos_size[:, :, 2:3] - y2 = centers_y + pos_size[:, :, 3:4] - pred_boxes = paddle.concat([x1, y1, x2, y2], axis=-1) - giou_loss = GIoULoss(reduction='sum') - iou_loss = giou_loss( - pred_boxes * iou_mask, - gt_bbox_xys * iou_mask, - iou_weight=iou_mask, - loc_reweight=None) - iou_loss = iou_loss / (pos_num + 1e-4) - - losses = { - 'heatmap_loss': heatmap_loss, - 'size_loss': size_loss, - 'offset_loss': offset_loss, - } - det_loss = weights['heatmap'] * heatmap_loss + weights[ - 'size'] * size_loss + weights['offset'] * offset_loss - - if self.add_iou and 'iou' in self.loss_weight: - losses.update({'iou_loss': iou_loss}) - det_loss += weights['iou'] * iou_loss - losses.update({'det_loss': det_loss}) - return losses diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/centertrack_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/centertrack_head.py deleted file mode 100644 index dc35336..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/centertrack_head.py +++ /dev/null @@ -1,244 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from .centernet_head import ConvLayer -from ..keypoint_utils import get_affine_transform - -__all__ = ['CenterTrackHead'] - - -@register -class CenterTrackHead(nn.Layer): - """ - Args: - in_channels (int): the channel number of input to CenterNetHead. - num_classes (int): the number of classes, 1 (MOT17 dataset) by default. - head_planes (int): the channel number in all head, 256 by default. - task (str): the type of task for regression, 'tracking' by default. - loss_weight (dict): the weight of each loss. - add_ltrb_amodal (bool): whether to add ltrb_amodal branch, False by default. 
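# The CenterNet losses above and the CenterTrack losses below gather the
# predictions at annotated object centers with one shared indexing pattern:
# flatten the HxW map, prepend a batch-index column, then paddle.gather_nd.
# A toy sketch with assumed sizes (not part of the deleted files):
import paddle

bs, c, h, w, max_objs = 2, 2, 4, 4, 5
pred = paddle.randn([bs, c, h, w])
pred = pred.transpose([0, 2, 3, 1]).reshape([bs, h * w, c])
index = paddle.randint(0, h * w, shape=[bs, max_objs, 1], dtype='int64')
batch_inds = paddle.concat([
    paddle.full([1, max_objs, 1], i, dtype='int64') for i in range(bs)
])
pos_pred = paddle.gather_nd(pred, paddle.concat([batch_inds, index], axis=2))
# pos_pred: [bs, max_objs, c], one prediction vector per object center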
- """ - - __shared__ = ['num_classes'] - - def __init__(self, - in_channels, - num_classes=1, - head_planes=256, - task='tracking', - loss_weight={ - 'tracking': 1.0, - 'ltrb_amodal': 0.1, - }, - add_ltrb_amodal=True): - super(CenterTrackHead, self).__init__() - self.task = task - self.loss_weight = loss_weight - self.add_ltrb_amodal = add_ltrb_amodal - - # tracking head - self.tracking = nn.Sequential( - ConvLayer( - in_channels, head_planes, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - head_planes, 2, kernel_size=1, stride=1, padding=0, bias=True)) - - # ltrb_amodal head - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - self.ltrb_amodal = nn.Sequential( - ConvLayer( - in_channels, - head_planes, - kernel_size=3, - padding=1, - bias=True), - nn.ReLU(), - ConvLayer( - head_planes, - 4, - kernel_size=1, - stride=1, - padding=0, - bias=True)) - - # TODO: add more tasks - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels} - - def forward(self, - feat, - inputs, - bboxes=None, - bbox_inds=None, - topk_clses=None, - topk_ys=None, - topk_xs=None): - tracking = self.tracking(feat) - head_outs = {'tracking': tracking} - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - ltrb_amodal = self.ltrb_amodal(feat) - head_outs.update({'ltrb_amodal': ltrb_amodal}) - - if self.training: - losses = self.get_loss(inputs, self.loss_weight, head_outs) - return losses - else: - ret = self.generic_decode(head_outs, bboxes, bbox_inds, topk_ys, - topk_xs) - return ret - - def get_loss(self, inputs, weights, head_outs): - index = inputs['index'].unsqueeze(2) - mask = inputs['index_mask'].unsqueeze(2) - batch_inds = list() - for i in range(head_outs['tracking'].shape[0]): - batch_ind = paddle.full( - shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - - # 1.tracking head loss: L1 loss - tracking = head_outs['tracking'].transpose([0, 2, 3, 1]) - tracking_target = inputs['tracking'] - bs, _, _, c = tracking.shape - tracking = tracking.reshape([bs, -1, c]) - pos_tracking = paddle.gather_nd(tracking, index=index) - tracking_mask = paddle.cast( - paddle.expand_as(mask, pos_tracking), dtype=pos_tracking.dtype) - pos_num = tracking_mask.sum() - tracking_mask.stop_gradient = True - tracking_target.stop_gradient = True - tracking_loss = F.l1_loss( - pos_tracking * tracking_mask, - tracking_target * tracking_mask, - reduction='sum') - tracking_loss = tracking_loss / (pos_num + 1e-4) - - # 2.ltrb_amodal head loss(optinal): L1 loss - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - ltrb_amodal = head_outs['ltrb_amodal'].transpose([0, 2, 3, 1]) - ltrb_amodal_target = inputs['ltrb_amodal'] - bs, _, _, c = ltrb_amodal.shape - ltrb_amodal = ltrb_amodal.reshape([bs, -1, c]) - pos_ltrb_amodal = paddle.gather_nd(ltrb_amodal, index=index) - ltrb_amodal_mask = paddle.cast( - paddle.expand_as(mask, pos_ltrb_amodal), - dtype=pos_ltrb_amodal.dtype) - pos_num = ltrb_amodal_mask.sum() - ltrb_amodal_mask.stop_gradient = True - ltrb_amodal_target.stop_gradient = True - ltrb_amodal_loss = F.l1_loss( - pos_ltrb_amodal * ltrb_amodal_mask, - ltrb_amodal_target * ltrb_amodal_mask, - reduction='sum') - ltrb_amodal_loss = ltrb_amodal_loss / (pos_num + 1e-4) - - losses = {'tracking_loss': tracking_loss, } - plugin_loss 
= weights['tracking'] * tracking_loss - - if self.add_ltrb_amodal and 'ltrb_amodal' in self.loss_weight: - losses.update({'ltrb_amodal_loss': ltrb_amodal_loss}) - plugin_loss += weights['ltrb_amodal'] * ltrb_amodal_loss - losses.update({'plugin_loss': plugin_loss}) - return losses - - def generic_decode(self, head_outs, bboxes, bbox_inds, topk_ys, topk_xs): - topk_ys = paddle.floor(topk_ys) # note: More accurate - topk_xs = paddle.floor(topk_xs) - cts = paddle.concat([topk_xs, topk_ys], 1) - ret = {'bboxes': bboxes, 'cts': cts} - - regression_heads = ['tracking'] # todo: add more tasks - for head in regression_heads: - if head in head_outs: - ret[head] = _tranpose_and_gather_feat(head_outs[head], - bbox_inds) - - if 'ltrb_amodal' in head_outs: - ltrb_amodal = head_outs['ltrb_amodal'] - ltrb_amodal = _tranpose_and_gather_feat(ltrb_amodal, bbox_inds) - bboxes_amodal = paddle.concat( - [ - topk_xs * 1.0 + ltrb_amodal[..., 0:1], - topk_ys * 1.0 + ltrb_amodal[..., 1:2], - topk_xs * 1.0 + ltrb_amodal[..., 2:3], - topk_ys * 1.0 + ltrb_amodal[..., 3:4] - ], - axis=1) - ret['bboxes'] = paddle.concat([bboxes[:, 0:2], bboxes_amodal], 1) - # cls_id, score, x0, y0, x1, y1 - - return ret - - def centertrack_post_process(self, dets, meta, out_thresh): - if not ('bboxes' in dets): - return [{}] - - preds = [] - c, s = meta['center'].numpy(), meta['scale'].numpy() - h, w = meta['out_height'].numpy(), meta['out_width'].numpy() - trans = get_affine_transform( - center=c[0], - input_size=s[0], - rot=0, - output_size=[w[0], h[0]], - shift=(0., 0.), - inv=True).astype(np.float32) - for i, dets_bbox in enumerate(dets['bboxes']): - if dets_bbox[1] < out_thresh: - break - item = {} - item['score'] = dets_bbox[1] - item['class'] = int(dets_bbox[0]) + 1 - item['ct'] = transform_preds_with_trans( - dets['cts'][i].reshape([1, 2]), trans).reshape(2) - - if 'tracking' in dets: - tracking = transform_preds_with_trans( - (dets['tracking'][i] + dets['cts'][i]).reshape([1, 2]), - trans).reshape(2) - item['tracking'] = tracking - item['ct'] - - if 'bboxes' in dets: - bbox = transform_preds_with_trans( - dets_bbox[2:6].reshape([2, 2]), trans).reshape(4) - item['bbox'] = bbox - - preds.append(item) - return preds - - -def transform_preds_with_trans(coords, trans): - target_coords = np.ones((coords.shape[0], 3), np.float32) - target_coords[:, :2] = coords - target_coords = np.dot(trans, target_coords.transpose()).transpose() - return target_coords[:, :2] - - -def _tranpose_and_gather_feat(feat, bbox_inds): - feat = feat.transpose([0, 2, 3, 1]) - feat = feat.reshape([-1, feat.shape[3]]) - feat = paddle.gather(feat, bbox_inds) - return feat diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/clrnet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/clrnet_head.py deleted file mode 100644 index 14760b9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/clrnet_head.py +++ /dev/null @@ -1,399 +0,0 @@ -import math -import paddle -import numpy as np -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ppdet.modeling.initializer import normal_ -from ppdet.modeling.lane_utils import Lane -from ppdet.modeling.losses import line_iou -from ppdet.modeling.clrnet_utils import ROIGather, LinearModule, SegDecoder - -__all__ = ['CLRHead'] - - -@register -class CLRHead(nn.Layer): - __inject__ = ['loss'] - __shared__ = [ - 'img_w', 'img_h', 'ori_img_h', 'num_classes', 'cut_height', - 'num_points', "max_lanes" - ] - - def __init__(self, - num_points=72, - prior_feat_channels=64, - 
fc_hidden_dim=64, - num_priors=192, - img_w=800, - img_h=320, - ori_img_h=590, - cut_height=270, - num_classes=5, - num_fc=2, - refine_layers=3, - sample_points=36, - conf_threshold=0.4, - nms_thres=0.5, - max_lanes=4, - loss='CLRNetLoss'): - super(CLRHead, self).__init__() - self.img_w = img_w - self.img_h = img_h - self.n_strips = num_points - 1 - self.n_offsets = num_points - self.num_priors = num_priors - self.sample_points = sample_points - self.refine_layers = refine_layers - self.num_classes = num_classes - self.fc_hidden_dim = fc_hidden_dim - self.ori_img_h = ori_img_h - self.cut_height = cut_height - self.conf_threshold = conf_threshold - self.nms_thres = nms_thres - self.max_lanes = max_lanes - self.prior_feat_channels = prior_feat_channels - self.loss = loss - self.register_buffer( - name='sample_x_indexs', - tensor=(paddle.linspace( - start=0, stop=1, num=self.sample_points, - dtype=paddle.float32) * self.n_strips).astype(dtype='int64')) - self.register_buffer( - name='prior_feat_ys', - tensor=paddle.flip( - x=(1 - self.sample_x_indexs.astype('float32') / self.n_strips), - axis=[-1])) - self.register_buffer( - name='prior_ys', - tensor=paddle.linspace( - start=1, stop=0, num=self.n_offsets).astype('float32')) - self.prior_feat_channels = prior_feat_channels - self._init_prior_embeddings() - init_priors, priors_on_featmap = self.generate_priors_from_embeddings() - self.register_buffer(name='priors', tensor=init_priors) - self.register_buffer(name='priors_on_featmap', tensor=priors_on_featmap) - self.seg_decoder = SegDecoder(self.img_h, self.img_w, self.num_classes, - self.prior_feat_channels, - self.refine_layers) - reg_modules = list() - cls_modules = list() - for _ in range(num_fc): - reg_modules += [*LinearModule(self.fc_hidden_dim)] - cls_modules += [*LinearModule(self.fc_hidden_dim)] - self.reg_modules = nn.LayerList(sublayers=reg_modules) - self.cls_modules = nn.LayerList(sublayers=cls_modules) - self.roi_gather = ROIGather(self.prior_feat_channels, self.num_priors, - self.sample_points, self.fc_hidden_dim, - self.refine_layers) - self.reg_layers = nn.Linear( - in_features=self.fc_hidden_dim, - out_features=self.n_offsets + 1 + 2 + 1, - bias_attr=True) - self.cls_layers = nn.Linear( - in_features=self.fc_hidden_dim, out_features=2, bias_attr=True) - self.init_weights() - - def init_weights(self): - for m in self.cls_layers.parameters(): - normal_(m, mean=0.0, std=0.001) - for m in self.reg_layers.parameters(): - normal_(m, mean=0.0, std=0.001) - - def pool_prior_features(self, batch_features, num_priors, prior_xs): - """ - pool prior feature from feature map. 
- Args: - batch_features (Tensor): Input feature maps, shape: (B, C, H, W) - """ - batch_size = batch_features.shape[0] - prior_xs = prior_xs.reshape([batch_size, num_priors, -1, 1]) - - prior_ys = self.prior_feat_ys.tile(repeat_times=[ - batch_size * num_priors - ]).reshape([batch_size, num_priors, -1, 1]) - prior_xs = prior_xs * 2.0 - 1.0 - prior_ys = prior_ys * 2.0 - 1.0 - grid = paddle.concat(x=(prior_xs, prior_ys), axis=-1) - feature = F.grid_sample( - x=batch_features, grid=grid, - align_corners=True).transpose(perm=[0, 2, 1, 3]) - feature = feature.reshape([ - batch_size * num_priors, self.prior_feat_channels, - self.sample_points, 1 - ]) - return feature - - def generate_priors_from_embeddings(self): - predictions = self.prior_embeddings.weight - # 2 scores, 1 start_y, 1 start_x, 1 theta, 1 length, 72 coordinates, score[0] = negative prob, score[1] = positive prob - priors = paddle.zeros( - (self.num_priors, 2 + 2 + 2 + self.n_offsets), - dtype=predictions.dtype) - priors[:, 2:5] = predictions.clone() - priors[:, 6:] = ( - priors[:, 3].unsqueeze(1).clone().tile([1, self.n_offsets]) * - (self.img_w - 1) + - ((1 - self.prior_ys.tile([self.num_priors, 1]) - - priors[:, 2].unsqueeze(1).clone().tile([1, self.n_offsets])) * - self.img_h / paddle.tan(x=priors[:, 4].unsqueeze(1).clone().tile( - [1, self.n_offsets]) * math.pi + 1e-05))) / (self.img_w - 1) - priors_on_featmap = paddle.index_select( - priors, 6 + self.sample_x_indexs, axis=-1) - return priors, priors_on_featmap - - def _init_prior_embeddings(self): - self.prior_embeddings = nn.Embedding(self.num_priors, 3) - bottom_priors_nums = self.num_priors * 3 // 4 - left_priors_nums, _ = self.num_priors // 8, self.num_priors // 8 - strip_size = 0.5 / (left_priors_nums // 2 - 1) - bottom_strip_size = 1 / (bottom_priors_nums // 4 + 1) - - with paddle.no_grad(): - for i in range(left_priors_nums): - self.prior_embeddings.weight[i, 0] = i // 2 * strip_size - self.prior_embeddings.weight[i, 1] = 0.0 - self.prior_embeddings.weight[i, - 2] = 0.16 if i % 2 == 0 else 0.32 - - for i in range(left_priors_nums, - left_priors_nums + bottom_priors_nums): - self.prior_embeddings.weight[i, 0] = 0.0 - self.prior_embeddings.weight[i, 1] = ( - (i - left_priors_nums) // 4 + 1) * bottom_strip_size - self.prior_embeddings.weight[i, 2] = 0.2 * (i % 4 + 1) - - for i in range(left_priors_nums + bottom_priors_nums, - self.num_priors): - self.prior_embeddings.weight[i, 0] = ( - i - left_priors_nums - bottom_priors_nums) // 2 * strip_size - self.prior_embeddings.weight[i, 1] = 1.0 - self.prior_embeddings.weight[i, - 2] = 0.68 if i % 2 == 0 else 0.84 - - def forward(self, x, inputs=None): - """ - Take pyramid features as input to perform Cross Layer Refinement and finally output the prediction lanes. - Each feature is a 4D tensor. 
- Args: - x: input features (list[Tensor]) - Return: - prediction_list: each layer's prediction result - seg: segmentation result for auxiliary loss - """ - batch_features = list(x[len(x) - self.refine_layers:]) - batch_features.reverse() - batch_size = batch_features[-1].shape[0] - - if self.training: - self.priors, self.priors_on_featmap = self.generate_priors_from_embeddings( - ) - priors, priors_on_featmap = self.priors.tile( - [batch_size, 1, - 1]), self.priors_on_featmap.tile([batch_size, 1, 1]) - predictions_lists = [] - prior_features_stages = [] - - for stage in range(self.refine_layers): - num_priors = priors_on_featmap.shape[1] - prior_xs = paddle.flip(x=priors_on_featmap, axis=[2]) - batch_prior_features = self.pool_prior_features( - batch_features[stage], num_priors, prior_xs) - prior_features_stages.append(batch_prior_features) - - fc_features = self.roi_gather(prior_features_stages, - batch_features[stage], stage) - # return fc_features - fc_features = fc_features.reshape( - [num_priors, batch_size, -1]).reshape( - [batch_size * num_priors, self.fc_hidden_dim]) - cls_features = fc_features.clone() - reg_features = fc_features.clone() - - for cls_layer in self.cls_modules: - cls_features = cls_layer(cls_features) - - # return cls_features - for reg_layer in self.reg_modules: - reg_features = reg_layer(reg_features) - cls_logits = self.cls_layers(cls_features) - reg = self.reg_layers(reg_features) - - cls_logits = cls_logits.reshape( - [batch_size, -1, cls_logits.shape[1]]) - reg = reg.reshape([batch_size, -1, reg.shape[1]]) - predictions = priors.clone() - predictions[:, :, :2] = cls_logits - predictions[:, :, 2:5] += reg[:, :, :3] - predictions[:, :, 5] = reg[:, :, 3] - - def tran_tensor(t): - return t.unsqueeze(axis=2).clone().tile([1, 1, self.n_offsets]) - - predictions[..., 6:] = ( - tran_tensor(predictions[..., 3]) * (self.img_w - 1) + - ((1 - self.prior_ys.tile([batch_size, num_priors, 1]) - - tran_tensor(predictions[..., 2])) * self.img_h / paddle.tan( - tran_tensor(predictions[..., 4]) * math.pi + 1e-05))) / ( - self.img_w - 1) - - prediction_lines = predictions.clone() - predictions[..., 6:] += reg[..., 4:] - predictions_lists.append(predictions) - - if stage != self.refine_layers - 1: - priors = prediction_lines.detach().clone() - priors_on_featmap = priors.index_select( - 6 + self.sample_x_indexs, axis=-1) - - if self.training: - seg = None - seg_features = paddle.concat( - [ - F.interpolate( - feature, - size=[ - batch_features[-1].shape[2], - batch_features[-1].shape[3] - ], - mode='bilinear', - align_corners=False) for feature in batch_features - ], - axis=1) - - seg = self.seg_decoder(seg_features) - - output = {'predictions_lists': predictions_lists, 'seg': seg} - return self.loss(output, inputs) - return predictions_lists[-1] - - def predictions_to_pred(self, predictions): - """ - Convert predictions to internal Lane structure for evaluation. - """ - self.prior_ys = paddle.to_tensor(self.prior_ys) - self.prior_ys = self.prior_ys.astype('float64') - lanes = [] - for lane in predictions: - lane_xs = lane[6:].clone() - start = min( - max(0, int(round(lane[2].item() * self.n_strips))), - self.n_strips) - length = int(round(lane[5].item())) - end = start + length - 1 - end = min(end, len(self.prior_ys) - 1) - if start > 0: - mask = ((lane_xs[:start] >= 0.) 
&
-                        (lane_xs[:start] <= 1.)).cpu().detach().numpy()[::-1]
-                mask = ~((mask.cumprod()[::-1]).astype(bool))
-                lane_xs[:start][mask] = -2
-            if end < len(self.prior_ys) - 1:
-                lane_xs[end + 1:] = -2
-
-            lane_ys = self.prior_ys[lane_xs >= 0].clone()
-            lane_xs = lane_xs[lane_xs >= 0]
-            lane_xs = lane_xs.flip(axis=0).astype('float64')
-            lane_ys = lane_ys.flip(axis=0)
-
-            lane_ys = (lane_ys *
-                       (self.ori_img_h - self.cut_height) + self.cut_height
-                       ) / self.ori_img_h
-            if len(lane_xs) <= 1:
-                continue
-            points = paddle.stack(
-                x=(lane_xs.reshape([-1, 1]), lane_ys.reshape([-1, 1])),
-                axis=1).squeeze(axis=2)
-            lane = Lane(
-                points=points.cpu().numpy(),
-                metadata={
-                    'start_x': lane[3],
-                    'start_y': lane[2],
-                    'conf': lane[1]
-                })
-            lanes.append(lane)
-        return lanes
-
-    def lane_nms(self, predictions, scores, nms_overlap_thresh, top_k):
-        """
-        NMS for lane detection.
-        predictions: paddle.Tensor [num_lanes, conf, y, x, length, 72 offsets], e.g. [12, 77]
-        scores: paddle.Tensor [num_lanes]
-        nms_overlap_thresh: float
-        top_k: int
-        """
-        # sort by scores to get idx
-        idx = scores.argsort(descending=True)
-        keep = []
-
-        candidates = predictions.clone()
-        candidates = candidates.index_select(idx)
-
-        while len(candidates) > 0:
-            keep.append(idx[0])
-            if len(keep) >= top_k or len(candidates) == 1:
-                break
-
-            ious = []
-            for i in range(1, len(candidates)):
-                ious.append(1 - line_iou(
-                    candidates[i].unsqueeze(0),
-                    candidates[0].unsqueeze(0),
-                    img_w=self.img_w,
-                    length=15))
-            ious = paddle.to_tensor(ious)
-
-            mask = ious <= nms_overlap_thresh
-            id = paddle.where(mask == False)[0]
-
-            if id.shape[0] == 0:
-                break
-            candidates = candidates[1:].index_select(id)
-            idx = idx[1:].index_select(id)
-        keep = paddle.stack(keep)
-
-        return keep
-
-    def get_lanes(self, output, as_lanes=True):
-        """
-        Convert model output to lanes.
-        """
-        softmax = nn.Softmax(axis=1)
-        decoded = []
-
-        for predictions in output:
-            threshold = self.conf_threshold
-            scores = softmax(predictions[:, :2])[:, 1]
-            keep_inds = scores >= threshold
-            predictions = predictions[keep_inds]
-            scores = scores[keep_inds]
-
-            if predictions.shape[0] == 0:
-                decoded.append([])
-                continue
-            nms_predictions = predictions.detach().clone()
-            nms_predictions = paddle.concat(
-                x=[nms_predictions[..., :4], nms_predictions[..., 5:]], axis=-1)
-
-            nms_predictions[..., 4] = nms_predictions[..., 4] * self.n_strips
-            nms_predictions[..., 5:] = nms_predictions[..., 5:] * (
-                self.img_w - 1)
-
-            keep = self.lane_nms(
-                nms_predictions[..., 5:],
-                scores,
-                nms_overlap_thresh=self.nms_thres,
-                top_k=self.max_lanes)
-
-            predictions = predictions.index_select(keep)
-
-            if predictions.shape[0] == 0:
-                decoded.append([])
-                continue
-            predictions[:, 5] = paddle.round(predictions[:, 5] * self.n_strips)
-            if as_lanes:
-                pred = self.predictions_to_pred(predictions)
-            else:
-                pred = predictions
-            decoded.append(pred)
-        return decoded
diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/detr_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/detr_head.py
deleted file mode 100644
index d3c093f..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/heads/detr_head.py
+++ /dev/null
@@ -1,536 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
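# lane_nms above is ordinary greedy NMS with line IoU substituted for box IoU.
# The generic skeleton, written against a hypothetical pairwise iou_fn purely
# for illustration (not part of the deleted files):
import paddle

def greedy_nms(candidates, scores, iou_fn, thresh, top_k):
    # candidates: [N, D] tensor, scores: [N] tensor
    order = scores.argsort(descending=True).tolist()
    keep = []
    while order and len(keep) < top_k:
        best = order.pop(0)
        keep.append(best)
        # drop every remaining candidate that overlaps the kept one too much
        order = [i for i in order
                 if float(iou_fn(candidates[best], candidates[i])) <= thresh]
    return keep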
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -import pycocotools.mask as mask_util -from ..initializer import linear_init_, constant_ -from ..transformers.utils import inverse_sigmoid - -__all__ = ['DETRHead', 'DeformableDETRHead', 'DINOHead', 'MaskDINOHead'] - - -class MLP(nn.Layer): - """This code is based on - https://github.com/facebookresearch/detr/blob/main/models/detr.py - """ - - def __init__(self, input_dim, hidden_dim, output_dim, num_layers): - super().__init__() - self.num_layers = num_layers - h = [hidden_dim] * (num_layers - 1) - self.layers = nn.LayerList( - nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) - - self._reset_parameters() - - def _reset_parameters(self): - for l in self.layers: - linear_init_(l) - - def forward(self, x): - for i, layer in enumerate(self.layers): - x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) - return x - - -class MultiHeadAttentionMap(nn.Layer): - """This code is based on - https://github.com/facebookresearch/detr/blob/main/models/segmentation.py - - This is a 2D attention module, which only returns the attention softmax (no multiplication by value) - """ - - def __init__(self, query_dim, hidden_dim, num_heads, dropout=0.0, - bias=True): - super().__init__() - self.num_heads = num_heads - self.hidden_dim = hidden_dim - self.dropout = nn.Dropout(dropout) - - weight_attr = paddle.ParamAttr( - initializer=paddle.nn.initializer.XavierUniform()) - bias_attr = paddle.framework.ParamAttr( - initializer=paddle.nn.initializer.Constant()) if bias else False - - self.q_proj = nn.Linear(query_dim, hidden_dim, weight_attr, bias_attr) - self.k_proj = nn.Conv2D( - query_dim, - hidden_dim, - 1, - weight_attr=weight_attr, - bias_attr=bias_attr) - - self.normalize_fact = float(hidden_dim / self.num_heads)**-0.5 - - def forward(self, q, k, mask=None): - q = self.q_proj(q) - k = self.k_proj(k) - bs, num_queries, n, c, h, w = q.shape[0], q.shape[1], self.num_heads,\ - self.hidden_dim // self.num_heads, k.shape[-2], k.shape[-1] - qh = q.reshape([bs, num_queries, n, c]) - kh = k.reshape([bs, n, c, h, w]) - # weights = paddle.einsum("bqnc,bnchw->bqnhw", qh * self.normalize_fact, kh) - qh = qh.transpose([0, 2, 1, 3]).reshape([-1, num_queries, c]) - kh = kh.reshape([-1, c, h * w]) - weights = paddle.bmm(qh * self.normalize_fact, kh).reshape( - [bs, n, num_queries, h, w]).transpose([0, 2, 1, 3, 4]) - - if mask is not None: - weights += mask - # fix a potenial bug: https://github.com/facebookresearch/detr/issues/247 - weights = F.softmax(weights.flatten(3), axis=-1).reshape(weights.shape) - weights = self.dropout(weights) - return weights - - -class MaskHeadFPNConv(nn.Layer): - """This code is based on - https://github.com/facebookresearch/detr/blob/main/models/segmentation.py - - Simple convolutional head, using group norm. 
- Upsampling is done using a FPN approach - """ - - def __init__(self, input_dim, fpn_dims, context_dim, num_groups=8): - super().__init__() - - inter_dims = [input_dim, - ] + [context_dim // (2**i) for i in range(1, 5)] - weight_attr = paddle.ParamAttr( - initializer=paddle.nn.initializer.KaimingUniform()) - bias_attr = paddle.framework.ParamAttr( - initializer=paddle.nn.initializer.Constant()) - - self.conv0 = self._make_layers(input_dim, input_dim, 3, num_groups, - weight_attr, bias_attr) - self.conv_inter = nn.LayerList() - for in_dims, out_dims in zip(inter_dims[:-1], inter_dims[1:]): - self.conv_inter.append( - self._make_layers(in_dims, out_dims, 3, num_groups, weight_attr, - bias_attr)) - - self.conv_out = nn.Conv2D( - inter_dims[-1], - 1, - 3, - padding=1, - weight_attr=weight_attr, - bias_attr=bias_attr) - - self.adapter = nn.LayerList() - for i in range(len(fpn_dims)): - self.adapter.append( - nn.Conv2D( - fpn_dims[i], - inter_dims[i + 1], - 1, - weight_attr=weight_attr, - bias_attr=bias_attr)) - - def _make_layers(self, - in_dims, - out_dims, - kernel_size, - num_groups, - weight_attr=None, - bias_attr=None): - return nn.Sequential( - nn.Conv2D( - in_dims, - out_dims, - kernel_size, - padding=kernel_size // 2, - weight_attr=weight_attr, - bias_attr=bias_attr), - nn.GroupNorm(num_groups, out_dims), - nn.ReLU()) - - def forward(self, x, bbox_attention_map, fpns): - x = paddle.concat([ - x.tile([bbox_attention_map.shape[1], 1, 1, 1]), - bbox_attention_map.flatten(0, 1) - ], 1) - x = self.conv0(x) - for inter_layer, adapter_layer, feat in zip(self.conv_inter[:-1], - self.adapter, fpns): - feat = adapter_layer(feat).tile( - [bbox_attention_map.shape[1], 1, 1, 1]) - x = inter_layer(x) - x = feat + F.interpolate(x, size=feat.shape[-2:]) - - x = self.conv_inter[-1](x) - x = self.conv_out(x) - return x - - -@register -class DETRHead(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim', 'use_focal_loss'] - __inject__ = ['loss'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - nhead=8, - num_mlp_layers=3, - loss='DETRLoss', - fpn_dims=[1024, 512, 256], - with_mask_head=False, - use_focal_loss=False): - super(DETRHead, self).__init__() - # add background class - self.num_classes = num_classes if use_focal_loss else num_classes + 1 - self.hidden_dim = hidden_dim - self.loss = loss - self.with_mask_head = with_mask_head - self.use_focal_loss = use_focal_loss - - self.score_head = nn.Linear(hidden_dim, self.num_classes) - self.bbox_head = MLP(hidden_dim, - hidden_dim, - output_dim=4, - num_layers=num_mlp_layers) - if self.with_mask_head: - self.bbox_attention = MultiHeadAttentionMap(hidden_dim, hidden_dim, - nhead) - self.mask_head = MaskHeadFPNConv(hidden_dim + nhead, fpn_dims, - hidden_dim) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.score_head) - - @classmethod - def from_config(cls, cfg, hidden_dim, nhead, input_shape): - - return { - 'hidden_dim': hidden_dim, - 'nhead': nhead, - 'fpn_dims': [i.channels for i in input_shape[::-1]][1:] - } - - @staticmethod - def get_gt_mask_from_polygons(gt_poly, pad_mask): - out_gt_mask = [] - for polygons, padding in zip(gt_poly, pad_mask): - height, width = int(padding[:, 0].sum()), int(padding[0, :].sum()) - masks = [] - for obj_poly in polygons: - rles = mask_util.frPyObjects(obj_poly, height, width) - rle = mask_util.merge(rles) - masks.append( - paddle.to_tensor(mask_util.decode(rle)).astype('float32')) - masks = paddle.stack(masks) - masks_pad = paddle.zeros( - [masks.shape[0], pad_mask.shape[1], 
pad_mask.shape[2]]) - masks_pad[:, :height, :width] = masks - out_gt_mask.append(masks_pad) - return out_gt_mask - - def forward(self, out_transformer, body_feats, inputs=None): - r""" - Args: - out_transformer (Tuple): (feats: [num_levels, batch_size, - num_queries, hidden_dim], - memory: [batch_size, hidden_dim, h, w], - src_proj: [batch_size, h*w, hidden_dim], - src_mask: [batch_size, 1, 1, h, w]) - body_feats (List(Tensor)): list[[B, C, H, W]] - inputs (dict): dict(inputs) - """ - feats, memory, src_proj, src_mask = out_transformer - outputs_logit = self.score_head(feats) - outputs_bbox = F.sigmoid(self.bbox_head(feats)) - outputs_seg = None - if self.with_mask_head: - bbox_attention_map = self.bbox_attention(feats[-1], memory, - src_mask) - fpn_feats = [a for a in body_feats[::-1]][1:] - outputs_seg = self.mask_head(src_proj, bbox_attention_map, - fpn_feats) - outputs_seg = outputs_seg.reshape([ - feats.shape[1], feats.shape[2], outputs_seg.shape[-2], - outputs_seg.shape[-1] - ]) - - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - gt_mask = self.get_gt_mask_from_polygons( - inputs['gt_poly'], - inputs['pad_mask']) if 'gt_poly' in inputs else None - return self.loss( - outputs_bbox, - outputs_logit, - inputs['gt_bbox'], - inputs['gt_class'], - masks=outputs_seg, - gt_mask=gt_mask) - else: - return (outputs_bbox[-1], outputs_logit[-1], outputs_seg) - - -@register -class DeformableDETRHead(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - __inject__ = ['loss'] - - def __init__(self, - num_classes=80, - hidden_dim=512, - nhead=8, - num_mlp_layers=3, - loss='DETRLoss'): - super(DeformableDETRHead, self).__init__() - self.num_classes = num_classes - self.hidden_dim = hidden_dim - self.nhead = nhead - self.loss = loss - - self.score_head = nn.Linear(hidden_dim, self.num_classes) - self.bbox_head = MLP(hidden_dim, - hidden_dim, - output_dim=4, - num_layers=num_mlp_layers) - - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.score_head) - constant_(self.score_head.bias, -4.595) - constant_(self.bbox_head.layers[-1].weight) - - with paddle.no_grad(): - bias = paddle.zeros_like(self.bbox_head.layers[-1].bias) - bias[2:] = -2.0 - self.bbox_head.layers[-1].bias.set_value(bias) - - @classmethod - def from_config(cls, cfg, hidden_dim, nhead, input_shape): - return {'hidden_dim': hidden_dim, 'nhead': nhead} - - def forward(self, out_transformer, body_feats, inputs=None): - r""" - Args: - out_transformer (Tuple): (feats: [num_levels, batch_size, - num_queries, hidden_dim], - memory: [batch_size, - \sum_{l=0}^{L-1} H_l \cdot W_l, hidden_dim], - reference_points: [batch_size, num_queries, 2]) - body_feats (List(Tensor)): list[[B, C, H, W]] - inputs (dict): dict(inputs) - """ - feats, memory, reference_points = out_transformer - reference_points = inverse_sigmoid(reference_points.unsqueeze(0)) - outputs_bbox = self.bbox_head(feats) - - # It's equivalent to "outputs_bbox[:, :, :, :2] += reference_points", - # but the gradient is wrong in paddle. 
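# i.e. instead of updating a slice in place, a fresh tensor is assembled
# out-of-place so autograd tracks the addition correctly. A minimal
# equivalent with assumed shapes (not part of the deleted files):
import paddle
import paddle.nn.functional as F

ref = paddle.rand([1, 2, 4, 2])  # stands in for inverse-sigmoid ref points
box = paddle.rand([1, 2, 4, 4])  # stands in for the raw bbox_head output
box = paddle.concat([box[..., :2] + ref, box[..., 2:]], axis=-1)
box = F.sigmoid(box)             # back to normalized box parameters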
- outputs_bbox = paddle.concat( - [ - outputs_bbox[:, :, :, :2] + reference_points, - outputs_bbox[:, :, :, 2:] - ], - axis=-1) - - outputs_bbox = F.sigmoid(outputs_bbox) - outputs_logit = self.score_head(feats) - - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - - return self.loss(outputs_bbox, outputs_logit, inputs['gt_bbox'], - inputs['gt_class']) - else: - return (outputs_bbox[-1], outputs_logit[-1], None) - - -@register -class DINOHead(nn.Layer): - __inject__ = ['loss'] - - def __init__(self, loss='DINOLoss', eval_idx=-1): - super(DINOHead, self).__init__() - self.loss = loss - self.eval_idx = eval_idx - - def forward(self, out_transformer, body_feats, inputs=None): - (dec_out_bboxes, dec_out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) = out_transformer - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - - if dn_meta is not None: - if isinstance(dn_meta, list): - dual_groups = len(dn_meta) - 1 - dec_out_bboxes = paddle.split( - dec_out_bboxes, dual_groups + 1, axis=2) - dec_out_logits = paddle.split( - dec_out_logits, dual_groups + 1, axis=2) - enc_topk_bboxes = paddle.split( - enc_topk_bboxes, dual_groups + 1, axis=1) - enc_topk_logits = paddle.split( - enc_topk_logits, dual_groups + 1, axis=1) - - dec_out_bboxes_list = [] - dec_out_logits_list = [] - dn_out_bboxes_list = [] - dn_out_logits_list = [] - loss = {} - for g_id in range(dual_groups + 1): - if dn_meta[g_id] is not None: - dn_out_bboxes_gid, dec_out_bboxes_gid = paddle.split( - dec_out_bboxes[g_id], - dn_meta[g_id]['dn_num_split'], - axis=2) - dn_out_logits_gid, dec_out_logits_gid = paddle.split( - dec_out_logits[g_id], - dn_meta[g_id]['dn_num_split'], - axis=2) - else: - dn_out_bboxes_gid, dn_out_logits_gid = None, None - dec_out_bboxes_gid = dec_out_bboxes[g_id] - dec_out_logits_gid = dec_out_logits[g_id] - out_bboxes_gid = paddle.concat([ - enc_topk_bboxes[g_id].unsqueeze(0), - dec_out_bboxes_gid - ]) - out_logits_gid = paddle.concat([ - enc_topk_logits[g_id].unsqueeze(0), - dec_out_logits_gid - ]) - loss_gid = self.loss( - out_bboxes_gid, - out_logits_gid, - inputs['gt_bbox'], - inputs['gt_class'], - dn_out_bboxes=dn_out_bboxes_gid, - dn_out_logits=dn_out_logits_gid, - dn_meta=dn_meta[g_id]) - # sum loss - for key, value in loss_gid.items(): - loss.update({ - key: loss.get(key, paddle.zeros([1])) + value - }) - - # average across (dual_groups + 1) - for key, value in loss.items(): - loss.update({key: value / (dual_groups + 1)}) - return loss - else: - dn_out_bboxes, dec_out_bboxes = paddle.split( - dec_out_bboxes, dn_meta['dn_num_split'], axis=2) - dn_out_logits, dec_out_logits = paddle.split( - dec_out_logits, dn_meta['dn_num_split'], axis=2) - else: - dn_out_bboxes, dn_out_logits = None, None - - out_bboxes = paddle.concat( - [enc_topk_bboxes.unsqueeze(0), dec_out_bboxes]) - out_logits = paddle.concat( - [enc_topk_logits.unsqueeze(0), dec_out_logits]) - - return self.loss( - out_bboxes, - out_logits, - inputs['gt_bbox'], - inputs['gt_class'], - dn_out_bboxes=dn_out_bboxes, - dn_out_logits=dn_out_logits, - dn_meta=dn_meta, - gt_score=inputs.get('gt_score', None)) - else: - return (dec_out_bboxes[self.eval_idx], - dec_out_logits[self.eval_idx], None) - - -@register -class MaskDINOHead(nn.Layer): - __inject__ = ['loss'] - - def __init__(self, loss='DINOLoss'): - super(MaskDINOHead, self).__init__() - self.loss = loss - - def forward(self, out_transformer, body_feats, inputs=None): - (dec_out_logits, 
dec_out_bboxes, dec_out_masks, enc_out, init_out, - dn_meta) = out_transformer - if self.training: - assert inputs is not None - assert 'gt_bbox' in inputs and 'gt_class' in inputs - assert 'gt_segm' in inputs - - if dn_meta is not None: - dn_out_logits, dec_out_logits = paddle.split( - dec_out_logits, dn_meta['dn_num_split'], axis=2) - dn_out_bboxes, dec_out_bboxes = paddle.split( - dec_out_bboxes, dn_meta['dn_num_split'], axis=2) - dn_out_masks, dec_out_masks = paddle.split( - dec_out_masks, dn_meta['dn_num_split'], axis=2) - if init_out is not None: - init_out_logits, init_out_bboxes, init_out_masks = init_out - init_out_logits_dn, init_out_logits = paddle.split( - init_out_logits, dn_meta['dn_num_split'], axis=1) - init_out_bboxes_dn, init_out_bboxes = paddle.split( - init_out_bboxes, dn_meta['dn_num_split'], axis=1) - init_out_masks_dn, init_out_masks = paddle.split( - init_out_masks, dn_meta['dn_num_split'], axis=1) - - dec_out_logits = paddle.concat( - [init_out_logits.unsqueeze(0), dec_out_logits]) - dec_out_bboxes = paddle.concat( - [init_out_bboxes.unsqueeze(0), dec_out_bboxes]) - dec_out_masks = paddle.concat( - [init_out_masks.unsqueeze(0), dec_out_masks]) - - dn_out_logits = paddle.concat( - [init_out_logits_dn.unsqueeze(0), dn_out_logits]) - dn_out_bboxes = paddle.concat( - [init_out_bboxes_dn.unsqueeze(0), dn_out_bboxes]) - dn_out_masks = paddle.concat( - [init_out_masks_dn.unsqueeze(0), dn_out_masks]) - else: - dn_out_bboxes, dn_out_logits = None, None - dn_out_masks = None - - enc_out_logits, enc_out_bboxes, enc_out_masks = enc_out - out_logits = paddle.concat( - [enc_out_logits.unsqueeze(0), dec_out_logits]) - out_bboxes = paddle.concat( - [enc_out_bboxes.unsqueeze(0), dec_out_bboxes]) - out_masks = paddle.concat( - [enc_out_masks.unsqueeze(0), dec_out_masks]) - - return self.loss( - out_bboxes, - out_logits, - inputs['gt_bbox'], - inputs['gt_class'], - masks=out_masks, - gt_mask=inputs['gt_segm'], - dn_out_logits=dn_out_logits, - dn_out_bboxes=dn_out_bboxes, - dn_out_masks=dn_out_masks, - dn_meta=dn_meta) - else: - return (dec_out_bboxes[-1], dec_out_logits[-1], dec_out_masks[-1]) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/face_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/face_head.py deleted file mode 100644 index 360f909..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/face_head.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register -from ..layers import AnchorGeneratorSSD -from ..cls_utils import _get_class_default_kwargs - - -@register -class FaceHead(nn.Layer): - """ - Head block for Face detection network - - Args: - num_classes (int): Number of output classes. - in_channels (int): Number of input channels. - anchor_generator(object): instance of anchor genertor method. - kernel_size (int): kernel size of Conv2D in FaceHead. 
-        padding (int): padding of Conv2D in FaceHead.
-        conv_decay (float): weight decay for conv layer weights.
-        loss (object): loss of face detection model.
-    """
-    __shared__ = ['num_classes']
-    __inject__ = ['anchor_generator', 'loss']
-
-    def __init__(self,
-                 num_classes=80,
-                 in_channels=[96, 96],
-                 anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
-                 kernel_size=3,
-                 padding=1,
-                 conv_decay=0.,
-                 loss='SSDLoss'):
-        super(FaceHead, self).__init__()
-        # add background class
-        self.num_classes = num_classes + 1
-        self.in_channels = in_channels
-        self.anchor_generator = anchor_generator
-        self.loss = loss
-
-        if isinstance(anchor_generator, dict):
-            self.anchor_generator = AnchorGeneratorSSD(**anchor_generator)
-
-        self.num_priors = self.anchor_generator.num_priors
-        self.box_convs = []
-        self.score_convs = []
-        for i, num_prior in enumerate(self.num_priors):
-            box_conv_name = "boxes{}".format(i)
-            box_conv = self.add_sublayer(
-                box_conv_name,
-                nn.Conv2D(
-                    in_channels=self.in_channels[i],
-                    out_channels=num_prior * 4,
-                    kernel_size=kernel_size,
-                    padding=padding))
-            self.box_convs.append(box_conv)
-
-            score_conv_name = "scores{}".format(i)
-            score_conv = self.add_sublayer(
-                score_conv_name,
-                nn.Conv2D(
-                    in_channels=self.in_channels[i],
-                    out_channels=num_prior * self.num_classes,
-                    kernel_size=kernel_size,
-                    padding=padding))
-            self.score_convs.append(score_conv)
-
-    @classmethod
-    def from_config(cls, cfg, input_shape):
-        return {'in_channels': [i.channels for i in input_shape], }
-
-    def forward(self, feats, image, gt_bbox=None, gt_class=None):
-        box_preds = []
-        cls_scores = []
-        prior_boxes = []
-        for feat, box_conv, score_conv in zip(feats, self.box_convs,
-                                              self.score_convs):
-            box_pred = box_conv(feat)
-            box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])
-            box_pred = paddle.reshape(box_pred, [0, -1, 4])
-            box_preds.append(box_pred)
-
-            cls_score = score_conv(feat)
-            cls_score = paddle.transpose(cls_score, [0, 2, 3, 1])
-            cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes])
-            cls_scores.append(cls_score)
-
-        prior_boxes = self.anchor_generator(feats, image)
-
-        if self.training:
-            return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class,
-                                 prior_boxes)
-        else:
-            return (box_preds, cls_scores), prior_boxes
-
-    def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):
-        return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes)
diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/fcos_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/fcos_head.py
deleted file mode 100644
index f975789..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/heads/fcos_head.py
+++ /dev/null
@@ -1,499 +0,0 @@
-# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
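# The FaceHead forward above flattens each level's conv output into
# per-anchor rows via a NCHW -> NHWC transpose plus reshape; a 0 in the
# paddle.reshape target shape keeps that dimension unchanged. Toy check with
# assumed sizes (not part of the deleted files):
import paddle

num_prior = 3
box_pred = paddle.randn([2, num_prior * 4, 10, 10])  # [N, C, H, W]
box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])  # [N, H, W, C]
box_pred = paddle.reshape(box_pred, [0, -1, 4])      # -> [2, 300, 4]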
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.core.workspace import register -from ppdet.modeling.layers import ConvNormLayer, MultiClassNMS - -__all__ = ['FCOSFeat', 'FCOSHead', 'FCOSHead_ARSL'] - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self): - super(ScaleReg, self).__init__() - self.scale_reg = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=1.)), - dtype="float32") - - def forward(self, inputs): - out = inputs * self.scale_reg - return out - - -@register -class FCOSFeat(nn.Layer): - """ - FCOSFeat of FCOS - - Args: - feat_in (int): The channel number of input Tensor. - feat_out (int): The channel number of output Tensor. - num_convs (int): The convolution number of the FCOSFeat. - norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. - use_dcn (bool): Whether to use dcn in tower or not. - """ - - def __init__(self, - feat_in=256, - feat_out=256, - num_convs=4, - norm_type='bn', - use_dcn=False): - super(FCOSFeat, self).__init__() - self.feat_in = feat_in - self.feat_out = feat_out - self.num_convs = num_convs - self.norm_type = norm_type - self.cls_subnet_convs = [] - self.reg_subnet_convs = [] - for i in range(self.num_convs): - in_c = feat_in if i == 0 else feat_out - - cls_conv_name = 'fcos_head_cls_tower_conv_{}'.format(i) - cls_conv = self.add_sublayer( - cls_conv_name, - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=3, - stride=1, - norm_type=norm_type, - use_dcn=use_dcn, - bias_on=True, - lr_scale=2.)) - self.cls_subnet_convs.append(cls_conv) - - reg_conv_name = 'fcos_head_reg_tower_conv_{}'.format(i) - reg_conv = self.add_sublayer( - reg_conv_name, - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=3, - stride=1, - norm_type=norm_type, - use_dcn=use_dcn, - bias_on=True, - lr_scale=2.)) - self.reg_subnet_convs.append(reg_conv) - - def forward(self, fpn_feat): - cls_feat = fpn_feat - reg_feat = fpn_feat - for i in range(self.num_convs): - cls_feat = F.relu(self.cls_subnet_convs[i](cls_feat)) - reg_feat = F.relu(self.reg_subnet_convs[i](reg_feat)) - return cls_feat, reg_feat - - -@register -class FCOSHead(nn.Layer): - """ - FCOSHead - Args: - num_classes (int): Number of classes - fcos_feat (object): Instance of 'FCOSFeat' - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - norm_reg_targets (bool): Normalization the regression target if true - centerness_on_reg (bool): The prediction of centerness on regression or clssification branch - num_shift (float): Relative offset between the center of the first shift and the top-left corner of img - fcos_loss (object): Instance of 'FCOSLoss' - nms (object): Instance of 'MultiClassNMS' - trt (bool): Whether to use trt in nms of deploy - """ - __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] - __shared__ = ['num_classes', 'trt'] - - def __init__(self, - num_classes=80, - fcos_feat='FCOSFeat', - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - multiply_strides_reg_targets=False, - norm_reg_targets=True, - centerness_on_reg=True, - num_shift=0.5, - sqrt_score=False, - fcos_loss='FCOSLoss', - nms='MultiClassNMS', - trt=False): - super(FCOSHead, self).__init__() - self.fcos_feat = 
fcos_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.fcos_loss = fcos_loss - self.norm_reg_targets = norm_reg_targets - self.centerness_on_reg = centerness_on_reg - self.multiply_strides_reg_targets = multiply_strides_reg_targets - self.num_shift = num_shift - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.sqrt_score = sqrt_score - self.is_teacher = False - - conv_cls_name = "fcos_head_cls" - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - self.fcos_head_cls = self.add_sublayer( - conv_cls_name, - nn.Conv2D( - in_channels=256, - out_channels=self.num_classes, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - - conv_reg_name = "fcos_head_reg" - self.fcos_head_reg = self.add_sublayer( - conv_reg_name, - nn.Conv2D( - in_channels=256, - out_channels=4, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - conv_centerness_name = "fcos_head_centerness" - self.fcos_head_centerness = self.add_sublayer( - conv_centerness_name, - nn.Conv2D( - in_channels=256, - out_channels=1, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.scales_regs = [] - for i in range(len(self.fpn_stride)): - lvl = int(math.log(int(self.fpn_stride[i]), 2)) - feat_name = 'p{}_feat'.format(lvl) - scale_reg = self.add_sublayer(feat_name, ScaleReg()) - self.scales_regs.append(scale_reg) - - def _compute_locations_by_level(self, fpn_stride, feature, num_shift=0.5): - """ - Compute locations of anchor points of each FPN layer - Args: - fpn_stride (int): The stride of current FPN feature map - feature (Tensor): Tensor of current FPN feature map - Return: - Anchor points locations of current FPN feature map - """ - h, w = feature.shape[2], feature.shape[3] - shift_x = paddle.arange(0, w * fpn_stride, fpn_stride) - shift_y = paddle.arange(0, h * fpn_stride, fpn_stride) - shift_x = paddle.unsqueeze(shift_x, axis=0) - shift_y = paddle.unsqueeze(shift_y, axis=1) - shift_x = paddle.expand(shift_x, shape=[h, w]) - shift_y = paddle.expand(shift_y, shape=[h, w]) - - shift_x = paddle.reshape(shift_x, shape=[-1]) - shift_y = paddle.reshape(shift_y, shape=[-1]) - location = paddle.stack( - [shift_x, shift_y], axis=-1) + float(fpn_stride * num_shift) - return location - - def forward(self, fpn_feats, targets=None): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - centerness_list = [] - for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, - self.fpn_stride, fpn_feats): - fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) - cls_logits = self.fcos_head_cls(fcos_cls_feat) - bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) - if self.centerness_on_reg: - centerness = self.fcos_head_centerness(fcos_reg_feat) - else: - centerness = self.fcos_head_centerness(fcos_cls_feat) - if self.norm_reg_targets: - bbox_reg = F.relu(bbox_reg) - if self.multiply_strides_reg_targets: - bbox_reg = bbox_reg * fpn_stride - else: - if not self.training or targets.get( - 'get_data', - False) or targets.get('is_teacher', False): - bbox_reg = bbox_reg * 
fpn_stride - else: - bbox_reg = paddle.exp(bbox_reg) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) - centerness_list.append(centerness) - - if targets is not None: - self.is_teacher = targets.get('is_teacher', False) - if self.is_teacher: - return [cls_logits_list, bboxes_reg_list, centerness_list] - - if self.training and targets is not None: - get_data = targets.get('get_data', False) - if get_data: - return [cls_logits_list, bboxes_reg_list, centerness_list] - - losses = {} - fcos_head_outs = [cls_logits_list, bboxes_reg_list, centerness_list] - losses_fcos = self.get_loss(fcos_head_outs, targets) - losses.update(losses_fcos) - - total_loss = paddle.add_n(list(losses.values())) - losses.update({'loss': total_loss}) - return losses - else: - # eval or infer - locations_list = [] - for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): - location = self._compute_locations_by_level(fpn_stride, feature, - self.num_shift) - locations_list.append(location) - - fcos_head_outs = [ - locations_list, cls_logits_list, bboxes_reg_list, - centerness_list - ] - return fcos_head_outs - - def get_loss(self, fcos_head_outs, targets): - cls_logits, bboxes_reg, centerness = fcos_head_outs - - # get labels,reg_target,centerness - tag_labels, tag_bboxes, tag_centerness = [], [], [] - for i in range(len(self.fpn_stride)): - k_lbl = 'labels{}'.format(i) - if k_lbl in targets: - tag_labels.append(targets[k_lbl]) - k_box = 'reg_target{}'.format(i) - if k_box in targets: - tag_bboxes.append(targets[k_box]) - k_ctn = 'centerness{}'.format(i) - if k_ctn in targets: - tag_centerness.append(targets[k_ctn]) - - losses_fcos = self.fcos_loss(cls_logits, bboxes_reg, centerness, - tag_labels, tag_bboxes, tag_centerness) - return losses_fcos - - def _post_process_by_level(self, - locations, - box_cls, - box_reg, - box_ctn, - sqrt_score=False): - box_scores = F.sigmoid(box_cls).flatten(2).transpose([0, 2, 1]) - box_centerness = F.sigmoid(box_ctn).flatten(2).transpose([0, 2, 1]) - pred_scores = box_scores * box_centerness - if sqrt_score: - pred_scores = paddle.sqrt(pred_scores) - - box_reg_ch_last = box_reg.flatten(2).transpose([0, 2, 1]) - box_reg_decoding = paddle.stack( - [ - locations[:, 0] - box_reg_ch_last[:, :, 0], - locations[:, 1] - box_reg_ch_last[:, :, 1], - locations[:, 0] + box_reg_ch_last[:, :, 2], - locations[:, 1] + box_reg_ch_last[:, :, 3] - ], - axis=1) - pred_boxes = box_reg_decoding.transpose([0, 2, 1]) - - return pred_scores, pred_boxes - - def post_process(self, fcos_head_outs, scale_factor): - locations, cls_logits, bboxes_reg, centerness = fcos_head_outs - pred_bboxes, pred_scores = [], [] - - for pts, cls, reg, ctn in zip(locations, cls_logits, bboxes_reg, - centerness): - scores, boxes = self._post_process_by_level(pts, cls, reg, ctn, - self.sqrt_score) - pred_scores.append(scores) - pred_bboxes.append(boxes) - pred_bboxes = paddle.concat(pred_bboxes, axis=1) - pred_scores = paddle.concat(pred_scores, axis=1) - - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], axis=-1).reshape([-1, 1, 4]) - pred_bboxes /= scale_factor - - pred_scores = pred_scores.transpose([0, 2, 1]) - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num - - -@register -class FCOSHead_ARSL(FCOSHead): - """ - FCOSHead of ARSL for semi-det(ssod) - Args: - fcos_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The 
stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - fcos_loss (object): Instance of 'FCOSLoss' - norm_reg_targets (bool): Normalize the regression target if true - centerness_on_reg (bool): Whether to predict centerness on the regression or classification branch - nms (object): Instance of 'MultiClassNMS' - trt (bool): Whether to use trt in nms of deploy - """ - __inject__ = ['fcos_feat', 'fcos_loss', 'nms'] - __shared__ = ['num_classes', 'trt'] - - def __init__(self, - num_classes=80, - fcos_feat='FCOSFeat', - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - multiply_strides_reg_targets=False, - norm_reg_targets=True, - centerness_on_reg=True, - num_shift=0.5, - sqrt_score=False, - fcos_loss='FCOSLossMILC', - nms='MultiClassNMS', - trt=False): - super(FCOSHead_ARSL, self).__init__() - self.fcos_feat = fcos_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.fcos_loss = fcos_loss - self.norm_reg_targets = norm_reg_targets - self.centerness_on_reg = centerness_on_reg - self.multiply_strides_reg_targets = multiply_strides_reg_targets - self.num_shift = num_shift - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.sqrt_score = sqrt_score - - conv_cls_name = "fcos_head_cls" - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - self.fcos_head_cls = self.add_sublayer( - conv_cls_name, - nn.Conv2D( - in_channels=256, - out_channels=self.num_classes, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - - conv_reg_name = "fcos_head_reg" - self.fcos_head_reg = self.add_sublayer( - conv_reg_name, - nn.Conv2D( - in_channels=256, - out_channels=4, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - conv_centerness_name = "fcos_head_centerness" - self.fcos_head_centerness = self.add_sublayer( - conv_centerness_name, - nn.Conv2D( - in_channels=256, - out_channels=1, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.scales_regs = [] - for i in range(len(self.fpn_stride)): - lvl = int(math.log(int(self.fpn_stride[i]), 2)) - feat_name = 'p{}_feat'.format(lvl) - scale_reg = self.add_sublayer(feat_name, ScaleReg()) - self.scales_regs.append(scale_reg) - - def forward(self, fpn_feats, targets=None): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - centerness_list = [] - for scale_reg, fpn_stride, fpn_feat in zip(self.scales_regs, - self.fpn_stride, fpn_feats): - fcos_cls_feat, fcos_reg_feat = self.fcos_feat(fpn_feat) - cls_logits = self.fcos_head_cls(fcos_cls_feat) - bbox_reg = scale_reg(self.fcos_head_reg(fcos_reg_feat)) - if self.centerness_on_reg: - centerness = self.fcos_head_centerness(fcos_reg_feat) - else: - centerness = self.fcos_head_centerness(fcos_cls_feat) - if self.norm_reg_targets: - bbox_reg = F.relu(bbox_reg) - if not self.training: - bbox_reg = bbox_reg * fpn_stride - else: - bbox_reg = paddle.exp(bbox_reg) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) - centerness_list.append(centerness) - - if not self.training: -
locations_list = [] - for fpn_stride, feature in zip(self.fpn_stride, fpn_feats): - location = self._compute_locations_by_level(fpn_stride, feature) - locations_list.append(location) - - return locations_list, cls_logits_list, bboxes_reg_list, centerness_list - else: - return cls_logits_list, bboxes_reg_list, centerness_list - - def get_loss(self, fcos_head_outs, tag_labels, tag_bboxes, tag_centerness): - cls_logits, bboxes_reg, centerness = fcos_head_outs - return self.fcos_loss(cls_logits, bboxes_reg, centerness, tag_labels, - tag_bboxes, tag_centerness) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/fcosr_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/fcosr_head.py deleted file mode 100644 index df98883..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/fcosr_head.py +++ /dev/null @@ -1,396 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from .fcos_head import ScaleReg -from ..initializer import bias_init_with_prob, constant_, normal_ -from ..ops import get_act_fn, anchor_generator -from ..rbox_utils import box2corners -from ..losses import ProbIoULoss -import numpy as np - -__all__ = ['FCOSRHead'] - - -def trunc_div(a, b): - ipt = paddle.divide(a, b) - sign_ipt = paddle.sign(ipt) - abs_ipt = paddle.abs(ipt) - abs_ipt = paddle.floor(abs_ipt) - out = paddle.multiply(sign_ipt, abs_ipt) - return out - - -def fmod(a, b): - return a - trunc_div(a, b) * b - - -def fmod_eval(a, b): - return a - a.divide(b).cast(paddle.int32).cast(paddle.float32) * b - - -class ConvBNLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size=3, - stride=1, - groups=1, - padding=0, - norm_cfg={'name': 'gn', - 'num_groups': 32}, - act=None): - super(ConvBNLayer, self).__init__() - - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - bias_attr=False) - - norm_type = norm_cfg['name'] - if norm_type in ['sync_bn', 'bn']: - self.norm = nn.BatchNorm2D( - ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - else: - groups = norm_cfg.get('num_groups', 1) - self.norm = nn.GroupNorm( - num_groups=groups, - num_channels=ch_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.act = get_act_fn(act) if act is None or isinstance(act, ( - str, dict)) else act - - def forward(self, x): - x = self.conv(x) - x = self.norm(x) - x = self.act(x) - - return x - - -@register -class FCOSRHead(nn.Layer): - """ FCOSR Head, refer to https://arxiv.org/abs/2111.10780 for details """ - - __shared__ = ['num_classes', 'trt'] - __inject__ = ['assigner', 'nms'] - - def __init__(self, - num_classes=15, - in_channels=256, - feat_channels=256, - 
stacked_convs=4, - act='relu', - fpn_strides=[4, 8, 16, 32, 64], - trt=False, - loss_weight={'class': 1.0, - 'probiou': 1.0}, - norm_cfg={'name': 'gn', - 'num_groups': 32}, - assigner='FCOSRAssigner', - nms='MultiClassNMS'): - - super(FCOSRHead, self).__init__() - self.in_channels = in_channels - self.num_classes = num_classes - self.fpn_strides = fpn_strides - self.stacked_convs = stacked_convs - self.loss_weight = loss_weight - self.half_pi = paddle.to_tensor( - [1.5707963267948966], dtype=paddle.float32) - self.probiou_loss = ProbIoULoss(mode='l1') - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.trt = trt - self.loss_weight = loss_weight - self.assigner = assigner - self.nms = nms - # stem - self.stem_cls = nn.LayerList() - self.stem_reg = nn.LayerList() - for i in range(self.stacked_convs): - self.stem_cls.append( - ConvBNLayer( - self.in_channels[i], - feat_channels, - filter_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act=act)) - self.stem_reg.append( - ConvBNLayer( - self.in_channels[i], - feat_channels, - filter_size=3, - stride=1, - padding=1, - norm_cfg=norm_cfg, - act=act)) - - self.scales = nn.LayerList( - [ScaleReg() for _ in range(len(fpn_strides))]) - - # prediction - self.pred_cls = nn.Conv2D(feat_channels, self.num_classes, 3, padding=1) - - self.pred_xy = nn.Conv2D(feat_channels, 2, 3, padding=1) - - self.pred_wh = nn.Conv2D(feat_channels, 2, 3, padding=1) - - self.pred_angle = nn.Conv2D(feat_channels, 1, 3, padding=1) - - self._init_weights() - - def _init_weights(self): - for cls_, reg_ in zip(self.stem_cls, self.stem_reg): - normal_(cls_.conv.weight, std=0.01) - normal_(reg_.conv.weight, std=0.01) - - bias_cls = bias_init_with_prob(0.01) - normal_(self.pred_cls.weight, std=0.01) - constant_(self.pred_cls.bias, bias_cls) - normal_(self.pred_xy.weight, std=0.01) - normal_(self.pred_wh.weight, std=0.01) - normal_(self.pred_angle.weight, std=0.01) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _generate_anchors(self, feats): - if self.trt: - anchor_points = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - anchor, _ = anchor_generator( - feat, - stride * 4, - 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], - offset=0.5) - x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) - xc = (x1 + x2 + 1) / 2 - yc = (y1 + y2 + 1) / 2 - anchor_point = paddle.concat( - [xc, yc], axis=-1).reshape((1, h * w, 2)) - anchor_points.append(anchor_point) - anchor_points = paddle.concat(anchor_points, axis=1) - return anchor_points, None, None - else: - anchor_points = [] - stride_tensor = [] - num_anchors_list = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - shift_x = (paddle.arange(end=w) + 0.5) * stride - shift_y = (paddle.arange(end=h) + 0.5) * stride - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([1, -1, 2])) - stride_tensor.append( - paddle.full( - [1, h * w, 1], stride, dtype='float32')) - num_anchors_list.append(h * w) - anchor_points = paddle.concat(anchor_points, axis=1) - stride_tensor = paddle.concat(stride_tensor, axis=1) - return anchor_points, stride_tensor, num_anchors_list - - def forward(self, feats, target=None): - if self.training: - return self.forward_train(feats, target) - else: - return 
self.forward_eval(feats, target) - - def forward_train(self, feats, target=None): - anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( - feats) - cls_pred_list, reg_pred_list = [], [] - for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): - # cls - cls_feat = feat - for cls_layer in self.stem_cls: - cls_feat = cls_layer(cls_feat) - cls_pred = F.sigmoid(self.pred_cls(cls_feat)) - cls_pred_list.append(cls_pred.flatten(2).transpose((0, 2, 1))) - # reg - reg_feat = feat - for reg_layer in self.stem_reg: - reg_feat = reg_layer(reg_feat) - - reg_xy = scale(self.pred_xy(reg_feat)) * stride - reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) * stride - reg_angle = self.pred_angle(reg_feat) - reg_angle = fmod(reg_angle, self.half_pi) - reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) - reg_pred_list.append(reg_pred.flatten(2).transpose((0, 2, 1))) - - cls_pred_list = paddle.concat(cls_pred_list, axis=1) - reg_pred_list = paddle.concat(reg_pred_list, axis=1) - - return self.get_loss([ - cls_pred_list, reg_pred_list, anchor_points, stride_tensor, - num_anchors_list - ], target) - - def forward_eval(self, feats, target=None): - cls_pred_list, reg_pred_list = [], [] - anchor_points, _, _ = self._generate_anchors(feats) - for stride, feat, scale in zip(self.fpn_strides, feats, self.scales): - b, _, h, w = paddle.shape(feat) - # cls - cls_feat = feat - for cls_layer in self.stem_cls: - cls_feat = cls_layer(cls_feat) - cls_pred = F.sigmoid(self.pred_cls(cls_feat)) - cls_pred_list.append(cls_pred.reshape([b, self.num_classes, h * w])) - # reg - reg_feat = feat - for reg_layer in self.stem_reg: - reg_feat = reg_layer(reg_feat) - - reg_xy = scale(self.pred_xy(reg_feat)) * stride - reg_wh = F.elu(scale(self.pred_wh(reg_feat)) + 1.) 
* stride - reg_angle = self.pred_angle(reg_feat) - reg_angle = fmod_eval(reg_angle, self.half_pi) - reg_pred = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) - reg_pred = reg_pred.reshape([b, 5, h * w]).transpose((0, 2, 1)) - reg_pred_list.append(reg_pred) - - cls_pred_list = paddle.concat(cls_pred_list, axis=2) - reg_pred_list = paddle.concat(reg_pred_list, axis=1) - reg_pred_list = self._bbox_decode(anchor_points, reg_pred_list) - return cls_pred_list, reg_pred_list - - def _bbox_decode(self, points, reg_pred_list): - xy, wha = paddle.split(reg_pred_list, [2, 3], axis=-1) - xy = xy + points - return paddle.concat([xy, wha], axis=-1) - - def _box2corners(self, pred_bboxes): - """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) - - Args: - pred_bboxes (Tensor): [B, N, 5] - - Returns: - polys (Tensor): [B, N, 8] - """ - x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) - cos_a_half = paddle.cos(angle) * 0.5 - sin_a_half = paddle.sin(angle) * 0.5 - w_x = cos_a_half * w - w_y = sin_a_half * w - h_x = -sin_a_half * h - h_y = cos_a_half * h - return paddle.concat( - [ - x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, - x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y - ], - axis=-1) - - def get_loss(self, head_outs, gt_meta): - cls_pred_list, reg_pred_list, anchor_points, stride_tensor, num_anchors_list = head_outs - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - gt_rboxes = gt_meta['gt_rbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # decode - pred_rboxes = self._bbox_decode(anchor_points, reg_pred_list) - # label assignment - assigned_labels, assigned_rboxes, assigned_scores = \ - self.assigner( - anchor_points, - stride_tensor, - num_anchors_list, - gt_labels, - gt_bboxes, - gt_rboxes, - pad_gt_mask, - self.num_classes, - pred_rboxes - ) - - # reg_loss - mask_positive = (assigned_labels != self.num_classes) - num_pos = mask_positive.sum().item() - if num_pos > 0: - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) - pred_rboxes_pos = paddle.masked_select(pred_rboxes, - bbox_mask).reshape([-1, 5]) - assigned_rboxes_pos = paddle.masked_select( - assigned_rboxes, bbox_mask).reshape([-1, 5]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).reshape([-1]) - avg_factor = bbox_weight.sum() - loss_probiou = self.probiou_loss(pred_rboxes_pos, - assigned_rboxes_pos) - loss_probiou = paddle.sum(loss_probiou * bbox_weight) / avg_factor - else: - loss_probiou = pred_rboxes.sum() * 0. 
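        # A minimal, self-contained sketch of the quality-focal weighting that
        # `_qfocal_loss` (defined at the end of this class) applies just below;
        # the scores here are made up, but `gamma` and the BCE call mirror that
        # helper:
        #
        #     import paddle
        #     import paddle.nn.functional as F
        #
        #     score = paddle.to_tensor([[0.90, 0.10]])  # predicted quality
        #     label = paddle.to_tensor([[1.00, 0.00]])  # assigned IoU target
        #     weight = (score - label).pow(2.0)         # focal modulator
        #     loss = F.binary_cross_entropy(
        #         score, label, weight=weight, reduction='sum')
        #
        # Predictions already close to their targets get weights near zero, so
        # the gradient concentrates on poorly estimated points.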
- - avg_factor = max(num_pos, 1.0) - # cls_loss - loss_cls = self._qfocal_loss( - cls_pred_list, assigned_scores, reduction='sum') - loss_cls = loss_cls / avg_factor - - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['probiou'] * loss_probiou - out_dict = { - 'loss': loss, - 'loss_probiou': loss_probiou, - 'loss_cls': loss_cls - } - return out_dict - - @staticmethod - def _qfocal_loss(score, label, gamma=2.0, reduction='sum'): - weight = (score - label).pow(gamma) - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction=reduction) - return loss - - def post_process(self, head_outs, scale_factor): - pred_scores, pred_rboxes = head_outs - # [B, N, 5] -> [B, N, 4, 2] -> [B, N, 8] - pred_rboxes = self._box2corners(pred_rboxes) - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ], - axis=-1).reshape([-1, 1, 8]) - pred_rboxes /= scale_factor - bbox_pred, bbox_num, before_nms_indexes = self.nms(pred_rboxes, - pred_scores) - return bbox_pred, bbox_num, before_nms_indexes diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/gfl_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/gfl_head.py deleted file mode 100644 index 040a3f7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/gfl_head.py +++ /dev/null @@ -1,736 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/gfl_head.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.core.workspace import register -from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance, batch_distance2bbox -from ppdet.data.transform.atss_assigner import bbox_overlaps - -__all__ = ['GFLHead', 'LDGFLHead'] - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self): - super(ScaleReg, self).__init__() - self.scale_reg = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=1.)), - dtype="float32") - - def forward(self, inputs): - out = inputs * self.scale_reg - return out - - -class Integral(nn.Layer): - """A fixed layer for calculating integral result from distribution. - This layer calculates the target location by :math: `sum{P(y_i) * y_i}`, - P(y_i) denotes the softmax vector that represents the discrete distribution - y_i denotes the discrete set, usually {0, 1, 2, ..., reg_max} - Args: - reg_max (int): The maximal value of the discrete set. Default: 16. You - may want to reset it according to your new dataset or related - settings. 
- """ - - def __init__(self, reg_max=16): - super(Integral, self).__init__() - self.reg_max = reg_max - self.register_buffer('project', - paddle.linspace(0, self.reg_max, self.reg_max + 1)) - - def forward(self, x): - """Forward feature from the regression head to get integral result of - bounding box location. - Args: - x (Tensor): Features of the regression head, shape (N, 4*(n+1)), - n is self.reg_max. - Returns: - x (Tensor): Integral result of box locations, i.e., distance - offsets from the box center in four directions, shape (N, 4). - """ - x = F.softmax(x.reshape([-1, self.reg_max + 1]), axis=1) - x = F.linear(x, self.project) - if self.training: - x = x.reshape([-1, 4]) - return x - - -@register -class DGQP(nn.Layer): - """Distribution-Guided Quality Predictor of GFocal head - Args: - reg_topk (int): top-k statistics of distribution to guide LQE - reg_channels (int): hidden layer unit to generate LQE - add_mean (bool): Whether to calculate the mean of top-k statistics - """ - - def __init__(self, reg_topk=4, reg_channels=64, add_mean=True): - super(DGQP, self).__init__() - self.reg_topk = reg_topk - self.reg_channels = reg_channels - self.add_mean = add_mean - self.total_dim = reg_topk - if add_mean: - self.total_dim += 1 - self.reg_conv1 = self.add_sublayer( - 'dgqp_reg_conv1', - nn.Conv2D( - in_channels=4 * self.total_dim, - out_channels=self.reg_channels, - kernel_size=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - self.reg_conv2 = self.add_sublayer( - 'dgqp_reg_conv2', - nn.Conv2D( - in_channels=self.reg_channels, - out_channels=1, - kernel_size=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - def forward(self, x): - """Forward feature from the regression head to get integral result of - bounding box location. - Args: - x (Tensor): Features of the regression head, shape (N, 4*(n+1)), - n is self.reg_max. - Returns: - x (Tensor): Integral result of box locations, i.e., distance - offsets from the box center in four directions, shape (N, 4). - """ - N, _, H, W = x.shape[:] - prob = F.softmax(x.reshape([N, 4, -1, H, W]), axis=2) - prob_topk, _ = prob.topk(self.reg_topk, axis=2) - if self.add_mean: - stat = paddle.concat( - [prob_topk, prob_topk.mean( - axis=2, keepdim=True)], axis=2) - else: - stat = prob_topk - y = F.relu(self.reg_conv1(stat.reshape([N, 4 * self.total_dim, H, W]))) - y = F.sigmoid(self.reg_conv2(y)) - return y - - -@register -class GFLHead(nn.Layer): - """ - GFLHead - Args: - conv_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of QualityFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - n QFL setting. Default: 16. 
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='QualityFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - super(GFLHead, self).__init__() - self.conv_feat = conv_feat - self.dgqp_module = dgqp_module - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_qfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.use_sigmoid = self.loss_qfl.use_sigmoid - if self.use_sigmoid: - self.cls_out_channels = self.num_classes - else: - self.cls_out_channels = self.num_classes + 1 - - conv_cls_name = "gfl_head_cls" - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - self.gfl_head_cls = self.add_sublayer( - conv_cls_name, - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=self.cls_out_channels, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - - conv_reg_name = "gfl_head_reg" - self.gfl_head_reg = self.add_sublayer( - conv_reg_name, - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=4 * (self.reg_max + 1), - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.scales_regs = [] - for i in range(len(self.fpn_stride)): - lvl = int(math.log(int(self.fpn_stride[i]), 2)) - feat_name = 'p{}_feat'.format(lvl) - scale_reg = self.add_sublayer(feat_name, ScaleReg()) - self.scales_regs.append(scale_reg) - - self.distribution_project = Integral(self.reg_max) - - def forward(self, fpn_feats): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - for stride, scale_reg, fpn_feat in zip(self.fpn_stride, - self.scales_regs, fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) - cls_score = self.gfl_head_cls(conv_cls_feat) - bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) - if self.dgqp_module: - quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - if not self.training: - cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) - b, cell_h, cell_w, _ = paddle.shape(cls_score) - y, x = self.get_single_level_center_point( - [cell_h, cell_w], stride, cell_offset=self.cell_offset) - center_points = paddle.stack([x, y], axis=-1) - cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - bbox_pred = bbox_pred.reshape([-1, cell_h * cell_w, 4]) - - # NOTE: If keep_ratio=False and image shape value that - # multiples of 32, distance2bbox not set max_shapes parameter - # to speed up model prediction. If need to set max_shapes, - # please use inputs['im_shape']. 
- bbox_pred = batch_distance2bbox( - center_points, bbox_pred, max_shapes=None) - - cls_logits_list.append(cls_score) - bboxes_reg_list.append(bbox_pred) - - return (cls_logits_list, bboxes_reg_list) - - def _images_to_levels(self, target, num_level_anchors): - """ - Convert targets by image to targets by feature level. - """ - level_targets = [] - start = 0 - for n in num_level_anchors: - end = start + n - level_targets.append(target[:, start:end].squeeze(0)) - start = end - return level_targets - - def _grid_cells_to_center(self, grid_cells): - """ - Get center location of each gird cell - Args: - grid_cells: grid cells of a feature map - Returns: - center points - """ - cells_cx = (grid_cells[:, 2] + grid_cells[:, 0]) / 2 - cells_cy = (grid_cells[:, 3] + grid_cells[:, 1]) / 2 - return paddle.stack([cells_cx, cells_cy], axis=-1) - - def get_loss(self, gfl_head_outs, gt_meta): - cls_logits, bboxes_reg = gfl_head_outs - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits - ] - grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], - num_level_anchors) - labels_list = self._images_to_levels(gt_meta['labels'], - num_level_anchors) - label_weights_list = self._images_to_levels(gt_meta['label_weights'], - num_level_anchors) - bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], - num_level_anchors) - num_total_pos = sum(gt_meta['pos_num']) - try: - paddle.distributed.all_reduce(num_total_pos) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1) - except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] - for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride in zip( - cls_logits, bboxes_reg, grid_cells_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride): - grid_cells = grid_cells.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - label_weights = label_weights.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - score = np.zeros(labels.shape) - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) - pos_grid_cell_centers = self._grid_cells_to_center( - pos_grid_cells) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - score[pos_inds.numpy()] = bbox_iou - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_grid_cell_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * 
weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - weight_targets = paddle.to_tensor([0], dtype='float32') - - # qfl loss - score = paddle.to_tensor(score) - loss_qfl = self.loss_qfl( - cls_score, (labels, score), - weight=label_weights, - avg_factor=num_total_pos) - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_qfl_list.append(loss_qfl) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) - try: - paddle.distributed.all_reduce(avg_factor) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - if avg_factor <= 0: - loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_qfl = sum(loss_qfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - - loss_states = dict( - loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states - - def get_single_level_center_point(self, featmap_size, stride, - cell_offset=0): - """ - Generate pixel centers of a single stage feature map. - Args: - featmap_size: height and width of the feature map - stride: down sample stride of the feature map - Returns: - y and x of the center points - """ - h, w = featmap_size - x_range = (paddle.arange(w, dtype='float32') + cell_offset) * stride - y_range = (paddle.arange(h, dtype='float32') + cell_offset) * stride - y, x = paddle.meshgrid(y_range, x_range) - y = y.flatten() - x = x.flatten() - return y, x - - def post_process(self, gfl_head_outs, im_shape, scale_factor): - cls_scores, bboxes_reg = gfl_head_outs - bboxes = paddle.concat(bboxes_reg, axis=1) - # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] - im_scale = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) - bboxes /= im_scale - mlvl_scores = paddle.concat(cls_scores, axis=1) - mlvl_scores = mlvl_scores.transpose([0, 2, 1]) - bbox_pred, bbox_num, _ = self.nms(bboxes, mlvl_scores) - return bbox_pred, bbox_num - - -@register -class LDGFLHead(GFLHead): - """ - GFLHead for LD distill - Args: - conv_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of QualityFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - n QFL setting. Default: 16. 
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'loss_ld', 'loss_ld_vlr', 'loss_kd', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='QualityFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - loss_ld='KnowledgeDistillationKLDivLoss', - loss_ld_vlr='KnowledgeDistillationKLDivLoss', - loss_kd='KnowledgeDistillationKLDivLoss', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - - super(LDGFLHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.loss_ld = loss_ld - self.loss_kd = loss_kd - self.loss_ld_vlr = loss_ld_vlr - - def forward(self, fpn_feats): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - cls_logits_list = [] - bboxes_reg_list = [] - for stride, scale_reg, fpn_feat in zip(self.fpn_stride, - self.scales_regs, fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat) - cls_score = self.gfl_head_cls(conv_cls_feat) - bbox_pred = scale_reg(self.gfl_head_reg(conv_reg_feat)) - - if self.dgqp_module: - quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - if not self.training: - cls_score = F.sigmoid(cls_score.transpose([0, 2, 3, 1])) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) - b, cell_h, cell_w, _ = paddle.shape(cls_score) - y, x = self.get_single_level_center_point( - [cell_h, cell_w], stride, cell_offset=self.cell_offset) - center_points = paddle.stack([x, y], axis=-1) - cls_score = cls_score.reshape([b, -1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) - - # NOTE: If keep_ratio=False and image shape value that - # multiples of 32, distance2bbox not set max_shapes parameter - # to speed up model prediction. If need to set max_shapes, - # please use inputs['im_shape']. - bbox_pred = batch_distance2bbox( - center_points, bbox_pred, max_shapes=None) - - cls_logits_list.append(cls_score) - bboxes_reg_list.append(bbox_pred) - - return (cls_logits_list, bboxes_reg_list) - - def get_loss(self, gfl_head_outs, gt_meta, soft_label_list, - soft_targets_list): - cls_logits, bboxes_reg = gfl_head_outs - - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_logits - ] - - grid_cells_list = self._images_to_levels(gt_meta['grid_cells'], - num_level_anchors) - - labels_list = self._images_to_levels(gt_meta['labels'], - num_level_anchors) - - label_weights_list = self._images_to_levels(gt_meta['label_weights'], - num_level_anchors) - bbox_targets_list = self._images_to_levels(gt_meta['bbox_targets'], - num_level_anchors) - # vlr regions - vlr_regions_list = self._images_to_levels(gt_meta['vlr_regions'], - num_level_anchors) - - num_total_pos = sum(gt_meta['pos_num']) - try: - paddle.distributed.all_reduce(num_total_pos) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1.) 
- except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_qfl_list, loss_ld_list, avg_factor = [], [], [], [], [] - loss_ld_vlr_list, loss_kd_list = [], [] - - for cls_score, bbox_pred, grid_cells, labels, label_weights, bbox_targets, stride, soft_targets,\ - soft_label, vlr_region in zip( - cls_logits, bboxes_reg, grid_cells_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride, soft_targets_list, - soft_label_list, vlr_regions_list): - - grid_cells = grid_cells.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - - soft_targets = soft_targets.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - - soft_label = soft_label.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - - # feture im - # teacher_x = teacher_x.transpose([0, 2, 3, 1]).reshape([-1, 256]) - # x = x.transpose([0, 2, 3, 1]).reshape([-1, 256]) - - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - label_weights = label_weights.reshape([-1]) - - vlr_region = vlr_region.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - score = np.zeros(labels.shape) - - remain_inds = (vlr_region > 0).nonzero() - - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_grid_cells = paddle.gather(grid_cells, pos_inds, axis=0) - - pos_grid_cell_centers = self._grid_cells_to_center( - pos_grid_cells) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_grid_cell_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - score[pos_inds.numpy()] = bbox_iou - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - - pos_soft_targets = paddle.gather(soft_targets, pos_inds, axis=0) - soft_corners = pos_soft_targets.reshape([-1, self.reg_max + 1]) - - target_corners = bbox2distance(pos_grid_cell_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - - # ld loss - loss_ld = self.loss_ld( - pred_corners, - soft_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - - loss_kd = self.loss_kd( - paddle.gather( - cls_score, pos_inds, axis=0), - paddle.gather( - soft_label, pos_inds, axis=0), - weight=paddle.gather( - label_weights, pos_inds, axis=0), - avg_factor=pos_inds.shape[0]) - - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - loss_ld = bbox_pred.sum() * 0 - loss_kd = bbox_pred.sum() * 0 - weight_targets = paddle.to_tensor([0], dtype='float32') - - if len(remain_inds) > 0: - neg_pred_corners = bbox_pred[remain_inds].reshape( - [-1, self.reg_max + 1]) - neg_soft_corners = 
soft_targets[remain_inds].reshape( - [-1, self.reg_max + 1]) - - remain_targets = vlr_region[remain_inds] - - loss_ld_vlr = self.loss_ld_vlr( - neg_pred_corners, - neg_soft_corners, - weight=remain_targets.expand([-1, 4]).reshape([-1]), - avg_factor=16.0) - else: - loss_ld_vlr = bbox_pred.sum() * 0 - - # qfl loss - score = paddle.to_tensor(score) - loss_qfl = self.loss_qfl( - cls_score, (labels, score), - weight=label_weights, - avg_factor=num_total_pos) - - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_qfl_list.append(loss_qfl) - loss_ld_list.append(loss_ld) - loss_ld_vlr_list.append(loss_ld_vlr) - loss_kd_list.append(loss_kd) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) # + 1e-6 - try: - paddle.distributed.all_reduce(avg_factor) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - - if avg_factor <= 0: - loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_ld = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_ld_vlr = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_kd = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_qfl = sum(loss_qfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - loss_ld = sum(loss_ld_list) - loss_ld_vlr = sum(loss_ld_vlr_list) - loss_kd = sum(loss_kd_list) - - loss_states = dict( - loss_qfl=loss_qfl, - loss_bbox=loss_bbox, - loss_dfl=loss_dfl, - loss_ld=loss_ld, - loss_ld_vlr=loss_ld_vlr, - loss_kd=loss_kd) - - return loss_states diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/keypoint_hrhrnet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/keypoint_hrhrnet_head.py deleted file mode 100644 index 869b181..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/keypoint_hrhrnet_head.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register -from .. 
import layers as L -from ..backbones.hrnet import BasicBlock - - -@register -class HrHRNetHead(nn.Layer): - __inject__ = ['loss'] - - def __init__(self, num_joints, loss='HrHRNetLoss', swahr=False, width=32): - """ - Head for HigherHRNet network - - Args: - num_joints (int): number of keypoints - hrloss (object): HrHRNetLoss instance - swahr (bool): whether to use swahr - width (int): hrnet channel width - """ - super(HrHRNetHead, self).__init__() - self.loss = loss - - self.num_joints = num_joints - num_featout1 = num_joints * 2 - num_featout2 = num_joints - self.swahr = swahr - self.conv1 = L.Conv2d(width, num_featout1, 1, 1, 0, bias=True) - self.conv2 = L.Conv2d(width, num_featout2, 1, 1, 0, bias=True) - self.deconv = nn.Sequential( - L.ConvTranspose2d( - num_featout1 + width, width, 4, 2, 1, 0, bias=False), - L.BatchNorm2d(width), - L.ReLU()) - self.blocks = nn.Sequential(*(BasicBlock( - num_channels=width, - num_filters=width, - has_se=False, - freeze_norm=False, - name='HrHRNetHead_{}'.format(i)) for i in range(4))) - - self.interpolate = L.Upsample(2, mode='bilinear') - self.concat = L.Concat(dim=1) - if swahr: - self.scalelayer0 = nn.Sequential( - L.Conv2d( - width, num_joints, 1, 1, 0, bias=True), - L.BatchNorm2d(num_joints), - L.ReLU(), - L.Conv2d( - num_joints, - num_joints, - 9, - 1, - 4, - groups=num_joints, - bias=True)) - self.scalelayer1 = nn.Sequential( - L.Conv2d( - width, num_joints, 1, 1, 0, bias=True), - L.BatchNorm2d(num_joints), - L.ReLU(), - L.Conv2d( - num_joints, - num_joints, - 9, - 1, - 4, - groups=num_joints, - bias=True)) - - def forward(self, feats, targets=None): - x1 = feats[0] - xo1 = self.conv1(x1) - x2 = self.blocks(self.deconv(self.concat((x1, xo1)))) - xo2 = self.conv2(x2) - num_joints = self.num_joints - if self.training: - heatmap1, tagmap = paddle.split(xo1, 2, axis=1) - if self.swahr: - so1 = self.scalelayer0(x1) - so2 = self.scalelayer1(x2) - hrhrnet_outputs = ([heatmap1, so1], [xo2, so2], tagmap) - return self.loss(hrhrnet_outputs, targets) - else: - hrhrnet_outputs = (heatmap1, xo2, tagmap) - return self.loss(hrhrnet_outputs, targets) - - # averaged heatmap, upsampled tagmap - upsampled = self.interpolate(xo1) - avg = (upsampled[:, :num_joints] + xo2[:, :num_joints]) / 2 - return avg, upsampled[:, num_joints:] diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/mask_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/mask_head.py deleted file mode 100644 index 403d4ce..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/mask_head.py +++ /dev/null @@ -1,251 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
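A note on the deleted mask_head.py below: MaskHead predicts one mask channel per class and, in get_loss, selects each RoI's ground-truth-class channel with a one-hot/gather_nd trick. A minimal sketch of that selection, using hypothetical shapes (N RoIs, C classes, M×M masks) and random tensors purely for illustration:

    import paddle
    import paddle.nn.functional as F

    N, C, M = 4, 80, 28
    mask_logits = paddle.rand([N, C, M, M])   # per-class mask predictions
    mask_label = paddle.randint(0, C, [N])    # assigned class id per RoI

    # Broadcast a one-hot class indicator over the spatial dims, then gather
    # the matching logits; the flat result reshapes to one map per RoI.
    one_hot = F.one_hot(mask_label, C).unsqueeze([2, 3])
    one_hot = paddle.expand_as(one_hot, mask_logits)
    picked = paddle.gather_nd(mask_logits, paddle.nonzero(one_hot))
    picked = picked.reshape([N, M, M])        # class-specific mask per RoI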
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import KaimingNormal - -from ppdet.core.workspace import register, create -from ppdet.modeling.layers import ConvNormLayer -from .roi_extractor import RoIAlign -from ..cls_utils import _get_class_default_kwargs - - -@register -class MaskFeat(nn.Layer): - """ - Feature extraction in Mask head - - Args: - in_channel (int): Input channels - out_channel (int): Output channels - num_convs (int): The number of conv layers, default 4 - norm_type (string | None): Norm type, bn, gn, sync_bn are available, - default None - """ - - def __init__(self, - in_channel=256, - out_channel=256, - num_convs=4, - norm_type=None): - super(MaskFeat, self).__init__() - self.num_convs = num_convs - self.in_channel = in_channel - self.out_channel = out_channel - self.norm_type = norm_type - fan_conv = out_channel * 3 * 3 - fan_deconv = out_channel * 2 * 2 - - mask_conv = nn.Sequential() - if norm_type == 'gn': - for i in range(self.num_convs): - conv_name = 'mask_inter_feat_{}'.format(i + 1) - mask_conv.add_sublayer( - conv_name, - ConvNormLayer( - ch_in=in_channel if i == 0 else out_channel, - ch_out=out_channel, - filter_size=3, - stride=1, - norm_type=self.norm_type, - initializer=KaimingNormal(fan_in=fan_conv), - skip_quant=True)) - mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) - else: - for i in range(self.num_convs): - conv_name = 'mask_inter_feat_{}'.format(i + 1) - conv = nn.Conv2D( - in_channels=in_channel if i == 0 else out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=paddle.ParamAttr( - initializer=KaimingNormal(fan_in=fan_conv))) - conv.skip_quant = True - mask_conv.add_sublayer(conv_name, conv) - mask_conv.add_sublayer(conv_name + 'act', nn.ReLU()) - mask_conv.add_sublayer( - 'conv5_mask', - nn.Conv2DTranspose( - in_channels=self.out_channel if num_convs > 0 else self.in_channel, - out_channels=self.out_channel, - kernel_size=2, - stride=2, - weight_attr=paddle.ParamAttr( - initializer=KaimingNormal(fan_in=fan_deconv)))) - mask_conv.add_sublayer('conv5_mask' + 'act', nn.ReLU()) - self.upsample = mask_conv - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channel': input_shape.channels, } - - def out_channels(self): - return self.out_channel - - def forward(self, feats): - return self.upsample(feats) - - -@register -class MaskHead(nn.Layer): - __shared__ = ['num_classes', 'export_onnx'] - __inject__ = ['mask_assigner'] - """ - RCNN mask head - - Args: - head (nn.Layer): Extract feature in mask head - roi_extractor (object): The module of RoI Extractor - mask_assigner (object): The module of Mask Assigner, - label and sample the mask - num_classes (int): The number of classes - share_bbox_feat (bool): Whether to share the feature from bbox head, - default false - """ - - def __init__(self, - head, - roi_extractor=_get_class_default_kwargs(RoIAlign), - mask_assigner='MaskAssigner', - num_classes=80, - share_bbox_feat=False, - export_onnx=False): - super(MaskHead, self).__init__() - self.num_classes = num_classes - self.export_onnx = export_onnx - - self.roi_extractor = roi_extractor - if isinstance(roi_extractor, dict): - self.roi_extractor = RoIAlign(**roi_extractor) - self.head = head - self.in_channels = head.out_channels() - self.mask_assigner = mask_assigner - self.share_bbox_feat = share_bbox_feat - self.bbox_head = None - - self.mask_fcn_logits = nn.Conv2D( 
- in_channels=self.in_channels, - out_channels=self.num_classes, - kernel_size=1, - weight_attr=paddle.ParamAttr(initializer=KaimingNormal( - fan_in=self.num_classes))) - self.mask_fcn_logits.skip_quant = True - - @classmethod - def from_config(cls, cfg, input_shape): - roi_pooler = cfg['roi_extractor'] - assert isinstance(roi_pooler, dict) - kwargs = RoIAlign.from_config(cfg, input_shape) - roi_pooler.update(kwargs) - kwargs = {'input_shape': input_shape} - head = create(cfg['head'], **kwargs) - return { - 'roi_extractor': roi_pooler, - 'head': head, - } - - def get_loss(self, mask_logits, mask_label, mask_target, mask_weight): - mask_label = F.one_hot(mask_label, self.num_classes).unsqueeze([2, 3]) - mask_label = paddle.expand_as(mask_label, mask_logits) - mask_label.stop_gradient = True - mask_pred = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) - shape = mask_logits.shape - mask_pred = paddle.reshape(mask_pred, [shape[0], shape[2], shape[3]]) - - mask_target = mask_target.cast('float32') - mask_weight = mask_weight.unsqueeze([1, 2]) - loss_mask = F.binary_cross_entropy_with_logits( - mask_pred, mask_target, weight=mask_weight, reduction="mean") - return loss_mask - - def forward_train(self, body_feats, rois, rois_num, inputs, targets, - bbox_feat): - """ - body_feats (list[Tensor]): Multi-level backbone features - rois (list[Tensor]): Proposals for each batch with shape [N, 4] - rois_num (Tensor): The number of proposals for each batch - inputs (dict): ground truth info - """ - tgt_labels, _, tgt_gt_inds = targets - rois, rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights = self.mask_assigner( - rois, tgt_labels, tgt_gt_inds, inputs) - - if self.share_bbox_feat: - rois_feat = paddle.gather(bbox_feat, mask_index) - else: - rois_feat = self.roi_extractor(body_feats, rois, rois_num) - mask_feat = self.head(rois_feat) - mask_logits = self.mask_fcn_logits(mask_feat) - - loss_mask = self.get_loss(mask_logits, tgt_classes, tgt_masks, - tgt_weights) - return {'loss_mask': loss_mask} - - def forward_test(self, - body_feats, - rois, - rois_num, - scale_factor, - feat_func=None): - """ - body_feats (list[Tensor]): Multi-level backbone features - rois (Tensor): Prediction from bbox head with shape [N, 6] - rois_num (Tensor): The number of prediction for each batch - scale_factor (Tensor): The scale factor from origin size to input size - """ - if not self.export_onnx and rois.shape[0] == 0: - mask_out = paddle.full([1, 1, 1], -1) - else: - bbox = [rois[:, 2:]] - labels = rois[:, 0].cast('int32') - rois_feat = self.roi_extractor(body_feats, bbox, rois_num) - if self.share_bbox_feat: - assert feat_func is not None - rois_feat = feat_func(rois_feat) - - mask_feat = self.head(rois_feat) - mask_logit = self.mask_fcn_logits(mask_feat) - if self.num_classes == 1: - mask_out = F.sigmoid(mask_logit)[:, 0, :, :] - else: - num_masks = paddle.shape(mask_logit)[0] - index = paddle.arange(num_masks).cast('int32') - mask_out = mask_logit[index, labels] - mask_out_shape = paddle.shape(mask_out) - mask_out = paddle.reshape(mask_out, [ - paddle.shape(index), mask_out_shape[-2], mask_out_shape[-1] - ]) - mask_out = F.sigmoid(mask_out) - return mask_out - - def forward(self, - body_feats, - rois, - rois_num, - inputs, - targets=None, - bbox_feat=None, - feat_func=None): - if self.training: - return self.forward_train(body_feats, rois, rois_num, inputs, - targets, bbox_feat) - else: - im_scale = inputs['scale_factor'] - return self.forward_test(body_feats, rois, rois_num, im_scale, - feat_func) diff 
--git a/pdfdet/models/Paddle/ppdet/modeling/heads/petr_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/petr_head.py deleted file mode 100644 index 90760c6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/petr_head.py +++ /dev/null @@ -1,1161 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py -""" -import copy -import numpy as np - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -import paddle.distributed as dist - -from ..transformers.petr_transformer import inverse_sigmoid, masked_fill -from ..initializer import constant_, normal_ - -__all__ = ["PETRHead"] - -from functools import partial - - -def bias_init_with_prob(prior_prob: float) -> float: - """initialize conv/fc bias value according to a given probability value.""" - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - return bias_init - - -def multi_apply(func, *args, **kwargs): - """Apply function to a list of arguments. - - Note: - This function applies the ``func`` to multiple inputs and - map the multiple outputs of the ``func`` into different - list. Each list contains the same type of outputs corresponding - to different inputs. - - Args: - func (Function): A function that will be applied to a list of - arguments - - Returns: - tuple(list): A tuple containing multiple list, each list contains \ - a kind of returned results by the function - """ - pfunc = partial(func, **kwargs) if kwargs else func - map_results = map(pfunc, *args) - res = tuple(map(list, zip(*map_results))) - return res - - -def reduce_mean(tensor): - """"Obtain the mean of tensor on different GPUs.""" - if not (dist.get_world_size() and dist.is_initialized()): - return tensor - tensor = tensor.clone() - dist.all_reduce( - tensor.divide( - paddle.to_tensor( - dist.get_world_size(), dtype='float32')), - op=dist.ReduceOp.SUM) - return tensor - - -def gaussian_radius(det_size, min_overlap=0.7): - """calculate gaussian radius according to object size. - """ - height, width = det_size - - a1 = 1 - b1 = (height + width) - c1 = width * height * (1 - min_overlap) / (1 + min_overlap) - sq1 = paddle.sqrt(b1**2 - 4 * a1 * c1) - r1 = (b1 + sq1) / 2 - - a2 = 4 - b2 = 2 * (height + width) - c2 = (1 - min_overlap) * width * height - sq2 = paddle.sqrt(b2**2 - 4 * a2 * c2) - r2 = (b2 + sq2) / 2 - - a3 = 4 * min_overlap - b3 = -2 * min_overlap * (height + width) - c3 = (min_overlap - 1) * width * height - sq3 = paddle.sqrt(b3**2 - 4 * a3 * c3) - r3 = (b3 + sq3) / 2 - return min(r1, r2, r3) - - -def gaussian2D(shape, sigma=1): - m, n = [(ss - 1.) / 2. 
for ss in shape] - y = paddle.arange(-m, m + 1, dtype="float32")[:, None] - x = paddle.arange(-n, n + 1, dtype="float32")[None, :] - # y, x = np.ogrid[-m:m + 1, -n:n + 1] - - h = paddle.exp(-(x * x + y * y) / (2 * sigma * sigma)) - h[h < np.finfo(np.float32).eps * h.max()] = 0 - return h - - -def draw_umich_gaussian(heatmap, center, radius, k=1): - diameter = 2 * radius + 1 - gaussian = gaussian2D((diameter, diameter), sigma=diameter / 6) - gaussian = paddle.to_tensor(gaussian, dtype=heatmap.dtype) - - x, y = int(center[0]), int(center[1]) - radius = int(radius) - - height, width = heatmap.shape[0:2] - - left, right = min(x, radius), min(width - x, radius + 1) - top, bottom = min(y, radius), min(height - y, radius + 1) - - masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] - masked_gaussian = gaussian[radius - top:radius + bottom, radius - left: - radius + right] - # assert masked_gaussian.equal(1).float().sum() == 1 - if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: - heatmap[y - top:y + bottom, x - left:x + right] = paddle.maximum( - masked_heatmap, masked_gaussian * k) - return heatmap - - -@register -class PETRHead(nn.Layer): - """Head of `End-to-End Multi-Person Pose Estimation with Transformers`. - - Args: - num_classes (int): Number of categories excluding the background. - in_channels (int): Number of channels in the input feature map. - num_query (int): Number of query in Transformer. - num_kpt_fcs (int, optional): Number of fully-connected layers used in - `FFN`, which is then used for the keypoint regression head. - Default 2. - transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for - building the Encoder and Decoder. Default: None. - sync_cls_avg_factor (bool): Whether to sync the avg_factor of - all ranks. Default to False. - positional_encoding (obj:`mmcv.ConfigDict`|dict): - Config for position encoding. - loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the - classification loss. Default `CrossEntropyLoss`. - loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the - regression loss. Default `L1Loss`. - loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the - regression oks loss. Default `OKSLoss`. - loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the - regression heatmap loss. Default `NegLoss`. - as_two_stage (bool) : Whether to generate the proposal from - the outputs of encoder. - with_kpt_refine (bool): Whether to refine the reference points - in the decoder. Defaults to True. - test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of - transformer head. - init_cfg (dict or list[dict], optional): Initialization config dict. - Default: None. - """ - __inject__ = [ - "transformer", "positional_encoding", "assigner", "sampler", "loss_cls", - "loss_kpt", "loss_oks", "loss_hm", "loss_kpt_rpn", "loss_kpt_refine", - "loss_oks_refine" - ] - - def __init__(self, - num_classes, - in_channels, - num_query=100, - num_kpt_fcs=2, - num_keypoints=17, - transformer=None, - sync_cls_avg_factor=True, - positional_encoding='SinePositionalEncoding', - loss_cls='FocalLoss', - loss_kpt='L1Loss', - loss_oks='OKSLoss', - loss_hm='CenterFocalLoss', - with_kpt_refine=True, - assigner='PoseHungarianAssigner', - sampler='PseudoSampler', - loss_kpt_rpn='L1Loss', - loss_kpt_refine='L1Loss', - loss_oks_refine='opera.OKSLoss', - test_cfg=dict(max_per_img=100), - init_cfg=None, - **kwargs): - # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, - # since it brings inconvenience when the initialization of - # `AnchorFreeHead` is called. 
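        # For intuition: bias_init_with_prob (defined above) solves
        # sigmoid(b) = p for the bias b, i.e. b = -log((1 - p) / p); with
        # p = 0.01 that gives b ≈ -4.6, so freshly initialized classification
        # logits match the rare-positive prior instead of 0.5 and early
        # training is not dominated by the many easy negatives.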
- super().__init__() - self.bg_cls_weight = 0 - self.sync_cls_avg_factor = sync_cls_avg_factor - self.assigner = assigner - self.sampler = sampler - self.num_query = num_query - self.num_classes = num_classes - self.in_channels = in_channels - self.num_kpt_fcs = num_kpt_fcs - self.test_cfg = test_cfg - self.fp16_enabled = False - self.as_two_stage = transformer.as_two_stage - self.with_kpt_refine = with_kpt_refine - self.num_keypoints = num_keypoints - self.loss_cls = loss_cls - self.loss_kpt = loss_kpt - self.loss_kpt_rpn = loss_kpt_rpn - self.loss_kpt_refine = loss_kpt_refine - self.loss_oks = loss_oks - self.loss_oks_refine = loss_oks_refine - self.loss_hm = loss_hm - if self.loss_cls.use_sigmoid: - self.cls_out_channels = num_classes - else: - self.cls_out_channels = num_classes + 1 - self.positional_encoding = positional_encoding - self.transformer = transformer - self.embed_dims = self.transformer.embed_dims - # assert 'num_feats' in positional_encoding - num_feats = positional_encoding.num_pos_feats - assert num_feats * 2 == self.embed_dims, 'embed_dims should' \ - f' be exactly 2 times of num_feats. Found {self.embed_dims}' \ - f' and {num_feats}.' - self._init_layers() - self.init_weights() - - def _init_layers(self): - """Initialize classification branch and keypoint branch of head.""" - - fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels) - - kpt_branch = [] - kpt_branch.append(nn.Linear(self.embed_dims, 512)) - kpt_branch.append(nn.ReLU()) - for _ in range(self.num_kpt_fcs): - kpt_branch.append(nn.Linear(512, 512)) - kpt_branch.append(nn.ReLU()) - kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints)) - kpt_branch = nn.Sequential(*kpt_branch) - - def _get_clones(module, N): - return nn.LayerList([copy.deepcopy(module) for i in range(N)]) - - # last kpt_branch is used to generate proposal from - # encode feature map when as_two_stage is True. 
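# A worked example of the branch-count logic below (a reading aid only,
# assuming a typical 3-layer decoder): with as_two_stage=True,
# num_pred = 3 + 1 = 4, i.e. one cls/kpt branch per decoder layer plus one
# extra branch applied to the encoder feature map to score its proposals;
# with as_two_stage=False, num_pred stays at 3.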
- num_pred = (self.transformer.decoder.num_layers + 1) if \ - self.as_two_stage else self.transformer.decoder.num_layers - - if self.with_kpt_refine: - self.cls_branches = _get_clones(fc_cls, num_pred) - self.kpt_branches = _get_clones(kpt_branch, num_pred) - else: - self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)]) - self.kpt_branches = nn.LayerList( - [kpt_branch for _ in range(num_pred)]) - - self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2) - - refine_kpt_branch = [] - for _ in range(self.num_kpt_fcs): - refine_kpt_branch.append( - nn.Linear(self.embed_dims, self.embed_dims)) - refine_kpt_branch.append(nn.ReLU()) - refine_kpt_branch.append(nn.Linear(self.embed_dims, 2)) - refine_kpt_branch = nn.Sequential(*refine_kpt_branch) - if self.with_kpt_refine: - num_pred = self.transformer.refine_decoder.num_layers - self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred) - self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints) - - def init_weights(self): - """Initialize weights of the PETR head.""" - self.transformer.init_weights() - if self.loss_cls.use_sigmoid: - bias_init = bias_init_with_prob(0.01) - for m in self.cls_branches: - constant_(m.bias, bias_init) - for m in self.kpt_branches: - constant_(m[-1].bias, 0) - # initialization of keypoint refinement branch - if self.with_kpt_refine: - for m in self.refine_kpt_branches: - constant_(m[-1].bias, 0) - # initialize bias for heatmap prediction - bias_init = bias_init_with_prob(0.1) - normal_(self.fc_hm.weight, std=0.01) - constant_(self.fc_hm.bias, bias_init) - - def forward(self, mlvl_feats, img_metas): - """Forward function. - - Args: - mlvl_feats (tuple[Tensor]): Features from the upstream - network, each is a 4D-tensor with shape - (N, C, H, W). - img_metas (list[dict]): List of image information. - - Returns: - outputs_classes (Tensor): Outputs from the classification head, - shape [nb_dec, bs, num_query, cls_out_channels]. Note - cls_out_channels should include background. - outputs_kpts (Tensor): Sigmoid outputs from the regression - head with normalized coordinate format (cx, cy, w, h). - Shape [nb_dec, bs, num_query, K*2]. - enc_outputs_class (Tensor): The score of each point on encode - feature map, has shape (N, h*w, num_class). Only when - as_two_stage is True it would be returned, otherwise - `None` would be returned. - enc_outputs_kpt (Tensor): The proposal generated from the - encode feature map, has shape (N, h*w, K*2). Only when - as_two_stage is True it would be returned, otherwise - `None` would be returned.
- """ - - batch_size = mlvl_feats[0].shape[0] - input_img_h, input_img_w = img_metas[0]['batch_input_shape'] - img_masks = paddle.zeros( - (batch_size, input_img_h, input_img_w), dtype=mlvl_feats[0].dtype) - for img_id in range(batch_size): - img_h, img_w, _ = img_metas[img_id]['img_shape'] - img_masks[img_id, :img_h, :img_w] = 1 - - mlvl_masks = [] - mlvl_positional_encodings = [] - for feat in mlvl_feats: - mlvl_masks.append( - F.interpolate( - img_masks[None], size=feat.shape[-2:]).squeeze(0)) - mlvl_positional_encodings.append( - self.positional_encoding(mlvl_masks[-1]).transpose( - [0, 3, 1, 2])) - - query_embeds = self.query_embedding.weight - hs, init_reference, inter_references, \ - enc_outputs_class, enc_outputs_kpt, hm_proto, memory = \ - self.transformer( - mlvl_feats, - mlvl_masks, - query_embeds, - mlvl_positional_encodings, - kpt_branches=self.kpt_branches \ - if self.with_kpt_refine else None, # noqa:E501 - cls_branches=self.cls_branches \ - if self.as_two_stage else None # noqa:E501 - ) - - outputs_classes = [] - outputs_kpts = [] - - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - outputs_class = self.cls_branches[lvl](hs[lvl]) - tmp_kpt = self.kpt_branches[lvl](hs[lvl]) - assert reference.shape[-1] == self.num_keypoints * 2 - tmp_kpt += reference - outputs_kpt = F.sigmoid(tmp_kpt) - outputs_classes.append(outputs_class) - outputs_kpts.append(outputs_kpt) - - outputs_classes = paddle.stack(outputs_classes) - outputs_kpts = paddle.stack(outputs_kpts) - - if hm_proto is not None: - # get heatmap prediction (training phase) - hm_memory, hm_mask = hm_proto - hm_pred = self.fc_hm(hm_memory) - hm_proto = (hm_pred.transpose((0, 3, 1, 2)), hm_mask) - - if self.as_two_stage: - return outputs_classes, outputs_kpts, \ - enc_outputs_class, F.sigmoid(enc_outputs_kpt), \ - hm_proto, memory, mlvl_masks - else: - raise RuntimeError('only "as_two_stage=True" is supported.') - - def forward_refine(self, memory, mlvl_masks, refine_targets, losses, - img_metas): - """Forward function. - - Args: - mlvl_masks (tuple[Tensor]): The key_padding_mask from - different level used for encoder and decoder, - each is a 3D-tensor with shape (bs, H, W). - losses (dict[str, Tensor]): A dictionary of loss components. - img_metas (list[dict]): List of image information. - - Returns: - dict[str, Tensor]: A dictionary of loss components. 
- """ - kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets - pos_inds = kpt_weights.sum(-1) > 0 - if not pos_inds.any(): - pos_kpt_preds = paddle.zeros_like(kpt_preds[:1]) - pos_img_inds = paddle.zeros([1], dtype="int64") - else: - pos_kpt_preds = kpt_preds[pos_inds] - pos_img_inds = (pos_inds.nonzero() / - self.num_query).squeeze(1).astype("int64") - hs, init_reference, inter_references = self.transformer.forward_refine( - mlvl_masks, - memory, - pos_kpt_preds.detach(), - pos_img_inds, - kpt_branches=self.refine_kpt_branches - if self.with_kpt_refine else None, # noqa:E501 - ) - - outputs_kpts = [] - - for lvl in range(hs.shape[0]): - if lvl == 0: - reference = init_reference - else: - reference = inter_references[lvl - 1] - reference = inverse_sigmoid(reference) - tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl]) - assert reference.shape[-1] == 2 - tmp_kpt += reference - outputs_kpt = F.sigmoid(tmp_kpt) - outputs_kpts.append(outputs_kpt) - outputs_kpts = paddle.stack(outputs_kpts) - - if not self.training: - return outputs_kpts - - num_valid_kpt = paddle.clip( - reduce_mean(kpt_weights.sum()), min=1).item() - num_total_pos = paddle.to_tensor( - [outputs_kpts.shape[1]], dtype=kpt_weights.dtype) - num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() - - if not pos_inds.any(): - for i, kpt_refine_preds in enumerate(outputs_kpts): - loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 - losses[f'd{i}.loss_kpt_refine'] = loss_kpt - losses[f'd{i}.loss_oks_refine'] = loss_oks - continue - return losses - - batch_size = mlvl_masks[0].shape[0] - factors = [] - for img_id in range(batch_size): - img_h, img_w, _ = img_metas[img_id]['img_shape'] - factor = paddle.to_tensor( - [img_w, img_h, img_w, img_h], - dtype="float32").squeeze(-1).unsqueeze(0).tile( - (self.num_query, 1)) - factors.append(factor) - factors = paddle.concat(factors, 0) - factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2)) - - pos_kpt_weights = kpt_weights[pos_inds] - pos_kpt_targets = kpt_targets[pos_inds] - pos_kpt_targets_scaled = pos_kpt_targets * factors - pos_areas = area_targets[pos_inds] - pos_valid = kpt_weights[pos_inds][:, 0::2] - for i, kpt_refine_preds in enumerate(outputs_kpts): - if not pos_inds.any(): - print("refine kpt and oks skip") - loss_kpt = loss_oks = kpt_refine_preds.sum() * 0 - losses[f'd{i}.loss_kpt_refine'] = loss_kpt - losses[f'd{i}.loss_oks_refine'] = loss_oks - continue - - # kpt L1 Loss - pos_refine_preds = kpt_refine_preds.reshape( - (kpt_refine_preds.shape[0], -1)) - loss_kpt = self.loss_kpt_refine( - pos_refine_preds, - pos_kpt_targets, - pos_kpt_weights, - avg_factor=num_valid_kpt) - losses[f'd{i}.loss_kpt_refine'] = loss_kpt - # kpt oks loss - pos_refine_preds_scaled = pos_refine_preds * factors - assert (pos_areas > 0).all() - loss_oks = self.loss_oks_refine( - pos_refine_preds_scaled, - pos_kpt_targets_scaled, - pos_valid, - pos_areas, - avg_factor=num_total_pos) - losses[f'd{i}.loss_oks_refine'] = loss_oks - return losses - - # over-write because img_metas are needed as inputs for bbox_head. - def forward_train(self, - x, - img_metas, - gt_bboxes, - gt_labels=None, - gt_keypoints=None, - gt_areas=None, - gt_bboxes_ignore=None, - proposal_cfg=None, - **kwargs): - """Forward function for training mode. - - Args: - x (list[Tensor]): Features from backbone. - img_metas (list[dict]): Meta information of each image, e.g., - image size, scaling factor, etc. - gt_bboxes (list[Tensor]): Ground truth bboxes of the image, - shape (num_gts, 4). 
- gt_labels (list[Tensor]): Ground truth labels of each box, - shape (num_gts,). - gt_keypoints (list[Tensor]): Ground truth keypoints of the image, - shape (num_gts, K*3). - gt_areas (list[Tensor]): Ground truth mask areas of each box, - shape (num_gts,). - gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be - ignored, shape (num_ignored_gts, 4). - proposal_cfg (mmcv.Config): Test / postprocessing configuration, - if None, test_cfg would be used. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert proposal_cfg is None, '"proposal_cfg" must be None' - outs = self(x, img_metas) - memory, mlvl_masks = outs[-2:] - outs = outs[:-2] - if gt_labels is None: - loss_inputs = outs + (gt_bboxes, gt_keypoints, gt_areas, img_metas) - else: - loss_inputs = outs + (gt_bboxes, gt_labels, gt_keypoints, gt_areas, - img_metas) - losses_and_targets = self.loss( - *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) - # losses = losses_and_targets - losses, refine_targets = losses_and_targets - # get pose refinement loss - losses = self.forward_refine(memory, mlvl_masks, refine_targets, losses, - img_metas) - return losses - - def loss(self, - all_cls_scores, - all_kpt_preds, - enc_cls_scores, - enc_kpt_preds, - enc_hm_proto, - gt_bboxes_list, - gt_labels_list, - gt_keypoints_list, - gt_areas_list, - img_metas, - gt_bboxes_ignore=None): - """Loss function. - - Args: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_kpt_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (x_{i}, y_{i}) and shape - [nb_dec, bs, num_query, K*2]. - enc_cls_scores (Tensor): Classification scores of - points on encode feature map, has shape - (N, h*w, num_classes). Only be passed when as_two_stage is - True, otherwise is None. - enc_kpt_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, K*2). Only be - passed when as_two_stage is True, otherwise is None. - gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image - with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, - ..., p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - gt_bboxes_ignore (list[Tensor], optional): Bounding boxes - which can be ignored for each image. Default None. - - Returns: - dict[str, Tensor]: A dictionary of loss components. - """ - assert gt_bboxes_ignore is None, \ - f'{self.__class__.__name__} only supports ' \ - f'for gt_bboxes_ignore setting to None.' 
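# A minimal sketch of how multi_apply (defined near the top of this file)
# fans out over the decoder layers below, assuming a toy two-output function:
#
#   def f(a, b):
#       return a + b, a * b
#
#   multi_apply(f, [1, 2], [3, 4])  # -> ([4, 6], [3, 8])
#
# Each per-layer call returns a tuple, and multi_apply transposes those
# tuples into one list per output, which is how losses_cls, losses_kpt and
# the other per-layer results are collected across all decoder layers.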
- - num_dec_layers = len(all_cls_scores) - all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] - all_gt_keypoints_list = [ - gt_keypoints_list for _ in range(num_dec_layers) - ] - all_gt_areas_list = [gt_areas_list for _ in range(num_dec_layers)] - img_metas_list = [img_metas for _ in range(num_dec_layers)] - - losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list, \ - area_targets_list, kpt_weights_list = multi_apply( - self.loss_single, all_cls_scores, all_kpt_preds, - all_gt_labels_list, all_gt_keypoints_list, - all_gt_areas_list, img_metas_list) - - loss_dict = dict() - # loss of proposal generated from encode feature map. - if enc_cls_scores is not None: - binary_labels_list = [ - paddle.zeros_like(gt_labels_list[i]) - for i in range(len(img_metas)) - ] - enc_loss_cls, enc_losses_kpt = \ - self.loss_single_rpn( - enc_cls_scores, enc_kpt_preds, binary_labels_list, - gt_keypoints_list, gt_areas_list, img_metas) - loss_dict['enc_loss_cls'] = enc_loss_cls - loss_dict['enc_loss_kpt'] = enc_losses_kpt - - # loss from the last decoder layer - loss_dict['loss_cls'] = losses_cls[-1] - loss_dict['loss_kpt'] = losses_kpt[-1] - loss_dict['loss_oks'] = losses_oks[-1] - # loss from other decoder layers - num_dec_layer = 0 - for loss_cls_i, loss_kpt_i, loss_oks_i in zip( - losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1]): - loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i - loss_dict[f'd{num_dec_layer}.loss_kpt'] = loss_kpt_i - loss_dict[f'd{num_dec_layer}.loss_oks'] = loss_oks_i - num_dec_layer += 1 - - # losses of heatmap generated from P3 feature map - hm_pred, hm_mask = enc_hm_proto - loss_hm = self.loss_heatmap(hm_pred, hm_mask, gt_keypoints_list, - gt_labels_list, gt_bboxes_list) - loss_dict['loss_hm'] = loss_hm - - return loss_dict, (kpt_preds_list[-1], kpt_targets_list[-1], - area_targets_list[-1], kpt_weights_list[-1]) - - def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels, - gt_bboxes): - assert hm_pred.shape[-2:] == hm_mask.shape[-2:] - num_img, _, h, w = hm_pred.shape - # placeholder of heatmap target (Gaussian distribution) - hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype) - for i, (gt_label, gt_bbox, gt_keypoint - ) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)): - if gt_label.shape[0] == 0: - continue - gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1, - 3)).clone() - gt_keypoint[..., :2] /= 8 - - assert gt_keypoint[..., 0].max() <= w + 0.5 # new coordinate system - assert gt_keypoint[..., 1].max() <= h + 0.5 # new coordinate system - gt_bbox /= 8 - gt_w = gt_bbox[:, 2] - gt_bbox[:, 0] - gt_h = gt_bbox[:, 3] - gt_bbox[:, 1] - for j in range(gt_label.shape[0]): - # get heatmap radius - kp_radius = paddle.clip( - paddle.floor( - gaussian_radius( - (gt_h[j], gt_w[j]), min_overlap=0.9)), - min=0, - max=3) - for k in range(self.num_keypoints): - if gt_keypoint[j, k, 2] > 0: - gt_kp = gt_keypoint[j, k, :2] - gt_kp_int = paddle.floor(gt_kp) - hm_target[i, k] = draw_umich_gaussian( - hm_target[i, k], gt_kp_int, kp_radius) - # compute heatmap loss - hm_pred = paddle.clip( - F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4) # refer to CenterNet - loss_hm = self.loss_hm( - hm_pred, - hm_target.detach(), - mask=~hm_mask.astype("bool").unsqueeze(1)) - return loss_hm - - def loss_single(self, cls_scores, kpt_preds, gt_labels_list, - gt_keypoints_list, gt_areas_list, img_metas): - """Loss function for outputs from a single decoder layer of a single - feature level. 
- - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. - kpt_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (x_{i}, y_{i}) and - shape [bs, num_query, K*2]. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, - ..., p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. - """ - num_imgs = cls_scores.shape[0] - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, - gt_labels_list, gt_keypoints_list, - gt_areas_list, img_metas) - (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, - area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets - labels = paddle.concat(labels_list, 0) - label_weights = paddle.concat(label_weights_list, 0) - kpt_targets = paddle.concat(kpt_targets_list, 0) - kpt_weights = paddle.concat(kpt_weights_list, 0) - area_targets = paddle.concat(area_targets_list, 0) - - # classification loss - cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - paddle.to_tensor( - [cls_avg_factor], dtype=cls_scores.dtype)) - cls_avg_factor = max(cls_avg_factor, 1) - - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt keypoints across all GPUs, for - # normalization purposes - num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype) - num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() - - # construct factors used to rescale keypoints - factors = [] - for img_meta, kpt_pred in zip(img_metas, kpt_preds): - img_h, img_w, _ = img_meta['img_shape'] - factor = paddle.to_tensor( - [img_w, img_h, img_w, img_h], - dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile( - (kpt_pred.shape[0], 1)) - factors.append(factor) - factors = paddle.concat(factors, 0) - - # keypoint regression loss - kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) - num_valid_kpt = paddle.clip( - reduce_mean(kpt_weights.sum()), min=1).item() - # assert num_valid_kpt == (kpt_targets>0).sum().item() - loss_kpt = self.loss_kpt( - kpt_preds, - kpt_targets.detach(), - kpt_weights.detach(), - avg_factor=num_valid_kpt) - - # keypoint oks loss - pos_inds = kpt_weights.sum(-1) > 0 - if not pos_inds.any(): - loss_oks = kpt_preds.sum() * 0 - else: - factors = factors[pos_inds][:, :2].tile(( - (1, kpt_preds.shape[-1] // 2))) - pos_kpt_preds = kpt_preds[pos_inds] * factors - pos_kpt_targets = kpt_targets[pos_inds] * factors - pos_areas = area_targets[pos_inds] - pos_valid = kpt_weights[pos_inds][..., 0::2] - assert (pos_areas > 0).all() - loss_oks = self.loss_oks( - pos_kpt_preds, - pos_kpt_targets, - pos_valid, - pos_areas, - avg_factor=num_total_pos) - return loss_cls, loss_kpt,
loss_oks, kpt_preds, kpt_targets, \ - area_targets, kpt_weights - - def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list, - gt_keypoints_list, gt_areas_list, img_metas): - """Compute regression and classification targets for a batch image. - - Outputs from a single decoder layer of a single feature level are used. - - Args: - cls_scores_list (list[Tensor]): Box score logits from a single - decoder layer for each image with shape [num_query, - cls_out_channels]. - kpt_preds_list (list[Tensor]): Sigmoid outputs from a single - decoder layer for each image, with normalized coordinate - (x_{i}, y_{i}) and shape [num_query, K*2]. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3). - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - - Returns: - tuple: a tuple containing the following targets. - - - labels_list (list[Tensor]): Labels for all images. - - label_weights_list (list[Tensor]): Label weights for all - images. - - kpt_targets_list (list[Tensor]): Keypoint targets for all - images. - - kpt_weights_list (list[Tensor]): Keypoint weights for all - images. - - area_targets_list (list[Tensor]): area targets for all - images. - - num_total_pos (int): Number of positive samples in all - images. - - num_total_neg (int): Number of negative samples in all - images. - """ - (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, - area_targets_list, pos_inds_list, neg_inds_list) = multi_apply( - self._get_target_single, cls_scores_list, kpt_preds_list, - gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas) - num_total_pos = sum((inds.numel() for inds in pos_inds_list)) - num_total_neg = sum((inds.numel() for inds in neg_inds_list)) - return (labels_list, label_weights_list, kpt_targets_list, - kpt_weights_list, area_targets_list, num_total_pos, - num_total_neg) - - def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints, - gt_areas, img_meta): - """Compute regression and classification targets for one image. - - Outputs from a single decoder layer of a single feature level are used. - - Args: - cls_score (Tensor): Box score logits from a single decoder layer - for one image. Shape [num_query, cls_out_channels]. - kpt_pred (Tensor): Sigmoid outputs from a single decoder layer - for one image, with normalized coordinate (x_{i}, y_{i}) and - shape [num_query, K*2]. - gt_labels (Tensor): Ground truth class indices for one image - with shape (num_gts, ). - gt_keypoints (Tensor): Ground truth keypoints for one image with - shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ..., \ - p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas (Tensor): Ground truth mask areas for one image - with shape (num_gts, ). - img_meta (dict): Meta information for one image. - - Returns: - tuple[Tensor]: a tuple containing the following for one image. - - - labels (Tensor): Labels of each image. - - label_weights (Tensor): Label weights of each image. - - kpt_targets (Tensor): Keypoint targets of each image. - - kpt_weights (Tensor): Keypoint weights of each image. - - area_targets (Tensor): Area targets of each image. - - pos_inds (Tensor): Sampled positive indices for each image. - - neg_inds (Tensor): Sampled negative indices for each image. 
- """ - num_bboxes = kpt_pred.shape[0] - # assigner and sampler - assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels, - gt_keypoints, gt_areas, img_meta) - sampling_result = self.sampler.sample(assign_result, kpt_pred, - gt_keypoints) - - pos_inds = sampling_result.pos_inds - neg_inds = sampling_result.neg_inds - - # label targets - labels = paddle.full((num_bboxes, ), self.num_classes, dtype="int64") - label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype) - kpt_targets = paddle.zeros_like(kpt_pred) - kpt_weights = paddle.zeros_like(kpt_pred) - area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype) - - if pos_inds.size == 0: - return (labels, label_weights, kpt_targets, kpt_weights, - area_targets, pos_inds, neg_inds) - - labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][ - ..., 0].astype("int64") - - img_h, img_w, _ = img_meta['img_shape'] - # keypoint targets - pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds] - pos_gt_kpts = pos_gt_kpts.reshape( - (len(sampling_result.pos_assigned_gt_inds), -1, 3)) - valid_idx = pos_gt_kpts[:, :, 2] > 0 - pos_kpt_weights = kpt_weights[pos_inds].reshape( - (pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2)) - # pos_kpt_weights[valid_idx][...] = 1.0 - pos_kpt_weights = masked_fill(pos_kpt_weights, - valid_idx.unsqueeze(-1), 1.0) - kpt_weights[pos_inds] = pos_kpt_weights.reshape( - (pos_kpt_weights.shape[0], kpt_pred.shape[-1])) - - factor = paddle.to_tensor( - [img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0) - pos_gt_kpts_normalized = pos_gt_kpts[..., :2] - pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \ - factor[:, 0:1] - pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \ - factor[:, 1:2] - kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape( - (pos_gt_kpts.shape[0], kpt_pred.shape[-1])) - - pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0] - area_targets[pos_inds] = pos_gt_areas - - return (labels, label_weights, kpt_targets, kpt_weights, area_targets, - pos_inds, neg_inds) - - def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list, - gt_keypoints_list, gt_areas_list, img_metas): - """Loss function for outputs from a single decoder layer of a single - feature level. - - Args: - cls_scores (Tensor): Box score logits from a single decoder layer - for all images. Shape [bs, num_query, cls_out_channels]. - kpt_preds (Tensor): Sigmoid outputs from a single decoder layer - for all images, with normalized coordinate (x_{i}, y_{i}) and - shape [bs, num_query, K*2]. - gt_labels_list (list[Tensor]): Ground truth class indices for each - image with shape (num_gts, ). - gt_keypoints_list (list[Tensor]): Ground truth keypoints for each - image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, - ..., p^{K}_x, p^{K}_y, p^{K}_v] format. - gt_areas_list (list[Tensor]): Ground truth mask areas for each - image with shape (num_gts, ). - img_metas (list[dict]): List of image meta information. - - Returns: - dict[str, Tensor]: A dictionary of loss components for outputs from - a single decoder layer. 
- """ - num_imgs = cls_scores.shape[0] - cls_scores_list = [cls_scores[i] for i in range(num_imgs)] - kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)] - cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list, - gt_labels_list, gt_keypoints_list, - gt_areas_list, img_metas) - (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list, - area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets - labels = paddle.concat(labels_list, 0) - label_weights = paddle.concat(label_weights_list, 0) - kpt_targets = paddle.concat(kpt_targets_list, 0) - kpt_weights = paddle.concat(kpt_weights_list, 0) - - # classification loss - cls_scores = cls_scores.reshape((-1, self.cls_out_channels)) - # construct weighted avg_factor to match with the official DETR repo - cls_avg_factor = num_total_pos * 1.0 + \ - num_total_neg * self.bg_cls_weight - if self.sync_cls_avg_factor: - cls_avg_factor = reduce_mean( - paddle.to_tensor( - [cls_avg_factor], dtype=cls_scores.dtype)) - cls_avg_factor = max(cls_avg_factor, 1) - - cls_avg_factor = max(cls_avg_factor, 1) - loss_cls = self.loss_cls( - cls_scores, labels, label_weights, avg_factor=cls_avg_factor) - - # Compute the average number of gt keypoints accross all gpus, for - # normalization purposes - # num_total_pos = loss_cls.to_tensor([num_total_pos]) - # num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item() - - # keypoint regression loss - kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1])) - num_valid_kpt = paddle.clip( - reduce_mean(kpt_weights.sum()), min=1).item() - # assert num_valid_kpt == (kpt_targets>0).sum().item() - loss_kpt = self.loss_kpt_rpn( - kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt) - - return loss_cls, loss_kpt - - def get_bboxes(self, - all_cls_scores, - all_kpt_preds, - enc_cls_scores, - enc_kpt_preds, - hm_proto, - memory, - mlvl_masks, - img_metas, - rescale=False): - """Transform network outputs for a batch into bbox predictions. - - Args: - all_cls_scores (Tensor): Classification score of all - decoder layers, has shape - [nb_dec, bs, num_query, cls_out_channels]. - all_kpt_preds (Tensor): Sigmoid regression - outputs of all decode layers. Each is a 4D-tensor with - normalized coordinate format (x_{i}, y_{i}) and shape - [nb_dec, bs, num_query, K*2]. - enc_cls_scores (Tensor): Classification scores of points on - encode feature map, has shape (N, h*w, num_classes). - Only be passed when as_two_stage is True, otherwise is None. - enc_kpt_preds (Tensor): Regression results of each points - on the encode feature map, has shape (N, h*w, K*2). Only be - passed when as_two_stage is True, otherwise is None. - img_metas (list[dict]): Meta information of each image. - rescale (bool, optional): If True, return boxes in original - image space. Defalut False. - - Returns: - list[list[Tensor, Tensor]]: Each item in result_list is 3-tuple. - The first item is an (n, 5) tensor, where the first 4 columns - are bounding box positions (tl_x, tl_y, br_x, br_y) and the - 5-th column is a score between 0 and 1. The second item is a - (n,) tensor where each item is the predicted class label of - the corresponding box. The third item is an (n, K, 3) tensor - with [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x, p^{K}_y, - p^{K}_v] format. 
- """ - cls_scores = all_cls_scores[-1] - kpt_preds = all_kpt_preds[-1] - - result_list = [] - for img_id in range(len(img_metas)): - cls_score = cls_scores[img_id] - kpt_pred = kpt_preds[img_id] - img_shape = img_metas[img_id]['img_shape'] - scale_factor = img_metas[img_id]['scale_factor'] - # TODO: only support single image test - # memory_i = memory[:, img_id, :] - # mlvl_mask = mlvl_masks[img_id] - proposals = self._get_bboxes_single(cls_score, kpt_pred, img_shape, - scale_factor, memory, - mlvl_masks, rescale) - result_list.append(proposals) - return result_list - - def _get_bboxes_single(self, - cls_score, - kpt_pred, - img_shape, - scale_factor, - memory, - mlvl_masks, - rescale=False): - """Transform outputs from the last decoder layer into bbox predictions - for each image. - - Args: - cls_score (Tensor): Box score logits from the last decoder layer - for each image. Shape [num_query, cls_out_channels]. - kpt_pred (Tensor): Sigmoid outputs from the last decoder layer - for each image, with coordinate format (x_{i}, y_{i}) and - shape [num_query, K*2]. - img_shape (tuple[int]): Shape of input image, (height, width, 3). - scale_factor (ndarray, optional): Scale factor of the image arange - as (w_scale, h_scale, w_scale, h_scale). - rescale (bool, optional): If True, return boxes in original image - space. Default False. - - Returns: - tuple[Tensor]: Results of detected bboxes and labels. - - - det_bboxes: Predicted bboxes with shape [num_query, 5], - where the first 4 columns are bounding box positions - (tl_x, tl_y, br_x, br_y) and the 5-th column are scores - between 0 and 1. - - det_labels: Predicted labels of the corresponding box with - shape [num_query]. - - det_kpts: Predicted keypoints with shape [num_query, K, 3]. - """ - assert len(cls_score) == len(kpt_pred) - max_per_img = self.test_cfg.get('max_per_img', self.num_query) - # exclude background - if self.loss_cls.use_sigmoid: - cls_score = F.sigmoid(cls_score) - scores, indexs = cls_score.reshape([-1]).topk(max_per_img) - det_labels = indexs % self.num_classes - bbox_index = indexs // self.num_classes - kpt_pred = kpt_pred[bbox_index] - else: - scores, det_labels = F.softmax(cls_score, axis=-1)[..., :-1].max(-1) - scores, bbox_index = scores.topk(max_per_img) - kpt_pred = kpt_pred[bbox_index] - det_labels = det_labels[bbox_index] - - # ----- results after pose decoder ----- - # det_kpts = kpt_pred.reshape((kpt_pred.shape[0], -1, 2)) - - # ----- results after joint decoder (default) ----- - # import time - # start = time.time() - refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred)) - refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets, - None, None) - # end = time.time() - # print(f'refine time: {end - start:.6f}') - det_kpts = refine_outputs[-1] - - det_kpts[..., 0] = det_kpts[..., 0] * img_shape[1] - det_kpts[..., 1] = det_kpts[..., 1] * img_shape[0] - det_kpts[..., 0].clip_(min=0, max=img_shape[1]) - det_kpts[..., 1].clip_(min=0, max=img_shape[0]) - if rescale: - det_kpts /= paddle.to_tensor( - scale_factor[:2], - dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0) - - # use circumscribed rectangle box of keypoints as det bboxes - x1 = det_kpts[..., 0].min(axis=1, keepdim=True) - y1 = det_kpts[..., 1].min(axis=1, keepdim=True) - x2 = det_kpts[..., 0].max(axis=1, keepdim=True) - y2 = det_kpts[..., 1].max(axis=1, keepdim=True) - det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1) - det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1) - - det_kpts = paddle.concat( - (det_kpts, 
paddle.ones( - det_kpts[..., :1].shape, dtype=det_kpts.dtype)), - axis=2) - - return det_bboxes, det_labels, det_kpts - - def simple_test(self, feats, img_metas, rescale=False): - """Test det bboxes without test-time augmentation. - - Args: - feats (tuple[paddle.Tensor]): Multi-level features from the - upstream network, each is a 4D-tensor. - img_metas (list[dict]): List of image information. - rescale (bool, optional): Whether to rescale the results. - Defaults to False. - - Returns: - list[tuple[Tensor, Tensor, Tensor]]: Each item in result_list is - 3-tuple. The first item is ``bboxes`` with shape (n, 5), - where 5 represent (tl_x, tl_y, br_x, br_y, score). - The shape of the second tensor in the tuple is ``labels`` - with shape (n,). The third item is ``kpts`` with shape - (n, K, 3), in [p^{1}_x, p^{1}_y, p^{1}_v, p^{K}_x, p^{K}_y, - p^{K}_v] format. - """ - # forward of this head requires img_metas - outs = self.forward(feats, img_metas) - results_list = self.get_bboxes(*outs, img_metas, rescale=rescale) - return results_list - - def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): - return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/pico_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/pico_head.py deleted file mode 100644 index 6e04173..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/pico_head.py +++ /dev/null @@ -1,797 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.ops import get_static_shape -from ..initializer import normal_ -from ..assigners.utils import generate_anchors_for_grid_cell -from ..bbox_utils import bbox_center, batch_distance2bbox, bbox2distance -from ppdet.core.workspace import register -from ppdet.modeling.layers import ConvNormLayer -from .simota_head import OTAVFLHead -from .gfl_head import Integral, GFLHead -from ppdet.modeling.necks.csp_pan import DPModule - -eps = 1e-9 - -__all__ = ['PicoHead', 'PicoHeadV2', 'PicoFeat'] - - -class PicoSE(nn.Layer): - def __init__(self, feat_channels): - super(PicoSE, self).__init__() - self.fc = nn.Conv2D(feat_channels, feat_channels, 1) - self.conv = ConvNormLayer(feat_channels, feat_channels, 1, 1) - - self._init_weights() - - def _init_weights(self): - normal_(self.fc.weight, std=0.001) - - def forward(self, feat, avg_feat): - weight = F.sigmoid(self.fc(avg_feat)) - out = self.conv(feat * weight) - return out - - -@register -class PicoFeat(nn.Layer): - """ - PicoFeat of PicoDet - - Args: - feat_in (int): The channel number of input Tensor. - feat_out (int): The channel number of output Tensor. 
- num_convs (int): The number of convolutions of the PicoFeat. - norm_type (str): Normalization type, 'bn'/'sync_bn'/'gn'. - share_cls_reg (bool): Whether to share the cls and reg output. - act (str): The activation function of each layer. - use_se (bool): Whether to use the SE module. - """ - - def __init__(self, - feat_in=256, - feat_out=96, - num_fpn_stride=3, - num_convs=2, - norm_type='bn', - share_cls_reg=False, - act='hard_swish', - use_se=False): - super(PicoFeat, self).__init__() - self.num_convs = num_convs - self.norm_type = norm_type - self.share_cls_reg = share_cls_reg - self.act = act - self.use_se = use_se - self.cls_convs = [] - self.reg_convs = [] - if use_se: - assert share_cls_reg == True, \ - 'In the case of using se, share_cls_reg must be set to True' - self.se = nn.LayerList() - for stage_idx in range(num_fpn_stride): - cls_subnet_convs = [] - reg_subnet_convs = [] - for i in range(self.num_convs): - in_c = feat_in if i == 0 else feat_out - cls_conv_dw = self.add_sublayer( - 'cls_conv_dw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=5, - stride=1, - groups=feat_out, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - cls_subnet_convs.append(cls_conv_dw) - cls_conv_pw = self.add_sublayer( - 'cls_conv_pw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=1, - stride=1, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - cls_subnet_convs.append(cls_conv_pw) - - if not self.share_cls_reg: - reg_conv_dw = self.add_sublayer( - 'reg_conv_dw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=5, - stride=1, - groups=feat_out, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - reg_subnet_convs.append(reg_conv_dw) - reg_conv_pw = self.add_sublayer( - 'reg_conv_pw{}.{}'.format(stage_idx, i), - ConvNormLayer( - ch_in=in_c, - ch_out=feat_out, - filter_size=1, - stride=1, - norm_type=norm_type, - bias_on=False, - lr_scale=2.)) - reg_subnet_convs.append(reg_conv_pw) - self.cls_convs.append(cls_subnet_convs) - self.reg_convs.append(reg_subnet_convs) - if use_se: - self.se.append(PicoSE(feat_out)) - - def act_func(self, x): - if self.act == "leaky_relu": - x = F.leaky_relu(x) - elif self.act == "hard_swish": - x = F.hardswish(x) - elif self.act == "relu6": - x = F.relu6(x) - return x - - def forward(self, fpn_feat, stage_idx): - assert stage_idx < len(self.cls_convs) - cls_feat = fpn_feat - reg_feat = fpn_feat - for i in range(len(self.cls_convs[stage_idx])): - cls_feat = self.act_func(self.cls_convs[stage_idx][i](cls_feat)) - reg_feat = cls_feat - if not self.share_cls_reg: - reg_feat = self.act_func(self.reg_convs[stage_idx][i](reg_feat)) - if self.use_se: - avg_feat = F.adaptive_avg_pool2d(cls_feat, (1, 1)) - se_feat = self.act_func(self.se[stage_idx](cls_feat, avg_feat)) - return cls_feat, se_feat - return cls_feat, reg_feat - - -@register -class PicoHead(OTAVFLHead): - """ - PicoHead - Args: - conv_feat (object): Instance of 'PicoFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of VariFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - assigner (object): Instance of label assigner. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - in QFL setting. Default: 7.
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'assigner', 'nms' - ] - __shared__ = ['num_classes', 'eval_size'] - - def __init__(self, - conv_feat='PicoFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32], - prior_prob=0.01, - loss_class='VariFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - assigner='SimOTAAssigner', - reg_max=16, - feat_in_chan=96, - nms=None, - nms_pre=1000, - cell_offset=0, - eval_size=None): - super(PicoHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - assigner=assigner, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.conv_feat = conv_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_vfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.assigner = assigner - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.eval_size = eval_size - self.device = paddle.device.get_device() - - self.use_sigmoid = self.loss_vfl.use_sigmoid - if self.use_sigmoid: - self.cls_out_channels = self.num_classes - else: - self.cls_out_channels = self.num_classes + 1 - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - # Clear the super class initialization - self.gfl_head_cls = None - self.gfl_head_reg = None - self.scales_regs = None - - self.head_cls_list = [] - self.head_reg_list = [] - for i in range(len(fpn_stride)): - head_cls = self.add_sublayer( - "head_cls" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=self.cls_out_channels + 4 * (self.reg_max + 1) - if self.conv_feat.share_cls_reg else self.cls_out_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - self.head_cls_list.append(head_cls) - if not self.conv_feat.share_cls_reg: - head_reg = self.add_sublayer( - "head_reg" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=4 * (self.reg_max + 1), - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - self.head_reg_list.append(head_reg) - - # initialize the anchor points - if self.eval_size: - self.anchor_points, self.stride_tensor = self._generate_anchors() - - def forward(self, fpn_feats, export_post_process=True): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - - if self.training: - return self.forward_train(fpn_feats) - else: - return self.forward_eval( - fpn_feats, export_post_process=export_post_process) - - def forward_train(self, fpn_feats): - cls_logits_list, bboxes_reg_list = [], [] - for i, fpn_feat in enumerate(fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) - if self.conv_feat.share_cls_reg: - cls_logits = self.head_cls_list[i](conv_cls_feat) - cls_score, bbox_pred = paddle.split( - cls_logits, - [self.cls_out_channels, 4 * (self.reg_max + 1)], - axis=1) - else: - cls_score = self.head_cls_list[i](conv_cls_feat) - bbox_pred = self.head_reg_list[i](conv_reg_feat) - - if self.dgqp_module: 
- quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - - cls_logits_list.append(cls_score) - bboxes_reg_list.append(bbox_pred) - - return (cls_logits_list, bboxes_reg_list) - - def forward_eval(self, fpn_feats, export_post_process=True): - if self.eval_size: - anchor_points, stride_tensor = self.anchor_points, self.stride_tensor - else: - anchor_points, stride_tensor = self._generate_anchors(fpn_feats) - cls_logits_list, bboxes_reg_list = [], [] - for i, fpn_feat in enumerate(fpn_feats): - conv_cls_feat, conv_reg_feat = self.conv_feat(fpn_feat, i) - if self.conv_feat.share_cls_reg: - cls_logits = self.head_cls_list[i](conv_cls_feat) - cls_score, bbox_pred = paddle.split( - cls_logits, - [self.cls_out_channels, 4 * (self.reg_max + 1)], - axis=1) - else: - cls_score = self.head_cls_list[i](conv_cls_feat) - bbox_pred = self.head_reg_list[i](conv_reg_feat) - - if self.dgqp_module: - quality_score = self.dgqp_module(bbox_pred) - cls_score = F.sigmoid(cls_score) * quality_score - - if not export_post_process: - # Now only supports batch size = 1 in deploy - # TODO(ygh): support batch size > 1 - cls_score_out = F.sigmoid(cls_score).reshape( - [1, self.cls_out_channels, -1]).transpose([0, 2, 1]) - bbox_pred = bbox_pred.reshape([1, (self.reg_max + 1) * 4, - -1]).transpose([0, 2, 1]) - else: - _, _, h, w = fpn_feat.shape - l = h * w - cls_score_out = F.sigmoid( - cls_score.reshape([-1, self.cls_out_channels, l])) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]) - bbox_pred = self.distribution_project(bbox_pred) - bbox_pred = bbox_pred.reshape([-1, l, 4]) - - cls_logits_list.append(cls_score_out) - bboxes_reg_list.append(bbox_pred) - - if export_post_process: - cls_logits_list = paddle.concat(cls_logits_list, axis=-1) - bboxes_reg_list = paddle.concat(bboxes_reg_list, axis=1) - bboxes_reg_list = batch_distance2bbox(anchor_points, - bboxes_reg_list) - bboxes_reg_list *= stride_tensor - - return (cls_logits_list, bboxes_reg_list) - - def _generate_anchors(self, feats=None): - # just use in eval time - anchor_points = [] - stride_tensor = [] - for i, stride in enumerate(self.fpn_stride): - if feats is not None: - _, _, h, w = feats[i].shape - else: - h = math.ceil(self.eval_size[0] / stride) - w = math.ceil(self.eval_size[1] / stride) - shift_x = paddle.arange(end=w) + self.cell_offset - shift_y = paddle.arange(end=h) + self.cell_offset - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([-1, 2])) - stride_tensor.append( - paddle.full( - [h * w, 1], stride, dtype='float32')) - anchor_points = paddle.concat(anchor_points) - stride_tensor = paddle.concat(stride_tensor) - return anchor_points, stride_tensor - - def post_process(self, - head_outs, - scale_factor, - export_nms=True, - nms_cpu=False): - pred_scores, pred_bboxes = head_outs - if not export_nms: - return pred_bboxes, pred_scores - else: - # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], - axis=-1).reshape([-1, 1, 4]) - # scale bbox to origin image size. 
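# A concrete reading of the rescale step below, assuming a hypothetical
# 320x240 source image resized to 640x480 for inference: scale_factor holds
# [h_scale, w_scale] = [2.0, 2.0] per image, and the split/concat above
# reorders it to [w_scale, h_scale, w_scale, h_scale] so that dividing the
# (x1, y1, x2, y2) boxes elementwise maps them back to source coordinates.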
- pred_bboxes /= scale_factor - if nms_cpu: - paddle.set_device("cpu") - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - paddle.set_device(self.device) - else: - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num - - -@register -class PicoHeadV2(GFLHead): - """ - PicoHeadV2 - Args: - conv_feat (object): Instance of 'PicoFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_class (object): Instance of VariFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - assigner (object): Instance of label assigner. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - in QFL setting. Default: 7. - """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'static_assigner', 'assigner', 'nms' - ] - __shared__ = ['num_classes', 'eval_size'] - - def __init__(self, - conv_feat='PicoFeatV2', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32], - prior_prob=0.01, - use_align_head=True, - loss_class='VariFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - static_assigner_epoch=60, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - reg_max=16, - feat_in_chan=96, - nms=None, - nms_pre=1000, - cell_offset=0, - act='hard_swish', - grid_cell_scale=5.0, - eval_size=None): - super(PicoHeadV2, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset, ) - self.conv_feat = conv_feat - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_vfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - - self.static_assigner_epoch = static_assigner_epoch - self.static_assigner = static_assigner - self.assigner = assigner - - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.act = act - self.grid_cell_scale = grid_cell_scale - self.use_align_head = use_align_head - self.cls_out_channels = self.num_classes - self.eval_size = eval_size - - bias_init_value = -math.log((1 - self.prior_prob) / self.prior_prob) - # Clear the super class initialization - self.gfl_head_cls = None - self.gfl_head_reg = None - self.scales_regs = None - - self.head_cls_list = nn.LayerList() - self.head_reg_list = nn.LayerList() - self.cls_align = nn.LayerList() - - for i in range(len(fpn_stride)): - head_cls = self.add_sublayer( - "head_cls" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=self.cls_out_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr( - initializer=Constant(value=bias_init_value)))) - self.head_cls_list.append(head_cls) - head_reg = self.add_sublayer( - "head_reg" + str(i), - nn.Conv2D( - in_channels=self.feat_in_chan, - out_channels=4 * (self.reg_max + 1), - kernel_size=1, - stride=1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - self.head_reg_list.append(head_reg) - if self.use_align_head: -
self.cls_align.append( - DPModule( - self.feat_in_chan, - 1, - 5, - act=self.act, - use_act_in_out=False)) - - # initialize the anchor points - if self.eval_size: - self.anchor_points, self.stride_tensor = self._generate_anchors() - - def forward(self, fpn_feats, export_post_process=True): - assert len(fpn_feats) == len( - self.fpn_stride - ), "The size of fpn_feats is not equal to size of fpn_stride" - - if self.training: - return self.forward_train(fpn_feats) - else: - return self.forward_eval( - fpn_feats, export_post_process=export_post_process) - - def forward_train(self, fpn_feats): - cls_score_list, reg_list, box_list = [], [], [] - for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): - b, _, h, w = get_static_shape(fpn_feat) - # task decomposition - conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) - cls_logit = self.head_cls_list[i](se_feat) - reg_pred = self.head_reg_list[i](se_feat) - - # cls prediction and alignment - if self.use_align_head: - cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) - cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() - else: - cls_score = F.sigmoid(cls_logit) - - cls_score_out = cls_score.transpose([0, 2, 3, 1]) - bbox_pred = reg_pred.transpose([0, 2, 3, 1]) - b, cell_h, cell_w, _ = paddle.shape(cls_score_out) - y, x = self.get_single_level_center_point( - [cell_h, cell_w], stride, cell_offset=self.cell_offset) - center_points = paddle.stack([x, y], axis=-1) - cls_score_out = cls_score_out.reshape( - [b, -1, self.cls_out_channels]) - bbox_pred = self.distribution_project(bbox_pred) * stride - bbox_pred = bbox_pred.reshape([b, cell_h * cell_w, 4]) - bbox_pred = batch_distance2bbox( - center_points, bbox_pred, max_shapes=None) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_list.append(reg_pred.flatten(2).transpose([0, 2, 1])) - box_list.append(bbox_pred / stride) - - cls_score_list = paddle.concat(cls_score_list, axis=1) - box_list = paddle.concat(box_list, axis=1) - reg_list = paddle.concat(reg_list, axis=1) - return cls_score_list, reg_list, box_list, fpn_feats - - def forward_eval(self, fpn_feats, export_post_process=True): - if self.eval_size: - anchor_points, stride_tensor = self.anchor_points, self.stride_tensor - else: - anchor_points, stride_tensor = self._generate_anchors(fpn_feats) - cls_score_list, box_list = [], [] - for i, (fpn_feat, stride) in enumerate(zip(fpn_feats, self.fpn_stride)): - _, _, h, w = fpn_feat.shape - # task decomposition - conv_cls_feat, se_feat = self.conv_feat(fpn_feat, i) - cls_logit = self.head_cls_list[i](se_feat) - reg_pred = self.head_reg_list[i](se_feat) - - # cls prediction and alignment - if self.use_align_head: - cls_prob = F.sigmoid(self.cls_align[i](conv_cls_feat)) - cls_score = (F.sigmoid(cls_logit) * cls_prob + eps).sqrt() - else: - cls_score = F.sigmoid(cls_logit) - - if not export_post_process: - # Now only supports batch size = 1 in deploy - cls_score_list.append( - cls_score.reshape([1, self.cls_out_channels, -1]).transpose( - [0, 2, 1])) - box_list.append( - reg_pred.reshape([1, (self.reg_max + 1) * 4, -1]).transpose( - [0, 2, 1])) - else: - l = h * w - cls_score_out = cls_score.reshape( - [-1, self.cls_out_channels, l]) - bbox_pred = reg_pred.transpose([0, 2, 3, 1]) - bbox_pred = self.distribution_project(bbox_pred) - bbox_pred = bbox_pred.reshape([-1, l, 4]) - cls_score_list.append(cls_score_out) - box_list.append(bbox_pred) - - if export_post_process: - cls_score_list = paddle.concat(cls_score_list, axis=-1) - box_list = 
paddle.concat(box_list, axis=1) - box_list = batch_distance2bbox(anchor_points, box_list) - box_list *= stride_tensor - - return cls_score_list, box_list - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_regs, pred_bboxes, fpn_feats = head_outs - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - gt_scores = gt_meta['gt_score'] if 'gt_score' in gt_meta else None - num_imgs = gt_meta['im_id'].shape[0] - pad_gt_mask = gt_meta['pad_gt_mask'] - - anchors, _, num_anchors_list, stride_tensor_list = generate_anchors_for_grid_cell( - fpn_feats, self.fpn_stride, self.grid_cell_scale, self.cell_offset) - - centers = bbox_center(anchors) - - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - gt_scores=gt_scores, - pred_bboxes=pred_bboxes.detach() * stride_tensor_list) - - else: - assigned_labels, assigned_bboxes, assigned_scores = self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor_list, - centers, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - gt_scores=gt_scores) - - assigned_bboxes /= stride_tensor_list - - centers_shape = centers.shape - flatten_centers = centers.expand( - [num_imgs, centers_shape[0], centers_shape[1]]).reshape([-1, 2]) - flatten_strides = stride_tensor_list.expand( - [num_imgs, centers_shape[0], 1]).reshape([-1, 1]) - flatten_cls_preds = pred_scores.reshape([-1, self.num_classes]) - flatten_regs = pred_regs.reshape([-1, 4 * (self.reg_max + 1)]) - flatten_bboxes = pred_bboxes.reshape([-1, 4]) - flatten_bbox_targets = assigned_bboxes.reshape([-1, 4]) - flatten_labels = assigned_labels.reshape([-1]) - flatten_assigned_scores = assigned_scores.reshape( - [-1, self.num_classes]) - - pos_inds = paddle.nonzero( - paddle.logical_and((flatten_labels >= 0), - (flatten_labels < self.num_classes)), - as_tuple=False).squeeze(1) - - num_total_pos = len(pos_inds) - - if num_total_pos > 0: - pos_bbox_targets = paddle.gather( - flatten_bbox_targets, pos_inds, axis=0) - pos_decode_bbox_pred = paddle.gather( - flatten_bboxes, pos_inds, axis=0) - pos_reg = paddle.gather(flatten_regs, pos_inds, axis=0) - pos_strides = paddle.gather(flatten_strides, pos_inds, axis=0) - pos_centers = paddle.gather( - flatten_centers, pos_inds, axis=0) / pos_strides - - weight_targets = flatten_assigned_scores.detach() - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - - pred_corners = pos_reg.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_centers, pos_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = paddle.zeros([1]) - loss_dfl = paddle.zeros([1]) - - avg_factor = flatten_assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(avg_factor) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - loss_vfl = self.loss_vfl( - flatten_cls_preds, flatten_assigned_scores, avg_factor=avg_factor) - - loss_bbox = loss_bbox / avg_factor - loss_dfl = loss_dfl / avg_factor - - loss_states = dict( - 
loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states - - def _generate_anchors(self, feats=None): - # just use in eval time - anchor_points = [] - stride_tensor = [] - for i, stride in enumerate(self.fpn_stride): - if feats is not None: - _, _, h, w = feats[i].shape - else: - h = math.ceil(self.eval_size[0] / stride) - w = math.ceil(self.eval_size[1] / stride) - shift_x = paddle.arange(end=w) + self.cell_offset - shift_y = paddle.arange(end=h) + self.cell_offset - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([-1, 2])) - stride_tensor.append( - paddle.full( - [h * w, 1], stride, dtype='float32')) - anchor_points = paddle.concat(anchor_points) - stride_tensor = paddle.concat(stride_tensor) - return anchor_points, stride_tensor - - def post_process(self, - head_outs, - scale_factor, - export_nms=True, - nms_cpu=False): - pred_scores, pred_bboxes = head_outs - if not export_nms: - return pred_bboxes, pred_scores - else: - # rescale: [h_scale, w_scale] -> [w_scale, h_scale, w_scale, h_scale] - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], - axis=-1).reshape([-1, 1, 4]) - # scale bbox to origin image size. - pred_bboxes /= scale_factor - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_contrast_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_contrast_head.py deleted file mode 100644 index 8732c2c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_contrast_head.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
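For reference, the `_generate_anchors` helper deleted just above builds one (x, y) center per feature-map cell plus a matching per-point stride tensor. A minimal NumPy stand-in for the Paddle ops (the 640x640 eval size and the strides are illustrative values, not taken from the diff):

import math
import numpy as np

def generate_anchor_points(eval_size, fpn_strides, cell_offset=0.5):
    """One (x, y) grid center per cell for every FPN level, in grid units."""
    anchor_points, stride_tensor = [], []
    for stride in fpn_strides:
        h = math.ceil(eval_size[0] / stride)
        w = math.ceil(eval_size[1] / stride)
        shift_x = np.arange(w) + cell_offset
        shift_y = np.arange(h) + cell_offset
        # same layout as paddle.meshgrid(shift_y, shift_x)
        sy, sx = np.meshgrid(shift_y, shift_x, indexing="ij")
        anchor_points.append(
            np.stack([sx, sy], axis=-1).reshape(-1, 2).astype("float32"))
        stride_tensor.append(np.full((h * w, 1), stride, dtype="float32"))
    return np.concatenate(anchor_points), np.concatenate(stride_tensor)

points, strides = generate_anchor_points((640, 640), fpn_strides=(8, 16, 32))
print(points.shape, strides.shape)  # (8400, 2) (8400, 1)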
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ..initializer import bias_init_with_prob, constant_ -from ..assigners.utils import generate_anchors_for_grid_cell -from ppdet.modeling.heads.ppyoloe_head import PPYOLOEHead - -__all__ = ['PPYOLOEContrastHead'] - - -@register -class PPYOLOEContrastHead(PPYOLOEHead): - __shared__ = [ - 'num_classes', 'eval_size', 'trt', 'exclude_nms', - 'exclude_post_process', 'use_shared_conv', 'for_distill' - ] - __inject__ = ['static_assigner', 'assigner', 'nms', 'contrast_loss'] - - def __init__(self, - in_channels=[1024, 512, 256], - num_classes=80, - act='swish', - fpn_strides=(32, 16, 8), - grid_cell_scale=5.0, - grid_cell_offset=0.5, - reg_max=16, - reg_range=None, - static_assigner_epoch=4, - use_varifocal_loss=True, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - contrast_loss='SupContrast', - nms='MultiClassNMS', - eval_size=None, - loss_weight={ - 'class': 1.0, - 'iou': 2.5, - 'dfl': 0.5, - }, - trt=False, - attn_conv='convbn', - exclude_nms=False, - exclude_post_process=False, - use_shared_conv=True, - for_distill=False): - super().__init__(in_channels, num_classes, act, fpn_strides, - grid_cell_scale, grid_cell_offset, reg_max, reg_range, - static_assigner_epoch, use_varifocal_loss, - static_assigner, assigner, nms, eval_size, loss_weight, - trt, attn_conv, exclude_nms, exclude_post_process, - use_shared_conv, for_distill) - - assert len(in_channels) > 0, "len(in_channels) should > 0" - self.contrast_loss = contrast_loss - self.contrast_encoder = nn.LayerList() - for in_c in self.in_channels: - self.contrast_encoder.append(nn.Conv2D(in_c, 128, 3, padding=1)) - self._init_contrast_encoder() - - def _init_contrast_encoder(self): - bias_en = bias_init_with_prob(0.01) - for en_ in self.contrast_encoder: - constant_(en_.weight) - constant_(en_.bias, bias_en) - - def forward_train(self, feats, targets, aux_pred=None): - anchors, anchor_points, num_anchors_list, stride_tensor = \ - generate_anchors_for_grid_cell( - feats, self.fpn_strides, self.grid_cell_scale, - self.grid_cell_offset) - - cls_score_list, reg_distri_list = [], [] - contrast_encoder_list = [] - for i, feat in enumerate(feats): - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - contrast_logit = self.contrast_encoder[i](self.stem_cls[i]( - feat, avg_feat) + feat) - contrast_encoder_list.append( - contrast_logit.flatten(2).transpose([0, 2, 1])) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_distri_list = paddle.concat(reg_distri_list, axis=1) - contrast_encoder_list = paddle.concat(contrast_encoder_list, axis=1) - - return self.get_loss([ - cls_score_list, reg_distri_list, contrast_encoder_list, anchors, - anchor_points, num_anchors_list, stride_tensor - ], targets) - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_distri, pred_contrast_encoder, anchors,\ - anchor_points, num_anchors_list, stride_tensor = head_outs - - anchor_points_s = anchor_points / stride_tensor - pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) - - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # 
label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - pred_bboxes=pred_bboxes.detach() * stride_tensor) - alpha_l = 0.25 - else: - if self.sm_use: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - stride_tensor, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - else: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - # rescale bbox - assigned_bboxes /= stride_tensor - # cls loss - if self.use_varifocal_loss: - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - loss_cls = self._varifocal_loss(pred_scores, assigned_scores, - one_hot_label) - else: - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) - - assigned_scores_sum = assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(assigned_scores_sum) - assigned_scores_sum /= paddle.distributed.get_world_size() - assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) - loss_cls /= assigned_scores_sum - - loss_l1, loss_iou, loss_dfl = \ - self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, - assigned_scores_sum) - # contrast loss - loss_contrast = self.contrast_loss(pred_contrast_encoder.reshape([-1, pred_contrast_encoder.shape[-1]]), \ - assigned_labels.reshape([-1]), assigned_scores.max(-1).reshape([-1])) - - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou + \ - self.loss_weight['dfl'] * loss_dfl + \ - self.loss_weight['contrast'] * loss_contrast - - out_dict = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_iou': loss_iou, - 'loss_dfl': loss_dfl, - 'loss_l1': loss_l1, - 'loss_contrast': loss_contrast - } - return out_dict diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_head.py deleted file mode 100644 index 80f1bc4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_head.py +++ /dev/null @@ -1,700 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
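The `loss_cls` term above uses varifocal weighting (the `_varifocal_loss` shared with the base PPYOLOE head later in this diff). A NumPy rendering of that formula, assuming sigmoid scores in (0, 1), soft IoU-aware targets `gt_score`, and a 0/1 `one_hot_label`:

import numpy as np

def varifocal_loss(pred_score, gt_score, one_hot_label, alpha=0.75, gamma=2.0):
    """Negatives are down-weighted by alpha * p**gamma; positives are
    weighted by their (IoU-aware) soft target score."""
    weight = alpha * pred_score**gamma * (1 - one_hot_label) \
        + gt_score * one_hot_label
    eps = 1e-9  # numerical guard, not in the original
    bce = -(gt_score * np.log(pred_score + eps)
            + (1 - gt_score) * np.log(1 - pred_score + eps))
    return (weight * bce).sum()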
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from paddle import ParamAttr -from paddle.nn.initializer import KaimingNormal -from paddle.nn.initializer import Normal, Constant - -from ..bbox_utils import batch_distance2bbox -from ..losses import GIoULoss -from ..initializer import bias_init_with_prob, constant_, normal_ -from ..assigners.utils import generate_anchors_for_grid_cell -from ppdet.modeling.backbones.cspresnet import ConvBNLayer, RepVggBlock -from ppdet.modeling.ops import get_static_shape, get_act_fn -from ppdet.modeling.layers import MultiClassNMS - -__all__ = ['PPYOLOEHead', 'SimpleConvHead'] - - -class ESEAttn(nn.Layer): - def __init__(self, feat_channels, act='swish', attn_conv='convbn'): - super(ESEAttn, self).__init__() - self.fc = nn.Conv2D(feat_channels, feat_channels, 1) - if attn_conv == 'convbn': - self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) - elif attn_conv == 'repvgg': - self.conv = RepVggBlock(feat_channels, feat_channels, act=act) - else: - self.conv = None - self._init_weights() - - def _init_weights(self): - normal_(self.fc.weight, std=0.001) - - def forward(self, feat, avg_feat): - weight = F.sigmoid(self.fc(avg_feat)) - if self.conv: - return self.conv(feat * weight) - else: - return feat * weight - - -@register -class PPYOLOEHead(nn.Layer): - __shared__ = [ - 'num_classes', 'eval_size', 'trt', 'exclude_nms', - 'exclude_post_process', 'use_shared_conv', 'for_distill' - ] - __inject__ = ['static_assigner', 'assigner', 'nms'] - - def __init__(self, - in_channels=[1024, 512, 256], - num_classes=80, - act='swish', - fpn_strides=(32, 16, 8), - grid_cell_scale=5.0, - grid_cell_offset=0.5, - reg_max=16, - reg_range=None, - static_assigner_epoch=4, - use_varifocal_loss=True, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - nms='MultiClassNMS', - eval_size=None, - loss_weight={ - 'class': 1.0, - 'iou': 2.5, - 'dfl': 0.5, - }, - trt=False, - attn_conv='convbn', - exclude_nms=False, - exclude_post_process=False, - use_shared_conv=True, - for_distill=False): - super(PPYOLOEHead, self).__init__() - assert len(in_channels) > 0, "len(in_channels) should > 0" - self.in_channels = in_channels - self.num_classes = num_classes - self.fpn_strides = fpn_strides - self.grid_cell_scale = grid_cell_scale - self.grid_cell_offset = grid_cell_offset - if reg_range: - self.sm_use = True - self.reg_range = reg_range - else: - self.sm_use = False - self.reg_range = (0, reg_max + 1) - self.reg_channels = self.reg_range[1] - self.reg_range[0] - self.iou_loss = GIoULoss() - self.loss_weight = loss_weight - self.use_varifocal_loss = use_varifocal_loss - self.eval_size = eval_size - - self.static_assigner_epoch = static_assigner_epoch - self.static_assigner = static_assigner - self.assigner = assigner - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.exclude_nms = exclude_nms - self.exclude_post_process = exclude_post_process - self.use_shared_conv = use_shared_conv - self.for_distill = for_distill - self.is_teacher = False - - # stem - self.stem_cls = nn.LayerList() - self.stem_reg = nn.LayerList() - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - for in_c in self.in_channels: - self.stem_cls.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) - self.stem_reg.append(ESEAttn(in_c, act=act, attn_conv=attn_conv)) - # pred head - self.pred_cls = nn.LayerList() - self.pred_reg = nn.LayerList() 
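The ESEAttn block defined just above gates the full-resolution feature with a channel weight computed from its global average pool. A shape-level NumPy sketch (the 1x1 conv is written as a plain matrix; the trailing ConvBN/RepVgg stage is omitted):

import numpy as np

def ese_attn(feat, fc_weight, fc_bias):
    """feat: (C, H, W); fc_weight: (C, C) 1x1-conv weight as a matrix."""
    avg = feat.mean(axis=(1, 2))                                # global average pool -> (C,)
    gate = 1.0 / (1.0 + np.exp(-(fc_weight @ avg + fc_bias)))   # sigmoid(fc(avg))
    return feat * gate[:, None, None]                           # per-channel re-weighting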
- for in_c in self.in_channels: - self.pred_cls.append( - nn.Conv2D( - in_c, self.num_classes, 3, padding=1)) - self.pred_reg.append( - nn.Conv2D( - in_c, 4 * self.reg_channels, 3, padding=1)) - # projection conv - self.proj_conv = nn.Conv2D(self.reg_channels, 1, 1, bias_attr=False) - self.proj_conv.skip_quant = True - self._init_weights() - - if self.for_distill: - self.distill_pairs = {} - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - for cls_, reg_ in zip(self.pred_cls, self.pred_reg): - constant_(cls_.weight) - constant_(cls_.bias, bias_cls) - constant_(reg_.weight) - constant_(reg_.bias, 1.0) - - proj = paddle.linspace(self.reg_range[0], self.reg_range[1] - 1, - self.reg_channels).reshape( - [1, self.reg_channels, 1, 1]) - self.proj_conv.weight.set_value(proj) - self.proj_conv.weight.stop_gradient = True - if self.eval_size: - anchor_points, stride_tensor = self._generate_anchors() - self.anchor_points = anchor_points - self.stride_tensor = stride_tensor - - def forward_train(self, feats, targets, aux_pred=None): - anchors, anchor_points, num_anchors_list, stride_tensor = \ - generate_anchors_for_grid_cell( - feats, self.fpn_strides, self.grid_cell_scale, - self.grid_cell_offset) - - cls_score_list, reg_distri_list = [], [] - for i, feat in enumerate(feats): - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_distri = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_distri_list.append(reg_distri.flatten(2).transpose([0, 2, 1])) - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_distri_list = paddle.concat(reg_distri_list, axis=1) - - if targets.get('is_teacher', False): - pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) - return cls_score_list, pred_deltas * stride_tensor, pred_dfls - - if targets.get('get_data', False): - pred_deltas, pred_dfls = self._bbox_decode_fake(reg_distri_list) - return cls_score_list, pred_deltas * stride_tensor, pred_dfls - - return self.get_loss([ - cls_score_list, reg_distri_list, anchors, anchor_points, - num_anchors_list, stride_tensor - ], targets, aux_pred) - - def _generate_anchors(self, feats=None, dtype='float32'): - # just use in eval time - anchor_points = [] - stride_tensor = [] - for i, stride in enumerate(self.fpn_strides): - if feats is not None: - _, _, h, w = feats[i].shape - else: - h = int(self.eval_size[0] / stride) - w = int(self.eval_size[1] / stride) - shift_x = paddle.arange(end=w) + self.grid_cell_offset - shift_y = paddle.arange(end=h) + self.grid_cell_offset - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype=dtype) - anchor_points.append(anchor_point.reshape([-1, 2])) - stride_tensor.append(paddle.full([h * w, 1], stride, dtype=dtype)) - anchor_points = paddle.concat(anchor_points) - stride_tensor = paddle.concat(stride_tensor) - return anchor_points, stride_tensor - - def forward_eval(self, feats): - if self.eval_size: - anchor_points, stride_tensor = self.anchor_points, self.stride_tensor - else: - anchor_points, stride_tensor = self._generate_anchors(feats) - cls_score_list, reg_dist_list = [], [] - for i, feat in enumerate(feats): - _, _, h, w = feat.shape - l = h * w - 
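`proj_conv`, initialized above with a frozen linspace kernel, turns the discretized distance logits into their expected value (the DFL decode). A NumPy equivalent, using the default `reg_range=(0, 17)` implied by `reg_max=16`:

import numpy as np

def dfl_expectation(reg_logits, reg_range=(0, 17)):
    """Softmax over the distance bins, then the expected bin value.
    reg_logits: (..., n_bins); returns distances in grid units."""
    bins = np.linspace(reg_range[0], reg_range[1] - 1,
                       reg_range[1] - reg_range[0])
    p = np.exp(reg_logits - reg_logits.max(-1, keepdims=True))  # stable softmax
    p /= p.sum(-1, keepdims=True)
    return (p * bins).sum(-1)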
avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - reg_dist = reg_dist.reshape( - [-1, 4, self.reg_channels, l]).transpose([0, 2, 3, 1]) - if self.use_shared_conv: - reg_dist = self.proj_conv(F.softmax( - reg_dist, axis=1)).squeeze(1) - else: - reg_dist = F.softmax(reg_dist, axis=1) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.reshape([-1, self.num_classes, l])) - reg_dist_list.append(reg_dist) - - cls_score_list = paddle.concat(cls_score_list, axis=-1) - if self.use_shared_conv: - reg_dist_list = paddle.concat(reg_dist_list, axis=1) - else: - reg_dist_list = paddle.concat(reg_dist_list, axis=2) - reg_dist_list = self.proj_conv(reg_dist_list).squeeze(1) - - return cls_score_list, reg_dist_list, anchor_points, stride_tensor - - def forward(self, feats, targets=None, aux_pred=None): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - if self.training: - return self.forward_train(feats, targets, aux_pred) - else: - if targets is not None: - # only for semi-det - self.is_teacher = targets.get('is_teacher', False) - if self.is_teacher: - return self.forward_train(feats, targets, aux_pred=None) - else: - return self.forward_eval(feats) - - return self.forward_eval(feats) - - @staticmethod - def _focal_loss(score, label, alpha=0.25, gamma=2.0): - weight = (score - label).pow(gamma) - if alpha > 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - weight *= alpha_t - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction='sum') - return loss - - @staticmethod - def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): - weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label - loss = F.binary_cross_entropy( - pred_score, gt_score, weight=weight, reduction='sum') - return loss - - def _bbox_decode(self, anchor_points, pred_dist): - _, l, _ = get_static_shape(pred_dist) - pred_dist = F.softmax(pred_dist.reshape([-1, l, 4, self.reg_channels])) - pred_dist = self.proj_conv(pred_dist.transpose([0, 3, 1, 2])).squeeze(1) - return batch_distance2bbox(anchor_points, pred_dist) - - def _bbox_decode_fake(self, pred_dist): - _, l, _ = get_static_shape(pred_dist) - pred_dist_dfl = F.softmax( - pred_dist.reshape([-1, l, 4, self.reg_channels])) - pred_dist = self.proj_conv(pred_dist_dfl.transpose([0, 3, 1, 2 - ])).squeeze(1) - return pred_dist, pred_dist_dfl - - def _bbox2distance(self, points, bbox): - x1y1, x2y2 = paddle.split(bbox, 2, -1) - lt = points - x1y1 - rb = x2y2 - points - return paddle.concat([lt, rb], -1).clip(self.reg_range[0], - self.reg_range[1] - 1 - 0.01) - - def _df_loss(self, pred_dist, target, lower_bound=0): - target_left = paddle.cast(target.floor(), 'int64') - target_right = target_left + 1 - weight_left = target_right.astype('float32') - target - weight_right = 1 - weight_left - loss_left = F.cross_entropy( - pred_dist, target_left - lower_bound, - reduction='none') * weight_left - loss_right = F.cross_entropy( - pred_dist, target_right - lower_bound, - reduction='none') * weight_right - return (loss_left + loss_right).mean(-1, keepdim=True) - - def _bbox_loss(self, pred_dist, pred_bboxes, anchor_points, assigned_labels, - assigned_bboxes, assigned_scores, assigned_scores_sum): - # select positive samples mask - mask_positive = (assigned_labels != self.num_classes) - - if self.for_distill: - # only used 
for LD main_kd distill - self.distill_pairs['mask_positive_select'] = mask_positive - - num_pos = mask_positive.sum() - # pos/neg loss - if num_pos > 0: - # l1 + iou - bbox_mask = mask_positive.astype('int32').unsqueeze(-1).tile( - [1, 1, 4]).astype('bool') - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 4]) - assigned_bboxes_pos = paddle.masked_select( - assigned_bboxes, bbox_mask).reshape([-1, 4]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).unsqueeze(-1) - - loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) - - loss_iou = self.iou_loss(pred_bboxes_pos, - assigned_bboxes_pos) * bbox_weight - loss_iou = loss_iou.sum() / assigned_scores_sum - - dist_mask = mask_positive.unsqueeze(-1).astype('int32').tile( - [1, 1, self.reg_channels * 4]).astype('bool') - pred_dist_pos = paddle.masked_select( - pred_dist, dist_mask).reshape([-1, 4, self.reg_channels]) - assigned_ltrb = self._bbox2distance(anchor_points, assigned_bboxes) - assigned_ltrb_pos = paddle.masked_select( - assigned_ltrb, bbox_mask).reshape([-1, 4]) - loss_dfl = self._df_loss(pred_dist_pos, assigned_ltrb_pos, - self.reg_range[0]) * bbox_weight - loss_dfl = loss_dfl.sum() / assigned_scores_sum - if self.for_distill: - self.distill_pairs['pred_bboxes_pos'] = pred_bboxes_pos - self.distill_pairs['pred_dist_pos'] = pred_dist_pos - self.distill_pairs['bbox_weight'] = bbox_weight - else: - loss_l1 = paddle.zeros([1]) - loss_iou = paddle.zeros([1]) - loss_dfl = pred_dist.sum() * 0. - return loss_l1, loss_iou, loss_dfl - - def get_loss(self, head_outs, gt_meta, aux_pred=None): - pred_scores, pred_distri, anchors,\ - anchor_points, num_anchors_list, stride_tensor = head_outs - - anchor_points_s = anchor_points / stride_tensor - pred_bboxes = self._bbox_decode(anchor_points_s, pred_distri) - - if aux_pred is not None: - pred_scores_aux = aux_pred[0] - pred_bboxes_aux = self._bbox_decode(anchor_points_s, aux_pred[1]) - - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes, - pred_bboxes=pred_bboxes.detach() * stride_tensor) - alpha_l = 0.25 - else: - if self.sm_use: - # only used in smalldet of PPYOLOE-SOD model - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - stride_tensor, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - else: - if aux_pred is None: - if not hasattr(self, "assigned_labels"): - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - if self.for_distill: - self.assigned_labels = assigned_labels - self.assigned_bboxes = assigned_bboxes - self.assigned_scores = assigned_scores - - else: - # only used in distill - assigned_labels = self.assigned_labels - assigned_bboxes = self.assigned_bboxes - assigned_scores = self.assigned_scores - - else: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores_aux.detach(), - pred_bboxes_aux.detach() * stride_tensor, - anchor_points, - num_anchors_list, - gt_labels, - 
gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - # rescale bbox - assigned_bboxes /= stride_tensor - - assign_out_dict = self.get_loss_from_assign( - pred_scores, pred_distri, pred_bboxes, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, alpha_l) - - if aux_pred is not None: - assign_out_dict_aux = self.get_loss_from_assign( - aux_pred[0], aux_pred[1], pred_bboxes_aux, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, alpha_l) - loss = {} - for key in assign_out_dict.keys(): - loss[key] = assign_out_dict[key] + assign_out_dict_aux[key] - else: - loss = assign_out_dict - - return loss - - def get_loss_from_assign(self, pred_scores, pred_distri, pred_bboxes, - anchor_points_s, assigned_labels, assigned_bboxes, - assigned_scores, alpha_l): - # cls loss - if self.use_varifocal_loss: - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - loss_cls = self._varifocal_loss(pred_scores, assigned_scores, - one_hot_label) - else: - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) - - assigned_scores_sum = assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(assigned_scores_sum) - assigned_scores_sum /= paddle.distributed.get_world_size() - assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) - loss_cls /= assigned_scores_sum - - if self.for_distill: - self.distill_pairs['pred_cls_scores'] = pred_scores - self.distill_pairs['pos_num'] = assigned_scores_sum - self.distill_pairs['assigned_scores'] = assigned_scores - - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - self.distill_pairs['target_labels'] = one_hot_label - - loss_l1, loss_iou, loss_dfl = \ - self._bbox_loss(pred_distri, pred_bboxes, anchor_points_s, - assigned_labels, assigned_bboxes, assigned_scores, - assigned_scores_sum) - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou + \ - self.loss_weight['dfl'] * loss_dfl - out_dict = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_iou': loss_iou, - 'loss_dfl': loss_dfl, - 'loss_l1': loss_l1, - } - return out_dict - - def post_process(self, head_outs, scale_factor): - pred_scores, pred_dist, anchor_points, stride_tensor = head_outs - pred_bboxes = batch_distance2bbox(anchor_points, pred_dist) - pred_bboxes *= stride_tensor - if self.exclude_post_process: - return paddle.concat( - [pred_bboxes, pred_scores.transpose([0, 2, 1])], - axis=-1), None, None - else: - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], - axis=-1).reshape([-1, 1, 4]) - pred_bboxes /= scale_factor - if self.exclude_nms: - # `exclude_nms=True` just use in benchmark - return pred_bboxes, pred_scores, None - else: - bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, - pred_scores) - return bbox_pred, bbox_num, nms_keep_idx - - -def get_activation(name="LeakyReLU"): - if name == "silu": - module = nn.Silu() - elif name == "relu": - module = nn.ReLU() - elif name in ["LeakyReLU", 'leakyrelu', 'lrelu']: - module = nn.LeakyReLU(0.1) - elif name is None: - module = nn.Identity() - else: - raise AttributeError("Unsupported act type: {}".format(name)) - return module - - -class ConvNormLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - norm_type='gn', - activation="LeakyReLU"): - super(ConvNormLayer, 
self).__init__() - assert norm_type in ['bn', 'sync_bn', 'syncbn', 'gn', None] - self.conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias_attr=False, - weight_attr=ParamAttr(initializer=KaimingNormal())) - - if norm_type in ['bn', 'sync_bn', 'syncbn']: - self.norm = nn.BatchNorm2D(out_channels) - elif norm_type == 'gn': - self.norm = nn.GroupNorm(num_groups=32, num_channels=out_channels) - else: - self.norm = None - - self.act = get_activation(activation) - - def forward(self, x): - y = self.conv(x) - if self.norm is not None: - y = self.norm(y) - y = self.act(y) - return y - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self, scale=1.0): - super(ScaleReg, self).__init__() - scale = paddle.to_tensor(scale) - self.scale = self.create_parameter( - shape=[1], - dtype='float32', - default_initializer=nn.initializer.Assign(scale)) - - def forward(self, x): - return x * self.scale - - -@register -class SimpleConvHead(nn.Layer): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - feat_in=288, - feat_out=288, - num_convs=1, - fpn_strides=[32, 16, 8, 4], - norm_type='gn', - act='LeakyReLU', - prior_prob=0.01, - reg_max=16): - super(SimpleConvHead, self).__init__() - self.num_classes = num_classes - self.feat_in = feat_in - self.feat_out = feat_out - self.num_convs = num_convs - self.fpn_strides = fpn_strides - self.reg_max = reg_max - - self.cls_convs = nn.LayerList() - self.reg_convs = nn.LayerList() - for i in range(self.num_convs): - in_c = feat_in if i == 0 else feat_out - self.cls_convs.append( - ConvNormLayer( - in_c, - feat_out, - 3, - stride=1, - padding=1, - norm_type=norm_type, - activation=act)) - self.reg_convs.append( - ConvNormLayer( - in_c, - feat_out, - 3, - stride=1, - padding=1, - norm_type=norm_type, - activation=act)) - - bias_cls = bias_init_with_prob(prior_prob) - self.gfl_cls = nn.Conv2D( - feat_out, - self.num_classes, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=bias_cls))) - self.gfl_reg = nn.Conv2D( - feat_out, - 4 * (self.reg_max + 1), - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0))) - - self.scales = nn.LayerList() - for i in range(len(self.fpn_strides)): - self.scales.append(ScaleReg(1.0)) - - def forward(self, feats): - cls_scores = [] - bbox_preds = [] - for x, scale in zip(feats, self.scales): - cls_feat = x - reg_feat = x - for cls_conv in self.cls_convs: - cls_feat = cls_conv(cls_feat) - for reg_conv in self.reg_convs: - reg_feat = reg_conv(reg_feat) - - cls_score = self.gfl_cls(cls_feat) - cls_score = F.sigmoid(cls_score) - cls_score = cls_score.flatten(2).transpose([0, 2, 1]) - cls_scores.append(cls_score) - - bbox_pred = scale(self.gfl_reg(reg_feat)) - bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) - bbox_preds.append(bbox_pred) - - cls_scores = paddle.concat(cls_scores, axis=1) - bbox_preds = paddle.concat(bbox_preds, axis=1) - return cls_scores, bbox_preds diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_r_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_r_head.py deleted file mode 100644 index e7cf772..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ppyoloe_r_head.py +++ /dev/null @@ -1,425 +0,0 @@ -# Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ..losses import ProbIoULoss -from ..initializer import bias_init_with_prob, constant_, normal_, vector_ -from ppdet.modeling.backbones.cspresnet import ConvBNLayer -from ppdet.modeling.ops import get_static_shape, get_act_fn, anchor_generator -from ppdet.modeling.layers import MultiClassNMS - -__all__ = ['PPYOLOERHead'] - - -class ESEAttn(nn.Layer): - def __init__(self, feat_channels, act='swish'): - super(ESEAttn, self).__init__() - self.fc = nn.Conv2D(feat_channels, feat_channels, 1) - self.conv = ConvBNLayer(feat_channels, feat_channels, 1, act=act) - - self._init_weights() - - def _init_weights(self): - normal_(self.fc.weight, std=0.01) - - def forward(self, feat, avg_feat): - weight = F.sigmoid(self.fc(avg_feat)) - return self.conv(feat * weight) - - -@register -class PPYOLOERHead(nn.Layer): - __shared__ = ['num_classes', 'trt', 'export_onnx'] - __inject__ = ['static_assigner', 'assigner', 'nms'] - - def __init__(self, - in_channels=[1024, 512, 256], - num_classes=15, - act='swish', - fpn_strides=(32, 16, 8), - grid_cell_offset=0.5, - angle_max=90, - use_varifocal_loss=True, - static_assigner_epoch=4, - trt=False, - export_onnx=False, - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner', - nms='MultiClassNMS', - loss_weight={'class': 1.0, - 'iou': 2.5, - 'dfl': 0.05}): - super(PPYOLOERHead, self).__init__() - assert len(in_channels) > 0, "len(in_channels) should > 0" - self.in_channels = in_channels - self.num_classes = num_classes - self.fpn_strides = fpn_strides - self.grid_cell_offset = grid_cell_offset - self.angle_max = angle_max - self.loss_weight = loss_weight - self.use_varifocal_loss = use_varifocal_loss - self.half_pi = paddle.to_tensor( - [1.5707963267948966], dtype=paddle.float32) - self.half_pi_bin = self.half_pi / angle_max - self.iou_loss = ProbIoULoss() - self.static_assigner_epoch = static_assigner_epoch - self.static_assigner = static_assigner - self.assigner = assigner - self.nms = nms - # stem - self.stem_cls = nn.LayerList() - self.stem_reg = nn.LayerList() - self.stem_angle = nn.LayerList() - trt = False if export_onnx else trt - self.export_onnx = export_onnx - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.trt = trt - for in_c in self.in_channels: - self.stem_cls.append(ESEAttn(in_c, act=act)) - self.stem_reg.append(ESEAttn(in_c, act=act)) - self.stem_angle.append(ESEAttn(in_c, act=act)) - # pred head - self.pred_cls = nn.LayerList() - self.pred_reg = nn.LayerList() - self.pred_angle = nn.LayerList() - for in_c in self.in_channels: - self.pred_cls.append( - nn.Conv2D( - in_c, self.num_classes, 3, padding=1)) - self.pred_reg.append(nn.Conv2D(in_c, 4, 3, padding=1)) - self.pred_angle.append( - nn.Conv2D( - in_c, self.angle_max + 1, 3, padding=1)) - self.angle_proj_conv = nn.Conv2D( - 
self.angle_max + 1, 1, 1, bias_attr=False) - self._init_weights() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - bias_angle = [10.] + [1.] * self.angle_max - for cls_, reg_, angle_ in zip(self.pred_cls, self.pred_reg, - self.pred_angle): - normal_(cls_.weight, std=0.01) - constant_(cls_.bias, bias_cls) - normal_(reg_.weight, std=0.01) - constant_(reg_.bias) - constant_(angle_.weight) - vector_(angle_.bias, bias_angle) - - angle_proj = paddle.linspace(0, self.angle_max, self.angle_max + 1) - self.angle_proj = angle_proj * self.half_pi_bin - self.angle_proj_conv.weight.set_value( - self.angle_proj.reshape([1, self.angle_max + 1, 1, 1])) - self.angle_proj_conv.weight.stop_gradient = True - - def _generate_anchors(self, feats): - if self.trt: - anchor_points = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - anchor, _ = anchor_generator( - feat, - stride * 4, - 1.0, [1.0, 1.0, 1.0, 1.0], [stride, stride], - offset=0.5) - x1, y1, x2, y2 = paddle.split(anchor, 4, axis=-1) - xc = (x1 + x2 + 1) / 2 - yc = (y1 + y2 + 1) / 2 - anchor_point = paddle.concat( - [xc, yc], axis=-1).reshape((1, h * w, 2)) - anchor_points.append(anchor_point) - anchor_points = paddle.concat(anchor_points, axis=1) - return anchor_points, None, None - else: - anchor_points = [] - stride_tensor = [] - num_anchors_list = [] - for feat, stride in zip(feats, self.fpn_strides): - _, _, h, w = paddle.shape(feat) - shift_x = (paddle.arange(end=w) + 0.5) * stride - shift_y = (paddle.arange(end=h) + 0.5) * stride - shift_y, shift_x = paddle.meshgrid(shift_y, shift_x) - anchor_point = paddle.cast( - paddle.stack( - [shift_x, shift_y], axis=-1), dtype='float32') - anchor_points.append(anchor_point.reshape([1, -1, 2])) - stride_tensor.append( - paddle.full( - [1, h * w, 1], stride, dtype='float32')) - num_anchors_list.append(h * w) - anchor_points = paddle.concat(anchor_points, axis=1) - stride_tensor = paddle.concat(stride_tensor, axis=1) - return anchor_points, stride_tensor, num_anchors_list - - def forward(self, feats, targets=None): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - if self.training: - return self.forward_train(feats, targets) - else: - return self.forward_eval(feats) - - def forward_train(self, feats, targets): - anchor_points, stride_tensor, num_anchors_list = self._generate_anchors( - feats) - - cls_score_list, reg_dist_list, reg_angle_list = [], [], [] - for i, feat in enumerate(feats): - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - reg_dist_list.append(reg_dist.flatten(2).transpose([0, 2, 1])) - reg_angle_list.append(reg_angle.flatten(2).transpose([0, 2, 1])) - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_dist_list = paddle.concat(reg_dist_list, axis=1) - reg_angle_list = paddle.concat(reg_angle_list, axis=1) - - return self.get_loss([ - cls_score_list, reg_dist_list, reg_angle_list, anchor_points, - num_anchors_list, stride_tensor - ], targets) - - def forward_eval(self, feats): - cls_score_list, reg_box_list = [], [] - 
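The frozen `angle_proj_conv` set up above decodes the rotated head's angle the same way as the DFL decode: softmax over `angle_max + 1` bins, then the expected bin index scaled to radians. A NumPy sketch with the default `angle_max=90`:

import numpy as np

def angle_expectation(angle_logits, angle_max=90):
    """angle_logits: (..., angle_max + 1); returns angles in [0, pi/2]."""
    half_pi_bin = (np.pi / 2) / angle_max
    proj = np.arange(angle_max + 1) * half_pi_bin   # the frozen conv kernel
    p = np.exp(angle_logits - angle_logits.max(-1, keepdims=True))
    p /= p.sum(-1, keepdims=True)
    return (p * proj).sum(-1)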
anchor_points, _, _ = self._generate_anchors(feats) - for i, (feat, stride) in enumerate(zip(feats, self.fpn_strides)): - b, _, h, w = paddle.shape(feat) - l = h * w - # cls - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_logit = self.pred_cls[i](self.stem_cls[i](feat, avg_feat) + - feat) - # reg - reg_dist = self.pred_reg[i](self.stem_reg[i](feat, avg_feat)) - reg_xy, reg_wh = paddle.split(reg_dist, 2, axis=1) - reg_xy = reg_xy * stride - reg_wh = (F.elu(reg_wh) + 1.) * stride - reg_angle = self.pred_angle[i](self.stem_angle[i](feat, avg_feat)) - reg_angle = self.angle_proj_conv(F.softmax(reg_angle, axis=1)) - reg_box = paddle.concat([reg_xy, reg_wh, reg_angle], axis=1) - # cls and reg - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.reshape([b, self.num_classes, l])) - reg_box_list.append(reg_box.reshape([b, 5, l])) - - cls_score_list = paddle.concat(cls_score_list, axis=-1) - reg_box_list = paddle.concat(reg_box_list, axis=-1).transpose([0, 2, 1]) - reg_xy, reg_wha = paddle.split(reg_box_list, [2, 3], axis=-1) - reg_xy = reg_xy + anchor_points - reg_box_list = paddle.concat([reg_xy, reg_wha], axis=-1) - return cls_score_list, reg_box_list - - def _bbox_decode(self, points, pred_dist, pred_angle, stride_tensor): - # predict vector to x, y, w, h, angle - b, l = pred_angle.shape[:2] - xy, wh = paddle.split(pred_dist, 2, axis=-1) - xy = xy * stride_tensor + points - wh = (F.elu(wh) + 1.) * stride_tensor - angle = F.softmax(pred_angle.reshape([b, l, 1, self.angle_max + 1 - ])).matmul(self.angle_proj) - return paddle.concat([xy, wh, angle], axis=-1) - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_dist, pred_angle, \ - anchor_points, num_anchors_list, stride_tensor = head_outs - # [B, N, 5] -> [B, N, 5] - pred_bboxes = self._bbox_decode(anchor_points, pred_dist, pred_angle, - stride_tensor) - gt_labels = gt_meta['gt_class'] - # [B, N, 5] - gt_bboxes = gt_meta['gt_rbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.static_assigner( - anchor_points, - stride_tensor, - num_anchors_list, - gt_labels, - gt_meta['gt_bbox'], - gt_bboxes, - pad_gt_mask, - self.num_classes, - pred_bboxes.detach() - ) - else: - assigned_labels, assigned_bboxes, assigned_scores = \ - self.assigner( - pred_scores.detach(), - pred_bboxes.detach(), - anchor_points, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - # cls loss - if self.use_varifocal_loss: - one_hot_label = F.one_hot(assigned_labels, - self.num_classes + 1)[..., :-1] - loss_cls = self._varifocal_loss(pred_scores, assigned_scores, - one_hot_label) - else: - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha_l) - - assigned_scores_sum = assigned_scores.sum() - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(assigned_scores_sum) - assigned_scores_sum = paddle.clip( - assigned_scores_sum / paddle.distributed.get_world_size(), - min=1.) - else: - assigned_scores_sum = paddle.clip(assigned_scores_sum, min=1.) 
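`forward_eval` and `_bbox_decode` above recover rotated-box centers and sizes from stride-normalized offsets, passing sizes through elu+1 so they stay positive. A small NumPy stand-in (array shapes left implicit):

import numpy as np

def decode_rbox_xywh(xy_pred, wh_pred, points, stride):
    """Center offsets are scaled by stride and added to the grid point;
    widths/heights go through elu(x) + 1 to stay positive."""
    elu = np.where(wh_pred > 0, wh_pred, np.expm1(np.minimum(wh_pred, 0)))
    xy = xy_pred * stride + points
    wh = (elu + 1.0) * stride
    return xy, wh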
- loss_cls /= assigned_scores_sum - - loss_iou, loss_dfl = self._bbox_loss(pred_angle, pred_bboxes, - anchor_points, assigned_labels, - assigned_bboxes, assigned_scores, - assigned_scores_sum, stride_tensor) - - loss = self.loss_weight['class'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou + \ - self.loss_weight['dfl'] * loss_dfl - out_dict = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_iou': loss_iou, - 'loss_dfl': loss_dfl - } - return out_dict - - @staticmethod - def _focal_loss(score, label, alpha=0.25, gamma=2.0): - weight = (score - label).pow(gamma) - if alpha > 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - weight *= alpha_t - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction='sum') - return loss - - @staticmethod - def _varifocal_loss(pred_score, gt_score, label, alpha=0.75, gamma=2.0): - weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label - loss = F.binary_cross_entropy( - pred_score, gt_score, weight=weight, reduction='sum') - return loss - - @staticmethod - def _df_loss(pred_dist, target): - target_left = paddle.cast(target, 'int64') - target_right = target_left + 1 - weight_left = target_right.astype('float32') - target - weight_right = 1 - weight_left - loss_left = F.cross_entropy( - pred_dist, target_left, reduction='none') * weight_left - loss_right = F.cross_entropy( - pred_dist, target_right, reduction='none') * weight_right - return (loss_left + loss_right).mean(-1, keepdim=True) - - def _bbox_loss(self, pred_angle, pred_bboxes, anchor_points, - assigned_labels, assigned_bboxes, assigned_scores, - assigned_scores_sum, stride_tensor): - # select positive samples mask - mask_positive = (assigned_labels != self.num_classes) - num_pos = mask_positive.sum() - # pos/neg loss - if num_pos > 0: - # iou - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 5]) - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 5]) - assigned_bboxes_pos = paddle.masked_select( - assigned_bboxes, bbox_mask).reshape([-1, 5]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).reshape([-1]) - - loss_iou = self.iou_loss(pred_bboxes_pos, - assigned_bboxes_pos) * bbox_weight - loss_iou = loss_iou.sum() / assigned_scores_sum - - # dfl - angle_mask = mask_positive.unsqueeze(-1).tile( - [1, 1, self.angle_max + 1]) - pred_angle_pos = paddle.masked_select( - pred_angle, angle_mask).reshape([-1, self.angle_max + 1]) - assigned_angle_pos = ( - assigned_bboxes_pos[:, 4] / - self.half_pi_bin).clip(0, self.angle_max - 0.01) - loss_dfl = self._df_loss(pred_angle_pos, assigned_angle_pos) - else: - loss_iou = pred_bboxes.sum() * 0. 
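`_df_loss` above supervises each fractional distance target by splitting it across its two neighbouring integer bins with linear weights (targets are non-negative, so the integer cast and floor agree). An illustrative NumPy helper:

import numpy as np

def df_loss_weights(target):
    """e.g. a target of 3.3 puts weight 0.7 on bin 3 and 0.3 on bin 4."""
    left = np.floor(target).astype(int)
    right = left + 1
    w_left = right - target
    w_right = 1.0 - w_left
    return (left, w_left), (right, w_right)

(l, wl), (r, wr) = df_loss_weights(np.array([3.3]))
print(l, wl, r, wr)   # bins (3, 4) with weights (~0.7, ~0.3)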
- loss_dfl = paddle.zeros([1]) - - return loss_iou, loss_dfl - - def _box2corners(self, pred_bboxes): - """ convert (x, y, w, h, angle) to (x1, y1, x2, y2, x3, y3, x4, y4) - - Args: - pred_bboxes (Tensor): [B, N, 5] - - Returns: - polys (Tensor): [B, N, 8] - """ - x, y, w, h, angle = paddle.split(pred_bboxes, 5, axis=-1) - cos_a_half = paddle.cos(angle) * 0.5 - sin_a_half = paddle.sin(angle) * 0.5 - w_x = cos_a_half * w - w_y = sin_a_half * w - h_x = -sin_a_half * h - h_y = cos_a_half * h - return paddle.concat( - [ - x + w_x + h_x, y + w_y + h_y, x - w_x + h_x, y - w_y + h_y, - x - w_x - h_x, y - w_y - h_y, x + w_x - h_x, y + w_y - h_y - ], - axis=-1) - - def post_process(self, head_outs, scale_factor): - pred_scores, pred_bboxes = head_outs - # [B, N, 5] -> [B, N, 8] - pred_bboxes = self._box2corners(pred_bboxes) - # scale bbox to origin - scale_y, scale_x = paddle.split(scale_factor, 2, axis=-1) - scale_factor = paddle.concat( - [ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ], - axis=-1).reshape([-1, 1, 8]) - pred_bboxes /= scale_factor - if self.export_onnx: - return pred_bboxes, pred_scores, None - bbox_pred, bbox_num, nms_keep_idx = self.nms(pred_bboxes, - pred_scores) - return bbox_pred, bbox_num, nms_keep_idx diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/retina_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/retina_head.py deleted file mode 100644 index 67a5126..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/retina_head.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant -from ppdet.modeling.bbox_utils import bbox2delta, delta2bbox -from ppdet.modeling.heads.fcos_head import FCOSFeat - -from ppdet.core.workspace import register - -__all__ = ['RetinaHead'] - - -@register -class RetinaFeat(FCOSFeat): - """We use FCOSFeat to construct conv layers in RetinaNet. - We rename FCOSFeat to RetinaFeat to avoid confusion. 
- """ - pass - - -@register -class RetinaHead(nn.Layer): - """Used in RetinaNet proposed in paper https://arxiv.org/pdf/1708.02002.pdf - """ - __shared__ = ['num_classes'] - __inject__ = [ - 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', - 'loss_bbox', 'nms' - ] - - def __init__(self, - num_classes=80, - conv_feat='RetinaFeat', - anchor_generator='RetinaAnchorGenerator', - bbox_assigner='MaxIoUAssigner', - loss_class='FocalLoss', - loss_bbox='SmoothL1Loss', - nms='MultiClassNMS', - prior_prob=0.01, - nms_pre=1000, - weights=[1., 1., 1., 1.]): - super(RetinaHead, self).__init__() - self.num_classes = num_classes - self.conv_feat = conv_feat - self.anchor_generator = anchor_generator - self.bbox_assigner = bbox_assigner - self.loss_class = loss_class - self.loss_bbox = loss_bbox - self.nms = nms - self.nms_pre = nms_pre - self.weights = weights - - bias_init_value = -math.log((1 - prior_prob) / prior_prob) - num_anchors = self.anchor_generator.num_anchors - self.retina_cls = nn.Conv2D( - in_channels=self.conv_feat.feat_out, - out_channels=self.num_classes * num_anchors, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=bias_init_value))) - self.retina_reg = nn.Conv2D( - in_channels=self.conv_feat.feat_out, - out_channels=4 * num_anchors, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0))) - - def forward(self, neck_feats, targets=None): - cls_logits_list = [] - bboxes_reg_list = [] - for neck_feat in neck_feats: - conv_cls_feat, conv_reg_feat = self.conv_feat(neck_feat) - cls_logits = self.retina_cls(conv_cls_feat) - bbox_reg = self.retina_reg(conv_reg_feat) - cls_logits_list.append(cls_logits) - bboxes_reg_list.append(bbox_reg) - - if self.training: - return self.get_loss([cls_logits_list, bboxes_reg_list], targets) - else: - return [cls_logits_list, bboxes_reg_list] - - def get_loss(self, head_outputs, targets): - """Here we calculate loss for a batch of images. - We assign anchors to gts in each image and gather all the assigned - postive and negative samples. Then loss is calculated on the gathered - samples. 
- """ - cls_logits_list, bboxes_reg_list = head_outputs - anchors = self.anchor_generator(cls_logits_list) - anchors = paddle.concat(anchors) - - # matches: contain gt_inds - # match_labels: -1(ignore), 0(neg) or 1(pos) - matches_list, match_labels_list = [], [] - # assign anchors to gts, no sampling is involved - for gt_bbox in targets['gt_bbox']: - matches, match_labels = self.bbox_assigner(anchors, gt_bbox) - matches_list.append(matches) - match_labels_list.append(match_labels) - - # reshape network outputs - cls_logits = [ - _.transpose([0, 2, 3, 1]).reshape([0, -1, self.num_classes]) - for _ in cls_logits_list - ] - bboxes_reg = [ - _.transpose([0, 2, 3, 1]).reshape([0, -1, 4]) - for _ in bboxes_reg_list - ] - cls_logits = paddle.concat(cls_logits, axis=1) - bboxes_reg = paddle.concat(bboxes_reg, axis=1) - - cls_pred_list, cls_tar_list = [], [] - reg_pred_list, reg_tar_list = [], [] - # find and gather preds and targets in each image - for matches, match_labels, cls_logit, bbox_reg, gt_bbox, gt_class in \ - zip(matches_list, match_labels_list, cls_logits, bboxes_reg, - targets['gt_bbox'], targets['gt_class']): - pos_mask = (match_labels == 1) - neg_mask = (match_labels == 0) - chosen_mask = paddle.logical_or(pos_mask, neg_mask) - - gt_class = gt_class.reshape([-1]) - bg_class = paddle.to_tensor( - [self.num_classes], dtype=gt_class.dtype) - # a trick to assign num_classes to negative targets - gt_class = paddle.concat([gt_class, bg_class], axis=-1) - matches = paddle.where(neg_mask, - paddle.full_like(matches, gt_class.size - 1), - matches) - - cls_pred = cls_logit[chosen_mask] - cls_tar = gt_class[matches[chosen_mask]] - reg_pred = bbox_reg[pos_mask].reshape([-1, 4]) - reg_tar = gt_bbox[matches[pos_mask]].reshape([-1, 4]) - reg_tar = bbox2delta(anchors[pos_mask], reg_tar, self.weights) - cls_pred_list.append(cls_pred) - cls_tar_list.append(cls_tar) - reg_pred_list.append(reg_pred) - reg_tar_list.append(reg_tar) - cls_pred = paddle.concat(cls_pred_list) - cls_tar = paddle.concat(cls_tar_list) - reg_pred = paddle.concat(reg_pred_list) - reg_tar = paddle.concat(reg_tar_list) - - avg_factor = max(1.0, reg_pred.shape[0]) - cls_loss = self.loss_class( - cls_pred, cls_tar, reduction='sum') / avg_factor - - if reg_pred.shape[0] == 0: - reg_loss = paddle.zeros([1]) - reg_loss.stop_gradient = False - else: - reg_loss = self.loss_bbox( - reg_pred, reg_tar, reduction='sum') / avg_factor - - loss = cls_loss + reg_loss - out_dict = { - 'loss_cls': cls_loss, - 'loss_reg': reg_loss, - 'loss': loss, - } - return out_dict - - def get_bboxes_single(self, - anchors, - cls_scores_list, - bbox_preds_list, - im_shape, - scale_factor, - rescale=True): - assert len(cls_scores_list) == len(bbox_preds_list) - mlvl_bboxes = [] - mlvl_scores = [] - for anchor, cls_score, bbox_pred in zip(anchors, cls_scores_list, - bbox_preds_list): - cls_score = cls_score.reshape([-1, self.num_classes]) - bbox_pred = bbox_pred.reshape([-1, 4]) - if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: - max_score = cls_score.max(axis=1) - _, topk_inds = max_score.topk(self.nms_pre) - bbox_pred = bbox_pred.gather(topk_inds) - anchor = anchor.gather(topk_inds) - cls_score = cls_score.gather(topk_inds) - bbox_pred = delta2bbox(bbox_pred, anchor, self.weights).squeeze() - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(F.sigmoid(cls_score)) - mlvl_bboxes = paddle.concat(mlvl_bboxes) - mlvl_bboxes = paddle.squeeze(mlvl_bboxes) - if rescale: - mlvl_bboxes = mlvl_bboxes / paddle.concat( - [scale_factor[::-1], 
scale_factor[::-1]]) - mlvl_scores = paddle.concat(mlvl_scores) - mlvl_scores = mlvl_scores.transpose([1, 0]) - return mlvl_bboxes, mlvl_scores - - def decode(self, anchors, cls_logits, bboxes_reg, im_shape, scale_factor): - batch_bboxes = [] - batch_scores = [] - for img_id in range(cls_logits[0].shape[0]): - num_lvls = len(cls_logits) - cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] - bbox_preds_list = [bboxes_reg[i][img_id] for i in range(num_lvls)] - bboxes, scores = self.get_bboxes_single( - anchors, cls_scores_list, bbox_preds_list, im_shape[img_id], - scale_factor[img_id]) - batch_bboxes.append(bboxes) - batch_scores.append(scores) - batch_bboxes = paddle.stack(batch_bboxes, axis=0) - batch_scores = paddle.stack(batch_scores, axis=0) - return batch_bboxes, batch_scores - - def post_process(self, head_outputs, im_shape, scale_factor): - cls_logits_list, bboxes_reg_list = head_outputs - anchors = self.anchor_generator(cls_logits_list) - cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] - bboxes_reg = [_.transpose([0, 2, 3, 1]) for _ in bboxes_reg_list] - bboxes, scores = self.decode(anchors, cls_logits, bboxes_reg, im_shape, - scale_factor) - - bbox_pred, bbox_num, nms_keep_idx = self.nms(bboxes, scores) - return bbox_pred, bbox_num, nms_keep_idx - - - def get_scores_single(self, cls_scores_list): - mlvl_logits = [] - for cls_score in cls_scores_list: - cls_score = cls_score.reshape([-1, self.num_classes]) - if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: - max_score = cls_score.max(axis=1) - _, topk_inds = max_score.topk(self.nms_pre) - cls_score = cls_score.gather(topk_inds) - - mlvl_logits.append(cls_score) - - mlvl_logits = paddle.concat(mlvl_logits) - mlvl_logits = mlvl_logits.transpose([1, 0]) - - return mlvl_logits - - def decode_cls_logits(self, cls_logits_list): - cls_logits = [_.transpose([0, 2, 3, 1]) for _ in cls_logits_list] - batch_logits = [] - for img_id in range(cls_logits[0].shape[0]): - num_lvls = len(cls_logits) - cls_scores_list = [cls_logits[i][img_id] for i in range(num_lvls)] - logits = self.get_scores_single(cls_scores_list) - batch_logits.append(logits) - batch_logits = paddle.stack(batch_logits, axis=0) - return batch_logits - diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/roi_extractor.py b/pdfdet/models/Paddle/ppdet/modeling/heads/roi_extractor.py deleted file mode 100644 index 6c2f5c8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/roi_extractor.py +++ /dev/null @@ -1,118 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
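`get_bboxes_single` and `get_scores_single` above cap the per-level candidates at `nms_pre` by best-class score before NMS runs. A NumPy sketch of that pre-filter:

import numpy as np

def topk_prefilter(cls_score, bbox_pred, nms_pre=1000):
    """Keep only the nms_pre candidates with the highest best-class score.
    cls_score: (N, num_classes); bbox_pred: (N, 4)."""
    if cls_score.shape[0] <= nms_pre:
        return cls_score, bbox_pred
    max_score = cls_score.max(axis=1)
    keep = np.argsort(-max_score)[:nms_pre]
    return cls_score[keep], bbox_pred[keep]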
- -import paddle -from ppdet.core.workspace import register -from ppdet.modeling import ops -import paddle.nn as nn - - -def _to_list(v): - if not isinstance(v, (list, tuple)): - return [v] - return v - - -@register -class RoIAlign(nn.Layer): - """ - RoI Align module - - For more details, please refer to the document of roi_align in - in https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/vision/ops.py - - Args: - resolution (int): The output size, default 14 - spatial_scale (float): Multiplicative spatial scale factor to translate - ROI coords from their input scale to the scale used when pooling. - default 0.0625 - sampling_ratio (int): The number of sampling points in the interpolation - grid, default 0 - canconical_level (int): The referring level of FPN layer with - specified level. default 4 - canonical_size (int): The referring scale of FPN layer with - specified scale. default 224 - start_level (int): The start level of FPN layer to extract RoI feature, - default 0 - end_level (int): The end level of FPN layer to extract RoI feature, - default 3 - aligned (bool): Whether to add offset to rois' coord in roi_align. - default false - """ - - def __init__(self, - resolution=14, - spatial_scale=0.0625, - sampling_ratio=0, - canconical_level=4, - canonical_size=224, - start_level=0, - end_level=3, - aligned=False): - super(RoIAlign, self).__init__() - self.resolution = resolution - self.spatial_scale = _to_list(spatial_scale) - self.sampling_ratio = sampling_ratio - self.canconical_level = canconical_level - self.canonical_size = canonical_size - self.start_level = start_level - self.end_level = end_level - self.aligned = aligned - - @classmethod - def from_config(cls, cfg, input_shape): - return {'spatial_scale': [1. / i.stride for i in input_shape]} - - def forward(self, feats, roi, rois_num): - roi = paddle.concat(roi) if len(roi) > 1 else roi[0] - if len(feats) == 1: - rois_feat = paddle.vision.ops.roi_align( - x=feats[self.start_level], - boxes=roi, - boxes_num=rois_num, - output_size=self.resolution, - spatial_scale=self.spatial_scale[0], - aligned=self.aligned) - else: - offset = 2 - k_min = self.start_level + offset - k_max = self.end_level + offset - if hasattr(paddle.vision.ops, "distribute_fpn_proposals"): - distribute_fpn_proposals = getattr(paddle.vision.ops, - "distribute_fpn_proposals") - else: - distribute_fpn_proposals = ops.distribute_fpn_proposals - rois_dist, restore_index, rois_num_dist = distribute_fpn_proposals( - roi, - k_min, - k_max, - self.canconical_level, - self.canonical_size, - rois_num=rois_num) - - rois_feat_list = [] - for lvl in range(self.start_level, self.end_level + 1): - roi_feat = paddle.vision.ops.roi_align( - x=feats[lvl], - boxes=rois_dist[lvl], - boxes_num=rois_num_dist[lvl], - output_size=self.resolution, - spatial_scale=self.spatial_scale[lvl], - sampling_ratio=self.sampling_ratio, - aligned=self.aligned) - rois_feat_list.append(roi_feat) - rois_feat_shuffle = paddle.concat(rois_feat_list) - rois_feat = paddle.gather(rois_feat_shuffle, restore_index) - - return rois_feat diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/s2anet_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/s2anet_head.py deleted file mode 100644 index 99fd13a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/s2anet_head.py +++ /dev/null @@ -1,745 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
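`RoIAlign.forward` above routes each RoI to an FPN level via `distribute_fpn_proposals` with `canconical_level=4` and `canonical_size=224`. A simplified single-RoI sketch of the usual FPN heuristic behind that routing (level offsets chosen to match the start/end levels above):

import numpy as np

def fpn_level_for_roi(roi, k0=4, canonical_size=224, k_min=2, k_max=5):
    """Route an (x1, y1, x2, y2) RoI to level k0 + log2(sqrt(area)/224),
    clamped to the available pyramid levels."""
    x1, y1, x2, y2 = roi
    scale = np.sqrt(max(x2 - x1, 0) * max(y2 - y1, 0))
    k = int(np.floor(k0 + np.log2(scale / canonical_size + 1e-6)))
    return min(max(k, k_min), k_max)

print(fpn_level_for_roi((0, 0, 224, 224)))  # 4: canonical-size boxes stay on level 4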
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# The code is based on https://github.com/csuhan/s2anet/blob/master/mmdet/models/anchor_heads_rotated/s2anet_head.py - -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant -from ppdet.core.workspace import register -from ppdet.modeling.proposal_generator.target_layer import RBoxAssigner -from ppdet.modeling.proposal_generator.anchor_generator import S2ANetAnchorGenerator -from ppdet.modeling.layers import AlignConv -from ..cls_utils import _get_class_default_kwargs -import numpy as np - - -@register -class S2ANetHead(nn.Layer): - """ - S2Anet head - Args: - stacked_convs (int): number of stacked_convs - feat_in (int): input channels of feat - feat_out (int): output channels of feat - num_classes (int): num_classes - anchor_strides (list): stride of anchors - anchor_scales (list): scale of anchors - anchor_ratios (list): ratios of anchors - target_means (list): target_means - target_stds (list): target_stds - align_conv_type (str): align_conv_type ['Conv', 'AlignConv'] - align_conv_size (int): kernel size of align_conv - use_sigmoid_cls (bool): use sigmoid_cls or not - reg_loss_weight (list): loss weight for regression - """ - __shared__ = ['num_classes'] - __inject__ = ['anchor_assign', 'nms'] - - def __init__(self, - stacked_convs=2, - feat_in=256, - feat_out=256, - num_classes=15, - anchor_strides=[8, 16, 32, 64, 128], - anchor_scales=[4], - anchor_ratios=[1.0], - target_means=0.0, - target_stds=1.0, - align_conv_type='AlignConv', - align_conv_size=3, - use_sigmoid_cls=True, - anchor_assign=_get_class_default_kwargs(RBoxAssigner), - reg_loss_weight=[1.0, 1.0, 1.0, 1.0, 1.1], - cls_loss_weight=[1.1, 1.05], - reg_loss_type='l1', - nms_pre=2000, - nms='MultiClassNMS'): - super(S2ANetHead, self).__init__() - self.stacked_convs = stacked_convs - self.feat_in = feat_in - self.feat_out = feat_out - self.anchor_list = None - self.anchor_scales = anchor_scales - self.anchor_ratios = anchor_ratios - self.anchor_strides = anchor_strides - self.anchor_strides = paddle.to_tensor(anchor_strides) - self.anchor_base_sizes = list(anchor_strides) - self.means = paddle.ones(shape=[5]) * target_means - self.stds = paddle.ones(shape=[5]) * target_stds - assert align_conv_type in ['AlignConv', 'Conv', 'DCN'] - self.align_conv_type = align_conv_type - self.align_conv_size = align_conv_size - - self.use_sigmoid_cls = use_sigmoid_cls - self.cls_out_channels = num_classes if self.use_sigmoid_cls else num_classes + 1 - self.sampling = False - self.anchor_assign = anchor_assign - self.reg_loss_weight = reg_loss_weight - self.cls_loss_weight = cls_loss_weight - self.alpha = 1.0 - self.beta = 1.0 - self.reg_loss_type = reg_loss_type - self.nms_pre = nms_pre - self.nms = nms - self.fake_bbox = paddle.to_tensor( - np.array( - [[-1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], - dtype='float32')) - self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - - # 
anchor - self.anchor_generators = [] - for anchor_base in self.anchor_base_sizes: - self.anchor_generators.append( - S2ANetAnchorGenerator(anchor_base, anchor_scales, - anchor_ratios)) - - self.anchor_generators = nn.LayerList(self.anchor_generators) - self.fam_cls_convs = nn.Sequential() - self.fam_reg_convs = nn.Sequential() - - for i in range(self.stacked_convs): - chan_in = self.feat_in if i == 0 else self.feat_out - - self.fam_cls_convs.add_sublayer( - 'fam_cls_conv_{}'.format(i), - nn.Conv2D( - in_channels=chan_in, - out_channels=self.feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.fam_cls_convs.add_sublayer('fam_cls_conv_{}_act'.format(i), - nn.ReLU()) - - self.fam_reg_convs.add_sublayer( - 'fam_reg_conv_{}'.format(i), - nn.Conv2D( - in_channels=chan_in, - out_channels=self.feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.fam_reg_convs.add_sublayer('fam_reg_conv_{}_act'.format(i), - nn.ReLU()) - - self.fam_reg = nn.Conv2D( - self.feat_out, - 5, - 1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - prior_prob = 0.01 - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - self.fam_cls = nn.Conv2D( - self.feat_out, - self.cls_out_channels, - 1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(bias_init))) - - if self.align_conv_type == "AlignConv": - self.align_conv = AlignConv(self.feat_out, self.feat_out, - self.align_conv_size) - elif self.align_conv_type == "Conv": - self.align_conv = nn.Conv2D( - self.feat_out, - self.feat_out, - self.align_conv_size, - padding=(self.align_conv_size - 1) // 2, - bias_attr=ParamAttr(initializer=Constant(0))) - - elif self.align_conv_type == "DCN": - self.align_conv_offset = nn.Conv2D( - self.feat_out, - 2 * self.align_conv_size**2, - 1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - - self.align_conv = paddle.vision.ops.DeformConv2D( - self.feat_out, - self.feat_out, - self.align_conv_size, - padding=(self.align_conv_size - 1) // 2, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=False) - - self.or_conv = nn.Conv2D( - self.feat_out, - self.feat_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - - # ODM - self.odm_cls_convs = nn.Sequential() - self.odm_reg_convs = nn.Sequential() - - for i in range(self.stacked_convs): - ch_in = self.feat_out - # ch_in = int(self.feat_out / 8) if i == 0 else self.feat_out - - self.odm_cls_convs.add_sublayer( - 'odm_cls_conv_{}'.format(i), - nn.Conv2D( - in_channels=ch_in, - out_channels=self.feat_out, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.odm_cls_convs.add_sublayer('odm_cls_conv_{}_act'.format(i), - nn.ReLU()) - - self.odm_reg_convs.add_sublayer( - 'odm_reg_conv_{}'.format(i), - nn.Conv2D( - in_channels=self.feat_out, - out_channels=self.feat_out, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0)))) - - self.odm_reg_convs.add_sublayer('odm_reg_conv_{}_act'.format(i), - nn.ReLU()) - - self.odm_cls = nn.Conv2D( - 
self.feat_out, - self.cls_out_channels, - 3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(bias_init))) - self.odm_reg = nn.Conv2D( - self.feat_out, - 5, - 3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0.0, 0.01)), - bias_attr=ParamAttr(initializer=Constant(0))) - - def forward(self, feats, targets=None): - fam_reg_list, fam_cls_list = [], [] - odm_reg_list, odm_cls_list = [], [] - num_anchors_list, base_anchors_list, refine_anchors_list = [], [], [] - - for i, feat in enumerate(feats): - # get shape - B = feat.shape[0] - H, W = paddle.shape(feat)[2], paddle.shape(feat)[3] - - NA = H * W - num_anchors_list.append(NA) - - fam_cls_feat = self.fam_cls_convs(feat) - fam_cls = self.fam_cls(fam_cls_feat) - # [N, CLS, H, W] --> [N, H, W, CLS] - fam_cls = fam_cls.transpose([0, 2, 3, 1]).reshape( - [B, NA, self.cls_out_channels]) - fam_cls_list.append(fam_cls) - - fam_reg_feat = self.fam_reg_convs(feat) - fam_reg = self.fam_reg(fam_reg_feat) - # [N, 5, H, W] --> [N, H, W, 5] - fam_reg = fam_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) - fam_reg_list.append(fam_reg) - - # prepare anchor - init_anchors = self.anchor_generators[i]((H, W), - self.anchor_strides[i]) - init_anchors = init_anchors.reshape([1, NA, 5]) - base_anchors_list.append(init_anchors.squeeze(0)) - - if self.training: - refine_anchor = self.bbox_decode(fam_reg.detach(), init_anchors) - else: - refine_anchor = self.bbox_decode(fam_reg, init_anchors) - - refine_anchors_list.append(refine_anchor) - - if self.align_conv_type == 'AlignConv': - align_feat = self.align_conv(feat, - refine_anchor.clone(), (H, W), - self.anchor_strides[i]) - elif self.align_conv_type == 'DCN': - align_offset = self.align_conv_offset(feat) - align_feat = self.align_conv(feat, align_offset) - elif self.align_conv_type == 'Conv': - align_feat = self.align_conv(feat) - - or_feat = self.or_conv(align_feat) - odm_reg_feat = or_feat - odm_cls_feat = or_feat - - odm_reg_feat = self.odm_reg_convs(odm_reg_feat) - odm_cls_feat = self.odm_cls_convs(odm_cls_feat) - - odm_cls = self.odm_cls(odm_cls_feat) - # [N, CLS, H, W] --> [N, H, W, CLS] - odm_cls = odm_cls.transpose([0, 2, 3, 1]).reshape( - [B, NA, self.cls_out_channels]) - odm_cls_list.append(odm_cls) - - odm_reg = self.odm_reg(odm_reg_feat) - # [N, 5, H, W] --> [N, H, W, 5] - odm_reg = odm_reg.transpose([0, 2, 3, 1]).reshape([B, NA, 5]) - odm_reg_list.append(odm_reg) - - if self.training: - return self.get_loss([ - fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, - num_anchors_list, base_anchors_list, refine_anchors_list - ], targets) - else: - odm_bboxes_list = [] - for odm_reg, refine_anchor in zip(odm_reg_list, - refine_anchors_list): - odm_bboxes = self.bbox_decode(odm_reg, refine_anchor) - odm_bboxes_list.append(odm_bboxes) - return [odm_bboxes_list, odm_cls_list] - - def get_bboxes(self, head_outs): - perd_bboxes_list, pred_scores_list = head_outs - batch = paddle.shape(pred_scores_list[0])[0] - bboxes, bbox_num = [], [] - for i in range(batch): - pred_scores_per_image = [t[i] for t in pred_scores_list] - pred_bboxes_per_image = [t[i] for t in perd_bboxes_list] - bbox_per_image, bbox_num_per_image = self.get_bboxes_single( - pred_scores_per_image, pred_bboxes_per_image) - bboxes.append(bbox_per_image) - bbox_num.append(bbox_num_per_image) - - bboxes = paddle.concat(bboxes) - bbox_num = paddle.concat(bbox_num) - return bboxes, bbox_num - - def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): - """ - Rescale, clip 
and filter the bbox from the output of NMS to - get final prediction. - Args: - bboxes(Tensor): bboxes [N, 10] - bbox_num(Tensor): bbox_num - im_shape(Tensor): [1 2] - scale_factor(Tensor): [1 2] - Returns: - bbox_pred(Tensor): The output is the prediction with shape [N, 8] - including labels, scores and bboxes. The size of - bboxes are corresponding to the original image. - """ - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - - origin_shape_list = [] - scale_factor_list = [] - # scale_factor: scale_y, scale_x - for i in range(bbox_num.shape[0]): - expand_shape = paddle.expand(origin_shape[i:i + 1, :], - [bbox_num[i], 2]) - scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2] - scale = paddle.concat([ - scale_x, scale_y, scale_x, scale_y, scale_x, scale_y, scale_x, - scale_y - ]) - expand_scale = paddle.expand(scale, [bbox_num[i], 8]) - origin_shape_list.append(expand_shape) - scale_factor_list.append(expand_scale) - - origin_shape_list = paddle.concat(origin_shape_list) - scale_factor_list = paddle.concat(scale_factor_list) - - # bboxes: [N, 10], label, score, bbox - pred_label_score = bboxes[:, 0:2] - pred_bbox = bboxes[:, 2:] - - # rescale bbox to original image - pred_bbox = pred_bbox.reshape([-1, 8]) - scaled_bbox = pred_bbox / scale_factor_list - origin_h = origin_shape_list[:, 0] - origin_w = origin_shape_list[:, 1] - - bboxes = scaled_bbox - zeros = paddle.zeros_like(origin_h) - x1 = paddle.maximum(paddle.minimum(bboxes[:, 0], origin_w - 1), zeros) - y1 = paddle.maximum(paddle.minimum(bboxes[:, 1], origin_h - 1), zeros) - x2 = paddle.maximum(paddle.minimum(bboxes[:, 2], origin_w - 1), zeros) - y2 = paddle.maximum(paddle.minimum(bboxes[:, 3], origin_h - 1), zeros) - x3 = paddle.maximum(paddle.minimum(bboxes[:, 4], origin_w - 1), zeros) - y3 = paddle.maximum(paddle.minimum(bboxes[:, 5], origin_h - 1), zeros) - x4 = paddle.maximum(paddle.minimum(bboxes[:, 6], origin_w - 1), zeros) - y4 = paddle.maximum(paddle.minimum(bboxes[:, 7], origin_h - 1), zeros) - pred_bbox = paddle.stack([x1, y1, x2, y2, x3, y3, x4, y4], axis=-1) - pred_result = paddle.concat([pred_label_score, pred_bbox], axis=1) - return pred_result - - def get_bboxes_single(self, cls_score_list, bbox_pred_list): - mlvl_bboxes = [] - mlvl_scores = [] - - for cls_score, bbox_pred in zip(cls_score_list, bbox_pred_list): - if self.use_sigmoid_cls: - scores = F.sigmoid(cls_score) - else: - scores = F.softmax(cls_score, axis=-1) - - if scores.shape[0] > self.nms_pre: - # Get maximum scores for foreground classes. 
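# [reviewer note] With sigmoid classification every output channel is a
# foreground class, so the per-anchor maximum is taken over all channels;
# with softmax the last channel is the implicit background, which is why it
# is excluded (scores[:, :-1]) before the top-`nms_pre` filtering below.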
- if self.use_sigmoid_cls: - max_scores = paddle.max(scores, axis=1) - else: - max_scores = paddle.max(scores[:, :-1], axis=1) - - topk_val, topk_inds = paddle.topk(max_scores, self.nms_pre) - bbox_pred = paddle.gather(bbox_pred, topk_inds) - scores = paddle.gather(scores, topk_inds) - - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(scores) - - mlvl_bboxes = paddle.concat(mlvl_bboxes) - mlvl_scores = paddle.concat(mlvl_scores) - - mlvl_polys = self.rbox2poly(mlvl_bboxes).unsqueeze(0) - mlvl_scores = paddle.transpose(mlvl_scores, [1, 0]).unsqueeze(0) - - bbox, bbox_num, _ = self.nms(mlvl_polys, mlvl_scores) - if bbox.shape[0] <= 0: - bbox = self.fake_bbox - bbox_num = self.fake_bbox_num - - return bbox, bbox_num - - def smooth_l1_loss(self, pred, label, delta=1.0 / 9.0): - """ - Args: - pred: pred score - label: label - delta: delta - Returns: loss - """ - assert pred.shape == label.shape and label.numel() > 0 - assert delta > 0 - diff = paddle.abs(pred - label) - loss = paddle.where(diff < delta, 0.5 * diff * diff / delta, - diff - 0.5 * delta) - return loss - - def get_fam_loss(self, fam_target, s2anet_head_out, reg_loss_type='l1'): - (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, - pos_inds, neg_inds) = fam_target - fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out - - fam_cls_losses = [] - fam_bbox_losses = [] - st_idx = 0 - num_total_samples = len(pos_inds) + len( - neg_inds) if self.sampling else len(pos_inds) - num_total_samples = max(1, num_total_samples) - - for idx, feat_anchor_num in enumerate(num_anchors_list): - # step1: get data - feat_labels = labels[st_idx:st_idx + feat_anchor_num] - feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] - - feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] - feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] - - # step2: calc cls loss - feat_labels = feat_labels.reshape(-1) - feat_label_weights = feat_label_weights.reshape(-1) - - fam_cls_score = fam_cls_branch_list[idx] - fam_cls_score = paddle.squeeze(fam_cls_score, axis=0) - fam_cls_score1 = fam_cls_score - - feat_labels = paddle.to_tensor(feat_labels) - feat_labels_one_hot = paddle.nn.functional.one_hot( - feat_labels, self.cls_out_channels + 1) - feat_labels_one_hot = feat_labels_one_hot[:, 1:] - feat_labels_one_hot.stop_gradient = True - - num_total_samples = paddle.to_tensor( - num_total_samples, dtype='float32', stop_gradient=True) - - fam_cls = F.sigmoid_focal_loss( - fam_cls_score1, - feat_labels_one_hot, - normalizer=num_total_samples, - reduction='none') - - feat_label_weights = feat_label_weights.reshape( - feat_label_weights.shape[0], 1) - feat_label_weights = np.repeat( - feat_label_weights, self.cls_out_channels, axis=1) - feat_label_weights = paddle.to_tensor( - feat_label_weights, stop_gradient=True) - - fam_cls = fam_cls * feat_label_weights - fam_cls_total = paddle.sum(fam_cls) - fam_cls_losses.append(fam_cls_total) - - # step3: regression loss - feat_bbox_targets = paddle.to_tensor( - feat_bbox_targets, dtype='float32', stop_gradient=True) - feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) - - fam_bbox_pred = fam_reg_branch_list[idx] - fam_bbox_pred = paddle.squeeze(fam_bbox_pred, axis=0) - fam_bbox_pred = paddle.reshape(fam_bbox_pred, [-1, 5]) - fam_bbox = self.smooth_l1_loss(fam_bbox_pred, feat_bbox_targets) - loss_weight = paddle.to_tensor( - self.reg_loss_weight, dtype='float32', stop_gradient=True) - fam_bbox 
= paddle.multiply(fam_bbox, loss_weight) - feat_bbox_weights = paddle.to_tensor( - feat_bbox_weights, stop_gradient=True) - - fam_bbox = fam_bbox * feat_bbox_weights - fam_bbox_total = paddle.sum(fam_bbox) / num_total_samples - fam_bbox_losses.append(fam_bbox_total) - st_idx += feat_anchor_num - - fam_cls_loss = paddle.add_n(fam_cls_losses) - fam_cls_loss_weight = paddle.to_tensor( - self.cls_loss_weight[0], dtype='float32', stop_gradient=True) - fam_cls_loss = fam_cls_loss * fam_cls_loss_weight - fam_reg_loss = paddle.add_n(fam_bbox_losses) - return fam_cls_loss, fam_reg_loss - - def get_odm_loss(self, odm_target, s2anet_head_out, reg_loss_type='l1'): - (labels, label_weights, bbox_targets, bbox_weights, bbox_gt_bboxes, - pos_inds, neg_inds) = odm_target - fam_cls_branch_list, fam_reg_branch_list, odm_cls_branch_list, odm_reg_branch_list, num_anchors_list = s2anet_head_out - - odm_cls_losses = [] - odm_bbox_losses = [] - st_idx = 0 - num_total_samples = len(pos_inds) + len( - neg_inds) if self.sampling else len(pos_inds) - num_total_samples = max(1, num_total_samples) - - for idx, feat_anchor_num in enumerate(num_anchors_list): - # step1: get data - feat_labels = labels[st_idx:st_idx + feat_anchor_num] - feat_label_weights = label_weights[st_idx:st_idx + feat_anchor_num] - - feat_bbox_targets = bbox_targets[st_idx:st_idx + feat_anchor_num, :] - feat_bbox_weights = bbox_weights[st_idx:st_idx + feat_anchor_num, :] - - # step2: calc cls loss - feat_labels = feat_labels.reshape(-1) - feat_label_weights = feat_label_weights.reshape(-1) - - odm_cls_score = odm_cls_branch_list[idx] - odm_cls_score = paddle.squeeze(odm_cls_score, axis=0) - odm_cls_score1 = odm_cls_score - - feat_labels = paddle.to_tensor(feat_labels) - feat_labels_one_hot = paddle.nn.functional.one_hot( - feat_labels, self.cls_out_channels + 1) - feat_labels_one_hot = feat_labels_one_hot[:, 1:] - feat_labels_one_hot.stop_gradient = True - - num_total_samples = paddle.to_tensor( - num_total_samples, dtype='float32', stop_gradient=True) - odm_cls = F.sigmoid_focal_loss( - odm_cls_score1, - feat_labels_one_hot, - normalizer=num_total_samples, - reduction='none') - - feat_label_weights = feat_label_weights.reshape( - feat_label_weights.shape[0], 1) - feat_label_weights = np.repeat( - feat_label_weights, self.cls_out_channels, axis=1) - feat_label_weights = paddle.to_tensor(feat_label_weights) - feat_label_weights.stop_gradient = True - - odm_cls = odm_cls * feat_label_weights - odm_cls_total = paddle.sum(odm_cls) - odm_cls_losses.append(odm_cls_total) - - # # step3: regression loss - feat_bbox_targets = paddle.to_tensor( - feat_bbox_targets, dtype='float32') - feat_bbox_targets = paddle.reshape(feat_bbox_targets, [-1, 5]) - feat_bbox_targets.stop_gradient = True - - odm_bbox_pred = odm_reg_branch_list[idx] - odm_bbox_pred = paddle.squeeze(odm_bbox_pred, axis=0) - odm_bbox_pred = paddle.reshape(odm_bbox_pred, [-1, 5]) - odm_bbox = self.smooth_l1_loss(odm_bbox_pred, feat_bbox_targets) - - loss_weight = paddle.to_tensor( - self.reg_loss_weight, dtype='float32', stop_gradient=True) - odm_bbox = paddle.multiply(odm_bbox, loss_weight) - feat_bbox_weights = paddle.to_tensor( - feat_bbox_weights, stop_gradient=True) - - odm_bbox = odm_bbox * feat_bbox_weights - odm_bbox_total = paddle.sum(odm_bbox) / num_total_samples - - odm_bbox_losses.append(odm_bbox_total) - st_idx += feat_anchor_num - - odm_cls_loss = paddle.add_n(odm_cls_losses) - odm_cls_loss_weight = paddle.to_tensor( - self.cls_loss_weight[1], dtype='float32', stop_gradient=True) - 
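# [reviewer note] cls_loss_weight[1] (1.05 by default) scales the ODM
# classification term here, while the FAM branch above used
# cls_loss_weight[0] (1.1 by default), giving the coarse FAM stage slightly
# more weight in the combined classification loss.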
odm_cls_loss = odm_cls_loss * odm_cls_loss_weight - odm_reg_loss = paddle.add_n(odm_bbox_losses) - return odm_cls_loss, odm_reg_loss - - def get_loss(self, head_outs, inputs): - fam_cls_list, fam_reg_list, odm_cls_list, odm_reg_list, \ - num_anchors_list, base_anchors_list, refine_anchors_list = head_outs - - # compute loss - fam_cls_loss_lst = [] - fam_reg_loss_lst = [] - odm_cls_loss_lst = [] - odm_reg_loss_lst = [] - - batch = len(inputs['gt_rbox']) - for i in range(batch): - # data_format: (xc, yc, w, h, theta) - gt_mask = inputs['pad_gt_mask'][i, :, 0] - gt_idx = paddle.nonzero(gt_mask).squeeze(-1) - gt_bboxes = paddle.gather(inputs['gt_rbox'][i], gt_idx).numpy() - gt_labels = paddle.gather(inputs['gt_class'][i], gt_idx).numpy() - is_crowd = paddle.gather(inputs['is_crowd'][i], gt_idx).numpy() - gt_labels = gt_labels + 1 - - anchors_per_image = np.concatenate(base_anchors_list) - - fam_cls_per_image = [t[i] for t in fam_cls_list] - fam_reg_per_image = [t[i] for t in fam_reg_list] - odm_cls_per_image = [t[i] for t in odm_cls_list] - odm_reg_per_image = [t[i] for t in odm_reg_list] - im_s2anet_head_out = (fam_cls_per_image, fam_reg_per_image, - odm_cls_per_image, odm_reg_per_image, - num_anchors_list) - # FAM - im_fam_target = self.anchor_assign(anchors_per_image, gt_bboxes, - gt_labels, is_crowd) - if im_fam_target is not None: - im_fam_cls_loss, im_fam_reg_loss = self.get_fam_loss( - im_fam_target, im_s2anet_head_out, self.reg_loss_type) - fam_cls_loss_lst.append(im_fam_cls_loss) - fam_reg_loss_lst.append(im_fam_reg_loss) - - # ODM - refine_anchors_per_image = [t[i] for t in refine_anchors_list] - refine_anchors_per_image = paddle.concat( - refine_anchors_per_image).numpy() - im_odm_target = self.anchor_assign(refine_anchors_per_image, - gt_bboxes, gt_labels, is_crowd) - - if im_odm_target is not None: - im_odm_cls_loss, im_odm_reg_loss = self.get_odm_loss( - im_odm_target, im_s2anet_head_out, self.reg_loss_type) - odm_cls_loss_lst.append(im_odm_cls_loss) - odm_reg_loss_lst.append(im_odm_reg_loss) - - fam_cls_loss = paddle.add_n(fam_cls_loss_lst) / batch - fam_reg_loss = paddle.add_n(fam_reg_loss_lst) / batch - odm_cls_loss = paddle.add_n(odm_cls_loss_lst) / batch - odm_reg_loss = paddle.add_n(odm_reg_loss_lst) / batch - loss = fam_cls_loss + fam_reg_loss + odm_cls_loss + odm_reg_loss - - return { - 'loss': loss, - 'fam_cls_loss': fam_cls_loss, - 'fam_reg_loss': fam_reg_loss, - 'odm_cls_loss': odm_cls_loss, - 'odm_reg_loss': odm_reg_loss - } - - def bbox_decode(self, preds, anchors, wh_ratio_clip=1e-6): - """decode bbox from deltas - Args: - preds: [B, L, 5] - anchors: [1, L, 5] - return: - bboxes: [B, L, 5] - """ - preds = paddle.add(paddle.multiply(preds, self.stds), self.means) - - dx, dy, dw, dh, dangle = paddle.split(preds, 5, axis=-1) - max_ratio = np.abs(np.log(wh_ratio_clip)) - dw = paddle.clip(dw, min=-max_ratio, max=max_ratio) - dh = paddle.clip(dh, min=-max_ratio, max=max_ratio) - - rroi_x, rroi_y, rroi_w, rroi_h, rroi_angle = paddle.split( - anchors, 5, axis=-1) - - gx = dx * rroi_w * paddle.cos(rroi_angle) - dy * rroi_h * paddle.sin( - rroi_angle) + rroi_x - gy = dx * rroi_w * paddle.sin(rroi_angle) + dy * rroi_h * paddle.cos( - rroi_angle) + rroi_y - gw = rroi_w * dw.exp() - gh = rroi_h * dh.exp() - ga = np.pi * dangle + rroi_angle - ga = (ga + np.pi / 4) % np.pi - np.pi / 4 - bboxes = paddle.concat([gx, gy, gw, gh, ga], axis=-1) - return bboxes - - def rbox2poly(self, rboxes): - """ - rboxes: [x_ctr,y_ctr,w,h,angle] - to - polys: [x0,y0,x1,y1,x2,y2,x3,y3] - """ - N = 
paddle.shape(rboxes)[0] - - x_ctr = rboxes[:, 0] - y_ctr = rboxes[:, 1] - width = rboxes[:, 2] - height = rboxes[:, 3] - angle = rboxes[:, 4] - - tl_x, tl_y, br_x, br_y = -width * 0.5, -height * 0.5, width * 0.5, height * 0.5 - - normal_rects = paddle.stack( - [tl_x, br_x, br_x, tl_x, tl_y, tl_y, br_y, br_y], axis=0) - normal_rects = paddle.reshape(normal_rects, [2, 4, N]) - normal_rects = paddle.transpose(normal_rects, [2, 0, 1]) - - sin, cos = paddle.sin(angle), paddle.cos(angle) - # M: [N,2,2] - M = paddle.stack([cos, -sin, sin, cos], axis=0) - M = paddle.reshape(M, [2, 2, N]) - M = paddle.transpose(M, [2, 0, 1]) - - # polys: [N,8] - polys = paddle.matmul(M, normal_rects) - polys = paddle.transpose(polys, [2, 1, 0]) - polys = paddle.reshape(polys, [-1, N]) - polys = paddle.transpose(polys, [1, 0]) - - tmp = paddle.stack( - [x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr, x_ctr, y_ctr], axis=1) - polys = polys + tmp - return polys diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/simota_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/simota_head.py deleted file mode 100644 index 037c395..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/simota_head.py +++ /dev/null @@ -1,500 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/yolox_head.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -from functools import partial -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Normal, Constant - -from ppdet.core.workspace import register - -from ppdet.modeling.bbox_utils import distance2bbox, bbox2distance -from ppdet.data.transform.atss_assigner import bbox_overlaps - -from .gfl_head import GFLHead - - -@register -class OTAHead(GFLHead): - """ - OTAHead - Args: - conv_feat (object): Instance of 'FCOSFeat' - num_classes (int): Number of classes - fpn_stride (list): The stride of each FPN Layer - prior_prob (float): Used to set the bias init for the class prediction layer - loss_qfl (object): Instance of QualityFocalLoss. - loss_dfl (object): Instance of DistributionFocalLoss. - loss_bbox (object): Instance of bbox loss. - assigner (object): Instance of label assigner. - reg_max: Max value of integral set :math: `{0, ..., reg_max}` - n QFL setting. Default: 16. 
- """ - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'assigner', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='QualityFocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - assigner='SimOTAAssigner', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - super(OTAHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.conv_feat = conv_feat - self.dgqp_module = dgqp_module - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_qfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.use_sigmoid = self.loss_qfl.use_sigmoid - - self.assigner = assigner - - def _get_target_single(self, flatten_cls_pred, flatten_center_and_stride, - flatten_bbox, gt_bboxes, gt_labels): - """Compute targets for priors in a single image. - """ - pos_num, label, label_weight, bbox_target = self.assigner( - F.sigmoid(flatten_cls_pred), flatten_center_and_stride, - flatten_bbox, gt_bboxes, gt_labels) - - return (pos_num, label, label_weight, bbox_target) - - def get_loss(self, head_outs, gt_meta): - cls_scores, bbox_preds = head_outs - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores - ] - num_imgs = gt_meta['im_id'].shape[0] - featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] - for featmap in cls_scores] - - decode_bbox_preds = [] - center_and_strides = [] - for featmap_size, stride, bbox_pred in zip(featmap_sizes, - self.fpn_stride, bbox_preds): - - # center in origin image - yy, xx = self.get_single_level_center_point(featmap_size, stride, - self.cell_offset) - - center_and_stride = paddle.stack([xx, yy, stride, stride], -1).tile( - [num_imgs, 1, 1]) - center_and_strides.append(center_and_stride) - center_in_feature = center_and_stride.reshape( - [-1, 4])[:, :-2] / stride - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, 4 * (self.reg_max + 1)]) - pred_distances = self.distribution_project(bbox_pred) - decode_bbox_pred_wo_stride = distance2bbox( - center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) - decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) - - flatten_cls_preds = [ - cls_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, self.cls_out_channels]) - for cls_pred in cls_scores - ] - flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) - flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) - flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) - - gt_boxes, gt_labels = gt_meta['gt_bbox'], gt_meta['gt_class'] - pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] - for flatten_cls_pred,flatten_center_and_stride,flatten_bbox,gt_box, gt_label \ - in zip(flatten_cls_preds.detach(),flatten_center_and_strides.detach(), \ - flatten_bboxes.detach(),gt_boxes, gt_labels): - pos_num, label, label_weight, bbox_target = self._get_target_single( - flatten_cls_pred, 
flatten_center_and_stride, flatten_bbox, - gt_box, gt_label) - pos_num_l.append(pos_num) - label_l.append(label) - label_weight_l.append(label_weight) - bbox_target_l.append(bbox_target) - - labels = paddle.to_tensor(np.stack(label_l, axis=0)) - label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) - bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) - - center_and_strides_list = self._images_to_levels( - flatten_center_and_strides, num_level_anchors) - labels_list = self._images_to_levels(labels, num_level_anchors) - label_weights_list = self._images_to_levels(label_weights, - num_level_anchors) - bbox_targets_list = self._images_to_levels(bbox_targets, - num_level_anchors) - num_total_pos = sum(pos_num_l) - try: - paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1.) - except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_qfl_list, avg_factor = [], [], [], [] - for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( - cls_scores, bbox_preds, center_and_strides_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride): - center_and_strides = center_and_strides.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - label_weights = label_weights.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - score = np.zeros(labels.shape) - - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_centers = paddle.gather( - center_and_strides[:, :-2], pos_inds, axis=0) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - score[pos_inds.numpy()] = bbox_iou - - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - weight_targets = paddle.to_tensor([0], dtype='float32') - - # qfl loss - score = paddle.to_tensor(score) - loss_qfl = self.loss_qfl( - cls_score, (labels, score), - weight=label_weights, - avg_factor=num_total_pos) - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_qfl_list.append(loss_qfl) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) - try: - paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) - 
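# [reviewer note] avg_factor accumulates the summed sigmoid confidence of
# positive anchors and later normalizes the bbox/DFL losses. In multi-card
# training it is all-reduced and averaged over the world size; on a single
# card the collective call raises and the except branch below falls back to
# clamping the value to at least 1.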
avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - if avg_factor <= 0: - loss_qfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_qfl = sum(loss_qfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - - loss_states = dict( - loss_qfl=loss_qfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states - - -@register -class OTAVFLHead(OTAHead): - __inject__ = [ - 'conv_feat', 'dgqp_module', 'loss_class', 'loss_dfl', 'loss_bbox', - 'assigner', 'nms' - ] - __shared__ = ['num_classes'] - - def __init__(self, - conv_feat='FCOSFeat', - dgqp_module=None, - num_classes=80, - fpn_stride=[8, 16, 32, 64, 128], - prior_prob=0.01, - loss_class='VarifocalLoss', - loss_dfl='DistributionFocalLoss', - loss_bbox='GIoULoss', - assigner='SimOTAAssigner', - reg_max=16, - feat_in_chan=256, - nms=None, - nms_pre=1000, - cell_offset=0): - super(OTAVFLHead, self).__init__( - conv_feat=conv_feat, - dgqp_module=dgqp_module, - num_classes=num_classes, - fpn_stride=fpn_stride, - prior_prob=prior_prob, - loss_class=loss_class, - loss_dfl=loss_dfl, - loss_bbox=loss_bbox, - reg_max=reg_max, - feat_in_chan=feat_in_chan, - nms=nms, - nms_pre=nms_pre, - cell_offset=cell_offset) - self.conv_feat = conv_feat - self.dgqp_module = dgqp_module - self.num_classes = num_classes - self.fpn_stride = fpn_stride - self.prior_prob = prior_prob - self.loss_vfl = loss_class - self.loss_dfl = loss_dfl - self.loss_bbox = loss_bbox - self.reg_max = reg_max - self.feat_in_chan = feat_in_chan - self.nms = nms - self.nms_pre = nms_pre - self.cell_offset = cell_offset - self.use_sigmoid = self.loss_vfl.use_sigmoid - - self.assigner = assigner - - def get_loss(self, head_outs, gt_meta): - cls_scores, bbox_preds = head_outs - num_level_anchors = [ - featmap.shape[-2] * featmap.shape[-1] for featmap in cls_scores - ] - num_imgs = gt_meta['im_id'].shape[0] - featmap_sizes = [[featmap.shape[-2], featmap.shape[-1]] - for featmap in cls_scores] - - decode_bbox_preds = [] - center_and_strides = [] - for featmap_size, stride, bbox_pred in zip(featmap_sizes, - self.fpn_stride, bbox_preds): - # center in origin image - yy, xx = self.get_single_level_center_point(featmap_size, stride, - self.cell_offset) - strides = paddle.full((len(xx), ), stride) - center_and_stride = paddle.stack([xx, yy, strides, strides], - -1).tile([num_imgs, 1, 1]) - center_and_strides.append(center_and_stride) - center_in_feature = center_and_stride.reshape( - [-1, 4])[:, :-2] / stride - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, 4 * (self.reg_max + 1)]) - pred_distances = self.distribution_project(bbox_pred) - decode_bbox_pred_wo_stride = distance2bbox( - center_in_feature, pred_distances).reshape([num_imgs, -1, 4]) - decode_bbox_preds.append(decode_bbox_pred_wo_stride * stride) - - flatten_cls_preds = [ - cls_pred.transpose([0, 2, 3, 1]).reshape( - [num_imgs, -1, self.cls_out_channels]) - for cls_pred in cls_scores - ] - flatten_cls_preds = paddle.concat(flatten_cls_preds, axis=1) - flatten_bboxes = paddle.concat(decode_bbox_preds, axis=1) - flatten_center_and_strides = paddle.concat(center_and_strides, axis=1) - - gt_boxes, gt_labels = 
gt_meta['gt_bbox'], gt_meta['gt_class'] - pos_num_l, label_l, label_weight_l, bbox_target_l = [], [], [], [] - for flatten_cls_pred, flatten_center_and_stride, flatten_bbox,gt_box,gt_label \ - in zip(flatten_cls_preds.detach(), flatten_center_and_strides.detach(), \ - flatten_bboxes.detach(),gt_boxes,gt_labels): - pos_num, label, label_weight, bbox_target = self._get_target_single( - flatten_cls_pred, flatten_center_and_stride, flatten_bbox, - gt_box, gt_label) - pos_num_l.append(pos_num) - label_l.append(label) - label_weight_l.append(label_weight) - bbox_target_l.append(bbox_target) - - labels = paddle.to_tensor(np.stack(label_l, axis=0)) - label_weights = paddle.to_tensor(np.stack(label_weight_l, axis=0)) - bbox_targets = paddle.to_tensor(np.stack(bbox_target_l, axis=0)) - - center_and_strides_list = self._images_to_levels( - flatten_center_and_strides, num_level_anchors) - labels_list = self._images_to_levels(labels, num_level_anchors) - label_weights_list = self._images_to_levels(label_weights, - num_level_anchors) - bbox_targets_list = self._images_to_levels(bbox_targets, - num_level_anchors) - num_total_pos = sum(pos_num_l) - try: - paddle.distributed.all_reduce(paddle.to_tensor(num_total_pos)) - num_total_pos = paddle.clip( - num_total_pos / paddle.distributed.get_world_size(), min=1.) - except: - num_total_pos = max(num_total_pos, 1) - - loss_bbox_list, loss_dfl_list, loss_vfl_list, avg_factor = [], [], [], [] - for cls_score, bbox_pred, center_and_strides, labels, label_weights, bbox_targets, stride in zip( - cls_scores, bbox_preds, center_and_strides_list, labels_list, - label_weights_list, bbox_targets_list, self.fpn_stride): - center_and_strides = center_and_strides.reshape([-1, 4]) - cls_score = cls_score.transpose([0, 2, 3, 1]).reshape( - [-1, self.cls_out_channels]) - bbox_pred = bbox_pred.transpose([0, 2, 3, 1]).reshape( - [-1, 4 * (self.reg_max + 1)]) - bbox_targets = bbox_targets.reshape([-1, 4]) - labels = labels.reshape([-1]) - - bg_class_ind = self.num_classes - pos_inds = paddle.nonzero( - paddle.logical_and((labels >= 0), (labels < bg_class_ind)), - as_tuple=False).squeeze(1) - # vfl - vfl_score = np.zeros(cls_score.shape) - - if len(pos_inds) > 0: - pos_bbox_targets = paddle.gather(bbox_targets, pos_inds, axis=0) - pos_bbox_pred = paddle.gather(bbox_pred, pos_inds, axis=0) - pos_centers = paddle.gather( - center_and_strides[:, :-2], pos_inds, axis=0) / stride - - weight_targets = F.sigmoid(cls_score.detach()) - weight_targets = paddle.gather( - weight_targets.max(axis=1, keepdim=True), pos_inds, axis=0) - pos_bbox_pred_corners = self.distribution_project(pos_bbox_pred) - pos_decode_bbox_pred = distance2bbox(pos_centers, - pos_bbox_pred_corners) - pos_decode_bbox_targets = pos_bbox_targets / stride - bbox_iou = bbox_overlaps( - pos_decode_bbox_pred.detach().numpy(), - pos_decode_bbox_targets.detach().numpy(), - is_aligned=True) - - # vfl - pos_labels = paddle.gather(labels, pos_inds, axis=0) - vfl_score[pos_inds.numpy(), pos_labels] = bbox_iou - - pred_corners = pos_bbox_pred.reshape([-1, self.reg_max + 1]) - target_corners = bbox2distance(pos_centers, - pos_decode_bbox_targets, - self.reg_max).reshape([-1]) - # regression loss - loss_bbox = paddle.sum( - self.loss_bbox(pos_decode_bbox_pred, - pos_decode_bbox_targets) * weight_targets) - - # dfl loss - loss_dfl = self.loss_dfl( - pred_corners, - target_corners, - weight=weight_targets.expand([-1, 4]).reshape([-1]), - avg_factor=4.0) - else: - loss_bbox = bbox_pred.sum() * 0 - loss_dfl = bbox_pred.sum() * 0 - 
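# [reviewer note] With no positive anchors in the image, the losses are
# built as `bbox_pred.sum() * 0` rather than a detached zero constant; this
# is a common trick to keep the loss connected to the graph so every head
# parameter still receives a (zero) gradient in data-parallel training.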
weight_targets = paddle.to_tensor([0], dtype='float32') - - # vfl loss - num_pos_avg_per_gpu = num_total_pos - vfl_score = paddle.to_tensor(vfl_score) - loss_vfl = self.loss_vfl( - cls_score, vfl_score, avg_factor=num_pos_avg_per_gpu) - - loss_bbox_list.append(loss_bbox) - loss_dfl_list.append(loss_dfl) - loss_vfl_list.append(loss_vfl) - avg_factor.append(weight_targets.sum()) - - avg_factor = sum(avg_factor) - try: - paddle.distributed.all_reduce(paddle.to_tensor(avg_factor)) - avg_factor = paddle.clip( - avg_factor / paddle.distributed.get_world_size(), min=1) - except: - avg_factor = max(avg_factor.item(), 1) - if avg_factor <= 0: - loss_vfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - loss_bbox = paddle.to_tensor( - 0, dtype='float32', stop_gradient=False) - loss_dfl = paddle.to_tensor(0, dtype='float32', stop_gradient=False) - else: - losses_bbox = list(map(lambda x: x / avg_factor, loss_bbox_list)) - losses_dfl = list(map(lambda x: x / avg_factor, loss_dfl_list)) - loss_vfl = sum(loss_vfl_list) - loss_bbox = sum(losses_bbox) - loss_dfl = sum(losses_dfl) - - loss_states = dict( - loss_vfl=loss_vfl, loss_bbox=loss_bbox, loss_dfl=loss_dfl) - - return loss_states diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/solov2_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/solov2_head.py deleted file mode 100644 index 0fd0f61..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/solov2_head.py +++ /dev/null @@ -1,554 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.layers import ConvNormLayer, MaskMatrixNMS, DropBlock -from ppdet.core.workspace import register - -from six.moves import zip -import numpy as np - -__all__ = ['SOLOv2Head'] - - -@register -class SOLOv2MaskHead(nn.Layer): - """ - MaskHead of SOLOv2. - The code of this function is based on: - https://github.com/WXinlong/SOLO/blob/master/mmdet/models/mask_heads/mask_feat_head.py - - Args: - in_channels (int): The channel number of input Tensor. - out_channels (int): The channel number of output Tensor. - start_level (int): The position where the input starts. - end_level (int): The position where the input ends. - use_dcn_in_tower (bool): Whether to use dcn in tower or not. 
- """ - __shared__ = ['norm_type'] - - def __init__(self, - in_channels=256, - mid_channels=128, - out_channels=256, - start_level=0, - end_level=3, - use_dcn_in_tower=False, - norm_type='gn'): - super(SOLOv2MaskHead, self).__init__() - assert start_level >= 0 and end_level >= start_level - self.in_channels = in_channels - self.out_channels = out_channels - self.mid_channels = mid_channels - self.use_dcn_in_tower = use_dcn_in_tower - self.range_level = end_level - start_level + 1 - self.use_dcn = True if self.use_dcn_in_tower else False - self.convs_all_levels = [] - self.norm_type = norm_type - for i in range(start_level, end_level + 1): - conv_feat_name = 'mask_feat_head.convs_all_levels.{}'.format(i) - conv_pre_feat = nn.Sequential() - if i == start_level: - conv_pre_feat.add_sublayer( - conv_feat_name + '.conv' + str(i), - ConvNormLayer( - ch_in=self.in_channels, - ch_out=self.mid_channels, - filter_size=3, - stride=1, - use_dcn=self.use_dcn, - norm_type=self.norm_type)) - self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) - self.convs_all_levels.append(conv_pre_feat) - else: - for j in range(i): - ch_in = 0 - if j == 0: - ch_in = self.in_channels + 2 if i == end_level else self.in_channels - else: - ch_in = self.mid_channels - conv_pre_feat.add_sublayer( - conv_feat_name + '.conv' + str(j), - ConvNormLayer( - ch_in=ch_in, - ch_out=self.mid_channels, - filter_size=3, - stride=1, - use_dcn=self.use_dcn, - norm_type=self.norm_type)) - conv_pre_feat.add_sublayer( - conv_feat_name + '.conv' + str(j) + 'act', nn.ReLU()) - conv_pre_feat.add_sublayer( - 'upsample' + str(i) + str(j), - nn.Upsample( - scale_factor=2, mode='bilinear')) - self.add_sublayer('conv_pre_feat' + str(i), conv_pre_feat) - self.convs_all_levels.append(conv_pre_feat) - - conv_pred_name = 'mask_feat_head.conv_pred.0' - self.conv_pred = self.add_sublayer( - conv_pred_name, - ConvNormLayer( - ch_in=self.mid_channels, - ch_out=self.out_channels, - filter_size=1, - stride=1, - use_dcn=self.use_dcn, - norm_type=self.norm_type)) - - def forward(self, inputs): - """ - Get SOLOv2MaskHead output. - - Args: - inputs(list[Tensor]): feature map from each necks with shape of [N, C, H, W] - Returns: - ins_pred(Tensor): Output of SOLOv2MaskHead head - """ - feat_all_level = F.relu(self.convs_all_levels[0](inputs[0])) - for i in range(1, self.range_level): - input_p = inputs[i] - if i == (self.range_level - 1): - input_feat = input_p - x_range = paddle.linspace( - -1, 1, paddle.shape(input_feat)[-1], dtype='float32') - y_range = paddle.linspace( - -1, 1, paddle.shape(input_feat)[-2], dtype='float32') - y, x = paddle.meshgrid([y_range, x_range]) - x = paddle.unsqueeze(x, [0, 1]) - y = paddle.unsqueeze(y, [0, 1]) - y = paddle.expand( - y, shape=[paddle.shape(input_feat)[0], 1, -1, -1]) - x = paddle.expand( - x, shape=[paddle.shape(input_feat)[0], 1, -1, -1]) - coord_feat = paddle.concat([x, y], axis=1) - input_p = paddle.concat([input_p, coord_feat], axis=1) - feat_all_level = paddle.add(feat_all_level, - self.convs_all_levels[i](input_p)) - ins_pred = F.relu(self.conv_pred(feat_all_level)) - - return ins_pred - - -@register -class SOLOv2Head(nn.Layer): - """ - Head block for SOLOv2 network - - Args: - num_classes (int): Number of output classes. - in_channels (int): Number of input channels. - seg_feat_channels (int): Num_filters of kernel & categroy branch convolution operation. - stacked_convs (int): Times of convolution operation. - num_grids (list[int]): List of feature map grids size. 
- kernel_out_channels (int): Number of output channels in kernel branch. - dcn_v2_stages (list): Which stage use dcn v2 in tower. It is between [0, stacked_convs). - segm_strides (list[int]): List of segmentation area stride. - solov2_loss (object): SOLOv2Loss instance. - score_threshold (float): Threshold of categroy score. - mask_nms (object): MaskMatrixNMS instance. - """ - __inject__ = ['solov2_loss', 'mask_nms'] - __shared__ = ['norm_type', 'num_classes'] - - def __init__(self, - num_classes=80, - in_channels=256, - seg_feat_channels=256, - stacked_convs=4, - num_grids=[40, 36, 24, 16, 12], - kernel_out_channels=256, - dcn_v2_stages=[], - segm_strides=[8, 8, 16, 32, 32], - solov2_loss=None, - score_threshold=0.1, - mask_threshold=0.5, - mask_nms=None, - norm_type='gn', - drop_block=False): - super(SOLOv2Head, self).__init__() - self.num_classes = num_classes - self.in_channels = in_channels - self.seg_num_grids = num_grids - self.cate_out_channels = self.num_classes - self.seg_feat_channels = seg_feat_channels - self.stacked_convs = stacked_convs - self.kernel_out_channels = kernel_out_channels - self.dcn_v2_stages = dcn_v2_stages - self.segm_strides = segm_strides - self.solov2_loss = solov2_loss - self.mask_nms = mask_nms - self.score_threshold = score_threshold - self.mask_threshold = mask_threshold - self.norm_type = norm_type - self.drop_block = drop_block - - self.kernel_pred_convs = [] - self.cate_pred_convs = [] - for i in range(self.stacked_convs): - use_dcn = True if i in self.dcn_v2_stages else False - ch_in = self.in_channels + 2 if i == 0 else self.seg_feat_channels - kernel_conv = self.add_sublayer( - 'bbox_head.kernel_convs.' + str(i), - ConvNormLayer( - ch_in=ch_in, - ch_out=self.seg_feat_channels, - filter_size=3, - stride=1, - use_dcn=use_dcn, - norm_type=self.norm_type)) - self.kernel_pred_convs.append(kernel_conv) - ch_in = self.in_channels if i == 0 else self.seg_feat_channels - cate_conv = self.add_sublayer( - 'bbox_head.cate_convs.' 
+ str(i), - ConvNormLayer( - ch_in=ch_in, - ch_out=self.seg_feat_channels, - filter_size=3, - stride=1, - use_dcn=use_dcn, - norm_type=self.norm_type)) - self.cate_pred_convs.append(cate_conv) - - self.solo_kernel = self.add_sublayer( - 'bbox_head.solo_kernel', - nn.Conv2D( - self.seg_feat_channels, - self.kernel_out_channels, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=True)) - self.solo_cate = self.add_sublayer( - 'bbox_head.solo_cate', - nn.Conv2D( - self.seg_feat_channels, - self.cate_out_channels, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.01)), - bias_attr=ParamAttr(initializer=Constant( - value=float(-np.log((1 - 0.01) / 0.01)))))) - - if self.drop_block and self.training: - self.drop_block_fun = DropBlock( - block_size=3, keep_prob=0.9, name='solo_cate.dropblock') - - def _points_nms(self, heat, kernel_size=2): - hmax = F.max_pool2d(heat, kernel_size=kernel_size, stride=1, padding=1) - keep = paddle.cast((hmax[:, :, :-1, :-1] == heat), 'float32') - return heat * keep - - def _split_feats(self, feats): - return (F.interpolate( - feats[0], - scale_factor=0.5, - align_corners=False, - align_mode=0, - mode='bilinear'), feats[1], feats[2], feats[3], F.interpolate( - feats[4], - size=paddle.shape(feats[3])[-2:], - mode='bilinear', - align_corners=False, - align_mode=0)) - - def forward(self, input): - """ - Get SOLOv2 head output - - Args: - input (list): List of Tensors, output of backbone or neck stages - Returns: - cate_pred_list (list): Tensors of each category branch layer - kernel_pred_list (list): Tensors of each kernel branch layer - """ - feats = self._split_feats(input) - cate_pred_list = [] - kernel_pred_list = [] - for idx in range(len(self.seg_num_grids)): - cate_pred, kernel_pred = self._get_output_single(feats[idx], idx) - cate_pred_list.append(cate_pred) - kernel_pred_list.append(kernel_pred) - - return cate_pred_list, kernel_pred_list - - def _get_output_single(self, input, idx): - ins_kernel_feat = input - # CoordConv - x_range = paddle.linspace( - -1, 1, paddle.shape(ins_kernel_feat)[-1], dtype='float32') - y_range = paddle.linspace( - -1, 1, paddle.shape(ins_kernel_feat)[-2], dtype='float32') - y, x = paddle.meshgrid([y_range, x_range]) - x = paddle.unsqueeze(x, [0, 1]) - y = paddle.unsqueeze(y, [0, 1]) - y = paddle.expand( - y, shape=[paddle.shape(ins_kernel_feat)[0], 1, -1, -1]) - x = paddle.expand( - x, shape=[paddle.shape(ins_kernel_feat)[0], 1, -1, -1]) - coord_feat = paddle.concat([x, y], axis=1) - ins_kernel_feat = paddle.concat([ins_kernel_feat, coord_feat], axis=1) - - # kernel branch - kernel_feat = ins_kernel_feat - seg_num_grid = self.seg_num_grids[idx] - kernel_feat = F.interpolate( - kernel_feat, - size=[seg_num_grid, seg_num_grid], - mode='bilinear', - align_corners=False, - align_mode=0) - cate_feat = kernel_feat[:, :-2, :, :] - - for kernel_layer in self.kernel_pred_convs: - kernel_feat = F.relu(kernel_layer(kernel_feat)) - if self.drop_block and self.training: - kernel_feat = self.drop_block_fun(kernel_feat) - kernel_pred = self.solo_kernel(kernel_feat) - # cate branch - for cate_layer in self.cate_pred_convs: - cate_feat = F.relu(cate_layer(cate_feat)) - if self.drop_block and self.training: - cate_feat = self.drop_block_fun(cate_feat) - cate_pred = self.solo_cate(cate_feat) - - if not self.training: - cate_pred = self._points_nms(F.sigmoid(cate_pred), kernel_size=2) - cate_pred = paddle.transpose(cate_pred, [0, 2, 
3, 1]) - return cate_pred, kernel_pred - - def get_loss(self, cate_preds, kernel_preds, ins_pred, ins_labels, - cate_labels, grid_order_list, fg_num): - """ - Get loss of network of SOLOv2. - - Args: - cate_preds (list): Tensor list of categroy branch output. - kernel_preds (list): Tensor list of kernel branch output. - ins_pred (list): Tensor list of instance branch output. - ins_labels (list): List of instance labels pre batch. - cate_labels (list): List of categroy labels pre batch. - grid_order_list (list): List of index in pre grid. - fg_num (int): Number of positive samples in a mini-batch. - Returns: - loss_ins (Tensor): The instance loss Tensor of SOLOv2 network. - loss_cate (Tensor): The category loss Tensor of SOLOv2 network. - """ - batch_size = paddle.shape(grid_order_list[0])[0] - ins_pred_list = [] - for kernel_preds_level, grid_orders_level in zip(kernel_preds, - grid_order_list): - if grid_orders_level.shape[1] == 0: - ins_pred_list.append(None) - continue - grid_orders_level = paddle.reshape(grid_orders_level, [-1]) - reshape_pred = paddle.reshape( - kernel_preds_level, - shape=(paddle.shape(kernel_preds_level)[0], - paddle.shape(kernel_preds_level)[1], -1)) - reshape_pred = paddle.transpose(reshape_pred, [0, 2, 1]) - reshape_pred = paddle.reshape( - reshape_pred, shape=(-1, paddle.shape(reshape_pred)[2])) - gathered_pred = paddle.gather(reshape_pred, index=grid_orders_level) - gathered_pred = paddle.reshape( - gathered_pred, - shape=[batch_size, -1, paddle.shape(gathered_pred)[1]]) - cur_ins_pred = ins_pred - cur_ins_pred = paddle.reshape( - cur_ins_pred, - shape=(paddle.shape(cur_ins_pred)[0], - paddle.shape(cur_ins_pred)[1], -1)) - ins_pred_conv = paddle.matmul(gathered_pred, cur_ins_pred) - cur_ins_pred = paddle.reshape( - ins_pred_conv, - shape=(-1, paddle.shape(ins_pred)[-2], - paddle.shape(ins_pred)[-1])) - ins_pred_list.append(cur_ins_pred) - - num_ins = paddle.sum(fg_num) - cate_preds = [ - paddle.reshape( - paddle.transpose(cate_pred, [0, 2, 3, 1]), - shape=(-1, self.cate_out_channels)) for cate_pred in cate_preds - ] - flatten_cate_preds = paddle.concat(cate_preds) - new_cate_labels = [] - for cate_label in cate_labels: - new_cate_labels.append(paddle.reshape(cate_label, shape=[-1])) - cate_labels = paddle.concat(new_cate_labels) - - loss_ins, loss_cate = self.solov2_loss( - ins_pred_list, ins_labels, flatten_cate_preds, cate_labels, num_ins) - - return {'loss_ins': loss_ins, 'loss_cate': loss_cate} - - def get_prediction(self, cate_preds, kernel_preds, seg_pred, im_shape, - scale_factor): - """ - Get prediction result of SOLOv2 network - - Args: - cate_preds (list): List of Variables, output of categroy branch. - kernel_preds (list): List of Variables, output of kernel branch. - seg_pred (list): List of Variables, output of mask head stages. - im_shape (Variables): [h, w] for input images. - scale_factor (Variables): [scale, scale] for input images. - Returns: - seg_masks (Tensor): The prediction segmentation. - cate_labels (Tensor): The prediction categroy label of each segmentation. - seg_masks (Tensor): The prediction score of each segmentation. 
- """ - num_levels = len(cate_preds) - featmap_size = paddle.shape(seg_pred)[-2:] - seg_masks_list = [] - cate_labels_list = [] - cate_scores_list = [] - cate_preds = [cate_pred * 1.0 for cate_pred in cate_preds] - kernel_preds = [kernel_pred * 1.0 for kernel_pred in kernel_preds] - # Currently only supports batch size == 1 - for idx in range(1): - cate_pred_list = [ - paddle.reshape( - cate_preds[i][idx], shape=(-1, self.cate_out_channels)) - for i in range(num_levels) - ] - seg_pred_list = seg_pred - kernel_pred_list = [ - paddle.reshape( - paddle.transpose(kernel_preds[i][idx], [1, 2, 0]), - shape=(-1, self.kernel_out_channels)) - for i in range(num_levels) - ] - cate_pred_list = paddle.concat(cate_pred_list, axis=0) - kernel_pred_list = paddle.concat(kernel_pred_list, axis=0) - - seg_masks, cate_labels, cate_scores = self.get_seg_single( - cate_pred_list, seg_pred_list, kernel_pred_list, featmap_size, - im_shape[idx], scale_factor[idx][0]) - bbox_num = paddle.shape(cate_labels)[0:1] - return seg_masks, cate_labels, cate_scores, bbox_num - - def get_seg_single(self, cate_preds, seg_preds, kernel_preds, featmap_size, - im_shape, scale_factor): - """ - The code of this function is based on: - https://github.com/WXinlong/SOLO/blob/master/mmdet/models/anchor_heads/solov2_head.py#L385 - """ - h = paddle.cast(im_shape[0], 'int32') - w = paddle.cast(im_shape[1], 'int32') - upsampled_size_out = [featmap_size[0] * 4, featmap_size[1] * 4] - - y = paddle.zeros(shape=paddle.shape(cate_preds), dtype='float32') - inds = paddle.where(cate_preds > self.score_threshold, cate_preds, y) - inds = paddle.nonzero(inds) - cate_preds = paddle.reshape(cate_preds, shape=[-1]) - # Prevent empty and increase fake data - ind_a = paddle.cast(paddle.shape(kernel_preds)[0:1], 'int64') - ind_b = paddle.zeros(shape=[1], dtype='int64') - inds_end = paddle.unsqueeze(paddle.concat([ind_a, ind_b]), 0) - inds = paddle.concat([inds, inds_end]) - kernel_preds_end = paddle.ones( - shape=[1, self.kernel_out_channels], dtype='float32') - kernel_preds = paddle.concat([kernel_preds, kernel_preds_end]) - cate_preds = paddle.concat( - [cate_preds, paddle.zeros( - shape=[1], dtype='float32')]) - - # cate_labels & kernel_preds - cate_labels = inds[:, 1] - kernel_preds = paddle.gather(kernel_preds, index=inds[:, 0]) - cate_score_idx = paddle.add(inds[:, 0] * self.cate_out_channels, - cate_labels) - cate_scores = paddle.gather(cate_preds, index=cate_score_idx) - - size_trans = np.power(self.seg_num_grids, 2) - strides = [] - for _ind in range(len(self.segm_strides)): - strides.append( - paddle.full( - shape=[int(size_trans[_ind])], - fill_value=self.segm_strides[_ind], - dtype="int32")) - strides = paddle.concat(strides) - strides = paddle.concat( - [strides, paddle.zeros( - shape=[1], dtype='int32')]) - strides = paddle.gather(strides, index=inds[:, 0]) - - # mask encoding. 
- kernel_preds = paddle.unsqueeze(kernel_preds, [2, 3]) - seg_preds = F.conv2d(seg_preds, kernel_preds) - seg_preds = F.sigmoid(paddle.squeeze(seg_preds, [0])) - seg_masks = seg_preds > self.mask_threshold - seg_masks = paddle.cast(seg_masks, 'float32') - sum_masks = paddle.sum(seg_masks, axis=[1, 2]) - - y = paddle.zeros(shape=paddle.shape(sum_masks), dtype='float32') - keep = paddle.where(sum_masks > strides, sum_masks, y) - keep = paddle.nonzero(keep) - keep = paddle.squeeze(keep, axis=[1]) - # Prevent empty and increase fake data - keep_other = paddle.concat( - [keep, paddle.cast(paddle.shape(sum_masks)[0:1] - 1, 'int64')]) - keep_scores = paddle.concat( - [keep, paddle.cast(paddle.shape(sum_masks)[0:1], 'int64')]) - cate_scores_end = paddle.zeros(shape=[1], dtype='float32') - cate_scores = paddle.concat([cate_scores, cate_scores_end]) - - seg_masks = paddle.gather(seg_masks, index=keep_other) - seg_preds = paddle.gather(seg_preds, index=keep_other) - sum_masks = paddle.gather(sum_masks, index=keep_other) - cate_labels = paddle.gather(cate_labels, index=keep_other) - cate_scores = paddle.gather(cate_scores, index=keep_scores) - - # mask scoring. - seg_mul = paddle.cast(seg_preds * seg_masks, 'float32') - seg_scores = paddle.sum(seg_mul, axis=[1, 2]) / sum_masks - cate_scores *= seg_scores - # Matrix NMS - seg_preds, cate_scores, cate_labels = self.mask_nms( - seg_preds, seg_masks, cate_labels, cate_scores, sum_masks=sum_masks) - ori_shape = im_shape[:2] / scale_factor + 0.5 - ori_shape = paddle.cast(ori_shape, 'int32') - seg_preds = F.interpolate( - paddle.unsqueeze(seg_preds, 0), - size=upsampled_size_out, - mode='bilinear', - align_corners=False, - align_mode=0) - seg_preds = paddle.slice( - seg_preds, axes=[2, 3], starts=[0, 0], ends=[h, w]) - seg_masks = paddle.squeeze( - F.interpolate( - seg_preds, - size=ori_shape[:2], - mode='bilinear', - align_corners=False, - align_mode=0), - axis=[0]) - seg_masks = paddle.cast(seg_masks > self.mask_threshold, 'uint8') - return seg_masks, cate_labels, cate_scores diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/sparse_roi_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/sparse_roi_head.py deleted file mode 100644 index bdc76a9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/sparse_roi_head.py +++ /dev/null @@ -1,467 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
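The core of SOLOv2's get_seg_single above is the dynamic-kernel step: every grid cell that survives score filtering contributes a kernel vector, which is reshaped into a 1x1 convolution filter (paddle.unsqueeze(kernel_preds, [2, 3])) and run over the shared mask feature map with F.conv2d. A minimal, self-contained sketch of just that step; the shapes and the 0.5 threshold are illustrative, not the exact tensors used above:

    import paddle
    import paddle.nn.functional as F

    E, H, W = 128, 56, 56                      # mask-feature channels and size
    num_inst = 5                               # grid cells kept after filtering
    mask_feat = paddle.rand([1, E, H, W])      # shared mask-branch output
    kernels = paddle.rand([num_inst, E])       # gathered kernel predictions

    weight = kernels.reshape([num_inst, E, 1, 1])    # [out, in, kH, kW]
    seg_logits = F.conv2d(mask_feat, weight)         # [1, num_inst, H, W]
    seg_probs = F.sigmoid(seg_logits).squeeze(0)     # one soft mask per instance
    seg_masks = paddle.cast(seg_probs > 0.5, 'float32')  # 0.5 stands in for mask_threshold

The fake-data concatenations in the real function exist only to keep tensors non-empty under static-graph export; the sketch omits them, along with Matrix NMS and the final resize back to the original image shape.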
- -# This code is referenced from: https://github.com/open-mmlab/mmdetection - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy - -import paddle -from paddle import nn - -from ppdet.core.workspace import register -from ppdet.modeling import initializer as init -from .roi_extractor import RoIAlign -from ..bbox_utils import delta2bbox_v2 -from ..cls_utils import _get_class_default_kwargs -from ..layers import MultiHeadAttention - -__all__ = ['SparseRoIHead', 'DIIHead', 'DynamicMaskHead'] - - -class DynamicConv(nn.Layer): - def __init__(self, - in_channels=256, - feature_channels=64, - out_channels=None, - roi_resolution=7, - with_proj=True): - super(DynamicConv, self).__init__() - - self.in_channels = in_channels - self.feature_channels = feature_channels - self.out_channels = out_channels if out_channels else in_channels - - self.num_params_in = self.in_channels * self.feature_channels - self.num_params_out = self.out_channels * self.feature_channels - self.dynamic_layer = nn.Linear(self.in_channels, - self.num_params_in + self.num_params_out) - - self.norm_in = nn.LayerNorm(self.feature_channels) - self.norm_out = nn.LayerNorm(self.out_channels) - - self.activation = nn.ReLU() - - self.with_proj = with_proj - if self.with_proj: - num_output = self.out_channels * roi_resolution**2 - self.fc_layer = nn.Linear(num_output, self.out_channels) - self.fc_norm = nn.LayerNorm(self.out_channels) - - def forward(self, param_feature, input_feature): - input_feature = input_feature.flatten(2).transpose([2, 0, 1]) - input_feature = input_feature.transpose([1, 0, 2]) - - parameters = self.dynamic_layer(param_feature) - - param_in = parameters[:, :self.num_params_in].reshape( - [-1, self.in_channels, self.feature_channels]) - param_out = parameters[:, -self.num_params_out:].reshape( - [-1, self.feature_channels, self.out_channels]) - - features = paddle.bmm(input_feature, param_in) - features = self.norm_in(features) - features = self.activation(features) - - features = paddle.bmm(features, param_out) - features = self.norm_out(features) - features = self.activation(features) - - if self.with_proj: - features = features.flatten(1) - features = self.fc_layer(features) - features = self.fc_norm(features) - features = self.activation(features) - - return features - - -class FFN(nn.Layer): - def __init__(self, - embed_dims=256, - feedforward_channels=2048, - num_fcs=2, - ffn_drop=0.0, - add_identity=True): - super(FFN, self).__init__() - - layers = [] - in_channels = embed_dims - for _ in range(num_fcs - 1): - layers.append( - nn.Sequential( - nn.Linear(in_channels, feedforward_channels), - nn.ReLU(), nn.Dropout(ffn_drop))) - in_channels = feedforward_channels - layers.append(nn.Linear(feedforward_channels, embed_dims)) - layers.append(nn.Dropout(ffn_drop)) - self.layers = nn.Sequential(*layers) - - self.add_identity = add_identity - - def forward(self, x): - identity = x - out = self.layers(x) - if not self.add_identity: - return out - else: - return out + identity - - -@register -class DynamicMaskHead(nn.Layer): - __shared__ = ['num_classes', 'proposal_embedding_dim', 'norm_type'] - - def __init__(self, - num_classes=80, - proposal_embedding_dim=256, - dynamic_feature_channels=64, - roi_resolution=14, - num_convs=4, - conv_kernel_size=3, - conv_channels=256, - upsample_method='deconv', - upsample_scale_factor=2, - norm_type='bn'): - super(DynamicMaskHead, self).__init__() - - self.d_model = proposal_embedding_dim - - 
self.instance_interactive_conv = DynamicConv( - self.d_model, - dynamic_feature_channels, - roi_resolution=roi_resolution, - with_proj=False) - - self.convs = nn.LayerList() - for i in range(num_convs): - self.convs.append( - nn.Sequential( - nn.Conv2D( - self.d_model if i == 0 else conv_channels, - conv_channels, - conv_kernel_size, - padding='same', - bias_attr=False), - nn.BatchNorm2D(conv_channels), - nn.ReLU())) - if norm_type == 'sync_bn': - self.convs = nn.SyncBatchNorm.convert_sync_batchnorm(self.convs) - - self.upsample_method = upsample_method - if upsample_method is None: - self.upsample = None - elif upsample_method == 'deconv': - self.upsample = nn.Conv2DTranspose( - conv_channels if num_convs > 0 else self.d_model, - conv_channels, - upsample_scale_factor, - stride=upsample_scale_factor) - self.relu = nn.ReLU() - else: - self.upsample = nn.Upsample(None, upsample_scale_factor) - - cls_in_channels = conv_channels if num_convs > 0 else self.d_model - cls_in_channels = conv_channels if upsample_method == 'deconv' else cls_in_channels - self.conv_cls = nn.Conv2D(cls_in_channels, num_classes, 1) - - self._init_weights() - - def _init_weights(self): - for p in self.parameters(): - if p.dim() > 1: - init.xavier_uniform_(p) - - init.constant_(self.conv_cls.bias, 0.) - - def forward(self, roi_features, attn_features): - attn_features = attn_features.reshape([-1, self.d_model]) - attn_features_iic = self.instance_interactive_conv(attn_features, - roi_features) - - x = attn_features_iic.transpose([0, 2, 1]).reshape(roi_features.shape) - - for conv in self.convs: - x = conv(x) - if self.upsample is not None: - x = self.upsample(x) - if self.upsample_method == 'deconv': - x = self.relu(x) - mask_pred = self.conv_cls(x) - return mask_pred - - -@register -class DIIHead(nn.Layer): - __shared__ = ['num_classes', 'proposal_embedding_dim'] - - def __init__(self, - num_classes=80, - proposal_embedding_dim=256, - feedforward_channels=2048, - dynamic_feature_channels=64, - roi_resolution=7, - num_attn_heads=8, - dropout=0.0, - num_ffn_fcs=2, - num_cls_fcs=1, - num_reg_fcs=3): - super(DIIHead, self).__init__() - - self.num_classes = num_classes - self.d_model = proposal_embedding_dim - - self.attention = MultiHeadAttention(self.d_model, num_attn_heads, - dropout) - self.attention_norm = nn.LayerNorm(self.d_model) - - self.instance_interactive_conv = DynamicConv( - self.d_model, - dynamic_feature_channels, - roi_resolution=roi_resolution, - with_proj=True) - self.instance_interactive_conv_dropout = nn.Dropout(dropout) - self.instance_interactive_conv_norm = nn.LayerNorm(self.d_model) - - self.ffn = FFN(self.d_model, feedforward_channels, num_ffn_fcs, dropout) - self.ffn_norm = nn.LayerNorm(self.d_model) - - self.cls_fcs = nn.LayerList() - for _ in range(num_cls_fcs): - self.cls_fcs.append( - nn.Linear( - self.d_model, self.d_model, bias_attr=False)) - self.cls_fcs.append(nn.LayerNorm(self.d_model)) - self.cls_fcs.append(nn.ReLU()) - self.fc_cls = nn.Linear(self.d_model, self.num_classes) - - self.reg_fcs = nn.LayerList() - for _ in range(num_reg_fcs): - self.reg_fcs.append( - nn.Linear( - self.d_model, self.d_model, bias_attr=False)) - self.reg_fcs.append(nn.LayerNorm(self.d_model)) - self.reg_fcs.append(nn.ReLU()) - self.fc_reg = nn.Linear(self.d_model, 4) - - self._init_weights() - - def _init_weights(self): - for p in self.parameters(): - if p.dim() > 1: - init.xavier_uniform_(p) - - bias_init = init.bias_init_with_prob(0.01) - init.constant_(self.fc_cls.bias, bias_init) - - def forward(self, 
roi_features, proposal_features): - N, num_proposals = proposal_features.shape[:2] - - proposal_features = proposal_features + self.attention( - proposal_features) - attn_features = self.attention_norm(proposal_features) - - proposal_features = attn_features.reshape([-1, self.d_model]) - proposal_features_iic = self.instance_interactive_conv( - proposal_features, roi_features) - proposal_features = proposal_features + self.instance_interactive_conv_dropout( - proposal_features_iic) - obj_features = self.instance_interactive_conv_norm(proposal_features) - - obj_features = self.ffn(obj_features) - obj_features = self.ffn_norm(obj_features) - - cls_feature = obj_features.clone() - reg_feature = obj_features.clone() - - for cls_layer in self.cls_fcs: - cls_feature = cls_layer(cls_feature) - class_logits = self.fc_cls(cls_feature) - for reg_layer in self.reg_fcs: - reg_feature = reg_layer(reg_feature) - bbox_deltas = self.fc_reg(reg_feature) - - class_logits = class_logits.reshape( - [N, num_proposals, self.num_classes]) - bbox_deltas = bbox_deltas.reshape([N, num_proposals, 4]) - obj_features = obj_features.reshape([N, num_proposals, self.d_model]) - - return class_logits, bbox_deltas, obj_features, attn_features - - @staticmethod - def refine_bboxes(proposal_bboxes, bbox_deltas): - pred_bboxes = delta2bbox_v2( - bbox_deltas.reshape([-1, 4]), - proposal_bboxes.reshape([-1, 4]), - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[0.5, 0.5, 1.0, 1.0], - ctr_clip=None) - return pred_bboxes.reshape(proposal_bboxes.shape) - - -@register -class SparseRoIHead(nn.Layer): - __inject__ = ['bbox_head', 'mask_head', 'loss_func'] - - def __init__(self, - num_stages=6, - bbox_roi_extractor=_get_class_default_kwargs(RoIAlign), - mask_roi_extractor=_get_class_default_kwargs(RoIAlign), - bbox_head='DIIHead', - mask_head='DynamicMaskHead', - loss_func='QueryInstLoss'): - super(SparseRoIHead, self).__init__() - - self.num_stages = num_stages - - self.bbox_roi_extractor = bbox_roi_extractor - self.mask_roi_extractor = mask_roi_extractor - if isinstance(bbox_roi_extractor, dict): - self.bbox_roi_extractor = RoIAlign(**bbox_roi_extractor) - if isinstance(mask_roi_extractor, dict): - self.mask_roi_extractor = RoIAlign(**mask_roi_extractor) - - self.bbox_heads = nn.LayerList( - [copy.deepcopy(bbox_head) for _ in range(num_stages)]) - self.mask_heads = nn.LayerList( - [copy.deepcopy(mask_head) for _ in range(num_stages)]) - - self.loss_helper = loss_func - - @classmethod - def from_config(cls, cfg, input_shape): - bbox_roi_extractor = cfg['bbox_roi_extractor'] - mask_roi_extractor = cfg['mask_roi_extractor'] - assert isinstance(bbox_roi_extractor, dict) - assert isinstance(mask_roi_extractor, dict) - - kwargs = RoIAlign.from_config(cfg, input_shape) - bbox_roi_extractor.update(kwargs) - mask_roi_extractor.update(kwargs) - - return { - 'bbox_roi_extractor': bbox_roi_extractor, - 'mask_roi_extractor': mask_roi_extractor - } - - @staticmethod - def get_roi_features(features, bboxes, roi_extractor): - rois_list = [ - bboxes[i] for i in range(len(bboxes)) if len(bboxes[i]) > 0 - ] - rois_num = paddle.to_tensor( - [len(bboxes[i]) for i in range(len(bboxes))], dtype='int32') - - pos_ids = paddle.cast(rois_num, dtype='bool') - if pos_ids.sum() != len(rois_num): - rois_num = rois_num[pos_ids] - features = [features[i][pos_ids] for i in range(len(features))] - - return roi_extractor(features, rois_list, rois_num) - - def _forward_train(self, body_feats, pro_bboxes, pro_feats, targets): - all_stage_losses = {} - for stage in 
range(self.num_stages): - bbox_head = self.bbox_heads[stage] - mask_head = self.mask_heads[stage] - - roi_feats = self.get_roi_features(body_feats, pro_bboxes, - self.bbox_roi_extractor) - class_logits, bbox_deltas, pro_feats, attn_feats = bbox_head( - roi_feats, pro_feats) - bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, - bbox_deltas) - - indices = self.loss_helper.matcher({ - 'pred_logits': class_logits.detach(), - 'pred_boxes': bbox_pred.detach() - }, targets) - avg_factor = paddle.to_tensor( - [sum(len(tgt['labels']) for tgt in targets)], dtype='float32') - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(avg_factor) - avg_factor /= paddle.distributed.get_world_size() - avg_factor = paddle.clip(avg_factor, min=1.) - - loss_classes = self.loss_helper.loss_classes(class_logits, targets, - indices, avg_factor) - if sum(len(v['labels']) for v in targets) == 0: - loss_bboxes = { - 'loss_bbox': paddle.to_tensor([0.]), - 'loss_giou': paddle.to_tensor([0.]) - } - loss_masks = {'loss_mask': paddle.to_tensor([0.])} - else: - loss_bboxes = self.loss_helper.loss_bboxes(bbox_pred, targets, - indices, avg_factor) - - pos_attn_feats = paddle.concat([ - paddle.gather( - src, src_idx, axis=0) - for src, (src_idx, _) in zip(attn_feats, indices) - ]) - pos_bbox_pred = [ - paddle.gather( - src, src_idx, axis=0) - for src, (src_idx, _) in zip(bbox_pred.detach(), indices) - ] - pos_roi_feats = self.get_roi_features(body_feats, pos_bbox_pred, - self.mask_roi_extractor) - mask_logits = mask_head(pos_roi_feats, pos_attn_feats) - loss_masks = self.loss_helper.loss_masks( - pos_bbox_pred, mask_logits, targets, indices, avg_factor) - - for loss in [loss_classes, loss_bboxes, loss_masks]: - for key in loss.keys(): - all_stage_losses[f'stage{stage}_{key}'] = loss[key] - - pro_bboxes = bbox_pred.detach() - - return all_stage_losses - - def _forward_test(self, body_feats, pro_bboxes, pro_feats): - for stage in range(self.num_stages): - roi_feats = self.get_roi_features(body_feats, pro_bboxes, - self.bbox_roi_extractor) - class_logits, bbox_deltas, pro_feats, attn_feats = self.bbox_heads[ - stage](roi_feats, pro_feats) - bbox_pred = self.bbox_heads[stage].refine_bboxes(pro_bboxes, - bbox_deltas) - - pro_bboxes = bbox_pred.detach() - - roi_feats = self.get_roi_features(body_feats, bbox_pred, - self.mask_roi_extractor) - mask_logits = self.mask_heads[stage](roi_feats, attn_feats) - - return { - 'class_logits': class_logits, - 'bbox_pred': bbox_pred, - 'mask_logits': mask_logits - } - - def forward(self, - body_features, - proposal_bboxes, - proposal_features, - targets=None): - if self.training: - return self._forward_train(body_features, proposal_bboxes, - proposal_features, targets) - else: - return self._forward_test(body_features, proposal_bboxes, - proposal_features) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/sparsercnn_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/sparsercnn_head.py deleted file mode 100644 index 0534cf4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/sparsercnn_head.py +++ /dev/null @@ -1,380 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/head.py
-The copyright of PeizeSun/SparseR-CNN is as follows:
-MIT License [see LICENSE for details]
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import copy
-import paddle
-import paddle.nn as nn
-
-from ppdet.core.workspace import register
-from ppdet.modeling.heads.roi_extractor import RoIAlign
-from ppdet.modeling.bbox_utils import delta2bbox
-from .. import initializer as init
-
-_DEFAULT_SCALE_CLAMP = math.log(100000. / 16)
-
-
-class DynamicConv(nn.Layer):
-    def __init__(
-            self,
-            head_hidden_dim,
-            head_dim_dynamic,
-            head_num_dynamic, ):
-        super().__init__()
-
-        self.hidden_dim = head_hidden_dim
-        self.dim_dynamic = head_dim_dynamic
-        self.num_dynamic = head_num_dynamic
-        self.num_params = self.hidden_dim * self.dim_dynamic
-        self.dynamic_layer = nn.Linear(self.hidden_dim,
-                                       self.num_dynamic * self.num_params)
-
-        self.norm1 = nn.LayerNorm(self.dim_dynamic)
-        self.norm2 = nn.LayerNorm(self.hidden_dim)
-
-        self.activation = nn.ReLU()
-
-        pooler_resolution = 7
-        num_output = self.hidden_dim * pooler_resolution**2
-        self.out_layer = nn.Linear(num_output, self.hidden_dim)
-        self.norm3 = nn.LayerNorm(self.hidden_dim)
-
-    def forward(self, pro_features, roi_features):
-        '''
-        pro_features: (1, N * nr_boxes, self.d_model)
-        roi_features: (49, N * nr_boxes, self.d_model)
-        '''
-        features = roi_features.transpose(perm=[1, 0, 2])
-        parameters = self.dynamic_layer(pro_features).transpose(perm=[1, 0, 2])
-
-        param1 = parameters[:, :, :self.num_params].reshape(
-            [-1, self.hidden_dim, self.dim_dynamic])
-        param2 = parameters[:, :, self.num_params:].reshape(
-            [-1, self.dim_dynamic, self.hidden_dim])
-
-        features = paddle.bmm(features, param1)
-        features = self.norm1(features)
-        features = self.activation(features)
-
-        features = paddle.bmm(features, param2)
-        features = self.norm2(features)
-        features = self.activation(features)
-
-        features = features.flatten(1)
-        features = self.out_layer(features)
-        features = self.norm3(features)
-        features = self.activation(features)
-
-        return features
-
-
-class RCNNHead(nn.Layer):
-    def __init__(
-            self,
-            d_model,
-            num_classes,
-            dim_feedforward,
-            nhead,
-            dropout,
-            head_cls,
-            head_reg,
-            head_dim_dynamic,
-            head_num_dynamic,
-            scale_clamp: float=_DEFAULT_SCALE_CLAMP,
-            bbox_weights=(2.0, 2.0, 1.0, 1.0), ):
-        super().__init__()
-
-        self.d_model = d_model
-
-        # dynamic.
-        self.self_attn = nn.MultiHeadAttention(d_model, nhead, dropout=dropout)
-        self.inst_interact = DynamicConv(d_model, head_dim_dynamic,
-                                         head_num_dynamic)
-
-        self.linear1 = nn.Linear(d_model, dim_feedforward)
-        self.dropout = nn.Dropout(dropout)
-        self.linear2 = nn.Linear(dim_feedforward, d_model)
-
-        self.norm1 = nn.LayerNorm(d_model)
-        self.norm2 = nn.LayerNorm(d_model)
-        self.norm3 = nn.LayerNorm(d_model)
-        self.dropout1 = nn.Dropout(dropout)
-        self.dropout2 = nn.Dropout(dropout)
-        self.dropout3 = nn.Dropout(dropout)
-
-        self.activation = nn.ReLU()
-
-        # cls.
- num_cls = head_cls - cls_module = list() - for _ in range(num_cls): - cls_module.append(nn.Linear(d_model, d_model, bias_attr=False)) - cls_module.append(nn.LayerNorm(d_model)) - cls_module.append(nn.ReLU()) - self.cls_module = nn.LayerList(cls_module) - - # reg. - num_reg = head_reg - reg_module = list() - for _ in range(num_reg): - reg_module.append(nn.Linear(d_model, d_model, bias_attr=False)) - reg_module.append(nn.LayerNorm(d_model)) - reg_module.append(nn.ReLU()) - self.reg_module = nn.LayerList(reg_module) - - # pred. - self.class_logits = nn.Linear(d_model, num_classes) - self.bboxes_delta = nn.Linear(d_model, 4) - self.scale_clamp = scale_clamp - self.bbox_weights = bbox_weights - - def forward(self, features, bboxes, pro_features, pooler): - """ - :param bboxes: (N, nr_boxes, 4) - :param pro_features: (N, nr_boxes, d_model) - """ - - N, nr_boxes = bboxes.shape[:2] - - proposal_boxes = list() - for b in range(N): - proposal_boxes.append(bboxes[b]) - roi_num = paddle.full([N], nr_boxes).astype("int32") - - roi_features = pooler(features, proposal_boxes, roi_num) - roi_features = roi_features.reshape( - [N * nr_boxes, self.d_model, -1]).transpose(perm=[2, 0, 1]) - - # self_att. - pro_features = pro_features.reshape([N, nr_boxes, self.d_model]) - pro_features2 = self.self_attn( - pro_features, pro_features, value=pro_features) - pro_features = pro_features.transpose(perm=[1, 0, 2]) + self.dropout1( - pro_features2.transpose(perm=[1, 0, 2])) - pro_features = self.norm1(pro_features) - - # inst_interact. - pro_features = pro_features.reshape( - [nr_boxes, N, self.d_model]).transpose(perm=[1, 0, 2]).reshape( - [1, N * nr_boxes, self.d_model]) - pro_features2 = self.inst_interact(pro_features, roi_features) - pro_features = pro_features + self.dropout2(pro_features2) - obj_features = self.norm2(pro_features) - - # obj_feature. 
-        obj_features2 = self.linear2(
-            self.dropout(self.activation(self.linear1(obj_features))))
-        obj_features = obj_features + self.dropout3(obj_features2)
-        obj_features = self.norm3(obj_features)
-
-        fc_feature = obj_features.transpose(perm=[1, 0, 2]).reshape(
-            [N * nr_boxes, -1])
-        cls_feature = fc_feature.clone()
-        reg_feature = fc_feature.clone()
-        for cls_layer in self.cls_module:
-            cls_feature = cls_layer(cls_feature)
-        for reg_layer in self.reg_module:
-            reg_feature = reg_layer(reg_feature)
-        class_logits = self.class_logits(cls_feature)
-        bboxes_deltas = self.bboxes_delta(reg_feature)
-        pred_bboxes = delta2bbox(bboxes_deltas,
-                                 bboxes.reshape([-1, 4]), self.bbox_weights)
-
-        return class_logits.reshape([N, nr_boxes, -1]), pred_bboxes.reshape(
-            [N, nr_boxes, -1]), obj_features
-
-
-@register
-class SparseRCNNHead(nn.Layer):
-    '''
-    SparseRCNNHead
-    Args:
-        roi_input_shape (list[ShapeSpec]): The output shape of fpn
-        num_classes (int): Number of classes,
-        head_hidden_dim (int): The hidden dimension of MultiHeadAttention,
-        head_dim_feedforward (int): The feed-forward dimension in each RCNNHead,
-        nhead (int): The number of heads in MultiHeadAttention,
-        head_dropout (float): The dropout probability,
-        head_cls (int): The number of layers in the classification branch,
-        head_reg (int): The number of layers in the regression branch,
-        head_dim_dynamic (int): The inner channel dimension of DynamicConv,
-        head_num_dynamic (int): The number of DynamicConv parameter groups,
-        head_num_heads (int): The number of stacked RCNNHead stages,
-        deep_supervision (bool): whether to supervise the intermediate results,
-        num_proposals (int): the number of proposal boxes and features
-    '''
-    __inject__ = ['loss_func']
-    __shared__ = ['num_classes']
-
-    def __init__(
-            self,
-            head_hidden_dim,
-            head_dim_feedforward,
-            nhead,
-            head_dropout,
-            head_cls,
-            head_reg,
-            head_dim_dynamic,
-            head_num_dynamic,
-            head_num_heads,
-            deep_supervision,
-            num_proposals,
-            num_classes=80,
-            loss_func="SparseRCNNLoss",
-            roi_input_shape=None, ):
-        super().__init__()
-        assert head_num_heads > 0, \
-            f'At least one RoI Head is required, but got {head_num_heads}.'
-
-        # Build RoI.
-        box_pooler = self._init_box_pooler(roi_input_shape)
-        self.box_pooler = box_pooler
-
-        # Build heads.
-        rcnn_head = RCNNHead(
-            head_hidden_dim,
-            num_classes,
-            head_dim_feedforward,
-            nhead,
-            head_dropout,
-            head_cls,
-            head_reg,
-            head_dim_dynamic,
-            head_num_dynamic, )
-        self.head_series = nn.LayerList(
-            [copy.deepcopy(rcnn_head) for i in range(head_num_heads)])
-        self.return_intermediate = deep_supervision
-
-        self.num_classes = num_classes
-
-        # build init proposal
-        self.init_proposal_features = nn.Embedding(num_proposals,
-                                                   head_hidden_dim)
-        self.init_proposal_boxes = nn.Embedding(num_proposals, 4)
-
-        self.lossfunc = loss_func
-
-        # Init parameters.
-        init.reset_initialized_parameter(self)
-        self._reset_parameters()
-
-    def _reset_parameters(self):
-        # init all parameters.
- prior_prob = 0.01 - bias_value = -math.log((1 - prior_prob) / prior_prob) - - for m in self.sublayers(): - if isinstance(m, nn.Linear): - init.xavier_normal_(m.weight, reverse=True) - elif not isinstance(m, nn.Embedding) and hasattr( - m, "weight") and m.weight.dim() > 1: - init.xavier_normal_(m.weight, reverse=False) - - if hasattr(m, "bias") and m.bias is not None and m.bias.shape[ - -1] == self.num_classes: - init.constant_(m.bias, bias_value) - - init_bboxes = paddle.empty_like(self.init_proposal_boxes.weight) - init_bboxes[:, :2] = 0.5 - init_bboxes[:, 2:] = 1.0 - self.init_proposal_boxes.weight.set_value(init_bboxes) - - @staticmethod - def _init_box_pooler(input_shape): - - pooler_resolution = 7 - sampling_ratio = 2 - - if input_shape is not None: - pooler_scales = tuple(1.0 / input_shape[k].stride - for k in range(len(input_shape))) - in_channels = [ - input_shape[f].channels for f in range(len(input_shape)) - ] - end_level = len(input_shape) - 1 - # Check all channel counts are equal - assert len(set(in_channels)) == 1, in_channels - else: - pooler_scales = [1.0 / 4.0, 1.0 / 8.0, 1.0 / 16.0, 1.0 / 32.0] - end_level = 3 - - aligned = True - if paddle.device.is_compiled_with_custom_device('npu'): - aligned = False - box_pooler = RoIAlign( - resolution=pooler_resolution, - spatial_scale=pooler_scales, - sampling_ratio=sampling_ratio, - end_level=end_level, - aligned=aligned) - return box_pooler - - def forward(self, features, input_whwh): - - bs = len(features[0]) - bboxes = box_cxcywh_to_xyxy(self.init_proposal_boxes.weight.clone( - )).unsqueeze(0) - bboxes = bboxes * input_whwh.unsqueeze(-2) - - init_features = self.init_proposal_features.weight.unsqueeze(0).tile( - [1, bs, 1]) - proposal_features = init_features.clone() - - inter_class_logits = [] - inter_pred_bboxes = [] - - for stage, rcnn_head in enumerate(self.head_series): - class_logits, pred_bboxes, proposal_features = rcnn_head( - features, bboxes, proposal_features, self.box_pooler) - - if self.return_intermediate or stage == len(self.head_series) - 1: - inter_class_logits.append(class_logits) - inter_pred_bboxes.append(pred_bboxes) - bboxes = pred_bboxes.detach() - - output = { - 'pred_logits': inter_class_logits[-1], - 'pred_boxes': inter_pred_bboxes[-1] - } - if self.return_intermediate: - output['aux_outputs'] = [{ - 'pred_logits': a, - 'pred_boxes': b - } for a, b in zip(inter_class_logits[:-1], inter_pred_bboxes[:-1])] - - return output - - def get_loss(self, outputs, targets): - losses = self.lossfunc(outputs, targets) - weight_dict = self.lossfunc.weight_dict - - for k in losses.keys(): - if k in weight_dict: - losses[k] *= weight_dict[k] - - return losses - - -def box_cxcywh_to_xyxy(x): - x_c, y_c, w, h = x.unbind(-1) - b = [(x_c - 0.5 * w), (y_c - 0.5 * h), (x_c + 0.5 * w), (y_c + 0.5 * h)] - return paddle.stack(b, axis=-1) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ssd_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ssd_head.py deleted file mode 100644 index a6df482..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ssd_head.py +++ /dev/null @@ -1,216 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from ppdet.core.workspace import register
-from paddle.regularizer import L2Decay
-from paddle import ParamAttr
-
-from ..layers import AnchorGeneratorSSD
-from ..cls_utils import _get_class_default_kwargs
-
-
-class SepConvLayer(nn.Layer):
-    def __init__(self,
-                 in_channels,
-                 out_channels,
-                 kernel_size=3,
-                 padding=1,
-                 conv_decay=0.):
-        super(SepConvLayer, self).__init__()
-        self.dw_conv = nn.Conv2D(
-            in_channels=in_channels,
-            out_channels=in_channels,
-            kernel_size=kernel_size,
-            stride=1,
-            padding=padding,
-            groups=in_channels,
-            weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),
-            bias_attr=False)
-
-        self.bn = nn.BatchNorm2D(
-            in_channels,
-            weight_attr=ParamAttr(regularizer=L2Decay(0.)),
-            bias_attr=ParamAttr(regularizer=L2Decay(0.)))
-
-        self.pw_conv = nn.Conv2D(
-            in_channels=in_channels,
-            out_channels=out_channels,
-            kernel_size=1,
-            stride=1,
-            padding=0,
-            weight_attr=ParamAttr(regularizer=L2Decay(conv_decay)),
-            bias_attr=False)
-
-    def forward(self, x):
-        x = self.dw_conv(x)
-        x = F.relu6(self.bn(x))
-        x = self.pw_conv(x)
-        return x
-
-
-class SSDExtraHead(nn.Layer):
-    def __init__(self,
-                 in_channels=256,
-                 out_channels=([256, 512], [256, 512], [128, 256], [128, 256],
-                               [128, 256]),
-                 strides=(2, 2, 2, 1, 1),
-                 paddings=(1, 1, 1, 0, 0)):
-        super(SSDExtraHead, self).__init__()
-        self.convs = nn.LayerList()
-        for out_channel, stride, padding in zip(out_channels, strides,
-                                                paddings):
-            self.convs.append(
-                self._make_layers(in_channels, out_channel[0], out_channel[1],
-                                  stride, padding))
-            in_channels = out_channel[-1]
-
-    def _make_layers(self, c_in, c_hidden, c_out, stride_3x3, padding_3x3):
-        return nn.Sequential(
-            nn.Conv2D(c_in, c_hidden, 1),
-            nn.ReLU(),
-            nn.Conv2D(c_hidden, c_out, 3, stride_3x3, padding_3x3), nn.ReLU())
-
-    def forward(self, x):
-        out = [x]
-        for conv_layer in self.convs:
-            out.append(conv_layer(out[-1]))
-        return out
-
-
-@register
-class SSDHead(nn.Layer):
-    """
-    SSDHead
-
-    Args:
-        num_classes (int): Number of classes
-        in_channels (list): Number of channels per input feature
-        anchor_generator (dict): Configuration of 'AnchorGeneratorSSD' instance
-        kernel_size (int): Conv kernel size
-        padding (int): Conv padding
-        use_sepconv (bool): Use SepConvLayer if true
-        conv_decay (float): Conv regularization coeff
-        loss (object): 'SSDLoss' instance
-        use_extra_head (bool): If you use ResNet34 as the backbone, you should set `use_extra_head`=True
-    """
-
-    __shared__ = ['num_classes']
-    __inject__ = ['anchor_generator', 'loss']
-
-    def __init__(self,
-                 num_classes=80,
-                 in_channels=(512, 1024, 512, 256, 256, 256),
-                 anchor_generator=_get_class_default_kwargs(AnchorGeneratorSSD),
-                 kernel_size=3,
-                 padding=1,
-                 use_sepconv=False,
-                 conv_decay=0.,
-                 loss='SSDLoss',
-                 use_extra_head=False):
-        super(SSDHead, self).__init__()
-        # add background class
-        self.num_classes = num_classes + 1
-        self.in_channels = in_channels
-        self.anchor_generator = anchor_generator
-        self.loss = loss
-        self.use_extra_head = use_extra_head
-
-        if self.use_extra_head:
-            self.ssd_extra_head = 
SSDExtraHead() - self.in_channels = [256, 512, 512, 256, 256, 256] - - if isinstance(anchor_generator, dict): - self.anchor_generator = AnchorGeneratorSSD(**anchor_generator) - - self.num_priors = self.anchor_generator.num_priors - self.box_convs = [] - self.score_convs = [] - for i, num_prior in enumerate(self.num_priors): - box_conv_name = "boxes{}".format(i) - if not use_sepconv: - box_conv = self.add_sublayer( - box_conv_name, - nn.Conv2D( - in_channels=self.in_channels[i], - out_channels=num_prior * 4, - kernel_size=kernel_size, - padding=padding)) - else: - box_conv = self.add_sublayer( - box_conv_name, - SepConvLayer( - in_channels=self.in_channels[i], - out_channels=num_prior * 4, - kernel_size=kernel_size, - padding=padding, - conv_decay=conv_decay)) - self.box_convs.append(box_conv) - - score_conv_name = "scores{}".format(i) - if not use_sepconv: - score_conv = self.add_sublayer( - score_conv_name, - nn.Conv2D( - in_channels=self.in_channels[i], - out_channels=num_prior * self.num_classes, - kernel_size=kernel_size, - padding=padding)) - else: - score_conv = self.add_sublayer( - score_conv_name, - SepConvLayer( - in_channels=self.in_channels[i], - out_channels=num_prior * self.num_classes, - kernel_size=kernel_size, - padding=padding, - conv_decay=conv_decay)) - self.score_convs.append(score_conv) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def forward(self, feats, image, gt_bbox=None, gt_class=None): - if self.use_extra_head: - assert len(feats) == 1, \ - ("If you set use_extra_head=True, backbone feature " - "list length should be 1.") - feats = self.ssd_extra_head(feats[0]) - box_preds = [] - cls_scores = [] - for feat, box_conv, score_conv in zip(feats, self.box_convs, - self.score_convs): - box_pred = box_conv(feat) - box_pred = paddle.transpose(box_pred, [0, 2, 3, 1]) - box_pred = paddle.reshape(box_pred, [0, -1, 4]) - box_preds.append(box_pred) - - cls_score = score_conv(feat) - cls_score = paddle.transpose(cls_score, [0, 2, 3, 1]) - cls_score = paddle.reshape(cls_score, [0, -1, self.num_classes]) - cls_scores.append(cls_score) - - prior_boxes = self.anchor_generator(feats, image) - - if self.training: - return self.get_loss(box_preds, cls_scores, gt_bbox, gt_class, - prior_boxes) - else: - return (box_preds, cls_scores), prior_boxes - - def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes): - return self.loss(boxes, scores, gt_bbox, gt_class, prior_boxes) diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/tood_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/tood_head.py deleted file mode 100644 index be84098..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/tood_head.py +++ /dev/null @@ -1,376 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
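A note on the reshape idiom in SSDHead.forward above: in paddle.reshape, a 0 in the target shape means "copy the corresponding dimension from the input", so [0, -1, 4] keeps the batch axis while flattening one level's grid of priors. A small sketch of the per-level layout change, with illustrative sizes:

    import paddle

    N, A, H, W = 2, 6, 38, 38                    # batch, priors per cell, map size
    box_pred = paddle.rand([N, A * 4, H, W])     # raw box-branch conv output

    box_pred = paddle.transpose(box_pred, [0, 2, 3, 1])  # [N, H, W, A*4]
    box_pred = paddle.reshape(box_pred, [0, -1, 4])      # [N, H*W*A, 4]
    assert list(box_pred.shape) == [N, H * W * A, 4]

The score branch follows the same pattern with num_classes channels per prior, and the resulting per-level lists line up with the prior boxes produced by AnchorGeneratorSSD.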
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant - -from ppdet.core.workspace import register -from ..initializer import normal_, constant_, bias_init_with_prob -from ppdet.modeling.bbox_utils import bbox_center, batch_distance2bbox -from ..losses import GIoULoss -from ppdet.modeling.layers import ConvNormLayer -from ppdet.modeling.ops import get_static_shape -from ppdet.modeling.assigners.utils import generate_anchors_for_grid_cell - - -class ScaleReg(nn.Layer): - """ - Parameter for scaling the regression outputs. - """ - - def __init__(self, init_scale=1.): - super(ScaleReg, self).__init__() - self.scale_reg = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=init_scale)), - dtype="float32") - - def forward(self, inputs): - out = inputs * self.scale_reg - return out - - -class TaskDecomposition(nn.Layer): - """This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py - """ - - def __init__( - self, - feat_channels, - stacked_convs, - la_down_rate=8, - norm_type='gn', - norm_groups=32, ): - super(TaskDecomposition, self).__init__() - self.feat_channels = feat_channels - self.stacked_convs = stacked_convs - self.norm_type = norm_type - self.norm_groups = norm_groups - self.in_channels = self.feat_channels * self.stacked_convs - self.la_conv1 = nn.Conv2D(self.in_channels, - self.in_channels // la_down_rate, 1) - self.la_conv2 = nn.Conv2D(self.in_channels // la_down_rate, - self.stacked_convs, 1) - - self.reduction_conv = ConvNormLayer( - self.in_channels, - self.feat_channels, - filter_size=1, - stride=1, - norm_type=self.norm_type, - norm_groups=self.norm_groups) - - self._init_weights() - - def _init_weights(self): - normal_(self.la_conv1.weight, std=0.001) - normal_(self.la_conv2.weight, std=0.001) - - def forward(self, feat, avg_feat): - feat_shape = get_static_shape(feat) - b = feat_shape[0:1] - h = feat_shape[2:3] - w = feat_shape[3:4] - weight = F.relu(self.la_conv1(avg_feat)) - weight = F.sigmoid(self.la_conv2(weight)).unsqueeze(-1) - feat = paddle.reshape( - feat, [b, self.stacked_convs, self.feat_channels, h, w]) * weight - feat = self.reduction_conv(feat.flatten(1, 2)) - feat = F.relu(feat) - return feat - - -@register -class TOODHead(nn.Layer): - """This code is based on - https://github.com/fcjian/TOOD/blob/master/mmdet/models/dense_heads/tood_head.py - """ - __inject__ = ['nms', 'static_assigner', 'assigner'] - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - feat_channels=256, - stacked_convs=6, - fpn_strides=(8, 16, 32, 64, 128), - grid_cell_scale=8, - grid_cell_offset=0.5, - norm_type='gn', - norm_groups=32, - static_assigner_epoch=4, - use_align_head=True, - loss_weight={ - 'class': 1.0, - 'bbox': 1.0, - 'iou': 2.0, - }, - nms='MultiClassNMS', - static_assigner='ATSSAssigner', - assigner='TaskAlignedAssigner'): - super(TOODHead, self).__init__() - self.num_classes = num_classes - self.feat_channels = feat_channels - self.stacked_convs = stacked_convs - self.fpn_strides = fpn_strides - self.grid_cell_scale = grid_cell_scale - self.grid_cell_offset = grid_cell_offset - self.static_assigner_epoch = static_assigner_epoch - self.use_align_head = use_align_head - self.nms = nms - self.static_assigner = static_assigner - self.assigner = assigner - self.loss_weight = 
loss_weight - self.giou_loss = GIoULoss() - - self.inter_convs = nn.LayerList() - for i in range(self.stacked_convs): - self.inter_convs.append( - ConvNormLayer( - self.feat_channels, - self.feat_channels, - filter_size=3, - stride=1, - norm_type=norm_type, - norm_groups=norm_groups)) - - self.cls_decomp = TaskDecomposition( - self.feat_channels, - self.stacked_convs, - self.stacked_convs * 8, - norm_type=norm_type, - norm_groups=norm_groups) - self.reg_decomp = TaskDecomposition( - self.feat_channels, - self.stacked_convs, - self.stacked_convs * 8, - norm_type=norm_type, - norm_groups=norm_groups) - - self.tood_cls = nn.Conv2D( - self.feat_channels, self.num_classes, 3, padding=1) - self.tood_reg = nn.Conv2D(self.feat_channels, 4, 3, padding=1) - - if self.use_align_head: - self.cls_prob_conv1 = nn.Conv2D(self.feat_channels * - self.stacked_convs, - self.feat_channels // 4, 1) - self.cls_prob_conv2 = nn.Conv2D( - self.feat_channels // 4, 1, 3, padding=1) - self.reg_offset_conv1 = nn.Conv2D(self.feat_channels * - self.stacked_convs, - self.feat_channels // 4, 1) - self.reg_offset_conv2 = nn.Conv2D( - self.feat_channels // 4, 4 * 2, 3, padding=1) - - self.scales_regs = nn.LayerList([ScaleReg() for _ in self.fpn_strides]) - - self._init_weights() - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'feat_channels': input_shape[0].channels, - 'fpn_strides': [i.stride for i in input_shape], - } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - normal_(self.tood_cls.weight, std=0.01) - constant_(self.tood_cls.bias, bias_cls) - normal_(self.tood_reg.weight, std=0.01) - - if self.use_align_head: - normal_(self.cls_prob_conv1.weight, std=0.01) - normal_(self.cls_prob_conv2.weight, std=0.01) - constant_(self.cls_prob_conv2.bias, bias_cls) - normal_(self.reg_offset_conv1.weight, std=0.001) - constant_(self.reg_offset_conv2.weight) - constant_(self.reg_offset_conv2.bias) - - def _reg_grid_sample(self, feat, offset, anchor_points): - feat_shape = get_static_shape(feat) - b = feat_shape[0:1] - h = feat_shape[2:3] - w = feat_shape[3:4] - feat = paddle.reshape(feat, [-1, 1, h, w]) - offset = paddle.reshape(offset, [-1, 2, h, w]).transpose([0, 2, 3, 1]) - grid_shape = paddle.concat([w, h]).astype('float32') - grid = (offset + anchor_points) / grid_shape - grid = 2 * grid.clip(0., 1.) 
- 1 - feat = F.grid_sample(feat, grid) - feat = paddle.reshape(feat, [b, -1, h, w]) - return feat - - def forward(self, feats): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - anchors, anchor_points, num_anchors_list, stride_tensor =\ - generate_anchors_for_grid_cell( - feats, self.fpn_strides, self.grid_cell_scale, - self.grid_cell_offset) - anchor_centers_split = paddle.split(anchor_points / stride_tensor, - num_anchors_list) - - cls_score_list, bbox_pred_list = [], [] - for feat, scale_reg, anchor_centers, stride in zip( - feats, self.scales_regs, anchor_centers_split, - self.fpn_strides): - b, _, h, w = get_static_shape(feat) - inter_feats = [] - for inter_conv in self.inter_convs: - feat = F.relu(inter_conv(feat)) - inter_feats.append(feat) - feat = paddle.concat(inter_feats, axis=1) - - # task decomposition - avg_feat = F.adaptive_avg_pool2d(feat, (1, 1)) - cls_feat = self.cls_decomp(feat, avg_feat) - reg_feat = self.reg_decomp(feat, avg_feat) - - # cls prediction and alignment - cls_logits = self.tood_cls(cls_feat) - if self.use_align_head: - cls_prob = F.relu(self.cls_prob_conv1(feat)) - cls_prob = F.sigmoid(self.cls_prob_conv2(cls_prob)) - cls_score = (F.sigmoid(cls_logits) * cls_prob).sqrt() - else: - cls_score = F.sigmoid(cls_logits) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - - # reg prediction and alignment - reg_dist = scale_reg(self.tood_reg(reg_feat).exp()) - reg_dist = reg_dist.flatten(2).transpose([0, 2, 1]) - reg_bbox = batch_distance2bbox( - anchor_centers.unsqueeze(0), reg_dist) - if self.use_align_head: - reg_offset = F.relu(self.reg_offset_conv1(feat)) - reg_offset = self.reg_offset_conv2(reg_offset) - reg_bbox = reg_bbox.transpose([0, 2, 1]).reshape([b, 4, h, w]) - anchor_centers = anchor_centers.reshape([1, h, w, 2]) - bbox_pred = self._reg_grid_sample(reg_bbox, reg_offset, - anchor_centers) - bbox_pred = bbox_pred.flatten(2).transpose([0, 2, 1]) - else: - bbox_pred = reg_bbox - - if not self.training: - bbox_pred *= stride - bbox_pred_list.append(bbox_pred) - cls_score_list = paddle.concat(cls_score_list, axis=1) - bbox_pred_list = paddle.concat(bbox_pred_list, axis=1) - - return cls_score_list, bbox_pred_list, anchors, num_anchors_list, stride_tensor - - @staticmethod - def _focal_loss(score, label, alpha=0.25, gamma=2.0): - weight = (score - label).pow(gamma) - if alpha > 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - weight *= alpha_t - loss = F.binary_cross_entropy( - score, label, weight=weight, reduction='sum') - return loss - - def get_loss(self, head_outs, gt_meta): - pred_scores, pred_bboxes, anchors, \ - num_anchors_list, stride_tensor = head_outs - gt_labels = gt_meta['gt_class'] - gt_bboxes = gt_meta['gt_bbox'] - pad_gt_mask = gt_meta['pad_gt_mask'] - # label assignment - if gt_meta['epoch_id'] < self.static_assigner_epoch: - assigned_labels, assigned_bboxes, assigned_scores = self.static_assigner( - anchors, - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = 0.25 - else: - assigned_labels, assigned_bboxes, assigned_scores = self.assigner( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, - bbox_center(anchors), - num_anchors_list, - gt_labels, - gt_bboxes, - pad_gt_mask, - bg_index=self.num_classes) - alpha_l = -1 - - # rescale bbox - assigned_bboxes /= stride_tensor - # classification loss - loss_cls = self._focal_loss(pred_scores, assigned_scores, alpha=alpha_l) - # select positive 
samples mask - mask_positive = (assigned_labels != self.num_classes) - num_pos = mask_positive.astype(paddle.float32).sum() - # bbox regression loss - if num_pos > 0: - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 4]) - assigned_bboxes_pos = paddle.masked_select( - assigned_bboxes, bbox_mask).reshape([-1, 4]) - bbox_weight = paddle.masked_select( - assigned_scores.sum(-1), mask_positive).unsqueeze(-1) - # iou loss - loss_iou = self.giou_loss(pred_bboxes_pos, - assigned_bboxes_pos) * bbox_weight - loss_iou = loss_iou.sum() / bbox_weight.sum() - # l1 loss - loss_l1 = F.l1_loss(pred_bboxes_pos, assigned_bboxes_pos) - else: - loss_iou = paddle.zeros([1]) - loss_l1 = paddle.zeros([1]) - - loss_cls /= assigned_scores.sum().clip(min=1) - loss = self.loss_weight['class'] * loss_cls + self.loss_weight[ - 'iou'] * loss_iou - - return { - 'loss': loss, - 'loss_class': loss_cls, - 'loss_iou': loss_iou, - 'loss_l1': loss_l1 - } - - def post_process(self, head_outs, img_shape, scale_factor): - pred_scores, pred_bboxes, _, _, _ = head_outs - pred_scores = pred_scores.transpose([0, 2, 1]) - - for i in range(len(pred_bboxes)): - pred_bboxes[i, :, 0] = pred_bboxes[i, :, 0].clip( - min=0, max=img_shape[i, 1]) - pred_bboxes[i, :, 1] = pred_bboxes[i, :, 1].clip( - min=0, max=img_shape[i, 0]) - pred_bboxes[i, :, 2] = pred_bboxes[i, :, 2].clip( - min=0, max=img_shape[i, 1]) - pred_bboxes[i, :, 3] = pred_bboxes[i, :, 3].clip( - min=0, max=img_shape[i, 0]) - # scale bbox to origin - scale_factor = scale_factor.flip([1]).tile([1, 2]).unsqueeze(1) - pred_bboxes /= scale_factor - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/ttf_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/ttf_head.py deleted file mode 100644 index dfe97bd..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/ttf_head.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant, Normal -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register -from ppdet.modeling.layers import DeformableConvV2, LiteConv -import numpy as np - - -@register -class HMHead(nn.Layer): - """ - Args: - ch_in (int): The channel number of input Tensor. - ch_out (int): The channel number of output Tensor. - num_classes (int): Number of classes. - conv_num (int): The convolution number of hm_feat. - dcn_head(bool): whether use dcn in head. False by default. - lite_head(bool): whether use lite version. False by default. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. 
- bn by default - - Return: - Heatmap head output - """ - __shared__ = ['num_classes', 'norm_type'] - - def __init__( - self, - ch_in, - ch_out=128, - num_classes=80, - conv_num=2, - dcn_head=False, - lite_head=False, - norm_type='bn', ): - super(HMHead, self).__init__() - head_conv = nn.Sequential() - for i in range(conv_num): - name = 'conv.{}'.format(i) - if lite_head: - lite_name = 'hm.' + name - head_conv.add_sublayer( - lite_name, - LiteConv( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - norm_type=norm_type)) - else: - if dcn_head: - head_conv.add_sublayer( - name, - DeformableConvV2( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) - else: - head_conv.add_sublayer( - name, - nn.Conv2D( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - head_conv.add_sublayer(name + '.act', nn.ReLU()) - self.feat = head_conv - bias_init = float(-np.log((1 - 0.01) / 0.01)) - weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, - 0.01)) - self.head = nn.Conv2D( - in_channels=ch_out, - out_channels=num_classes, - kernel_size=1, - weight_attr=weight_attr, - bias_attr=ParamAttr( - learning_rate=2., - regularizer=L2Decay(0.), - initializer=Constant(bias_init))) - - def forward(self, feat): - out = self.feat(feat) - out = self.head(out) - return out - - -@register -class WHHead(nn.Layer): - """ - Args: - ch_in (int): The channel number of input Tensor. - ch_out (int): The channel number of output Tensor. - conv_num (int): The convolution number of wh_feat. - dcn_head(bool): whether use dcn in head. False by default. - lite_head(bool): whether use lite version. False by default. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. - bn by default - Return: - Width & Height head output - """ - __shared__ = ['norm_type'] - - def __init__(self, - ch_in, - ch_out=64, - conv_num=2, - dcn_head=False, - lite_head=False, - norm_type='bn'): - super(WHHead, self).__init__() - head_conv = nn.Sequential() - for i in range(conv_num): - name = 'conv.{}'.format(i) - if lite_head: - lite_name = 'wh.' + name - head_conv.add_sublayer( - lite_name, - LiteConv( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - norm_type=norm_type)) - else: - if dcn_head: - head_conv.add_sublayer( - name, - DeformableConvV2( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)))) - else: - head_conv.add_sublayer( - name, - nn.Conv2D( - in_channels=ch_in if i == 0 else ch_out, - out_channels=ch_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - head_conv.add_sublayer(name + '.act', nn.ReLU()) - - weight_attr = None if lite_head else ParamAttr(initializer=Normal(0, - 0.01)) - self.feat = head_conv - self.head = nn.Conv2D( - in_channels=ch_out, - out_channels=4, - kernel_size=1, - weight_attr=weight_attr, - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.))) - - def forward(self, feat): - out = self.feat(feat) - out = self.head(out) - out = F.relu(out) - return out - - -@register -class TTFHead(nn.Layer): - """ - TTFHead - Args: - in_channels (int): the channel number of input to TTFHead. 
- num_classes (int): the number of classes, 80 by default. - hm_head_planes (int): the channel number in heatmap head, - 128 by default. - wh_head_planes (int): the channel number in width & height head, - 64 by default. - hm_head_conv_num (int): the number of convolution in heatmap head, - 2 by default. - wh_head_conv_num (int): the number of convolution in width & height - head, 2 by default. - hm_loss (object): Instance of 'CTFocalLoss'. - wh_loss (object): Instance of 'GIoULoss'. - wh_offset_base (float): the base offset of width and height, - 16.0 by default. - down_ratio (int): the actual down_ratio is calculated by base_down_ratio - (default 16) and the number of upsample layers. - lite_head(bool): whether use lite version. False by default. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. - bn by default - ags_module(bool): whether use AGS module to reweight location feature. - false by default. - - """ - - __shared__ = ['num_classes', 'down_ratio', 'norm_type'] - __inject__ = ['hm_loss', 'wh_loss'] - - def __init__(self, - in_channels, - num_classes=80, - hm_head_planes=128, - wh_head_planes=64, - hm_head_conv_num=2, - wh_head_conv_num=2, - hm_loss='CTFocalLoss', - wh_loss='GIoULoss', - wh_offset_base=16., - down_ratio=4, - dcn_head=False, - lite_head=False, - norm_type='bn', - ags_module=False): - super(TTFHead, self).__init__() - self.in_channels = in_channels - self.hm_head = HMHead(in_channels, hm_head_planes, num_classes, - hm_head_conv_num, dcn_head, lite_head, norm_type) - self.wh_head = WHHead(in_channels, wh_head_planes, wh_head_conv_num, - dcn_head, lite_head, norm_type) - self.hm_loss = hm_loss - self.wh_loss = wh_loss - - self.wh_offset_base = wh_offset_base - self.down_ratio = down_ratio - self.ags_module = ags_module - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels, } - - def forward(self, feats): - hm = self.hm_head(feats) - wh = self.wh_head(feats) * self.wh_offset_base - return hm, wh - - def filter_box_by_weight(self, pred, target, weight): - """ - Filter out boxes where ttf_reg_weight is 0, only keep positive samples. 
- """ - index = paddle.nonzero(weight > 0) - index.stop_gradient = True - weight = paddle.gather_nd(weight, index) - pred = paddle.gather_nd(pred, index) - target = paddle.gather_nd(target, index) - return pred, target, weight - - def filter_loc_by_weight(self, score, weight): - index = paddle.nonzero(weight > 0) - index.stop_gradient = True - score = paddle.gather_nd(score, index) - return score - - def get_loss(self, pred_hm, pred_wh, target_hm, box_target, target_weight): - pred_hm = paddle.clip(F.sigmoid(pred_hm), 1e-4, 1 - 1e-4) - hm_loss = self.hm_loss(pred_hm, target_hm) - H, W = target_hm.shape[2:] - mask = paddle.reshape(target_weight, [-1, H, W]) - avg_factor = paddle.sum(mask) + 1e-4 - - base_step = self.down_ratio - shifts_x = paddle.arange(0, W * base_step, base_step, dtype='int32') - shifts_y = paddle.arange(0, H * base_step, base_step, dtype='int32') - shift_y, shift_x = paddle.tensor.meshgrid([shifts_y, shifts_x]) - base_loc = paddle.stack([shift_x, shift_y], axis=0) - base_loc.stop_gradient = True - - pred_boxes = paddle.concat( - [0 - pred_wh[:, 0:2, :, :] + base_loc, pred_wh[:, 2:4] + base_loc], - axis=1) - pred_boxes = paddle.transpose(pred_boxes, [0, 2, 3, 1]) - boxes = paddle.transpose(box_target, [0, 2, 3, 1]) - boxes.stop_gradient = True - - if self.ags_module: - pred_hm_max = paddle.max(pred_hm, axis=1, keepdim=True) - pred_hm_max_softmax = F.softmax(pred_hm_max, axis=1) - pred_hm_max_softmax = paddle.transpose(pred_hm_max_softmax, - [0, 2, 3, 1]) - pred_hm_max_softmax = self.filter_loc_by_weight(pred_hm_max_softmax, - mask) - else: - pred_hm_max_softmax = None - - pred_boxes, boxes, mask = self.filter_box_by_weight(pred_boxes, boxes, - mask) - mask.stop_gradient = True - wh_loss = self.wh_loss( - pred_boxes, - boxes, - iou_weight=mask.unsqueeze(1), - loc_reweight=pred_hm_max_softmax) - wh_loss = wh_loss / avg_factor - - ttf_loss = {'hm_loss': hm_loss, 'wh_loss': wh_loss} - return ttf_loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/vitpose_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/vitpose_head.py deleted file mode 100644 index 43908ed..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/vitpose_head.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.keypoint_utils import resize, flip_back -from paddle.nn.initializer import TruncatedNormal, Constant, Normal -from ppdet.modeling.layers import ConvTranspose2d, BatchNorm2d - -trunc_normal_ = TruncatedNormal(std=.02) -normal_ = Normal(std=0.001) -zeros_ = Constant(value=0.) -ones_ = Constant(value=1.) 
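Looking back at TTFHead.get_loss above (before the vitpose_head.py hunk): boxes are decoded densely by adding the predicted left/top/right/bottom distances to a stride-spaced grid of pixel coordinates, the base_loc tensor. A minimal sketch of that decode, using float coordinates and illustrative sizes rather than the int32 grid built above:

    import paddle

    H, W, stride = 4, 4, 4                     # tiny feature map, down_ratio of 4
    shifts_x = paddle.arange(0, W * stride, stride, dtype='float32')
    shifts_y = paddle.arange(0, H * stride, stride, dtype='float32')
    shift_y, shift_x = paddle.meshgrid([shifts_y, shifts_x])
    base_loc = paddle.stack([shift_x, shift_y], axis=0)   # [2, H, W] pixel coords

    pred_wh = paddle.rand([1, 4, H, W])                   # l, t, r, b per pixel
    pred_boxes = paddle.concat(
        [base_loc - pred_wh[:, 0:2], base_loc + pred_wh[:, 2:4]], axis=1)
    # pred_boxes: [1, 4, H, W] in xyxy form; the head transposes this to NHWC
    # before feeding the GIoU loss, masked to positive locations.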
- -__all__ = ['TopdownHeatmapSimpleHead'] - - -@register -class TopdownHeatmapSimpleHead(nn.Layer): - def __init__(self, - in_channels=768, - out_channels=17, - num_deconv_layers=3, - num_deconv_filters=(256, 256, 256), - num_deconv_kernels=(4, 4, 4), - extra=None, - in_index=0, - input_transform=None, - align_corners=False, - upsample=0, - flip_pairs=None, - shift_heatmap=False, - target_type='GaussianHeatmap'): - super(TopdownHeatmapSimpleHead, self).__init__() - - self.in_channels = in_channels - self.upsample = upsample - self.flip_pairs = flip_pairs - self.shift_heatmap = shift_heatmap - self.target_type = target_type - - self._init_inputs(in_channels, in_index, input_transform) - self.in_index = in_index - self.align_corners = align_corners - - if extra is not None and not isinstance(extra, dict): - raise TypeError('extra should be dict or None.') - - if num_deconv_layers > 0: - self.deconv_layers = self._make_deconv_layer( - num_deconv_layers, - num_deconv_filters, - num_deconv_kernels, ) - elif num_deconv_layers == 0: - self.deconv_layers = nn.Identity() - else: - raise ValueError( - f'num_deconv_layers ({num_deconv_layers}) should >= 0.') - - identity_final_layer = False - if extra is not None and 'final_conv_kernel' in extra: - assert extra['final_conv_kernel'] in [0, 1, 3] - if extra['final_conv_kernel'] == 3: - padding = 1 - elif extra['final_conv_kernel'] == 1: - padding = 0 - else: - # 0 for Identity mapping. - identity_final_layer = True - kernel_size = extra['final_conv_kernel'] - else: - kernel_size = 1 - padding = 0 - - if identity_final_layer: - self.final_layer = nn.Identity() - else: - conv_channels = num_deconv_filters[ - -1] if num_deconv_layers > 0 else self.in_channels - - layers = [] - if extra is not None: - num_conv_layers = extra.get('num_conv_layers', 0) - num_conv_kernels = extra.get('num_conv_kernels', - [1] * num_conv_layers) - - for i in range(num_conv_layers): - layers.append( - nn.Conv2D( - in_channels=conv_channels, - out_channels=conv_channels, - kernel_size=num_conv_kernels[i], - stride=1, - padding=(num_conv_kernels[i] - 1) // 2)) - layers.append(nn.BatchNorm2D(conv_channels)) - layers.append(nn.ReLU()) - - layers.append( - nn.Conv2D( - in_channels=conv_channels, - out_channels=out_channels, - kernel_size=kernel_size, - stride=1, - padding=(padding, padding))) - - if len(layers) > 1: - self.final_layer = nn.Sequential(*layers) - else: - self.final_layer = layers[0] - - self.init_weights() - - @staticmethod - def _get_deconv_cfg(deconv_kernel): - """Get configurations for deconv layers.""" - if deconv_kernel == 4: - padding = 1 - output_padding = 0 - elif deconv_kernel == 3: - padding = 1 - output_padding = 1 - elif deconv_kernel == 2: - padding = 0 - output_padding = 0 - else: - raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') - - return deconv_kernel, padding, output_padding - - def _init_inputs(self, in_channels, in_index, input_transform): - """Check and initialize input transforms. 
- """ - - if input_transform is not None: - assert input_transform in ['resize_concat', 'multiple_select'] - self.input_transform = input_transform - self.in_index = in_index - if input_transform is not None: - assert isinstance(in_channels, (list, tuple)) - assert isinstance(in_index, (list, tuple)) - assert len(in_channels) == len(in_index) - if input_transform == 'resize_concat': - self.in_channels = sum(in_channels) - else: - self.in_channels = in_channels - else: - assert isinstance(in_channels, int) - assert isinstance(in_index, int) - self.in_channels = in_channels - - def _transform_inputs(self, inputs): - """Transform inputs for decoder. - """ - if not isinstance(inputs, list): - if not isinstance(inputs, list): - - if self.upsample > 0: - inputs = resize( - input=F.relu(inputs), - scale_factor=self.upsample, - mode='bilinear', - align_corners=self.align_corners) - return inputs - - if self.input_transform == 'resize_concat': - inputs = [inputs[i] for i in self.in_index] - upsampled_inputs = [ - resize( - input=x, - size=inputs[0].shape[2:], - mode='bilinear', - align_corners=self.align_corners) for x in inputs - ] - inputs = paddle.concat(upsampled_inputs, dim=1) - elif self.input_transform == 'multiple_select': - inputs = [inputs[i] for i in self.in_index] - else: - inputs = inputs[self.in_index] - - return inputs - - def forward(self, x): - """Forward function.""" - x = self._transform_inputs(x) - x = self.deconv_layers(x) - x = self.final_layer(x) - - return x - - def inference_model(self, x, flip_pairs=None): - """Inference function. - - Returns: - output_heatmap (np.ndarray): Output heatmaps. - - Args: - x (torch.Tensor[N,K,H,W]): Input features. - flip_pairs (None | list[tuple]): - Pairs of keypoints which are mirrored. - """ - output = self.forward(x) - - if flip_pairs is not None: - output_heatmap = flip_back( - output, self.flip_pairs, target_type=self.target_type) - # feature is not aligned, shift flipped heatmap for higher accuracy - if self.shift_heatmap: - output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] - else: - output_heatmap = output - return output_heatmap - - def _make_deconv_layer(self, num_layers, num_filters, num_kernels): - """Make deconv layers.""" - if num_layers != len(num_filters): - error_msg = f'num_layers({num_layers}) ' \ - f'!= length of num_filters({len(num_filters)})' - raise ValueError(error_msg) - if num_layers != len(num_kernels): - error_msg = f'num_layers({num_layers}) ' \ - f'!= length of num_kernels({len(num_kernels)})' - raise ValueError(error_msg) - - layers = [] - for i in range(num_layers): - kernel, padding, output_padding = \ - self._get_deconv_cfg(num_kernels[i]) - - planes = num_filters[i] - layers.append( - ConvTranspose2d( - in_channels=self.in_channels, - out_channels=planes, - kernel_size=kernel, - stride=2, - padding=padding, - output_padding=output_padding, - bias=False)) - layers.append(nn.BatchNorm2D(planes)) - layers.append(nn.ReLU()) - self.in_channels = planes - - return nn.Sequential(*layers) - - def init_weights(self): - """Initialize model weights.""" - if not isinstance(self.deconv_layers, nn.Identity): - - for m in self.deconv_layers: - if isinstance(m, nn.BatchNorm2D): - ones_(m.weight) - ones_(m.bias) - if not isinstance(self.final_layer, nn.Conv2D): - - for m in self.final_layer: - if isinstance(m, nn.Conv2D): - normal_(m.weight) - zeros_(m.bias) - elif isinstance(m, nn.BatchNorm2D): - ones_(m.weight) - ones_(m.bias) - else: - normal_(self.final_layer.weight) - zeros_(self.final_layer.bias) diff --git 
a/pdfdet/models/Paddle/ppdet/modeling/heads/yolo_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/yolo_head.py deleted file mode 100644 index 0a63060..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/yolo_head.py +++ /dev/null @@ -1,416 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register - -import math -import numpy as np -from ..initializer import bias_init_with_prob, constant_ -from ..backbones.csp_darknet import BaseConv, DWConv -from ..losses import IouLoss -from ppdet.modeling.assigners.simota_assigner import SimOTAAssigner -from ppdet.modeling.bbox_utils import bbox_overlaps -from ppdet.modeling.layers import MultiClassNMS - -__all__ = ['YOLOv3Head', 'YOLOXHead'] - - -def _de_sigmoid(x, eps=1e-7): - x = paddle.clip(x, eps, 1. / eps) - x = paddle.clip(1. / x - 1., eps, 1. / eps) - x = -paddle.log(x) - return x - - -@register -class YOLOv3Head(nn.Layer): - __shared__ = ['num_classes', 'data_format'] - __inject__ = ['loss'] - - def __init__(self, - in_channels=[1024, 512, 256], - anchors=[[10, 13], [16, 30], [33, 23], [30, 61], [62, 45], - [59, 119], [116, 90], [156, 198], [373, 326]], - anchor_masks=[[6, 7, 8], [3, 4, 5], [0, 1, 2]], - num_classes=80, - loss='YOLOv3Loss', - iou_aware=False, - iou_aware_factor=0.4, - data_format='NCHW'): - """ - Head for YOLOv3 network - - Args: - num_classes (int): number of foreground classes - anchors (list): anchors - anchor_masks (list): anchor masks - loss (object): YOLOv3Loss instance - iou_aware (bool): whether to use iou_aware - iou_aware_factor (float): iou aware factor - data_format (str): data format, NCHW or NHWC - """ - super(YOLOv3Head, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_classes = num_classes - self.loss = loss - - self.iou_aware = iou_aware - self.iou_aware_factor = iou_aware_factor - - self.parse_anchor(anchors, anchor_masks) - self.num_outputs = len(self.anchors) - self.data_format = data_format - - self.yolo_outputs = [] - for i in range(len(self.anchors)): - - if self.iou_aware: - num_filters = len(self.anchors[i]) * (self.num_classes + 6) - else: - num_filters = len(self.anchors[i]) * (self.num_classes + 5) - name = 'yolo_output.{}'.format(i) - conv = nn.Conv2D( - in_channels=self.in_channels[i], - out_channels=num_filters, - kernel_size=1, - stride=1, - padding=0, - data_format=data_format, - bias_attr=ParamAttr(regularizer=L2Decay(0.))) - conv.skip_quant = True - yolo_output = self.add_sublayer(name, conv) - self.yolo_outputs.append(yolo_output) - - def parse_anchor(self, anchors, anchor_masks): - self.anchors = [[anchors[i] for i in mask] for mask in anchor_masks] - self.mask_anchors = [] - anchor_num = len(anchors) - for masks in anchor_masks: - self.mask_anchors.append([]) - for 
mask in masks: - assert mask < anchor_num, "anchor mask index overflow" - self.mask_anchors[-1].extend(anchors[mask]) - - def forward(self, feats, targets=None): - assert len(feats) == len(self.anchors) - yolo_outputs = [] - for i, feat in enumerate(feats): - yolo_output = self.yolo_outputs[i](feat) - if self.data_format == 'NHWC': - yolo_output = paddle.transpose(yolo_output, [0, 3, 1, 2]) - yolo_outputs.append(yolo_output) - - if self.training: - return self.loss(yolo_outputs, targets, self.anchors) - else: - if self.iou_aware: - y = [] - for i, out in enumerate(yolo_outputs): - na = len(self.anchors[i]) - ioup, x = out[:, 0:na, :, :], out[:, na:, :, :] - b, c, h, w = x.shape - no = c // na - x = x.reshape((b, na, no, h * w)) - ioup = ioup.reshape((b, na, 1, h * w)) - obj = x[:, :, 4:5, :] - ioup = F.sigmoid(ioup) - obj = F.sigmoid(obj) - obj_t = (obj**(1 - self.iou_aware_factor)) * ( - ioup**self.iou_aware_factor) - obj_t = _de_sigmoid(obj_t) - loc_t = x[:, :, :4, :] - cls_t = x[:, :, 5:, :] - y_t = paddle.concat([loc_t, obj_t, cls_t], axis=2) - y_t = y_t.reshape((b, c, h, w)) - y.append(y_t) - return y - else: - return yolo_outputs - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - -@register -class YOLOXHead(nn.Layer): - __shared__ = ['num_classes', 'width_mult', 'act', 'trt', 'exclude_nms'] - __inject__ = ['assigner', 'nms'] - - def __init__(self, - num_classes=80, - width_mult=1.0, - depthwise=False, - in_channels=[256, 512, 1024], - feat_channels=256, - fpn_strides=(8, 16, 32), - l1_epoch=285, - act='silu', - assigner=SimOTAAssigner(use_vfl=False), - nms='MultiClassNMS', - loss_weight={ - 'cls': 1.0, - 'obj': 1.0, - 'iou': 5.0, - 'l1': 1.0, - }, - trt=False, - exclude_nms=False): - super(YOLOXHead, self).__init__() - self._dtype = paddle.framework.get_default_dtype() - self.num_classes = num_classes - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - feat_channels = int(feat_channels * width_mult) - self.fpn_strides = fpn_strides - self.l1_epoch = l1_epoch - self.assigner = assigner - self.nms = nms - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.exclude_nms = exclude_nms - self.loss_weight = loss_weight - self.iou_loss = IouLoss(loss_weight=1.0) # default loss_weight 2.5 - - ConvBlock = DWConv if depthwise else BaseConv - - self.stem_conv = nn.LayerList() - self.conv_cls = nn.LayerList() - self.conv_reg = nn.LayerList() # reg [x,y,w,h] + obj - for in_c in self.in_channels: - self.stem_conv.append(BaseConv(in_c, feat_channels, 1, 1, act=act)) - - self.conv_cls.append( - nn.Sequential(* [ - ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), - nn.Conv2D( - feat_channels, - self.num_classes, - 1, - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - ])) - - self.conv_reg.append( - nn.Sequential(* [ - ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), - ConvBlock( - feat_channels, feat_channels, 3, 1, act=act), - nn.Conv2D( - feat_channels, - 4 + 1, # reg [x,y,w,h] + obj - 1, - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - ])) - - self._init_weights() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - def _init_weights(self): - bias_cls = bias_init_with_prob(0.01) - bias_reg = paddle.full([5], math.log(5.), dtype=self._dtype) - bias_reg[:2] = 0. 
- bias_reg[-1] = bias_cls - for cls_, reg_ in zip(self.conv_cls, self.conv_reg): - constant_(cls_[-1].weight) - constant_(cls_[-1].bias, bias_cls) - constant_(reg_[-1].weight) - reg_[-1].bias.set_value(bias_reg) - - def _generate_anchor_point(self, feat_sizes, strides, offset=0.): - anchor_points, stride_tensor = [], [] - num_anchors_list = [] - for feat_size, stride in zip(feat_sizes, strides): - h, w = feat_size - x = (paddle.arange(w) + offset) * stride - y = (paddle.arange(h) + offset) * stride - y, x = paddle.meshgrid(y, x) - anchor_points.append(paddle.stack([x, y], axis=-1).reshape([-1, 2])) - stride_tensor.append( - paddle.full( - [len(anchor_points[-1]), 1], stride, dtype=self._dtype)) - num_anchors_list.append(len(anchor_points[-1])) - anchor_points = paddle.concat(anchor_points).astype(self._dtype) - anchor_points.stop_gradient = True - stride_tensor = paddle.concat(stride_tensor) - stride_tensor.stop_gradient = True - return anchor_points, stride_tensor, num_anchors_list - - def forward(self, feats, targets=None): - assert len(feats) == len(self.fpn_strides), \ - "The size of feats is not equal to size of fpn_strides" - - feat_sizes = [[f.shape[-2], f.shape[-1]] for f in feats] - cls_score_list, reg_pred_list = [], [] - obj_score_list = [] - for i, feat in enumerate(feats): - feat = self.stem_conv[i](feat) - cls_logit = self.conv_cls[i](feat) - reg_pred = self.conv_reg[i](feat) - # cls prediction - cls_score = F.sigmoid(cls_logit) - cls_score_list.append(cls_score.flatten(2).transpose([0, 2, 1])) - # reg prediction - reg_xywh, obj_logit = paddle.split(reg_pred, [4, 1], axis=1) - reg_xywh = reg_xywh.flatten(2).transpose([0, 2, 1]) - reg_pred_list.append(reg_xywh) - # obj prediction - obj_score = F.sigmoid(obj_logit) - obj_score_list.append(obj_score.flatten(2).transpose([0, 2, 1])) - - cls_score_list = paddle.concat(cls_score_list, axis=1) - reg_pred_list = paddle.concat(reg_pred_list, axis=1) - obj_score_list = paddle.concat(obj_score_list, axis=1) - - # bbox decode - anchor_points, stride_tensor, _ =\ - self._generate_anchor_point(feat_sizes, self.fpn_strides) - reg_xy, reg_wh = paddle.split(reg_pred_list, 2, axis=-1) - reg_xy += (anchor_points / stride_tensor) - reg_wh = paddle.exp(reg_wh) * 0.5 - bbox_pred_list = paddle.concat( - [reg_xy - reg_wh, reg_xy + reg_wh], axis=-1) - - if self.training: - anchor_points, stride_tensor, num_anchors_list =\ - self._generate_anchor_point(feat_sizes, self.fpn_strides, 0.5) - yolox_losses = self.get_loss([ - cls_score_list, bbox_pred_list, obj_score_list, anchor_points, - stride_tensor, num_anchors_list - ], targets) - return yolox_losses - else: - pred_scores = (cls_score_list * obj_score_list).sqrt() - return pred_scores, bbox_pred_list, stride_tensor - - def get_loss(self, head_outs, targets): - pred_cls, pred_bboxes, pred_obj,\ - anchor_points, stride_tensor, num_anchors_list = head_outs - gt_labels = targets['gt_class'] - gt_bboxes = targets['gt_bbox'] - pred_scores = (pred_cls * pred_obj).sqrt() - # label assignment - center_and_strides = paddle.concat( - [anchor_points, stride_tensor, stride_tensor], axis=-1) - pos_num_list, label_list, bbox_target_list = [], [], [] - for pred_score, pred_bbox, gt_box, gt_label in zip( - pred_scores.detach(), - pred_bboxes.detach() * stride_tensor, gt_bboxes, gt_labels): - pos_num, label, _, bbox_target = self.assigner( - pred_score, center_and_strides, pred_bbox, gt_box, gt_label) - pos_num_list.append(pos_num) - label_list.append(label) - bbox_target_list.append(bbox_target) - labels = 
paddle.to_tensor(np.stack(label_list, axis=0)) - bbox_targets = paddle.to_tensor(np.stack(bbox_target_list, axis=0)) - bbox_targets /= stride_tensor # rescale bbox - - # 1. obj score loss - mask_positive = (labels != self.num_classes) - loss_obj = F.binary_cross_entropy( - pred_obj, - mask_positive.astype(pred_obj.dtype).unsqueeze(-1), - reduction='sum') - - num_pos = sum(pos_num_list) - - if num_pos > 0: - num_pos = paddle.to_tensor(num_pos, dtype=self._dtype).clip(min=1) - loss_obj /= num_pos - - # 2. iou loss - bbox_mask = mask_positive.unsqueeze(-1).tile([1, 1, 4]) - pred_bboxes_pos = paddle.masked_select(pred_bboxes, - bbox_mask).reshape([-1, 4]) - assigned_bboxes_pos = paddle.masked_select( - bbox_targets, bbox_mask).reshape([-1, 4]) - bbox_iou = bbox_overlaps(pred_bboxes_pos, assigned_bboxes_pos) - bbox_iou = paddle.diag(bbox_iou) - - loss_iou = self.iou_loss( - pred_bboxes_pos.split( - 4, axis=-1), - assigned_bboxes_pos.split( - 4, axis=-1)) - loss_iou = loss_iou.sum() / num_pos - - # 3. cls loss - cls_mask = mask_positive.unsqueeze(-1).tile( - [1, 1, self.num_classes]) - pred_cls_pos = paddle.masked_select( - pred_cls, cls_mask).reshape([-1, self.num_classes]) - assigned_cls_pos = paddle.masked_select(labels, mask_positive) - assigned_cls_pos = F.one_hot(assigned_cls_pos, - self.num_classes + 1)[..., :-1] - assigned_cls_pos *= bbox_iou.unsqueeze(-1) - loss_cls = F.binary_cross_entropy( - pred_cls_pos, assigned_cls_pos, reduction='sum') - loss_cls /= num_pos - - # 4. l1 loss - if targets['epoch_id'] >= self.l1_epoch: - loss_l1 = F.l1_loss( - pred_bboxes_pos, assigned_bboxes_pos, reduction='sum') - loss_l1 /= num_pos - else: - loss_l1 = paddle.zeros([1]) - loss_l1.stop_gradient = False - else: - loss_cls = paddle.zeros([1]) - loss_iou = paddle.zeros([1]) - loss_l1 = paddle.zeros([1]) - loss_cls.stop_gradient = False - loss_iou.stop_gradient = False - loss_l1.stop_gradient = False - - loss = self.loss_weight['obj'] * loss_obj + \ - self.loss_weight['cls'] * loss_cls + \ - self.loss_weight['iou'] * loss_iou - - if targets['epoch_id'] >= self.l1_epoch: - loss += (self.loss_weight['l1'] * loss_l1) - - yolox_losses = { - 'loss': loss, - 'loss_cls': loss_cls, - 'loss_obj': loss_obj, - 'loss_iou': loss_iou, - 'loss_l1': loss_l1, - } - return yolox_losses - - def post_process(self, head_outs, img_shape, scale_factor): - pred_scores, pred_bboxes, stride_tensor = head_outs - pred_scores = pred_scores.transpose([0, 2, 1]) - pred_bboxes *= stride_tensor - # scale bbox to origin image - scale_factor = scale_factor.flip(-1).tile([1, 2]).unsqueeze(1) - pred_bboxes /= scale_factor - if self.exclude_nms: - # `exclude_nms=True` just use in benchmark - return pred_bboxes.sum(), pred_scores.sum() - else: - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/heads/yolof_head.py b/pdfdet/models/Paddle/ppdet/modeling/heads/yolof_head.py deleted file mode 100644 index 4893337..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/heads/yolof_head.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import Normal, Constant - -from ppdet.modeling.layers import MultiClassNMS -from ppdet.core.workspace import register -from ppdet.modeling.bbox_utils import delta2bbox_v2 - -__all__ = ['YOLOFHead'] - -INF = 1e8 - - -def reduce_mean(tensor): - world_size = paddle.distributed.get_world_size() - if world_size == 1: - return tensor - paddle.distributed.all_reduce(tensor) - return tensor / world_size - - -def find_inside_anchor(feat_size, stride, num_anchors, im_shape): - feat_h, feat_w = feat_size[:2] - im_h, im_w = im_shape[:2] - inside_h = min(int(np.ceil(im_h / stride)), feat_h) - inside_w = min(int(np.ceil(im_w / stride)), feat_w) - inside_mask = paddle.zeros([feat_h, feat_w], dtype=paddle.bool) - inside_mask[:inside_h, :inside_w] = True - inside_mask = inside_mask.unsqueeze(-1).expand( - [feat_h, feat_w, num_anchors]) - return inside_mask.reshape([-1]) - - -@register -class YOLOFFeat(nn.Layer): - def __init__(self, - feat_in=256, - feat_out=256, - num_cls_convs=2, - num_reg_convs=4, - norm_type='bn'): - super(YOLOFFeat, self).__init__() - assert norm_type == 'bn', "YOLOFFeat only support BN now." - self.feat_in = feat_in - self.feat_out = feat_out - self.num_cls_convs = num_cls_convs - self.num_reg_convs = num_reg_convs - self.norm_type = norm_type - - cls_subnet, reg_subnet = [], [] - for i in range(self.num_cls_convs): - feat_in = self.feat_in if i == 0 else self.feat_out - cls_subnet.append( - nn.Conv2D( - feat_in, - self.feat_out, - 3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0.0)))) - cls_subnet.append( - nn.BatchNorm2D( - self.feat_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - cls_subnet.append(nn.ReLU()) - - for i in range(self.num_reg_convs): - feat_in = self.feat_in if i == 0 else self.feat_out - reg_subnet.append( - nn.Conv2D( - feat_in, - self.feat_out, - 3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0.0)))) - reg_subnet.append( - nn.BatchNorm2D( - self.feat_out, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - reg_subnet.append(nn.ReLU()) - - self.cls_subnet = nn.Sequential(*cls_subnet) - self.reg_subnet = nn.Sequential(*reg_subnet) - - def forward(self, fpn_feat): - cls_feat = self.cls_subnet(fpn_feat) - reg_feat = self.reg_subnet(fpn_feat) - return cls_feat, reg_feat - - -@register -class YOLOFHead(nn.Layer): - __shared__ = ['num_classes', 'trt', 'exclude_nms'] - __inject__ = [ - 'conv_feat', 'anchor_generator', 'bbox_assigner', 'loss_class', - 'loss_bbox', 'nms' - ] - - def __init__(self, - num_classes=80, - conv_feat='YOLOFFeat', - anchor_generator='AnchorGenerator', - bbox_assigner='UniformAssigner', - loss_class='FocalLoss', - loss_bbox='GIoULoss', - 
ctr_clip=32.0, - delta_mean=[0.0, 0.0, 0.0, 0.0], - delta_std=[1.0, 1.0, 1.0, 1.0], - nms='MultiClassNMS', - prior_prob=0.01, - nms_pre=1000, - use_inside_anchor=False, - trt=False, - exclude_nms=False): - super(YOLOFHead, self).__init__() - self.num_classes = num_classes - self.conv_feat = conv_feat - self.anchor_generator = anchor_generator - self.na = self.anchor_generator.num_anchors - self.bbox_assigner = bbox_assigner - self.loss_class = loss_class - self.loss_bbox = loss_bbox - self.ctr_clip = ctr_clip - self.delta_mean = delta_mean - self.delta_std = delta_std - self.nms = nms - self.nms_pre = nms_pre - self.use_inside_anchor = use_inside_anchor - if isinstance(self.nms, MultiClassNMS) and trt: - self.nms.trt = trt - self.exclude_nms = exclude_nms - - bias_init_value = -math.log((1 - prior_prob) / prior_prob) - self.cls_score = self.add_sublayer( - 'cls_score', - nn.Conv2D( - in_channels=conv_feat.feat_out, - out_channels=self.num_classes * self.na, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant( - value=bias_init_value)))) - - self.bbox_pred = self.add_sublayer( - 'bbox_pred', - nn.Conv2D( - in_channels=conv_feat.feat_out, - out_channels=4 * self.na, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - self.object_pred = self.add_sublayer( - 'object_pred', - nn.Conv2D( - in_channels=conv_feat.feat_out, - out_channels=self.na, - kernel_size=3, - stride=1, - padding=1, - weight_attr=ParamAttr(initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(value=0)))) - - def forward(self, feats, targets=None): - assert len(feats) == 1, "YOLOF only has one level feature." 
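The assert reflects YOLOF's single-level design: one feature map from the dilated encoder replaces an FPN. The forward body that follows fuses the per-anchor classification logits with an objectness logit in log space, so that `sigmoid(fused)` equals `sigmoid(cls) * sigmoid(obj)` without either `exp()` overflowing. A small sketch with made-up logits (not repository code):

```python
import paddle
import paddle.nn.functional as F

INF = 1e8
cls_logit = paddle.to_tensor([2.0, -1.0, 0.5])  # toy values, assumed
obj_logit = paddle.to_tensor([1.0, 3.0, -0.5])

fused = cls_logit + obj_logit - paddle.log(
    1.0 + paddle.clip(cls_logit.exp(), max=INF)
    + paddle.clip(obj_logit.exp(), max=INF))

# Both lines print the same probabilities:
print(F.sigmoid(fused))
print(F.sigmoid(cls_logit) * F.sigmoid(obj_logit))
```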
- conv_cls_feat, conv_reg_feat = self.conv_feat(feats[0]) - cls_logits = self.cls_score(conv_cls_feat) - objectness = self.object_pred(conv_reg_feat) - bboxes_reg = self.bbox_pred(conv_reg_feat) - - N, C, H, W = paddle.shape(cls_logits)[:] - cls_logits = cls_logits.reshape((N, self.na, self.num_classes, H, W)) - objectness = objectness.reshape((N, self.na, 1, H, W)) - norm_cls_logits = cls_logits + objectness - paddle.log( - 1.0 + paddle.clip( - cls_logits.exp(), max=INF) + paddle.clip( - objectness.exp(), max=INF)) - norm_cls_logits = norm_cls_logits.reshape((N, C, H, W)) - - anchors = self.anchor_generator([norm_cls_logits]) - - if self.training: - yolof_losses = self.get_loss( - [anchors[0], norm_cls_logits, bboxes_reg], targets) - return yolof_losses - else: - return anchors[0], norm_cls_logits, bboxes_reg - - def get_loss(self, head_outs, targets): - anchors, cls_logits, bbox_preds = head_outs - - feat_size = cls_logits.shape[-2:] - cls_logits = cls_logits.transpose([0, 2, 3, 1]) - cls_logits = cls_logits.reshape([0, -1, self.num_classes]) - bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) - bbox_preds = bbox_preds.reshape([0, -1, 4]) - - num_pos_list = [] - cls_pred_list, cls_tar_list = [], [] - reg_pred_list, reg_tar_list = [], [] - # find and gather preds and targets in each image - for cls_logit, bbox_pred, gt_bbox, gt_class, im_shape in zip( - cls_logits, bbox_preds, targets['gt_bbox'], targets['gt_class'], - targets['im_shape']): - if self.use_inside_anchor: - inside_mask = find_inside_anchor( - feat_size, self.anchor_generator.strides[0], self.na, - im_shape.tolist()) - cls_logit = cls_logit[inside_mask] - bbox_pred = bbox_pred[inside_mask] - anchors = anchors[inside_mask] - - bbox_pred = delta2bbox_v2( - bbox_pred, - anchors, - self.delta_mean, - self.delta_std, - ctr_clip=self.ctr_clip) - bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) - - # -2:ignore, -1:neg, >=0:pos - match_labels, pos_bbox_pred, pos_bbox_tar = self.bbox_assigner( - bbox_pred, anchors, gt_bbox) - pos_mask = (match_labels >= 0) - neg_mask = (match_labels == -1) - chosen_mask = paddle.logical_or(pos_mask, neg_mask) - - gt_class = gt_class.reshape([-1]) - bg_class = paddle.to_tensor( - [self.num_classes], dtype=gt_class.dtype) - # a trick to assign num_classes to negative targets - gt_class = paddle.concat([gt_class, bg_class], axis=-1) - match_labels = paddle.where( - neg_mask, - paddle.full_like(match_labels, gt_class.size - 1), match_labels) - num_pos_list.append(max(1.0, pos_mask.sum().item())) - - cls_pred_list.append(cls_logit[chosen_mask]) - cls_tar_list.append(gt_class[match_labels[chosen_mask]]) - reg_pred_list.append(pos_bbox_pred) - reg_tar_list.append(pos_bbox_tar) - - num_tot_pos = paddle.to_tensor(sum(num_pos_list)) - num_tot_pos = reduce_mean(num_tot_pos).item() - num_tot_pos = max(1.0, num_tot_pos) - - cls_pred = paddle.concat(cls_pred_list) - cls_tar = paddle.concat(cls_tar_list) - cls_loss = self.loss_class( - cls_pred, cls_tar, reduction='sum') / num_tot_pos - - reg_pred_list = [_ for _ in reg_pred_list if _ is not None] - reg_tar_list = [_ for _ in reg_tar_list if _ is not None] - if len(reg_pred_list) == 0: - reg_loss = bbox_preds.sum() * 0.0 - else: - reg_pred = paddle.concat(reg_pred_list) - reg_tar = paddle.concat(reg_tar_list) - reg_loss = self.loss_bbox(reg_pred, reg_tar).sum() / num_tot_pos - - yolof_losses = { - 'loss': cls_loss + reg_loss, - 'loss_cls': cls_loss, - 'loss_reg': reg_loss, - } - return yolof_losses - - def get_bboxes_single(self, - anchors, - cls_scores, - 
bbox_preds, - im_shape, - scale_factor, - rescale=True): - assert len(cls_scores) == len(bbox_preds) - mlvl_bboxes = [] - mlvl_scores = [] - for anchor, cls_score, bbox_pred in zip(anchors, cls_scores, - bbox_preds): - cls_score = cls_score.reshape([-1, self.num_classes]) - bbox_pred = bbox_pred.reshape([-1, 4]) - if self.nms_pre is not None and cls_score.shape[0] > self.nms_pre: - max_score = cls_score.max(axis=1) - _, topk_inds = max_score.topk(self.nms_pre) - bbox_pred = bbox_pred.gather(topk_inds) - anchor = anchor.gather(topk_inds) - cls_score = cls_score.gather(topk_inds) - - bbox_pred = delta2bbox_v2( - bbox_pred, - anchor, - self.delta_mean, - self.delta_std, - max_shape=im_shape, - ctr_clip=self.ctr_clip).squeeze() - mlvl_bboxes.append(bbox_pred) - mlvl_scores.append(F.sigmoid(cls_score)) - mlvl_bboxes = paddle.concat(mlvl_bboxes) - mlvl_bboxes = paddle.squeeze(mlvl_bboxes) - if rescale: - mlvl_bboxes = mlvl_bboxes / paddle.concat( - [scale_factor[::-1], scale_factor[::-1]]) - mlvl_scores = paddle.concat(mlvl_scores) - mlvl_scores = mlvl_scores.transpose([1, 0]) - return mlvl_bboxes, mlvl_scores - - def decode(self, anchors, cls_scores, bbox_preds, im_shape, scale_factor): - batch_bboxes = [] - batch_scores = [] - for img_id in range(cls_scores[0].shape[0]): - num_lvls = len(cls_scores) - cls_score_list = [cls_scores[i][img_id] for i in range(num_lvls)] - bbox_pred_list = [bbox_preds[i][img_id] for i in range(num_lvls)] - bboxes, scores = self.get_bboxes_single( - anchors, cls_score_list, bbox_pred_list, im_shape[img_id], - scale_factor[img_id]) - batch_bboxes.append(bboxes) - batch_scores.append(scores) - batch_bboxes = paddle.stack(batch_bboxes, 0) - batch_scores = paddle.stack(batch_scores, 0) - return batch_bboxes, batch_scores - - def post_process(self, head_outs, im_shape, scale_factor): - anchors, cls_scores, bbox_preds = head_outs - cls_scores = cls_scores.transpose([0, 2, 3, 1]) - bbox_preds = bbox_preds.transpose([0, 2, 3, 1]) - pred_bboxes, pred_scores = self.decode( - [anchors], [cls_scores], [bbox_preds], im_shape, scale_factor) - - if self.exclude_nms: - # `exclude_nms=True` just use in benchmark - return pred_bboxes.sum(), pred_scores.sum() - else: - bbox_pred, bbox_num, _ = self.nms(pred_bboxes, pred_scores) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/initializer.py b/pdfdet/models/Paddle/ppdet/modeling/initializer.py deleted file mode 100644 index 308c51b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/initializer.py +++ /dev/null @@ -1,325 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py -Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
-""" - -import math -import numpy as np - -import paddle -import paddle.nn as nn - -__all__ = [ - 'uniform_', - 'normal_', - 'constant_', - 'ones_', - 'zeros_', - 'xavier_uniform_', - 'xavier_normal_', - 'kaiming_uniform_', - 'kaiming_normal_', - 'linear_init_', - 'conv_init_', - 'reset_initialized_parameter', -] - - -def _no_grad_uniform_(tensor, a, b): - with paddle.no_grad(): - tensor.set_value( - paddle.uniform( - shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) - return tensor - - -def _no_grad_normal_(tensor, mean=0., std=1.): - with paddle.no_grad(): - tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) - return tensor - - -def _no_grad_fill_(tensor, value=0.): - with paddle.no_grad(): - tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) - return tensor - - -def uniform_(tensor, a, b): - """ - Modified tensor inspace using uniform_ - Args: - tensor (paddle.Tensor): paddle Tensor - a (float|int): min value. - b (float|int): max value. - Return: - tensor - """ - return _no_grad_uniform_(tensor, a, b) - - -def normal_(tensor, mean=0., std=1.): - """ - Modified tensor inspace using normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mean (float|int): mean value. - std (float|int): std value. - Return: - tensor - """ - return _no_grad_normal_(tensor, mean, std) - - -def constant_(tensor, value=0.): - """ - Modified tensor inspace using constant_ - Args: - tensor (paddle.Tensor): paddle Tensor - value (float|int): value to fill tensor. - Return: - tensor - """ - return _no_grad_fill_(tensor, value) - - -def ones_(tensor): - """ - Modified tensor inspace using ones_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 1) - - -def zeros_(tensor): - """ - Modified tensor inspace using zeros_ - Args: - tensor (paddle.Tensor): paddle Tensor - Return: - tensor - """ - return _no_grad_fill_(tensor, 0) - - -def vector_(tensor, vector): - with paddle.no_grad(): - tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) - return tensor - - -def _calculate_fan_in_and_fan_out(tensor, reverse=False): - """ - Calculate (fan_in, _fan_out) for tensor - - Args: - tensor (Tensor): paddle.Tensor - reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True - - Return: - Tuple[fan_in, fan_out] - """ - if tensor.ndim < 2: - raise ValueError( - "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" - ) - - if reverse: - num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] - else: - num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] - - receptive_field_size = 1 - if tensor.ndim > 2: - receptive_field_size = np.prod(tensor.shape[2:]) - - fan_in = num_input_fmaps * receptive_field_size - fan_out = num_output_fmaps * receptive_field_size - - return fan_in, fan_out - - -def xavier_uniform_(tensor, gain=1., reverse=False): - """ - Modified tensor inspace using xavier_uniform_ - Args: - tensor (paddle.Tensor): paddle Tensor - gain (float): super parameter, 1. default. - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
- Return: - tensor - """ - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - k = math.sqrt(3.0) * std - return _no_grad_uniform_(tensor, -k, k) - - -def xavier_normal_(tensor, gain=1., reverse=False): - """ - Modified tensor inspace using xavier_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - gain (float): super parameter, 1. default. - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) - std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) - return _no_grad_normal_(tensor, 0, std) - - -# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html -def _calculate_correct_fan(tensor, mode, reverse=False): - mode = mode.lower() - valid_modes = ['fan_in', 'fan_out'] - if mode not in valid_modes: - raise ValueError("Mode {} not supported, please use one of {}".format( - mode, valid_modes)) - - fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) - - return fan_in if mode == 'fan_in' else fan_out - - -def _calculate_gain(nonlinearity, param=None): - linear_fns = [ - 'linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', - 'conv_transpose2d', 'conv_transpose3d' - ] - if nonlinearity in linear_fns or nonlinearity == 'sigmoid': - return 1 - elif nonlinearity == 'tanh': - return 5.0 / 3 - elif nonlinearity == 'relu': - return math.sqrt(2.0) - elif nonlinearity == 'leaky_relu': - if param is None: - negative_slope = 0.01 - elif not isinstance(param, bool) and isinstance( - param, int) or isinstance(param, float): - # True/False are instances of int, hence check above - negative_slope = param - else: - raise ValueError("negative_slope {} not a valid number".format( - param)) - return math.sqrt(2.0 / (1 + negative_slope**2)) - elif nonlinearity == 'selu': - return 3.0 / 4 - else: - raise ValueError("Unsupported nonlinearity {}".format(nonlinearity)) - - -def kaiming_uniform_(tensor, - a=0, - mode='fan_in', - nonlinearity='leaky_relu', - reverse=False): - """ - Modified tensor inspace using kaiming_uniform method - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. - Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - k = math.sqrt(3.0) * std - return _no_grad_uniform_(tensor, -k, k) - - -def kaiming_normal_(tensor, - a=0, - mode='fan_in', - nonlinearity='leaky_relu', - reverse=False): - """ - Modified tensor inspace using kaiming_normal_ - Args: - tensor (paddle.Tensor): paddle Tensor - mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut - nonlinearity (str): nonlinearity method name - reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
- Return: - tensor - """ - fan = _calculate_correct_fan(tensor, mode, reverse) - gain = _calculate_gain(nonlinearity, a) - std = gain / math.sqrt(fan) - return _no_grad_normal_(tensor, 0, std) - - -def linear_init_(module): - bound = 1 / math.sqrt(module.weight.shape[0]) - uniform_(module.weight, -bound, bound) - if hasattr(module, "bias") and module.bias is not None: - uniform_(module.bias, -bound, bound) - - -def conv_init_(module): - bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) - uniform_(module.weight, -bound, bound) - if module.bias is not None: - uniform_(module.bias, -bound, bound) - - -def bias_init_with_prob(prior_prob=0.01): - """initialize conv/fc bias value according to a given probability value.""" - bias_init = float(-np.log((1 - prior_prob) / prior_prob)) - return bias_init - - -@paddle.no_grad() -def reset_initialized_parameter(model, include_self=True): - """ - Reset initialized parameter using following method for [conv, linear, embedding, bn] - - Args: - model (paddle.Layer): paddle Layer - include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself - Return: - None - """ - for _, m in model.named_sublayers(include_self=include_self): - if isinstance(m, nn.Conv2D): - k = float(m._groups) / (m._in_channels * m._kernel_size[0] * - m._kernel_size[1]) - k = math.sqrt(k) - _no_grad_uniform_(m.weight, -k, k) - if hasattr(m, 'bias') and getattr(m, 'bias') is not None: - _no_grad_uniform_(m.bias, -k, k) - - elif isinstance(m, nn.Linear): - k = math.sqrt(1. / m.weight.shape[0]) - _no_grad_uniform_(m.weight, -k, k) - if hasattr(m, 'bias') and getattr(m, 'bias') is not None: - _no_grad_uniform_(m.bias, -k, k) - - elif isinstance(m, nn.Embedding): - _no_grad_normal_(m.weight, mean=0., std=1.) - - elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): - _no_grad_fill_(m.weight, 1.) - if hasattr(m, 'bias') and getattr(m, 'bias') is not None: - _no_grad_fill_(m.bias, 0) diff --git a/pdfdet/models/Paddle/ppdet/modeling/keypoint_utils.py b/pdfdet/models/Paddle/ppdet/modeling/keypoint_utils.py deleted file mode 100644 index 382e373..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/keypoint_utils.py +++ /dev/null @@ -1,551 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is based on https://github.com/open-mmlab/mmpose -""" - -import cv2 -import numpy as np -import paddle.nn.functional as F - - -def get_affine_mat_kernel(h, w, s, inv=False): - if w < h: - w_ = s - h_ = int(np.ceil((s / w * h) / 64.) * 64) - scale_w = w - scale_h = h_ / w_ * w - - else: - h_ = s - w_ = int(np.ceil((s / h * w) / 64.) 
* 64) - scale_h = h - scale_w = w_ / h_ * h - - center = np.array([np.round(w / 2.), np.round(h / 2.)]) - - size_resized = (w_, h_) - trans = get_affine_transform( - center, np.array([scale_w, scale_h]), 0, size_resized, inv=inv) - - return trans, size_resized - - -def get_affine_transform(center, - input_size, - rot, - output_size, - shift=(0., 0.), - inv=False): - """Get the affine transform matrix, given the center/scale/rot/output_size. - - Args: - center (np.ndarray[2, ]): Center of the bounding box (x, y). - input_size (np.ndarray[2, ]): Size of input feature (width, height). - rot (float): Rotation angle (degree). - output_size (np.ndarray[2, ]): Size of the destination heatmaps. - shift (0-100%): Shift translation ratio wrt the width/height. - Default (0., 0.). - inv (bool): Option to inverse the affine transform direction. - (inv=False: src->dst or inv=True: dst->src) - - Returns: - np.ndarray: The transform matrix. - """ - assert len(center) == 2 - assert len(output_size) == 2 - assert len(shift) == 2 - - if not isinstance(input_size, (np.ndarray, list)): - input_size = np.array([input_size, input_size], dtype=np.float32) - scale_tmp = input_size - - shift = np.array(shift) - src_w = scale_tmp[0] - dst_w = output_size[0] - dst_h = output_size[1] - - rot_rad = np.pi * rot / 180 - src_dir = rotate_point([0., src_w * -0.5], rot_rad) - dst_dir = np.array([0., dst_w * -0.5]) - - src = np.zeros((3, 2), dtype=np.float32) - - src[0, :] = center + scale_tmp * shift - src[1, :] = center + src_dir + scale_tmp * shift - src[2, :] = _get_3rd_point(src[0, :], src[1, :]) - - dst = np.zeros((3, 2), dtype=np.float32) - dst[0, :] = [dst_w * 0.5, dst_h * 0.5] - dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir - dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :]) - - if inv: - trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) - else: - trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) - - return trans - - -def get_warp_matrix(theta, size_input, size_dst, size_target): - """This code is based on - https://github.com/open-mmlab/mmpose/blob/master/mmpose/core/post_processing/post_transforms.py - - Calculate the transformation matrix under the constraint of unbiased. - Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased - Data Processing for Human Pose Estimation (CVPR 2020). - - Args: - theta (float): Rotation angle in degrees. - size_input (np.ndarray): Size of input image [w, h]. - size_dst (np.ndarray): Size of output image [w, h]. - size_target (np.ndarray): Size of ROI in input plane [w, h]. - - Returns: - matrix (np.ndarray): A matrix for transformation. - """ - theta = np.deg2rad(theta) - matrix = np.zeros((2, 3), dtype=np.float32) - scale_x = size_dst[0] / size_target[0] - scale_y = size_dst[1] / size_target[1] - matrix[0, 0] = np.cos(theta) * scale_x - matrix[0, 1] = -np.sin(theta) * scale_x - matrix[0, 2] = scale_x * ( - -0.5 * size_input[0] * np.cos(theta) + 0.5 * size_input[1] * - np.sin(theta) + 0.5 * size_target[0]) - matrix[1, 0] = np.sin(theta) * scale_y - matrix[1, 1] = np.cos(theta) * scale_y - matrix[1, 2] = scale_y * ( - -0.5 * size_input[0] * np.sin(theta) - 0.5 * size_input[1] * - np.cos(theta) + 0.5 * size_target[1]) - return matrix - - -def _get_3rd_point(a, b): - """To calculate the affine matrix, three pairs of points are required. This - function is used to get the 3rd point, given 2D points a & b. - - The 3rd point is defined by rotating vector `a - b` by 90 degrees - anticlockwise, using b as the rotation center. 
- - Args: - a (np.ndarray): point(x,y) - b (np.ndarray): point(x,y) - - Returns: - np.ndarray: The 3rd point. - """ - assert len( - a) == 2, 'input of _get_3rd_point should be point with length of 2' - assert len( - b) == 2, 'input of _get_3rd_point should be point with length of 2' - direction = a - b - third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32) - - return third_pt - - -def rotate_point(pt, angle_rad): - """Rotate a point by an angle. - - Args: - pt (list[float]): 2 dimensional point to be rotated - angle_rad (float): rotation angle by radian - - Returns: - list[float]: Rotated point. - """ - assert len(pt) == 2 - sn, cs = np.sin(angle_rad), np.cos(angle_rad) - new_x = pt[0] * cs - pt[1] * sn - new_y = pt[0] * sn + pt[1] * cs - rotated_pt = [new_x, new_y] - - return rotated_pt - - -def transpred(kpts, h, w, s): - trans, _ = get_affine_mat_kernel(h, w, s, inv=True) - - return warp_affine_joints(kpts[..., :2].copy(), trans) - - -def warp_affine_joints(joints, mat): - """Apply affine transformation defined by the transform matrix on the - joints. - - Args: - joints (np.ndarray[..., 2]): Origin coordinate of joints. - mat (np.ndarray[3, 2]): The affine matrix. - - Returns: - matrix (np.ndarray[..., 2]): Result coordinate of joints. - """ - joints = np.array(joints) - shape = joints.shape - joints = joints.reshape(-1, 2) - return np.dot(np.concatenate( - (joints, joints[:, 0:1] * 0 + 1), axis=1), - mat.T).reshape(shape) - - -def affine_transform(pt, t): - new_pt = np.array([pt[0], pt[1], 1.]).T - new_pt = np.dot(t, new_pt) - return new_pt[:2] - - -def transform_preds(coords, center, scale, output_size): - target_coords = np.zeros(coords.shape) - trans = get_affine_transform(center, scale * 200, 0, output_size, inv=1) - for p in range(coords.shape[0]): - target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) - return target_coords - - -def oks_iou(g, d, a_g, a_d, sigmas=None, in_vis_thre=None): - if not isinstance(sigmas, np.ndarray): - sigmas = np.array([ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, - .87, .87, .89, .89 - ]) / 10.0 - vars = (sigmas * 2)**2 - xg = g[0::3] - yg = g[1::3] - vg = g[2::3] - ious = np.zeros((d.shape[0])) - for n_d in range(0, d.shape[0]): - xd = d[n_d, 0::3] - yd = d[n_d, 1::3] - vd = d[n_d, 2::3] - dx = xd - xg - dy = yd - yg - e = (dx**2 + dy**2) / vars / ((a_g + a_d[n_d]) / 2 + np.spacing(1)) / 2 - if in_vis_thre is not None: - ind = list(vg > in_vis_thre) and list(vd > in_vis_thre) - e = e[ind] - ious[n_d] = np.sum(np.exp(-e)) / e.shape[0] if e.shape[0] != 0 else 0.0 - return ious - - -def oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None): - """greedily select boxes with high confidence and overlap with current maximum <= thresh - rule out overlap >= thresh - - Args: - kpts_db (list): The predicted keypoints within the image - thresh (float): The threshold to select the boxes - sigmas (np.array): The variance to calculate the oks iou - Default: None - in_vis_thre (float): The threshold to select the high confidence boxes - Default: None - - Return: - keep (list): indexes to keep - """ - - if len(kpts_db) == 0: - return [] - - scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))]) - kpts = np.array( - [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))]) - areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))]) - - order = scores.argsort()[::-1] - - keep = [] - while order.size > 0: - i = order[0] - keep.append(i) - - oks_ovr = oks_iou(kpts[i], 
kpts[order[1:]], areas[i], areas[order[1:]],
-                          sigmas, in_vis_thre)
-
-        inds = np.where(oks_ovr <= thresh)[0]
-        order = order[inds + 1]
-
-    return keep
-
-
-def rescore(overlap, scores, thresh, type='gaussian'):
-    assert overlap.shape[0] == scores.shape[0]
-    if type == 'linear':
-        inds = np.where(overlap >= thresh)[0]
-        scores[inds] = scores[inds] * (1 - overlap[inds])
-    else:
-        scores = scores * np.exp(-overlap**2 / thresh)
-
-    return scores
-
-
-def soft_oks_nms(kpts_db, thresh, sigmas=None, in_vis_thre=None):
-    """Greedily select keypoint instances with high confidence whose OKS
-    overlap with the current maximum is <= thresh; rule out overlap >= thresh.
-
-    Args:
-        kpts_db (list): The predicted keypoints within the image
-        thresh (float): The threshold to select the boxes
-        sigmas (np.array): The variance to calculate the oks iou
-            Default: None
-        in_vis_thre (float): The threshold to select the high confidence boxes
-            Default: None
-
-    Return:
-        keep (list): indexes to keep
-    """
-
-    if len(kpts_db) == 0:
-        return []
-
-    scores = np.array([kpts_db[i]['score'] for i in range(len(kpts_db))])
-    kpts = np.array(
-        [kpts_db[i]['keypoints'].flatten() for i in range(len(kpts_db))])
-    areas = np.array([kpts_db[i]['area'] for i in range(len(kpts_db))])
-
-    order = scores.argsort()[::-1]
-    scores = scores[order]
-
-    # max_dets = order.size
-    max_dets = 20
-    keep = np.zeros(max_dets, dtype=np.intp)
-    keep_cnt = 0
-    while order.size > 0 and keep_cnt < max_dets:
-        i = order[0]
-
-        oks_ovr = oks_iou(kpts[i], kpts[order[1:]], areas[i], areas[order[1:]],
-                          sigmas, in_vis_thre)
-
-        order = order[1:]
-        scores = rescore(oks_ovr, scores[1:], thresh)
-
-        tmp = scores.argsort()[::-1]
-        order = order[tmp]
-        scores = scores[tmp]
-
-        keep[keep_cnt] = i
-        keep_cnt += 1
-
-    keep = keep[:keep_cnt]
-
-    return keep
-
-
-def resize(input,
-           size=None,
-           scale_factor=None,
-           mode='nearest',
-           align_corners=None,
-           warning=True):
-    if warning:
-        if size is not None and align_corners:
-            import warnings  # local import: the module header does not import it
-            input_h, input_w = tuple(int(x) for x in input.shape[2:])
-            output_h, output_w = tuple(int(x) for x in size)
-            if output_h > input_h or output_w > input_w:
-                if ((output_h > 1 and output_w > 1 and input_h > 1 and
-                     input_w > 1) and (output_h - 1) % (input_h - 1) and
-                        (output_w - 1) % (input_w - 1)):
-                    warnings.warn(
-                        f'When align_corners={align_corners}, '
-                        'the output would be more aligned if '
-                        f'input size {(input_h, input_w)} is `x+1` and '
-                        f'out size {(output_h, output_w)} is `nx+1`')
-
-    return F.interpolate(input, size, scale_factor, mode, align_corners)
-
-
-def flip_back(output_flipped, flip_pairs, target_type='GaussianHeatmap'):
-    """Flip the flipped heatmaps back to the original form.
-    Note:
-        - batch_size: N
-        - num_keypoints: K
-        - heatmap height: H
-        - heatmap width: W
-    Args:
-        output_flipped (np.ndarray[N, K, H, W]): The output heatmaps obtained
-            from the flipped images.
-        flip_pairs (list[tuple]): Pairs of keypoints which are mirrored
-            (for example, left ear -- right ear).
-        target_type (str): GaussianHeatmap or CombinedTarget
-    Returns:
-        np.ndarray: heatmaps that flipped back to the original image
-    """
-    assert len(output_flipped.shape) == 4, \
-        'output_flipped should be [batch_size, num_keypoints, height, width]'
-    shape_ori = output_flipped.shape
-    channels = 1
-    if target_type.lower() == 'CombinedTarget'.lower():
-        channels = 3
-        output_flipped[:, 1::3, ...] = -output_flipped[:, 1::3, ...]
- output_flipped = output_flipped.reshape((shape_ori[0], -1, channels, - shape_ori[2], shape_ori[3])) - output_flipped_back = output_flipped.clone() - - # Swap left-right parts - for left, right in flip_pairs: - output_flipped_back[:, left, ...] = output_flipped[:, right, ...] - output_flipped_back[:, right, ...] = output_flipped[:, left, ...] - output_flipped_back = output_flipped_back.reshape(shape_ori) - # Flip horizontally - output_flipped_back = output_flipped_back[..., ::-1] - return output_flipped_back - - -def _calc_distances(preds, targets, mask, normalize): - """Calculate the normalized distances between preds and target. - - Note: - batch_size: N - num_keypoints: K - dimension of keypoints: D (normally, D=2 or D=3) - - Args: - preds (np.ndarray[N, K, D]): Predicted keypoint location. - targets (np.ndarray[N, K, D]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - normalize (np.ndarray[N, D]): Typical value is heatmap_size - - Returns: - np.ndarray[K, N]: The normalized distances. \ - If target keypoints are missing, the distance is -1. - """ - N, K, _ = preds.shape - # set mask=0 when normalize==0 - _mask = mask.copy() - _mask[np.where((normalize == 0).sum(1))[0], :] = False - distances = np.full((N, K), -1, dtype=np.float32) - # handle invalid values - normalize[np.where(normalize <= 0)] = 1e6 - distances[_mask] = np.linalg.norm( - ((preds - targets) / normalize[:, None, :])[_mask], axis=-1) - return distances.T - - -def _distance_acc(distances, thr=0.5): - """Return the percentage below the distance threshold, while ignoring - distances values with -1. - - Note: - batch_size: N - Args: - distances (np.ndarray[N, ]): The normalized distances. - thr (float): Threshold of the distances. - - Returns: - float: Percentage of distances below the threshold. \ - If all target keypoints are missing, return -1. - """ - distance_valid = distances != -1 - num_distance_valid = distance_valid.sum() - if num_distance_valid > 0: - return (distances[distance_valid] < thr).sum() / num_distance_valid - return -1 - - -def keypoint_pck_accuracy(pred, gt, mask, thr, normalize): - """Calculate the pose accuracy of PCK for each individual keypoint and the - averaged accuracy across all keypoints for coordinates. - - Note: - PCK metric measures accuracy of the localization of the body joints. - The distances between predicted positions and the ground-truth ones - are typically normalized by the bounding box size. - The threshold (thr) of the normalized distance is commonly set - as 0.05, 0.1 or 0.2 etc. - - - batch_size: N - - num_keypoints: K - - Args: - pred (np.ndarray[N, K, 2]): Predicted keypoint location. - gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - thr (float): Threshold of PCK calculation. - normalize (np.ndarray[N, 2]): Normalization factor for H&W. - - Returns: - tuple: A tuple containing keypoint accuracy. - - - acc (np.ndarray[K]): Accuracy of each keypoint. - - avg_acc (float): Averaged accuracy across all keypoints. - - cnt (int): Number of valid keypoints. 
- """ - distances = _calc_distances(pred, gt, mask, normalize) - - acc = np.array([_distance_acc(d, thr) for d in distances]) - valid_acc = acc[acc >= 0] - cnt = len(valid_acc) - avg_acc = valid_acc.mean() if cnt > 0 else 0 - return acc, avg_acc, cnt - - -def keypoint_auc(pred, gt, mask, normalize, num_step=20): - """Calculate the pose accuracy of PCK for each individual keypoint and the - averaged accuracy across all keypoints for coordinates. - - Note: - - batch_size: N - - num_keypoints: K - - Args: - pred (np.ndarray[N, K, 2]): Predicted keypoint location. - gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - normalize (float): Normalization factor. - - Returns: - float: Area under curve. - """ - nor = np.tile(np.array([[normalize, normalize]]), (pred.shape[0], 1)) - x = [1.0 * i / num_step for i in range(num_step)] - y = [] - for thr in x: - _, avg_acc, _ = keypoint_pck_accuracy(pred, gt, mask, thr, nor) - y.append(avg_acc) - - auc = 0 - for i in range(num_step): - auc += 1.0 / num_step * y[i] - return auc - - -def keypoint_epe(pred, gt, mask): - """Calculate the end-point error. - - Note: - - batch_size: N - - num_keypoints: K - - Args: - pred (np.ndarray[N, K, 2]): Predicted keypoint location. - gt (np.ndarray[N, K, 2]): Groundtruth keypoint location. - mask (np.ndarray[N, K]): Visibility of the target. False for invisible - joints, and True for visible. Invisible joints will be ignored for - accuracy calculation. - - Returns: - float: Average end-point error. - """ - - normalize = np.ones((pred.shape[0], pred.shape[2]), dtype=np.float32) - distances = _calc_distances(pred, gt, mask, normalize) - distance_valid = distances[distances != -1] - return distance_valid.sum() / max(1, len(distance_valid)) diff --git a/pdfdet/models/Paddle/ppdet/modeling/lane_utils.py b/pdfdet/models/Paddle/ppdet/modeling/lane_utils.py deleted file mode 100644 index e3fb45c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/lane_utils.py +++ /dev/null @@ -1,111 +0,0 @@ -import os -import cv2 -import numpy as np -from scipy.interpolate import InterpolatedUnivariateSpline - - -class Lane: - def __init__(self, points=None, invalid_value=-2., metadata=None): - super(Lane, self).__init__() - self.curr_iter = 0 - self.points = points - self.invalid_value = invalid_value - self.function = InterpolatedUnivariateSpline( - points[:, 1], points[:, 0], k=min(3, len(points) - 1)) - self.min_y = points[:, 1].min() - 0.01 - self.max_y = points[:, 1].max() + 0.01 - self.metadata = metadata or {} - - def __repr__(self): - return '[Lane]\n' + str(self.points) + '\n[/Lane]' - - def __call__(self, lane_ys): - lane_xs = self.function(lane_ys) - - lane_xs[(lane_ys < self.min_y) | (lane_ys > self.max_y - )] = self.invalid_value - return lane_xs - - def to_array(self, sample_y_range, img_w, img_h): - self.sample_y = range(sample_y_range[0], sample_y_range[1], - sample_y_range[2]) - sample_y = self.sample_y - img_w, img_h = img_w, img_h - ys = np.array(sample_y) / float(img_h) - xs = self(ys) - valid_mask = (xs >= 0) & (xs < 1) - lane_xs = xs[valid_mask] * img_w - lane_ys = ys[valid_mask] * img_h - lane = np.concatenate( - (lane_xs.reshape(-1, 1), lane_ys.reshape(-1, 1)), axis=1) - return lane - - def __iter__(self): - return self - - def __next__(self): - if self.curr_iter < len(self.points): - self.curr_iter += 1 - return self.points[self.curr_iter - 1] - 
self.curr_iter = 0 - raise StopIteration - - -COLORS = [ - (255, 0, 0), - (0, 255, 0), - (0, 0, 255), - (255, 255, 0), - (255, 0, 255), - (0, 255, 255), - (128, 255, 0), - (255, 128, 0), - (128, 0, 255), - (255, 0, 128), - (0, 128, 255), - (0, 255, 128), - (128, 255, 255), - (255, 128, 255), - (255, 255, 128), - (60, 180, 0), - (180, 60, 0), - (0, 60, 180), - (0, 180, 60), - (60, 0, 180), - (180, 0, 60), - (255, 0, 0), - (0, 255, 0), - (0, 0, 255), - (255, 255, 0), - (255, 0, 255), - (0, 255, 255), - (128, 255, 0), - (255, 128, 0), - (128, 0, 255), -] - - -def imshow_lanes(img, lanes, show=False, out_file=None, width=4): - lanes_xys = [] - for _, lane in enumerate(lanes): - xys = [] - for x, y in lane: - if x <= 0 or y <= 0: - continue - x, y = int(x), int(y) - xys.append((x, y)) - lanes_xys.append(xys) - lanes_xys.sort(key=lambda xys: xys[0][0] if len(xys) > 0 else 0) - - for idx, xys in enumerate(lanes_xys): - for i in range(1, len(xys)): - cv2.line(img, xys[i - 1], xys[i], COLORS[idx], thickness=width) - - if show: - cv2.imshow('view', img) - cv2.waitKey(0) - - if out_file: - if not os.path.exists(os.path.dirname(out_file)): - os.makedirs(os.path.dirname(out_file)) - cv2.imwrite(out_file, img) diff --git a/pdfdet/models/Paddle/ppdet/modeling/layers.py b/pdfdet/models/Paddle/ppdet/modeling/layers.py deleted file mode 100644 index f91b840..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/layers.py +++ /dev/null @@ -1,1348 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import six -import numpy as np -from numbers import Integral - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle import to_tensor -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant, XavierUniform -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.bbox_utils import delta2bbox -from . 
import ops -from .initializer import xavier_uniform_, constant_ - -from paddle.vision.ops import DeformConv2D - - -def _to_list(l): - if isinstance(l, (list, tuple)): - return list(l) - return [l] - - -class AlignConv(nn.Layer): - def __init__(self, in_channels, out_channels, kernel_size=3, groups=1): - super(AlignConv, self).__init__() - self.kernel_size = kernel_size - self.align_conv = paddle.vision.ops.DeformConv2D( - in_channels, - out_channels, - kernel_size=self.kernel_size, - padding=(self.kernel_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr(initializer=Normal(0, 0.01)), - bias_attr=None) - - @paddle.no_grad() - def get_offset(self, anchors, featmap_size, stride): - """ - Args: - anchors: [B, L, 5] xc,yc,w,h,angle - featmap_size: (feat_h, feat_w) - stride: 8 - Returns: - - """ - batch = anchors.shape[0] - dtype = anchors.dtype - feat_h, feat_w = featmap_size - pad = (self.kernel_size - 1) // 2 - idx = paddle.arange(-pad, pad + 1, dtype=dtype) - - yy, xx = paddle.meshgrid(idx, idx) - xx = paddle.reshape(xx, [-1]) - yy = paddle.reshape(yy, [-1]) - - # get sampling locations of default conv - xc = paddle.arange(0, feat_w, dtype=dtype) - yc = paddle.arange(0, feat_h, dtype=dtype) - yc, xc = paddle.meshgrid(yc, xc) - - xc = paddle.reshape(xc, [-1, 1]) - yc = paddle.reshape(yc, [-1, 1]) - x_conv = xc + xx - y_conv = yc + yy - - # get sampling locations of anchors - x_ctr, y_ctr, w, h, a = paddle.split(anchors, 5, axis=-1) - x_ctr = x_ctr / stride - y_ctr = y_ctr / stride - w_s = w / stride - h_s = h / stride - cos, sin = paddle.cos(a), paddle.sin(a) - dw, dh = w_s / self.kernel_size, h_s / self.kernel_size - x, y = dw * xx, dh * yy - xr = cos * x - sin * y - yr = sin * x + cos * y - x_anchor, y_anchor = xr + x_ctr, yr + y_ctr - # get offset filed - offset_x = x_anchor - x_conv - offset_y = y_anchor - y_conv - offset = paddle.stack([offset_y, offset_x], axis=-1) - offset = offset.reshape( - [batch, feat_h, feat_w, self.kernel_size * self.kernel_size * 2]) - offset = offset.transpose([0, 3, 1, 2]) - - return offset - - def forward(self, x, refine_anchors, featmap_size, stride): - batch = paddle.shape(x)[0].numpy() - offset = self.get_offset(refine_anchors, featmap_size, stride) - if self.training: - x = F.relu(self.align_conv(x, offset.detach())) - else: - x = F.relu(self.align_conv(x, offset)) - return x - - -class DeformableConvV2(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - weight_attr=None, - bias_attr=None, - lr_scale=1, - regularizer=None, - skip_quant=False, - dcn_bias_regularizer=L2Decay(0.), - dcn_bias_lr_scale=2.): - super(DeformableConvV2, self).__init__() - self.offset_channel = 2 * kernel_size**2 - self.mask_channel = kernel_size**2 - - if lr_scale == 1 and regularizer is None: - offset_bias_attr = ParamAttr(initializer=Constant(0.)) - else: - offset_bias_attr = ParamAttr( - initializer=Constant(0.), - learning_rate=lr_scale, - regularizer=regularizer) - self.conv_offset = nn.Conv2D( - in_channels, - 3 * kernel_size**2, - kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2, - weight_attr=ParamAttr(initializer=Constant(0.0)), - bias_attr=offset_bias_attr) - if skip_quant: - self.conv_offset.skip_quant = True - - if bias_attr: - # in FCOS-DCN head, specifically need learning_rate and regularizer - dcn_bias_attr = ParamAttr( - initializer=Constant(value=0), - regularizer=dcn_bias_regularizer, - learning_rate=dcn_bias_lr_scale) - else: - # in ResNet backbone, do not need 
bias - dcn_bias_attr = False - self.conv_dcn = DeformConv2D( - in_channels, - out_channels, - kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2 * dilation, - dilation=dilation, - groups=groups, - weight_attr=weight_attr, - bias_attr=dcn_bias_attr) - - def forward(self, x): - offset_mask = self.conv_offset(x) - offset, mask = paddle.split( - offset_mask, - num_or_sections=[self.offset_channel, self.mask_channel], - axis=1) - mask = F.sigmoid(mask) - y = self.conv_dcn(x, offset, mask=mask) - return y - - -class ConvNormLayer(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - stride, - groups=1, - norm_type='bn', - norm_decay=0., - norm_groups=32, - use_dcn=False, - bias_on=False, - lr_scale=1., - freeze_norm=False, - initializer=Normal( - mean=0., std=0.01), - skip_quant=False, - dcn_lr_scale=2., - dcn_regularizer=L2Decay(0.)): - super(ConvNormLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn', None] - - if bias_on: - bias_attr = ParamAttr( - initializer=Constant(value=0.), learning_rate=lr_scale) - else: - bias_attr = False - - if not use_dcn: - self.conv = nn.Conv2D( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr( - initializer=initializer, learning_rate=1.), - bias_attr=bias_attr) - if skip_quant: - self.conv.skip_quant = True - else: - # in FCOS-DCN head, specifically need learning_rate and regularizer - self.conv = DeformableConvV2( - in_channels=ch_in, - out_channels=ch_out, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=groups, - weight_attr=ParamAttr( - initializer=initializer, learning_rate=1.), - bias_attr=True, - lr_scale=dcn_lr_scale, - regularizer=dcn_regularizer, - dcn_bias_regularizer=dcn_regularizer, - dcn_bias_lr_scale=dcn_lr_scale, - skip_quant=skip_quant) - - norm_lr = 0. if freeze_norm else 1. 
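-        # freeze_norm=True gives the norm weight/bias a learning rate of 0. below,
-        # so they are not updated during training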
- param_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay) if norm_decay is not None else None) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay) if norm_decay is not None else None) - if norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D( - ch_out, weight_attr=param_attr, bias_attr=bias_attr) - elif norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=norm_groups, - num_channels=ch_out, - weight_attr=param_attr, - bias_attr=bias_attr) - else: - self.norm = None - - def forward(self, inputs): - out = self.conv(inputs) - if self.norm is not None: - out = self.norm(out) - return out - - -class LiteConv(nn.Layer): - def __init__(self, - in_channels, - out_channels, - stride=1, - with_act=True, - norm_type='sync_bn', - name=None): - super(LiteConv, self).__init__() - self.lite_conv = nn.Sequential() - conv1 = ConvNormLayer( - in_channels, - in_channels, - filter_size=5, - stride=stride, - groups=in_channels, - norm_type=norm_type, - initializer=XavierUniform()) - conv2 = ConvNormLayer( - in_channels, - out_channels, - filter_size=1, - stride=stride, - norm_type=norm_type, - initializer=XavierUniform()) - conv3 = ConvNormLayer( - out_channels, - out_channels, - filter_size=1, - stride=stride, - norm_type=norm_type, - initializer=XavierUniform()) - conv4 = ConvNormLayer( - out_channels, - out_channels, - filter_size=5, - stride=stride, - groups=out_channels, - norm_type=norm_type, - initializer=XavierUniform()) - conv_list = [conv1, conv2, conv3, conv4] - self.lite_conv.add_sublayer('conv1', conv1) - self.lite_conv.add_sublayer('relu6_1', nn.ReLU6()) - self.lite_conv.add_sublayer('conv2', conv2) - if with_act: - self.lite_conv.add_sublayer('relu6_2', nn.ReLU6()) - self.lite_conv.add_sublayer('conv3', conv3) - self.lite_conv.add_sublayer('relu6_3', nn.ReLU6()) - self.lite_conv.add_sublayer('conv4', conv4) - if with_act: - self.lite_conv.add_sublayer('relu6_4', nn.ReLU6()) - - def forward(self, inputs): - out = self.lite_conv(inputs) - return out - - -class DropBlock(nn.Layer): - def __init__(self, block_size, keep_prob, name=None, data_format='NCHW'): - """ - DropBlock layer, see https://arxiv.org/abs/1810.12890 - - Args: - block_size (int): block size - keep_prob (int): keep probability - name (str): layer name - data_format (str): data format, NCHW or NHWC - """ - super(DropBlock, self).__init__() - self.block_size = block_size - self.keep_prob = keep_prob - self.name = name - self.data_format = data_format - - def forward(self, x): - if not self.training or self.keep_prob == 1: - return x - else: - gamma = (1. - self.keep_prob) / (self.block_size**2) - if self.data_format == 'NCHW': - shape = x.shape[2:] - else: - shape = x.shape[1:3] - for s in shape: - gamma *= s / (s - self.block_size + 1) - - matrix = paddle.cast(paddle.rand(x.shape) < gamma, x.dtype) - mask_inv = F.max_pool2d( - matrix, - self.block_size, - stride=1, - padding=self.block_size // 2, - data_format=self.data_format) - mask = 1. 
- mask_inv - mask = mask.astype('float32') - x = x.astype('float32') - y = x * mask * (mask.numel() / mask.sum()) - return y - - -@register -@serializable -class AnchorGeneratorSSD(object): - def __init__(self, - steps=[8, 16, 32, 64, 100, 300], - aspect_ratios=[[2.], [2., 3.], [2., 3.], [2., 3.], [2.], [2.]], - min_ratio=15, - max_ratio=90, - base_size=300, - min_sizes=[30.0, 60.0, 111.0, 162.0, 213.0, 264.0], - max_sizes=[60.0, 111.0, 162.0, 213.0, 264.0, 315.0], - offset=0.5, - flip=True, - clip=False, - min_max_aspect_ratios_order=False): - self.steps = steps - self.aspect_ratios = aspect_ratios - self.min_ratio = min_ratio - self.max_ratio = max_ratio - self.base_size = base_size - self.min_sizes = min_sizes - self.max_sizes = max_sizes - self.offset = offset - self.flip = flip - self.clip = clip - self.min_max_aspect_ratios_order = min_max_aspect_ratios_order - - if self.min_sizes == [] and self.max_sizes == []: - num_layer = len(aspect_ratios) - step = int( - math.floor(((self.max_ratio - self.min_ratio)) / (num_layer - 2 - ))) - for ratio in six.moves.range(self.min_ratio, self.max_ratio + 1, - step): - self.min_sizes.append(self.base_size * ratio / 100.) - self.max_sizes.append(self.base_size * (ratio + step) / 100.) - self.min_sizes = [self.base_size * .10] + self.min_sizes - self.max_sizes = [self.base_size * .20] + self.max_sizes - - self.num_priors = [] - for aspect_ratio, min_size, max_size in zip( - aspect_ratios, self.min_sizes, self.max_sizes): - if isinstance(min_size, (list, tuple)): - self.num_priors.append( - len(_to_list(min_size)) + len(_to_list(max_size))) - else: - self.num_priors.append((len(aspect_ratio) * 2 + 1) * len( - _to_list(min_size)) + len(_to_list(max_size))) - - def __call__(self, inputs, image): - boxes = [] - for input, min_size, max_size, aspect_ratio, step in zip( - inputs, self.min_sizes, self.max_sizes, self.aspect_ratios, - self.steps): - box, _ = ops.prior_box( - input=input, - image=image, - min_sizes=_to_list(min_size), - max_sizes=_to_list(max_size), - aspect_ratios=aspect_ratio, - flip=self.flip, - clip=self.clip, - steps=[step, step], - offset=self.offset, - min_max_aspect_ratios_order=self.min_max_aspect_ratios_order) - boxes.append(paddle.reshape(box, [-1, 4])) - return boxes - - -@register -@serializable -class RCNNBox(object): - __shared__ = ['num_classes', 'export_onnx'] - - def __init__(self, - prior_box_var=[10., 10., 5., 5.], - code_type="decode_center_size", - box_normalized=False, - num_classes=80, - export_onnx=False): - super(RCNNBox, self).__init__() - self.prior_box_var = prior_box_var - self.code_type = code_type - self.box_normalized = box_normalized - self.num_classes = num_classes - self.export_onnx = export_onnx - - def __call__(self, bbox_head_out, rois, im_shape, scale_factor): - bbox_pred = bbox_head_out[0] - cls_prob = bbox_head_out[1] - roi = rois[0] - rois_num = rois[1] - - if self.export_onnx: - onnx_rois_num_per_im = rois_num[0] - origin_shape = paddle.expand(im_shape[0, :], - [onnx_rois_num_per_im, 2]) - - else: - origin_shape_list = [] - if isinstance(roi, list): - batch_size = len(roi) - else: - batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) - - # bbox_pred.shape: [N, C*4] - for idx in range(batch_size): - rois_num_per_im = rois_num[idx] - expand_im_shape = paddle.expand(im_shape[idx, :], - [rois_num_per_im, 2]) - origin_shape_list.append(expand_im_shape) - - origin_shape = paddle.concat(origin_shape_list) - - # bbox_pred.shape: [N, C*4] - # C=num_classes in faster/mask rcnn(bbox_head), C=1 in 
cascade rcnn(cascade_head) - bbox = paddle.concat(roi) - bbox = delta2bbox(bbox_pred, bbox, self.prior_box_var) - scores = cls_prob[:, :-1] - - # bbox.shape: [N, C, 4] - # bbox.shape[1] must be equal to scores.shape[1] - total_num = bbox.shape[0] - bbox_dim = bbox.shape[-1] - bbox = paddle.expand(bbox, [total_num, self.num_classes, bbox_dim]) - - origin_h = paddle.unsqueeze(origin_shape[:, 0], axis=1) - origin_w = paddle.unsqueeze(origin_shape[:, 1], axis=1) - zeros = paddle.zeros_like(origin_h) - x1 = paddle.maximum(paddle.minimum(bbox[:, :, 0], origin_w), zeros) - y1 = paddle.maximum(paddle.minimum(bbox[:, :, 1], origin_h), zeros) - x2 = paddle.maximum(paddle.minimum(bbox[:, :, 2], origin_w), zeros) - y2 = paddle.maximum(paddle.minimum(bbox[:, :, 3], origin_h), zeros) - bbox = paddle.stack([x1, y1, x2, y2], axis=-1) - bboxes = (bbox, rois_num) - return bboxes, scores - - -@register -@serializable -class MultiClassNMS(object): - def __init__(self, - score_threshold=.05, - nms_top_k=-1, - keep_top_k=100, - nms_threshold=.5, - normalized=True, - nms_eta=1.0, - return_index=False, - return_rois_num=True, - trt=False): - super(MultiClassNMS, self).__init__() - self.score_threshold = score_threshold - self.nms_top_k = nms_top_k - self.keep_top_k = keep_top_k - self.nms_threshold = nms_threshold - self.normalized = normalized - self.nms_eta = nms_eta - self.return_index = return_index - self.return_rois_num = return_rois_num - self.trt = trt - - def __call__(self, bboxes, score, background_label=-1): - """ - bboxes (Tensor|List[Tensor]): 1. (Tensor) Predicted bboxes with shape - [N, M, 4], N is the batch size and M - is the number of bboxes - 2. (List[Tensor]) bboxes and bbox_num, - bboxes have shape of [M, C, 4], C - is the class number and bbox_num means - the number of bboxes of each batch with - shape [N,] - score (Tensor): Predicted scores with shape [N, C, M] or [M, C] - background_label (int): Ignore the background label; For example, RCNN - is num_classes and YOLO is -1. 
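-            Returns (a sketch of the output, following ops.multiclass_nms,
-            which implements the operator):
-                bbox (Tensor): detections with shape [M, 6], each row being
-                    [label, confidence, xmin, ymin, xmax, ymax]
-                bbox_num (Tensor): number of detections kept per image, shape [N]
-                index (Tensor, optional): indices of the kept boxes, returned
-                    only when return_index is True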
- """ - kwargs = self.__dict__.copy() - if isinstance(bboxes, tuple): - bboxes, bbox_num = bboxes - kwargs.update({'rois_num': bbox_num}) - if background_label > -1: - kwargs.update({'background_label': background_label}) - kwargs.pop('trt') - # TODO(wangxinxin08): paddle version should be develop or 2.3 and above to run nms on tensorrt - if self.trt and (int(paddle.version.major) == 0 or - (int(paddle.version.major) >= 2 and - int(paddle.version.minor) >= 3)): - # TODO(wangxinxin08): tricky switch to run nms on tensorrt - kwargs.update({'nms_eta': 1.1}) - bbox, bbox_num, _ = ops.multiclass_nms(bboxes, score, **kwargs) - bbox = bbox.reshape([1, -1, 6]) - idx = paddle.nonzero(bbox[..., 0] != -1) - bbox = paddle.gather_nd(bbox, idx) - return bbox, bbox_num, None - else: - return ops.multiclass_nms(bboxes, score, **kwargs) - - -@register -@serializable -class MatrixNMS(object): - __append_doc__ = True - - def __init__(self, - score_threshold=.05, - post_threshold=.05, - nms_top_k=-1, - keep_top_k=100, - use_gaussian=False, - gaussian_sigma=2., - normalized=False, - background_label=0): - super(MatrixNMS, self).__init__() - self.score_threshold = score_threshold - self.post_threshold = post_threshold - self.nms_top_k = nms_top_k - self.keep_top_k = keep_top_k - self.normalized = normalized - self.use_gaussian = use_gaussian - self.gaussian_sigma = gaussian_sigma - self.background_label = background_label - - def __call__(self, bbox, score, *args): - return ops.matrix_nms( - bboxes=bbox, - scores=score, - score_threshold=self.score_threshold, - post_threshold=self.post_threshold, - nms_top_k=self.nms_top_k, - keep_top_k=self.keep_top_k, - use_gaussian=self.use_gaussian, - gaussian_sigma=self.gaussian_sigma, - background_label=self.background_label, - normalized=self.normalized) - - -@register -@serializable -class YOLOBox(object): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - conf_thresh=0.005, - downsample_ratio=32, - clip_bbox=True, - scale_x_y=1.): - self.num_classes = num_classes - self.conf_thresh = conf_thresh - self.downsample_ratio = downsample_ratio - self.clip_bbox = clip_bbox - self.scale_x_y = scale_x_y - - def __call__(self, - yolo_head_out, - anchors, - im_shape, - scale_factor, - var_weight=None): - boxes_list = [] - scores_list = [] - origin_shape = im_shape / scale_factor - origin_shape = paddle.cast(origin_shape, 'int32') - for i, head_out in enumerate(yolo_head_out): - boxes, scores = paddle.vision.ops.yolo_box( - head_out, - origin_shape, - anchors[i], - self.num_classes, - self.conf_thresh, - self.downsample_ratio // 2**i, - self.clip_bbox, - scale_x_y=self.scale_x_y) - boxes_list.append(boxes) - scores_list.append(paddle.transpose(scores, perm=[0, 2, 1])) - yolo_boxes = paddle.concat(boxes_list, axis=1) - yolo_scores = paddle.concat(scores_list, axis=2) - return yolo_boxes, yolo_scores - - -@register -@serializable -class SSDBox(object): - def __init__(self, - is_normalized=True, - prior_box_var=[0.1, 0.1, 0.2, 0.2], - use_fuse_decode=False): - self.is_normalized = is_normalized - self.norm_delta = float(not self.is_normalized) - self.prior_box_var = prior_box_var - self.use_fuse_decode = use_fuse_decode - - def __call__(self, - preds, - prior_boxes, - im_shape, - scale_factor, - var_weight=None): - boxes, scores = preds - boxes = paddle.concat(boxes, axis=1) - prior_boxes = paddle.concat(prior_boxes) - if self.use_fuse_decode: - output_boxes = ops.box_coder( - prior_boxes, - self.prior_box_var, - boxes, - code_type="decode_center_size", - 
box_normalized=self.is_normalized) - else: - pb_w = prior_boxes[:, 2] - prior_boxes[:, 0] + self.norm_delta - pb_h = prior_boxes[:, 3] - prior_boxes[:, 1] + self.norm_delta - pb_x = prior_boxes[:, 0] + pb_w * 0.5 - pb_y = prior_boxes[:, 1] + pb_h * 0.5 - out_x = pb_x + boxes[:, :, 0] * pb_w * self.prior_box_var[0] - out_y = pb_y + boxes[:, :, 1] * pb_h * self.prior_box_var[1] - out_w = paddle.exp(boxes[:, :, 2] * self.prior_box_var[2]) * pb_w - out_h = paddle.exp(boxes[:, :, 3] * self.prior_box_var[3]) * pb_h - output_boxes = paddle.stack( - [ - out_x - out_w / 2., out_y - out_h / 2., out_x + out_w / 2., - out_y + out_h / 2. - ], - axis=-1) - - if self.is_normalized: - h = (im_shape[:, 0] / scale_factor[:, 0]).unsqueeze(-1) - w = (im_shape[:, 1] / scale_factor[:, 1]).unsqueeze(-1) - im_shape = paddle.stack([w, h, w, h], axis=-1) - output_boxes *= im_shape - else: - output_boxes[..., -2:] -= 1.0 - output_scores = F.softmax(paddle.concat( - scores, axis=1)).transpose([0, 2, 1]) - - return output_boxes, output_scores - - -@register -class TTFBox(object): - __shared__ = ['down_ratio'] - - def __init__(self, max_per_img=100, score_thresh=0.01, down_ratio=4): - super(TTFBox, self).__init__() - self.max_per_img = max_per_img - self.score_thresh = score_thresh - self.down_ratio = down_ratio - - def _simple_nms(self, heat, kernel=3): - """ - Use maxpool to filter the max score, get local peaks. - """ - pad = (kernel - 1) // 2 - hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) - keep = paddle.cast(hmax == heat, 'float32') - return heat * keep - - def _topk(self, scores): - """ - Select top k scores and decode to get xy coordinates. - """ - k = self.max_per_img - shape_fm = paddle.shape(scores) - shape_fm.stop_gradient = True - cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] - # batch size is 1 - scores_r = paddle.reshape(scores, [cat, -1]) - topk_scores, topk_inds = paddle.topk(scores_r, k) - topk_ys = topk_inds // width - topk_xs = topk_inds % width - - topk_score_r = paddle.reshape(topk_scores, [-1]) - topk_score, topk_ind = paddle.topk(topk_score_r, k) - k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64') - topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') - - topk_inds = paddle.reshape(topk_inds, [-1]) - topk_ys = paddle.reshape(topk_ys, [-1, 1]) - topk_xs = paddle.reshape(topk_xs, [-1, 1]) - topk_inds = paddle.gather(topk_inds, topk_ind) - topk_ys = paddle.gather(topk_ys, topk_ind) - topk_xs = paddle.gather(topk_xs, topk_ind) - - return topk_score, topk_inds, topk_clses, topk_ys, topk_xs - - def _decode(self, hm, wh, im_shape, scale_factor): - heatmap = F.sigmoid(hm) - heat = self._simple_nms(heatmap) - scores, inds, clses, ys, xs = self._topk(heat) - ys = paddle.cast(ys, 'float32') * self.down_ratio - xs = paddle.cast(xs, 'float32') * self.down_ratio - scores = paddle.tensor.unsqueeze(scores, [1]) - clses = paddle.tensor.unsqueeze(clses, [1]) - - wh_t = paddle.transpose(wh, [0, 2, 3, 1]) - wh = paddle.reshape(wh_t, [-1, paddle.shape(wh_t)[-1]]) - wh = paddle.gather(wh, inds) - - x1 = xs - wh[:, 0:1] - y1 = ys - wh[:, 1:2] - x2 = xs + wh[:, 2:3] - y2 = ys + wh[:, 3:4] - - bboxes = paddle.concat([x1, y1, x2, y2], axis=1) - - scale_y = scale_factor[:, 0:1] - scale_x = scale_factor[:, 1:2] - scale_expand = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], axis=1) - boxes_shape = paddle.shape(bboxes) - boxes_shape.stop_gradient = True - scale_expand = paddle.expand(scale_expand, shape=boxes_shape) - bboxes = paddle.divide(bboxes, scale_expand) - 
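-        # dividing by the tiled scale factors maps the decoded boxes back to
-        # the original image scale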
results = paddle.concat([clses, scores, bboxes], axis=1) - # hack: append result with cls=-1 and score=1. to avoid all scores - # are less than score_thresh which may cause error in gather. - fill_r = paddle.to_tensor(np.array([[-1, 1, 0, 0, 0, 0]])) - fill_r = paddle.cast(fill_r, results.dtype) - results = paddle.concat([results, fill_r]) - scores = results[:, 1] - valid_ind = paddle.nonzero(scores > self.score_thresh) - results = paddle.gather(results, valid_ind) - return results, paddle.shape(results)[0:1] - - def __call__(self, hm, wh, im_shape, scale_factor): - results = [] - results_num = [] - for i in range(scale_factor.shape[0]): - result, num = self._decode(hm[i:i + 1, ], wh[i:i + 1, ], - im_shape[i:i + 1, ], - scale_factor[i:i + 1, ]) - results.append(result) - results_num.append(num) - results = paddle.concat(results, axis=0) - results_num = paddle.concat(results_num, axis=0) - return results, results_num - - -@register -@serializable -class JDEBox(object): - __shared__ = ['num_classes'] - - def __init__(self, num_classes=1, conf_thresh=0.3, downsample_ratio=32): - self.num_classes = num_classes - self.conf_thresh = conf_thresh - self.downsample_ratio = downsample_ratio - - def generate_anchor(self, nGh, nGw, anchor_wh): - nA = len(anchor_wh) - yv, xv = paddle.meshgrid([paddle.arange(nGh), paddle.arange(nGw)]) - mesh = paddle.stack( - (xv, yv), axis=0).cast(dtype='float32') # 2 x nGh x nGw - meshs = paddle.tile(mesh, [nA, 1, 1, 1]) - - anchor_offset_mesh = anchor_wh[:, :, None][:, :, :, None].repeat( - int(nGh), axis=-2).repeat( - int(nGw), axis=-1) - anchor_offset_mesh = paddle.to_tensor( - anchor_offset_mesh.astype(np.float32)) - # nA x 2 x nGh x nGw - - anchor_mesh = paddle.concat([meshs, anchor_offset_mesh], axis=1) - anchor_mesh = paddle.transpose(anchor_mesh, - [0, 2, 3, 1]) # (nA x nGh x nGw) x 4 - return anchor_mesh - - def decode_delta(self, delta, fg_anchor_list): - px, py, pw, ph = fg_anchor_list[:, 0], fg_anchor_list[:,1], \ - fg_anchor_list[:, 2], fg_anchor_list[:,3] - dx, dy, dw, dh = delta[:, 0], delta[:, 1], delta[:, 2], delta[:, 3] - gx = pw * dx + px - gy = ph * dy + py - gw = pw * paddle.exp(dw) - gh = ph * paddle.exp(dh) - gx1 = gx - gw * 0.5 - gy1 = gy - gh * 0.5 - gx2 = gx + gw * 0.5 - gy2 = gy + gh * 0.5 - return paddle.stack([gx1, gy1, gx2, gy2], axis=1) - - def decode_delta_map(self, nA, nGh, nGw, delta_map, anchor_vec): - anchor_mesh = self.generate_anchor(nGh, nGw, anchor_vec) - anchor_mesh = paddle.unsqueeze(anchor_mesh, 0) - pred_list = self.decode_delta( - paddle.reshape( - delta_map, shape=[-1, 4]), - paddle.reshape( - anchor_mesh, shape=[-1, 4])) - pred_map = paddle.reshape(pred_list, shape=[nA * nGh * nGw, 4]) - return pred_map - - def _postprocessing_by_level(self, nA, stride, head_out, anchor_vec): - boxes_shape = head_out.shape # [nB, nA*6, nGh, nGw] - nGh, nGw = boxes_shape[-2], boxes_shape[-1] - nB = 1 # TODO: only support bs=1 now - boxes_list, scores_list = [], [] - for idx in range(nB): - p = paddle.reshape( - head_out[idx], shape=[nA, self.num_classes + 5, nGh, nGw]) - p = paddle.transpose(p, perm=[0, 2, 3, 1]) # [nA, nGh, nGw, 6] - delta_map = p[:, :, :, :4] - boxes = self.decode_delta_map(nA, nGh, nGw, delta_map, anchor_vec) - # [nA * nGh * nGw, 4] - boxes_list.append(boxes * stride) - - p_conf = paddle.transpose( - p[:, :, :, 4:6], perm=[3, 0, 1, 2]) # [2, nA, nGh, nGw] - p_conf = F.softmax( - p_conf, axis=0)[1, :, :, :].unsqueeze(-1) # [nA, nGh, nGw, 1] - scores = paddle.reshape(p_conf, shape=[nA * nGh * nGw, 1]) - 
scores_list.append(scores) - - boxes_results = paddle.stack(boxes_list) - scores_results = paddle.stack(scores_list) - return boxes_results, scores_results - - def __call__(self, yolo_head_out, anchors): - bbox_pred_list = [] - for i, head_out in enumerate(yolo_head_out): - stride = self.downsample_ratio // 2**i - anc_w, anc_h = anchors[i][0::2], anchors[i][1::2] - anchor_vec = np.stack((anc_w, anc_h), axis=1) / stride - nA = len(anc_w) - boxes, scores = self._postprocessing_by_level(nA, stride, head_out, - anchor_vec) - bbox_pred_list.append(paddle.concat([boxes, scores], axis=-1)) - - yolo_boxes_scores = paddle.concat(bbox_pred_list, axis=1) - boxes_idx_over_conf_thr = paddle.nonzero( - yolo_boxes_scores[:, :, -1] > self.conf_thresh) - boxes_idx_over_conf_thr.stop_gradient = True - - return boxes_idx_over_conf_thr, yolo_boxes_scores - - -@register -@serializable -class MaskMatrixNMS(object): - """ - Matrix NMS for multi-class masks. - Args: - update_threshold (float): Updated threshold of categroy score in second time. - pre_nms_top_n (int): Number of total instance to be kept per image before NMS - post_nms_top_n (int): Number of total instance to be kept per image after NMS. - kernel (str): 'linear' or 'gaussian'. - sigma (float): std in gaussian method. - Input: - seg_preds (Variable): shape (n, h, w), segmentation feature maps - seg_masks (Variable): shape (n, h, w), segmentation feature maps - cate_labels (Variable): shape (n), mask labels in descending order - cate_scores (Variable): shape (n), mask scores in descending order - sum_masks (Variable): a float tensor of the sum of seg_masks - Returns: - Variable: cate_scores, tensors of shape (n) - """ - - def __init__(self, - update_threshold=0.05, - pre_nms_top_n=500, - post_nms_top_n=100, - kernel='gaussian', - sigma=2.0): - super(MaskMatrixNMS, self).__init__() - self.update_threshold = update_threshold - self.pre_nms_top_n = pre_nms_top_n - self.post_nms_top_n = post_nms_top_n - self.kernel = kernel - self.sigma = sigma - - def _sort_score(self, scores, top_num): - if paddle.shape(scores)[0] > top_num: - return paddle.topk(scores, top_num)[1] - else: - return paddle.argsort(scores, descending=True) - - def __call__(self, - seg_preds, - seg_masks, - cate_labels, - cate_scores, - sum_masks=None): - # sort and keep top nms_pre - sort_inds = self._sort_score(cate_scores, self.pre_nms_top_n) - seg_masks = paddle.gather(seg_masks, index=sort_inds) - seg_preds = paddle.gather(seg_preds, index=sort_inds) - sum_masks = paddle.gather(sum_masks, index=sort_inds) - cate_scores = paddle.gather(cate_scores, index=sort_inds) - cate_labels = paddle.gather(cate_labels, index=sort_inds) - - seg_masks = paddle.flatten(seg_masks, start_axis=1, stop_axis=-1) - # inter. - inter_matrix = paddle.mm(seg_masks, paddle.transpose(seg_masks, [1, 0])) - n_samples = paddle.shape(cate_labels) - # union. - sum_masks_x = paddle.expand(sum_masks, shape=[n_samples, n_samples]) - # iou. - iou_matrix = (inter_matrix / ( - sum_masks_x + paddle.transpose(sum_masks_x, [1, 0]) - inter_matrix)) - iou_matrix = paddle.triu(iou_matrix, diagonal=1) - # label_specific matrix. 
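-        # (the decay computed below only acts between masks of the same category)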
- cate_labels_x = paddle.expand(cate_labels, shape=[n_samples, n_samples]) - label_matrix = paddle.cast( - (cate_labels_x == paddle.transpose(cate_labels_x, [1, 0])), - 'float32') - label_matrix = paddle.triu(label_matrix, diagonal=1) - - # IoU compensation - compensate_iou = paddle.max((iou_matrix * label_matrix), axis=0) - compensate_iou = paddle.expand( - compensate_iou, shape=[n_samples, n_samples]) - compensate_iou = paddle.transpose(compensate_iou, [1, 0]) - - # IoU decay - decay_iou = iou_matrix * label_matrix - - # matrix nms - if self.kernel == 'gaussian': - decay_matrix = paddle.exp(-1 * self.sigma * (decay_iou**2)) - compensate_matrix = paddle.exp(-1 * self.sigma * - (compensate_iou**2)) - decay_coefficient = paddle.min(decay_matrix / compensate_matrix, - axis=0) - elif self.kernel == 'linear': - decay_matrix = (1 - decay_iou) / (1 - compensate_iou) - decay_coefficient = paddle.min(decay_matrix, axis=0) - else: - raise NotImplementedError - - # update the score. - cate_scores = cate_scores * decay_coefficient - y = paddle.zeros(shape=paddle.shape(cate_scores), dtype='float32') - keep = paddle.where(cate_scores >= self.update_threshold, cate_scores, - y) - keep = paddle.nonzero(keep) - keep = paddle.squeeze(keep, axis=[1]) - # Prevent empty and increase fake data - keep = paddle.concat( - [keep, paddle.cast(paddle.shape(cate_scores)[0:1] - 1, 'int64')]) - - seg_preds = paddle.gather(seg_preds, index=keep) - cate_scores = paddle.gather(cate_scores, index=keep) - cate_labels = paddle.gather(cate_labels, index=keep) - - # sort and keep top_k - sort_inds = self._sort_score(cate_scores, self.post_nms_top_n) - seg_preds = paddle.gather(seg_preds, index=sort_inds) - cate_scores = paddle.gather(cate_scores, index=sort_inds) - cate_labels = paddle.gather(cate_labels, index=sort_inds) - return seg_preds, cate_scores, cate_labels - - -def Conv2d(in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - bias=True, - weight_init=Normal(std=0.001), - bias_init=Constant(0.)): - weight_attr = paddle.framework.ParamAttr(initializer=weight_init) - if bias: - bias_attr = paddle.framework.ParamAttr(initializer=bias_init) - else: - bias_attr = False - conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation, - groups, - weight_attr=weight_attr, - bias_attr=bias_attr) - return conv - - -def ConvTranspose2d(in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - output_padding=0, - groups=1, - bias=True, - dilation=1, - weight_init=Normal(std=0.001), - bias_init=Constant(0.)): - weight_attr = paddle.framework.ParamAttr(initializer=weight_init) - if bias: - bias_attr = paddle.framework.ParamAttr(initializer=bias_init) - else: - bias_attr = False - conv = nn.Conv2DTranspose( - in_channels, - out_channels, - kernel_size, - stride, - padding, - output_padding, - dilation, - groups, - weight_attr=weight_attr, - bias_attr=bias_attr) - return conv - - -def BatchNorm2d(num_features, eps=1e-05, momentum=0.9, affine=True): - if not affine: - weight_attr = False - bias_attr = False - else: - weight_attr = None - bias_attr = None - batchnorm = nn.BatchNorm2D( - num_features, - momentum, - eps, - weight_attr=weight_attr, - bias_attr=bias_attr) - return batchnorm - - -def ReLU(): - return nn.ReLU() - - -def Upsample(scale_factor=None, mode='nearest', align_corners=False): - return nn.Upsample(None, scale_factor, mode, align_corners) - - -def MaxPool(kernel_size, stride, padding, ceil_mode=False): - return 
nn.MaxPool2D(kernel_size, stride, padding, ceil_mode=ceil_mode) - - -class Concat(nn.Layer): - def __init__(self, dim=0): - super(Concat, self).__init__() - self.dim = dim - - def forward(self, inputs): - return paddle.concat(inputs, axis=self.dim) - - def extra_repr(self): - return 'dim={}'.format(self.dim) - - -def _convert_attention_mask(attn_mask, dtype): - """ - Convert the attention mask to the target dtype we expect. - Parameters: - attn_mask (Tensor, optional): A tensor used in multi-head attention - to prevents attention to some unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when - nothing wanted or needed to be prevented attention to. Default None. - dtype (VarType): The target type of `attn_mask` we expect. - Returns: - Tensor: A Tensor with shape same as input `attn_mask`, with data type `dtype`. - """ - return nn.layer.transformer._convert_attention_mask(attn_mask, dtype) - - -@register -class MultiHeadAttention(nn.Layer): - """ - Attention mapps queries and a set of key-value pairs to outputs, and - Multi-Head Attention performs multiple parallel attention to jointly attending - to information from different representation subspaces. - - Please refer to `Attention Is All You Need `_ - for more details. - - Parameters: - embed_dim (int): The expected feature size in the input and output. - num_heads (int): The number of heads in multi-head attention. - dropout (float, optional): The dropout probability used on attention - weights to drop some attention targets. 0 for no dropout. Default 0 - kdim (int, optional): The feature size in key. If None, assumed equal to - `embed_dim`. Default None. - vdim (int, optional): The feature size in value. If None, assumed equal to - `embed_dim`. Default None. - need_weights (bool, optional): Indicate whether to return the attention - weights. Default False. - - Examples: - - .. 
code-block:: python - - import paddle - - # encoder input: [batch_size, sequence_length, d_model] - query = paddle.rand((2, 4, 128)) - # self attention mask: [batch_size, num_heads, query_len, query_len] - attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.nn.MultiHeadAttention(128, 2) - output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] - """ - - def __init__(self, - embed_dim, - num_heads, - dropout=0., - kdim=None, - vdim=None, - need_weights=False): - super(MultiHeadAttention, self).__init__() - self.embed_dim = embed_dim - self.kdim = kdim if kdim is not None else embed_dim - self.vdim = vdim if vdim is not None else embed_dim - self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim - - self.num_heads = num_heads - self.dropout = dropout - self.need_weights = need_weights - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - if self._qkv_same_embed_dim: - self.in_proj_weight = self.create_parameter( - shape=[embed_dim, 3 * embed_dim], - attr=None, - dtype=self._dtype, - is_bias=False) - self.in_proj_bias = self.create_parameter( - shape=[3 * embed_dim], - attr=None, - dtype=self._dtype, - is_bias=True) - else: - self.q_proj = nn.Linear(embed_dim, embed_dim) - self.k_proj = nn.Linear(self.kdim, embed_dim) - self.v_proj = nn.Linear(self.vdim, embed_dim) - - self.out_proj = nn.Linear(embed_dim, embed_dim) - self._type_list = ('q_proj', 'k_proj', 'v_proj') - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - else: - constant_(p) - - def compute_qkv(self, tensor, index): - if self._qkv_same_embed_dim: - tensor = F.linear( - x=tensor, - weight=self.in_proj_weight[:, index * self.embed_dim:(index + 1) - * self.embed_dim], - bias=self.in_proj_bias[index * self.embed_dim:(index + 1) * - self.embed_dim] - if self.in_proj_bias is not None else None) - else: - tensor = getattr(self, self._type_list[index])(tensor) - tensor = tensor.reshape( - [0, 0, self.num_heads, self.head_dim]).transpose([0, 2, 1, 3]) - return tensor - - def forward(self, query, key=None, value=None, attn_mask=None): - r""" - Applies multi-head attention to map queries and a set of key-value pairs - to outputs. - - Parameters: - query (Tensor): The queries for multi-head attention. It is a - tensor with shape `[batch_size, query_length, embed_dim]`. The - data type should be float32 or float64. - key (Tensor, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, key_length, kdim]`. The - data type should be float32 or float64. If None, use `query` as - `key`. Default None. - value (Tensor, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, value_length, vdim]`. - The data type should be float32 or float64. If None, use `query` as - `value`. Default None. - attn_mask (Tensor, optional): A tensor used in multi-head attention - to prevents attention to some unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`. - When the data type is bool, the unwanted positions have `False` - values and the others have `True` values. When the data type is - int, the unwanted positions have 0 values and the others have 1 - values. 
When the data type is float, the unwanted positions have - `-INF` values and the others have 0 values. It can be None when - nothing wanted or needed to be prevented attention to. Default None. - - Returns: - Tensor|tuple: It is a tensor that has the same shape and data type \ - as `query`, representing attention output. Or a tuple if \ - `need_weights` is True or `cache` is not None. If `need_weights` \ - is True, except for attention output, the tuple also includes \ - the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ - If `cache` is not None, the tuple then includes the new cache \ - having the same type as `cache`, and if it is `StaticCache`, it \ - is same as the input `cache`, if it is `Cache`, the new cache \ - reserves tensors concatanating raw tensors with intermediate \ - results of current query. - """ - key = query if key is None else key - value = query if value is None else value - # compute q ,k ,v - q, k, v = (self.compute_qkv(t, i) - for i, t in enumerate([query, key, value])) - - # scale dot product attention - product = paddle.matmul(x=q, y=k, transpose_y=True) - scaling = float(self.head_dim)**-0.5 - product = product * scaling - - if attn_mask is not None: - # Support bool or int mask - attn_mask = _convert_attention_mask(attn_mask, product.dtype) - product = product + attn_mask - weights = F.softmax(product) - if self.dropout: - weights = F.dropout( - weights, - self.dropout, - training=self.training, - mode="upscale_in_train") - out = paddle.matmul(weights, v) - - # combine heads - out = paddle.transpose(out, perm=[0, 2, 1, 3]) - out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.out_proj(out) - - outs = [out] - if self.need_weights: - outs.append(weights) - return out if len(outs) == 1 else tuple(outs) - - -@register -class ConvMixer(nn.Layer): - def __init__( - self, - dim, - depth, - kernel_size=3, ): - super().__init__() - self.dim = dim - self.depth = depth - self.kernel_size = kernel_size - - self.mixer = self.conv_mixer(dim, depth, kernel_size) - - def forward(self, x): - return self.mixer(x) - - @staticmethod - def conv_mixer( - dim, - depth, - kernel_size, ): - Seq, ActBn = nn.Sequential, lambda x: Seq(x, nn.GELU(), nn.BatchNorm2D(dim)) - Residual = type('Residual', (Seq, ), - {'forward': lambda self, x: self[0](x) + x}) - return Seq(* [ - Seq(Residual( - ActBn( - nn.Conv2D( - dim, dim, kernel_size, groups=dim, padding="same"))), - ActBn(nn.Conv2D(dim, dim, 1))) for i in range(depth) - ]) diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/losses/__init__.py deleted file mode 100644 index 41b3ae0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/__init__.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import yolo_loss -from . import iou_aware_loss -from . import iou_loss -from . 
import ssd_loss -from . import fcos_loss -from . import solov2_loss -from . import ctfocal_loss -from . import keypoint_loss -from . import jde_loss -from . import fairmot_loss -from . import gfocal_loss -from . import detr_loss -from . import sparsercnn_loss -from . import focal_loss -from . import smooth_l1_loss -from . import probiou_loss -from . import cot_loss -from . import supcontrast -from . import queryinst_loss -from . import clrnet_loss -from . import clrnet_line_iou_loss - -from .yolo_loss import * -from .iou_aware_loss import * -from .iou_loss import * -from .ssd_loss import * -from .fcos_loss import * -from .solov2_loss import * -from .ctfocal_loss import * -from .keypoint_loss import * -from .jde_loss import * -from .fairmot_loss import * -from .gfocal_loss import * -from .detr_loss import * -from .sparsercnn_loss import * -from .focal_loss import * -from .smooth_l1_loss import * -from .pose3d_loss import * -from .probiou_loss import * -from .cot_loss import * -from .supcontrast import * -from .queryinst_loss import * -from .clrnet_loss import * -from .clrnet_line_iou_loss import * \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_line_iou_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_line_iou_loss.py deleted file mode 100644 index 2a1973d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_line_iou_loss.py +++ /dev/null @@ -1,41 +0,0 @@ -import paddle - - -def line_iou(pred, target, img_w, length=15, aligned=True): - ''' - Calculate the line iou value between predictions and targets - Args: - pred: lane predictions, shape: (num_pred, 72) - target: ground truth, shape: (num_target, 72) - img_w: image width - length: extended radius - aligned: True for iou loss calculation, False for pair-wise ious in assign - ''' - px1 = pred - length - px2 = pred + length - tx1 = target - length - tx2 = target + length - - if aligned: - invalid_mask = target - ovr = paddle.minimum(px2, tx2) - paddle.maximum(px1, tx1) - union = paddle.maximum(px2, tx2) - paddle.minimum(px1, tx1) - else: - num_pred = pred.shape[0] - invalid_mask = target.tile([num_pred, 1, 1]) - - ovr = (paddle.minimum(px2[:, None, :], tx2[None, ...]) - paddle.maximum( - px1[:, None, :], tx1[None, ...])) - union = (paddle.maximum(px2[:, None, :], tx2[None, ...]) - - paddle.minimum(px1[:, None, :], tx1[None, ...])) - - invalid_masks = (invalid_mask < 0) | (invalid_mask >= img_w) - - ovr[invalid_masks] = 0. - union[invalid_masks] = 0. 
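-    # points outside [0, img_w) are invalid: they add nothing to overlap or union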
-    iou = ovr.sum(axis=-1) / (union.sum(axis=-1) + 1e-9)
-    return iou
-
-
-def liou_loss(pred, target, img_w, length=15):
-    return (1 - line_iou(pred, target, img_w, length)).mean()
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_loss.py
deleted file mode 100644
index b4ad39e..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/clrnet_loss.py
+++ /dev/null
@@ -1,283 +0,0 @@
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from ppdet.core.workspace import register
-from ppdet.modeling.clrnet_utils import accuracy
-from ppdet.modeling.assigners.clrnet_assigner import assign
-from ppdet.modeling.losses.clrnet_line_iou_loss import liou_loss
-
-__all__ = ['CLRNetLoss']
-
-
-class SoftmaxFocalLoss(nn.Layer):
-    def __init__(self, gamma, ignore_lb=255, *args, **kwargs):
-        super(SoftmaxFocalLoss, self).__init__()
-        self.gamma = gamma
-        self.nll = nn.NLLLoss(ignore_index=ignore_lb)
-
-    def forward(self, logits, labels):
-        scores = F.softmax(logits, axis=1)
-        factor = paddle.pow(1. - scores, self.gamma)
-        log_score = F.log_softmax(logits, axis=1)
-        log_score = factor * log_score
-        loss = self.nll(log_score, labels)
-        return loss
-
-
-def focal_loss(input: paddle.Tensor,
-               target: paddle.Tensor,
-               alpha: float,
-               gamma: float=2.0,
-               reduction: str='none',
-               eps: float=1e-8) -> paddle.Tensor:
-    r"""Function that computes Focal loss.
-
-    See :class:`~kornia.losses.FocalLoss` for details.
-    """
-    if not paddle.is_tensor(input):
-        raise TypeError("Input type is not a paddle.Tensor. Got {}".format(
-            type(input)))
-
-    if not len(input.shape) >= 2:
-        raise ValueError("Invalid input shape, we expect BxCx*. Got: {}".format(
-            input.shape))
-
-    if input.shape[0] != target.shape[0]:
-        raise ValueError(
-            'Expected input batch_size ({}) to match target batch_size ({}).'.
-            format(input.shape[0], target.shape[0]))
-
-    n = input.shape[0]
-    out_size = (n, ) + tuple(input.shape[2:])
-    if target.shape[1:] != input.shape[2:]:
-        raise ValueError('Expected target size {}, got {}'.format(out_size,
-                                                                  target.shape))
-    if (isinstance(input.place, paddle.CUDAPlace) and
-            isinstance(target.place, paddle.CPUPlace)) | (isinstance(
-                input.place, paddle.CPUPlace) and isinstance(
-                    target.place, paddle.CUDAPlace)):
-        raise ValueError(
-            "input and target must be in the same device. Got: {} and {}".
-            format(input.place, target.place))
-
-    # compute softmax over the classes axis
-    input_soft: paddle.Tensor = F.softmax(input, axis=1) + eps
-
-    # create the labels one hot tensor
-    target_one_hot: paddle.Tensor = paddle.to_tensor(
-        F.one_hot(
-            target, num_classes=input.shape[1]).cast(input.dtype),
-        place=input.place)
-
-    # compute the actual focal loss
-    weight = paddle.pow(-input_soft + 1., gamma)
-
-    focal = -alpha * weight * paddle.log(input_soft)
-    loss_tmp = paddle.sum(target_one_hot * focal, axis=1)
-
-    if reduction == 'none':
-        loss = loss_tmp
-    elif reduction == 'mean':
-        loss = paddle.mean(loss_tmp)
-    elif reduction == 'sum':
-        loss = paddle.sum(loss_tmp)
-    else:
-        raise NotImplementedError("Invalid reduction mode: {}".format(
-            reduction))
-    return loss
-
-
-class FocalLoss(nn.Layer):
-    r"""Criterion that computes Focal loss.
-
-    According to [1], the Focal loss is computed as follows:
-
-    .. math::
-
-        \text{FL}(p_t) = -\alpha_t (1 - p_t)^{\gamma} \, \text{log}(p_t)
-
-    where:
-       - :math:`p_t` is the model's estimated probability for each class.
-
-
-    Arguments:
-        alpha (float): Weighting factor :math:`\alpha \in [0, 1]`.
- gamma (float): Focusing parameter :math:`\gamma >= 0`. - reduction (str, optional): Specifies the reduction to apply to the - output: ‘none’ | ‘mean’ | ‘sum’. ‘none’: no reduction will be applied, - ‘mean’: the sum of the output will be divided by the number of elements - in the output, ‘sum’: the output will be summed. Default: ‘none’. - - Shape: - - Input: :math:`(N, C, *)` where C = number of classes. - - Target: :math:`(N, *)` where each value is - :math:`0 ≤ targets[i] ≤ C−1`. - - Examples: - >>> N = 5 # num_classes - >>> kwargs = {"alpha": 0.5, "gamma": 2.0, "reduction": 'mean'} - >>> loss = kornia.losses.FocalLoss(**kwargs) - >>> input = torch.randn(1, N, 3, 5, requires_grad=True) - >>> target = torch.empty(1, 3, 5, dtype=torch.long).random_(N) - >>> output = loss(input, target) - >>> output.backward() - - References: - [1] https://arxiv.org/abs/1708.02002 - """ - - def __init__(self, alpha: float, gamma: float=2.0, - reduction: str='none') -> None: - super(FocalLoss, self).__init__() - self.alpha: float = alpha - self.gamma: float = gamma - self.reduction: str = reduction - self.eps: float = 1e-6 - - def forward( # type: ignore - self, input: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor: - return focal_loss(input, target, self.alpha, self.gamma, self.reduction, - self.eps) - - -@register -class CLRNetLoss(nn.Layer): - __shared__ = ['img_w', 'img_h', 'num_classes', 'num_points'] - - def __init__(self, - cls_loss_weight=2.0, - xyt_loss_weight=0.2, - iou_loss_weight=2.0, - seg_loss_weight=1.0, - refine_layers=3, - num_points=72, - img_w=800, - img_h=320, - num_classes=5, - ignore_label=255, - bg_weight=0.4): - super(CLRNetLoss, self).__init__() - self.cls_loss_weight = cls_loss_weight - self.xyt_loss_weight = xyt_loss_weight - self.iou_loss_weight = iou_loss_weight - self.seg_loss_weight = seg_loss_weight - self.refine_layers = refine_layers - self.img_w = img_w - self.img_h = img_h - self.n_strips = num_points - 1 - self.num_classes = num_classes - self.ignore_label = ignore_label - weights = paddle.ones(shape=[self.num_classes]) - weights[0] = bg_weight - self.criterion = nn.NLLLoss( - ignore_index=self.ignore_label, weight=weights) - - def forward(self, output, batch): - predictions_lists = output['predictions_lists'] - targets = batch['lane_line'].clone() - cls_criterion = FocalLoss(alpha=0.25, gamma=2.0) - cls_loss = paddle.to_tensor(0.0) - reg_xytl_loss = paddle.to_tensor(0.0) - iou_loss = paddle.to_tensor(0.0) - cls_acc = [] - cls_acc_stage = [] - for stage in range(self.refine_layers): - predictions_list = predictions_lists[stage] - for predictions, target in zip(predictions_list, targets): - target = target[target[:, 1] == 1] - - if len(target) == 0: - # If there are no targets, all predictions have to be negatives (i.e., 0 confidence) - cls_target = paddle.zeros( - [predictions.shape[0]], dtype='int64') - cls_pred = predictions[:, :2] - cls_loss = cls_loss + cls_criterion(cls_pred, - cls_target).sum() - continue - - with paddle.no_grad(): - matched_row_inds, matched_col_inds = assign( - predictions, target, self.img_w, self.img_h) - - # classification targets - cls_target = paddle.zeros([predictions.shape[0]], dtype='int64') - cls_target[matched_row_inds] = 1 - cls_pred = predictions[:, :2] - - # regression targets -> [start_y, start_x, theta] (all transformed to absolute values), only on matched pairs - reg_yxtl = predictions.index_select(matched_row_inds)[..., 2:6] - - reg_yxtl[:, 0] *= self.n_strips - reg_yxtl[:, 1] *= (self.img_w - 1) - reg_yxtl[:, 2] *= 180 
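-                # theta is rescaled from the normalized [0, 1] range to degrees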
- reg_yxtl[:, 3] *= self.n_strips - - target_yxtl = target.index_select(matched_col_inds)[..., 2: - 6].clone() - - # regression targets -> S coordinates (all transformed to absolute values) - reg_pred = predictions.index_select(matched_row_inds)[..., 6:] - reg_pred *= (self.img_w - 1) - reg_targets = target.index_select(matched_col_inds)[..., - 6:].clone() - - with paddle.no_grad(): - predictions_starts = paddle.clip( - (predictions.index_select(matched_row_inds)[..., 2] * - self.n_strips).round().cast("int64"), - min=0, - max=self. - n_strips) # ensure the predictions starts is valid - - target_starts = ( - target.index_select(matched_col_inds)[..., 2] * - self.n_strips).round().cast("int64") - target_yxtl[:, -1] -= ( - predictions_starts - target_starts) # reg length - - # Loss calculation - cls_loss = cls_loss + cls_criterion( - cls_pred, cls_target).sum() / target.shape[0] - - target_yxtl[:, 0] *= self.n_strips - target_yxtl[:, 2] *= 180 - - reg_xytl_loss = reg_xytl_loss + F.smooth_l1_loss( - input=reg_yxtl, label=target_yxtl, reduction='none').mean() - - iou_loss = iou_loss + liou_loss( - reg_pred, reg_targets, self.img_w, length=15) - - cls_accuracy = accuracy(cls_pred, cls_target) - cls_acc_stage.append(cls_accuracy) - - cls_acc.append(sum(cls_acc_stage) / (len(cls_acc_stage) + 1e-5)) - - # extra segmentation loss - seg_loss = self.criterion( - F.log_softmax( - output['seg'], axis=1), batch['seg'].cast('int64')) - - cls_loss /= (len(targets) * self.refine_layers) - reg_xytl_loss /= (len(targets) * self.refine_layers) - iou_loss /= (len(targets) * self.refine_layers) - - loss = cls_loss * self.cls_loss_weight \ - + reg_xytl_loss * self.xyt_loss_weight \ - + seg_loss * self.seg_loss_weight \ - + iou_loss * self.iou_loss_weight - - return_value = { - 'loss': loss, - 'cls_loss': cls_loss * self.cls_loss_weight, - 'reg_xytl_loss': reg_xytl_loss * self.xyt_loss_weight, - 'seg_loss': seg_loss * self.seg_loss_weight, - 'iou_loss': iou_loss * self.iou_loss_weight - } - - for i in range(self.refine_layers): - if not isinstance(cls_acc[i], paddle.Tensor): - cls_acc[i] = paddle.to_tensor(cls_acc[i]) - return_value['stage_{}_acc'.format(i)] = cls_acc[i] - - return return_value diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/cot_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/cot_loss.py deleted file mode 100644 index 40f8f9a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/cot_loss.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import numpy as np -from ppdet.core.workspace import register - -__all__ = ['COTLoss'] - -@register -class COTLoss(nn.Layer): - __shared__ = ['num_classes'] - def __init__(self, - num_classes=80, - cot_scale=1, - cot_lambda=1): - super(COTLoss, self).__init__() - self.cot_scale = cot_scale - self.cot_lambda = cot_lambda - self.num_classes = num_classes - - def forward(self, scores, targets, cot_relation): - cls_name = 'loss_bbox_cls_cot' - loss_bbox = {} - - tgt_labels, tgt_bboxes, tgt_gt_inds = targets - tgt_labels = paddle.concat(tgt_labels) if len( - tgt_labels) > 1 else tgt_labels[0] - mask = (tgt_labels < self.num_classes) - valid_inds = paddle.nonzero(tgt_labels >= 0).flatten() - if valid_inds.shape[0] == 0: - loss_bbox[cls_name] = paddle.zeros([1], dtype='float32') - else: - tgt_labels = tgt_labels.cast('int64') - valid_cot_targets = [] - for i in range(tgt_labels.shape[0]): - train_label = tgt_labels[i] - if train_label < self.num_classes: - valid_cot_targets.append(cot_relation[train_label]) - coco_targets = paddle.to_tensor(valid_cot_targets) - coco_targets.stop_gradient = True - coco_loss = - coco_targets * F.log_softmax(scores[mask][:, :-1] * self.cot_scale) - loss_bbox[cls_name] = self.cot_lambda * paddle.mean(paddle.sum(coco_loss, axis=-1)) - return loss_bbox diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/ctfocal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/ctfocal_loss.py deleted file mode 100644 index dd00eb8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/ctfocal_loss.py +++ /dev/null @@ -1,68 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle - -from ppdet.core.workspace import register, serializable - -__all__ = ['CTFocalLoss'] - - -@register -@serializable -class CTFocalLoss(object): - """ - CTFocalLoss: CornerNet & CenterNet Focal Loss - Args: - loss_weight (float): loss weight - gamma (float): gamma parameter for Focal Loss - """ - - def __init__(self, loss_weight=1., gamma=2.0): - self.loss_weight = loss_weight - self.gamma = gamma - - def __call__(self, pred, target): - """ - Calculate the loss - Args: - pred (Tensor): heatmap prediction - target (Tensor): target for positive samples - Return: - ct_focal_loss (Tensor): Focal Loss used in CornerNet & CenterNet. - Note that the values in target are in [0, 1] since gaussian is - used to reduce the punishment and we treat [0, 1) as neg example. 
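-                Concretely, positives (target == 1) contribute
-                -log(pred) * (1 - pred)^gamma, while the remaining locations
-                contribute -log(1 - pred) * pred^gamma * (1 - target)^4; the
-                sum is normalized by the number of positive samples.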
- """ - fg_map = paddle.cast(target == 1, 'float32') - fg_map.stop_gradient = True - bg_map = paddle.cast(target < 1, 'float32') - bg_map.stop_gradient = True - - neg_weights = paddle.pow(1 - target, 4) - pos_loss = 0 - paddle.log(pred) * paddle.pow(1 - pred, - self.gamma) * fg_map - - neg_loss = 0 - paddle.log(1 - pred) * paddle.pow( - pred, self.gamma) * neg_weights * bg_map - pos_loss = paddle.sum(pos_loss) - neg_loss = paddle.sum(neg_loss) - - fg_num = paddle.sum(fg_map) - ct_focal_loss = (pos_loss + neg_loss) / ( - fg_num + paddle.cast(fg_num == 0, 'float32')) - return ct_focal_loss * self.loss_weight diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/detr_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/detr_loss.py deleted file mode 100644 index d635337..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/detr_loss.py +++ /dev/null @@ -1,631 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from .iou_loss import GIoULoss -from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits -from ..bbox_utils import bbox_iou - -__all__ = ['DETRLoss', 'DINOLoss'] - - -@register -class DETRLoss(nn.Layer): - __shared__ = ['num_classes', 'use_focal_loss'] - __inject__ = ['matcher'] - - def __init__(self, - num_classes=80, - matcher='HungarianMatcher', - loss_coeff={ - 'class': 1, - 'bbox': 5, - 'giou': 2, - 'no_object': 0.1, - 'mask': 1, - 'dice': 1 - }, - aux_loss=True, - use_focal_loss=False, - use_vfl=False, - use_uni_match=False, - uni_match_ind=0): - r""" - Args: - num_classes (int): The number of classes. - matcher (HungarianMatcher): It computes an assignment between the targets - and the predictions of the network. - loss_coeff (dict): The coefficient of loss. - aux_loss (bool): If 'aux_loss = True', loss at each decoder layer are to be used. - use_focal_loss (bool): Use focal loss or not. 
- """ - super(DETRLoss, self).__init__() - - self.num_classes = num_classes - self.matcher = matcher - self.loss_coeff = loss_coeff - self.aux_loss = aux_loss - self.use_focal_loss = use_focal_loss - self.use_vfl = use_vfl - self.use_uni_match = use_uni_match - self.uni_match_ind = uni_match_ind - - if not self.use_focal_loss: - self.loss_coeff['class'] = paddle.full([num_classes + 1], - loss_coeff['class']) - self.loss_coeff['class'][-1] = loss_coeff['no_object'] - self.giou_loss = GIoULoss() - - def _get_loss_class(self, - logits, - gt_class, - match_indices, - bg_index, - num_gts, - postfix="", - iou_score=None, - gt_score=None): - # logits: [b, query, num_classes], gt_class: list[[n, 1]] - name_class = "loss_class" + postfix - - target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64') - bs, num_query_objects = target_label.shape - num_gt = sum(len(a) for a in gt_class) - if num_gt > 0: - index, updates = self._get_index_updates(num_query_objects, - gt_class, match_indices) - target_label = paddle.scatter( - target_label.reshape([-1, 1]), index, updates.astype('int64')) - target_label = target_label.reshape([bs, num_query_objects]) - if self.use_focal_loss: - target_label = F.one_hot(target_label, - self.num_classes + 1)[..., :-1] - if iou_score is not None and self.use_vfl: - if gt_score is not None: - target_score = paddle.zeros([bs, num_query_objects]) - target_score = paddle.scatter( - target_score.reshape([-1, 1]), index, gt_score) - target_score = target_score.reshape( - [bs, num_query_objects, 1]) * target_label - - target_score_iou = paddle.zeros([bs, num_query_objects]) - target_score_iou = paddle.scatter( - target_score_iou.reshape([-1, 1]), index, iou_score) - target_score_iou = target_score_iou.reshape( - [bs, num_query_objects, 1]) * target_label - target_score = paddle.multiply(target_score, - target_score_iou) - loss_ = self.loss_coeff[ - 'class'] * varifocal_loss_with_logits( - logits, target_score, target_label, - num_gts / num_query_objects) - else: - target_score = paddle.zeros([bs, num_query_objects]) - if num_gt > 0: - target_score = paddle.scatter( - target_score.reshape([-1, 1]), index, iou_score) - target_score = target_score.reshape( - [bs, num_query_objects, 1]) * target_label - loss_ = self.loss_coeff[ - 'class'] * varifocal_loss_with_logits( - logits, target_score, target_label, - num_gts / num_query_objects) - else: - loss_ = self.loss_coeff['class'] * sigmoid_focal_loss( - logits, target_label, num_gts / num_query_objects) - else: - loss_ = F.cross_entropy( - logits, target_label, weight=self.loss_coeff['class']) - return {name_class: loss_} - - def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, - postfix=""): - # boxes: [b, query, 4], gt_bbox: list[[n, 4]] - name_bbox = "loss_bbox" + postfix - name_giou = "loss_giou" + postfix - - loss = dict() - if sum(len(a) for a in gt_bbox) == 0: - loss[name_bbox] = paddle.to_tensor([0.]) - loss[name_giou] = paddle.to_tensor([0.]) - return loss - - src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox, - match_indices) - loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss( - src_bbox, target_bbox, reduction='sum') / num_gts - loss[name_giou] = self.giou_loss( - bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox)) - loss[name_giou] = loss[name_giou].sum() / num_gts - loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou] - return loss - - def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, - postfix=""): - # masks: [b, query, h, w], gt_mask: 
list[[n, H, W]] - name_mask = "loss_mask" + postfix - name_dice = "loss_dice" + postfix - - loss = dict() - if sum(len(a) for a in gt_mask) == 0: - loss[name_mask] = paddle.to_tensor([0.]) - loss[name_dice] = paddle.to_tensor([0.]) - return loss - - src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, - match_indices) - src_masks = F.interpolate( - src_masks.unsqueeze(0), - size=target_masks.shape[-2:], - mode="bilinear")[0] - loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss( - src_masks, - target_masks, - paddle.to_tensor( - [num_gts], dtype='float32')) - loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( - src_masks, target_masks, num_gts) - return loss - - def _dice_loss(self, inputs, targets, num_gts): - inputs = F.sigmoid(inputs) - inputs = inputs.flatten(1) - targets = targets.flatten(1) - numerator = 2 * (inputs * targets).sum(1) - denominator = inputs.sum(-1) + targets.sum(-1) - loss = 1 - (numerator + 1) / (denominator + 1) - return loss.sum() / num_gts - - def _get_loss_aux(self, - boxes, - logits, - gt_bbox, - gt_class, - bg_index, - num_gts, - dn_match_indices=None, - postfix="", - masks=None, - gt_mask=None, - gt_score=None): - loss_class = [] - loss_bbox, loss_giou = [], [] - loss_mask, loss_dice = [], [] - if dn_match_indices is not None: - match_indices = dn_match_indices - elif self.use_uni_match: - match_indices = self.matcher( - boxes[self.uni_match_ind], - logits[self.uni_match_ind], - gt_bbox, - gt_class, - masks=masks[self.uni_match_ind] if masks is not None else None, - gt_mask=gt_mask) - for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): - aux_masks = masks[i] if masks is not None else None - if not self.use_uni_match and dn_match_indices is None: - match_indices = self.matcher( - aux_boxes, - aux_logits, - gt_bbox, - gt_class, - masks=aux_masks, - gt_mask=gt_mask) - if self.use_vfl: - if sum(len(a) for a in gt_bbox) > 0: - src_bbox, target_bbox = self._get_src_target_assign( - aux_boxes.detach(), gt_bbox, match_indices) - iou_score = bbox_iou( - bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), - bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) - else: - iou_score = None - if gt_score is not None: - _, target_score = self._get_src_target_assign( - logits[-1].detach(), gt_score, match_indices) - else: - iou_score = None - loss_class.append( - self._get_loss_class( - aux_logits, - gt_class, - match_indices, - bg_index, - num_gts, - postfix, - iou_score, - gt_score=target_score - if gt_score is not None else None)['loss_class' + postfix]) - loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices, - num_gts, postfix) - loss_bbox.append(loss_['loss_bbox' + postfix]) - loss_giou.append(loss_['loss_giou' + postfix]) - if masks is not None and gt_mask is not None: - loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, - num_gts, postfix) - loss_mask.append(loss_['loss_mask' + postfix]) - loss_dice.append(loss_['loss_dice' + postfix]) - loss = { - "loss_class_aux" + postfix: paddle.add_n(loss_class), - "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), - "loss_giou_aux" + postfix: paddle.add_n(loss_giou) - } - if masks is not None and gt_mask is not None: - loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) - loss["loss_dice_aux" + postfix] = paddle.add_n(loss_dice) - return loss - - def _get_index_updates(self, num_query_objects, target, match_indices): - batch_idx = paddle.concat([ - paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices) - ]) - src_idx = paddle.concat([src for (src, _) 
in match_indices]) - src_idx += (batch_idx * num_query_objects) - target_assign = paddle.concat([ - paddle.gather( - t, dst, axis=0) for t, (_, dst) in zip(target, match_indices) - ]) - return src_idx, target_assign - - def _get_src_target_assign(self, src, target, match_indices): - src_assign = paddle.concat([ - paddle.gather( - t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]]) - for t, (I, _) in zip(src, match_indices) - ]) - target_assign = paddle.concat([ - paddle.gather( - t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]]) - for t, (_, J) in zip(target, match_indices) - ]) - return src_assign, target_assign - - def _get_num_gts(self, targets, dtype="float32"): - num_gts = sum(len(a) for a in targets) - num_gts = paddle.to_tensor([num_gts], dtype=dtype) - if paddle.distributed.get_world_size() > 1: - paddle.distributed.all_reduce(num_gts) - num_gts /= paddle.distributed.get_world_size() - num_gts = paddle.clip(num_gts, min=1.) - return num_gts - - def _get_prediction_loss(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_match_indices=None, - num_gts=1, - gt_score=None): - if dn_match_indices is None: - match_indices = self.matcher( - boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask) - else: - match_indices = dn_match_indices - - if self.use_vfl: - if gt_score is not None: #ssod - _, target_score = self._get_src_target_assign( - logits[-1].detach(), gt_score, match_indices) - elif sum(len(a) for a in gt_bbox) > 0: - src_bbox, target_bbox = self._get_src_target_assign( - boxes.detach(), gt_bbox, match_indices) - iou_score = bbox_iou( - bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), - bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) - else: - iou_score = None - else: - iou_score = None - - loss = dict() - loss.update( - self._get_loss_class( - logits, - gt_class, - match_indices, - self.num_classes, - num_gts, - postfix, - iou_score, - gt_score=target_score if gt_score is not None else None)) - loss.update( - self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts, - postfix)) - if masks is not None and gt_mask is not None: - loss.update( - self._get_loss_mask(masks, gt_mask, match_indices, num_gts, - postfix)) - return loss - - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - gt_score=None, - **kwargs): - r""" - Args: - boxes (Tensor): [l, b, query, 4] - logits (Tensor): [l, b, query, num_classes] - gt_bbox (List(Tensor)): list[[n, 4]] - gt_class (List(Tensor)): list[[n, 1]] - masks (Tensor, optional): [l, b, query, h, w] - gt_mask (List(Tensor), optional): list[[n, H, W]] - postfix (str): postfix of loss name - """ - - dn_match_indices = kwargs.get("dn_match_indices", None) - num_gts = kwargs.get("num_gts", None) - if num_gts is None: - num_gts = self._get_num_gts(gt_class) - - total_loss = self._get_prediction_loss( - boxes[-1], - logits[-1], - gt_bbox, - gt_class, - masks=masks[-1] if masks is not None else None, - gt_mask=gt_mask, - postfix=postfix, - dn_match_indices=dn_match_indices, - num_gts=num_gts, - gt_score=gt_score if gt_score is not None else None) - - if self.aux_loss: - total_loss.update( - self._get_loss_aux( - boxes[:-1], - logits[:-1], - gt_bbox, - gt_class, - self.num_classes, - num_gts, - dn_match_indices, - postfix, - masks=masks[:-1] if masks is not None else None, - gt_mask=gt_mask, - gt_score=gt_score if gt_score is not None else None)) - - return total_loss - - -@register -class DINOLoss(DETRLoss): - def forward(self, - 
boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_out_bboxes=None, - dn_out_logits=None, - dn_meta=None, - gt_score=None, - **kwargs): - num_gts = self._get_num_gts(gt_class) - total_loss = super(DINOLoss, self).forward( - boxes, - logits, - gt_bbox, - gt_class, - num_gts=num_gts, - gt_score=gt_score) - - if dn_meta is not None: - dn_positive_idx, dn_num_group = \ - dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] - assert len(gt_class) == len(dn_positive_idx) - - # denoising match indices - dn_match_indices = self.get_dn_match_indices( - gt_class, dn_positive_idx, dn_num_group) - - # compute denoising training loss - num_gts *= dn_num_group - dn_loss = super(DINOLoss, self).forward( - dn_out_bboxes, - dn_out_logits, - gt_bbox, - gt_class, - postfix="_dn", - dn_match_indices=dn_match_indices, - num_gts=num_gts, - gt_score=gt_score) - total_loss.update(dn_loss) - else: - total_loss.update( - {k + '_dn': paddle.to_tensor([0.]) - for k in total_loss.keys()}) - - return total_loss - - @staticmethod - def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): - dn_match_indices = [] - for i in range(len(labels)): - num_gt = len(labels[i]) - if num_gt > 0: - gt_idx = paddle.arange(end=num_gt, dtype="int64") - gt_idx = gt_idx.tile([dn_num_group]) - assert len(dn_positive_idx[i]) == len(gt_idx) - dn_match_indices.append((dn_positive_idx[i], gt_idx)) - else: - dn_match_indices.append((paddle.zeros( - [0], dtype="int64"), paddle.zeros( - [0], dtype="int64"))) - return dn_match_indices - - -@register -class MaskDINOLoss(DETRLoss): - __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points'] - __inject__ = ['matcher'] - - def __init__(self, - num_classes=80, - matcher='HungarianMatcher', - loss_coeff={ - 'class': 4, - 'bbox': 5, - 'giou': 2, - 'mask': 5, - 'dice': 5 - }, - aux_loss=True, - use_focal_loss=False, - num_sample_points=12544, - oversample_ratio=3.0, - important_sample_ratio=0.75): - super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff, - aux_loss, use_focal_loss) - assert oversample_ratio >= 1 - assert important_sample_ratio <= 1 and important_sample_ratio >= 0 - - self.num_sample_points = num_sample_points - self.oversample_ratio = oversample_ratio - self.important_sample_ratio = important_sample_ratio - self.num_oversample_points = int(num_sample_points * oversample_ratio) - self.num_important_points = int(num_sample_points * - important_sample_ratio) - self.num_random_points = num_sample_points - self.num_important_points - - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_out_bboxes=None, - dn_out_logits=None, - dn_out_masks=None, - dn_meta=None, - **kwargs): - num_gts = self._get_num_gts(gt_class) - total_loss = super(MaskDINOLoss, self).forward( - boxes, - logits, - gt_bbox, - gt_class, - masks=masks, - gt_mask=gt_mask, - num_gts=num_gts) - - if dn_meta is not None: - dn_positive_idx, dn_num_group = \ - dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] - assert len(gt_class) == len(dn_positive_idx) - - # denoising match indices - dn_match_indices = DINOLoss.get_dn_match_indices( - gt_class, dn_positive_idx, dn_num_group) - - # compute denoising training loss - num_gts *= dn_num_group - dn_loss = super(MaskDINOLoss, self).forward( - dn_out_bboxes, - dn_out_logits, - gt_bbox, - gt_class, - masks=dn_out_masks, - gt_mask=gt_mask, - postfix="_dn", - dn_match_indices=dn_match_indices, - num_gts=num_gts) - total_loss.update(dn_loss) - else: - 
total_loss.update(
-                {k + '_dn': paddle.to_tensor([0.])
-                 for k in total_loss.keys()})
-
-        return total_loss
-
-    def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts,
-                       postfix=""):
-        # masks: [b, query, h, w], gt_mask: list[[n, H, W]]
-        name_mask = "loss_mask" + postfix
-        name_dice = "loss_dice" + postfix
-
-        loss = dict()
-        if sum(len(a) for a in gt_mask) == 0:
-            loss[name_mask] = paddle.to_tensor([0.])
-            loss[name_dice] = paddle.to_tensor([0.])
-            return loss
-
-        src_masks, target_masks = self._get_src_target_assign(masks, gt_mask,
-                                                              match_indices)
-        # sample points
-        sample_points = self._get_point_coords_by_uncertainty(src_masks)
-        sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0
-
-        src_masks = F.grid_sample(
-            src_masks.unsqueeze(1), sample_points,
-            align_corners=False).squeeze([1, 2])
-
-        target_masks = F.grid_sample(
-            target_masks.unsqueeze(1), sample_points,
-            align_corners=False).squeeze([1, 2]).detach()
-
-        loss[name_mask] = self.loss_coeff[
-            'mask'] * F.binary_cross_entropy_with_logits(
-                src_masks, target_masks,
-                reduction='none').mean(1).sum() / num_gts
-        loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss(
-            src_masks, target_masks, num_gts)
-        return loss
-
-    def _get_point_coords_by_uncertainty(self, masks):
-        # Sample points based on their uncertainty.
-        masks = masks.detach()
-        num_masks = masks.shape[0]
-        sample_points = paddle.rand(
-            [num_masks, 1, self.num_oversample_points, 2])
-
-        out_mask = F.grid_sample(
-            masks.unsqueeze(1), 2.0 * sample_points - 1.0,
-            align_corners=False).squeeze([1, 2])
-        out_mask = -paddle.abs(out_mask)
-
-        _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1)
-        batch_ind = paddle.arange(end=num_masks, dtype=topk_ind.dtype)
-        batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_important_points])
-        topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1)
-
-        sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind)
-        if self.num_random_points > 0:
-            sample_points = paddle.concat(
-                [
-                    sample_points,
-                    paddle.rand([num_masks, self.num_random_points, 2])
-                ],
-                axis=1)
-        return sample_points
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/fairmot_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/fairmot_loss.py
deleted file mode 100644
index e24ff33..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/fairmot_loss.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
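The `_get_point_coords_by_uncertainty` routine deleted above implements importance sampling over mask logits: oversample random points, score uncertainty as the negative absolute logit, and keep the top-k. A standalone sketch of that idea; the function name and defaults are illustrative, not the deleted API:

import paddle
import paddle.nn.functional as F

def sample_uncertain_points(mask_logits, num_points=16, oversample=3.0):
    # mask_logits: [n, H, W]; points with small |logit| are least certain
    n = mask_logits.shape[0]
    num_over = int(num_points * oversample)
    pts = paddle.rand([n, 1, num_over, 2])            # coords in [0, 1)
    vals = F.grid_sample(
        mask_logits.unsqueeze(1), 2.0 * pts - 1.0,    # grid expects [-1, 1]
        align_corners=False).squeeze([1, 2])          # [n, num_over]
    uncertainty = -paddle.abs(vals)                   # larger = less certain
    _, topk = paddle.topk(uncertainty, num_points, axis=1)
    batch = paddle.arange(n).unsqueeze(-1).tile([1, num_points])
    ind = paddle.stack([batch, topk], axis=-1)
    return paddle.gather_nd(pts.squeeze(1), ind)      # [n, num_points, 2]

print(sample_uncertain_points(paddle.randn([2, 32, 32])).shape)

Supervising only these sampled points, instead of every mask pixel, is what keeps the mask loss cheap at high resolution.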
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -from paddle.nn.initializer import Constant -from ppdet.core.workspace import register - -__all__ = ['FairMOTLoss'] - - -@register -class FairMOTLoss(nn.Layer): - def __init__(self): - super(FairMOTLoss, self).__init__() - self.det_weight = self.create_parameter( - shape=[1], default_initializer=Constant(-1.85)) - self.reid_weight = self.create_parameter( - shape=[1], default_initializer=Constant(-1.05)) - - def forward(self, det_loss, reid_loss): - loss = paddle.exp(-self.det_weight) * det_loss + paddle.exp( - -self.reid_weight) * reid_loss + (self.det_weight + self.reid_weight - ) - loss *= 0.5 - return {'loss': loss} diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/fcos_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/fcos_loss.py deleted file mode 100644 index e9bbc27..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/fcos_loss.py +++ /dev/null @@ -1,1020 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from ppdet.modeling import ops -from functools import partial - -__all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR'] - - -def flatten_tensor(inputs, channel_first=False): - """ - Flatten a Tensor - Args: - inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C] - channel_first (bool): If true the dimension order of Tensor is - [N, C, H, W], otherwise is [N, H, W, C] - Return: - output_channel_last (Tensor): The flattened Tensor in channel_last style - """ - if channel_first: - input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1]) - else: - input_channel_last = inputs - output_channel_last = paddle.flatten( - input_channel_last, start_axis=0, stop_axis=2) - return output_channel_last - - -@register -class FCOSLoss(nn.Layer): - """ - FCOSLoss - Args: - loss_alpha (float): alpha in focal loss - loss_gamma (float): gamma in focal loss - iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU - reg_weights (float): weight for location loss - quality (str): quality branch, centerness/iou - """ - - def __init__(self, - loss_alpha=0.25, - loss_gamma=2.0, - iou_loss_type="giou", - reg_weights=1.0, - quality='centerness'): - super(FCOSLoss, self).__init__() - self.loss_alpha = loss_alpha - self.loss_gamma = loss_gamma - self.iou_loss_type = iou_loss_type - self.reg_weights = reg_weights - self.quality = quality - - def _iou_loss(self, - pred, - targets, - positive_mask, - weights=None, - return_iou=False): - """ - Calculate the loss for location prediction - Args: - pred (Tensor): bounding boxes prediction - targets (Tensor): targets for positive samples - positive_mask (Tensor): mask of positive 
samples - weights (Tensor): weights for each positive samples - Return: - loss (Tensor): location loss - """ - plw = pred[:, 0] * positive_mask - pth = pred[:, 1] * positive_mask - prw = pred[:, 2] * positive_mask - pbh = pred[:, 3] * positive_mask - - tlw = targets[:, 0] * positive_mask - tth = targets[:, 1] * positive_mask - trw = targets[:, 2] * positive_mask - tbh = targets[:, 3] * positive_mask - tlw.stop_gradient = True - trw.stop_gradient = True - tth.stop_gradient = True - tbh.stop_gradient = True - - ilw = paddle.minimum(plw, tlw) - irw = paddle.minimum(prw, trw) - ith = paddle.minimum(pth, tth) - ibh = paddle.minimum(pbh, tbh) - - clw = paddle.maximum(plw, tlw) - crw = paddle.maximum(prw, trw) - cth = paddle.maximum(pth, tth) - cbh = paddle.maximum(pbh, tbh) - - area_predict = (plw + prw) * (pth + pbh) - area_target = (tlw + trw) * (tth + tbh) - area_inter = (ilw + irw) * (ith + ibh) - ious = (area_inter + 1.0) / ( - area_predict + area_target - area_inter + 1.0) - ious = ious * positive_mask - - if return_iou: - return ious - - if self.iou_loss_type.lower() == "linear_iou": - loss = 1.0 - ious - elif self.iou_loss_type.lower() == "giou": - area_uniou = area_predict + area_target - area_inter - area_circum = (clw + crw) * (cth + cbh) + 1e-7 - giou = ious - (area_circum - area_uniou) / area_circum - loss = 1.0 - giou - elif self.iou_loss_type.lower() == "iou": - loss = 0.0 - paddle.log(ious) - else: - raise KeyError - if weights is not None: - loss = loss * weights - return loss - - def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, - tag_bboxes, tag_center): - """ - Calculate the loss for classification, location and centerness - Args: - cls_logits (list): list of Tensor, which is predicted - score for all anchor points with shape [N, M, C] - bboxes_reg (list): list of Tensor, which is predicted - offsets for all anchor points with shape [N, M, 4] - centerness (list): list of Tensor, which is predicted - centerness for all anchor points with shape [N, M, 1] - tag_labels (list): list of Tensor, which is category - targets for each anchor point - tag_bboxes (list): list of Tensor, which is bounding - boxes targets for positive samples - tag_center (list): list of Tensor, which is centerness - targets for positive samples - Return: - loss (dict): loss composed by classification loss, bounding box - """ - cls_logits_flatten_list = [] - bboxes_reg_flatten_list = [] - centerness_flatten_list = [] - tag_labels_flatten_list = [] - tag_bboxes_flatten_list = [] - tag_center_flatten_list = [] - num_lvl = len(cls_logits) - for lvl in range(num_lvl): - cls_logits_flatten_list.append( - flatten_tensor(cls_logits[lvl], True)) - bboxes_reg_flatten_list.append( - flatten_tensor(bboxes_reg[lvl], True)) - centerness_flatten_list.append( - flatten_tensor(centerness[lvl], True)) - - tag_labels_flatten_list.append( - flatten_tensor(tag_labels[lvl], False)) - tag_bboxes_flatten_list.append( - flatten_tensor(tag_bboxes[lvl], False)) - tag_center_flatten_list.append( - flatten_tensor(tag_center[lvl], False)) - - cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) - bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) - centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) - - tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) - tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) - tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) - tag_labels_flatten.stop_gradient = True - 
tag_bboxes_flatten.stop_gradient = True - tag_center_flatten.stop_gradient = True - - mask_positive_bool = tag_labels_flatten > 0 - mask_positive_bool.stop_gradient = True - mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") - mask_positive_float.stop_gradient = True - - num_positive_fp32 = paddle.sum(mask_positive_float) - num_positive_fp32.stop_gradient = True - num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") - num_positive_int32 = num_positive_int32 * 0 + 1 - num_positive_int32.stop_gradient = True - - normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) - normalize_sum.stop_gradient = True - - # 1. cls_logits: sigmoid_focal_loss - # expand onehot labels - num_classes = cls_logits_flatten.shape[-1] - tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1) - tag_labels_flatten_bin = F.one_hot( - tag_labels_flatten, num_classes=1 + num_classes) - tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:] - # sigmoid_focal_loss - cls_loss = F.sigmoid_focal_loss( - cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32 - - if self.quality == 'centerness': - # 2. bboxes_reg: giou_loss - mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) - tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) - reg_loss = self._iou_loss( - bboxes_reg_flatten, - tag_bboxes_flatten, - mask_positive_float, - weights=tag_center_flatten) - reg_loss = reg_loss * mask_positive_float / normalize_sum - - # 3. centerness: sigmoid_cross_entropy_with_logits_loss - centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1) - quality_loss = ops.sigmoid_cross_entropy_with_logits( - centerness_flatten, tag_center_flatten) - quality_loss = quality_loss * mask_positive_float / num_positive_fp32 - - elif self.quality == 'iou': - # 2. bboxes_reg: giou_loss - mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) - tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) - reg_loss = self._iou_loss( - bboxes_reg_flatten, - tag_bboxes_flatten, - mask_positive_float, - weights=None) - reg_loss = reg_loss * mask_positive_float / num_positive_fp32 - # num_positive_fp32 is num_foreground - - # 3. 
centerness: sigmoid_cross_entropy_with_logits_loss
-            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
-            gt_ious = self._iou_loss(
-                bboxes_reg_flatten,
-                tag_bboxes_flatten,
-                mask_positive_float,
-                weights=None,
-                return_iou=True)
-            quality_loss = ops.sigmoid_cross_entropy_with_logits(
-                centerness_flatten, gt_ious)
-            quality_loss = quality_loss * mask_positive_float / num_positive_fp32
-        else:
-            raise Exception(f'Unknown quality type: {self.quality}')
-
-        loss_all = {
-            "loss_cls": paddle.sum(cls_loss),
-            "loss_box": paddle.sum(reg_loss),
-            "loss_quality": paddle.sum(quality_loss),
-        }
-        return loss_all
-
-
-@register
-class FCOSLossMILC(FCOSLoss):
-    """
-    FCOSLossMILC for ARSL in semi-det(ssod)
-    Args:
-        loss_alpha (float): alpha in focal loss
-        loss_gamma (float): gamma in focal loss
-        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
-        reg_weights (float): weight for location loss
-    """
-
-    def __init__(self,
-                 loss_alpha=0.25,
-                 loss_gamma=2.0,
-                 iou_loss_type="giou",
-                 reg_weights=1.0):
-        super(FCOSLossMILC, self).__init__()
-        self.loss_alpha = loss_alpha
-        self.loss_gamma = loss_gamma
-        self.iou_loss_type = iou_loss_type
-        self.reg_weights = reg_weights
-
-    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
-        """
-        Calculate the loss for location prediction
-        Args:
-            pred (Tensor): bounding boxes prediction
-            targets (Tensor): targets for positive samples
-            weights (Tensor): weights for each positive samples
-        Return:
-            loss (Tensor): location loss
-        """
-        plw = pred[:, 0]
-        pth = pred[:, 1]
-        prw = pred[:, 2]
-        pbh = pred[:, 3]
-
-        tlw = targets[:, 0]
-        tth = targets[:, 1]
-        trw = targets[:, 2]
-        tbh = targets[:, 3]
-        tlw.stop_gradient = True
-        trw.stop_gradient = True
-        tth.stop_gradient = True
-        tbh.stop_gradient = True
-
-        ilw = paddle.minimum(plw, tlw)
-        irw = paddle.minimum(prw, trw)
-        ith = paddle.minimum(pth, tth)
-        ibh = paddle.minimum(pbh, tbh)
-
-        clw = paddle.maximum(plw, tlw)
-        crw = paddle.maximum(prw, trw)
-        cth = paddle.maximum(pth, tth)
-        cbh = paddle.maximum(pbh, tbh)
-
-        area_predict = (plw + prw) * (pth + pbh)
-        area_target = (tlw + trw) * (tth + tbh)
-        area_inter = (ilw + irw) * (ith + ibh)
-        ious = (area_inter + 1.0) / (
-            area_predict + area_target - area_inter + 1.0)
-        ious = ious
-
-        if self.iou_loss_type.lower() == "linear_iou":
-            loss = 1.0 - ious
-        elif self.iou_loss_type.lower() == "giou":
-            area_uniou = area_predict + area_target - area_inter
-            area_circum = (clw + crw) * (cth + cbh) + 1e-7
-            giou = ious - (area_circum - area_uniou) / area_circum
-            loss = 1.0 - giou
-        elif self.iou_loss_type.lower() == "iou":
-            loss = 0.0 - paddle.log(ious)
-        else:
-            raise KeyError
-        if weights is not None:
-            loss = loss * weights
-        loss = paddle.sum(loss)
-        if avg_factor is not None:
-            loss = loss / avg_factor
-        return loss
-
-    # temp function: calculate iou between bbox and target
-    def _bbox_overlap_align(self, pred, targets):
-        assert pred.shape[0] == targets.shape[0], \
-            'the pred should be aligned with target.'
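The `iou_loss` above works directly on (l, t, r, b) distances from a shared anchor point rather than on corner coordinates. A compact sketch of the same GIoU computation, hedged as an illustration: the helper name is invented, and it uses a small epsilon instead of the +1.0 smoothing in the deleted code.

import paddle

def ltrb_giou(pred, target, eps=1e-7):
    # pred/target: [n, 4] offsets (l, t, r, b) from the same anchor point
    inter_w = paddle.minimum(pred[:, 0], target[:, 0]) + paddle.minimum(pred[:, 2], target[:, 2])
    inter_h = paddle.minimum(pred[:, 1], target[:, 1]) + paddle.minimum(pred[:, 3], target[:, 3])
    enc_w = paddle.maximum(pred[:, 0], target[:, 0]) + paddle.maximum(pred[:, 2], target[:, 2])
    enc_h = paddle.maximum(pred[:, 1], target[:, 1]) + paddle.maximum(pred[:, 3], target[:, 3])
    area_p = (pred[:, 0] + pred[:, 2]) * (pred[:, 1] + pred[:, 3])
    area_t = (target[:, 0] + target[:, 2]) * (target[:, 1] + target[:, 3])
    inter = inter_w * inter_h
    union = area_p + area_t - inter
    iou = inter / (union + eps)
    enclose = enc_w * enc_h + eps
    return iou - (enclose - union) / enclose  # GIoU in [-1, 1]

p = paddle.to_tensor([[1., 1., 1., 1.]])   # 2x2 box around the anchor
t = paddle.to_tensor([[1., 1., 2., 2.]])   # 3x3 box around the anchor
print(float(ltrb_giou(p, t)))              # IoU 4/9, enclosing box adds no penalty

Because both boxes contain the anchor point, the min of paired offsets gives the intersection extents and the max gives the enclosing box, with no corner conversion needed.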
- - plw = pred[:, 0] - pth = pred[:, 1] - prw = pred[:, 2] - pbh = pred[:, 3] - - tlw = targets[:, 0] - tth = targets[:, 1] - trw = targets[:, 2] - tbh = targets[:, 3] - - ilw = paddle.minimum(plw, tlw) - irw = paddle.minimum(prw, trw) - ith = paddle.minimum(pth, tth) - ibh = paddle.minimum(pbh, tbh) - - area_predict = (plw + prw) * (pth + pbh) - area_target = (tlw + trw) * (tth + tbh) - area_inter = (ilw + irw) * (ith + ibh) - ious = (area_inter + 1.0) / ( - area_predict + area_target - area_inter + 1.0) - - return ious - - def iou_based_soft_label_loss(self, - pred, - target, - alpha=0.75, - gamma=2.0, - iou_weighted=False, - implicit_iou=None, - avg_factor=None): - assert pred.shape == target.shape - pred = F.sigmoid(pred) - target = target.cast(pred.dtype) - - if implicit_iou is not None: - pred = pred * implicit_iou - - if iou_weighted: - focal_weight = (pred - target).abs().pow(gamma) * target * (target > 0.0).cast('float32') + \ - alpha * (pred - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - else: - focal_weight = (pred - target).abs().pow(gamma) * (target > 0.0).cast('float32') + \ - alpha * (pred - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - - # focal loss - loss = F.binary_cross_entropy( - pred, target, reduction='none') * focal_weight - if avg_factor is not None: - loss = loss / avg_factor - return loss - - def forward(self, cls_logits, bboxes_reg, centerness, tag_labels, - tag_bboxes, tag_center): - """ - Calculate the loss for classification, location and centerness - Args: - cls_logits (list): list of Tensor, which is predicted - score for all anchor points with shape [N, M, C] - bboxes_reg (list): list of Tensor, which is predicted - offsets for all anchor points with shape [N, M, 4] - centerness (list): list of Tensor, which is predicted - centerness for all anchor points with shape [N, M, 1] - tag_labels (list): list of Tensor, which is category - targets for each anchor point - tag_bboxes (list): list of Tensor, which is bounding - boxes targets for positive samples - tag_center (list): list of Tensor, which is centerness - targets for positive samples - Return: - loss (dict): loss composed by classification loss, bounding box - """ - cls_logits_flatten_list = [] - bboxes_reg_flatten_list = [] - centerness_flatten_list = [] - tag_labels_flatten_list = [] - tag_bboxes_flatten_list = [] - tag_center_flatten_list = [] - num_lvl = len(cls_logits) - for lvl in range(num_lvl): - cls_logits_flatten_list.append( - flatten_tensor(cls_logits[lvl], True)) - bboxes_reg_flatten_list.append( - flatten_tensor(bboxes_reg[lvl], True)) - centerness_flatten_list.append( - flatten_tensor(centerness[lvl], True)) - - tag_labels_flatten_list.append( - flatten_tensor(tag_labels[lvl], False)) - tag_bboxes_flatten_list.append( - flatten_tensor(tag_bboxes[lvl], False)) - tag_center_flatten_list.append( - flatten_tensor(tag_center[lvl], False)) - - cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0) - bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0) - centerness_flatten = paddle.concat(centerness_flatten_list, axis=0) - - tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0) - tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0) - tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0) - tag_labels_flatten.stop_gradient = True - tag_bboxes_flatten.stop_gradient = True - tag_center_flatten.stop_gradient = True - - # find positive index - mask_positive_bool = tag_labels_flatten > 0 - 
mask_positive_bool.stop_gradient = True - mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32") - mask_positive_float.stop_gradient = True - - num_positive_fp32 = paddle.sum(mask_positive_float) - num_positive_fp32.stop_gradient = True - num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32") - num_positive_int32 = num_positive_int32 * 0 + 1 - num_positive_int32.stop_gradient = True - - # centerness target is used as reg weight - normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float) - normalize_sum.stop_gradient = True - - # 1. IoU-Based soft label loss - # calculate iou - with paddle.no_grad(): - pos_ind = paddle.nonzero( - tag_labels_flatten.reshape([-1]) > 0).reshape([-1]) - pos_pred = bboxes_reg_flatten[pos_ind] - pos_target = tag_bboxes_flatten[pos_ind] - bbox_iou = self._bbox_overlap_align(pos_pred, pos_target) - # pos labels - pos_labels = tag_labels_flatten[pos_ind].squeeze(1) - cls_target = paddle.zeros(cls_logits_flatten.shape) - cls_target[pos_ind, pos_labels - 1] = bbox_iou - cls_loss = self.iou_based_soft_label_loss( - cls_logits_flatten, - cls_target, - implicit_iou=F.sigmoid(centerness_flatten), - avg_factor=num_positive_fp32) - - # 2. bboxes_reg: giou_loss - mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1) - tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1) - reg_loss = self._iou_loss( - bboxes_reg_flatten, - tag_bboxes_flatten, - mask_positive_float, - weights=tag_center_flatten) - reg_loss = reg_loss * mask_positive_float / normalize_sum - - # 3. iou loss - pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind] - loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred, bbox_iou) - loss_iou = loss_iou / num_positive_fp32 * 0.5 - - loss_all = { - "loss_cls": paddle.sum(cls_loss), - "loss_box": paddle.sum(reg_loss), - 'loss_iou': paddle.sum(loss_iou), - } - - return loss_all - - -# Concat multi-level feature maps by image -def levels_to_images(mlvl_tensor): - batch_size = mlvl_tensor[0].shape[0] - batch_list = [[] for _ in range(batch_size)] - channels = mlvl_tensor[0].shape[1] - for t in mlvl_tensor: - t = t.transpose([0, 2, 3, 1]) - t = t.reshape([batch_size, -1, channels]) - for img in range(batch_size): - batch_list[img].append(t[img]) - return [paddle.concat(item, axis=0) for item in batch_list] - - -def multi_apply(func, *args, **kwargs): - """Apply function to a list of arguments. - - Note: - This function applies the ``func`` to multiple inputs and - map the multiple outputs of the ``func`` into different - list. Each list contains the same type of outputs corresponding - to different inputs. 
-
-    Args:
-        func (Function): A function that will be applied to a list of
-            arguments
-
-    Returns:
-        tuple(list): A tuple containing multiple lists, where each list \
-            contains one kind of result returned by the function
-    """
-    pfunc = partial(func, **kwargs) if kwargs else func
-    map_results = map(pfunc, *args)
-    return tuple(map(list, zip(*map_results)))
-
-
-@register
-class FCOSLossCR(FCOSLossMILC):
-    """
-    FCOSLoss of Consistency Regularization
-    """
-
-    def __init__(self,
-                 iou_loss_type="giou",
-                 cls_weight=2.0,
-                 reg_weight=2.0,
-                 iou_weight=0.5,
-                 hard_neg_mining_flag=True):
-        super(FCOSLossCR, self).__init__()
-        self.iou_loss_type = iou_loss_type
-        self.cls_weight = cls_weight
-        self.reg_weight = reg_weight
-        self.iou_weight = iou_weight
-        self.hard_neg_mining_flag = hard_neg_mining_flag
-
-    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
-        """
-        Calculate the loss for location prediction
-        Args:
-            pred (Tensor): bounding boxes prediction
-            targets (Tensor): targets for positive samples
-            weights (Tensor): weights for each positive samples
-        Return:
-            loss (Tensor): location loss
-        """
-        plw = pred[:, 0]
-        pth = pred[:, 1]
-        prw = pred[:, 2]
-        pbh = pred[:, 3]
-
-        tlw = targets[:, 0]
-        tth = targets[:, 1]
-        trw = targets[:, 2]
-        tbh = targets[:, 3]
-        tlw.stop_gradient = True
-        trw.stop_gradient = True
-        tth.stop_gradient = True
-        tbh.stop_gradient = True
-
-        ilw = paddle.minimum(plw, tlw)
-        irw = paddle.minimum(prw, trw)
-        ith = paddle.minimum(pth, tth)
-        ibh = paddle.minimum(pbh, tbh)
-
-        clw = paddle.maximum(plw, tlw)
-        crw = paddle.maximum(prw, trw)
-        cth = paddle.maximum(pth, tth)
-        cbh = paddle.maximum(pbh, tbh)
-
-        area_predict = (plw + prw) * (pth + pbh)
-        area_target = (tlw + trw) * (tth + tbh)
-        area_inter = (ilw + irw) * (ith + ibh)
-        ious = (area_inter + 1.0) / (
-            area_predict + area_target - area_inter + 1.0)
-        ious = ious
-
-        if self.iou_loss_type.lower() == "linear_iou":
-            loss = 1.0 - ious
-        elif self.iou_loss_type.lower() == "giou":
-            area_uniou = area_predict + area_target - area_inter
-            area_circum = (clw + crw) * (cth + cbh) + 1e-7
-            giou = ious - (area_circum - area_uniou) / area_circum
-            loss = 1.0 - giou
-        elif self.iou_loss_type.lower() == "iou":
-            loss = 0.0 - paddle.log(ious)
-        else:
-            raise KeyError
-        if weights is not None:
-            loss = loss * weights
-        loss = paddle.sum(loss)
-        if avg_factor is not None:
-            loss = loss / avg_factor
-        return loss
-
-    # calculate iou between bbox and target
-    def bbox_overlap_align(self, pred, targets):
-        assert pred.shape[0] == targets.shape[0], \
-            'the pred should be aligned with target.'
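The `multi_apply` helper defined above fans a function out over per-image argument lists and regroups each of its outputs into its own list. A toy usage sketch, self-contained so it restates the helper; the `stats` function is invented for illustration:

from functools import partial

def multi_apply(func, *args, **kwargs):
    # same shape as the deleted helper: map, then unzip the per-call tuples
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))

def stats(xs, scale=1.0):
    return min(xs) * scale, max(xs) * scale

mins, maxs = multi_apply(stats, [[1, 2], [3, 9]], scale=10.0)
print(mins, maxs)  # [10.0, 30.0] [20.0, 90.0]

This is exactly how `get_targets_per_img` below is applied image by image while each target kind (masks, cls, loc, iou) comes back as a batched list.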
-
-        plw = pred[:, 0]
-        pth = pred[:, 1]
-        prw = pred[:, 2]
-        pbh = pred[:, 3]
-
-        tlw = targets[:, 0]
-        tth = targets[:, 1]
-        trw = targets[:, 2]
-        tbh = targets[:, 3]
-
-        ilw = paddle.minimum(plw, tlw)
-        irw = paddle.minimum(prw, trw)
-        ith = paddle.minimum(pth, tth)
-        ibh = paddle.minimum(pbh, tbh)
-
-        area_predict = (plw + prw) * (pth + pbh)
-        area_target = (tlw + trw) * (tth + tbh)
-        area_inter = (ilw + irw) * (ith + ibh)
-        ious = (area_inter + 1.0) / (
-            area_predict + area_target - area_inter + 1.0)
-        return ious
-
-    # cls loss: iou-based soft label with joint iou
-    def quality_focal_loss(self,
-                           stu_cls,
-                           targets,
-                           quality=None,
-                           weights=None,
-                           alpha=0.75,
-                           gamma=2.0,
-                           avg_factor='sum'):
-        stu_cls = F.sigmoid(stu_cls)
-        if quality is not None:
-            stu_cls = stu_cls * F.sigmoid(quality)
-
-        focal_weight = (stu_cls - targets).abs().pow(gamma) * (targets > 0.0).cast('float32') + \
-            alpha * (stu_cls - targets).abs().pow(gamma) * \
-            (targets <= 0.0).cast('float32')
-
-        loss = F.binary_cross_entropy(
-            stu_cls, targets, reduction='none') * focal_weight
-
-        if weights is not None:
-            loss = loss * weights.reshape([-1, 1])
-        loss = paddle.sum(loss)
-        if avg_factor is not None:
-            loss = loss / avg_factor
-        return loss
-
-    # generate points according to feature maps
-    def compute_locations_by_level(self, fpn_stride, h, w):
-        """
-        Compute locations of anchor points of each FPN layer
-        Return:
-            Anchor points locations of current FPN feature map
-        """
-        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
-        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
-        shift_x = paddle.unsqueeze(shift_x, axis=0)
-        shift_y = paddle.unsqueeze(shift_y, axis=1)
-        shift_x = paddle.expand(shift_x, shape=[h, w])
-        shift_y = paddle.expand(shift_y, shape=[h, w])
-        shift_x = paddle.reshape(shift_x, shape=[-1])
-        shift_y = paddle.reshape(shift_y, shape=[-1])
-        location = paddle.stack(
-            [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2
-        return location
-
-    # decode bbox from ltrb to x1y1x2y2
-    def decode_bbox(self, ltrb, points):
-        assert ltrb.shape[0] == points.shape[0], \
-            "When decoding bbox in one image, the num of loc should be same with points."
-        bbox_decoding = paddle.stack(
-            [
-                points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
-                points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]
-            ],
-            axis=1)
-        return bbox_decoding
-
-    # encode bbox from x1y1x2y2 to ltrb
-    def encode_bbox(self, bbox, points):
-        assert bbox.shape[0] == points.shape[0], \
-            "When encoding bbox in one image, the num of bbox should be same with points."
-        bbox_encoding = paddle.stack(
-            [
-                points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],
-                bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]
-            ],
-            axis=1)
-        return bbox_encoding
-
-    def calculate_iou(self, gt_bbox, predict_bbox):
-        # bbox area
-        gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \
-            (gt_bbox[:, 3] - gt_bbox[:, 1])
-        predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \
-            (predict_bbox[:, 3] - predict_bbox[:, 1])
-        # overlap area
-        lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2])
-        rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:])
-        wh = paddle.clip(rb - lt, min=0)
-        overlap = wh[..., 0] * wh[..., 1]
-        # iou
-        iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap)
-        return iou
-
-    # select potential positives from hard negatives
-    def hard_neg_mining(self,
-                        cls_score,
-                        loc_ltrb,
-                        quality,
-                        pos_ind,
-                        hard_neg_ind,
-                        loc_mask,
-                        loc_targets,
-                        iou_thresh=0.6):
-        # get points locations and strides
-        points_list = []
-        strides_list = []
-        scale_list = []
-        scale = [0, 1, 2, 3, 4]
-        for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride,
-                                             self.lvl_hw):
-            h, w = HW
-            lvl_points = self.compute_locations_by_level(fpn_stride, h, w)
-            points_list.append(lvl_points)
-            lvl_strides = paddle.full([h * w, 1], fpn_stride)
-            strides_list.append(lvl_strides)
-            lvl_scales = paddle.full([h * w, 1], fpn_scale)
-            scale_list.append(lvl_scales)
-        points = paddle.concat(points_list, axis=0)
-        strides = paddle.concat(strides_list, axis=0)
-        scales = paddle.concat(scale_list, axis=0)
-
-        # cls scores
-        cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality)
-        max_vals = paddle.max(cls_vals, axis=-1)
-        class_ind = paddle.argmax(cls_vals, axis=-1)
-
-        ### calculate iou between positive and hard negative
-        # decode pos bbox
-        pos_cls = max_vals[pos_ind]
-        pos_loc = loc_ltrb[pos_ind].reshape([-1, 4])
-        pos_strides = strides[pos_ind]
-        pos_points = points[pos_ind].reshape([-1, 2])
-        pos_loc = pos_loc * pos_strides
-        pos_bbox = self.decode_bbox(pos_loc, pos_points)
-        pos_scales = scales[pos_ind]
-        # decode hard negative bbox
-        hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4])
-        hard_neg_strides = strides[hard_neg_ind]
-        hard_neg_points = points[hard_neg_ind].reshape([-1, 2])
-        hard_neg_loc = hard_neg_loc * hard_neg_strides
-        hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points)
-        hard_neg_scales = scales[hard_neg_ind]
-        # iou between pos bbox and hard negative bbox
-        hard_neg_pos_iou = self.calculate_iou(hard_neg_bbox, pos_bbox)
-
-        ### select potential positives from hard negatives
-        # scale flag
-        scale_temp = paddle.abs(
-            pos_scales.reshape([-1])[None, :] - hard_neg_scales.reshape([-1])
-            [:, None])
-        scale_flag = (scale_temp <= 1.)
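The `decode_bbox`/`encode_bbox` pair above is an exact inverse for boxes paired with their own anchor points. A quick self-contained round-trip check; these are standalone restatements for illustration, not the deleted methods themselves:

import paddle

def decode_bbox(ltrb, points):
    # (l, t, r, b) offsets plus anchor points -> (x1, y1, x2, y2)
    return paddle.stack([points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
                         points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]], axis=1)

def encode_bbox(bbox, points):
    # (x1, y1, x2, y2) -> (l, t, r, b) offsets relative to anchor points
    return paddle.stack([points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],
                         bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]], axis=1)

pts = paddle.to_tensor([[10., 10.]])
ltrb = paddle.to_tensor([[2., 3., 4., 5.]])
assert bool((encode_bbox(decode_bbox(ltrb, pts), pts) == ltrb).all())

The mining code relies on this invertibility: it decodes to absolute corners to compare boxes across points, then re-encodes the aggregated box as the offset target of each potential positive.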
-        # iou flag
-        iou_flag = (hard_neg_pos_iou >= iou_thresh)
-        # same class flag
-        pos_class = class_ind[pos_ind]
-        hard_neg_class = class_ind[hard_neg_ind]
-        class_flag = pos_class[None, :] - hard_neg_class[:, None]
-        class_flag = (class_flag == 0)
-        # hard negative point inside positive bbox flag
-        ltrb_temp = paddle.stack(
-            [
-                hard_neg_points[:, None, 0] - pos_bbox[None, :, 0],
-                hard_neg_points[:, None, 1] - pos_bbox[None, :, 1],
-                pos_bbox[None, :, 2] - hard_neg_points[:, None, 0],
-                pos_bbox[None, :, 3] - hard_neg_points[:, None, 1]
-            ],
-            axis=-1)
-        inside_flag = ltrb_temp.min(axis=-1) > 0
-        # reset iou
-        valid_flag = (iou_flag & class_flag & inside_flag & scale_flag)
-        invalid_iou = paddle.zeros_like(hard_neg_pos_iou)
-        hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou,
-                                        invalid_iou)
-        pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1)
-        # select potential pos
-        potential_pos_ind = (pos_hard_neg_max_iou > 0.)
-        num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0]
-        if num_potential_pos == 0:
-            return None
-
-        ### calculate loc target: aggregate all matching bboxes as the bbox targets of potential pos
-        # prepare data
-        potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2])
-        potential_strides = hard_neg_strides[potential_pos_ind]
-        potential_valid_flag = valid_flag[potential_pos_ind]
-        potential_pos_ind = hard_neg_ind[potential_pos_ind]
-
-        # get cls and box of matching positives
-        pos_cls = max_vals[pos_ind]
-        expand_pos_bbox = paddle.expand(
-            pos_bbox,
-            shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]])
-        expand_pos_cls = paddle.expand(
-            pos_cls, shape=[num_potential_pos, pos_cls.shape[0]])
-        invalid_cls = paddle.zeros_like(expand_pos_cls)
-        expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls,
-                                      invalid_cls)
-        expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1)
-        # aggregate box based on cls_score
-        agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \
-            / expand_pos_cls.sum(axis=1)
-        agg_ltrb = self.encode_bbox(agg_bbox, potential_points)
-        agg_ltrb = agg_ltrb / potential_strides
-
-        # loc target for all pos
-        loc_targets[potential_pos_ind] = agg_ltrb
-        loc_mask[potential_pos_ind] = 1.
-
-        return loc_mask, loc_targets
-
-    # get training targets
-    def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc,
-                            stu_iou):
-
-        ### sample selection
-        # prepare data
-        tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou)
-        class_ind = paddle.argmax(tea_cls_scores, axis=-1)
-        max_vals = paddle.max(tea_cls_scores, axis=-1)
-        cls_mask = paddle.zeros_like(
-            max_vals
-        )  # set cls valid mask: pos is 1, hard_negative and negative are 0.
-        num_pos, num_hard_neg = 0, 0
-
-        # mean-std selection
-        # use nonzero to turn the index from bool to int, because the index
-        # will be used to compose a two-dim index below.
-        # use squeeze rather than reshape to avoid errors when no score is
-        # larger than the threshold.
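A minimal sketch of the mean-plus-std selection described in the comments just above, with toy scores; the 0.1 floor and 0.4 cap mirror the constants used in the code that follows, while the score values are invented:

import paddle

scores = paddle.to_tensor([0.05, 0.12, 0.2, 0.35, 0.9])
cand = scores[scores >= 0.1]                       # candidates above the floor
pos_thresh = (cand.mean() + cand.std()).clip(max=0.4)
pos = scores >= pos_thresh                         # confident positives
hard_neg = (scores >= 0.1) & (scores < pos_thresh)  # kept as hard negatives
print(float(pos_thresh), pos.numpy(), hard_neg.numpy())

The threshold adapts per image to the teacher's score distribution, so easy images keep more positives while noisy ones fall back to the 0.4 cap.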
-        candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1)
-        num_candidate = candidate_ind.shape[0]
-        if num_candidate > 0:
-            # pos thresh = mean + std to select pos samples
-            candidate_score = max_vals[candidate_ind]
-            candidate_score_mean = candidate_score.mean()
-            candidate_score_std = candidate_score.std()
-            pos_thresh = (candidate_score_mean + candidate_score_std).clip(
-                max=0.4)
-            # select pos
-            pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1)
-            num_pos = pos_ind.shape[0]
-            # select hard negatives as potential pos
-            hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh)
-            hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1)
-            num_hard_neg = hard_neg_ind.shape[0]
-        # if there are no positives, directly select the top-10 as pos.
-        if (num_pos == 0):
-            num_pos = 10
-            _, pos_ind = paddle.topk(max_vals, k=num_pos)
-        cls_mask[pos_ind] = 1.
-
-        ### Consistency Regularization Training targets
-        # cls targets
-        pos_class_ind = class_ind[pos_ind]
-        cls_targets = paddle.zeros_like(tea_cls)
-        cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind,
-                                                             pos_class_ind]
-        # hard negative cls target
-        if num_hard_neg != 0:
-            cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind]
-        # loc targets
-        loc_targets = paddle.zeros_like(tea_loc)
-        loc_targets[pos_ind] = tea_loc[pos_ind]
-        # iou targets
-        iou_targets = paddle.zeros(
-            shape=[tea_iou.shape[0]], dtype=tea_iou.dtype)
-        iou_targets[pos_ind] = F.sigmoid(
-            paddle.squeeze(
-                tea_iou, axis=-1)[pos_ind])
-
-        loc_mask = cls_mask.clone()
-        # select potential positives from hard negatives for loc_task training
-        if (num_hard_neg > 0) and self.hard_neg_mining_flag:
-            results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind,
-                                           hard_neg_ind, loc_mask, loc_targets)
-            if results is not None:
-                loc_mask, loc_targets = results
-                loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
-                iou_targets[loc_pos_ind] = F.sigmoid(
-                    paddle.squeeze(
-                        tea_iou, axis=-1)[loc_pos_ind])
-
-        return cls_mask, loc_mask, \
-            cls_targets, loc_targets, iou_targets
-
-    def forward(self, student_prediction, teacher_prediction):
-        stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction
-        tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction
-
-        # H and W of level (used for aggregating targets)
-        self.lvl_hw = []
-        for t in tea_cls_lvl:
-            _, _, H, W = t.shape
-            self.lvl_hw.append([H, W])
-
-        # levels to images
-        stu_cls_img = levels_to_images(stu_cls_lvl)
-        stu_loc_img = levels_to_images(stu_loc_lvl)
-        stu_iou_img = levels_to_images(stu_iou_lvl)
-        tea_cls_img = levels_to_images(tea_cls_lvl)
-        tea_loc_img = levels_to_images(tea_loc_lvl)
-        tea_iou_img = levels_to_images(tea_iou_lvl)
-
-        with paddle.no_grad():
-            cls_mask, loc_mask, \
-                cls_targets, loc_targets, iou_targets = multi_apply(
-                    self.get_targets_per_img,
-                    tea_cls_img,
-                    tea_loc_img,
-                    tea_iou_img,
-                    stu_cls_img,
-                    stu_loc_img,
-                    stu_iou_img
-                )
-
-        # flatten predictions
-        stu_cls = paddle.concat(stu_cls_img, axis=0)
-        stu_loc = paddle.concat(stu_loc_img, axis=0)
-        stu_iou = paddle.concat(stu_iou_img, axis=0)
-        # flatten targets
-        cls_mask = paddle.concat(cls_mask, axis=0)
-        loc_mask = paddle.concat(loc_mask, axis=0)
-        cls_targets = paddle.concat(cls_targets, axis=0)
-        loc_targets = paddle.concat(loc_targets, axis=0)
-        iou_targets = paddle.concat(iou_targets, axis=0)
-
-        ### Training Weights and avg factor
-        # find positives
-        cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1)
-        loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
-        # cls
weight - cls_sample_weights = paddle.ones([cls_targets.shape[0]]) - cls_avg_factor = paddle.max(cls_targets[cls_pos_ind], - axis=-1).sum().item() - # loc weight - loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1) - loc_avg_factor = loc_sample_weights.sum().item() - # iou weight - iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]]) - iou_avg_factor = loc_pos_ind.shape[0] - - ### unsupervised loss - # cls loss - loss_cls = self.quality_focal_loss( - stu_cls, - cls_targets, - quality=stu_iou, - weights=cls_sample_weights, - avg_factor=cls_avg_factor) * self.cls_weight - # iou loss - pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind] - pos_iou_targets = iou_targets[loc_pos_ind] - loss_iou = F.binary_cross_entropy( - F.sigmoid(pos_stu_iou), pos_iou_targets, - reduction='none') * iou_sample_weights - loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight - # box loss - pos_stu_loc = stu_loc[loc_pos_ind] - pos_loc_targets = loc_targets[loc_pos_ind] - - loss_box = self.iou_loss( - pos_stu_loc, - pos_loc_targets, - weights=loc_sample_weights, - avg_factor=loc_avg_factor) - loss_box = loss_box * self.reg_weight - - loss_all = { - "loss_cls": loss_cls, - "loss_box": loss_box, - "loss_iou": loss_iou, - } - return loss_all diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/focal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/focal_loss.py deleted file mode 100644 index b9a64e1..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/focal_loss.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn.functional as F -import paddle.nn as nn -from ppdet.core.workspace import register - -__all__ = ['FocalLoss', 'Weighted_FocalLoss'] - -@register -class FocalLoss(nn.Layer): - """A wrapper around paddle.nn.functional.sigmoid_focal_loss. - Args: - use_sigmoid (bool): currently only support use_sigmoid=True - alpha (float): parameter alpha in Focal Loss - gamma (float): parameter gamma in Focal Loss - loss_weight (float): final loss will be multiplied by this - """ - def __init__(self, - use_sigmoid=True, - alpha=0.25, - gamma=2.0, - loss_weight=1.0): - super(FocalLoss, self).__init__() - assert use_sigmoid == True, \ - 'Focal Loss only supports sigmoid at the moment' - self.use_sigmoid = use_sigmoid - self.alpha = alpha - self.gamma = gamma - self.loss_weight = loss_weight - - def forward(self, pred, target, reduction='none'): - """forward function. 
- Args: - pred (Tensor): logits of class prediction, of shape (N, num_classes) - target (Tensor): target class label, of shape (N, ) - reduction (str): the way to reduce loss, one of (none, sum, mean) - """ - num_classes = pred.shape[1] - target = F.one_hot(target, num_classes+1).cast(pred.dtype) - target = target[:, :-1].detach() - loss = F.sigmoid_focal_loss( - pred, target, alpha=self.alpha, gamma=self.gamma, - reduction=reduction) - return loss * self.loss_weight - - -@register -class Weighted_FocalLoss(FocalLoss): - """A wrapper around paddle.nn.functional.sigmoid_focal_loss. - Args: - use_sigmoid (bool): currently only support use_sigmoid=True - alpha (float): parameter alpha in Focal Loss - gamma (float): parameter gamma in Focal Loss - loss_weight (float): final loss will be multiplied by this - """ - def __init__(self, - use_sigmoid=True, - alpha=0.25, - gamma=2.0, - loss_weight=1.0, - reduction="mean"): - super(FocalLoss, self).__init__() - assert use_sigmoid == True, \ - 'Focal Loss only supports sigmoid at the moment' - self.use_sigmoid = use_sigmoid - self.alpha = alpha - self.gamma = gamma - self.loss_weight = loss_weight - self.reduction = reduction - - def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): - """forward function. - Args: - pred (Tensor): logits of class prediction, of shape (N, num_classes) - target (Tensor): target class label, of shape (N, ) - reduction (str): the way to reduce loss, one of (none, sum, mean) - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - num_classes = pred.shape[1] - target = F.one_hot(target, num_classes + 1).astype(pred.dtype) - target = target[:, :-1].detach() - loss = F.sigmoid_focal_loss( - pred, target, alpha=self.alpha, gamma=self.gamma, - reduction='none') - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - return loss * self.loss_weight diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/gfocal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/gfocal_loss.py deleted file mode 100644 index 37e27f0..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/gfocal_loss.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# The code is based on:
-# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/gfocal_loss.py
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-import numpy as np
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from ppdet.core.workspace import register, serializable
-from ppdet.modeling import ops
-
-__all__ = ['QualityFocalLoss', 'DistributionFocalLoss']
-
-
-def quality_focal_loss(pred, target, beta=2.0, use_sigmoid=True):
-    """
-    Quality Focal Loss (QFL) is from `Generalized Focal Loss: Learning
-    Qualified and Distributed Bounding Boxes for Dense Object Detection
-    <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        pred (Tensor): Predicted joint representation of classification
-            and quality (IoU) estimation with shape (N, C), C is the number of
-            classes.
-        target (tuple([Tensor])): Target category label with shape (N,)
-            and target quality label with shape (N,).
-        beta (float): The beta parameter for calculating the modulating factor.
-            Defaults to 2.0.
-    Returns:
-        Tensor: Loss tensor with shape (N,).
-    """
-    assert len(target) == 2, """target for QFL must be a tuple of two elements,
-        including category label and quality label, respectively"""
-    # label denotes the category id, score denotes the quality score
-    label, score = target
-    if use_sigmoid:
-        func = F.binary_cross_entropy_with_logits
-    else:
-        func = F.binary_cross_entropy
-
-    # negatives are supervised by 0 quality score
-    pred_sigmoid = F.sigmoid(pred) if use_sigmoid else pred
-    scale_factor = pred_sigmoid
-    zerolabel = paddle.zeros(pred.shape, dtype='float32')
-    loss = func(pred, zerolabel, reduction='none') * scale_factor.pow(beta)
-
-    # FG cat_id: [0, num_classes - 1], BG cat_id: num_classes
-    bg_class_ind = pred.shape[1]
-    pos = paddle.logical_and((label >= 0),
-                             (label < bg_class_ind)).nonzero().squeeze(1)
-    if pos.shape[0] == 0:
-        return loss.sum(axis=1)
-    pos_label = paddle.gather(label, pos, axis=0)
-    pos_mask = np.zeros(pred.shape, dtype=np.int32)
-    pos_mask[pos.numpy(), pos_label.numpy()] = 1
-    pos_mask = paddle.to_tensor(pos_mask, dtype='bool')
-    score = score.unsqueeze(-1).expand([-1, pred.shape[1]]).cast('float32')
-    # positives are supervised by bbox quality (IoU) score
-    scale_factor_new = score - pred_sigmoid
-
-    loss_pos = func(
-        pred, score, reduction='none') * scale_factor_new.abs().pow(beta)
-    loss = loss * paddle.logical_not(pos_mask) + loss_pos * pos_mask
-    loss = loss.sum(axis=1)
-    return loss
-
-
-def distribution_focal_loss(pred, label):
-    """Distribution Focal Loss (DFL) is from `Generalized Focal Loss: Learning
-    Qualified and Distributed Bounding Boxes for Dense Object Detection
-    <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        pred (Tensor): Predicted general distribution of bounding boxes
-            (before softmax) with shape (N, n+1), n is the max value of the
-            integral set `{0, ..., n}` in paper.
-        label (Tensor): Target distance label for bounding boxes with
-            shape (N,).
-    Returns:
-        Tensor: Loss tensor with shape (N,).
-    """
-    dis_left = label.cast('int64')
-    dis_right = dis_left + 1
-    weight_left = dis_right.cast('float32') - label
-    weight_right = label - dis_left.cast('float32')
-    loss = F.cross_entropy(pred, dis_left, reduction='none') * weight_left \
-        + F.cross_entropy(pred, dis_right, reduction='none') * weight_right
-    return loss
-
-
-@register
-@serializable
-class QualityFocalLoss(nn.Layer):
-    r"""Quality Focal Loss (QFL) is a variant of `Generalized Focal Loss:
-    Learning Qualified and Distributed Bounding Boxes for Dense Object
-    Detection <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        use_sigmoid (bool): Whether sigmoid operation is conducted in QFL.
-            Defaults to True.
-        beta (float): The beta parameter for calculating the modulating factor.
-            Defaults to 2.0.
-        reduction (str): Options are "none", "mean" and "sum".
-        loss_weight (float): Loss weight of current loss.
-    """
-
-    def __init__(self,
-                 use_sigmoid=True,
-                 beta=2.0,
-                 reduction='mean',
-                 loss_weight=1.0):
-        super(QualityFocalLoss, self).__init__()
-        self.use_sigmoid = use_sigmoid
-        self.beta = beta
-        assert reduction in ('none', 'mean', 'sum')
-        self.reduction = reduction
-        self.loss_weight = loss_weight
-
-    def forward(self, pred, target, weight=None, avg_factor=None):
-        """Forward function.
-        Args:
-            pred (Tensor): Predicted joint representation of
-                classification and quality (IoU) estimation with shape (N, C),
-                C is the number of classes.
-            target (tuple([Tensor])): Target category label with shape
-                (N,) and target quality label with shape (N,).
-            weight (Tensor, optional): The weight of loss for each
-                prediction. Defaults to None.
-            avg_factor (int, optional): Average factor that is used to average
-                the loss. Defaults to None.
-        """
-
-        loss = self.loss_weight * quality_focal_loss(
-            pred, target, beta=self.beta, use_sigmoid=self.use_sigmoid)
-
-        if weight is not None:
-            loss = loss * weight
-        if avg_factor is None:
-            if self.reduction == 'none':
-                return loss
-            elif self.reduction == 'mean':
-                return loss.mean()
-            elif self.reduction == 'sum':
-                return loss.sum()
-        else:
-            # if reduction is mean, then average the loss by avg_factor
-            if self.reduction == 'mean':
-                loss = loss.sum() / avg_factor
-            # if reduction is 'none', then do nothing, otherwise raise an error
-            elif self.reduction != 'none':
-                raise ValueError(
-                    'avg_factor can not be used with reduction="sum"')
-        return loss
-
-
-@register
-@serializable
-class DistributionFocalLoss(nn.Layer):
-    """Distribution Focal Loss (DFL) is a variant of `Generalized Focal Loss:
-    Learning Qualified and Distributed Bounding Boxes for Dense Object
-    Detection <https://arxiv.org/abs/2006.04388>`_.
-    Args:
-        reduction (str): Options are `'none'`, `'mean'` and `'sum'`.
-        loss_weight (float): Loss weight of current loss.
-    """
-
-    def __init__(self, reduction='mean', loss_weight=1.0):
-        super(DistributionFocalLoss, self).__init__()
-        assert reduction in ('none', 'mean', 'sum')
-        self.reduction = reduction
-        self.loss_weight = loss_weight
-
-    def forward(self, pred, target, weight=None, avg_factor=None):
-        """Forward function.
-        Args:
-            pred (Tensor): Predicted general distribution of bounding
-                boxes (before softmax) with shape (N, n+1), n is the max value
-                of the integral set `{0, ..., n}` in paper.
-            target (Tensor): Target distance label for bounding boxes
-                with shape (N,).
-            weight (Tensor, optional): The weight of loss for each
-                prediction. Defaults to None.
-            avg_factor (int, optional): Average factor that is used to average
-                the loss. Defaults to None.
- """ - loss = self.loss_weight * distribution_focal_loss(pred, target) - if weight is not None: - loss = loss * weight - if avg_factor is None: - if self.reduction == 'none': - return loss - elif self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if self.reduction == 'mean': - loss = loss.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif self.reduction != 'none': - raise ValueError( - 'avg_factor can not be used with reduction="sum"') - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_aware_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/iou_aware_loss.py deleted file mode 100644 index 4a9e904..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_aware_loss.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from .iou_loss import IouLoss -from ..bbox_utils import bbox_iou - - -@register -@serializable -class IouAwareLoss(IouLoss): - """ - iou aware loss, see https://arxiv.org/abs/1912.05992 - Args: - loss_weight (float): iou aware loss weight, default is 1.0 - max_height (int): max height of input to support random shape input - max_width (int): max width of input to support random shape input - """ - - def __init__(self, loss_weight=1.0, giou=False, diou=False, ciou=False): - super(IouAwareLoss, self).__init__( - loss_weight=loss_weight, giou=giou, diou=diou, ciou=ciou) - - def __call__(self, ioup, pbox, gbox): - iou = bbox_iou( - pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) - iou.stop_gradient = True - loss_iou_aware = F.binary_cross_entropy_with_logits( - ioup, iou, reduction='none') - loss_iou_aware = loss_iou_aware * self.loss_weight - return loss_iou_aware diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/iou_loss.py deleted file mode 100644 index b5cac22..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/iou_loss.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
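Note on the IouAwareLoss removed above: it trains an IoU-prediction logit with binary cross-entropy against the detached IoU of each predicted/ground-truth box pair. A minimal sketch of the same idea in plain Paddle; the toy_bbox_iou helper and the example boxes below are illustrative stand-ins, not ppdet API:

import paddle
import paddle.nn.functional as F

def toy_bbox_iou(pbox, gbox, eps=1e-10):
    # IoU of axis-aligned (x1, y1, x2, y2) boxes; stand-in for ppdet's bbox_iou.
    x1 = paddle.maximum(pbox[:, 0], gbox[:, 0])
    y1 = paddle.maximum(pbox[:, 1], gbox[:, 1])
    x2 = paddle.minimum(pbox[:, 2], gbox[:, 2])
    y2 = paddle.minimum(pbox[:, 3], gbox[:, 3])
    inter = (x2 - x1).clip(0) * (y2 - y1).clip(0)
    area_p = (pbox[:, 2] - pbox[:, 0]) * (pbox[:, 3] - pbox[:, 1])
    area_g = (gbox[:, 2] - gbox[:, 0]) * (gbox[:, 3] - gbox[:, 1])
    return inter / (area_p + area_g - inter + eps)

ioup = paddle.randn([4])                          # predicted IoU logits
pbox = paddle.to_tensor([[0., 0., 2., 2.]] * 4)   # predicted boxes
gbox = paddle.to_tensor([[1., 1., 3., 3.]] * 4)   # ground-truth boxes
iou = toy_bbox_iou(pbox, gbox)
iou.stop_gradient = True                          # the IoU target carries no gradient
loss_iou_aware = F.binary_cross_entropy_with_logits(ioup, iou, reduction='none')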
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np -import math -import paddle - -from ppdet.core.workspace import register, serializable -from ..bbox_utils import bbox_iou - -__all__ = ['IouLoss', 'GIoULoss', 'DIouLoss', 'SIoULoss'] - - -@register -@serializable -class IouLoss(object): - """ - iou loss, see https://arxiv.org/abs/1908.03851 - loss = 1.0 - iou * iou - Args: - loss_weight (float): iou loss weight, default is 2.5 - max_height (int): max height of input to support random shape input - max_width (int): max width of input to support random shape input - ciou_term (bool): whether to add ciou_term - loss_square (bool): whether to square the iou term - """ - - def __init__(self, - loss_weight=2.5, - giou=False, - diou=False, - ciou=False, - loss_square=True): - self.loss_weight = loss_weight - self.giou = giou - self.diou = diou - self.ciou = ciou - self.loss_square = loss_square - - def __call__(self, pbox, gbox): - iou = bbox_iou( - pbox, gbox, giou=self.giou, diou=self.diou, ciou=self.ciou) - if self.loss_square: - loss_iou = 1 - iou * iou - else: - loss_iou = 1 - iou - - loss_iou = loss_iou * self.loss_weight - return loss_iou - - -@register -@serializable -class GIoULoss(object): - """ - Generalized Intersection over Union, see https://arxiv.org/abs/1902.09630 - Args: - loss_weight (float): giou loss weight, default as 1 - eps (float): epsilon to avoid divide by zero, default as 1e-10 - reduction (string): Options are "none", "mean" and "sum". default as none - """ - - def __init__(self, loss_weight=1., eps=1e-10, reduction='none'): - self.loss_weight = loss_weight - self.eps = eps - assert reduction in ('none', 'mean', 'sum') - self.reduction = reduction - - def bbox_overlap(self, box1, box2, eps=1e-10): - """calculate the iou of box1 and box2 - Args: - box1 (Tensor): box1 with the shape (..., 4) - box2 (Tensor): box1 with the shape (..., 4) - eps (float): epsilon to avoid divide by zero - Return: - iou (Tensor): iou of box1 and box2 - overlap (Tensor): overlap of box1 and box2 - union (Tensor): union of box1 and box2 - """ - x1, y1, x2, y2 = box1 - x1g, y1g, x2g, y2g = box2 - - xkis1 = paddle.maximum(x1, x1g) - ykis1 = paddle.maximum(y1, y1g) - xkis2 = paddle.minimum(x2, x2g) - ykis2 = paddle.minimum(y2, y2g) - w_inter = (xkis2 - xkis1).clip(0) - h_inter = (ykis2 - ykis1).clip(0) - overlap = w_inter * h_inter - - area1 = (x2 - x1) * (y2 - y1) - area2 = (x2g - x1g) * (y2g - y1g) - union = area1 + area2 - overlap + eps - iou = overlap / union - - return iou, overlap, union - - def __call__(self, pbox, gbox, iou_weight=1., loc_reweight=None): - x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) - x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) - box1 = [x1, y1, x2, y2] - box2 = [x1g, y1g, x2g, y2g] - iou, overlap, union = self.bbox_overlap(box1, box2, self.eps) - xc1 = paddle.minimum(x1, x1g) - yc1 = paddle.minimum(y1, y1g) - xc2 = paddle.maximum(x2, x2g) - yc2 = paddle.maximum(y2, y2g) - - area_c = (xc2 - xc1) * (yc2 - yc1) + self.eps - miou = iou - ((area_c - union) / area_c) - if loc_reweight is not None: - loc_reweight = paddle.reshape(loc_reweight, shape=(-1, 1)) - loc_thresh = 0.9 - giou = 1 - (1 - loc_thresh - ) * miou - loc_thresh * miou * loc_reweight - else: - giou = 1 - miou - if self.reduction == 'none': - loss = giou - elif self.reduction == 'sum': - loss = paddle.sum(giou * iou_weight) - else: - loss = paddle.mean(giou * iou_weight) - return loss * 
self.loss_weight - - -@register -@serializable -class DIouLoss(GIoULoss): - """ - Distance-IoU Loss, see https://arxiv.org/abs/1911.08287 - Args: - loss_weight (float): giou loss weight, default as 1 - eps (float): epsilon to avoid divide by zero, default as 1e-10 - use_complete_iou_loss (bool): whether to use complete iou loss - """ - - def __init__(self, loss_weight=1., eps=1e-10, use_complete_iou_loss=True): - super(DIouLoss, self).__init__(loss_weight=loss_weight, eps=eps) - self.use_complete_iou_loss = use_complete_iou_loss - - def __call__(self, pbox, gbox, iou_weight=1.): - x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) - x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) - cx = (x1 + x2) / 2 - cy = (y1 + y2) / 2 - w = x2 - x1 - h = y2 - y1 - - cxg = (x1g + x2g) / 2 - cyg = (y1g + y2g) / 2 - wg = x2g - x1g - hg = y2g - y1g - - x2 = paddle.maximum(x1, x2) - y2 = paddle.maximum(y1, y2) - - # A and B - xkis1 = paddle.maximum(x1, x1g) - ykis1 = paddle.maximum(y1, y1g) - xkis2 = paddle.minimum(x2, x2g) - ykis2 = paddle.minimum(y2, y2g) - - # A or B - xc1 = paddle.minimum(x1, x1g) - yc1 = paddle.minimum(y1, y1g) - xc2 = paddle.maximum(x2, x2g) - yc2 = paddle.maximum(y2, y2g) - - intsctk = (xkis2 - xkis1) * (ykis2 - ykis1) - intsctk = intsctk * paddle.greater_than( - xkis2, xkis1) * paddle.greater_than(ykis2, ykis1) - unionk = (x2 - x1) * (y2 - y1) + (x2g - x1g) * (y2g - y1g - ) - intsctk + self.eps - iouk = intsctk / unionk - - # DIOU term - dist_intersection = (cx - cxg) * (cx - cxg) + (cy - cyg) * (cy - cyg) - dist_union = (xc2 - xc1) * (xc2 - xc1) + (yc2 - yc1) * (yc2 - yc1) - diou_term = (dist_intersection + self.eps) / (dist_union + self.eps) - - # CIOU term - ciou_term = 0 - if self.use_complete_iou_loss: - ar_gt = wg / hg - ar_pred = w / h - arctan = paddle.atan(ar_gt) - paddle.atan(ar_pred) - ar_loss = 4. / np.pi / np.pi * arctan * arctan - alpha = ar_loss / (1 - iouk + ar_loss + self.eps) - alpha.stop_gradient = True - ciou_term = alpha * ar_loss - - diou = paddle.mean((1 - iouk + ciou_term + diou_term) * iou_weight) - - return diou * self.loss_weight - - -@register -@serializable -class SIoULoss(GIoULoss): - """ - see https://arxiv.org/pdf/2205.12740.pdf - Args: - loss_weight (float): siou loss weight, default as 1 - eps (float): epsilon to avoid divide by zero, default as 1e-10 - theta (float): default as 4 - reduction (str): Options are "none", "mean" and "sum". 
default as none - """ - - def __init__(self, loss_weight=1., eps=1e-10, theta=4., reduction='none'): - super(SIoULoss, self).__init__(loss_weight=loss_weight, eps=eps) - self.loss_weight = loss_weight - self.eps = eps - self.theta = theta - self.reduction = reduction - - def __call__(self, pbox, gbox): - x1, y1, x2, y2 = paddle.split(pbox, num_or_sections=4, axis=-1) - x1g, y1g, x2g, y2g = paddle.split(gbox, num_or_sections=4, axis=-1) - - box1 = [x1, y1, x2, y2] - box2 = [x1g, y1g, x2g, y2g] - iou = bbox_iou(box1, box2) - - cx = (x1 + x2) / 2 - cy = (y1 + y2) / 2 - w = x2 - x1 + self.eps - h = y2 - y1 + self.eps - - cxg = (x1g + x2g) / 2 - cyg = (y1g + y2g) / 2 - wg = x2g - x1g + self.eps - hg = y2g - y1g + self.eps - - x2 = paddle.maximum(x1, x2) - y2 = paddle.maximum(y1, y2) - - # A or B - xc1 = paddle.minimum(x1, x1g) - yc1 = paddle.minimum(y1, y1g) - xc2 = paddle.maximum(x2, x2g) - yc2 = paddle.maximum(y2, y2g) - - cw_out = xc2 - xc1 - ch_out = yc2 - yc1 - - ch = paddle.maximum(cy, cyg) - paddle.minimum(cy, cyg) - cw = paddle.maximum(cx, cxg) - paddle.minimum(cx, cxg) - - # angle cost - dist_intersection = paddle.sqrt((cx - cxg)**2 + (cy - cyg)**2) - sin_angle_alpha = ch / dist_intersection - sin_angle_beta = cw / dist_intersection - thred = paddle.pow(paddle.to_tensor(2), 0.5) / 2 - thred.stop_gradient = True - sin_alpha = paddle.where(sin_angle_alpha > thred, sin_angle_beta, - sin_angle_alpha) - angle_cost = paddle.cos(paddle.asin(sin_alpha) * 2 - math.pi / 2) - - # distance cost - gamma = 2 - angle_cost - # gamma.stop_gradient = True - beta_x = ((cxg - cx) / cw_out)**2 - beta_y = ((cyg - cy) / ch_out)**2 - dist_cost = 1 - paddle.exp(-gamma * beta_x) + 1 - paddle.exp(-gamma * - beta_y) - - # shape cost - omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg) - omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg) - omega = (1 - paddle.exp(-omega_w))**self.theta + ( - 1 - paddle.exp(-omega_h))**self.theta - siou_loss = 1 - iou + (omega + dist_cost) / 2 - - if self.reduction == 'mean': - siou_loss = paddle.mean(siou_loss) - elif self.reduction == 'sum': - siou_loss = paddle.sum(siou_loss) - - return siou_loss * self.loss_weight diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/jde_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/jde_loss.py deleted file mode 100644 index 5c3b5a6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/jde_loss.py +++ /dev/null @@ -1,193 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
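Note on the SIoULoss removed above: it augments the IoU term with angle, distance, and shape costs. A quick standalone check of the shape-cost term alone, mirroring the omega_w/omega_h computation in the removed code (the box sizes here are made up for illustration):

import paddle

theta = 4.0                                                # same default as the removed SIoULoss
w, h = paddle.to_tensor([4.0]), paddle.to_tensor([2.0])    # predicted box size
wg, hg = paddle.to_tensor([5.0]), paddle.to_tensor([2.5])  # ground-truth box size

omega_w = paddle.abs(w - wg) / paddle.maximum(w, wg)
omega_h = paddle.abs(hg - h) / paddle.maximum(h, hg)
shape_cost = (1 - paddle.exp(-omega_w)) ** theta + (1 - paddle.exp(-omega_h)) ** theta
# shape_cost shrinks toward 0 as the predicted width/height approach the targets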
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -__all__ = ['JDEDetectionLoss', 'JDEEmbeddingLoss', 'JDELoss'] - - -@register -class JDEDetectionLoss(nn.Layer): - __shared__ = ['num_classes'] - - def __init__(self, num_classes=1, for_mot=True): - super(JDEDetectionLoss, self).__init__() - self.num_classes = num_classes - self.for_mot = for_mot - - def det_loss(self, p_det, anchor, t_conf, t_box): - pshape = paddle.shape(p_det) - pshape.stop_gradient = True - nB, nGh, nGw = pshape[0], pshape[-2], pshape[-1] - nA = len(anchor) - p_det = paddle.reshape( - p_det, [nB, nA, self.num_classes + 5, nGh, nGw]).transpose( - (0, 1, 3, 4, 2)) - - # 1. loss_conf: cross_entropy - p_conf = p_det[:, :, :, :, 4:6] - p_conf_flatten = paddle.reshape(p_conf, [-1, 2]) - t_conf_flatten = t_conf.flatten() - t_conf_flatten = paddle.cast(t_conf_flatten, dtype="int64") - t_conf_flatten.stop_gradient = True - loss_conf = F.cross_entropy( - p_conf_flatten, t_conf_flatten, ignore_index=-1, reduction='mean') - loss_conf.stop_gradient = False - - # 2. loss_box: smooth_l1_loss - p_box = p_det[:, :, :, :, :4] - p_box_flatten = paddle.reshape(p_box, [-1, 4]) - t_box_flatten = paddle.reshape(t_box, [-1, 4]) - fg_inds = paddle.nonzero(t_conf_flatten > 0).flatten() - if fg_inds.numel() > 0: - reg_delta = paddle.gather(p_box_flatten, fg_inds) - reg_target = paddle.gather(t_box_flatten, fg_inds) - else: - reg_delta = paddle.to_tensor([0, 0, 0, 0], dtype='float32') - reg_delta.stop_gradient = False - reg_target = paddle.to_tensor([0, 0, 0, 0], dtype='float32') - reg_target.stop_gradient = True - loss_box = F.smooth_l1_loss( - reg_delta, reg_target, reduction='mean', delta=1.0) - loss_box.stop_gradient = False - - return loss_conf, loss_box - - def forward(self, det_outs, targets, anchors): - """ - Args: - det_outs (list[Tensor]): output from detection head, each one - is a 4-D Tensor with shape [N, C, H, W]. - targets (dict): contains 'im_id', 'gt_bbox', 'gt_ide', 'image', - 'im_shape', 'scale_factor' and 'tbox', 'tconf', 'tide' of - each FPN level. - anchors (list[list]): anchor setting of JDE model, N row M col, N is - the anchor levels(FPN levels), M is the anchor scales each - level. 
-        """
-        assert len(det_outs) == len(anchors)
-        loss_confs = []
-        loss_boxes = []
-        for i, (p_det, anchor) in enumerate(zip(det_outs, anchors)):
-            t_conf = targets['tconf{}'.format(i)]
-            t_box = targets['tbox{}'.format(i)]
-
-            loss_conf, loss_box = self.det_loss(p_det, anchor, t_conf, t_box)
-            loss_confs.append(loss_conf)
-            loss_boxes.append(loss_box)
-        if self.for_mot:
-            return {'loss_confs': loss_confs, 'loss_boxes': loss_boxes}
-        else:
-            jde_conf_losses = sum(loss_confs)
-            jde_box_losses = sum(loss_boxes)
-            jde_det_losses = {
-                "loss_conf": jde_conf_losses,
-                "loss_box": jde_box_losses,
-                "loss": jde_conf_losses + jde_box_losses,
-            }
-            return jde_det_losses
-
-
-@register
-class JDEEmbeddingLoss(nn.Layer):
-    def __init__(self, ):
-        super(JDEEmbeddingLoss, self).__init__()
-        self.phony = self.create_parameter(shape=[1], dtype="float32")
-
-    def emb_loss(self, p_ide, t_conf, t_ide, emb_scale, classifier):
-        emb_dim = p_ide.shape[1]
-        p_ide = p_ide.transpose((0, 2, 3, 1))
-        p_ide_flatten = paddle.reshape(p_ide, [-1, emb_dim])
-        mask = t_conf > 0
-        mask = paddle.cast(mask, dtype="int64")
-        mask.stop_gradient = True
-        emb_mask = mask.max(1).flatten()
-        emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten()
-        emb_mask_inds.stop_gradient = True
-        # use max(1) to decide the id; TODO: find a more reasonable strategy
-        t_ide_flatten = t_ide.max(1).flatten()
-        t_ide_flatten = paddle.cast(t_ide_flatten, dtype="int64")
-        valid_inds = paddle.nonzero(t_ide_flatten != -1).flatten()
-
-        if emb_mask_inds.numel() == 0 or valid_inds.numel() == 0:
-            # loss_ide = paddle.to_tensor([0])  # would raise an error in gradient backward
-            loss_ide = self.phony * 0  # TODO
-        else:
-            embedding = paddle.gather(p_ide_flatten, emb_mask_inds)
-            embedding = emb_scale * F.normalize(embedding)
-            logits = classifier(embedding)
-
-            ide_target = paddle.gather(t_ide_flatten, emb_mask_inds)
-
-            loss_ide = F.cross_entropy(
-                logits, ide_target, ignore_index=-1, reduction='mean')
-            loss_ide.stop_gradient = False
-
-        return loss_ide
-
-    def forward(self, ide_outs, targets, emb_scale, classifier):
-        loss_ides = []
-        for i, p_ide in enumerate(ide_outs):
-            t_conf = targets['tconf{}'.format(i)]
-            t_ide = targets['tide{}'.format(i)]
-
-            loss_ide = self.emb_loss(p_ide, t_conf, t_ide, emb_scale,
-                                     classifier)
-            loss_ides.append(loss_ide)
-        return loss_ides
-
-
-@register
-class JDELoss(nn.Layer):
-    def __init__(self):
-        super(JDELoss, self).__init__()
-
-    def forward(self, loss_confs, loss_boxes, loss_ides, loss_params_cls,
-                loss_params_reg, loss_params_ide, targets):
-        assert len(loss_confs) == len(loss_boxes) == len(loss_ides)
-        assert len(loss_params_cls) == len(loss_params_reg) == len(
-            loss_params_ide)
-        assert len(loss_confs) == len(loss_params_cls)
-
-        batchsize = targets['gt_bbox'].shape[0]
-        nTargets = paddle.nonzero(paddle.sum(targets['gt_bbox'], axis=2)).shape[
-            0] / batchsize
-        nTargets = paddle.to_tensor(nTargets, dtype='float32')
-        nTargets.stop_gradient = True
-
-        jde_losses = []
-        for i, (loss_conf, loss_box, loss_ide, l_conf_p, l_box_p,
-                l_ide_p) in enumerate(
-                    zip(loss_confs, loss_boxes, loss_ides, loss_params_cls,
                        loss_params_reg, loss_params_ide)):
-
-            jde_loss = l_conf_p(loss_conf) + l_box_p(loss_box) + l_ide_p(
-                loss_ide)
-            jde_losses.append(jde_loss)
-
-        loss_all = {
-            "loss_conf": sum(loss_confs),
-            "loss_box": sum(loss_boxes),
-            "loss_ide": sum(loss_ides),
-            "loss": sum(jde_losses),
-            "nTargets": nTargets,
-        }
-        return loss_all
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/keypoint_loss.py
b/pdfdet/models/Paddle/ppdet/modeling/losses/keypoint_loss.py deleted file mode 100644 index 37a2410..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/keypoint_loss.py +++ /dev/null @@ -1,632 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from itertools import cycle, islice -from collections import abc -import numpy as np -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register, serializable - -__all__ = ['HrHRNetLoss', 'KeyPointMSELoss', 'OKSLoss', 'CenterFocalLoss', 'L1Loss'] - - -@register -@serializable -class KeyPointMSELoss(nn.Layer): - def __init__(self, use_target_weight=True, loss_scale=0.5): - """ - KeyPointMSELoss layer - - Args: - use_target_weight (bool): whether to use target weight - """ - super(KeyPointMSELoss, self).__init__() - self.criterion = nn.MSELoss(reduction='mean') - self.use_target_weight = use_target_weight - self.loss_scale = loss_scale - - def forward(self, output, records): - target = records['target'] - target_weight = records['target_weight'] - batch_size = output.shape[0] - num_joints = output.shape[1] - heatmaps_pred = output.reshape( - (batch_size, num_joints, -1)).split(num_joints, 1) - heatmaps_gt = target.reshape( - (batch_size, num_joints, -1)).split(num_joints, 1) - loss = 0 - for idx in range(num_joints): - heatmap_pred = heatmaps_pred[idx].squeeze() - heatmap_gt = heatmaps_gt[idx].squeeze() - if self.use_target_weight: - loss += self.loss_scale * self.criterion( - heatmap_pred.multiply(target_weight[:, idx]), - heatmap_gt.multiply(target_weight[:, idx])) - else: - loss += self.loss_scale * self.criterion(heatmap_pred, - heatmap_gt) - keypoint_losses = dict() - keypoint_losses['loss'] = loss / num_joints - return keypoint_losses - - -@register -@serializable -class HrHRNetLoss(nn.Layer): - def __init__(self, num_joints, swahr): - """ - HrHRNetLoss layer - - Args: - num_joints (int): number of keypoints - """ - super(HrHRNetLoss, self).__init__() - if swahr: - self.heatmaploss = HeatMapSWAHRLoss(num_joints) - else: - self.heatmaploss = HeatMapLoss() - self.aeloss = AELoss() - self.ziploss = ZipLoss( - [self.heatmaploss, self.heatmaploss, self.aeloss]) - - def forward(self, inputs, records): - targets = [] - targets.append([records['heatmap_gt1x'], records['mask_1x']]) - targets.append([records['heatmap_gt2x'], records['mask_2x']]) - targets.append(records['tagmap']) - keypoint_losses = dict() - loss = self.ziploss(inputs, targets) - keypoint_losses['heatmap_loss'] = loss[0] + loss[1] - keypoint_losses['pull_loss'] = loss[2][0] - keypoint_losses['push_loss'] = loss[2][1] - keypoint_losses['loss'] = recursive_sum(loss) - return keypoint_losses - - -class HeatMapLoss(object): - def __init__(self, loss_factor=1.0): - super(HeatMapLoss, self).__init__() - self.loss_factor = loss_factor - - def __call__(self, preds, targets): - heatmap, 
mask = targets - loss = ((preds - heatmap)**2 * mask.cast('float').unsqueeze(1)) - loss = paddle.clip(loss, min=0, max=2).mean() - loss *= self.loss_factor - return loss - - -class HeatMapSWAHRLoss(object): - def __init__(self, num_joints, loss_factor=1.0): - super(HeatMapSWAHRLoss, self).__init__() - self.loss_factor = loss_factor - self.num_joints = num_joints - - def __call__(self, preds, targets): - heatmaps_gt, mask = targets - heatmaps_pred = preds[0] - scalemaps_pred = preds[1] - - heatmaps_scaled_gt = paddle.where(heatmaps_gt > 0, 0.5 * heatmaps_gt * ( - 1 + (1 + - (scalemaps_pred - 1.) * paddle.log(heatmaps_gt + 1e-10))**2), - heatmaps_gt) - - regularizer_loss = paddle.mean( - paddle.pow((scalemaps_pred - 1.) * (heatmaps_gt > 0).astype(float), - 2)) - omiga = 0.01 - # thres = 2**(-1/omiga), threshold for positive weight - hm_weight = heatmaps_scaled_gt**( - omiga - ) * paddle.abs(1 - heatmaps_pred) + paddle.abs(heatmaps_pred) * ( - 1 - heatmaps_scaled_gt**(omiga)) - - loss = (((heatmaps_pred - heatmaps_scaled_gt)**2) * - mask.cast('float').unsqueeze(1)) * hm_weight - loss = loss.mean() - loss = self.loss_factor * (loss + 1.0 * regularizer_loss) - return loss - - -class AELoss(object): - def __init__(self, pull_factor=0.001, push_factor=0.001): - super(AELoss, self).__init__() - self.pull_factor = pull_factor - self.push_factor = push_factor - - def apply_single(self, pred, tagmap): - if tagmap.numpy()[:, :, 3].sum() == 0: - return (paddle.zeros([1]), paddle.zeros([1])) - nonzero = paddle.nonzero(tagmap[:, :, 3] > 0) - if nonzero.shape[0] == 0: - return (paddle.zeros([1]), paddle.zeros([1])) - p_inds = paddle.unique(nonzero[:, 0]) - num_person = p_inds.shape[0] - if num_person == 0: - return (paddle.zeros([1]), paddle.zeros([1])) - - pull = 0 - tagpull_num = 0 - embs_all = [] - person_unvalid = 0 - for person_idx in p_inds.numpy(): - valid_single = tagmap[person_idx.item()] - validkpts = paddle.nonzero(valid_single[:, 3] > 0) - valid_single = paddle.index_select(valid_single, validkpts) - emb = paddle.gather_nd(pred, valid_single[:, :3]) - if emb.shape[0] == 1: - person_unvalid += 1 - mean = paddle.mean(emb, axis=0) - embs_all.append(mean) - pull += paddle.mean(paddle.pow(emb - mean, 2), axis=0) - tagpull_num += emb.shape[0] - pull /= max(num_person - person_unvalid, 1) - if num_person < 2: - return pull, paddle.zeros([1]) - - embs_all = paddle.stack(embs_all) - A = embs_all.expand([num_person, num_person]) - B = A.transpose([1, 0]) - diff = A - B - - diff = paddle.pow(diff, 2) - push = paddle.exp(-diff) - push = paddle.sum(push) - num_person - - push /= 2 * num_person * (num_person - 1) - return pull, push - - def __call__(self, preds, tagmaps): - bs = preds.shape[0] - losses = [ - self.apply_single(preds[i:i + 1].squeeze(), - tagmaps[i:i + 1].squeeze()) for i in range(bs) - ] - pull = self.pull_factor * sum(loss[0] for loss in losses) / len(losses) - push = self.push_factor * sum(loss[1] for loss in losses) / len(losses) - return pull, push - - -class ZipLoss(object): - def __init__(self, loss_funcs): - super(ZipLoss, self).__init__() - self.loss_funcs = loss_funcs - - def __call__(self, inputs, targets): - assert len(self.loss_funcs) == len(targets) >= len(inputs) - - def zip_repeat(*args): - longest = max(map(len, args)) - filled = [islice(cycle(x), longest) for x in args] - return zip(*filled) - - return tuple( - fn(x, y) - for x, y, fn in zip_repeat(inputs, targets, self.loss_funcs)) - - -def recursive_sum(inputs): - if isinstance(inputs, abc.Sequence): - return 
sum([recursive_sum(x) for x in inputs]) - return inputs - - -def oks_overlaps(kpt_preds, kpt_gts, kpt_valids, kpt_areas, sigmas): - if not kpt_gts.astype('bool').any(): - return kpt_preds.sum()*0 - - sigmas = paddle.to_tensor(sigmas, dtype=kpt_preds.dtype) - variances = (sigmas * 2)**2 - - assert kpt_preds.shape[0] == kpt_gts.shape[0] - kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1] // 2, 2)) - kpt_gts = kpt_gts.reshape((-1, kpt_gts.shape[-1] // 2, 2)) - - squared_distance = (kpt_preds[:, :, 0] - kpt_gts[:, :, 0]) ** 2 + \ - (kpt_preds[:, :, 1] - kpt_gts[:, :, 1]) ** 2 - assert (kpt_valids.sum(-1) > 0).all() - squared_distance0 = squared_distance / ( - kpt_areas[:, None] * variances[None, :] * 2) - squared_distance1 = paddle.exp(-squared_distance0) - squared_distance1 = squared_distance1 * kpt_valids - oks = squared_distance1.sum(axis=1) / kpt_valids.sum(axis=1) - - return oks - - -def oks_loss(pred, - target, - weight, - valid=None, - area=None, - linear=False, - sigmas=None, - eps=1e-6, - avg_factor=None, - reduction=None): - """Oks loss. - - Computing the oks loss between a set of predicted poses and target poses. - The loss is calculated as negative log of oks. - - Args: - pred (Tensor): Predicted poses of format (x1, y1, x2, y2, ...), - shape (n, K*2). - target (Tensor): Corresponding gt poses, shape (n, K*2). - linear (bool, optional): If True, use linear scale of loss instead of - log scale. Default: False. - eps (float): Eps to avoid log(0). - - Returns: - Tensor: Loss tensor. - """ - oks = oks_overlaps(pred, target, valid, area, sigmas).clip(min=eps) - if linear: - loss = 1 - oks - else: - loss = -oks.log() - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - - return loss - -@register -@serializable -class OKSLoss(nn.Layer): - """OKSLoss. - - Computing the oks loss between a set of predicted poses and target poses. - - Args: - linear (bool): If True, use linear scale of loss instead of log scale. - Default: False. - eps (float): Eps to avoid log(0). - reduction (str): Options are "none", "mean" and "sum". - loss_weight (float): Weight of loss. 
- """ - - def __init__(self, - linear=False, - num_keypoints=17, - eps=1e-6, - reduction='mean', - loss_weight=1.0): - super(OKSLoss, self).__init__() - self.linear = linear - self.eps = eps - self.reduction = reduction - self.loss_weight = loss_weight - if num_keypoints == 17: - self.sigmas = np.array([ - .26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, - 1.07, .87, .87, .89, .89 - ], dtype=np.float32) / 10.0 - elif num_keypoints == 14: - self.sigmas = np.array([ - .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89, - .79, .79 - ]) / 10.0 - else: - raise ValueError(f'Unsupported keypoints number {num_keypoints}') - - def forward(self, - pred, - target, - valid, - area, - weight=None, - avg_factor=None, - reduction_override=None, - **kwargs): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - valid (Tensor): The visible flag of the target pose. - area (Tensor): The area of the target pose. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. Options are "none", "mean" and "sum". - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - if (weight is not None) and (not paddle.any(weight > 0)) and ( - reduction != 'none'): - if pred.dim() == weight.dim() + 1: - weight = weight.unsqueeze(1) - return (pred * weight).sum() # 0 - if weight is not None and weight.dim() > 1: - # TODO: remove this in the future - # reduce the weight of shape (n, 4) to (n,) to match the - # iou_loss of shape (n,) - assert weight.shape == pred.shape - weight = weight.mean(-1) - loss = self.loss_weight * oks_loss( - pred, - target, - weight, - valid=valid, - area=area, - linear=self.linear, - sigmas=self.sigmas, - eps=self.eps, - reduction=reduction, - avg_factor=avg_factor, - **kwargs) - return loss - - -def center_focal_loss(pred, gt, weight=None, mask=None, avg_factor=None, reduction=None): - """Modified focal loss. Exactly the same as CornerNet. - Runs faster and costs a little bit more memory. - - Args: - pred (Tensor): The prediction with shape [bs, c, h, w]. - gt (Tensor): The learning target of the prediction in gaussian - distribution, with shape [bs, c, h, w]. - mask (Tensor): The valid mask. Defaults to None. 
- """ - if not gt.astype('bool').any(): - return pred.sum()*0 - pos_inds = gt.equal(1).astype('float32') - if mask is None: - neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') - else: - neg_inds = gt.less_than(paddle.to_tensor([1], dtype='float32')).astype('float32') * mask.equal(0).astype('float32') - - neg_weights = paddle.pow(1 - gt, 4) - - loss = 0 - - pos_loss = paddle.log(pred) * paddle.pow(1 - pred, 2) * pos_inds - neg_loss = paddle.log(1 - pred) * paddle.pow(pred, 2) * neg_weights * \ - neg_inds - - num_pos = pos_inds.astype('float32').sum() - pos_loss = pos_loss.sum() - neg_loss = neg_loss.sum() - - if num_pos == 0: - loss = loss - neg_loss - else: - loss = loss - (pos_loss + neg_loss) / num_pos - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - return loss - -@register -@serializable -class CenterFocalLoss(nn.Layer): - """CenterFocalLoss is a variant of focal loss. - - More details can be found in the `paper - `_ - - Args: - reduction (str): Options are "none", "mean" and "sum". - loss_weight (float): Loss weight of current loss. - """ - - def __init__(self, - reduction='none', - loss_weight=1.0): - super(CenterFocalLoss, self).__init__() - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - mask=None, - avg_factor=None, - reduction_override=None): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction in gaussian - distribution. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - mask (Tensor): The valid mask. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_reg = self.loss_weight * center_focal_loss( - pred, - target, - weight, - mask=mask, - reduction=reduction, - avg_factor=avg_factor) - return loss_reg - -def l1_loss(pred, target, weight=None, reduction='mean', avg_factor=None): - """L1 loss. - - Args: - pred (Tensor): The prediction. 
- target (Tensor): The learning target of the prediction. - - Returns: - Tensor: Calculated loss - """ - if not target.astype('bool').any(): - return pred.sum() * 0 - - assert pred.shape == target.shape - loss = paddle.abs(pred - target) - - if weight is not None: - if weight.shape != loss.shape: - if weight.shape[0] == loss.shape[0]: - # For most cases, weight is of shape (num_priors, ), - # which means it does not have the second axis num_class - weight = weight.reshape((-1, 1)) - else: - # Sometimes, weight per anchor per class is also needed. e.g. - # in FSAF. But it may be flattened of shape - # (num_priors x num_class, ), while loss is still of shape - # (num_priors, num_class). - assert weight.numel() == loss.numel() - weight = weight.reshape((loss.shape[0], -1)) - assert weight.ndim == loss.ndim - loss = loss * weight - - # if avg_factor is not specified, just reduce the loss - if avg_factor is None: - if reduction == 'mean': - loss = loss.mean() - elif reduction == 'sum': - loss = loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - # Avoid causing ZeroDivisionError when avg_factor is 0.0, - # i.e., all labels of an image belong to ignore index. - eps = 1e-10 - loss = loss.sum() / (avg_factor + eps) - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError('avg_factor can not be used with reduction="sum"') - - - return loss - -@register -@serializable -class L1Loss(nn.Layer): - """L1 loss. - - Args: - reduction (str, optional): The method to reduce the loss. - Options are "none", "mean" and "sum". - loss_weight (float, optional): The weight of loss. - """ - - def __init__(self, reduction='mean', loss_weight=1.0): - super(L1Loss, self).__init__() - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - avg_factor=None, - reduction_override=None): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss_bbox = self.loss_weight * l1_loss( - pred, target, weight, reduction=reduction, avg_factor=avg_factor) - return loss_bbox - diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/pose3d_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/pose3d_loss.py deleted file mode 100644 index 4781d6e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/pose3d_loss.py +++ /dev/null @@ -1,250 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from itertools import cycle, islice -from collections import abc -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger('ppdet.engine') - -__all__ = ['Pose3DLoss'] - - -@register -@serializable -class Pose3DLoss(nn.Layer): - def __init__(self, weight_3d=1.0, weight_2d=0.0, reduction='none'): - """ - KeyPointMSELoss layer - - Args: - weight_3d (float): weight of 3d loss - weight_2d (float): weight of 2d loss - reduction (bool): whether use reduction to loss - """ - super(Pose3DLoss, self).__init__() - self.weight_3d = weight_3d - self.weight_2d = weight_2d - self.criterion_2dpose = nn.MSELoss(reduction=reduction) - self.criterion_3dpose = nn.L1Loss(reduction=reduction) - self.criterion_smoothl1 = nn.SmoothL1Loss( - reduction=reduction, delta=1.0) - self.criterion_vertices = nn.L1Loss() - - def forward(self, pred3d, pred2d, inputs): - """ - mpjpe: mpjpe loss between 3d joints - keypoint_2d_loss: 2d joints loss compute by criterion_2dpose - """ - gt_3d_joints = inputs['joints_3d'] - gt_2d_joints = inputs['joints_2d'] - has_3d_joints = inputs['has_3d_joints'] - has_2d_joints = inputs['has_2d_joints'] - - loss_3d = mpjpe_focal(pred3d, gt_3d_joints, has_3d_joints) - loss = self.weight_3d * loss_3d - epoch = inputs['epoch_id'] - if self.weight_2d > 0: - weight = self.weight_2d * pow(0.1, (epoch // 8)) - if epoch > 8: - weight = 0 - loss_2d = keypoint_2d_loss(self.criterion_2dpose, pred2d, - gt_2d_joints, has_2d_joints) - loss += weight * loss_2d - return loss - - -def filter_3d_joints(pred, gt, has_3d_joints): - """ - filter 3d joints - """ - gt = gt[has_3d_joints == 1] - gt = gt[:, :, :3] - pred = pred[has_3d_joints == 1] - - gt_pelvis = (gt[:, 2, :] + gt[:, 3, :]) / 2 - gt = gt - gt_pelvis[:, None, :] - pred_pelvis = (pred[:, 2, :] + pred[:, 3, :]) / 2 - pred = pred - pred_pelvis[:, None, :] - return pred, gt - - -def mpjpe(pred, gt, has_3d_joints): - """ - mPJPE loss - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = paddle.sqrt((paddle.minimum((pred - gt), paddle.to_tensor(1.2))**2 - ).sum(axis=-1)).mean() - return error - - -def mpjpe_focal(pred, gt, has_3d_joints): - """ - mPJPE loss - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - mse_error = ((pred - gt)**2).sum(axis=-1) - mpjpe_error = paddle.sqrt(mse_error) - mean = mpjpe_error.mean() - std = mpjpe_error.std() - atte = 2 * F.sigmoid(6 * (mpjpe_error - mean) / std) - mse_error *= atte - return mse_error.mean() - - -def mpjpe_mse(pred, gt, has_3d_joints, weight=1.): - """ - mPJPE loss - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = (((pred - gt)**2).sum(axis=-1)).mean() - return error - - -def mpjpe_criterion(pred, gt, has_3d_joints, criterion_pose3d): - """ - mPJPE loss of self define criterion - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = paddle.sqrt(criterion_pose3d(pred, gt)).mean() - return error - - -@register -@serializable -def weighted_mpjpe(pred, gt, has_3d_joints): - """ - Weighted_mPJPE - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - weight = paddle.linalg.norm(pred, p=2, axis=-1) - weight = paddle.to_tensor( - [1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 1.5, 1.3, 1.2, 1.2, 1.3, 1.5, 
1., 1.]) - error = (weight * paddle.linalg.norm(pred - gt, p=2, axis=-1)).mean() - return error - - -@register -@serializable -def normed_mpjpe(pred, gt, has_3d_joints): - """ - Normalized MPJPE (scale only), adapted from: - https://github.com/hrhodin/UnsupervisedGeometryAwareRepresentationLearning/blob/master/losses/poses.py - """ - assert pred.shape == gt.shape - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - - norm_predicted = paddle.mean( - paddle.sum(pred**2, axis=3, keepdim=True), axis=2, keepdim=True) - norm_target = paddle.mean( - paddle.sum(gt * pred, axis=3, keepdim=True), axis=2, keepdim=True) - scale = norm_target / norm_predicted - return mpjpe(scale * pred, gt) - - -@register -@serializable -def mpjpe_np(pred, gt, has_3d_joints): - """ - mPJPE_NP - """ - pred, gt = filter_3d_joints(pred, gt, has_3d_joints) - error = np.sqrt(((pred - gt)**2).sum(axis=-1)).mean() - return error - - -@register -@serializable -def mean_per_vertex_error(pred, gt, has_smpl): - """ - Compute mPVE - """ - pred = pred[has_smpl == 1] - gt = gt[has_smpl == 1] - with paddle.no_grad(): - error = paddle.sqrt(((pred - gt)**2).sum(axis=-1)).mean() - return error - - -@register -@serializable -def keypoint_2d_loss(criterion_keypoints, pred_keypoints_2d, gt_keypoints_2d, - has_pose_2d): - """ - Compute 2D reprojection loss if 2D keypoint annotations are available. - The confidence (conf) is binary and indicates whether the keypoints exist or not. - """ - conf = gt_keypoints_2d[:, :, -1].unsqueeze(-1).clone() - loss = (conf * criterion_keypoints( - pred_keypoints_2d, gt_keypoints_2d[:, :, :-1] * 0.001)).mean() - return loss - - -@register -@serializable -def keypoint_3d_loss(criterion_keypoints, pred_keypoints_3d, gt_keypoints_3d, - has_pose_3d): - """ - Compute 3D keypoint loss if 3D keypoint annotations are available. - """ - conf = gt_keypoints_3d[:, :, -1].unsqueeze(-1).clone() - gt_keypoints_3d = gt_keypoints_3d[:, :, :-1].clone() - gt_keypoints_3d = gt_keypoints_3d[has_pose_3d == 1] - conf = conf[has_pose_3d == 1] - pred_keypoints_3d = pred_keypoints_3d[has_pose_3d == 1] - if len(gt_keypoints_3d) > 0: - gt_pelvis = (gt_keypoints_3d[:, 2, :] + gt_keypoints_3d[:, 3, :]) / 2 - gt_keypoints_3d = gt_keypoints_3d - gt_pelvis[:, None, :] - pred_pelvis = ( - pred_keypoints_3d[:, 2, :] + pred_keypoints_3d[:, 3, :]) / 2 - pred_keypoints_3d = pred_keypoints_3d - pred_pelvis[:, None, :] - return (conf * criterion_keypoints(pred_keypoints_3d, - gt_keypoints_3d)).mean() - else: - return paddle.to_tensor([1.]).fill_(0.) - - -@register -@serializable -def vertices_loss(criterion_vertices, pred_vertices, gt_vertices, has_smpl): - """ - Compute per-vertex loss if vertex annotations are available. - """ - pred_vertices_with_shape = pred_vertices[has_smpl == 1] - gt_vertices_with_shape = gt_vertices[has_smpl == 1] - if len(gt_vertices_with_shape) > 0: - return criterion_vertices(pred_vertices_with_shape, - gt_vertices_with_shape) - else: - return paddle.to_tensor([1.]).fill_(0.) 
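Note on the 3D pose losses above: every MPJPE variant first pelvis-centers both skeletons via filter_3d_joints (joints 2 and 3 are averaged as the pelvis) before measuring joint distances. A minimal sketch of that step plus plain MPJPE, with a fabricated batch:

import paddle

pred = paddle.rand([2, 14, 3])   # fabricated batch: 2 poses x 14 joints x (x, y, z)
gt = paddle.rand([2, 14, 3])

# Pelvis-center both skeletons, mirroring filter_3d_joints.
pred = pred - ((pred[:, 2, :] + pred[:, 3, :]) / 2)[:, None, :]
gt = gt - ((gt[:, 2, :] + gt[:, 3, :]) / 2)[:, None, :]

# Plain MPJPE: mean Euclidean distance over joints and batch.
error = paddle.sqrt(((pred - gt) ** 2).sum(axis=-1)).mean()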
- - -@register -@serializable -def rectify_pose(pose): - pose = pose.copy() - R_mod = cv2.Rodrigues(np.array([np.pi, 0, 0]))[0] - R_root = cv2.Rodrigues(pose[:3])[0] - new_root = R_root.dot(R_mod) - pose[:3] = cv2.Rodrigues(new_root)[0].reshape(3) - return pose diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/probiou_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/probiou_loss.py deleted file mode 100644 index c2a1c75..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/probiou_loss.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import numpy as np - -import paddle -import paddle.nn.functional as F - -from ppdet.core.workspace import register, serializable - -__all__ = ['ProbIoULoss'] - - -def gbb_form(boxes): - xy, wh, angle = paddle.split(boxes, [2, 2, 1], axis=-1) - return paddle.concat([xy, wh.pow(2) / 12., angle], axis=-1) - - -def rotated_form(a_, b_, angles): - cos_a = paddle.cos(angles) - sin_a = paddle.sin(angles) - a = a_ * paddle.pow(cos_a, 2) + b_ * paddle.pow(sin_a, 2) - b = a_ * paddle.pow(sin_a, 2) + b_ * paddle.pow(cos_a, 2) - c = (a_ - b_) * cos_a * sin_a - return a, b, c - - -def probiou_loss(pred, target, eps=1e-3, mode='l1'): - """ - pred -> a matrix [N,5](x,y,w,h,angle - in radians) containing ours predicted box ;in case of HBB angle == 0 - target -> a matrix [N,5](x,y,w,h,angle - in radians) containing ours target box ;in case of HBB angle == 0 - eps -> threshold to avoid infinite values - mode -> ('l1' in [0,1] or 'l2' in [0,inf]) metrics according our paper - - """ - - gbboxes1 = gbb_form(pred) - gbboxes2 = gbb_form(target) - - x1, y1, a1_, b1_, c1_ = gbboxes1[:, - 0], gbboxes1[:, - 1], gbboxes1[:, - 2], gbboxes1[:, - 3], gbboxes1[:, - 4] - x2, y2, a2_, b2_, c2_ = gbboxes2[:, - 0], gbboxes2[:, - 1], gbboxes2[:, - 2], gbboxes2[:, - 3], gbboxes2[:, - 4] - - a1, b1, c1 = rotated_form(a1_, b1_, c1_) - a2, b2, c2 = rotated_form(a2_, b2_, c2_) - - t1 = 0.25 * ((a1 + a2) * (paddle.pow(y1 - y2, 2)) + (b1 + b2) * (paddle.pow(x1 - x2, 2))) + \ - 0.5 * ((c1+c2)*(x2-x1)*(y1-y2)) - t2 = (a1 + a2) * (b1 + b2) - paddle.pow(c1 + c2, 2) - t3_ = (a1 * b1 - c1 * c1) * (a2 * b2 - c2 * c2) - t3 = 0.5 * paddle.log(t2 / (4 * paddle.sqrt(F.relu(t3_)) + eps)) - - B_d = (t1 / t2) + t3 - # B_d = t1 + t2 + t3 - - B_d = paddle.clip(B_d, min=eps, max=100.0) - l1 = paddle.sqrt(1.0 - paddle.exp(-B_d) + eps) - l_i = paddle.pow(l1, 2.0) - l2 = -paddle.log(1.0 - l_i + eps) - - if mode == 'l1': - probiou = l1 - if mode == 'l2': - probiou = l2 - - return probiou - - -@serializable -@register -class ProbIoULoss(object): - """ ProbIoU Loss, refer to https://arxiv.org/abs/2106.06072 for details """ - - def __init__(self, mode='l1', eps=1e-3): - super(ProbIoULoss, self).__init__() - self.mode = mode - self.eps = eps - - def __call__(self, pred_rboxes, assigned_rboxes): - return 
probiou_loss(pred_rboxes, assigned_rboxes, self.eps, self.mode) diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/queryinst_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/queryinst_loss.py deleted file mode 100644 index 640b9b4..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/queryinst_loss.py +++ /dev/null @@ -1,175 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.losses.iou_loss import GIoULoss -from .sparsercnn_loss import HungarianMatcher - -__all__ = ['QueryInstLoss'] - - -@register -class QueryInstLoss(object): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - focal_loss_alpha=0.25, - focal_loss_gamma=2.0, - class_weight=2.0, - l1_weight=5.0, - giou_weight=2.0, - mask_weight=8.0): - super(QueryInstLoss, self).__init__() - - self.num_classes = num_classes - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - self.loss_weights = { - "loss_cls": class_weight, - "loss_bbox": l1_weight, - "loss_giou": giou_weight, - "loss_mask": mask_weight - } - self.giou_loss = GIoULoss(eps=1e-6, reduction='sum') - - self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, - class_weight, l1_weight, giou_weight) - - def loss_classes(self, class_logits, targets, indices, avg_factor): - tgt_labels = paddle.full( - class_logits.shape[:2], self.num_classes, dtype='int32') - - if sum(len(v['labels']) for v in targets) > 0: - tgt_classes = paddle.concat([ - paddle.gather( - tgt['labels'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ]) - batch_idx, src_idx = self._get_src_permutation_idx(indices) - for i, (batch_i, src_i) in enumerate(zip(batch_idx, src_idx)): - tgt_labels[int(batch_i), int(src_i)] = tgt_classes[i] - - tgt_labels = tgt_labels.flatten(0, 1).unsqueeze(-1) - - tgt_labels_onehot = paddle.cast( - tgt_labels == paddle.arange(0, self.num_classes), dtype='float32') - tgt_labels_onehot.stop_gradient = True - - src_logits = class_logits.flatten(0, 1) - - loss_cls = F.sigmoid_focal_loss( - src_logits, - tgt_labels_onehot, - alpha=self.focal_loss_alpha, - gamma=self.focal_loss_gamma, - reduction='sum') / avg_factor - losses = {'loss_cls': loss_cls * self.loss_weights['loss_cls']} - return losses - - def loss_bboxes(self, bbox_pred, targets, indices, avg_factor): - bboxes = paddle.concat([ - paddle.gather( - src, src_idx, axis=0) - for src, (src_idx, _) in zip(bbox_pred, indices) - ]) - - tgt_bboxes = paddle.concat([ - paddle.gather( - tgt['boxes'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ]) - tgt_bboxes.stop_gradient = True - - im_shapes = paddle.concat([tgt['img_whwh_tgt'] for tgt in targets]) - bboxes_norm = bboxes / im_shapes - tgt_bboxes_norm = tgt_bboxes / im_shapes - 
- loss_giou = self.giou_loss(bboxes, tgt_bboxes) / avg_factor - loss_bbox = F.l1_loss( - bboxes_norm, tgt_bboxes_norm, reduction='sum') / avg_factor - losses = { - 'loss_bbox': loss_bbox * self.loss_weights['loss_bbox'], - 'loss_giou': loss_giou * self.loss_weights['loss_giou'] - } - return losses - - def loss_masks(self, pos_bbox_pred, mask_logits, targets, indices, - avg_factor): - tgt_segm = [ - paddle.gather( - tgt['gt_segm'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ] - - tgt_masks = [] - for i in range(len(indices)): - gt_segm = tgt_segm[i].unsqueeze(1) - if len(gt_segm) == 0: - continue - boxes = pos_bbox_pred[i] - boxes[:, 0::2] = paddle.clip( - boxes[:, 0::2], min=0, max=gt_segm.shape[3]) - boxes[:, 1::2] = paddle.clip( - boxes[:, 1::2], min=0, max=gt_segm.shape[2]) - boxes_num = paddle.to_tensor([1] * len(boxes), dtype='int32') - gt_mask = paddle.vision.ops.roi_align( - gt_segm, - boxes, - boxes_num, - output_size=mask_logits.shape[-2:], - aligned=True) - tgt_masks.append(gt_mask) - tgt_masks = paddle.concat(tgt_masks).squeeze(1) - tgt_masks = paddle.cast(tgt_masks >= 0.5, dtype='float32') - tgt_masks.stop_gradient = True - - tgt_labels = paddle.concat([ - paddle.gather( - tgt['labels'], tgt_idx, axis=0) - for tgt, (_, tgt_idx) in zip(targets, indices) - ]) - - mask_label = F.one_hot(tgt_labels, self.num_classes).unsqueeze([2, 3]) - mask_label = paddle.expand_as(mask_label, mask_logits) - mask_label.stop_gradient = True - - src_masks = paddle.gather_nd(mask_logits, paddle.nonzero(mask_label)) - shape = mask_logits.shape - src_masks = paddle.reshape(src_masks, [shape[0], shape[2], shape[3]]) - src_masks = F.sigmoid(src_masks) - - X = src_masks.flatten(1) - Y = tgt_masks.flatten(1) - inter = paddle.sum(X * Y, 1) - union = paddle.sum(X * X, 1) + paddle.sum(Y * Y, 1) - dice = (2 * inter) / (union + 2e-5) - - loss_mask = (1 - dice).sum() / avg_factor - losses = {'loss_mask': loss_mask * self.loss_weights['loss_mask']} - return losses - - @staticmethod - def _get_src_permutation_idx(indices): - batch_idx = paddle.concat( - [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = paddle.concat([src for (src, _) in indices]) - return batch_idx, src_idx diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/smooth_l1_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/smooth_l1_loss.py deleted file mode 100644 index f89c28f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/smooth_l1_loss.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -__all__ = ['SmoothL1Loss'] - -@register -class SmoothL1Loss(nn.Layer): - """Smooth L1 Loss. 
-    Args:
-        beta (float): controls smooth region, it becomes L1 Loss when beta=0.0
-        loss_weight (float): the final loss will be multiplied by this
-    """
-    def __init__(self,
-                 beta=1.0,
-                 loss_weight=1.0):
-        super(SmoothL1Loss, self).__init__()
-        assert beta >= 0
-        self.beta = beta
-        self.loss_weight = loss_weight
-
-    def forward(self, pred, target, reduction='none'):
-        """forward function, based on fvcore.
-        Args:
-            pred (Tensor): prediction tensor
-            target (Tensor): target tensor, pred.shape must be the same as target.shape
-            reduction (str): the way to reduce loss, one of (none, sum, mean)
-        """
-        assert reduction in ('none', 'sum', 'mean')
-        target = target.detach()
-        if self.beta < 1e-5:
-            loss = paddle.abs(pred - target)
-        else:
-            n = paddle.abs(pred - target)
-            cond = n < self.beta
-            loss = paddle.where(cond, 0.5 * n ** 2 / self.beta, n - 0.5 * self.beta)
-        if reduction == 'mean':
-            loss = loss.mean() if loss.size > 0 else 0.0 * loss.sum()
-        elif reduction == 'sum':
-            loss = loss.sum()
-        return loss * self.loss_weight
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/solov2_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/solov2_loss.py
deleted file mode 100644
index ef97a77..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/solov2_loss.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import paddle
-import paddle.nn.functional as F
-from ppdet.core.workspace import register, serializable
-
-__all__ = ['SOLOv2Loss']
-
-
-@register
-@serializable
-class SOLOv2Loss(object):
-    """
-    SOLOv2Loss
-    Args:
-        ins_loss_weight (float): Weight of instance loss.
-        focal_loss_gamma (float): Gamma parameter for focal loss.
-        focal_loss_alpha (float): Alpha parameter for focal loss.
-    """
-
-    def __init__(self,
-                 ins_loss_weight=3.0,
-                 focal_loss_gamma=2.0,
-                 focal_loss_alpha=0.25):
-        self.ins_loss_weight = ins_loss_weight
-        self.focal_loss_gamma = focal_loss_gamma
-        self.focal_loss_alpha = focal_loss_alpha
-
-    def _dice_loss(self, input, target):
-        input = paddle.reshape(input, shape=(paddle.shape(input)[0], -1))
-        target = paddle.reshape(target, shape=(paddle.shape(target)[0], -1))
-        a = paddle.sum(input * target, axis=1)
-        b = paddle.sum(input * input, axis=1) + 0.001
-        c = paddle.sum(target * target, axis=1) + 0.001
-        d = (2 * a) / (b + c)
-        return 1 - d
-
-    def __call__(self, ins_pred_list, ins_label_list, cate_preds, cate_labels,
-                 num_ins):
-        """
-        Get loss of network of SOLOv2.
-        Args:
-            ins_pred_list (list): Variable list of instance branch output.
-            ins_label_list (list): List of instance labels per batch.
-            cate_preds (list): Concat Variable list of category branch output.
-            cate_labels (list): Concat list of category labels per batch.
-            num_ins (int): Number of positive samples in a mini-batch.
-        Returns:
-            loss_ins (Variable): The instance loss Variable of SOLOv2 network.
-            loss_cate (Variable): The category loss Variable of SOLOv2 network.
-        """
-
-        #1. Use dice_loss to calculate instance loss
-        loss_ins = []
-        total_weights = paddle.zeros(shape=[1], dtype='float32')
-        for input, target in zip(ins_pred_list, ins_label_list):
-            if input is None:
-                continue
-            target = paddle.cast(target, 'float32')
-            target = paddle.reshape(
-                target,
-                shape=[-1, paddle.shape(input)[-2], paddle.shape(input)[-1]])
-            weights = paddle.cast(
-                paddle.sum(target, axis=[1, 2]) > 0, 'float32')
-            input = F.sigmoid(input)
-            dice_out = paddle.multiply(self._dice_loss(input, target), weights)
-            total_weights += paddle.sum(weights)
-            loss_ins.append(dice_out)
-        loss_ins = paddle.sum(paddle.concat(loss_ins)) / total_weights
-        loss_ins = loss_ins * self.ins_loss_weight
-
-        #2. Use sigmoid_focal_loss to calculate category loss
-        # expand onehot labels
-        num_classes = cate_preds.shape[-1]
-        cate_labels_bin = F.one_hot(cate_labels, num_classes=num_classes + 1)
-        cate_labels_bin = cate_labels_bin[:, 1:]
-
-        loss_cate = F.sigmoid_focal_loss(
-            cate_preds,
-            label=cate_labels_bin,
-            normalizer=num_ins + 1.,
-            gamma=self.focal_loss_gamma,
-            alpha=self.focal_loss_alpha)
-
-        return loss_ins, loss_cate
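(Aside: a minimal NumPy sketch of the dice loss computed by `_dice_loss` above, with `smooth` playing the role of the 0.001 terms; illustrative only, not part of the diff.)

import numpy as np

def dice_loss(pred, target, smooth=1e-3):
    """Per-instance dice loss over flattened masks, mirroring _dice_loss above."""
    pred = pred.reshape(pred.shape[0], -1)      # [N, H*W], sigmoid already applied
    target = target.reshape(target.shape[0], -1)
    a = (pred * target).sum(axis=1)             # soft intersection
    b = (pred * pred).sum(axis=1) + smooth
    c = (target * target).sum(axis=1) + smooth
    return 1.0 - (2.0 * a) / (b + c)            # [N]; 0 = perfect overlap

# e.g. a near-perfect prediction yields a loss near 0:
pred = np.array([[0.9, 0.1, 0.8, 0.2]])
target = np.array([[1.0, 0.0, 1.0, 0.0]])
print(dice_loss(pred, target))  # ~[0.03]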
diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/sparsercnn_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/sparsercnn_loss.py
deleted file mode 100644
index ac9eba6..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/losses/sparsercnn_loss.py
+++ /dev/null
@@ -1,430 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is based on https://github.com/PeizeSun/SparseR-CNN/blob/main/projects/SparseRCNN/sparsercnn/loss.py
-The copyright of PeizeSun/SparseR-CNN is as follows:
-MIT License [see LICENSE for details]
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from scipy.optimize import linear_sum_assignment
-import paddle
-import paddle.nn as nn
-import paddle.nn.functional as F
-from paddle.metric import accuracy
-from ppdet.core.workspace import register
-from ppdet.modeling.losses.iou_loss import GIoULoss
-
-__all__ = ["SparseRCNNLoss"]
-
-
-@register
-class SparseRCNNLoss(nn.Layer):
-    """ This class computes the loss for SparseRCNN.
-    The process happens in two steps:
-    1) we compute Hungarian assignment between ground truth boxes and the outputs of the model
-    2) we supervise each pair of matched ground-truth / prediction (supervise class and box)
-    """
-    __shared__ = ['num_classes']
-
-    def __init__(self,
-                 losses,
-                 focal_loss_alpha,
-                 focal_loss_gamma,
-                 num_classes=80,
-                 class_weight=2.,
-                 l1_weight=5.,
-                 giou_weight=2.):
-        """ Create the criterion.
-        Parameters:
-            num_classes: number of object categories, omitting the special no-object category
-            weight_dict: dict containing as key the names of the losses and as values their relative weight.
- losses: list of all the losses to be applied. See get_loss for list of available losses. - matcher: module able to compute a matching between targets and proposals - """ - super().__init__() - self.num_classes = num_classes - weight_dict = { - "loss_ce": class_weight, - "loss_bbox": l1_weight, - "loss_giou": giou_weight - } - self.weight_dict = weight_dict - self.losses = losses - self.giou_loss = GIoULoss(reduction="sum") - - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - - self.matcher = HungarianMatcher(focal_loss_alpha, focal_loss_gamma, - class_weight, l1_weight, giou_weight) - - def loss_labels(self, outputs, targets, indices, num_boxes, log=True): - """Classification loss (NLL) - targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] - """ - assert 'pred_logits' in outputs - src_logits = outputs['pred_logits'] - - idx = self._get_src_permutation_idx(indices) - target_classes_o = paddle.concat([ - paddle.gather( - t["labels"], J, axis=0) for t, (_, J) in zip(targets, indices) - ]) - target_classes = paddle.full( - src_logits.shape[:2], self.num_classes, dtype="int32") - for i, ind in enumerate(zip(idx[0], idx[1])): - target_classes[int(ind[0]), int(ind[1])] = target_classes_o[i] - target_classes.stop_gradient = True - - src_logits = src_logits.flatten(start_axis=0, stop_axis=1) - - # prepare one_hot target. - target_classes = target_classes.flatten(start_axis=0, stop_axis=1) - class_ids = paddle.arange(0, self.num_classes) - labels = (target_classes.unsqueeze(-1) == class_ids).astype("float32") - labels.stop_gradient = True - - # comp focal loss. - class_loss = sigmoid_focal_loss( - src_logits, - labels, - alpha=self.focal_loss_alpha, - gamma=self.focal_loss_gamma, - reduction="sum", ) / num_boxes - losses = {'loss_ce': class_loss} - - if log: - label_acc = target_classes_o.unsqueeze(-1) - src_idx = [src for (src, _) in indices] - - pred_list = [] - for i in range(outputs["pred_logits"].shape[0]): - pred_list.append( - paddle.gather( - outputs["pred_logits"][i], src_idx[i], axis=0)) - - pred = F.sigmoid(paddle.concat(pred_list, axis=0)) - acc = accuracy(pred, label_acc.astype("int64")) - losses["acc"] = acc - - return losses - - def loss_boxes(self, outputs, targets, indices, num_boxes): - """Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss - targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4] - The target boxes are expected in format (center_x, center_y, w, h), normalized by the image size. 
- """ - assert 'pred_boxes' in outputs # [batch_size, num_proposals, 4] - src_idx = [src for (src, _) in indices] - src_boxes_list = [] - - for i in range(outputs["pred_boxes"].shape[0]): - src_boxes_list.append( - paddle.gather( - outputs["pred_boxes"][i], src_idx[i], axis=0)) - - src_boxes = paddle.concat(src_boxes_list, axis=0) - - target_boxes = paddle.concat( - [ - paddle.gather( - t['boxes'], I, axis=0) - for t, (_, I) in zip(targets, indices) - ], - axis=0) - target_boxes.stop_gradient = True - losses = {} - - losses['loss_giou'] = self.giou_loss(src_boxes, - target_boxes) / num_boxes - - image_size = paddle.concat([v["img_whwh_tgt"] for v in targets]) - src_boxes_ = src_boxes / image_size - target_boxes_ = target_boxes / image_size - - loss_bbox = F.l1_loss(src_boxes_, target_boxes_, reduction='sum') - losses['loss_bbox'] = loss_bbox / num_boxes - - return losses - - def _get_src_permutation_idx(self, indices): - # permute predictions following indices - batch_idx = paddle.concat( - [paddle.full_like(src, i) for i, (src, _) in enumerate(indices)]) - src_idx = paddle.concat([src for (src, _) in indices]) - return batch_idx, src_idx - - def _get_tgt_permutation_idx(self, indices): - # permute targets following indices - batch_idx = paddle.concat( - [paddle.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)]) - tgt_idx = paddle.concat([tgt for (_, tgt) in indices]) - return batch_idx, tgt_idx - - def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): - loss_map = { - 'labels': self.loss_labels, - 'boxes': self.loss_boxes, - } - assert loss in loss_map, f'do you really want to compute {loss} loss?' - return loss_map[loss](outputs, targets, indices, num_boxes, **kwargs) - - def forward(self, outputs, targets): - """ This performs the loss computation. - Parameters: - outputs: dict of tensors, see the output specification of the model for the format - targets: list of dicts, such that len(targets) == batch_size. - The expected keys in each dict depends on the losses applied, see each loss' doc - """ - outputs_without_aux = { - k: v - for k, v in outputs.items() if k != 'aux_outputs' - } - - # Retrieve the matching between the outputs of the last layer and the targets - indices = self.matcher(outputs_without_aux, targets) - - # Compute the average number of target boxes across all nodes, for normalization purposes - num_boxes = sum(len(t["labels"]) for t in targets) - num_boxes = paddle.to_tensor( - [num_boxes], - dtype="float32", - place=next(iter(outputs.values())).place) - - # Compute all the requested losses - losses = {} - for loss in self.losses: - losses.update( - self.get_loss(loss, outputs, targets, indices, num_boxes)) - - # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. 
- if 'aux_outputs' in outputs: - for i, aux_outputs in enumerate(outputs['aux_outputs']): - indices = self.matcher(aux_outputs, targets) - for loss in self.losses: - kwargs = {} - if loss == 'labels': - # Logging is enabled only for the last layer - kwargs = {'log': False} - l_dict = self.get_loss(loss, aux_outputs, targets, indices, - num_boxes, **kwargs) - - w_dict = {} - for k in l_dict.keys(): - if k in self.weight_dict: - w_dict[k + f'_{i}'] = l_dict[k] * self.weight_dict[ - k] - else: - w_dict[k + f'_{i}'] = l_dict[k] - losses.update(w_dict) - - return losses - - -class HungarianMatcher(nn.Layer): - """This class computes an assignment between the targets and the predictions of the network - For efficiency reasons, the targets don't include the no_object. Because of this, in general, - there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, - while the others are un-matched (and thus treated as non-objects). - """ - - def __init__(self, - focal_loss_alpha, - focal_loss_gamma, - cost_class: float=1, - cost_bbox: float=1, - cost_giou: float=1): - """Creates the matcher - Params: - cost_class: This is the relative weight of the classification error in the matching cost - cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost - cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost - """ - super().__init__() - self.cost_class = cost_class - self.cost_bbox = cost_bbox - self.cost_giou = cost_giou - self.focal_loss_alpha = focal_loss_alpha - self.focal_loss_gamma = focal_loss_gamma - assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" - - @paddle.no_grad() - def forward(self, outputs, targets): - """ Performs the matching - Args: - outputs: This is a dict that contains at least these entries: - "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits - "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates - eg. outputs = {"pred_logits": pred_logits, "pred_boxes": pred_boxes} - targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: - "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth - objects in the target) containing the class labels - "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates - eg. 
targets = [{"labels":labels, "boxes": boxes}, ...,{"labels":labels, "boxes": boxes}] - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - bs, num_queries = outputs["pred_logits"].shape[:2] - - if sum(len(v["labels"]) for v in targets) == 0: - return [(paddle.to_tensor( - [], dtype=paddle.int64), paddle.to_tensor( - [], dtype=paddle.int64)) for _ in range(bs)] - - # We flatten to compute the cost matrices in a batch - out_prob = F.sigmoid(outputs["pred_logits"].flatten( - start_axis=0, stop_axis=1)) - out_bbox = outputs["pred_boxes"].flatten(start_axis=0, stop_axis=1) - - # Also concat the target labels and boxes - tgt_ids = paddle.concat([v["labels"] for v in targets]) - assert (tgt_ids > -1).all() - tgt_bbox = paddle.concat([v["boxes"] for v in targets]) - - # Compute the classification cost. Contrary to the loss, we don't use the NLL, - # but approximate it in 1 - proba[target class]. - # The 1 is a constant that doesn't change the matching, it can be ommitted. - - # Compute the classification cost. - alpha = self.focal_loss_alpha - gamma = self.focal_loss_gamma - - neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-( - 1 - out_prob + 1e-8).log()) - pos_cost_class = alpha * ((1 - out_prob) - **gamma) * (-(out_prob + 1e-8).log()) - - cost_class = paddle.gather( - pos_cost_class, tgt_ids, axis=1) - paddle.gather( - neg_cost_class, tgt_ids, axis=1) - - # Compute the L1 cost between boxes - image_size_out = paddle.concat( - [v["img_whwh"].unsqueeze(0) for v in targets]) - image_size_out = image_size_out.unsqueeze(1).tile( - [1, num_queries, 1]).flatten( - start_axis=0, stop_axis=1) - image_size_tgt = paddle.concat([v["img_whwh_tgt"] for v in targets]) - - out_bbox_ = out_bbox / image_size_out - tgt_bbox_ = tgt_bbox / image_size_tgt - cost_bbox = F.l1_loss( - out_bbox_.unsqueeze(-2), tgt_bbox_, - reduction='none').sum(-1) # [batch_size * num_queries, num_tgts] - - # Compute the giou cost betwen boxes - cost_giou = -get_bboxes_giou(out_bbox, tgt_bbox) - - # Final cost matrix - C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou - C = C.reshape([bs, num_queries, -1]) - - sizes = [len(v["boxes"]) for v in targets] - - indices = [ - linear_sum_assignment(c[i].numpy()) - for i, c in enumerate(C.split(sizes, -1)) - ] - return [(paddle.to_tensor( - i, dtype="int32"), paddle.to_tensor( - j, dtype="int32")) for i, j in indices] - - -def box_area(boxes): - assert (boxes[:, 2:] >= boxes[:, :2]).all() - wh = boxes[:, 2:] - boxes[:, :2] - return wh[:, 0] * wh[:, 1] - - -def boxes_iou(boxes1, boxes2): - ''' - Compute iou - - Args: - boxes1 (paddle.tensor) shape (N, 4) - boxes2 (paddle.tensor) shape (M, 4) - - Return: - (paddle.tensor) shape (N, M) - ''' - area1 = box_area(boxes1) - area2 = box_area(boxes2) - - lt = paddle.maximum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) - rb = paddle.minimum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) - - wh = (rb - lt).astype("float32").clip(min=1e-9) - inter = wh[:, :, 0] * wh[:, :, 1] - - union = area1.unsqueeze(-1) + area2 - inter + 1e-9 - - iou = inter / union - return iou, union - - -def get_bboxes_giou(boxes1, boxes2, eps=1e-9): - """calculate the ious of boxes1 and boxes2 - - Args: - boxes1 (Tensor): shape [N, 4] - boxes2 (Tensor): 
shape [M, 4] - eps (float): epsilon to avoid divide by zero - - Return: - ious (Tensor): ious of boxes1 and boxes2, with the shape [N, M] - """ - assert (boxes1[:, 2:] >= boxes1[:, :2]).all() - assert (boxes2[:, 2:] >= boxes2[:, :2]).all() - - iou, union = boxes_iou(boxes1, boxes2) - - lt = paddle.minimum(boxes1.unsqueeze(-2)[:, :, :2], boxes2[:, :2]) - rb = paddle.maximum(boxes1.unsqueeze(-2)[:, :, 2:], boxes2[:, 2:]) - - wh = (rb - lt).astype("float32").clip(min=eps) - enclose_area = wh[:, :, 0] * wh[:, :, 1] - - giou = iou - (enclose_area - union) / enclose_area - - return giou - - -def sigmoid_focal_loss(inputs, targets, alpha, gamma, reduction="sum"): - - assert reduction in ["sum", "mean" - ], f'do not support this {reduction} reduction?' - - p = F.sigmoid(inputs) - ce_loss = F.binary_cross_entropy_with_logits( - inputs, targets, reduction="none") - p_t = p * targets + (1 - p) * (1 - targets) - loss = ce_loss * ((1 - p_t)**gamma) - - if alpha >= 0: - alpha_t = alpha * targets + (1 - alpha) * (1 - targets) - loss = alpha_t * loss - - if reduction == "mean": - loss = loss.mean() - elif reduction == "sum": - loss = loss.sum() - - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/ssd_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/ssd_loss.py deleted file mode 100644 index 2ab94f2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/ssd_loss.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from ..bbox_utils import iou_similarity, bbox2delta - -__all__ = ['SSDLoss'] - - -@register -class SSDLoss(nn.Layer): - """ - SSDLoss - - Args: - overlap_threshold (float32, optional): IoU threshold for negative bboxes - and positive bboxes, 0.5 by default. - neg_pos_ratio (float): The ratio of negative samples / positive samples. - loc_loss_weight (float): The weight of loc_loss. - conf_loss_weight (float): The weight of conf_loss. - prior_box_var (list): Variances corresponding to prior box coord, [0.1, - 0.1, 0.2, 0.2] by default. - """ - - def __init__(self, - overlap_threshold=0.5, - neg_pos_ratio=3.0, - loc_loss_weight=1.0, - conf_loss_weight=1.0, - prior_box_var=[0.1, 0.1, 0.2, 0.2]): - super(SSDLoss, self).__init__() - self.overlap_threshold = overlap_threshold - self.neg_pos_ratio = neg_pos_ratio - self.loc_loss_weight = loc_loss_weight - self.conf_loss_weight = conf_loss_weight - self.prior_box_var = [1. 
/ a for a in prior_box_var] - - def _bipartite_match_for_batch(self, gt_bbox, gt_label, prior_boxes, - bg_index): - """ - Args: - gt_bbox (Tensor): [B, N, 4] - gt_label (Tensor): [B, N, 1] - prior_boxes (Tensor): [A, 4] - bg_index (int): Background class index - """ - batch_size, num_priors = gt_bbox.shape[0], prior_boxes.shape[0] - ious = iou_similarity(gt_bbox.reshape((-1, 4)), prior_boxes).reshape( - (batch_size, -1, num_priors)) - - # For each prior box, get the max IoU of all GTs. - prior_max_iou, prior_argmax_iou = ious.max(axis=1), ious.argmax(axis=1) - # For each GT, get the max IoU of all prior boxes. - gt_max_iou, gt_argmax_iou = ious.max(axis=2), ious.argmax(axis=2) - - # Gather target bbox and label according to 'prior_argmax_iou' index. - batch_ind = paddle.arange(end=batch_size, dtype='int64').unsqueeze(-1) - prior_argmax_iou = paddle.stack( - [batch_ind.tile([1, num_priors]), prior_argmax_iou], axis=-1) - targets_bbox = paddle.gather_nd(gt_bbox, prior_argmax_iou) - targets_label = paddle.gather_nd(gt_label, prior_argmax_iou) - # Assign negative - bg_index_tensor = paddle.full([batch_size, num_priors, 1], bg_index, - 'int64') - targets_label = paddle.where( - prior_max_iou.unsqueeze(-1) < self.overlap_threshold, - bg_index_tensor, targets_label) - - # Ensure each GT can match the max IoU prior box. - batch_ind = (batch_ind * num_priors + gt_argmax_iou).flatten() - targets_bbox = paddle.scatter( - targets_bbox.reshape([-1, 4]), batch_ind, - gt_bbox.reshape([-1, 4])).reshape([batch_size, -1, 4]) - targets_label = paddle.scatter( - targets_label.reshape([-1, 1]), batch_ind, - gt_label.reshape([-1, 1])).reshape([batch_size, -1, 1]) - targets_label[:, :1] = bg_index - - # Encode box - prior_boxes = prior_boxes.unsqueeze(0).tile([batch_size, 1, 1]) - targets_bbox = bbox2delta( - prior_boxes.reshape([-1, 4]), - targets_bbox.reshape([-1, 4]), self.prior_box_var) - targets_bbox = targets_bbox.reshape([batch_size, -1, 4]) - - return targets_bbox, targets_label - - def _mine_hard_example(self, - conf_loss, - targets_label, - bg_index, - mine_neg_ratio=0.01): - pos = (targets_label != bg_index).astype(conf_loss.dtype) - num_pos = pos.sum(axis=1, keepdim=True) - neg = (targets_label == bg_index).astype(conf_loss.dtype) - - conf_loss = conf_loss.detach() * neg - loss_idx = conf_loss.argsort(axis=1, descending=True) - idx_rank = loss_idx.argsort(axis=1) - num_negs = [] - for i in range(conf_loss.shape[0]): - cur_num_pos = num_pos[i] - num_neg = paddle.clip( - cur_num_pos * self.neg_pos_ratio, max=pos.shape[1]) - num_neg = num_neg if num_neg > 0 else paddle.to_tensor( - [pos.shape[1] * mine_neg_ratio]) - num_negs.append(num_neg) - num_negs = paddle.stack(num_negs).expand_as(idx_rank) - neg_mask = (idx_rank < num_negs).astype(conf_loss.dtype) - - return (neg_mask + pos).astype('bool') - - def forward(self, boxes, scores, gt_bbox, gt_label, prior_boxes): - boxes = paddle.concat(boxes, axis=1) - scores = paddle.concat(scores, axis=1) - gt_label = gt_label.unsqueeze(-1).astype('int64') - prior_boxes = paddle.concat(prior_boxes, axis=0) - bg_index = scores.shape[-1] - 1 - - # Match bbox and get targets. - targets_bbox, targets_label = \ - self._bipartite_match_for_batch(gt_bbox, gt_label, prior_boxes, bg_index) - targets_bbox.stop_gradient = True - targets_label.stop_gradient = True - - # Compute regression loss. - # Select positive samples. 
- bbox_mask = paddle.tile(targets_label != bg_index, [1, 1, 4]) - if bbox_mask.astype(boxes.dtype).sum() > 0: - location = paddle.masked_select(boxes, bbox_mask) - targets_bbox = paddle.masked_select(targets_bbox, bbox_mask) - loc_loss = F.smooth_l1_loss(location, targets_bbox, reduction='sum') - loc_loss = loc_loss * self.loc_loss_weight - else: - loc_loss = paddle.zeros([1]) - - # Compute confidence loss. - conf_loss = F.cross_entropy(scores, targets_label, reduction="none") - # Mining hard examples. - label_mask = self._mine_hard_example( - conf_loss.squeeze(-1), targets_label.squeeze(-1), bg_index) - conf_loss = paddle.masked_select(conf_loss, label_mask.unsqueeze(-1)) - conf_loss = conf_loss.sum() * self.conf_loss_weight - - # Compute overall weighted loss. - normalizer = (targets_label != bg_index).astype('float32').sum().clip( - min=1) - loss = (conf_loss + loc_loss) / normalizer - - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/supcontrast.py b/pdfdet/models/Paddle/ppdet/modeling/losses/supcontrast.py deleted file mode 100644 index 3e59f08..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/supcontrast.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
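(Aside on `_mine_hard_example` in the ssd_loss.py hunk above: negatives are ranked by their detached confidence loss and only the top `neg_pos_ratio * num_pos` are kept. A minimal NumPy sketch of that selection, illustrative only and not part of the diff.)

import numpy as np

def mine_hard_negatives(conf_loss, is_pos, neg_pos_ratio=3.0):
    """Keep all positives plus the hardest negatives for one image.
    conf_loss: [A] per-anchor confidence loss; is_pos: [A] bool mask."""
    neg_loss = np.where(is_pos, 0.0, conf_loss)  # positives never compete
    order = np.argsort(-neg_loss)                # hardest negatives first
    rank = np.argsort(order)                     # each anchor's hardness rank
    num_neg = int(neg_pos_ratio * is_pos.sum())
    return is_pos | (rank < num_neg)

conf_loss = np.array([2.0, 0.1, 1.5, 0.3, 0.8])
is_pos = np.array([True, False, False, False, False])
print(mine_hard_negatives(conf_loss, is_pos))  # the positive + the 3 hardest negatives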
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -import random -from ppdet.core.workspace import register - - -__all__ = ['SupContrast'] - - -@register -class SupContrast(nn.Layer): - __shared__ = [ - 'num_classes' - ] - def __init__(self, num_classes=80, temperature=2.5, sample_num=4096, thresh=0.75): - super(SupContrast, self).__init__() - self.num_classes = num_classes - self.temperature = temperature - self.sample_num = sample_num - self.thresh = thresh - def forward(self, features, labels, scores): - - assert features.shape[0] == labels.shape[0] == scores.shape[0] - positive_mask = (labels < self.num_classes) - positive_features, positive_labels, positive_scores = features[positive_mask], labels[positive_mask], \ - scores[positive_mask] - - negative_mask = (labels == self.num_classes) - negative_features, negative_labels, negative_scores = features[negative_mask], labels[negative_mask], \ - scores[negative_mask] - - N = negative_features.shape[0] - S = self.sample_num - positive_mask.sum() - index = paddle.to_tensor(random.sample(range(N), int(S)), dtype='int32') - - negative_features = paddle.index_select(x=negative_features, index=index, axis=0) - negative_labels = paddle.index_select(x=negative_labels, index=index, axis=0) - negative_scores = paddle.index_select(x=negative_scores, index=index, axis=0) - - features = paddle.concat([positive_features, negative_features], 0) - labels = paddle.concat([positive_labels, negative_labels], 0) - scores = paddle.concat([positive_scores, negative_scores], 0) - - if len(labels.shape) == 1: - labels = labels.reshape([-1, 1]) - label_mask = paddle.equal(labels, labels.T).detach() - similarity = (paddle.matmul(features, features.T) / self.temperature) - - sim_row_max = paddle.max(similarity, axis=1, keepdim=True) - similarity = similarity - sim_row_max - - logits_mask = paddle.ones_like(similarity).detach() - logits_mask.fill_diagonal_(0) - - exp_sim = paddle.exp(similarity) * logits_mask - log_prob = similarity - paddle.log(exp_sim.sum(axis=1, keepdim=True)) - - per_label_log_prob = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1) - keep = scores > self.thresh - per_label_log_prob = per_label_log_prob[keep] - loss = -per_label_log_prob - - return loss.mean() \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/varifocal_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/varifocal_loss.py deleted file mode 100644 index 42d18a6..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/varifocal_loss.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
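(Aside on the SupContrast loss in the supcontrast.py hunk above: after sampling negatives it computes an InfoNCE-style log-probability in which anchors sharing a label act as positives. A stripped-down NumPy sketch under those assumptions, with no negative sampling and no score threshold; illustrative only, not part of the diff.)

import numpy as np

def sup_contrast(features, labels, temperature=2.5):
    """Mean supervised-contrastive loss; features: [N, D] L2-normalized, labels: [N]."""
    sim = features @ features.T / temperature          # pairwise similarity logits
    sim -= sim.max(axis=1, keepdims=True)              # row-max subtraction, as above
    logits_mask = 1.0 - np.eye(len(labels))            # exclude self-pairs
    label_mask = (labels[:, None] == labels[None, :]).astype(float)
    exp_sim = np.exp(sim) * logits_mask
    log_prob = sim - np.log(exp_sim.sum(axis=1, keepdims=True))
    per_label = (log_prob * logits_mask * label_mask).sum(1) / label_mask.sum(1)
    return -per_label.mean()

feats = np.random.rand(6, 16)
feats /= np.linalg.norm(feats, axis=1, keepdims=True)
print(sup_contrast(feats, np.array([0, 0, 1, 1, 2, 2])))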
- -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/losses/varifocal_loss.py - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling import ops - -__all__ = ['VarifocalLoss'] - - -def varifocal_loss(pred, - target, - alpha=0.75, - gamma=2.0, - iou_weighted=True, - use_sigmoid=True): - """`Varifocal Loss `_ - - Args: - pred (Tensor): The prediction with shape (N, C), C is the - number of classes - target (Tensor): The learning target of the iou-aware - classification score with shape (N, C), C is the number of classes. - alpha (float, optional): A balance factor for the negative part of - Varifocal Loss, which is different from the alpha of Focal Loss. - Defaults to 0.75. - gamma (float, optional): The gamma for calculating the modulating - factor. Defaults to 2.0. - iou_weighted (bool, optional): Whether to weight the loss of the - positive example with the iou target. Defaults to True. - """ - # pred and target should be of the same size - assert pred.shape == target.shape - if use_sigmoid: - pred_new = F.sigmoid(pred) - else: - pred_new = pred - target = target.cast(pred.dtype) - if iou_weighted: - focal_weight = target * (target > 0.0).cast('float32') + \ - alpha * (pred_new - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - else: - focal_weight = (target > 0.0).cast('float32') + \ - alpha * (pred_new - target).abs().pow(gamma) * \ - (target <= 0.0).cast('float32') - - if use_sigmoid: - loss = F.binary_cross_entropy_with_logits( - pred, target, reduction='none') * focal_weight - else: - loss = F.binary_cross_entropy( - pred, target, reduction='none') * focal_weight - loss = loss.sum(axis=1) - return loss - - -@register -@serializable -class VarifocalLoss(nn.Layer): - def __init__(self, - use_sigmoid=True, - alpha=0.75, - gamma=2.0, - iou_weighted=True, - reduction='mean', - loss_weight=1.0): - """`Varifocal Loss `_ - - Args: - use_sigmoid (bool, optional): Whether the prediction is - used for sigmoid or softmax. Defaults to True. - alpha (float, optional): A balance factor for the negative part of - Varifocal Loss, which is different from the alpha of Focal - Loss. Defaults to 0.75. - gamma (float, optional): The gamma for calculating the modulating - factor. Defaults to 2.0. - iou_weighted (bool, optional): Whether to weight the loss of the - positive examples with the iou target. Defaults to True. - reduction (str, optional): The method used to reduce the loss into - a scalar. Defaults to 'mean'. Options are "none", "mean" and - "sum". - loss_weight (float, optional): Weight of loss. Defaults to 1.0. - """ - super(VarifocalLoss, self).__init__() - assert alpha >= 0.0 - self.use_sigmoid = use_sigmoid - self.alpha = alpha - self.gamma = gamma - self.iou_weighted = iou_weighted - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, pred, target, weight=None, avg_factor=None): - """Forward function. - - Args: - pred (Tensor): The prediction. - target (Tensor): The learning target of the prediction. - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. 
- Returns: - Tensor: The calculated loss - """ - loss = self.loss_weight * varifocal_loss( - pred, - target, - alpha=self.alpha, - gamma=self.gamma, - iou_weighted=self.iou_weighted, - use_sigmoid=self.use_sigmoid) - - if weight is not None: - loss = loss * weight - if avg_factor is None: - if self.reduction == 'none': - return loss - elif self.reduction == 'mean': - return loss.mean() - elif self.reduction == 'sum': - return loss.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if self.reduction == 'mean': - loss = loss.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif self.reduction != 'none': - raise ValueError( - 'avg_factor can not be used with reduction="sum"') - return loss diff --git a/pdfdet/models/Paddle/ppdet/modeling/losses/yolo_loss.py b/pdfdet/models/Paddle/ppdet/modeling/losses/yolo_loss.py deleted file mode 100644 index fecef9a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/losses/yolo_loss.py +++ /dev/null @@ -1,207 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register - -from ..bbox_utils import decode_yolo, xywh2xyxy, batch_iou_similarity - -__all__ = ['YOLOv3Loss'] - - -def bbox_transform(pbox, anchor, downsample): - pbox = decode_yolo(pbox, anchor, downsample) - pbox = xywh2xyxy(pbox) - return pbox - - -@register -class YOLOv3Loss(nn.Layer): - - __inject__ = ['iou_loss', 'iou_aware_loss'] - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=80, - ignore_thresh=0.7, - label_smooth=False, - downsample=[32, 16, 8], - scale_x_y=1., - iou_loss=None, - iou_aware_loss=None): - """ - YOLOv3Loss layer - - Args: - num_calsses (int): number of foreground classes - ignore_thresh (float): threshold to ignore confidence loss - label_smooth (bool): whether to use label smoothing - downsample (list): downsample ratio for each detection block - scale_x_y (float): scale_x_y factor - iou_loss (object): IoULoss instance - iou_aware_loss (object): IouAwareLoss instance - """ - super(YOLOv3Loss, self).__init__() - self.num_classes = num_classes - self.ignore_thresh = ignore_thresh - self.label_smooth = label_smooth - self.downsample = downsample - self.scale_x_y = scale_x_y - self.iou_loss = iou_loss - self.iou_aware_loss = iou_aware_loss - self.distill_pairs = [] - - def obj_loss(self, pbox, gbox, pobj, tobj, anchor, downsample): - # pbox - pbox = decode_yolo(pbox, anchor, downsample) - pbox = xywh2xyxy(pbox) - pbox = paddle.concat(pbox, axis=-1) - b = pbox.shape[0] - pbox = pbox.reshape((b, -1, 4)) - # gbox - gxy = gbox[:, :, 0:2] - gbox[:, :, 2:4] * 0.5 - gwh = gbox[:, :, 0:2] + gbox[:, :, 2:4] * 0.5 - gbox = paddle.concat([gxy, gwh], axis=-1) - - iou = batch_iou_similarity(pbox, gbox) - iou.stop_gradient 
= True - iou_max = iou.max(2) # [N, M1] - iou_mask = paddle.cast(iou_max <= self.ignore_thresh, dtype=pbox.dtype) - iou_mask.stop_gradient = True - - pobj = pobj.reshape((b, -1)) - tobj = tobj.reshape((b, -1)) - obj_mask = paddle.cast(tobj > 0, dtype=pbox.dtype) - obj_mask.stop_gradient = True - - loss_obj = F.binary_cross_entropy_with_logits( - pobj, obj_mask, reduction='none') - loss_obj_pos = (loss_obj * tobj) - loss_obj_neg = (loss_obj * (1 - obj_mask) * iou_mask) - return loss_obj_pos + loss_obj_neg - - def cls_loss(self, pcls, tcls): - if self.label_smooth: - delta = min(1. / self.num_classes, 1. / 40) - pos, neg = 1 - delta, delta - # 1 for positive, 0 for negative - tcls = pos * paddle.cast( - tcls > 0., dtype=tcls.dtype) + neg * paddle.cast( - tcls <= 0., dtype=tcls.dtype) - - loss_cls = F.binary_cross_entropy_with_logits( - pcls, tcls, reduction='none') - return loss_cls - - def yolov3_loss(self, p, t, gt_box, anchor, downsample, scale=1., - eps=1e-10): - na = len(anchor) - b, c, h, w = p.shape - if self.iou_aware_loss: - ioup, p = p[:, 0:na, :, :], p[:, na:, :, :] - ioup = ioup.unsqueeze(-1) - p = p.reshape((b, na, -1, h, w)).transpose((0, 1, 3, 4, 2)) - x, y = p[:, :, :, :, 0:1], p[:, :, :, :, 1:2] - w, h = p[:, :, :, :, 2:3], p[:, :, :, :, 3:4] - obj, pcls = p[:, :, :, :, 4:5], p[:, :, :, :, 5:] - self.distill_pairs.append([x, y, w, h, obj, pcls]) - - t = t.transpose((0, 1, 3, 4, 2)) - tx, ty = t[:, :, :, :, 0:1], t[:, :, :, :, 1:2] - tw, th = t[:, :, :, :, 2:3], t[:, :, :, :, 3:4] - tscale = t[:, :, :, :, 4:5] - tobj, tcls = t[:, :, :, :, 5:6], t[:, :, :, :, 6:] - - tscale_obj = tscale * tobj - loss = dict() - - x = scale * F.sigmoid(x) - 0.5 * (scale - 1.) - y = scale * F.sigmoid(y) - 0.5 * (scale - 1.) - - if abs(scale - 1.) < eps: - loss_x = F.binary_cross_entropy(x, tx, reduction='none') - loss_y = F.binary_cross_entropy(y, ty, reduction='none') - loss_xy = tscale_obj * (loss_x + loss_y) - else: - loss_x = paddle.abs(x - tx) - loss_y = paddle.abs(y - ty) - loss_xy = tscale_obj * (loss_x + loss_y) - - loss_xy = loss_xy.sum([1, 2, 3, 4]).mean() - - loss_w = paddle.abs(w - tw) - loss_h = paddle.abs(h - th) - loss_wh = tscale_obj * (loss_w + loss_h) - loss_wh = loss_wh.sum([1, 2, 3, 4]).mean() - - loss['loss_xy'] = loss_xy - loss['loss_wh'] = loss_wh - - if self.iou_loss is not None: - # warn: do not modify x, y, w, h in place - box, tbox = [x, y, w, h], [tx, ty, tw, th] - pbox = bbox_transform(box, anchor, downsample) - gbox = bbox_transform(tbox, anchor, downsample) - loss_iou = self.iou_loss(pbox, gbox) - loss_iou = loss_iou * tscale_obj - loss_iou = loss_iou.sum([1, 2, 3, 4]).mean() - loss['loss_iou'] = loss_iou - - if self.iou_aware_loss is not None: - box, tbox = [x, y, w, h], [tx, ty, tw, th] - pbox = bbox_transform(box, anchor, downsample) - gbox = bbox_transform(tbox, anchor, downsample) - loss_iou_aware = self.iou_aware_loss(ioup, pbox, gbox) - loss_iou_aware = loss_iou_aware * tobj - loss_iou_aware = loss_iou_aware.sum([1, 2, 3, 4]).mean() - loss['loss_iou_aware'] = loss_iou_aware - - box = [x, y, w, h] - loss_obj = self.obj_loss(box, gt_box, obj, tobj, anchor, downsample) - loss_obj = loss_obj.sum(-1).mean() - loss['loss_obj'] = loss_obj - loss_cls = self.cls_loss(pcls, tcls) * tobj - loss_cls = loss_cls.sum([1, 2, 3, 4]).mean() - loss['loss_cls'] = loss_cls - return loss - - def forward(self, inputs, targets, anchors): - np = len(inputs) - gt_targets = [targets['target{}'.format(i)] for i in range(np)] - gt_box = targets['gt_bbox'] - yolo_losses = dict() - 
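# NOTE (illustrative, not part of the diff): the zip below walks one FPN level
# at a time; yolov3_loss returns a dict of partial losses per level and the
# loop sums them key-by-key, e.g. {'loss_xy': 1.2} + {'loss_xy': 0.9}
# -> {'loss_xy': 2.1}, with 'loss' set afterwards to the total over all keys.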
self.distill_pairs.clear() - for x, t, anchor, downsample in zip(inputs, gt_targets, anchors, - self.downsample): - yolo_loss = self.yolov3_loss( - x.astype('float32'), t, gt_box, anchor, downsample, - self.scale_x_y) - for k, v in yolo_loss.items(): - if k in yolo_losses: - yolo_losses[k] += v - else: - yolo_losses[k] = v - - loss = 0 - for k, v in yolo_losses.items(): - loss += v - - yolo_losses['loss'] = loss - return yolo_losses diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/__init__.py deleted file mode 100644 index 258e4c9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import matching -from . import tracker -from . import motion -from . import visualization -from . import utils - -from .matching import * -from .tracker import * -from .motion import * -from .visualization import * -from .utils import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/__init__.py deleted file mode 100644 index f6a88c5..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import jde_matching -from . import deepsort_matching -from . import ocsort_matching - -from .jde_matching import * -from .deepsort_matching import * -from .ocsort_matching import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/deepsort_matching.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/deepsort_matching.py deleted file mode 100644 index 3859ccf..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/deepsort_matching.py +++ /dev/null @@ -1,379 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This code is based on https://github.com/nwojke/deep_sort/tree/master/deep_sort
-"""
-
-import numpy as np
-from scipy.optimize import linear_sum_assignment
-from ..motion import kalman_filter
-
-INFTY_COST = 1e+5
-
-__all__ = [
-    'iou_1toN',
-    'iou_cost',
-    '_nn_euclidean_distance',
-    '_nn_cosine_distance',
-    'NearestNeighborDistanceMetric',
-    'min_cost_matching',
-    'matching_cascade',
-    'gate_cost_matrix',
-]
-
-
-def iou_1toN(bbox, candidates):
-    """
-    Compute intersection over union (IoU) by one box to N candidates.
-
-    Args:
-        bbox (ndarray): A bounding box in format `(top left x, top left y, width, height)`.
-        candidates (ndarray): A matrix of candidate bounding boxes (one per row) in the
-            same format as `bbox`.
-
-    Returns:
-        ious (ndarray): The intersection over union in [0, 1] between the `bbox`
-            and each candidate. A higher score means a larger fraction of the
-            `bbox` is occluded by the candidate.
-    """
-    bbox_tl = bbox[:2]
-    bbox_br = bbox[:2] + bbox[2:]
-    candidates_tl = candidates[:, :2]
-    candidates_br = candidates[:, :2] + candidates[:, 2:]
-
-    tl = np.c_[np.maximum(bbox_tl[0], candidates_tl[:, 0])[:, np.newaxis],
-               np.maximum(bbox_tl[1], candidates_tl[:, 1])[:, np.newaxis]]
-    br = np.c_[np.minimum(bbox_br[0], candidates_br[:, 0])[:, np.newaxis],
-               np.minimum(bbox_br[1], candidates_br[:, 1])[:, np.newaxis]]
-    wh = np.maximum(0., br - tl)
-
-    area_intersection = wh.prod(axis=1)
-    area_bbox = bbox[2:].prod()
-    area_candidates = candidates[:, 2:].prod(axis=1)
-    ious = area_intersection / (area_bbox + area_candidates - area_intersection)
-    return ious
-
-
-def iou_cost(tracks, detections, track_indices=None, detection_indices=None):
-    """
-    IoU distance metric.
-
-    Args:
-        tracks (list[Track]): A list of tracks.
-        detections (list[Detection]): A list of detections.
-        track_indices (Optional[list[int]]): A list of indices to tracks that
-            should be matched. Defaults to all `tracks`.
-        detection_indices (Optional[list[int]]): A list of indices to detections
-            that should be matched. Defaults to all `detections`.
-
-    Returns:
-        cost_matrix (ndarray): A cost matrix of shape len(track_indices),
-            len(detection_indices) where entry (i, j) is
-            `1 - iou(tracks[track_indices[i]], detections[detection_indices[j]])`.
-    """
-    if track_indices is None:
-        track_indices = np.arange(len(tracks))
-    if detection_indices is None:
-        detection_indices = np.arange(len(detections))
-
-    cost_matrix = np.zeros((len(track_indices), len(detection_indices)))
-    for row, track_idx in enumerate(track_indices):
-        if tracks[track_idx].time_since_update > 1:
-            cost_matrix[row, :] = 1e+5
-            continue
-
-        bbox = tracks[track_idx].to_tlwh()
-        candidates = np.asarray([detections[i].tlwh for i in detection_indices])
-        cost_matrix[row, :] = 1. - iou_1toN(bbox, candidates)
-    return cost_matrix
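(To make the cost convention above concrete, a self-contained hand computation of one entry of the matrix that iou_cost builds, on hypothetical boxes; not part of the diff.)

import numpy as np

def iou_tlwh(a, b):
    """IoU of two (x, y, w, h) boxes; the single-pair analogue of iou_1toN above."""
    tl = np.maximum(a[:2], b[:2])
    br = np.minimum(a[:2] + a[2:], b[:2] + b[2:])
    wh = np.maximum(0., br - tl)
    inter = wh.prod()
    return inter / (a[2:].prod() + b[2:].prod() - inter)

track_box = np.array([0., 0., 10., 10.])
det_box = np.array([5., 5., 10., 10.])   # overlaps in a 5x5 patch -> IoU ~0.143
print(1. - iou_tlwh(track_box, det_box))  # cost entry ~0.857, as in iou_cost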
- """ - s, q = np.asarray(s), np.asarray(q) - if len(s) == 0 or len(q) == 0: - return np.zeros((len(s), len(q))) - s2, q2 = np.square(s).sum(axis=1), np.square(q).sum(axis=1) - distances = -2. * np.dot(s, q.T) + s2[:, None] + q2[None, :] - distances = np.clip(distances, 0., float(np.inf)) - - return np.maximum(0.0, distances.min(axis=0)) - - -def _nn_cosine_distance(s, q): - """ - Compute pair-wise cosine distance between points in `s` and `q`. - - Args: - s (ndarray): Sample points: an NxM matrix of N samples of dimensionality M. - q (ndarray): Query points: an LxM matrix of L samples of dimensionality M. - - Returns: - distances (ndarray): A vector of length M that contains for each entry in `q` the - smallest Euclidean distance to a sample in `s`. - """ - s = np.asarray(s) / np.linalg.norm(s, axis=1, keepdims=True) - q = np.asarray(q) / np.linalg.norm(q, axis=1, keepdims=True) - distances = 1. - np.dot(s, q.T) - - return distances.min(axis=0) - - -class NearestNeighborDistanceMetric(object): - """ - A nearest neighbor distance metric that, for each target, returns - the closest distance to any sample that has been observed so far. - - Args: - metric (str): Either "euclidean" or "cosine". - matching_threshold (float): The matching threshold. Samples with larger - distance are considered an invalid match. - budget (Optional[int]): If not None, fix samples per class to at most - this number. Removes the oldest samples when the budget is reached. - - Attributes: - samples (Dict[int -> List[ndarray]]): A dictionary that maps from target - identities to the list of samples that have been observed so far. - """ - - def __init__(self, metric, matching_threshold, budget=None): - if metric == "euclidean": - self._metric = _nn_euclidean_distance - elif metric == "cosine": - self._metric = _nn_cosine_distance - else: - raise ValueError( - "Invalid metric; must be either 'euclidean' or 'cosine'") - self.matching_threshold = matching_threshold - self.budget = budget - self.samples = {} - - def partial_fit(self, features, targets, active_targets): - """ - Update the distance metric with new data. - - Args: - features (ndarray): An NxM matrix of N features of dimensionality M. - targets (ndarray): An integer array of associated target identities. - active_targets (List[int]): A list of targets that are currently - present in the scene. - """ - for feature, target in zip(features, targets): - self.samples.setdefault(target, []).append(feature) - if self.budget is not None: - self.samples[target] = self.samples[target][-self.budget:] - self.samples = {k: self.samples[k] for k in active_targets} - - def distance(self, features, targets): - """ - Compute distance between features and targets. - - Args: - features (ndarray): An NxM matrix of N features of dimensionality M. - targets (list[int]): A list of targets to match the given `features` against. - - Returns: - cost_matrix (ndarray): a cost matrix of shape len(targets), len(features), - where element (i, j) contains the closest squared distance between - `targets[i]` and `features[j]`. - """ - cost_matrix = np.zeros((len(targets), len(features))) - for i, target in enumerate(targets): - cost_matrix[i, :] = self._metric(self.samples[target], features) - return cost_matrix - - -def min_cost_matching(distance_metric, - max_distance, - tracks, - detections, - track_indices=None, - detection_indices=None): - """ - Solve linear assignment problem. 
-
-
-def min_cost_matching(distance_metric,
-                      max_distance,
-                      tracks,
-                      detections,
-                      track_indices=None,
-                      detection_indices=None):
-    """
-    Solve linear assignment problem.
-
-    Args:
-        distance_metric :
-            Callable[List[Track], List[Detection], List[int], List[int]] -> ndarray
-            The distance metric is given a list of tracks and detections as
-            well as a list of N track indices and M detection indices. The
-            metric should return the NxM dimensional cost matrix, where element
-            (i, j) is the association cost between the i-th track in the given
-            track indices and the j-th detection in the given detection_indices.
-        max_distance (float): Gating threshold. Associations with cost larger
-            than this value are disregarded.
-        tracks (list[Track]): A list of predicted tracks at the current time
-            step.
-        detections (list[Detection]): A list of detections at the current time
-            step.
-        track_indices (list[int]): List of track indices that maps rows in
-            `cost_matrix` to tracks in `tracks`.
-        detection_indices (List[int]): List of detection indices that maps
-            columns in `cost_matrix` to detections in `detections`.
-
-    Returns:
-        A tuple (List[(int, int)], List[int], List[int]) with the following
-        three entries:
-        * A list of matched track and detection indices.
-        * A list of unmatched track indices.
-        * A list of unmatched detection indices.
-    """
-    if track_indices is None:
-        track_indices = np.arange(len(tracks))
-    if detection_indices is None:
-        detection_indices = np.arange(len(detections))
-
-    if len(detection_indices) == 0 or len(track_indices) == 0:
-        return [], track_indices, detection_indices  # Nothing to match.
-
-    cost_matrix = distance_metric(tracks, detections, track_indices,
-                                  detection_indices)
-
-    cost_matrix[cost_matrix > max_distance] = max_distance + 1e-5
-    indices = linear_sum_assignment(cost_matrix)
-
-    matches, unmatched_tracks, unmatched_detections = [], [], []
-    for col, detection_idx in enumerate(detection_indices):
-        if col not in indices[1]:
-            unmatched_detections.append(detection_idx)
-    for row, track_idx in enumerate(track_indices):
-        if row not in indices[0]:
-            unmatched_tracks.append(track_idx)
-    for row, col in zip(indices[0], indices[1]):
-        track_idx = track_indices[row]
-        detection_idx = detection_indices[col]
-        if cost_matrix[row, col] > max_distance:
-            unmatched_tracks.append(track_idx)
-            unmatched_detections.append(detection_idx)
-        else:
-            matches.append((track_idx, detection_idx))
-    return matches, unmatched_tracks, unmatched_detections
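(A toy demonstration of the max_distance gate in min_cost_matching above: clamped entries can still be assigned by the Hungarian solver, but the final loop rejects them. Numbers are hypothetical; not part of the diff.)

import numpy as np
from scipy.optimize import linear_sum_assignment

cost = np.array([[0.1, 0.9],
                 [0.8, 0.95]])   # entry (i, j): track i vs. detection j
max_distance = 0.7
# same clamp as above: entries over the gate collapse to max_distance + 1e-5
gated = np.where(cost > max_distance, max_distance + 1e-5, cost)
rows, cols = linear_sum_assignment(gated)
for r, c in zip(rows, cols):
    # (0, 0) survives (0.1 <= 0.7); (1, 1) costs 0.95 > 0.7, so track 1 and
    # detection 1 end up in the unmatched lists, exactly as in the loop above
    print((r, c), "matched" if cost[r, c] <= max_distance else "unmatched")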
- - Returns: - A tuple (List[(int, int)], List[int], List[int]) with the following - three entries: - * A list of matched track and detection indices. - * A list of unmatched track indices. - * A list of unmatched detection indices. - """ - if track_indices is None: - track_indices = list(range(len(tracks))) - if detection_indices is None: - detection_indices = list(range(len(detections))) - - unmatched_detections = detection_indices - matches = [] - for level in range(cascade_depth): - if len(unmatched_detections) == 0: # No detections left - break - - track_indices_l = [ - k for k in track_indices if tracks[k].time_since_update == 1 + level - ] - if len(track_indices_l) == 0: # Nothing to match at this level - continue - - matches_l, _, unmatched_detections = \ - min_cost_matching( - distance_metric, max_distance, tracks, detections, - track_indices_l, unmatched_detections) - matches += matches_l - unmatched_tracks = list(set(track_indices) - set(k for k, _ in matches)) - return matches, unmatched_tracks, unmatched_detections - - -def gate_cost_matrix(kf, - cost_matrix, - tracks, - detections, - track_indices, - detection_indices, - gated_cost=INFTY_COST, - only_position=False): - """ - Invalidate infeasible entries in cost matrix based on the state - distributions obtained by Kalman filtering. - - Args: - kf (object): The Kalman filter. - cost_matrix (ndarray): The NxM dimensional cost matrix, where N is the - number of track indices and M is the number of detection indices, - such that entry (i, j) is the association cost between - `tracks[track_indices[i]]` and `detections[detection_indices[j]]`. - tracks (list[Track]): A list of predicted tracks at the current time - step. - detections (list[Detection]): A list of detections at the current time - step. - track_indices (List[int]): List of track indices that maps rows in - `cost_matrix` to tracks in `tracks`. - detection_indices (List[int]): List of detection indices that maps - columns in `cost_matrix` to detections in `detections`. - gated_cost (Optional[float]): Entries in the cost matrix corresponding - to infeasible associations are set this value. Defaults to a very - large value. - only_position (Optional[bool]): If True, only the x, y position of the - state distribution is considered during gating. Default False. - """ - gating_dim = 2 if only_position else 4 - gating_threshold = kalman_filter.chi2inv95[gating_dim] - measurements = np.asarray( - [detections[i].to_xyah() for i in detection_indices]) - for row, track_idx in enumerate(track_indices): - track = tracks[track_idx] - gating_distance = kf.gating_distance(track.mean, track.covariance, - measurements, only_position) - cost_matrix[row, gating_distance > gating_threshold] = gated_cost - return cost_matrix diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/jde_matching.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/jde_matching.py deleted file mode 100644 index ac28f90..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/jde_matching.py +++ /dev/null @@ -1,163 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/matching.py -""" - -try: - import lap -except: - print( - 'Warning: Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' - ) - pass - -import scipy -import numpy as np -from scipy.spatial.distance import cdist -from ..motion import kalman_filter -import warnings -warnings.filterwarnings("ignore") - -__all__ = [ - 'merge_matches', - 'linear_assignment', - 'bbox_ious', - 'iou_distance', - 'embedding_distance', - 'fuse_motion', -] - - -def merge_matches(m1, m2, shape): - O, P, Q = shape - m1 = np.asarray(m1) - m2 = np.asarray(m2) - - M1 = scipy.sparse.coo_matrix( - (np.ones(len(m1)), (m1[:, 0], m1[:, 1])), shape=(O, P)) - M2 = scipy.sparse.coo_matrix( - (np.ones(len(m2)), (m2[:, 0], m2[:, 1])), shape=(P, Q)) - - mask = M1 * M2 - match = mask.nonzero() - match = list(zip(match[0], match[1])) - unmatched_O = tuple(set(range(O)) - set([i for i, j in match])) - unmatched_Q = tuple(set(range(Q)) - set([j for i, j in match])) - - return match, unmatched_O, unmatched_Q - - -def linear_assignment(cost_matrix, thresh): - try: - import lap - except Exception as e: - raise RuntimeError( - 'Unable to use JDE/FairMOT/ByteTrack, please install lap, for example: `pip install lap`, see https://github.com/gatagat/lap' - ) - if cost_matrix.size == 0: - return np.empty( - (0, 2), dtype=int), tuple(range(cost_matrix.shape[0])), tuple( - range(cost_matrix.shape[1])) - matches, unmatched_a, unmatched_b = [], [], [] - cost, x, y = lap.lapjv(cost_matrix, extend_cost=True, cost_limit=thresh) - for ix, mx in enumerate(x): - if mx >= 0: - matches.append([ix, mx]) - unmatched_a = np.where(x < 0)[0] - unmatched_b = np.where(y < 0)[0] - matches = np.asarray(matches) - return matches, unmatched_a, unmatched_b - - -def bbox_ious(atlbrs, btlbrs): - boxes = np.ascontiguousarray(atlbrs, dtype=np.float32) - query_boxes = np.ascontiguousarray(btlbrs, dtype=np.float32) - N = boxes.shape[0] - K = query_boxes.shape[0] - ious = np.zeros((N, K), dtype=boxes.dtype) - if N * K == 0: - return ious - - for k in range(K): - box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + 1) * - (query_boxes[k, 3] - query_boxes[k, 1] + 1)) - for n in range(N): - iw = (min(boxes[n, 2], query_boxes[k, 2]) - max( - boxes[n, 0], query_boxes[k, 0]) + 1) - if iw > 0: - ih = (min(boxes[n, 3], query_boxes[k, 3]) - max( - boxes[n, 1], query_boxes[k, 1]) + 1) - if ih > 0: - ua = float((boxes[n, 2] - boxes[n, 0] + 1) * (boxes[ - n, 3] - boxes[n, 1] + 1) + box_area - iw * ih) - ious[n, k] = iw * ih / ua - return ious - - -def iou_distance(atracks, btracks): - """ - Compute cost based on IoU between two list[STrack]. 
- """ - if (len(atracks) > 0 and isinstance(atracks[0], np.ndarray)) or ( - len(btracks) > 0 and isinstance(btracks[0], np.ndarray)): - atlbrs = atracks - btlbrs = btracks - else: - atlbrs = [track.tlbr for track in atracks] - btlbrs = [track.tlbr for track in btracks] - _ious = bbox_ious(atlbrs, btlbrs) - cost_matrix = 1 - _ious - - return cost_matrix - - -def embedding_distance(tracks, detections, metric='euclidean'): - """ - Compute cost based on features between two list[STrack]. - """ - cost_matrix = np.zeros((len(tracks), len(detections)), dtype=np.float32) - if cost_matrix.size == 0: - return cost_matrix - det_features = np.asarray( - [track.curr_feat for track in detections], dtype=np.float32) - track_features = np.asarray( - [track.smooth_feat for track in tracks], dtype=np.float32) - cost_matrix = np.maximum(0.0, cdist(track_features, det_features, - metric)) # Nomalized features - return cost_matrix - - -def fuse_motion(kf, - cost_matrix, - tracks, - detections, - only_position=False, - lambda_=0.98): - if cost_matrix.size == 0: - return cost_matrix - gating_dim = 2 if only_position else 4 - gating_threshold = kalman_filter.chi2inv95[gating_dim] - measurements = np.asarray([det.to_xyah() for det in detections]) - for row, track in enumerate(tracks): - gating_distance = kf.gating_distance( - track.mean, - track.covariance, - measurements, - only_position, - metric='maha') - cost_matrix[row, gating_distance > gating_threshold] = np.inf - cost_matrix[row] = lambda_ * cost_matrix[row] + (1 - lambda_ - ) * gating_distance - return cost_matrix diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/ocsort_matching.py b/pdfdet/models/Paddle/ppdet/modeling/mot/matching/ocsort_matching.py deleted file mode 100644 index 58f79a5..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/matching/ocsort_matching.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/association.py -""" - -import os -import numpy as np - - -def iou_batch(bboxes1, bboxes2): - bboxes2 = np.expand_dims(bboxes2, 0) - bboxes1 = np.expand_dims(bboxes1, 1) - - xx1 = np.maximum(bboxes1[..., 0], bboxes2[..., 0]) - yy1 = np.maximum(bboxes1[..., 1], bboxes2[..., 1]) - xx2 = np.minimum(bboxes1[..., 2], bboxes2[..., 2]) - yy2 = np.minimum(bboxes1[..., 3], bboxes2[..., 3]) - w = np.maximum(0., xx2 - xx1) - h = np.maximum(0., yy2 - yy1) - area = w * h - iou_matrix = area / ((bboxes1[..., 2] - bboxes1[..., 0]) * - (bboxes1[..., 3] - bboxes1[..., 1]) + - (bboxes2[..., 2] - bboxes2[..., 0]) * - (bboxes2[..., 3] - bboxes2[..., 1]) - area) - return iou_matrix - - -def speed_direction_batch(dets, tracks): - tracks = tracks[..., np.newaxis] - CX1, CY1 = (dets[:, 0] + dets[:, 2]) / 2.0, (dets[:, 1] + dets[:, 3]) / 2.0 - CX2, CY2 = (tracks[:, 0] + tracks[:, 2]) / 2.0, ( - tracks[:, 1] + tracks[:, 3]) / 2.0 - dx = CX1 - CX2 - dy = CY1 - CY2 - norm = np.sqrt(dx**2 + dy**2) + 1e-6 - dx = dx / norm - dy = dy / norm - return dy, dx - - -def linear_assignment(cost_matrix): - try: - import lap - _, x, y = lap.lapjv(cost_matrix, extend_cost=True) - return np.array([[y[i], i] for i in x if i >= 0]) - except ImportError: - from scipy.optimize import linear_sum_assignment - x, y = linear_sum_assignment(cost_matrix) - return np.array(list(zip(x, y))) - - -def associate(detections, trackers, iou_threshold, velocities, previous_obs, - vdc_weight): - if (len(trackers) == 0): - return np.empty( - (0, 2), dtype=int), np.arange(len(detections)), np.empty( - (0, 5), dtype=int) - - Y, X = speed_direction_batch(detections, previous_obs) - inertia_Y, inertia_X = velocities[:, 0], velocities[:, 1] - inertia_Y = np.repeat(inertia_Y[:, np.newaxis], Y.shape[1], axis=1) - inertia_X = np.repeat(inertia_X[:, np.newaxis], X.shape[1], axis=1) - diff_angle_cos = inertia_X * X + inertia_Y * Y - diff_angle_cos = np.clip(diff_angle_cos, a_min=-1, a_max=1) - diff_angle = np.arccos(diff_angle_cos) - diff_angle = (np.pi / 2.0 - np.abs(diff_angle)) / np.pi - - valid_mask = np.ones(previous_obs.shape[0]) - valid_mask[np.where(previous_obs[:, 4] < 0)] = 0 - - iou_matrix = iou_batch(detections, trackers) - scores = np.repeat( - detections[:, -1][:, np.newaxis], trackers.shape[0], axis=1) - # iou_matrix = iou_matrix * scores # a trick sometiems works, we don't encourage this - valid_mask = np.repeat(valid_mask[:, np.newaxis], X.shape[1], axis=1) - - angle_diff_cost = (valid_mask * diff_angle) * vdc_weight - angle_diff_cost = angle_diff_cost.T - angle_diff_cost = angle_diff_cost * scores - - if min(iou_matrix.shape) > 0: - a = (iou_matrix > iou_threshold).astype(np.int32) - if a.sum(1).max() == 1 and a.sum(0).max() == 1: - matched_indices = np.stack(np.where(a), axis=1) - else: - matched_indices = linear_assignment(-(iou_matrix + angle_diff_cost)) - else: - matched_indices = np.empty(shape=(0, 2)) - - unmatched_detections = [] - for d, det in enumerate(detections): - if (d not in matched_indices[:, 0]): - unmatched_detections.append(d) - unmatched_trackers = [] - for t, trk in enumerate(trackers): - if (t not in matched_indices[:, 1]): - unmatched_trackers.append(t) - - # filter out matched with low IOU - matches = [] - for m in matched_indices: - if (iou_matrix[m[0], m[1]] < iou_threshold): - unmatched_detections.append(m[0]) - unmatched_trackers.append(m[1]) - else: - matches.append(m.reshape(1, 2)) - if (len(matches) == 0): - matches = 
np.empty((0, 2), dtype=int) - else: - matches = np.concatenate(matches, axis=0) - - return matches, np.array(unmatched_detections), np.array(unmatched_trackers) - - -def associate_only_iou(detections, trackers, iou_threshold): - if (len(trackers) == 0): - return np.empty( - (0, 2), dtype=int), np.arange(len(detections)), np.empty( - (0, 5), dtype=int) - - iou_matrix = iou_batch(detections, trackers) - - if min(iou_matrix.shape) > 0: - a = (iou_matrix > iou_threshold).astype(np.int32) - if a.sum(1).max() == 1 and a.sum(0).max() == 1: - matched_indices = np.stack(np.where(a), axis=1) - else: - matched_indices = linear_assignment(-iou_matrix) - else: - matched_indices = np.empty(shape=(0, 2)) - - unmatched_detections = [] - for d, det in enumerate(detections): - if (d not in matched_indices[:, 0]): - unmatched_detections.append(d) - unmatched_trackers = [] - for t, trk in enumerate(trackers): - if (t not in matched_indices[:, 1]): - unmatched_trackers.append(t) - - # filter out matched with low IOU - matches = [] - for m in matched_indices: - if (iou_matrix[m[0], m[1]] < iou_threshold): - unmatched_detections.append(m[0]) - unmatched_trackers.append(m[1]) - else: - matches.append(m.reshape(1, 2)) - if (len(matches) == 0): - matches = np.empty((0, 2), dtype=int) - else: - matches = np.concatenate(matches, axis=0) - return matches, np.array(unmatched_detections), np.array(unmatched_trackers) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/__init__.py deleted file mode 100644 index 6d20612..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import kalman_filter - -from .kalman_filter import * -from .gmc import * \ No newline at end of file diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/gmc.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/gmc.py deleted file mode 100644 index 43ec42e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/gmc.py +++ /dev/null @@ -1,368 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
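The GMC class removed below estimates global camera motion between consecutive frames as a 2x3 affine matrix H (via ORB/SIFT features, ECC, or sparse optical flow); STrack.multi_gmc further down applies that matrix to predicted track states so boxes follow the camera. A small sketch of the warp itself, with an assumed toy H (illustrative, not the deleted class):

import numpy as np

def warp_tlbr_boxes(tlbr_boxes, H):
    # H is a 2x3 affine, x' = R @ x + t, in the shape GMC.apply() returns.
    boxes = np.asarray(tlbr_boxes, dtype=np.float64)
    R, t = H[:2, :2], H[:2, 2]
    tl = boxes[:, :2] @ R.T + t  # warp top-left corners
    br = boxes[:, 2:] @ R.T + t  # warp bottom-right corners
    return np.hstack([tl, br])

H = np.array([[1.0, 0.0, 5.0],   # pure translation: +5 px in x,
              [0.0, 1.0, -3.0]]) # -3 px in y
print(warp_tlbr_boxes([[10, 10, 50, 50]], H))  # [[15.  7. 55. 47.]]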
-""" -This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/gmc.py -""" - -import cv2 -import matplotlib.pyplot as plt -import numpy as np -import copy -import time -from ppdet.core.workspace import register, serializable - - -@register -@serializable -class GMC: - def __init__(self, method='sparseOptFlow', downscale=2, verbose=None): - super(GMC, self).__init__() - - self.method = method - self.downscale = max(1, int(downscale)) - - if self.method == 'orb': - self.detector = cv2.FastFeatureDetector_create(20) - self.extractor = cv2.ORB_create() - self.matcher = cv2.BFMatcher(cv2.NORM_HAMMING) - - elif self.method == 'sift': - self.detector = cv2.SIFT_create( - nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) - self.extractor = cv2.SIFT_create( - nOctaveLayers=3, contrastThreshold=0.02, edgeThreshold=20) - self.matcher = cv2.BFMatcher(cv2.NORM_L2) - - elif self.method == 'ecc': - number_of_iterations = 5000 - termination_eps = 1e-6 - self.warp_mode = cv2.MOTION_EUCLIDEAN - self.criteria = (cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, - number_of_iterations, termination_eps) - - elif self.method == 'sparseOptFlow': - self.feature_params = dict( - maxCorners=1000, - qualityLevel=0.01, - minDistance=1, - blockSize=3, - useHarrisDetector=False, - k=0.04) - # self.gmc_file = open('GMC_results.txt', 'w') - - elif self.method == 'file' or self.method == 'files': - seqName = verbose[0] - ablation = verbose[1] - if ablation: - filePath = r'tracker/GMC_files/MOT17_ablation' - else: - filePath = r'tracker/GMC_files/MOTChallenge' - - if '-FRCNN' in seqName: - seqName = seqName[:-6] - elif '-DPM' in seqName: - seqName = seqName[:-4] - elif '-SDP' in seqName: - seqName = seqName[:-4] - - self.gmcFile = open(filePath + "/GMC-" + seqName + ".txt", 'r') - - if self.gmcFile is None: - raise ValueError("Error: Unable to open GMC file in directory:" - + filePath) - elif self.method == 'none' or self.method == 'None': - self.method = 'none' - else: - raise ValueError("Error: Unknown CMC method:" + method) - - self.prevFrame = None - self.prevKeyPoints = None - self.prevDescriptors = None - - self.initializedFirstFrame = False - - def apply(self, raw_frame, detections=None): - if self.method == 'orb' or self.method == 'sift': - return self.applyFeaures(raw_frame, detections) - elif self.method == 'ecc': - return self.applyEcc(raw_frame, detections) - elif self.method == 'sparseOptFlow': - return self.applySparseOptFlow(raw_frame, detections) - elif self.method == 'file': - return self.applyFile(raw_frame, detections) - elif self.method == 'none': - return np.eye(2, 3) - else: - return np.eye(2, 3) - - def applyEcc(self, raw_frame, detections=None): - - # Initialize - height, width, _ = raw_frame.shape - frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) - H = np.eye(2, 3, dtype=np.float32) - - # Downscale image (TODO: consider using pyramids) - if self.downscale > 1.0: - frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (width // self.downscale, - height // self.downscale)) - width = width // self.downscale - height = height // self.downscale - - # Handle first frame - if not self.initializedFirstFrame: - # Initialize data - self.prevFrame = frame.copy() - - # Initialization done - self.initializedFirstFrame = True - - return H - - # Run the ECC algorithm. The results are stored in warp_matrix. 
- # (cc, H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, self.criteria) - try: - (cc, - H) = cv2.findTransformECC(self.prevFrame, frame, H, self.warp_mode, - self.criteria, None, 1) - except: - print('Warning: find transform failed. Set warp as identity') - - return H - - def applyFeaures(self, raw_frame, detections=None): - - # Initialize - height, width, _ = raw_frame.shape - frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) - H = np.eye(2, 3) - - # Downscale image (TODO: consider using pyramids) - if self.downscale > 1.0: - # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (width // self.downscale, - height // self.downscale)) - width = width // self.downscale - height = height // self.downscale - - # find the keypoints - mask = np.zeros_like(frame) - # mask[int(0.05 * height): int(0.95 * height), int(0.05 * width): int(0.95 * width)] = 255 - mask[int(0.02 * height):int(0.98 * height), int(0.02 * width):int( - 0.98 * width)] = 255 - if detections is not None: - for det in detections: - tlbr = (det[:4] / self.downscale).astype(np.int_) - mask[tlbr[1]:tlbr[3], tlbr[0]:tlbr[2]] = 0 - - keypoints = self.detector.detect(frame, mask) - - # compute the descriptors - keypoints, descriptors = self.extractor.compute(frame, keypoints) - - # Handle first frame - if not self.initializedFirstFrame: - # Initialize data - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - self.prevDescriptors = copy.copy(descriptors) - - # Initialization done - self.initializedFirstFrame = True - - return H - - # Match descriptors. - knnMatches = self.matcher.knnMatch(self.prevDescriptors, descriptors, 2) - - # Filtered matches based on smallest spatial distance - matches = [] - spatialDistances = [] - - maxSpatialDistance = 0.25 * np.array([width, height]) - - # Handle empty matches case - if len(knnMatches) == 0: - # Store to next iteration - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - self.prevDescriptors = copy.copy(descriptors) - - return H - - for m, n in knnMatches: - if m.distance < 0.9 * n.distance: - prevKeyPointLocation = self.prevKeyPoints[m.queryIdx].pt - currKeyPointLocation = keypoints[m.trainIdx].pt - - spatialDistance = ( - prevKeyPointLocation[0] - currKeyPointLocation[0], - prevKeyPointLocation[1] - currKeyPointLocation[1]) - - if (np.abs(spatialDistance[0]) < maxSpatialDistance[0]) and \ - (np.abs(spatialDistance[1]) < maxSpatialDistance[1]): - spatialDistances.append(spatialDistance) - matches.append(m) - - meanSpatialDistances = np.mean(spatialDistances, 0) - stdSpatialDistances = np.std(spatialDistances, 0) - - inliesrs = (spatialDistances - meanSpatialDistances - ) < 2.5 * stdSpatialDistances - - goodMatches = [] - prevPoints = [] - currPoints = [] - for i in range(len(matches)): - if inliesrs[i, 0] and inliesrs[i, 1]: - goodMatches.append(matches[i]) - prevPoints.append(self.prevKeyPoints[matches[i].queryIdx].pt) - currPoints.append(keypoints[matches[i].trainIdx].pt) - - prevPoints = np.array(prevPoints) - currPoints = np.array(currPoints) - - # Draw the keypoint matches on the output image - if 0: - matches_img = np.hstack((self.prevFrame, frame)) - matches_img = cv2.cvtColor(matches_img, cv2.COLOR_GRAY2BGR) - W = np.size(self.prevFrame, 1) - for m in goodMatches: - prev_pt = np.array( - self.prevKeyPoints[m.queryIdx].pt, dtype=np.int_) - curr_pt = np.array(keypoints[m.trainIdx].pt, dtype=np.int_) - curr_pt[0] += W - color = np.random.randint(0, 255, (3, )) - color = (int(color[0]), 
int(color[1]), int(color[2])) - - matches_img = cv2.line(matches_img, prev_pt, curr_pt, - tuple(color), 1, cv2.LINE_AA) - matches_img = cv2.circle(matches_img, prev_pt, 2, - tuple(color), -1) - matches_img = cv2.circle(matches_img, curr_pt, 2, - tuple(color), -1) - - plt.figure() - plt.imshow(matches_img) - plt.show() - - # Find rigid matrix - if (np.size(prevPoints, 0) > 4) and ( - np.size(prevPoints, 0) == np.size(prevPoints, 0)): - H, inliesrs = cv2.estimateAffinePartial2D(prevPoints, currPoints, - cv2.RANSAC) - - # Handle downscale - if self.downscale > 1.0: - H[0, 2] *= self.downscale - H[1, 2] *= self.downscale - else: - print('Warning: not enough matching points') - - # Store to next iteration - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - self.prevDescriptors = copy.copy(descriptors) - - return H - - def applySparseOptFlow(self, raw_frame, detections=None): - - t0 = time.time() - - # Initialize - height, width, _ = raw_frame.shape - frame = cv2.cvtColor(raw_frame, cv2.COLOR_BGR2GRAY) - H = np.eye(2, 3) - - # Downscale image - if self.downscale > 1.0: - # frame = cv2.GaussianBlur(frame, (3, 3), 1.5) - frame = cv2.resize(frame, (width // self.downscale, - height // self.downscale)) - - # find the keypoints - keypoints = cv2.goodFeaturesToTrack( - frame, mask=None, **self.feature_params) - - # Handle first frame - if not self.initializedFirstFrame: - # Initialize data - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - - # Initialization done - self.initializedFirstFrame = True - - return H - - if self.prevFrame.shape != frame.shape: - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - return H - - # find correspondences - matchedKeypoints, status, err = cv2.calcOpticalFlowPyrLK( - self.prevFrame, frame, self.prevKeyPoints, None) - - # leave good correspondences only - prevPoints = [] - currPoints = [] - - for i in range(len(status)): - if status[i]: - prevPoints.append(self.prevKeyPoints[i]) - currPoints.append(matchedKeypoints[i]) - - prevPoints = np.array(prevPoints) - currPoints = np.array(currPoints) - - # Find rigid matrix - if (np.size(prevPoints, 0) > 4) and ( - np.size(prevPoints, 0) == np.size(prevPoints, 0)): - H, inliesrs = cv2.estimateAffinePartial2D(prevPoints, currPoints, - cv2.RANSAC) - - # Handle downscale - if self.downscale > 1.0: - H[0, 2] *= self.downscale - H[1, 2] *= self.downscale - else: - print('Warning: not enough matching points') - - # Store to next iteration - self.prevFrame = frame.copy() - self.prevKeyPoints = copy.copy(keypoints) - - t1 = time.time() - - # gmc_line = str(1000 * (t1 - t0)) + "\t" + str(H[0, 0]) + "\t" + str(H[0, 1]) + "\t" + str( - # H[0, 2]) + "\t" + str(H[1, 0]) + "\t" + str(H[1, 1]) + "\t" + str(H[1, 2]) + "\n" - # self.gmc_file.write(gmc_line) - - return H - - def applyFile(self, raw_frame, detections=None): - line = self.gmcFile.readline() - tokens = line.split("\t") - H = np.eye(2, 3, dtype=np.float_) - H[0, 0] = float(tokens[1]) - H[0, 1] = float(tokens[2]) - H[0, 2] = float(tokens[3]) - H[1, 0] = float(tokens[4]) - H[1, 1] = float(tokens[5]) - H[1, 2] = float(tokens[6]) - - return H diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/kalman_filter.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/kalman_filter.py deleted file mode 100644 index b4e3c93..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/kalman_filter.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/kalman_filter.py -""" - -import numpy as np -import scipy.linalg - -use_numba = True -try: - import numba as nb - - @nb.njit(fastmath=True, cache=True) - def nb_project(mean, covariance, std, _update_mat): - innovation_cov = np.diag(np.square(std)) - mean = np.dot(_update_mat, mean) - covariance = np.dot(np.dot(_update_mat, covariance), _update_mat.T) - return mean, covariance + innovation_cov - - @nb.njit(fastmath=True, cache=True) - def nb_multi_predict(mean, covariance, motion_cov, motion_mat): - mean = np.dot(mean, motion_mat.T) - left = np.dot(motion_mat, covariance) - covariance = np.dot(left, motion_mat.T) + motion_cov - return mean, covariance - - @nb.njit(fastmath=True, cache=True) - def nb_update(mean, covariance, proj_mean, proj_cov, measurement, meas_mat): - kalman_gain = np.linalg.solve(proj_cov, (covariance @meas_mat.T).T).T - innovation = measurement - proj_mean - mean = mean + innovation @kalman_gain.T - covariance = covariance - kalman_gain @proj_cov @kalman_gain.T - return mean, covariance - -except: - use_numba = False - pass - -__all__ = ['KalmanFilter'] -""" -Table for the 0.95 quantile of the chi-square distribution with N degrees of -freedom (contains values for N=1, ..., 9). Taken from MATLAB/Octave's chi2inv -function and used as Mahalanobis gating threshold. -""" - -chi2inv95 = { - 1: 3.8415, - 2: 5.9915, - 3: 7.8147, - 4: 9.4877, - 5: 11.070, - 6: 12.592, - 7: 14.067, - 8: 15.507, - 9: 16.919 -} - - -class KalmanFilter(object): - """ - A simple Kalman filter for tracking bounding boxes in image space. - - The 8-dimensional state space - - x, y, a, h, vx, vy, va, vh - - contains the bounding box center position (x, y), aspect ratio a, height h, - and their respective velocities. - - Object motion follows a constant velocity model. The bounding box location - (x, y, a, h) is taken as direct observation of the state space (linear - observation model). - - """ - - def __init__(self): - ndim, dt = 4, 1. - - # Create Kalman filter model matrices. - self._motion_mat = np.eye(2 * ndim, 2 * ndim, dtype=np.float32) - for i in range(ndim): - self._motion_mat[i, ndim + i] = dt - self._update_mat = np.eye(ndim, 2 * ndim, dtype=np.float32) - - # Motion and observation uncertainty are chosen relative to the current - # state estimate. These weights control the amount of uncertainty in - # the model. This is a bit hacky. - self._std_weight_position = 1. / 20 - self._std_weight_velocity = 1. / 160 - - def initiate(self, measurement): - """ - Create track from unassociated measurement. - - Args: - measurement (ndarray): Bounding box coordinates (x, y, a, h) with - center position (x, y), aspect ratio a, and height h. - - Returns: - The mean vector (8 dimensional) and covariance matrix (8x8 - dimensional) of the new track. Unobserved velocities are - initialized to 0 mean. 
- """ - mean_pos = measurement - mean_vel = np.zeros_like(mean_pos) - mean = np.r_[mean_pos, mean_vel] - - std = [ - 2 * self._std_weight_position * measurement[3], - 2 * self._std_weight_position * measurement[3], 1e-2, - 2 * self._std_weight_position * measurement[3], - 10 * self._std_weight_velocity * measurement[3], - 10 * self._std_weight_velocity * measurement[3], 1e-5, - 10 * self._std_weight_velocity * measurement[3] - ] - covariance = np.diag(np.square(std)) - return mean, np.float32(covariance) - - def predict(self, mean, covariance): - """ - Run Kalman filter prediction step. - - Args: - mean (ndarray): The 8 dimensional mean vector of the object state - at the previous time step. - covariance (ndarray): The 8x8 dimensional covariance matrix of the - object state at the previous time step. - - Returns: - The mean vector and covariance matrix of the predicted state. - Unobserved velocities are initialized to 0 mean. - """ - std_pos = [ - self._std_weight_position * mean[3], self._std_weight_position * - mean[3], 1e-2, self._std_weight_position * mean[3] - ] - std_vel = [ - self._std_weight_velocity * mean[3], self._std_weight_velocity * - mean[3], 1e-5, self._std_weight_velocity * mean[3] - ] - motion_cov = np.diag(np.square(np.r_[std_pos, std_vel])) - - #mean = np.dot(self._motion_mat, mean) - mean = np.dot(mean, self._motion_mat.T) - covariance = np.linalg.multi_dot( - (self._motion_mat, covariance, self._motion_mat.T)) + motion_cov - - return mean, covariance - - def project(self, mean, covariance): - """ - Project state distribution to measurement space. - - Args - mean (ndarray): The state's mean vector (8 dimensional array). - covariance (ndarray): The state's covariance matrix (8x8 dimensional). - - Returns: - The projected mean and covariance matrix of the given state estimate. - """ - std = np.array( - [ - self._std_weight_position * mean[3], self._std_weight_position * - mean[3], 1e-1, self._std_weight_position * mean[3] - ], - dtype=np.float32) - - if use_numba: - return nb_project(mean, covariance, std, self._update_mat) - - innovation_cov = np.diag(np.square(std)) - - mean = np.dot(self._update_mat, mean) - covariance = np.linalg.multi_dot((self._update_mat, covariance, - self._update_mat.T)) - return mean, covariance + innovation_cov - - def multi_predict(self, mean, covariance): - """ - Run Kalman filter prediction step (Vectorized version). - - Args: - mean (ndarray): The Nx8 dimensional mean matrix of the object states - at the previous time step. - covariance (ndarray): The Nx8x8 dimensional covariance matrics of the - object states at the previous time step. - - Returns: - The mean vector and covariance matrix of the predicted state. - Unobserved velocities are initialized to 0 mean. 
- """ - std_pos = np.array([ - self._std_weight_position * mean[:, 3], self._std_weight_position * - mean[:, 3], 1e-2 * np.ones_like(mean[:, 3]), - self._std_weight_position * mean[:, 3] - ]) - std_vel = np.array([ - self._std_weight_velocity * mean[:, 3], self._std_weight_velocity * - mean[:, 3], 1e-5 * np.ones_like(mean[:, 3]), - self._std_weight_velocity * mean[:, 3] - ]) - sqr = np.square(np.r_[std_pos, std_vel]).T - - if use_numba: - - means = [] - covariances = [] - for i in range(len(mean)): - a, b = nb_multi_predict(mean[i], covariance[i], - np.diag(sqr[i]), self._motion_mat) - means.append(a) - covariances.append(b) - return np.asarray(means), np.asarray(covariances) - - motion_cov = [] - for i in range(len(mean)): - motion_cov.append(np.diag(sqr[i])) - motion_cov = np.asarray(motion_cov) - - mean = np.dot(mean, self._motion_mat.T) - left = np.dot(self._motion_mat, covariance).transpose((1, 0, 2)) - covariance = np.dot(left, self._motion_mat.T) + motion_cov - - return mean, covariance - - def update(self, mean, covariance, measurement): - """ - Run Kalman filter correction step. - - Args: - mean (ndarray): The predicted state's mean vector (8 dimensional). - covariance (ndarray): The state's covariance matrix (8x8 dimensional). - measurement (ndarray): The 4 dimensional measurement vector - (x, y, a, h), where (x, y) is the center position, a the aspect - ratio, and h the height of the bounding box. - - Returns: - The measurement-corrected state distribution. - """ - projected_mean, projected_cov = self.project(mean, covariance) - - if use_numba: - - return nb_update(mean, covariance, projected_mean, projected_cov, - measurement, self._update_mat) - - kalman_gain = np.linalg.solve(projected_cov, - (covariance @self._update_mat.T).T).T - innovation = measurement - projected_mean - mean = mean + innovation @kalman_gain.T - covariance = covariance - kalman_gain @projected_cov @kalman_gain.T - return mean, covariance - - def gating_distance(self, - mean, - covariance, - measurements, - only_position=False, - metric='maha'): - """ - Compute gating distance between state distribution and measurements. - A suitable distance threshold can be obtained from `chi2inv95`. If - `only_position` is False, the chi-square distribution has 4 degrees of - freedom, otherwise 2. - - Args: - mean (ndarray): Mean vector over the state distribution (8 - dimensional). - covariance (ndarray): Covariance of the state distribution (8x8 - dimensional). - measurements (ndarray): An Nx4 dimensional matrix of N measurements, - each in format (x, y, a, h) where (x, y) is the bounding box center - position, a the aspect ratio, and h the height. - only_position (Optional[bool]): If True, distance computation is - done with respect to the bounding box center position only. - metric (str): Metric type, 'gaussian' or 'maha'. - - Returns - An array of length N, where the i-th element contains the squared - Mahalanobis distance between (mean, covariance) and `measurements[i]`. 
- """ - mean, covariance = self.project(mean, covariance) - if only_position: - mean, covariance = mean[:2], covariance[:2, :2] - measurements = measurements[:, :2] - - d = measurements - mean - if metric == 'gaussian': - return np.sum(d * d, axis=1) - elif metric == 'maha': - cholesky_factor = np.linalg.cholesky(covariance) - z = scipy.linalg.solve_triangular( - cholesky_factor, - d.T, - lower=True, - check_finite=False, - overwrite_b=True) - squared_maha = np.sum(z * z, axis=0) - return squared_maha - else: - raise ValueError('invalid distance metric') diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/ocsort_kalman_filter.py b/pdfdet/models/Paddle/ppdet/modeling/mot/motion/ocsort_kalman_filter.py deleted file mode 100644 index 303426f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/motion/ocsort_kalman_filter.py +++ /dev/null @@ -1,90 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/danbochman/SORT/blob/danny_opencv/kalman_filter.py -""" - -import numpy as np -from numpy import dot, zeros, eye -from numpy.linalg import inv - -use_numba = True -try: - import numba as nb - - @nb.njit(fastmath=True, cache=True) - def nb_predict(x, F, P, Q): - x = dot(F, x) - P = dot(dot(F, P), F.T) + Q - return x, P - - @nb.njit(fastmath=True, cache=True) - def nb_update(x, z, H, P, R, _I): - - y = z - np.dot(H, x) - PHT = dot(P, H.T) - - S = dot(H, PHT) + R - K = dot(PHT, inv(S)) - - x = x + dot(K, y) - - I_KH = _I - dot(K, H) - P = dot(dot(I_KH, P), I_KH.T) + dot(dot(K, R), K.T) - return x, P -except: - use_numba = False - pass - - -class OCSORTKalmanFilter: - def __init__(self, dim_x, dim_z): - self.dim_x = dim_x - self.dim_z = dim_z - self.x = zeros((dim_x, 1)) - self.P = eye(dim_x) - self.Q = eye(dim_x) - self.F = eye(dim_x) - self.H = zeros((dim_z, dim_x)) - self.R = eye(dim_z) - self.M = zeros((dim_z, dim_z)) - - self._I = eye(dim_x) - - def predict(self): - if use_numba: - self.x, self.P = nb_predict(self.x, self.F, self.P, self.Q) - else: - self.x = dot(self.F, self.x) - self.P = dot(dot(self.F, self.P), self.F.T) + self.Q - - def update(self, z): - - if z is None: - return - - if use_numba: - self.x, self.P = nb_update(self.x, z, self.H, self.P, self.R, - self._I) - else: - y = z - np.dot(self.H, self.x) - PHT = dot(self.P, self.H.T) - - S = dot(self.H, PHT) + self.R - K = dot(PHT, inv(S)) - - self.x = self.x + dot(K, y) - - I_KH = self._I - dot(K, self.H) - self.P = dot(dot(I_KH, self.P), I_KH.T) + dot(dot(K, self.R), K.T) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/__init__.py deleted file mode 100644 index a3c4229..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import base_jde_tracker -from . import base_sde_tracker - -from .base_jde_tracker import * -from .base_sde_tracker import * - -from . import jde_tracker -from . import deepsort_tracker -from . import ocsort_tracker -from . import center_tracker - -from .jde_tracker import * -from .deepsort_tracker import * -from .ocsort_tracker import * -from .botsort_tracker import * -from .center_tracker import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_jde_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_jde_tracker.py deleted file mode 100644 index e78fe00..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_jde_tracker.py +++ /dev/null @@ -1,311 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
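The STrack class removed below moves between three box encodings: tlwh (top-left x/y, width, height), tlbr (corner coordinates), and xyah (center x/y, aspect ratio w/h, height), the last being the Kalman filter's observation. A standalone restatement of the two conversions it uses most, mirroring the deleted static methods:

import numpy as np

def tlwh_to_tlbr(tlwh):
    ret = np.asarray(tlwh, dtype=np.float64).copy()
    ret[2:] += ret[:2]      # (w, h) -> (x_max, y_max)
    return ret

def tlwh_to_xyah(tlwh):
    ret = np.asarray(tlwh, dtype=np.float64).copy()
    ret[:2] += ret[2:] / 2  # top-left -> box center
    ret[2] /= ret[3]        # width -> aspect ratio w/h
    return ret

box = [100.0, 200.0, 50.0, 100.0]
print(tlwh_to_tlbr(box))  # [100. 200. 150. 300.]
print(tlwh_to_xyah(box))  # [125. 250.   0.5 100.]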
-""" -This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py -""" - -import numpy as np -from collections import defaultdict -from collections import deque, OrderedDict -from ..matching import jde_matching as matching -from ppdet.core.workspace import register, serializable -import warnings -warnings.filterwarnings("ignore") - -__all__ = [ - 'TrackState', - 'BaseTrack', - 'STrack', - 'joint_stracks', - 'sub_stracks', - 'remove_duplicate_stracks', -] - - -class TrackState(object): - New = 0 - Tracked = 1 - Lost = 2 - Removed = 3 - - -@register -@serializable -class BaseTrack(object): - _count_dict = defaultdict(int) # support single class and multi classes - - track_id = 0 - is_activated = False - state = TrackState.New - - history = OrderedDict() - features = [] - curr_feat = None - score = 0 - start_frame = 0 - frame_id = 0 - time_since_update = 0 - - # multi-camera - location = (np.inf, np.inf) - - @property - def end_frame(self): - return self.frame_id - - @staticmethod - def next_id(cls_id): - BaseTrack._count_dict[cls_id] += 1 - return BaseTrack._count_dict[cls_id] - - # @even: reset track id - @staticmethod - def init_count(num_classes): - """ - Initiate _count for all object classes - :param num_classes: - """ - for cls_id in range(num_classes): - BaseTrack._count_dict[cls_id] = 0 - - @staticmethod - def reset_track_count(cls_id): - BaseTrack._count_dict[cls_id] = 0 - - def activate(self, *args): - raise NotImplementedError - - def predict(self): - raise NotImplementedError - - def update(self, *args, **kwargs): - raise NotImplementedError - - def mark_lost(self): - self.state = TrackState.Lost - - def mark_removed(self): - self.state = TrackState.Removed - - -@register -@serializable -class STrack(BaseTrack): - def __init__(self, tlwh, score, cls_id, buff_size=30, temp_feat=None): - # wait activate - self._tlwh = np.asarray(tlwh, dtype=np.float32) - self.score = score - self.cls_id = cls_id - self.track_len = 0 - - self.kalman_filter = None - self.mean, self.covariance = None, None - self.is_activated = False - - self.use_reid = True if temp_feat is not None else False - if self.use_reid: - self.smooth_feat = None - self.update_features(temp_feat) - self.features = deque([], maxlen=buff_size) - self.alpha = 0.9 - - def update_features(self, feat): - # L2 normalizing, this function has no use for BYTETracker - feat /= np.linalg.norm(feat) - self.curr_feat = feat - if self.smooth_feat is None: - self.smooth_feat = feat - else: - self.smooth_feat = self.alpha * self.smooth_feat + (1.0 - self.alpha - ) * feat - self.features.append(feat) - self.smooth_feat /= np.linalg.norm(self.smooth_feat) - - def predict(self): - mean_state = self.mean.copy() - if self.state != TrackState.Tracked: - mean_state[7] = 0 - self.mean, self.covariance = self.kalman_filter.predict(mean_state, - self.covariance) - - @staticmethod - def multi_predict(tracks, kalman_filter): - if len(tracks) > 0: - multi_mean = np.asarray([track.mean.copy() for track in tracks]) - multi_covariance = np.asarray( - [track.covariance for track in tracks]) - for i, st in enumerate(tracks): - if st.state != TrackState.Tracked: - multi_mean[i][7] = 0 - multi_mean, multi_covariance = kalman_filter.multi_predict( - multi_mean, multi_covariance) - for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): - tracks[i].mean = mean - tracks[i].covariance = cov - - @staticmethod - def multi_gmc(stracks, H=np.eye(2, 3)): - if len(stracks) > 0: - multi_mean = 
np.asarray([st.mean.copy() for st in stracks]) - multi_covariance = np.asarray([st.covariance for st in stracks]) - - R = H[:2, :2] - R8x8 = np.kron(np.eye(4, dtype=float), R) - t = H[:2, 2] - - for i, (mean, cov) in enumerate(zip(multi_mean, multi_covariance)): - mean = R8x8.dot(mean) - mean[:2] += t - cov = R8x8.dot(cov).dot(R8x8.transpose()) - - stracks[i].mean = mean - stracks[i].covariance = cov - - def reset_track_id(self): - self.reset_track_count(self.cls_id) - - def activate(self, kalman_filter, frame_id): - """Start a new track""" - self.kalman_filter = kalman_filter - # update track id for the object class - self.track_id = self.next_id(self.cls_id) - self.mean, self.covariance = self.kalman_filter.initiate( - self.tlwh_to_xyah(self._tlwh)) - - self.track_len = 0 - self.state = TrackState.Tracked # set flag 'tracked' - - if frame_id == 1: # to record the first frame's detection result - self.is_activated = True - - self.frame_id = frame_id - self.start_frame = frame_id - - def re_activate(self, new_track, frame_id, new_id=False): - self.mean, self.covariance = self.kalman_filter.update( - self.mean, self.covariance, self.tlwh_to_xyah(new_track.tlwh)) - if self.use_reid: - self.update_features(new_track.curr_feat) - self.track_len = 0 - self.state = TrackState.Tracked - self.is_activated = True - self.frame_id = frame_id - if new_id: # update track id for the object class - self.track_id = self.next_id(self.cls_id) - - def update(self, new_track, frame_id, update_feature=True): - self.frame_id = frame_id - self.track_len += 1 - - new_tlwh = new_track.tlwh - self.mean, self.covariance = self.kalman_filter.update( - self.mean, self.covariance, self.tlwh_to_xyah(new_tlwh)) - self.state = TrackState.Tracked # set flag 'tracked' - self.is_activated = True # set flag 'activated' - - self.score = new_track.score - if update_feature and self.use_reid: - self.update_features(new_track.curr_feat) - - @property - def tlwh(self): - """Get current position in bounding box format `(top left x, top left y, - width, height)`. - """ - if self.mean is None: - return self._tlwh.copy() - - ret = self.mean[:4].copy() - ret[2] *= ret[3] - ret[:2] -= ret[2:] / 2 - return ret - - @property - def tlbr(self): - """Convert bounding box to format `(min x, min y, max x, max y)`, i.e., - `(top left, bottom right)`. - """ - ret = self.tlwh.copy() - ret[2:] += ret[:2] - return ret - - @staticmethod - def tlwh_to_xyah(tlwh): - """Convert bounding box to format `(center x, center y, aspect ratio, - height)`, where the aspect ratio is `width / height`. 
- """ - ret = np.asarray(tlwh).copy() - ret[:2] += ret[2:] / 2 - ret[2] /= ret[3] - return ret - - def to_xyah(self): - return self.tlwh_to_xyah(self.tlwh) - - @staticmethod - def tlbr_to_tlwh(tlbr): - ret = np.asarray(tlbr).copy() - ret[2:] -= ret[:2] - return ret - - @staticmethod - def tlwh_to_tlbr(tlwh): - ret = np.asarray(tlwh).copy() - ret[2:] += ret[:2] - return ret - - def __repr__(self): - return 'OT_({}-{})_({}-{})'.format(self.cls_id, self.track_id, - self.start_frame, self.end_frame) - - -def joint_stracks(tlista, tlistb): - exists = {} - res = [] - for t in tlista: - exists[t.track_id] = 1 - res.append(t) - for t in tlistb: - tid = t.track_id - if not exists.get(tid, 0): - exists[tid] = 1 - res.append(t) - return res - - -def sub_stracks(tlista, tlistb): - stracks = {} - for t in tlista: - stracks[t.track_id] = t - for t in tlistb: - tid = t.track_id - if stracks.get(tid, 0): - del stracks[tid] - return list(stracks.values()) - - -def remove_duplicate_stracks(stracksa, stracksb): - pdist = matching.iou_distance(stracksa, stracksb) - pairs = np.where(pdist < 0.15) - dupa, dupb = list(), list() - for p, q in zip(*pairs): - timep = stracksa[p].frame_id - stracksa[p].start_frame - timeq = stracksb[q].frame_id - stracksb[q].start_frame - if timep > timeq: - dupb.append(q) - else: - dupa.append(p) - resa = [t for i, t in enumerate(stracksa) if not i in dupa] - resb = [t for i, t in enumerate(stracksb) if not i in dupb] - return resa, resb diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_sde_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_sde_tracker.py deleted file mode 100644 index accc201..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/base_sde_tracker.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/track.py -""" - -import datetime -from ppdet.core.workspace import register, serializable - -__all__ = ['TrackState', 'Track'] - - -class TrackState(object): - """ - Enumeration type for the single target track state. Newly created tracks are - classified as `tentative` until enough evidence has been collected. Then, - the track state is changed to `confirmed`. Tracks that are no longer alive - are classified as `deleted` to mark them for removal from the set of active - tracks. - """ - Tentative = 1 - Confirmed = 2 - Deleted = 3 - - -@register -@serializable -class Track(object): - """ - A single target track with state space `(x, y, a, h)` and associated - velocities, where `(x, y)` is the center of the bounding box, `a` is the - aspect ratio and `h` is the height. - - Args: - mean (ndarray): Mean vector of the initial state distribution. - covariance (ndarray): Covariance matrix of the initial state distribution. - track_id (int): A unique track identifier. - n_init (int): Number of consecutive detections before the track is confirmed. 
- The track state is set to `Deleted` if a miss occurs within the first - `n_init` frames. - max_age (int): The maximum number of consecutive misses before the track - state is set to `Deleted`. - cls_id (int): The category id of the tracked box. - score (float): The confidence score of the tracked box. - feature (Optional[ndarray]): Feature vector of the detection this track - originates from. If not None, this feature is added to the `features` cache. - - Attributes: - hits (int): Total number of measurement updates. - age (int): Total number of frames since first occurrence. - time_since_update (int): Total number of frames since last measurement - update. - state (TrackState): The current track state. - features (List[ndarray]): A cache of features. On each measurement update, - the associated feature vector is added to this list. - """ - - def __init__(self, - mean, - covariance, - track_id, - n_init, - max_age, - cls_id, - score, - feature=None): - self.mean = mean - self.covariance = covariance - self.track_id = track_id - self.hits = 1 - self.age = 1 - self.time_since_update = 0 - self.cls_id = cls_id - self.score = score - self.start_time = datetime.datetime.now() - - self.state = TrackState.Tentative - self.features = [] - self.feat = feature - if feature is not None: - self.features.append(feature) - - self._n_init = n_init - self._max_age = max_age - - def to_tlwh(self): - """Get position in format `(top left x, top left y, width, height)`.""" - ret = self.mean[:4].copy() - ret[2] *= ret[3] - ret[:2] -= ret[2:] / 2 - return ret - - def to_tlbr(self): - """Get position in bounding box format `(min x, min y, max x, max y)`.""" - ret = self.to_tlwh() - ret[2:] = ret[:2] + ret[2:] - return ret - - def predict(self, kalman_filter): - """ - Propagate the state distribution to the current time step using a Kalman - filter prediction step. - """ - self.mean, self.covariance = kalman_filter.predict(self.mean, - self.covariance) - self.age += 1 - self.time_since_update += 1 - - def update(self, kalman_filter, detection): - """ - Perform Kalman filter measurement update step and update the associated - detection feature cache. - """ - self.mean, self.covariance = kalman_filter.update(self.mean, - self.covariance, - detection.to_xyah()) - self.features.append(detection.feature) - self.feat = detection.feature - self.cls_id = detection.cls_id - self.score = detection.score - - self.hits += 1 - self.time_since_update = 0 - if self.state == TrackState.Tentative and self.hits >= self._n_init: - self.state = TrackState.Confirmed - - def mark_missed(self): - """Mark this track as missed (no association at the current time step).
- """ - if self.state == TrackState.Tentative: - self.state = TrackState.Deleted - elif self.time_since_update > self._max_age: - self.state = TrackState.Deleted - - def is_tentative(self): - """Returns True if this track is tentative (unconfirmed).""" - return self.state == TrackState.Tentative - - def is_confirmed(self): - """Returns True if this track is confirmed.""" - return self.state == TrackState.Confirmed - - def is_deleted(self): - """Returns True if this track is dead and should be deleted.""" - return self.state == TrackState.Deleted diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/botsort_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/botsort_tracker.py deleted file mode 100644 index 4f412a7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/botsort_tracker.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/WWangYuHsiang/SMILEtrack/blob/main/BoT-SORT/tracker/bot_sort.py -""" - -import cv2 -import matplotlib.pyplot as plt -import numpy as np -from collections import deque - -from ..matching import jde_matching as matching -from ..motion import GMC -from .base_jde_tracker import TrackState, STrack -from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks -from ..motion import KalmanFilter - -from ppdet.core.workspace import register, serializable - - -@register -@serializable -class BOTSORTTracker(object): - """ - BOTSORT tracker, support single class - - Args: - track_high_thresh (float): threshold of detection high score - track_low_thresh (float): threshold of remove detection score - new_track_thresh (float): threshold of new track score - match_thresh (float): iou threshold for associate - track_buffer (int): tracking reserved frames,default 30 - min_box_area (float): reserved min box - camera_motion (bool): Whether use camera motion, default False - cmc_method (str): camera motion method,defalut sparseOptFlow - frame_rate (int): fps buffer_size=int(frame_rate / 30.0 * track_buffer) - """ - - def __init__(self, - track_high_thresh=0.3, - track_low_thresh=0.2, - new_track_thresh=0.4, - match_thresh=0.7, - track_buffer=30, - min_box_area=0, - camera_motion=False, - cmc_method='sparseOptFlow', - frame_rate=30): - - self.tracked_stracks = [] # type: list[STrack] - self.lost_stracks = [] # type: list[STrack] - self.removed_stracks = [] # type: list[STrack] - - self.frame_id = 0 - - self.track_high_thresh = track_high_thresh - self.track_low_thresh = track_low_thresh - self.new_track_thresh = new_track_thresh - self.match_thresh = match_thresh - self.buffer_size = int(frame_rate / 30.0 * track_buffer) - self.max_time_lost = self.buffer_size - self.kalman_filter = KalmanFilter() - self.min_box_area = min_box_area - - self.camera_motion = camera_motion - self.gmc = GMC(method=cmc_method) - - def update(self, output_results, img=None): - self.frame_id += 1 - activated_starcks = [] - 
refind_stracks = [] - lost_stracks = [] - removed_stracks = [] - - if len(output_results): - bboxes = output_results[:, 2:6] - scores = output_results[:, 1] - classes = output_results[:, 0] - - # Remove bad detections - lowest_inds = scores > self.track_low_thresh - bboxes = bboxes[lowest_inds] - scores = scores[lowest_inds] - classes = classes[lowest_inds] - - # Find high threshold detections - remain_inds = scores > self.track_high_thresh - dets = bboxes[remain_inds] - scores_keep = scores[remain_inds] - classes_keep = classes[remain_inds] - - else: - bboxes = [] - scores = [] - classes = [] - dets = [] - scores_keep = [] - classes_keep = [] - - if len(dets) > 0: - '''Detections''' - detections = [ - STrack(STrack.tlbr_to_tlwh(tlbr), s, c) - for (tlbr, s, c) in zip(dets, scores_keep, classes_keep) - ] - else: - detections = [] - ''' Add newly detected tracklets to tracked_stracks''' - unconfirmed = [] - tracked_stracks = [] # type: list[STrack] - for track in self.tracked_stracks: - if not track.is_activated: - unconfirmed.append(track) - else: - tracked_stracks.append(track) - ''' Step 2: First association, with high score detection boxes''' - strack_pool = joint_stracks(tracked_stracks, self.lost_stracks) - - # Predict the current location with KF - STrack.multi_predict(strack_pool, self.kalman_filter) - - # Fix camera motion - if self.camera_motion: - warp = self.gmc.apply(img[0], dets) - STrack.multi_gmc(strack_pool, warp) - STrack.multi_gmc(unconfirmed, warp) - - # Associate with high score detection boxes - ious_dists = matching.iou_distance(strack_pool, detections) - matches, u_track, u_detection = matching.linear_assignment( - ious_dists, thresh=self.match_thresh) - - for itracked, idet in matches: - track = strack_pool[itracked] - det = detections[idet] - if track.state == TrackState.Tracked: - track.update(detections[idet], self.frame_id) - activated_starcks.append(track) - else: - track.re_activate(det, self.frame_id, new_id=False) - refind_stracks.append(track) - ''' Step 3: Second association, with low score detection boxes''' - if len(scores): - inds_high = scores < self.track_high_thresh - inds_low = scores > self.track_low_thresh - inds_second = np.logical_and(inds_low, inds_high) - dets_second = bboxes[inds_second] - scores_second = scores[inds_second] - classes_second = classes[inds_second] - else: - dets_second = [] - scores_second = [] - classes_second = [] - - # association the untrack to the low score detections - if len(dets_second) > 0: - '''Detections''' - detections_second = [ - STrack(STrack.tlbr_to_tlwh(tlbr), s, c) for (tlbr, s, c) in - zip(dets_second, scores_second, classes_second) - ] - else: - detections_second = [] - - r_tracked_stracks = [ - strack_pool[i] for i in u_track - if strack_pool[i].state == TrackState.Tracked - ] - dists = matching.iou_distance(r_tracked_stracks, detections_second) - matches, u_track, u_detection_second = matching.linear_assignment( - dists, thresh=0.5) - for itracked, idet in matches: - track = r_tracked_stracks[itracked] - det = detections_second[idet] - if track.state == TrackState.Tracked: - track.update(det, self.frame_id) - activated_starcks.append(track) - else: - track.re_activate(det, self.frame_id, new_id=False) - refind_stracks.append(track) - - for it in u_track: - track = r_tracked_stracks[it] - if not track.state == TrackState.Lost: - track.mark_lost() - lost_stracks.append(track) - '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' - detections = [detections[i] for i in 
u_detection] - dists = matching.iou_distance(unconfirmed, detections) - - matches, u_unconfirmed, u_detection = matching.linear_assignment( - dists, thresh=0.7) - for itracked, idet in matches: - unconfirmed[itracked].update(detections[idet], self.frame_id) - activated_starcks.append(unconfirmed[itracked]) - for it in u_unconfirmed: - track = unconfirmed[it] - track.mark_removed() - removed_stracks.append(track) - """ Step 4: Init new stracks""" - for inew in u_detection: - track = detections[inew] - if track.score < self.new_track_thresh: - continue - - track.activate(self.kalman_filter, self.frame_id) - activated_starcks.append(track) - """ Step 5: Update state""" - for track in self.lost_stracks: - if self.frame_id - track.end_frame > self.max_time_lost: - track.mark_removed() - removed_stracks.append(track) - """ Merge """ - self.tracked_stracks = [ - t for t in self.tracked_stracks if t.state == TrackState.Tracked - ] - self.tracked_stracks = joint_stracks(self.tracked_stracks, - activated_starcks) - self.tracked_stracks = joint_stracks(self.tracked_stracks, - refind_stracks) - self.lost_stracks = sub_stracks(self.lost_stracks, self.tracked_stracks) - self.lost_stracks.extend(lost_stracks) - self.lost_stracks = sub_stracks(self.lost_stracks, self.removed_stracks) - self.removed_stracks.extend(removed_stracks) - self.tracked_stracks, self.lost_stracks = remove_duplicate_stracks( - self.tracked_stracks, self.lost_stracks) - - # output_stracks = [track for track in self.tracked_stracks if track.is_activated] - output_stracks = [track for track in self.tracked_stracks] - - return output_stracks diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/center_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/center_tracker.py deleted file mode 100644 index 8005ddc..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/center_tracker.py +++ /dev/null @@ -1,149 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
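Every association round in the deleted tracker reduces to the same primitive: build a cost matrix of 1 - IoU, solve a linear assignment, and reject matches whose cost exceeds a threshold. A self-contained sketch of that primitive, with scipy.optimize.linear_sum_assignment standing in for ppdet's jde_matching.linear_assignment (the real helper may use a different solver internally):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    def iou_matrix(a, b):
        # pairwise IoU between [N, 4] and [M, 4] boxes in x0, y0, x1, y1
        tl = np.maximum(a[:, None, :2], b[None, :, :2])
        br = np.minimum(a[:, None, 2:], b[None, :, 2:])
        wh = np.clip(br - tl, 0, None)
        inter = wh[..., 0] * wh[..., 1]
        area_a = (a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1])
        area_b = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
        return inter / (area_a[:, None] + area_b[None, :] - inter + 1e-9)

    def assign(cost, thresh):
        rows, cols = linear_sum_assignment(cost)
        matches = [(int(r), int(c)) for r, c in zip(rows, cols)
                   if cost[r, c] <= thresh]
        u_rows = [r for r in range(cost.shape[0]) if r not in {m[0] for m in matches}]
        u_cols = [c for c in range(cost.shape[1]) if c not in {m[1] for m in matches}]
        return matches, u_rows, u_cols

    tracks = np.array([[10., 10, 50, 80]])
    dets = np.array([[12., 14, 48, 76], [200., 200, 240, 260]])
    cost = 1.0 - iou_matrix(tracks, dets)  # iou_distance = 1 - IoU
    print(assign(cost, thresh=0.7))        # ([(0, 0)], [], [1])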
-""" -This code is based on https://github.com/xingyizhou/CenterTrack/blob/master/src/lib/utils/tracker.py -""" - -import copy -import numpy as np -import sklearn - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['CenterTracker'] - - -@register -@serializable -class CenterTracker(object): - __shared__ = ['num_classes'] - - def __init__(self, - num_classes=1, - min_box_area=0, - vertical_ratio=-1, - track_thresh=0.4, - pre_thresh=0.5, - new_thresh=0.4, - out_thresh=0.4, - hungarian=False): - self.num_classes = num_classes - self.min_box_area = min_box_area - self.vertical_ratio = vertical_ratio - - self.track_thresh = track_thresh - self.pre_thresh = max(track_thresh, pre_thresh) - self.new_thresh = max(track_thresh, new_thresh) - self.out_thresh = max(track_thresh, out_thresh) - self.hungarian = hungarian - - self.reset() - - def init_track(self, results): - print('Initialize tracking!') - for item in results: - if item['score'] > self.new_thresh: - self.id_count += 1 - item['tracking_id'] = self.id_count - if not ('ct' in item): - bbox = item['bbox'] - item['ct'] = [(bbox[0] + bbox[2]) / 2, - (bbox[1] + bbox[3]) / 2] - self.tracks.append(item) - - def reset(self): - self.id_count = 0 - self.tracks = [] - - def update(self, results, public_det=None): - N = len(results) - M = len(self.tracks) - - dets = np.array([det['ct'] + det['tracking'] for det in results], - np.float32) # N x 2 - track_size = np.array([((track['bbox'][2] - track['bbox'][0]) * \ - (track['bbox'][3] - track['bbox'][1])) \ - for track in self.tracks], np.float32) # M - track_cat = np.array([track['class'] for track in self.tracks], - np.int32) # M - item_size = np.array([((item['bbox'][2] - item['bbox'][0]) * \ - (item['bbox'][3] - item['bbox'][1])) \ - for item in results], np.float32) # N - item_cat = np.array([item['class'] for item in results], np.int32) # N - tracks = np.array([pre_det['ct'] for pre_det in self.tracks], - np.float32) # M x 2 - dist = (((tracks.reshape(1, -1, 2) - \ - dets.reshape(-1, 1, 2)) ** 2).sum(axis=2)) # N x M - - invalid = ((dist > track_size.reshape(1, M)) + \ - (dist > item_size.reshape(N, 1)) + \ - (item_cat.reshape(N, 1) != track_cat.reshape(1, M))) > 0 - dist = dist + invalid * 1e18 - - if self.hungarian: - item_score = np.array([item['score'] for item in results], - np.float32) - dist[dist > 1e18] = 1e18 - from sklearn.utils.linear_assignment_ import linear_assignment - matched_indices = linear_assignment(dist) - else: - matched_indices = greedy_assignment(copy.deepcopy(dist)) - - unmatched_dets = [d for d in range(dets.shape[0]) \ - if not (d in matched_indices[:, 0])] - unmatched_tracks = [d for d in range(tracks.shape[0]) \ - if not (d in matched_indices[:, 1])] - - if self.hungarian: - matches = [] - for m in matched_indices: - if dist[m[0], m[1]] > 1e16: - unmatched_dets.append(m[0]) - unmatched_tracks.append(m[1]) - else: - matches.append(m) - matches = np.array(matches).reshape(-1, 2) - else: - matches = matched_indices - - ret = [] - for m in matches: - track = results[m[0]] - track['tracking_id'] = self.tracks[m[1]]['tracking_id'] - ret.append(track) - - # Private detection: create tracks for all un-matched detections - for i in unmatched_dets: - track = results[i] - if track['score'] > self.new_thresh: - self.id_count += 1 - track['tracking_id'] = self.id_count - ret.append(track) - - self.tracks = ret - return ret - - -def greedy_assignment(dist): - matched_indices = [] - if 
dist.shape[1] == 0: - return np.array(matched_indices, np.int32).reshape(-1, 2) - for i in range(dist.shape[0]): - j = dist[i].argmin() - if dist[i][j] < 1e16: - dist[:, j] = 1e18 - matched_indices.append([i, j]) - return np.array(matched_indices, np.int32).reshape(-1, 2) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/deepsort_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/deepsort_tracker.py deleted file mode 100644 index 9065dfe..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/deepsort_tracker.py +++ /dev/null @@ -1,189 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/nwojke/deep_sort/blob/master/deep_sort/tracker.py -""" - -import numpy as np - -from ..motion import KalmanFilter -from ..matching.deepsort_matching import NearestNeighborDistanceMetric -from ..matching.deepsort_matching import iou_cost, min_cost_matching, matching_cascade, gate_cost_matrix -from .base_sde_tracker import Track -from ..utils import Detection - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['DeepSORTTracker'] - - -@register -@serializable -class DeepSORTTracker(object): - """ - DeepSORT tracker - - Args: - input_size (list): input feature map size to reid model, [h, w] format, - [64, 192] as default. - min_box_area (int): min box area to filter out low quality boxes - vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results, set 1.6 default for pedestrian tracking. If set <=0 - means no need to filter bboxes. - budget (int): If not None, fix samples per class to at most this number. - Removes the oldest samples when the budget is reached. - max_age (int): maximum number of missed misses before a track is deleted - n_init (float): Number of frames that a track remains in initialization - phase. Number of consecutive detections before the track is confirmed. - The track state is set to `Deleted` if a miss occurs within the first - `n_init` frames. - metric_type (str): either "euclidean" or "cosine", the distance metric - used for measurement to track association. - matching_threshold (float): samples with larger distance are - considered an invalid match. 
- max_iou_distance (float): max iou distance threshold - motion (object): KalmanFilter instance - """ - - def __init__(self, - input_size=[64, 192], - min_box_area=0, - vertical_ratio=-1, - budget=100, - max_age=70, - n_init=3, - metric_type='cosine', - matching_threshold=0.2, - max_iou_distance=0.9, - motion='KalmanFilter'): - self.input_size = input_size - self.min_box_area = min_box_area - self.vertical_ratio = vertical_ratio - self.max_age = max_age - self.n_init = n_init - self.metric = NearestNeighborDistanceMetric(metric_type, - matching_threshold, budget) - self.max_iou_distance = max_iou_distance - if motion == 'KalmanFilter': - self.motion = KalmanFilter() - - self.tracks = [] - self._next_id = 1 - - def predict(self): - """ - Propagate track state distributions one time step forward. - This function should be called once every time step, before `update`. - """ - for track in self.tracks: - track.predict(self.motion) - - def update(self, pred_dets, pred_embs): - """ - Perform measurement update and track management. - Args: - pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'cls_id, score, x0, y0, x1, y1'. - pred_embs (np.array): Embedding results of the image, the shape is - [N, 128], usually pred_embs.shape[1] is a multiple of 128. - """ - pred_cls_ids = pred_dets[:, 0:1] - pred_scores = pred_dets[:, 1:2] - pred_xyxys = pred_dets[:, 2:6] - pred_tlwhs = np.concatenate((pred_xyxys[:, 0:2], pred_xyxys[:, 2:4] - pred_xyxys[:, 0:2] + 1), axis=1) - - detections = [ - Detection(tlwh, score, feat, cls_id) - for tlwh, score, feat, cls_id in zip(pred_tlwhs, pred_scores, - pred_embs, pred_cls_ids) - ] - - # Run matching cascade. - matches, unmatched_tracks, unmatched_detections = \ - self._match(detections) - - # Update track set. - for track_idx, detection_idx in matches: - self.tracks[track_idx].update(self.motion, - detections[detection_idx]) - for track_idx in unmatched_tracks: - self.tracks[track_idx].mark_missed() - for detection_idx in unmatched_detections: - self._initiate_track(detections[detection_idx]) - self.tracks = [t for t in self.tracks if not t.is_deleted()] - - # Update distance metric. - active_targets = [t.track_id for t in self.tracks if t.is_confirmed()] - features, targets = [], [] - for track in self.tracks: - if not track.is_confirmed(): - continue - features += track.features - targets += [track.track_id for _ in track.features] - track.features = [] - self.metric.partial_fit( - np.asarray(features), np.asarray(targets), active_targets) - output_stracks = self.tracks - return output_stracks - - def _match(self, detections): - def gated_metric(tracks, dets, track_indices, detection_indices): - features = np.array([dets[i].feature for i in detection_indices]) - targets = np.array([tracks[i].track_id for i in track_indices]) - cost_matrix = self.metric.distance(features, targets) - cost_matrix = gate_cost_matrix(self.motion, cost_matrix, tracks, - dets, track_indices, - detection_indices) - return cost_matrix - - # Split track set into confirmed and unconfirmed tracks. - confirmed_tracks = [ - i for i, t in enumerate(self.tracks) if t.is_confirmed() - ] - unconfirmed_tracks = [ - i for i, t in enumerate(self.tracks) if not t.is_confirmed() - ] - - # Associate confirmed tracks using appearance features. 
- matches_a, unmatched_tracks_a, unmatched_detections = \ - matching_cascade( - gated_metric, self.metric.matching_threshold, self.max_age, - self.tracks, detections, confirmed_tracks) - - # Associate remaining tracks together with unconfirmed tracks using IOU. - iou_track_candidates = unconfirmed_tracks + [ - k for k in unmatched_tracks_a - if self.tracks[k].time_since_update == 1 - ] - unmatched_tracks_a = [ - k for k in unmatched_tracks_a - if self.tracks[k].time_since_update != 1 - ] - matches_b, unmatched_tracks_b, unmatched_detections = \ - min_cost_matching( - iou_cost, self.max_iou_distance, self.tracks, - detections, iou_track_candidates, unmatched_detections) - - matches = matches_a + matches_b - unmatched_tracks = list(set(unmatched_tracks_a + unmatched_tracks_b)) - return matches, unmatched_tracks, unmatched_detections - - def _initiate_track(self, detection): - mean, covariance = self.motion.initiate(detection.to_xyah()) - self.tracks.append( - Track(mean, covariance, self._next_id, self.n_init, self.max_age, - detection.cls_id, detection.score, detection.feature)) - self._next_id += 1 diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/jde_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/jde_tracker.py deleted file mode 100644 index 9571a6b..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/jde_tracker.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/Zhongdao/Towards-Realtime-MOT/blob/master/tracker/multitracker.py -""" - -import numpy as np -from collections import defaultdict - -from ..matching import jde_matching as matching -from ..motion import KalmanFilter -from .base_jde_tracker import TrackState, STrack -from .base_jde_tracker import joint_stracks, sub_stracks, remove_duplicate_stracks - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = ['JDETracker'] - - -@register -@serializable -class JDETracker(object): - __shared__ = ['num_classes'] - """ - JDE tracker, support single class and multi classes - - Args: - use_byte (bool): Whether use ByteTracker, default False - num_classes (int): the number of classes - det_thresh (float): threshold of detection score - track_buffer (int): buffer for tracker - min_box_area (int): min box area to filter out low quality boxes - vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results. If set <= 0 means no need to filter bboxes,usually set - 1.6 for pedestrian tracking. 
- tracked_thresh (float): linear assignment threshold of tracked - stracks and detections - r_tracked_thresh (float): linear assignment threshold of - tracked stracks and unmatched detections - unconfirmed_thresh (float): linear assignment threshold of - unconfirmed stracks and unmatched detections - conf_thres (float): confidence threshold for tracking, also used in - ByteTracker as higher confidence threshold - match_thres (float): linear assignment threshold of tracked - stracks and detections in ByteTracker - low_conf_thres (float): lower confidence threshold for tracking in - ByteTracker - input_size (list): input feature map size to reid model, [h, w] format, - [64, 192] as default. - motion (str): motion model, KalmanFilter as default - metric_type (str): either "euclidean" or "cosine", the distance metric - used for measurement to track association. - """ - - def __init__(self, - use_byte=False, - num_classes=1, - det_thresh=0.3, - track_buffer=30, - min_box_area=0, - vertical_ratio=0, - tracked_thresh=0.7, - r_tracked_thresh=0.5, - unconfirmed_thresh=0.7, - conf_thres=0, - match_thres=0.8, - low_conf_thres=0.2, - input_size=[64, 192], - motion='KalmanFilter', - metric_type='euclidean'): - self.use_byte = use_byte - self.num_classes = num_classes - self.det_thresh = det_thresh if not use_byte else conf_thres + 0.1 - self.track_buffer = track_buffer - self.min_box_area = min_box_area - self.vertical_ratio = vertical_ratio - - self.tracked_thresh = tracked_thresh - self.r_tracked_thresh = r_tracked_thresh - self.unconfirmed_thresh = unconfirmed_thresh - self.conf_thres = conf_thres - self.match_thres = match_thres - self.low_conf_thres = low_conf_thres - - self.input_size = input_size - if motion == 'KalmanFilter': - self.motion = KalmanFilter() - self.metric_type = metric_type - - self.frame_id = 0 - self.tracked_tracks_dict = defaultdict(list) # dict(list[STrack]) - self.lost_tracks_dict = defaultdict(list) # dict(list[STrack]) - self.removed_tracks_dict = defaultdict(list) # dict(list[STrack]) - - self.max_time_lost = 0 - # max_time_lost will be calculated: int(frame_rate / 30.0 * track_buffer) - - def update(self, pred_dets, pred_embs=None): - """ - Processes the image frame and finds bounding box(detections). - Associates the detection with corresponding tracklets and also handles - lost, removed, refound and active tracklets. - - Args: - pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'cls_id, score, x0, y0, x1, y1'. - pred_embs (np.array): Embedding results of the image, the shape is - [N, 128] or [N, 512]. - - Return: - output_stracks_dict (dict(list)): The list contains information - regarding the online_tracklets for the received image tensor. 
- """ - self.frame_id += 1 - if self.frame_id == 1: - STrack.init_count(self.num_classes) - activated_tracks_dict = defaultdict(list) - refined_tracks_dict = defaultdict(list) - lost_tracks_dict = defaultdict(list) - removed_tracks_dict = defaultdict(list) - output_tracks_dict = defaultdict(list) - - pred_dets_dict = defaultdict(list) - pred_embs_dict = defaultdict(list) - - # unify single and multi classes detection and embedding results - for cls_id in range(self.num_classes): - cls_idx = (pred_dets[:, 0:1] == cls_id).squeeze(-1) - pred_dets_dict[cls_id] = pred_dets[cls_idx] - if pred_embs is not None: - pred_embs_dict[cls_id] = pred_embs[cls_idx] - else: - pred_embs_dict[cls_id] = None - - for cls_id in range(self.num_classes): - """ Step 1: Get detections by class""" - pred_dets_cls = pred_dets_dict[cls_id] - pred_embs_cls = pred_embs_dict[cls_id] - remain_inds = (pred_dets_cls[:, 1:2] > self.conf_thres).squeeze(-1) - if remain_inds.sum() > 0: - pred_dets_cls = pred_dets_cls[remain_inds] - if pred_embs_cls is None: - # in original ByteTrack - detections = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), - tlbrs[1], - cls_id, - 30, - temp_feat=None) for tlbrs in pred_dets_cls - ] - else: - pred_embs_cls = pred_embs_cls[remain_inds] - detections = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], cls_id, - 30, temp_feat) for (tlbrs, temp_feat) in - zip(pred_dets_cls, pred_embs_cls) - ] - else: - detections = [] - ''' Add newly detected tracklets to tracked_stracks''' - unconfirmed_dict = defaultdict(list) - tracked_tracks_dict = defaultdict(list) - for track in self.tracked_tracks_dict[cls_id]: - if not track.is_activated: - # previous tracks which are not active in the current frame are added in unconfirmed list - unconfirmed_dict[cls_id].append(track) - else: - # Active tracks are added to the local list 'tracked_stracks' - tracked_tracks_dict[cls_id].append(track) - """ Step 2: First association, with embedding""" - # building tracking pool for the current frame - track_pool_dict = defaultdict(list) - track_pool_dict[cls_id] = joint_stracks( - tracked_tracks_dict[cls_id], self.lost_tracks_dict[cls_id]) - - # Predict the current location with KalmanFilter - STrack.multi_predict(track_pool_dict[cls_id], self.motion) - - if pred_embs_cls is None: - # in original ByteTrack - dists = matching.iou_distance(track_pool_dict[cls_id], - detections) - matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.match_thres) # not self.tracked_thresh - else: - dists = matching.embedding_distance( - track_pool_dict[cls_id], - detections, - metric=self.metric_type) - dists = matching.fuse_motion( - self.motion, dists, track_pool_dict[cls_id], detections) - matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.tracked_thresh) - - for i_tracked, idet in matches: - # i_tracked is the id of the track and idet is the detection - track = track_pool_dict[cls_id][i_tracked] - det = detections[idet] - if track.state == TrackState.Tracked: - # If the track is active, add the detection to the track - track.update(detections[idet], self.frame_id) - activated_tracks_dict[cls_id].append(track) - else: - # We have obtained a detection from a track which is not active, - # hence put the track in refind_stracks list - track.re_activate(det, self.frame_id, new_id=False) - refined_tracks_dict[cls_id].append(track) - - # None of the steps below happen if there are no undetected tracks. 
- """ Step 3: Second association, with IOU""" - if self.use_byte: - inds_low = pred_dets_dict[cls_id][:, 1:2] > self.low_conf_thres - inds_high = pred_dets_dict[cls_id][:, 1:2] < self.conf_thres - inds_second = np.logical_and(inds_low, inds_high).squeeze(-1) - pred_dets_cls_second = pred_dets_dict[cls_id][inds_second] - - # association the untrack to the low score detections - if len(pred_dets_cls_second) > 0: - if pred_embs_dict[cls_id] is None: - # in original ByteTrack - detections_second = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), - tlbrs[1], - cls_id, - 30, - temp_feat=None) - for tlbrs in pred_dets_cls_second - ] - else: - pred_embs_cls_second = pred_embs_dict[cls_id][ - inds_second] - detections_second = [ - STrack( - STrack.tlbr_to_tlwh(tlbrs[2:6]), tlbrs[1], - cls_id, 30, temp_feat) for (tlbrs, temp_feat) in - zip(pred_dets_cls_second, pred_embs_cls_second) - ] - else: - detections_second = [] - r_tracked_stracks = [ - track_pool_dict[cls_id][i] for i in u_track - if track_pool_dict[cls_id][i].state == TrackState.Tracked - ] - dists = matching.iou_distance(r_tracked_stracks, - detections_second) - matches, u_track, u_detection_second = matching.linear_assignment( - dists, thresh=0.4) # not r_tracked_thresh - else: - detections = [detections[i] for i in u_detection] - r_tracked_stracks = [] - for i in u_track: - if track_pool_dict[cls_id][i].state == TrackState.Tracked: - r_tracked_stracks.append(track_pool_dict[cls_id][i]) - dists = matching.iou_distance(r_tracked_stracks, detections) - - matches, u_track, u_detection = matching.linear_assignment( - dists, thresh=self.r_tracked_thresh) - - for i_tracked, idet in matches: - track = r_tracked_stracks[i_tracked] - det = detections[ - idet] if not self.use_byte else detections_second[idet] - if track.state == TrackState.Tracked: - track.update(det, self.frame_id) - activated_tracks_dict[cls_id].append(track) - else: - track.re_activate(det, self.frame_id, new_id=False) - refined_tracks_dict[cls_id].append(track) - - for it in u_track: - track = r_tracked_stracks[it] - if not track.state == TrackState.Lost: - track.mark_lost() - lost_tracks_dict[cls_id].append(track) - '''Deal with unconfirmed tracks, usually tracks with only one beginning frame''' - detections = [detections[i] for i in u_detection] - dists = matching.iou_distance(unconfirmed_dict[cls_id], detections) - matches, u_unconfirmed, u_detection = matching.linear_assignment( - dists, thresh=self.unconfirmed_thresh) - for i_tracked, idet in matches: - unconfirmed_dict[cls_id][i_tracked].update(detections[idet], - self.frame_id) - activated_tracks_dict[cls_id].append(unconfirmed_dict[cls_id][ - i_tracked]) - for it in u_unconfirmed: - track = unconfirmed_dict[cls_id][it] - track.mark_removed() - removed_tracks_dict[cls_id].append(track) - """ Step 4: Init new stracks""" - for inew in u_detection: - track = detections[inew] - if track.score < self.det_thresh: - continue - track.activate(self.motion, self.frame_id) - activated_tracks_dict[cls_id].append(track) - """ Step 5: Update state""" - for track in self.lost_tracks_dict[cls_id]: - if self.frame_id - track.end_frame > self.max_time_lost: - track.mark_removed() - removed_tracks_dict[cls_id].append(track) - - self.tracked_tracks_dict[cls_id] = [ - t for t in self.tracked_tracks_dict[cls_id] - if t.state == TrackState.Tracked - ] - self.tracked_tracks_dict[cls_id] = joint_stracks( - self.tracked_tracks_dict[cls_id], activated_tracks_dict[cls_id]) - self.tracked_tracks_dict[cls_id] = joint_stracks( - 
self.tracked_tracks_dict[cls_id], refined_tracks_dict[cls_id]) - self.lost_tracks_dict[cls_id] = sub_stracks( - self.lost_tracks_dict[cls_id], self.tracked_tracks_dict[cls_id]) - self.lost_tracks_dict[cls_id].extend(lost_tracks_dict[cls_id]) - self.lost_tracks_dict[cls_id] = sub_stracks( - self.lost_tracks_dict[cls_id], self.removed_tracks_dict[cls_id]) - self.removed_tracks_dict[cls_id].extend(removed_tracks_dict[cls_id]) - self.tracked_tracks_dict[cls_id], self.lost_tracks_dict[ - cls_id] = remove_duplicate_stracks( - self.tracked_tracks_dict[cls_id], - self.lost_tracks_dict[cls_id]) - - # get scores of lost tracks - output_tracks_dict[cls_id] = [ - track for track in self.tracked_tracks_dict[cls_id] - if track.is_activated - ] - - logger.debug('===========Frame {}=========='.format(self.frame_id)) - logger.debug('Activated: {}'.format( - [track.track_id for track in activated_tracks_dict[cls_id]])) - logger.debug('Refind: {}'.format( - [track.track_id for track in refined_tracks_dict[cls_id]])) - logger.debug('Lost: {}'.format( - [track.track_id for track in lost_tracks_dict[cls_id]])) - logger.debug('Removed: {}'.format( - [track.track_id for track in removed_tracks_dict[cls_id]])) - - return output_tracks_dict diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/ocsort_tracker.py b/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/ocsort_tracker.py deleted file mode 100644 index 49b44e3..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/tracker/ocsort_tracker.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -This code is based on https://github.com/noahcao/OC_SORT/blob/master/trackers/ocsort_tracker/ocsort.py -""" - -import numpy as np -from ..matching.ocsort_matching import associate, linear_assignment, iou_batch, associate_only_iou -from ..motion.ocsort_kalman_filter import OCSORTKalmanFilter -from ppdet.core.workspace import register, serializable - - -def k_previous_obs(observations, cur_age, k): - if len(observations) == 0: - return [-1, -1, -1, -1, -1] - for i in range(k): - dt = k - i - if cur_age - dt in observations: - return observations[cur_age - dt] - max_age = max(observations.keys()) - return observations[max_age] - - -def convert_bbox_to_z(bbox): - """ - Takes a bounding box in the form [x1,y1,x2,y2] and returns z in the form - [x,y,s,r] where x,y is the centre of the box and s is the scale/area and r is - the aspect ratio - """ - w = bbox[2] - bbox[0] - h = bbox[3] - bbox[1] - x = bbox[0] + w / 2. - y = bbox[1] + h / 2. 
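k_previous_obs, defined above, walks back dt = k, k-1, ..., 1 frames and returns the first stored observation it finds, i.e. the one closest to k frames old; only when none of those ages exist does it fall back to the newest observation. A small trace with placeholder values:

    observations = {3: 'obs@age3', 5: 'obs@age5'}   # age -> observation (placeholders)
    cur_age, k = 6, 3

    result = None
    for i in range(k):
        dt = k - i                                  # dt = 3, 2, 1
        if cur_age - dt in observations:
            result = observations[cur_age - dt]
            break
    if result is None:                              # nothing in the window: take newest
        result = observations[max(observations)]
    print(result)                                   # obs@age3 (dt = 3 is checked first)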
- s = w * h # scale is just area - r = w / float(h + 1e-6) - return np.array([x, y, s, r]).reshape((4, 1)) - - -def convert_x_to_bbox(x, score=None): - """ - Takes a bounding box in the centre form [x,y,s,r] and returns it in the form - [x1,y1,x2,y2] where x1,y1 is the top left and x2,y2 is the bottom right - """ - w = np.sqrt(x[2] * x[3]) - h = x[2] / w - if (score == None): - return np.array( - [x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., - x[1] + h / 2.]).reshape((1, 4)) - else: - score = np.array([score]) - return np.array([ - x[0] - w / 2., x[1] - h / 2., x[0] + w / 2., x[1] + h / 2., score - ]).reshape((1, 5)) - - -def speed_direction(bbox1, bbox2): - cx1, cy1 = (bbox1[0] + bbox1[2]) / 2.0, (bbox1[1] + bbox1[3]) / 2.0 - cx2, cy2 = (bbox2[0] + bbox2[2]) / 2.0, (bbox2[1] + bbox2[3]) / 2.0 - speed = np.array([cy2 - cy1, cx2 - cx1]) - norm = np.sqrt((cy2 - cy1)**2 + (cx2 - cx1)**2) + 1e-6 - return speed / norm - - -class KalmanBoxTracker(object): - """ - This class represents the internal state of individual tracked objects observed as bbox. - - Args: - bbox (np.array): bbox in [x1,y1,x2,y2,score] format. - delta_t (int): delta_t of previous observation - """ - count = 0 - - def __init__(self, bbox, delta_t=3): - - self.kf = OCSORTKalmanFilter(dim_x=7, dim_z=4) - self.kf.F = np.array([[1., 0, 0, 0, 1., 0, 0], [0, 1., 0, 0, 0, 1., 0], - [0, 0, 1., 0, 0, 0, 1], [0, 0, 0, 1., 0, 0, 0], - [0, 0, 0, 0, 1., 0, 0], [0, 0, 0, 0, 0, 1., 0], - [0, 0, 0, 0, 0, 0, 1.]]) - self.kf.H = np.array([[1., 0, 0, 0, 0, 0, 0], [0, 1., 0, 0, 0, 0, 0], - [0, 0, 1., 0, 0, 0, 0], [0, 0, 0, 1., 0, 0, 0]]) - self.kf.R[2:, 2:] *= 10. - self.kf.P[4:, 4:] *= 1000. - # give high uncertainty to the unobservable initial velocities - self.kf.P *= 10. - self.kf.Q[-1, -1] *= 0.01 - self.kf.Q[4:, 4:] *= 0.01 - - self.score = bbox[4] - self.kf.x[:4] = convert_bbox_to_z(bbox) - self.time_since_update = 0 - self.id = KalmanBoxTracker.count - KalmanBoxTracker.count += 1 - self.history = [] - self.hits = 0 - self.hit_streak = 0 - self.age = 0 - """ - NOTE: [-1,-1,-1,-1,-1] is a compromising placeholder for non-observation status, the same for the return of - function k_previous_obs. It is ugly and I do not like it. But to support generate observation array in a - fast and unified way, which you would see below k_observations = np.array([k_previous_obs(...]]), let's bear it for now. - """ - self.last_observation = np.array([-1, -1, -1, -1, -1]) # placeholder - self.observations = dict() - self.history_observations = [] - self.velocity = None - self.delta_t = delta_t - - def update(self, bbox, angle_cost=False): - """ - Updates the state vector with observed bbox. - """ - if bbox is not None: - if angle_cost and self.last_observation.sum( - ) >= 0: # no previous observation - previous_box = None - for i in range(self.delta_t): - dt = self.delta_t - i - if self.age - dt in self.observations: - previous_box = self.observations[self.age - dt] - break - if previous_box is None: - previous_box = self.last_observation - """ - Estimate the track speed direction with observations \Delta t steps away - """ - self.velocity = speed_direction(previous_box, bbox) - """ - Insert new observations. This is a ugly way to maintain both self.observations - and self.history_observations. Bear it for the moment. 
- """ - self.last_observation = bbox - self.observations[self.age] = bbox - self.history_observations.append(bbox) - - self.time_since_update = 0 - self.history = [] - self.hits += 1 - self.hit_streak += 1 - self.kf.update(convert_bbox_to_z(bbox)) - else: - self.kf.update(bbox) - - def predict(self): - """ - Advances the state vector and returns the predicted bounding box estimate. - """ - if ((self.kf.x[6] + self.kf.x[2]) <= 0): - self.kf.x[6] *= 0.0 - - self.kf.predict() - self.age += 1 - if (self.time_since_update > 0): - self.hit_streak = 0 - self.time_since_update += 1 - self.history.append(convert_x_to_bbox(self.kf.x, score=self.score)) - return self.history[-1] - - def get_state(self): - return convert_x_to_bbox(self.kf.x, score=self.score) - - -@register -@serializable -class OCSORTTracker(object): - """ - OCSORT tracker, support single class - - Args: - det_thresh (float): threshold of detection score - max_age (int): maximum number of missed misses before a track is deleted - min_hits (int): minimum hits for associate - iou_threshold (float): iou threshold for associate - delta_t (int): delta_t of previous observation - inertia (float): vdc_weight of angle_diff_cost for associate - vertical_ratio (float): w/h, the vertical ratio of the bbox to filter - bad results. If set <= 0 means no need to filter bboxes,usually set - 1.6 for pedestrian tracking. - min_box_area (int): min box area to filter out low quality boxes - use_byte (bool): Whether use ByteTracker, default False - """ - - def __init__(self, - det_thresh=0.6, - max_age=30, - min_hits=3, - iou_threshold=0.3, - delta_t=3, - inertia=0.2, - vertical_ratio=-1, - min_box_area=0, - use_byte=False, - use_angle_cost=False): - self.det_thresh = det_thresh - self.max_age = max_age - self.min_hits = min_hits - self.iou_threshold = iou_threshold - self.delta_t = delta_t - self.inertia = inertia - self.vertical_ratio = vertical_ratio - self.min_box_area = min_box_area - self.use_byte = use_byte - self.use_angle_cost = use_angle_cost - - self.trackers = [] - self.frame_count = 0 - KalmanBoxTracker.count = 0 - - def update(self, pred_dets, pred_embs=None): - """ - Args: - pred_dets (np.array): Detection results of the image, the shape is - [N, 6], means 'cls_id, score, x0, y0, x1, y1'. - pred_embs (np.array): Embedding results of the image, the shape is - [N, 128] or [N, 512], default as None. - - Return: - tracking boxes (np.array): [M, 6], means 'x0, y0, x1, y1, score, id'. - """ - if pred_dets is None: - return np.empty((0, 6)) - - self.frame_count += 1 - - bboxes = pred_dets[:, 2:] - scores = pred_dets[:, 1:2] - dets = np.concatenate((bboxes, scores), axis=1) - scores = scores.squeeze(-1) - - inds_low = scores > 0.1 - inds_high = scores < self.det_thresh - inds_second = np.logical_and(inds_low, inds_high) - # self.det_thresh > score > 0.1, for second matching - dets_second = dets[inds_second] # detections for second matching - remain_inds = scores > self.det_thresh - dets = dets[remain_inds] - - # get predicted locations from existing trackers. 
- trks = np.zeros((len(self.trackers), 5)) - to_del = [] - ret = [] - for t, trk in enumerate(trks): - pos = self.trackers[t].predict()[0] - trk[:] = [pos[0], pos[1], pos[2], pos[3], 0] - if np.any(np.isnan(pos)): - to_del.append(t) - trks = np.ma.compress_rows(np.ma.masked_invalid(trks)) - for t in reversed(to_del): - self.trackers.pop(t) - - if self.use_angle_cost: - velocities = np.array([ - trk.velocity if trk.velocity is not None else np.array((0, 0)) - for trk in self.trackers - ]) - - k_observations = np.array([ - k_previous_obs(trk.observations, trk.age, self.delta_t) - for trk in self.trackers - ]) - last_boxes = np.array([trk.last_observation for trk in self.trackers]) - """ - First round of association - """ - if self.use_angle_cost: - matched, unmatched_dets, unmatched_trks = associate( - dets, trks, self.iou_threshold, velocities, k_observations, - self.inertia) - else: - matched, unmatched_dets, unmatched_trks = associate_only_iou( - dets, trks, self.iou_threshold) - - for m in matched: - self.trackers[m[1]].update( - dets[m[0], :], angle_cost=self.use_angle_cost) - """ - Second round of associaton by OCR - """ - # BYTE association - if self.use_byte and len(dets_second) > 0 and unmatched_trks.shape[ - 0] > 0: - u_trks = trks[unmatched_trks] - iou_left = iou_batch( - dets_second, - u_trks) # iou between low score detections and unmatched tracks - iou_left = np.array(iou_left) - if iou_left.max() > self.iou_threshold: - """ - NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may - get a higher performance especially on MOT17/MOT20 datasets. But we keep it - uniform here for simplicity - """ - matched_indices = linear_assignment(-iou_left) - to_remove_trk_indices = [] - for m in matched_indices: - det_ind, trk_ind = m[0], unmatched_trks[m[1]] - if iou_left[m[0], m[1]] < self.iou_threshold: - continue - self.trackers[trk_ind].update( - dets_second[det_ind, :], angle_cost=self.use_angle_cost) - to_remove_trk_indices.append(trk_ind) - unmatched_trks = np.setdiff1d(unmatched_trks, - np.array(to_remove_trk_indices)) - - if unmatched_dets.shape[0] > 0 and unmatched_trks.shape[0] > 0: - left_dets = dets[unmatched_dets] - left_trks = last_boxes[unmatched_trks] - iou_left = iou_batch(left_dets, left_trks) - iou_left = np.array(iou_left) - if iou_left.max() > self.iou_threshold: - """ - NOTE: by using a lower threshold, e.g., self.iou_threshold - 0.1, you may - get a higher performance especially on MOT17/MOT20 datasets. 
But we keep it - uniform here for simplicity - """ - rematched_indices = linear_assignment(-iou_left) - to_remove_det_indices = [] - to_remove_trk_indices = [] - for m in rematched_indices: - det_ind, trk_ind = unmatched_dets[m[0]], unmatched_trks[m[ - 1]] - if iou_left[m[0], m[1]] < self.iou_threshold: - continue - self.trackers[trk_ind].update( - dets[det_ind, :], angle_cost=self.use_angle_cost) - to_remove_det_indices.append(det_ind) - to_remove_trk_indices.append(trk_ind) - unmatched_dets = np.setdiff1d(unmatched_dets, - np.array(to_remove_det_indices)) - unmatched_trks = np.setdiff1d(unmatched_trks, - np.array(to_remove_trk_indices)) - - for m in unmatched_trks: - self.trackers[m].update(None) - - # create and initialise new trackers for unmatched detections - for i in unmatched_dets: - trk = KalmanBoxTracker(dets[i, :], delta_t=self.delta_t) - self.trackers.append(trk) - - i = len(self.trackers) - for trk in reversed(self.trackers): - if trk.last_observation.sum() < 0: - d = trk.get_state()[0] - else: - d = trk.last_observation # tlbr + score - if (trk.time_since_update < 1) and ( - trk.hit_streak >= self.min_hits or - self.frame_count <= self.min_hits): - # +1 as MOT benchmark requires positive - ret.append(np.concatenate((d, [trk.id + 1])).reshape(1, -1)) - i -= 1 - # remove dead tracklet - if (trk.time_since_update > self.max_age): - self.trackers.pop(i) - if (len(ret) > 0): - return np.concatenate(ret) - return np.empty((0, 6)) diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/utils.py b/pdfdet/models/Paddle/ppdet/modeling/mot/utils.py deleted file mode 100644 index f19b0d9..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/utils.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import cv2 -import time -import numpy as np -from .visualization import plot_tracking_dict, plot_tracking - -__all__ = [ - 'MOTTimer', - 'Detection', - 'write_mot_results', - 'save_vis_results', - 'load_det_results', - 'preprocess_reid', - 'get_crops', - 'clip_box', - 'scale_coords', -] - - -class MOTTimer(object): - """ - This class used to compute and print the current FPS while evaling. - """ - - def __init__(self): - self.total_time = 0. - self.calls = 0 - self.start_time = 0. - self.diff = 0. - self.average_time = 0. - self.duration = 0. - - def tic(self): - # using time.time instead of time.clock because time time.clock - # does not normalize for multithreading - self.start_time = time.time() - - def toc(self, average=True): - self.diff = time.time() - self.start_time - self.total_time += self.diff - self.calls += 1 - self.average_time = self.total_time / self.calls - if average: - self.duration = self.average_time - else: - self.duration = self.diff - return self.duration - - def clear(self): - self.total_time = 0. - self.calls = 0 - self.start_time = 0. - self.diff = 0. - self.average_time = 0. - self.duration = 0. 
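MOTTimer above reports FPS from a running average rather than the last frame, which smooths per-frame jitter during evaluation. A trimmed, runnable copy to show the tic/toc pattern:

    import time

    class MOTTimer:
        # trimmed copy of the helper above, just enough for a demo
        def __init__(self):
            self.total_time, self.calls, self.average_time = 0., 0, 0.

        def tic(self):
            self.start_time = time.time()

        def toc(self):
            self.total_time += time.time() - self.start_time
            self.calls += 1
            self.average_time = self.total_time / self.calls

    timer = MOTTimer()
    for _ in range(3):
        timer.tic()
        time.sleep(0.01)    # stand-in for one detect + track step
        timer.toc()
    print('fps ~= %.1f' % (1.0 / timer.average_time))   # close to 100 here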
- - -class Detection(object): - """ - This class represents a bounding box detection in a single image. - - Args: - tlwh (Tensor): Bounding box in format `(top left x, top left y, - width, height)`. - score (Tensor): Bounding box confidence score. - feature (Tensor): A feature vector that describes the object - contained in this image. - cls_id (Tensor): Bounding box category id. - """ - - def __init__(self, tlwh, score, feature, cls_id): - self.tlwh = np.asarray(tlwh, dtype=np.float32) - self.score = float(score) - self.feature = np.asarray(feature, dtype=np.float32) - self.cls_id = int(cls_id) - - def to_tlbr(self): - """ - Convert bounding box to format `(min x, min y, max x, max y)`, i.e., - `(top left, bottom right)`. - """ - ret = self.tlwh.copy() - ret[2:] += ret[:2] - return ret - - def to_xyah(self): - """ - Convert bounding box to format `(center x, center y, aspect ratio, - height)`, where the aspect ratio is `width / height`. - """ - ret = self.tlwh.copy() - ret[:2] += ret[2:] / 2 - ret[2] /= ret[3] - return ret - - -def write_mot_results(filename, results, data_type='mot', num_classes=1): - # support single and multi classes - if data_type in ['mot', 'mcmot']: - save_format = '{frame},{id},{x1},{y1},{w},{h},{score},{cls_id},-1,-1\n' - elif data_type == 'kitti': - save_format = '{frame} {id} car 0 0 -10 {x1} {y1} {x2} {y2} -10 -10 -10 -1000 -1000 -1000 -10\n' - else: - raise ValueError(data_type) - - f = open(filename, 'w') - for cls_id in range(num_classes): - for frame_id, tlwhs, tscores, track_ids in results[cls_id]: - if data_type == 'kitti': - frame_id -= 1 - for tlwh, score, track_id in zip(tlwhs, tscores, track_ids): - if track_id < 0: continue - if data_type == 'mot': - cls_id = -1 - - x1, y1, w, h = tlwh - x2, y2 = x1 + w, y1 + h - line = save_format.format( - frame=frame_id, - id=track_id, - x1=x1, - y1=y1, - x2=x2, - y2=y2, - w=w, - h=h, - score=score, - cls_id=cls_id) - f.write(line) - print('MOT results save in {}'.format(filename)) - - -def save_vis_results(data, - frame_id, - online_ids, - online_tlwhs, - online_scores, - average_time, - show_image, - save_dir, - num_classes=1, - ids2names=[]): - if show_image or save_dir is not None: - assert 'ori_image' in data - img0 = data['ori_image'].numpy()[0] - if online_ids is None: - online_im = img0 - else: - if isinstance(online_tlwhs, dict): - online_im = plot_tracking_dict( - img0, - num_classes, - online_tlwhs, - online_ids, - online_scores, - frame_id=frame_id, - fps=1. / average_time, - ids2names=ids2names) - else: - online_im = plot_tracking( - img0, - online_tlwhs, - online_ids, - online_scores, - frame_id=frame_id, - fps=1. 
/ average_time, - ids2names=ids2names) - if show_image: - cv2.imshow('online_im', online_im) - if save_dir is not None: - cv2.imwrite( - os.path.join(save_dir, '{:05d}.jpg'.format(frame_id)), online_im) - - -def load_det_results(det_file, num_frames): - assert os.path.exists(det_file) and os.path.isfile(det_file), \ - '{} is not exist or not a file.'.format(det_file) - labels = np.loadtxt(det_file, dtype='float32', delimiter=',') - assert labels.shape[1] == 7, \ - "Each line of {} should have 7 items: '[frame_id],[x0],[y0],[w],[h],[score],[class_id]'.".format(det_file) - results_list = [] - for frame_i in range(num_frames): - results = {'bbox': [], 'score': [], 'cls_id': []} - lables_with_frame = labels[labels[:, 0] == frame_i + 1] - # each line of lables_with_frame: - # [frame_id],[x0],[y0],[w],[h],[score],[class_id] - for l in lables_with_frame: - results['bbox'].append(l[1:5]) - results['score'].append(l[5:6]) - results['cls_id'].append(l[6:7]) - results_list.append(results) - return results_list - - -def scale_coords(coords, input_shape, im_shape, scale_factor): - # Note: ratio has only one value, scale_factor[0] == scale_factor[1] - # - # This function only used for JDE YOLOv3 or other detectors with - # LetterBoxResize and JDEBBoxPostProcess, coords output from detector had - # not scaled back to the origin image. - - ratio = scale_factor[0] - pad_w = (input_shape[1] - int(im_shape[1])) / 2 - pad_h = (input_shape[0] - int(im_shape[0])) / 2 - coords[:, 0::2] -= pad_w - coords[:, 1::2] -= pad_h - coords[:, 0:4] /= ratio - coords[:, :4] = np.clip(coords[:, :4], a_min=0, a_max=coords[:, :4].max()) - return coords.round() - - -def clip_box(xyxy, ori_image_shape): - H, W = ori_image_shape - xyxy[:, 0::2] = np.clip(xyxy[:, 0::2], a_min=0, a_max=W) - xyxy[:, 1::2] = np.clip(xyxy[:, 1::2], a_min=0, a_max=H) - w = xyxy[:, 2:3] - xyxy[:, 0:1] - h = xyxy[:, 3:4] - xyxy[:, 1:2] - mask = np.logical_and(h > 0, w > 0) - keep_idx = np.nonzero(mask) - return xyxy[keep_idx[0]], keep_idx - - -def get_crops(xyxy, ori_img, w, h): - crops = [] - xyxy = xyxy.astype(np.int64) - ori_img = ori_img.numpy() - ori_img = np.squeeze(ori_img, axis=0).transpose(1, 0, 2) # [h,w,3]->[w,h,3] - for i, bbox in enumerate(xyxy): - crop = ori_img[bbox[0]:bbox[2], bbox[1]:bbox[3], :] - crops.append(crop) - crops = preprocess_reid(crops, w, h) - return crops - - -def preprocess_reid(imgs, - w=64, - h=192, - mean=[0.485, 0.456, 0.406], - std=[0.229, 0.224, 0.225]): - im_batch = [] - for img in imgs: - img = cv2.resize(img, (w, h)) - img = img[:, :, ::-1].astype('float32').transpose((2, 0, 1)) / 255 - img_mean = np.array(mean).reshape((3, 1, 1)) - img_std = np.array(std).reshape((3, 1, 1)) - img -= img_mean - img /= img_std - img = np.expand_dims(img, axis=0) - im_batch.append(img) - im_batch = np.concatenate(im_batch, 0) - return im_batch diff --git a/pdfdet/models/Paddle/ppdet/modeling/mot/visualization.py b/pdfdet/models/Paddle/ppdet/modeling/mot/visualization.py deleted file mode 100644 index 6d13a28..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/mot/visualization.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import cv2 -import numpy as np - - -def get_color(idx): - idx = idx * 3 - color = ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255) - return color - - -def plot_tracking(image, - tlwhs, - obj_ids, - scores=None, - frame_id=0, - fps=0., - ids2names=[]): - im = np.ascontiguousarray(np.copy(image)) - im_h, im_w = im.shape[:2] - - top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 - - text_scale = max(1, image.shape[1] / 1600.) - text_thickness = 2 - line_thickness = max(1, int(image.shape[1] / 500.)) - - radius = max(5, int(im_w / 140.)) - cv2.putText( - im, - 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), - (0, int(15 * text_scale)), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=2) - - for i, tlwh in enumerate(tlwhs): - x1, y1, w, h = tlwh - intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) - obj_id = int(obj_ids[i]) - id_text = '{}'.format(int(obj_id)) - if ids2names != []: - assert len( - ids2names) == 1, "plot_tracking only supports single classes." - id_text = '{}_'.format(ids2names[0]) + id_text - _line_thickness = 1 if obj_id <= 0 else line_thickness - color = get_color(abs(obj_id)) - cv2.rectangle( - im, intbox[0:2], intbox[2:4], color=color, thickness=line_thickness) - cv2.putText( - im, - id_text, (intbox[0], intbox[1] - 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=text_thickness) - - if scores is not None: - text = '{:.2f}'.format(float(scores[i])) - cv2.putText( - im, - text, (intbox[0], intbox[1] + 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 255, 255), - thickness=text_thickness) - return im - - -def plot_tracking_dict(image, - num_classes, - tlwhs_dict, - obj_ids_dict, - scores_dict, - frame_id=0, - fps=0., - ids2names=[]): - im = np.ascontiguousarray(np.copy(image)) - im_h, im_w = im.shape[:2] - - top_view = np.zeros([im_w, im_w, 3], dtype=np.uint8) + 255 - - text_scale = max(1, image.shape[1] / 1600.) 
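get_color above hashes a track id into a fixed BGR tuple, so a given track keeps the same color in every frame without any stored state:

    def get_color(idx):
        idx = idx * 3
        return ((37 * idx) % 255, (17 * idx) % 255, (29 * idx) % 255)

    # the same id always maps to the same BGR tuple
    print(get_color(1), get_color(2), get_color(1))
    # (111, 51, 87) (222, 102, 174) (111, 51, 87)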
- text_thickness = 2 - line_thickness = max(1, int(image.shape[1] / 500.)) - - radius = max(5, int(im_w / 140.)) - - for cls_id in range(num_classes): - tlwhs = tlwhs_dict[cls_id] - obj_ids = obj_ids_dict[cls_id] - scores = scores_dict[cls_id] - cv2.putText( - im, - 'frame: %d fps: %.2f num: %d' % (frame_id, fps, len(tlwhs)), - (0, int(15 * text_scale)), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=2) - - for i, tlwh in enumerate(tlwhs): - x1, y1, w, h = tlwh - intbox = tuple(map(int, (x1, y1, x1 + w, y1 + h))) - obj_id = int(obj_ids[i]) - - id_text = '{}'.format(int(obj_id)) - if ids2names != []: - id_text = '{}_{}'.format(ids2names[cls_id], id_text) - else: - id_text = 'class{}_{}'.format(cls_id, id_text) - - _line_thickness = 1 if obj_id <= 0 else line_thickness - color = get_color(abs(obj_id)) - cv2.rectangle( - im, - intbox[0:2], - intbox[2:4], - color=color, - thickness=line_thickness) - cv2.putText( - im, - id_text, (intbox[0], intbox[1] - 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 0, 255), - thickness=text_thickness) - - if scores is not None: - text = '{:.2f}'.format(float(scores[i])) - cv2.putText( - im, - text, (intbox[0], intbox[1] + 10), - cv2.FONT_HERSHEY_PLAIN, - text_scale, (0, 255, 255), - thickness=text_thickness) - return im diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/necks/__init__.py deleted file mode 100644 index afd2a95..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import fpn -from . import yolo_fpn -from . import hrfpn -from . import ttf_fpn -from . import centernet_fpn -from . import bifpn -from . import csp_pan -from . import es_pan -from . import lc_pan -from . import custom_pan -from . import dilated_encoder -from . import clrnet_fpn - -from .fpn import * -from .yolo_fpn import * -from .hrfpn import * -from .ttf_fpn import * -from .centernet_fpn import * -from .blazeface_fpn import * -from .bifpn import * -from .csp_pan import * -from .es_pan import * -from .lc_pan import * -from .custom_pan import * -from .dilated_encoder import * -from .channel_mapper import * -from .clrnet_fpn import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/bifpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/bifpn.py deleted file mode 100644 index 9e794b8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/bifpn.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ..shape_spec import ShapeSpec - -__all__ = ['BiFPN'] - - -class SeparableConvLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels=None, - kernel_size=3, - norm_type='bn', - norm_groups=32, - act='swish'): - super(SeparableConvLayer, self).__init__() - assert norm_type in ['bn', 'sync_bn', 'gn', None] - assert act in ['swish', 'relu', None] - - self.in_channels = in_channels - if out_channels is None: - self.out_channels = self.in_channels - self.norm_type = norm_type - self.norm_groups = norm_groups - self.depthwise_conv = nn.Conv2D( - in_channels, - in_channels, - kernel_size, - padding=kernel_size // 2, - groups=in_channels, - bias_attr=False) - self.pointwise_conv = nn.Conv2D(in_channels, self.out_channels, 1) - - # norm type - if self.norm_type in ['bn', 'sync_bn']: - self.norm = nn.BatchNorm2D(self.out_channels) - elif self.norm_type == 'gn': - self.norm = nn.GroupNorm( - num_groups=self.norm_groups, num_channels=self.out_channels) - - # activation - if act == 'swish': - self.act = nn.Swish() - elif act == 'relu': - self.act = nn.ReLU() - - def forward(self, x): - if self.act is not None: - x = self.act(x) - out = self.depthwise_conv(x) - out = self.pointwise_conv(out) - if self.norm_type is not None: - out = self.norm(out) - return out - - -class BiFPNCell(nn.Layer): - def __init__(self, - channels=256, - num_levels=5, - eps=1e-5, - use_weighted_fusion=True, - kernel_size=3, - norm_type='bn', - norm_groups=32, - act='swish'): - super(BiFPNCell, self).__init__() - self.channels = channels - self.num_levels = num_levels - self.eps = eps - self.use_weighted_fusion = use_weighted_fusion - - # up - self.conv_up = nn.LayerList([ - SeparableConvLayer( - self.channels, - kernel_size=kernel_size, - norm_type=norm_type, - norm_groups=norm_groups, - act=act) for _ in range(self.num_levels - 1) - ]) - # down - self.conv_down = nn.LayerList([ - SeparableConvLayer( - self.channels, - kernel_size=kernel_size, - norm_type=norm_type, - norm_groups=norm_groups, - act=act) for _ in range(self.num_levels - 1) - ]) - - if self.use_weighted_fusion: - self.up_weights = self.create_parameter( - shape=[self.num_levels - 1, 2], - attr=ParamAttr(initializer=Constant(1.))) - self.down_weights = self.create_parameter( - shape=[self.num_levels - 1, 3], - attr=ParamAttr(initializer=Constant(1.))) - - def _feature_fusion_cell(self, - conv_layer, - lateral_feat, - sampling_feat, - route_feat=None, - weights=None): - if self.use_weighted_fusion: - weights = F.relu(weights) - weights = weights / (weights.sum() + self.eps) - if route_feat is not None: - out_feat = weights[0] * lateral_feat + \ - weights[1] * sampling_feat + \ - weights[2] * route_feat - else: - out_feat = weights[0] * lateral_feat + \ - weights[1] * sampling_feat - else: - if route_feat is not None: - out_feat = lateral_feat + sampling_feat + route_feat - else: - out_feat 
= lateral_feat + sampling_feat - - out_feat = conv_layer(out_feat) - return out_feat - - def forward(self, feats): - # feats: [P3 - P7] - lateral_feats = [] - - # up - up_feature = feats[-1] - for i, feature in enumerate(feats[::-1]): - if i == 0: - lateral_feats.append(feature) - else: - shape = paddle.shape(feature) - up_feature = F.interpolate( - up_feature, size=[shape[2], shape[3]]) - lateral_feature = self._feature_fusion_cell( - self.conv_up[i - 1], - feature, - up_feature, - weights=self.up_weights[i - 1] - if self.use_weighted_fusion else None) - lateral_feats.append(lateral_feature) - up_feature = lateral_feature - - out_feats = [] - # down - down_feature = lateral_feats[-1] - for i, (lateral_feature, - route_feature) in enumerate(zip(lateral_feats[::-1], feats)): - if i == 0: - out_feats.append(lateral_feature) - else: - down_feature = F.max_pool2d(down_feature, 3, 2, 1) - if i == len(feats) - 1: - route_feature = None - weights = self.down_weights[ - i - 1][:2] if self.use_weighted_fusion else None - else: - weights = self.down_weights[ - i - 1] if self.use_weighted_fusion else None - out_feature = self._feature_fusion_cell( - self.conv_down[i - 1], - lateral_feature, - down_feature, - route_feature, - weights=weights) - out_feats.append(out_feature) - down_feature = out_feature - - return out_feats - - -@register -@serializable -class BiFPN(nn.Layer): - """ - Bidirectional Feature Pyramid Network, see https://arxiv.org/abs/1911.09070 - - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config. - out_channel (int): output channel of each level. - num_extra_levels (int): the number of extra stages added to the last level. - default: 2 - fpn_strides (List): The stride of each level. - num_stacks (int): the number of stacks for BiFPN, default: 1. - use_weighted_fusion (bool): use weighted feature fusion in BiFPN, default: True. - norm_type (string|None): the normalization type in BiFPN module. If - norm_type is None, norm will not be used after conv and if - norm_type is string, bn, gn, sync_bn are available. default: bn. - norm_groups (int): if you use gn, set this param. - act (string|None): the activation function of BiFPN. - """ - - def __init__(self, - in_channels=(512, 1024, 2048), - out_channel=256, - num_extra_levels=2, - fpn_strides=[8, 16, 32, 64, 128], - num_stacks=1, - use_weighted_fusion=True, - norm_type='bn', - norm_groups=32, - act='swish'): - super(BiFPN, self).__init__() - assert num_stacks > 0, "The number of stacks of BiFPN is at least 1." - assert norm_type in ['bn', 'sync_bn', 'gn', None] - assert act in ['swish', 'relu', None] - assert num_extra_levels >= 0, \ - "The `num_extra_levels` must be non negative(>=0)." 
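The _feature_fusion_cell above is EfficientDet-style "fast normalized fusion": the learnable per-input weights go through ReLU and are normalized to sum to roughly one (eps guards against division by zero), and the down path carries three weights because it also takes a route feature. A numpy sketch of the two-input case:

    import numpy as np

    def fuse(lateral, sampled, weights, eps=1e-5):
        w = np.maximum(weights, 0.0)      # ReLU keeps the weights non-negative
        w = w / (w.sum() + eps)           # normalize to roughly sum to 1
        return w[0] * lateral + w[1] * sampled

    lateral = np.ones((1, 4, 8, 8))       # output of the lateral 1x1 conv
    sampled = np.zeros((1, 4, 8, 8))      # resized neighbor-level feature
    out = fuse(lateral, sampled, np.array([2.0, 1.0]))
    print(out[0, 0, 0, 0])                # ~0.667, i.e. 2 / (2 + 1)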
- - self.in_channels = in_channels - self.out_channel = out_channel - self.num_extra_levels = num_extra_levels - self.num_stacks = num_stacks - self.use_weighted_fusion = use_weighted_fusion - self.norm_type = norm_type - self.norm_groups = norm_groups - self.act = act - self.num_levels = len(self.in_channels) + self.num_extra_levels - if len(fpn_strides) != self.num_levels: - for i in range(self.num_extra_levels): - fpn_strides += [fpn_strides[-1] * 2] - self.fpn_strides = fpn_strides - - self.lateral_convs = nn.LayerList() - for in_c in in_channels: - self.lateral_convs.append( - ConvNormLayer(in_c, self.out_channel, 1, 1)) - if self.num_extra_levels > 0: - self.extra_convs = nn.LayerList() - for i in range(self.num_extra_levels): - if i == 0: - self.extra_convs.append( - ConvNormLayer(self.in_channels[-1], self.out_channel, 3, - 2)) - else: - self.extra_convs.append(nn.MaxPool2D(3, 2, 1)) - - self.bifpn_cells = nn.LayerList() - for i in range(self.num_stacks): - self.bifpn_cells.append( - BiFPNCell( - self.out_channel, - self.num_levels, - use_weighted_fusion=self.use_weighted_fusion, - norm_type=self.norm_type, - norm_groups=self.norm_groups, - act=self.act)) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'fpn_strides': [i.stride for i in input_shape] - } - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=s) for s in self.fpn_strides - ] - - def forward(self, feats): - assert len(feats) == len(self.in_channels) - fpn_feats = [] - for conv_layer, feature in zip(self.lateral_convs, feats): - fpn_feats.append(conv_layer(feature)) - if self.num_extra_levels > 0: - feat = feats[-1] - for conv_layer in self.extra_convs: - feat = conv_layer(feat) - fpn_feats.append(feat) - - for bifpn_cell in self.bifpn_cells: - fpn_feats = bifpn_cell(fpn_feats) - return fpn_feats diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/blazeface_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/blazeface_fpn.py deleted file mode 100644 index b903c97..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/blazeface_fpn.py +++ /dev/null @@ -1,213 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn.functional as F -from paddle import ParamAttr -import paddle.nn as nn -from paddle.nn.initializer import KaimingNormal -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['BlazeNeck'] - - -def hard_swish(x): - return x * F.relu6(x + 3) / 6. 
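`hard_swish` above is the standard hard-swish activation, x * ReLU6(x + 3) / 6. Assuming a Paddle 2.x runtime, it should agree with the built-in `F.hardswish`; a quick numerical check:

```python
import paddle
import paddle.nn.functional as F

def hard_swish(x):
    return x * F.relu6(x + 3) / 6.

x = paddle.linspace(-6., 6., 25)
print(paddle.allclose(hard_swish(x), F.hardswish(x)))  # expected: True
```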
- - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride, - padding, - num_groups=1, - act='relu', - conv_lr=0.1, - conv_decay=0., - norm_decay=0., - norm_type='bn', - name=None): - super(ConvBNLayer, self).__init__() - self.act = act - self._conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=padding, - groups=num_groups, - weight_attr=ParamAttr( - learning_rate=conv_lr, initializer=KaimingNormal()), - bias_attr=False) - - if norm_type in ['sync_bn', 'bn']: - self._batch_norm = nn.BatchNorm2D(out_channels) - - def forward(self, x): - x = self._conv(x) - x = self._batch_norm(x) - if self.act == "relu": - x = F.relu(x) - elif self.act == "relu6": - x = F.relu6(x) - elif self.act == 'leaky': - x = F.leaky_relu(x) - elif self.act == 'hard_swish': - x = hard_swish(x) - return x - - -class FPN(nn.Layer): - def __init__(self, in_channels, out_channels, name=None): - super(FPN, self).__init__() - self.conv1_fpn = ConvBNLayer( - in_channels, - out_channels // 2, - kernel_size=1, - padding=0, - stride=1, - act='leaky', - name=name + '_output1') - self.conv2_fpn = ConvBNLayer( - in_channels, - out_channels // 2, - kernel_size=1, - padding=0, - stride=1, - act='leaky', - name=name + '_output2') - self.conv3_fpn = ConvBNLayer( - out_channels // 2, - out_channels // 2, - kernel_size=3, - padding=1, - stride=1, - act='leaky', - name=name + '_merge') - - def forward(self, input): - output1 = self.conv1_fpn(input[0]) - output2 = self.conv2_fpn(input[1]) - up2 = F.upsample( - output2, size=paddle.shape(output1)[-2:], mode='nearest') - output1 = paddle.add(output1, up2) - output1 = self.conv3_fpn(output1) - return output1, output2 - - -class SSH(nn.Layer): - def __init__(self, in_channels, out_channels, name=None): - super(SSH, self).__init__() - assert out_channels % 4 == 0 - self.conv0_ssh = ConvBNLayer( - in_channels, - out_channels // 2, - kernel_size=3, - padding=1, - stride=1, - act=None, - name=name + 'ssh_conv3') - self.conv1_ssh = ConvBNLayer( - out_channels // 2, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act='leaky', - name=name + 'ssh_conv5_1') - self.conv2_ssh = ConvBNLayer( - out_channels // 4, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act=None, - name=name + 'ssh_conv5_2') - self.conv3_ssh = ConvBNLayer( - out_channels // 4, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act='leaky', - name=name + 'ssh_conv7_1') - self.conv4_ssh = ConvBNLayer( - out_channels // 4, - out_channels // 4, - kernel_size=3, - padding=1, - stride=1, - act=None, - name=name + 'ssh_conv7_2') - - def forward(self, x): - conv0 = self.conv0_ssh(x) - conv1 = self.conv1_ssh(conv0) - conv2 = self.conv2_ssh(conv1) - conv3 = self.conv3_ssh(conv2) - conv4 = self.conv4_ssh(conv3) - concat = paddle.concat([conv0, conv2, conv4], axis=1) - return F.relu(concat) - - -@register -@serializable -class BlazeNeck(nn.Layer): - def __init__(self, in_channel, neck_type="None", data_format='NCHW'): - super(BlazeNeck, self).__init__() - self.neck_type = neck_type - self.reture_input = False - self._out_channels = in_channel - if self.neck_type == 'None': - self.reture_input = True - if "fpn" in self.neck_type: - self.fpn = FPN(self._out_channels[0], - self._out_channels[1], - name='fpn') - self._out_channels = [ - self._out_channels[0] // 2, self._out_channels[1] // 2 - ] - if "ssh" in self.neck_type: - self.ssh1 = SSH(self._out_channels[0], - self._out_channels[0], - 
name='ssh1') - self.ssh2 = SSH(self._out_channels[1], - self._out_channels[1], - name='ssh2') - self._out_channels = [self._out_channels[0], self._out_channels[1]] - - def forward(self, inputs): - if self.reture_input: - return inputs - output1, output2 = None, None - if "fpn" in self.neck_type: - backout_4, backout_1 = inputs - output1, output2 = self.fpn([backout_4, backout_1]) - if self.neck_type == "only_fpn": - return [output1, output2] - if self.neck_type == "only_ssh": - output1, output2 = inputs - feature1 = self.ssh1(output1) - feature2 = self.ssh2(output2) - return [feature1, feature2] - - @property - def out_shape(self): - return [ - ShapeSpec(channels=c) - for c in [self._out_channels[0], self._out_channels[1]] - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/centernet_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/centernet_fpn.py deleted file mode 100644 index d4dded8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/centernet_fpn.py +++ /dev/null @@ -1,426 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import math -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.nn.initializer import Uniform -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ppdet.modeling.backbones.hardnet import ConvLayer, HarDBlock -from ..shape_spec import ShapeSpec - -__all__ = ['CenterNetDLAFPN', 'CenterNetHarDNetFPN'] - - -# SGE attention -class BasicConv(nn.Layer): - def __init__(self, - in_planes, - out_planes, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - relu=True, - bn=True, - bias_attr=False): - super(BasicConv, self).__init__() - self.out_channels = out_planes - self.conv = nn.Conv2D( - in_planes, - out_planes, - kernel_size=kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - bias_attr=bias_attr) - self.bn = nn.BatchNorm2D( - out_planes, - epsilon=1e-5, - momentum=0.01, - weight_attr=False, - bias_attr=False) if bn else None - self.relu = nn.ReLU() if relu else None - - def forward(self, x): - x = self.conv(x) - if self.bn is not None: - x = self.bn(x) - if self.relu is not None: - x = self.relu(x) - return x - - -class ChannelPool(nn.Layer): - def forward(self, x): - return paddle.concat( - (paddle.max(x, 1).unsqueeze(1), paddle.mean(x, 1).unsqueeze(1)), - axis=1) - - -class SpatialGate(nn.Layer): - def __init__(self): - super(SpatialGate, self).__init__() - kernel_size = 7 - self.compress = ChannelPool() - self.spatial = BasicConv( - 2, - 1, - kernel_size, - stride=1, - padding=(kernel_size - 1) // 2, - relu=False) - - def forward(self, x): - x_compress = self.compress(x) - x_out = self.spatial(x_compress) - scale = F.sigmoid(x_out) # broadcasting - return x * scale - - -def fill_up_weights(up): - weight = up.weight.numpy() - f = math.ceil(weight.shape[2] / 2) - c = (2 * f - 1 - f % 2) / (2. 
* f) - for i in range(weight.shape[2]): - for j in range(weight.shape[3]): - weight[0, 0, i, j] = \ - (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) - for c in range(1, weight.shape[0]): - weight[c, 0, :, :] = weight[0, 0, :, :] - up.weight.set_value(weight) - - -class IDAUp(nn.Layer): - def __init__(self, ch_ins, ch_out, up_strides, dcn_v2=True): - super(IDAUp, self).__init__() - for i in range(1, len(ch_ins)): - ch_in = ch_ins[i] - up_s = int(up_strides[i]) - fan_in = ch_in * 3 * 3 - stdv = 1. / math.sqrt(fan_in) - proj = nn.Sequential( - ConvNormLayer( - ch_in, - ch_out, - filter_size=3, - stride=1, - use_dcn=dcn_v2, - bias_on=dcn_v2, - norm_decay=None, - dcn_lr_scale=1., - dcn_regularizer=None, - initializer=Uniform(-stdv, stdv)), - nn.ReLU()) - node = nn.Sequential( - ConvNormLayer( - ch_out, - ch_out, - filter_size=3, - stride=1, - use_dcn=dcn_v2, - bias_on=dcn_v2, - norm_decay=None, - dcn_lr_scale=1., - dcn_regularizer=None, - initializer=Uniform(-stdv, stdv)), - nn.ReLU()) - - kernel_size = up_s * 2 - fan_in = ch_out * kernel_size * kernel_size - stdv = 1. / math.sqrt(fan_in) - up = nn.Conv2DTranspose( - ch_out, - ch_out, - kernel_size=up_s * 2, - stride=up_s, - padding=up_s // 2, - groups=ch_out, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=False) - fill_up_weights(up) - setattr(self, 'proj_' + str(i), proj) - setattr(self, 'up_' + str(i), up) - setattr(self, 'node_' + str(i), node) - - def forward(self, inputs, start_level, end_level): - for i in range(start_level + 1, end_level): - upsample = getattr(self, 'up_' + str(i - start_level)) - project = getattr(self, 'proj_' + str(i - start_level)) - inputs[i] = project(inputs[i]) - inputs[i] = upsample(inputs[i]) - node = getattr(self, 'node_' + str(i - start_level)) - inputs[i] = node(paddle.add(inputs[i], inputs[i - 1])) - return inputs - - -class DLAUp(nn.Layer): - def __init__(self, start_level, channels, scales, ch_in=None, dcn_v2=True): - super(DLAUp, self).__init__() - self.start_level = start_level - if ch_in is None: - ch_in = channels - self.channels = channels - channels = list(channels) - scales = np.array(scales, dtype=int) - for i in range(len(channels) - 1): - j = -i - 2 - setattr( - self, - 'ida_{}'.format(i), - IDAUp( - ch_in[j:], - channels[j], - scales[j:] // scales[j], - dcn_v2=dcn_v2)) - scales[j + 1:] = scales[j] - ch_in[j + 1:] = [channels[j] for _ in channels[j + 1:]] - - def forward(self, inputs): - out = [inputs[-1]] # start with 32 - for i in range(len(inputs) - self.start_level - 1): - ida = getattr(self, 'ida_{}'.format(i)) - outputs = ida(inputs, len(inputs) - i - 2, len(inputs)) - out.insert(0, outputs[-1]) - return out - - -@register -@serializable -class CenterNetDLAFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone. - [16, 32, 64, 128, 256, 512] by default, means the channels of DLA-34 - down_ratio (int): the down ratio from images to heatmap, 4 by default - last_level (int): the last level of input feature fed into the upsamplng block - out_channel (int): the channel of the output feature, 0 by default means - the channel of the input feature whose down ratio is `down_ratio` - first_level (None): the first level of input feature fed into the upsamplng block. 
- if None, the first level stands for logs(down_ratio) - dcn_v2 (bool): whether use the DCNv2, True by default - with_sge (bool): whether use SGE attention, False by default - """ - - def __init__(self, - in_channels, - down_ratio=4, - last_level=5, - out_channel=0, - first_level=None, - dcn_v2=True, - with_sge=False): - super(CenterNetDLAFPN, self).__init__() - self.first_level = int(np.log2( - down_ratio)) if first_level is None else first_level - assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( - self.first_level) - self.down_ratio = down_ratio - self.last_level = last_level - scales = [2**i for i in range(len(in_channels[self.first_level:]))] - self.dla_up = DLAUp( - self.first_level, - in_channels[self.first_level:], - scales, - dcn_v2=dcn_v2) - self.out_channel = out_channel - if out_channel == 0: - self.out_channel = in_channels[self.first_level] - self.ida_up = IDAUp( - in_channels[self.first_level:self.last_level], - self.out_channel, - [2**i for i in range(self.last_level - self.first_level)], - dcn_v2=dcn_v2) - - self.with_sge = with_sge - if self.with_sge: - self.sge_attention = SpatialGate() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape]} - - def forward(self, body_feats): - - inputs = [body_feats[i] for i in range(len(body_feats))] - - dla_up_feats = self.dla_up(inputs) - - ida_up_feats = [] - for i in range(self.last_level - self.first_level): - ida_up_feats.append(dla_up_feats[i].clone()) - - self.ida_up(ida_up_feats, 0, len(ida_up_feats)) - - feat = ida_up_feats[-1] - if self.with_sge: - feat = self.sge_attention(feat) - if self.down_ratio != 4: - feat = F.interpolate( - feat, - scale_factor=self.down_ratio // 4, - mode="bilinear", - align_corners=True) - return feat - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] - - -class TransitionUp(nn.Layer): - def __init__(self, in_channels, out_channels): - super().__init__() - - def forward(self, x, skip): - w, h = skip.shape[2], skip.shape[3] - out = F.interpolate(x, size=(w, h), mode="bilinear", align_corners=True) - out = paddle.concat([out, skip], 1) - return out - - -@register -@serializable -class CenterNetHarDNetFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone. - [96, 214, 458, 784] by default, means the channels of HarDNet85 - num_layers (int): HarDNet laters, 85 by default - down_ratio (int): the down ratio from images to heatmap, 4 by default - first_level (int|None): the first level of input feature fed into the upsamplng block. 
- if None, the first level stands for logs(down_ratio) - 1 - - last_level (int): the last level of input feature fed into the upsamplng block - out_channel (int): the channel of the output feature, 0 by default means - the channel of the input feature whose down ratio is `down_ratio` - """ - - def __init__(self, - in_channels, - num_layers=85, - down_ratio=4, - first_level=None, - last_level=4, - out_channel=0): - super(CenterNetHarDNetFPN, self).__init__() - self.first_level = int(np.log2( - down_ratio)) - 1 if first_level is None else first_level - assert self.first_level >= 0, "first level in CenterNetDLAFPN should be greater or equal to 0, but received {}".format( - self.first_level) - self.down_ratio = down_ratio - self.last_level = last_level - self.last_pool = nn.AvgPool2D(kernel_size=2, stride=2) - - assert num_layers in [68, 85], "HarDNet-{} not support.".format( - num_layers) - if num_layers == 85: - self.last_proj = ConvLayer(784, 256, kernel_size=1) - self.last_blk = HarDBlock(768, 80, 1.7, 8) - self.skip_nodes = [1, 3, 8, 13] - self.SC = [32, 32, 0] - gr = [64, 48, 28] - layers = [8, 8, 4] - ch_list2 = [224 + self.SC[0], 160 + self.SC[1], 96 + self.SC[2]] - channels = [96, 214, 458, 784] - self.skip_lv = 3 - - elif num_layers == 68: - self.last_proj = ConvLayer(654, 192, kernel_size=1) - self.last_blk = HarDBlock(576, 72, 1.7, 8) - self.skip_nodes = [1, 3, 8, 11] - self.SC = [32, 32, 0] - gr = [48, 32, 20] - layers = [8, 8, 4] - ch_list2 = [224 + self.SC[0], 96 + self.SC[1], 64 + self.SC[2]] - channels = [64, 124, 328, 654] - self.skip_lv = 2 - - self.transUpBlocks = nn.LayerList([]) - self.denseBlocksUp = nn.LayerList([]) - self.conv1x1_up = nn.LayerList([]) - self.avg9x9 = nn.AvgPool2D(kernel_size=(9, 9), stride=1, padding=(4, 4)) - prev_ch = self.last_blk.get_out_ch() - - for i in range(3): - skip_ch = channels[3 - i] - self.transUpBlocks.append(TransitionUp(prev_ch, prev_ch)) - if i < self.skip_lv: - cur_ch = prev_ch + skip_ch - else: - cur_ch = prev_ch - self.conv1x1_up.append( - ConvLayer( - cur_ch, ch_list2[i], kernel_size=1)) - cur_ch = ch_list2[i] - cur_ch -= self.SC[i] - cur_ch *= 3 - - blk = HarDBlock(cur_ch, gr[i], 1.7, layers[i]) - self.denseBlocksUp.append(blk) - prev_ch = blk.get_out_ch() - - prev_ch += self.SC[0] + self.SC[1] + self.SC[2] - self.out_channel = prev_ch - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape]} - - def forward(self, body_feats): - x = body_feats[-1] - x_sc = [] - x = self.last_proj(x) - x = self.last_pool(x) - x2 = self.avg9x9(x) - x3 = x / (x.sum((2, 3), keepdim=True) + 0.1) - x = paddle.concat([x, x2, x3], 1) - x = self.last_blk(x) - - for i in range(3): - skip_x = body_feats[3 - i] - x_up = self.transUpBlocks[i](x, skip_x) - x_ch = self.conv1x1_up[i](x_up) - if self.SC[i] > 0: - end = x_ch.shape[1] - new_st = end - self.SC[i] - x_sc.append(x_ch[:, new_st:, :, :]) - x_ch = x_ch[:, :new_st, :, :] - x2 = self.avg9x9(x_ch) - x3 = x_ch / (x_ch.sum((2, 3), keepdim=True) + 0.1) - x_new = paddle.concat([x_ch, x2, x3], 1) - x = self.denseBlocksUp[i](x_new) - - scs = [x] - for i in range(3): - if self.SC[i] > 0: - scs.insert( - 0, - F.interpolate( - x_sc[i], - size=(x.shape[2], x.shape[3]), - mode="bilinear", - align_corners=True)) - neck_feat = paddle.concat(scs, 1) - return neck_feat - - @property - def out_shape(self): - return [ShapeSpec(channels=self.out_channel, stride=self.down_ratio)] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/channel_mapper.py 
b/pdfdet/models/Paddle/ppdet/modeling/necks/channel_mapper.py deleted file mode 100644 index 6eff3f8..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/channel_mapper.py +++ /dev/null @@ -1,122 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -this code is base on mmdet: git@github.com:open-mmlab/mmdetection.git -""" -import paddle.nn as nn - -from ppdet.core.workspace import register, serializable -from ..backbones.hrnet import ConvNormLayer -from ..shape_spec import ShapeSpec -from ..initializer import xavier_uniform_, constant_ - -__all__ = ['ChannelMapper'] - - -@register -@serializable -class ChannelMapper(nn.Layer): - """Channel Mapper to reduce/increase channels of backbone features. - - This is used to reduce/increase channels of backbone features. - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale). - kernel_size (int, optional): kernel_size for reducing channels (used - at each scale). Default: 3. - conv_cfg (dict, optional): Config dict for convolution layer. - Default: None. - norm_cfg (dict, optional): Config dict for normalization layer. - Default: None. - act_cfg (dict, optional): Config dict for activation layer in - ConvModule. Default: dict(type='ReLU'). - num_outs (int, optional): Number of output feature maps. There - would be extra_convs when num_outs larger than the length - of in_channels. - init_cfg (dict or list[dict], optional): Initialization config dict. - - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - norm_type="gn", - norm_groups=32, - act='relu', - num_outs=None, - init_cfg=dict( - type='Xavier', layer='Conv2d', distribution='uniform')): - super(ChannelMapper, self).__init__() - assert isinstance(in_channels, list) - self.extra_convs = None - if num_outs is None: - num_outs = len(in_channels) - self.convs = nn.LayerList() - for in_channel in in_channels: - self.convs.append( - ConvNormLayer( - ch_in=in_channel, - ch_out=out_channels, - filter_size=kernel_size, - norm_type='gn', - norm_groups=32, - act=act)) - - if num_outs > len(in_channels): - self.extra_convs = nn.LayerList() - for i in range(len(in_channels), num_outs): - if i == len(in_channels): - in_channel = in_channels[-1] - else: - in_channel = out_channels - self.extra_convs.append( - ConvNormLayer( - ch_in=in_channel, - ch_out=out_channels, - filter_size=3, - stride=2, - norm_type='gn', - norm_groups=32, - act=act)) - self.init_weights() - - def forward(self, inputs): - """Forward function.""" - assert len(inputs) == len(self.convs) - outs = [self.convs[i](inputs[i]) for i in range(len(inputs))] - if self.extra_convs: - for i in range(len(self.extra_convs)): - if i == 0: - outs.append(self.extra_convs[0](inputs[-1])) - else: - outs.append(self.extra_convs[i](outs[-1])) - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. 
/ s) - for s in self.spatial_scales - ] - - def init_weights(self): - """Initialize the transformer weights.""" - for p in self.parameters(): - if p.rank() > 1: - xavier_uniform_(p) - if hasattr(p, 'bias') and p.bias is not None: - constant_(p.bais) diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/clrnet_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/clrnet_fpn.py deleted file mode 100644 index 936c7e7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/clrnet_fpn.py +++ /dev/null @@ -1,254 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import XavierUniform -from ppdet.modeling.initializer import kaiming_normal_, constant_ -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ppdet.modeling.shape_spec import ShapeSpec - -__all__ = ['CLRFPN'] - - -@register -@serializable -class CLRFPN(nn.Layer): - """ - Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config - out_channel (int): output channel of each level - spatial_scales (list[float]): the spatial scales between input feature - maps and original input image which can be derived from the output - shape of backbone by from_config - has_extra_convs (bool): whether to add extra conv to the last level. - default False - extra_stage (int): the number of extra stages added to the last level. - default 1 - use_c5 (bool): Whether to use c5 as the input of extra stage, - otherwise p5 is used. default True - norm_type (string|None): The normalization type in FPN module. If - norm_type is None, norm will not be used after conv and if - norm_type is string, bn, gn, sync_bn are available. default None - norm_decay (float): weight decay for normalization layer weights. - default 0. - freeze_norm (bool): whether to freeze normalization layer. - default False - relu_before_extra_convs (bool): whether to add relu before extra convs. - default False - - """ - - def __init__(self, - in_channels, - out_channel, - spatial_scales=[0.25, 0.125, 0.0625, 0.03125], - has_extra_convs=False, - extra_stage=1, - use_c5=True, - norm_type=None, - norm_decay=0., - freeze_norm=False, - relu_before_extra_convs=True): - super(CLRFPN, self).__init__() - self.out_channel = out_channel - for s in range(extra_stage): - spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
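The deleted `ChannelMapper` carries two latent bugs visible above: `init_weights` spells `p.bais` (and since plain parameters have no `.bias` attribute, the `hasattr` guard means the misspelled call is simply never reached), and `out_shape` reads `self.out_channel` and `self.spatial_scales`, neither of which `__init__` ever sets, so accessing that property would raise `AttributeError`. A corrected initialization sketch in the same Xavier style, iterating over sublayers rather than raw parameters (an assumption about intent, not the original code):

```python
import paddle.nn as nn
from ppdet.modeling.initializer import xavier_uniform_, constant_

def init_weights(self):
    # Xavier-uniform for conv kernels, zeros for conv biases
    for m in self.sublayers():
        if isinstance(m, nn.Conv2D):
            xavier_uniform_(m.weight)
            if m.bias is not None:
                constant_(m.bias, value=0.)
```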
- self.spatial_scales = spatial_scales - self.has_extra_convs = has_extra_convs - self.extra_stage = extra_stage - self.use_c5 = use_c5 - self.relu_before_extra_convs = relu_before_extra_convs - self.norm_type = norm_type - self.norm_decay = norm_decay - self.freeze_norm = freeze_norm - self.in_channels = in_channels - self.lateral_convs = [] - self.fpn_convs = [] - fan = out_channel * 3 * 3 - - # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone - # 0 <= st_stage < ed_stage <= 3 - st_stage = 4 - len(in_channels) - ed_stage = st_stage + len(in_channels) - 1 - - for i in range(st_stage, ed_stage + 1): - # if i == 3: - # lateral_name = 'fpn_inner_res5_sum' - # else: - # lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) - lateral_name = "lateral_convs.{}.conv".format(i - 1) - in_c = in_channels[i - st_stage] - if self.norm_type is not None: - lateral = self.add_sublayer( - lateral_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=1, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=in_c))) - else: - lateral = self.add_sublayer( - lateral_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=in_c)))) - self.lateral_convs.append(lateral) - - fpn_name = "fpn_convs.{}.conv".format(i - 1) - if self.norm_type is not None: - fpn_conv = self.add_sublayer( - fpn_name, - ConvNormLayer( - ch_in=out_channel, - ch_out=out_channel, - filter_size=3, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - fpn_conv = self.add_sublayer( - fpn_name, - nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(fpn_conv) - - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - if self.has_extra_convs: - for i in range(self.extra_stage): - lvl = ed_stage + 1 + i - if i == 0 and self.use_c5: - in_c = in_channels[-1] - else: - in_c = out_channel - extra_fpn_name = 'fpn_{}'.format(lvl + 2) - if self.norm_type is not None: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=3, - stride=2, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=3, - stride=2, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(extra_fpn_conv) - self.init_weights() - - def init_weights(self): - for m in self.lateral_convs: - if isinstance(m, (nn.Conv1D, nn.Conv2D)): - kaiming_normal_( - m.weight, a=0, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - constant_(m.bias, value=0.) - elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): - constant_(m.weight, value=1) - constant_(m.bias, value=0) - for m in self.fpn_convs: - if isinstance(m, (nn.Conv1D, nn.Conv2D)): - kaiming_normal_( - m.weight, a=0, mode='fan_out', nonlinearity='relu') - if m.bias is not None: - constant_(m.bias, value=0.) 
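`CLRFPN` can grow the pyramid beyond the backbone levels in two ways that both appear in this file: learned stride-2 convs when `has_extra_convs=True`, or a parameter-free max-pool in `forward` otherwise. Side by side, with illustrative shapes:

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

p5 = paddle.rand([1, 256, 16, 16])

# parameter-free extra level (Faster/Mask R-CNN convention)
p6_pool = F.max_pool2d(p5, 1, stride=2)                    # [1, 256, 8, 8]

# learned extra level (RetinaNet/FCOS convention)
p6_conv = nn.Conv2D(256, 256, 3, stride=2, padding=1)(p5)  # [1, 256, 8, 8]
```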
- elif isinstance(m, (nn.BatchNorm1D, nn.BatchNorm2D)): - constant_(m.weight, value=1) - constant_(m.bias, value=0) - - @classmethod - def from_config(cls, cfg, input_shape): - return {} - - def forward(self, body_feats): - laterals = [] - if len(body_feats) > len(self.in_channels): - for _ in range(len(body_feats) - len(self.in_channels)): - del body_feats[0] - num_levels = len(body_feats) - # print("body_feats",num_levels) - for i in range(num_levels): - laterals.append(self.lateral_convs[i](body_feats[i])) - - for i in range(1, num_levels): - lvl = num_levels - i - upsample = F.interpolate( - laterals[lvl], - scale_factor=2., - mode='nearest', ) - laterals[lvl - 1] += upsample - - fpn_output = [] - for lvl in range(num_levels): - fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) - - if self.extra_stage > 0: - # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) - if not self.has_extra_convs: - assert self.extra_stage == 1, 'extra_stage should be 1 if FPN has not extra convs' - fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - else: - if self.use_c5: - extra_source = body_feats[-1] - else: - extra_source = fpn_output[-1] - fpn_output.append(self.fpn_convs[num_levels](extra_source)) - - for i in range(1, self.extra_stage): - if self.relu_before_extra_convs: - fpn_output.append(self.fpn_convs[num_levels + i](F.relu( - fpn_output[-1]))) - else: - fpn_output.append(self.fpn_convs[num_levels + i]( - fpn_output[-1])) - return fpn_output - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. / s) - for s in self.spatial_scales - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/csp_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/csp_pan.py deleted file mode 100644 index 5c3539a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/csp_pan.py +++ /dev/null @@ -1,363 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
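The top-down pass in `CLRFPN.forward` above is the classic FPN recurrence: upsample the coarser lateral by 2x, add it to the next finer one, and smooth each sum with a 3x3 conv. Reduced to its core, assuming all laterals already share one channel count and with the smoothing convs elided:

```python
import paddle
import paddle.nn.functional as F

def top_down_merge(laterals):
    # laterals ordered fine -> coarse, same channel count everywhere
    for lvl in range(len(laterals) - 1, 0, -1):
        up = F.interpolate(laterals[lvl], scale_factor=2., mode='nearest')
        laterals[lvl - 1] = laterals[lvl - 1] + up
    return laterals

feats = [paddle.rand([1, 64, 32 // 2**i, 32 // 2**i]) for i in range(3)]
merged = top_down_merge(feats)  # each level now mixes in every coarser level
```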
- -# The code is based on: -# https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/necks/yolox_pafpn.py - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['CSPPAN'] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - in_channel=96, - out_channel=96, - kernel_size=3, - stride=1, - groups=1, - act='leaky_relu'): - super(ConvBNLayer, self).__init__() - initializer = nn.initializer.KaimingUniform() - self.conv = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=kernel_size, - groups=groups, - padding=(kernel_size - 1) // 2, - stride=stride, - weight_attr=ParamAttr(initializer=initializer), - bias_attr=False) - self.bn = nn.BatchNorm2D(out_channel) - if act == "hard_swish": - act = 'hardswish' - self.act = act - - def forward(self, x): - x = self.bn(self.conv(x)) - if self.act: - x = getattr(F, self.act)(x) - return x - - -class DPModule(nn.Layer): - """ - Depth-wise and point-wise module. - Args: - in_channel (int): The input channels of this Module. - out_channel (int): The output channels of this Module. - kernel_size (int): The conv2d kernel size of this Module. - stride (int): The conv2d's stride of this Module. - act (str): The activation function of this Module, - Now support `leaky_relu` and `hard_swish`. - """ - - def __init__(self, - in_channel=96, - out_channel=96, - kernel_size=3, - stride=1, - act='leaky_relu', - use_act_in_out=True): - super(DPModule, self).__init__() - initializer = nn.initializer.KaimingUniform() - self.use_act_in_out = use_act_in_out - self.dwconv = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=kernel_size, - groups=out_channel, - padding=(kernel_size - 1) // 2, - stride=stride, - weight_attr=ParamAttr(initializer=initializer), - bias_attr=False) - self.bn1 = nn.BatchNorm2D(out_channel) - self.pwconv = nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=1, - groups=1, - padding=0, - weight_attr=ParamAttr(initializer=initializer), - bias_attr=False) - self.bn2 = nn.BatchNorm2D(out_channel) - if act == "hard_swish": - act = 'hardswish' - self.act = act - - def forward(self, x): - x = self.bn1(self.dwconv(x)) - if self.act: - x = getattr(F, self.act)(x) - x = self.bn2(self.pwconv(x)) - if self.use_act_in_out and self.act: - x = getattr(F, self.act)(x) - return x - - -class DarknetBottleneck(nn.Layer): - """The basic bottleneck block used in Darknet. - - Each Block consists of two ConvModules and the input is added to the - final output. Each ConvModule is composed of Conv, BN, and act. - The first convLayer has filter size of 1x1 and the second one has the - filter size of 3x3. - - Args: - in_channels (int): The input channels of this Module. - out_channels (int): The output channels of this Module. - expansion (int): The kernel size of the convolution. Default: 0.5 - add_identity (bool): Whether to add identity to the out. - Default: True - use_depthwise (bool): Whether to use depthwise separable convolution. 
- Default: False - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - expansion=0.5, - add_identity=True, - use_depthwise=False, - act="leaky_relu"): - super(DarknetBottleneck, self).__init__() - hidden_channels = int(out_channels * expansion) - conv_func = DPModule if use_depthwise else ConvBNLayer - self.conv1 = ConvBNLayer( - in_channel=in_channels, - out_channel=hidden_channels, - kernel_size=1, - act=act) - self.conv2 = conv_func( - in_channel=hidden_channels, - out_channel=out_channels, - kernel_size=kernel_size, - stride=1, - act=act) - self.add_identity = \ - add_identity and in_channels == out_channels - - def forward(self, x): - identity = x - out = self.conv1(x) - out = self.conv2(out) - - if self.add_identity: - return out + identity - else: - return out - - -class CSPLayer(nn.Layer): - """Cross Stage Partial Layer. - - Args: - in_channels (int): The input channels of the CSP layer. - out_channels (int): The output channels of the CSP layer. - expand_ratio (float): Ratio to adjust the number of channels of the - hidden layer. Default: 0.5 - num_blocks (int): Number of blocks. Default: 1 - add_identity (bool): Whether to add identity in blocks. - Default: True - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. Default: False - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - expand_ratio=0.5, - num_blocks=1, - add_identity=True, - use_depthwise=False, - act="leaky_relu"): - super().__init__() - mid_channels = int(out_channels * expand_ratio) - self.main_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) - self.short_conv = ConvBNLayer(in_channels, mid_channels, 1, act=act) - self.final_conv = ConvBNLayer( - 2 * mid_channels, out_channels, 1, act=act) - - self.blocks = nn.Sequential(* [ - DarknetBottleneck( - mid_channels, - mid_channels, - kernel_size, - 1.0, - add_identity, - use_depthwise, - act=act) for _ in range(num_blocks) - ]) - - def forward(self, x): - x_short = self.short_conv(x) - - x_main = self.main_conv(x) - x_main = self.blocks(x_main) - - x_final = paddle.concat((x_main, x_short), axis=1) - return self.final_conv(x_final) - - -class Channel_T(nn.Layer): - def __init__(self, - in_channels=[116, 232, 464], - out_channels=96, - act="leaky_relu"): - super(Channel_T, self).__init__() - self.convs = nn.LayerList() - for i in range(len(in_channels)): - self.convs.append( - ConvBNLayer( - in_channels[i], out_channels, 1, act=act)) - - def forward(self, x): - outs = [self.convs[i](x[i]) for i in range(len(x))] - return outs - - -@register -@serializable -class CSPPAN(nn.Layer): - """Path Aggregation Network with CSP module. - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - kernel_size (int): The conv2d kernel size of this Module. - num_features (int): Number of output features of CSPPAN module. - num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. 
Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=5, - num_features=3, - num_csp_blocks=1, - use_depthwise=True, - act='hard_swish', - spatial_scales=[0.125, 0.0625, 0.03125]): - super(CSPPAN, self).__init__() - self.conv_t = Channel_T(in_channels, out_channels, act=act) - in_channels = [out_channels] * len(spatial_scales) - self.in_channels = in_channels - self.out_channels = out_channels - self.spatial_scales = spatial_scales - self.num_features = num_features - conv_func = DPModule if use_depthwise else ConvBNLayer - - if self.num_features == 4: - self.first_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.second_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.spatial_scales.append(self.spatial_scales[-1] / 2) - - # build top-down blocks - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.top_down_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.top_down_blocks.append( - CSPLayer( - in_channels[idx - 1] * 2, - in_channels[idx - 1], - kernel_size=kernel_size, - num_blocks=num_csp_blocks, - add_identity=False, - use_depthwise=use_depthwise, - act=act)) - - # build bottom-up blocks - self.downsamples = nn.LayerList() - self.bottom_up_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsamples.append( - conv_func( - in_channels[idx], - in_channels[idx], - kernel_size=kernel_size, - stride=2, - act=act)) - self.bottom_up_blocks.append( - CSPLayer( - in_channels[idx] * 2, - in_channels[idx + 1], - kernel_size=kernel_size, - num_blocks=num_csp_blocks, - add_identity=False, - use_depthwise=use_depthwise, - act=act)) - - def forward(self, inputs): - """ - Args: - inputs (tuple[Tensor]): input features. - - Returns: - tuple[Tensor]: CSPPAN features. - """ - assert len(inputs) == len(self.in_channels) - inputs = self.conv_t(inputs) - - # top-down path - inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] - - upsample_feat = self.upsample(feat_heigh) - - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat([upsample_feat, feat_low], 1)) - inner_outs.insert(0, inner_out) - - # bottom-up path - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx](paddle.concat( - [downsample_feat, feat_height], 1)) - outs.append(out) - - top_features = None - if self.num_features == 4: - top_features = self.first_top_conv(inputs[-1]) - top_features = top_features + self.second_top_conv(outs[-1]) - outs.append(top_features) - - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channels, stride=1. / s) - for s in self.spatial_scales - ] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/custom_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/custom_pan.py deleted file mode 100644 index cf7ec84..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/custom_pan.py +++ /dev/null @@ -1,398 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import math -import copy -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import DropBlock, MultiHeadAttention -from ppdet.modeling.ops import get_act_fn -from ..backbones.cspresnet import ConvBNLayer, BasicBlock -from ..shape_spec import ShapeSpec -from ..initializer import linear_init_ - -__all__ = ['CustomCSPPAN'] - - -def _get_clones(module, N): - return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) - - -class SPP(nn.Layer): - def __init__(self, - ch_in, - ch_out, - k, - pool_size, - act='swish', - data_format='NCHW'): - super(SPP, self).__init__() - self.pool = [] - self.data_format = data_format - for i, size in enumerate(pool_size): - pool = self.add_sublayer( - 'pool{}'.format(i), - nn.MaxPool2D( - kernel_size=size, - stride=1, - padding=size // 2, - data_format=data_format, - ceil_mode=False)) - self.pool.append(pool) - self.conv = ConvBNLayer(ch_in, ch_out, k, padding=k // 2, act=act) - - def forward(self, x): - outs = [x] - for pool in self.pool: - outs.append(pool(x)) - if self.data_format == 'NCHW': - y = paddle.concat(outs, axis=1) - else: - y = paddle.concat(outs, axis=-1) - - y = self.conv(y) - return y - - -class CSPStage(nn.Layer): - def __init__(self, - block_fn, - ch_in, - ch_out, - n, - act='swish', - spp=False, - use_alpha=False): - super(CSPStage, self).__init__() - - ch_mid = int(ch_out // 2) - self.conv1 = ConvBNLayer(ch_in, ch_mid, 1, act=act) - self.conv2 = ConvBNLayer(ch_in, ch_mid, 1, act=act) - self.convs = nn.Sequential() - next_ch_in = ch_mid - for i in range(n): - self.convs.add_sublayer( - str(i), - eval(block_fn)(next_ch_in, - ch_mid, - act=act, - shortcut=False, - use_alpha=use_alpha)) - if i == (n - 1) // 2 and spp: - self.convs.add_sublayer( - 'spp', SPP(ch_mid * 4, ch_mid, 1, [5, 9, 13], act=act)) - next_ch_in = ch_mid - self.conv3 = ConvBNLayer(ch_mid * 2, ch_out, 1, act=act) - - def forward(self, x): - y1 = self.conv1(x) - y2 = self.conv2(x) - y2 = self.convs(y2) - y = paddle.concat([y1, y2], axis=1) - y = self.conv3(y) - return y - - -class TransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerEncoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, 
mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -class TransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward(self, src, src_mask=None, pos_embed=None): - output = src - for layer in self.layers: - output = layer(output, src_mask=src_mask, pos_embed=pos_embed) - - if self.norm is not None: - output = self.norm(output) - - return output - - -@register -@serializable -class CustomCSPPAN(nn.Layer): - __shared__ = [ - 'norm_type', 'data_format', 'width_mult', 'depth_mult', 'trt', - 'eval_size' - ] - - def __init__(self, - in_channels=[256, 512, 1024], - out_channels=[1024, 512, 256], - norm_type='bn', - act='leaky', - stage_fn='CSPStage', - block_fn='BasicBlock', - stage_num=1, - block_num=3, - drop_block=False, - block_size=3, - keep_prob=0.9, - spp=False, - data_format='NCHW', - width_mult=1.0, - depth_mult=1.0, - use_alpha=False, - trt=False, - dim_feedforward=2048, - dropout=0.1, - activation='gelu', - nhead=4, - num_layers=4, - attn_dropout=None, - act_dropout=None, - normalize_before=False, - use_trans=False, - eval_size=None): - - super(CustomCSPPAN, self).__init__() - out_channels = [max(round(c * width_mult), 1) for c in out_channels] - block_num = max(round(block_num * depth_mult), 1) - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.num_blocks = len(in_channels) - self.data_format = data_format - self._out_channels = out_channels - - self.hidden_dim = in_channels[-1] - in_channels = in_channels[::-1] - - self.use_trans = use_trans - self.eval_size = eval_size - if use_trans: - if eval_size is not None: - self.pos_embed = self.build_2d_sincos_position_embedding( - eval_size[1] // 32, - eval_size[0] // 32, - embed_dim=self.hidden_dim) - else: - self.pos_embed = None - - encoder_layer = TransformerEncoderLayer( - self.hidden_dim, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - encoder_norm = nn.LayerNorm( - self.hidden_dim) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_layers, - encoder_norm) - - fpn_stages = [] - fpn_routes = [] - for i, (ch_in, ch_out) in enumerate(zip(in_channels, out_channels)): - if i > 0: - ch_in += ch_pre // 2 - - stage = nn.Sequential() - for j in range(stage_num): - stage.add_sublayer( - str(j), - eval(stage_fn)(block_fn, - ch_in if j == 0 else ch_out, - ch_out, - block_num, - act=act, - spp=(spp and i == 0), - use_alpha=use_alpha)) - - if 
drop_block: - stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) - - fpn_stages.append(stage) - - if i < self.num_blocks - 1: - fpn_routes.append( - ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out // 2, - filter_size=1, - stride=1, - padding=0, - act=act)) - - ch_pre = ch_out - - self.fpn_stages = nn.LayerList(fpn_stages) - self.fpn_routes = nn.LayerList(fpn_routes) - - pan_stages = [] - pan_routes = [] - for i in reversed(range(self.num_blocks - 1)): - pan_routes.append( - ConvBNLayer( - ch_in=out_channels[i + 1], - ch_out=out_channels[i + 1], - filter_size=3, - stride=2, - padding=1, - act=act)) - - ch_in = out_channels[i] + out_channels[i + 1] - ch_out = out_channels[i] - stage = nn.Sequential() - for j in range(stage_num): - stage.add_sublayer( - str(j), - eval(stage_fn)(block_fn, - ch_in if j == 0 else ch_out, - ch_out, - block_num, - act=act, - spp=False, - use_alpha=use_alpha)) - if drop_block: - stage.add_sublayer('drop', DropBlock(block_size, keep_prob)) - - pan_stages.append(stage) - - self.pan_stages = nn.LayerList(pan_stages[::-1]) - self.pan_routes = nn.LayerList(pan_routes[::-1]) - - def build_2d_sincos_position_embedding( - self, - w, - h, - embed_dim=1024, - temperature=10000., ): - grid_w = paddle.arange(int(w), dtype=paddle.float32) - grid_h = paddle.arange(int(h), dtype=paddle.float32) - grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) - assert embed_dim % 4 == 0, 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = 1. / (temperature**omega) - - out_w = grid_w.flatten()[..., None] @omega[None] - out_h = grid_h.flatten()[..., None] @omega[None] - - pos_emb = paddle.concat( - [ - paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), - paddle.cos(out_h) - ], - axis=1)[None, :, :] - - return pos_emb - - def forward(self, blocks, for_mot=False): - if self.use_trans: - last_feat = blocks[-1] - n, c, h, w = last_feat.shape - - # flatten [B, C, H, W] to [B, HxW, C] - src_flatten = last_feat.flatten(2).transpose([0, 2, 1]) - if self.eval_size is not None and not self.training: - pos_embed = self.pos_embed - else: - pos_embed = self.build_2d_sincos_position_embedding( - w=w, h=h, embed_dim=self.hidden_dim) - - memory = self.encoder(src_flatten, pos_embed=pos_embed) - last_feat_encode = memory.transpose([0, 2, 1]).reshape([n, c, h, w]) - blocks[-1] = last_feat_encode - - blocks = blocks[::-1] - fpn_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - block = paddle.concat([route, block], axis=1) - route = self.fpn_stages[i](block) - fpn_feats.append(route) - - if i < self.num_blocks - 1: - route = self.fpn_routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - pan_feats = [fpn_feats[-1], ] - route = fpn_feats[-1] - for i in reversed(range(self.num_blocks - 1)): - block = fpn_feats[i] - route = self.pan_routes[i](route) - block = paddle.concat([route, block], axis=1) - route = self.pan_stages[i](block) - pan_feats.append(route) - - return pan_feats[::-1] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/dilated_encoder.py b/pdfdet/models/Paddle/ppdet/modeling/necks/dilated_encoder.py deleted file mode 100644 index 0bbc7fd..0000000 --- 
a/pdfdet/models/Paddle/ppdet/modeling/necks/dilated_encoder.py +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingUniform, Constant, Normal -from ppdet.core.workspace import register, serializable -from ..shape_spec import ShapeSpec - -__all__ = ['DilatedEncoder'] - - -class Bottleneck(nn.Layer): - def __init__(self, in_channels, mid_channels, dilation): - super(Bottleneck, self).__init__() - self.conv1 = nn.Sequential(* [ - nn.Conv2D( - in_channels, - mid_channels, - 1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(0.0))), - nn.BatchNorm2D( - mid_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))), - nn.ReLU(), - ]) - self.conv2 = nn.Sequential(* [ - nn.Conv2D( - mid_channels, - mid_channels, - 3, - padding=dilation, - dilation=dilation, - weight_attr=ParamAttr(initializer=Normal( - mean=0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(0.0))), - nn.BatchNorm2D( - mid_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))), - nn.ReLU(), - ]) - self.conv3 = nn.Sequential(* [ - nn.Conv2D( - mid_channels, - in_channels, - 1, - padding=0, - weight_attr=ParamAttr(initializer=Normal( - mean=0, std=0.01)), - bias_attr=ParamAttr(initializer=Constant(0.0))), - nn.BatchNorm2D( - in_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))), - nn.ReLU(), - ]) - - def forward(self, x): - identity = x - y = self.conv3(self.conv2(self.conv1(x))) - return y + identity - - -@register -class DilatedEncoder(nn.Layer): - """ - DilatedEncoder used in YOLOF - """ - - def __init__(self, - in_channels=[2048], - out_channels=[512], - block_mid_channels=128, - num_residual_blocks=4, - block_dilations=[2, 4, 6, 8]): - super(DilatedEncoder, self).__init__() - self.in_channels = in_channels - self.out_channels = out_channels - assert len(self.in_channels) == 1, "YOLOF only has one level feature." - assert len(self.out_channels) == 1, "YOLOF only has one level feature." 
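`build_2d_sincos_position_embedding` in `CustomCSPPAN` above is the usual 2-D extension of the transformer sinusoidal embedding: a quarter of the channels each go to sin and cos of the x and y coordinates at geometrically spaced frequencies. A standalone re-derivation under the same conventions (the grid size and `embed_dim` here are illustrative):

```python
import paddle

def sincos_pos_embed_2d(w, h, embed_dim=256, temperature=10000.):
    assert embed_dim % 4 == 0, 'embed_dim must be divisible by 4'
    gw = paddle.arange(w, dtype='float32')
    gh = paddle.arange(h, dtype='float32')
    gw, gh = paddle.meshgrid(gw, gh)
    pos_dim = embed_dim // 4
    omega = paddle.arange(pos_dim, dtype='float32') / pos_dim
    omega = 1. / (temperature ** omega)   # geometric frequency ladder
    out_w = gw.flatten()[:, None] * omega[None, :]
    out_h = gh.flatten()[:, None] * omega[None, :]
    return paddle.concat(
        [paddle.sin(out_w), paddle.cos(out_w),
         paddle.sin(out_h), paddle.cos(out_h)], axis=1)[None]  # [1, w*h, dim]

emb = sincos_pos_embed_2d(20, 20, 256)  # one embedding row per spatial position
```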
- - self.block_mid_channels = block_mid_channels - self.num_residual_blocks = num_residual_blocks - self.block_dilations = block_dilations - - out_ch = self.out_channels[0] - self.lateral_conv = nn.Conv2D( - self.in_channels[0], - out_ch, - 1, - weight_attr=ParamAttr(initializer=KaimingUniform( - negative_slope=1, nonlinearity='leaky_relu')), - bias_attr=ParamAttr(initializer=Constant(value=0.0))) - self.lateral_norm = nn.BatchNorm2D( - out_ch, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - self.fpn_conv = nn.Conv2D( - out_ch, - out_ch, - 3, - padding=1, - weight_attr=ParamAttr(initializer=KaimingUniform( - negative_slope=1, nonlinearity='leaky_relu'))) - self.fpn_norm = nn.BatchNorm2D( - out_ch, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - encoder_blocks = [] - for i in range(self.num_residual_blocks): - encoder_blocks.append( - Bottleneck( - out_ch, - self.block_mid_channels, - dilation=block_dilations[i])) - self.dilated_encoder_blocks = nn.Sequential(*encoder_blocks) - - def forward(self, inputs, for_mot=False): - out = self.lateral_norm(self.lateral_conv(inputs[0])) - out = self.fpn_norm(self.fpn_conv(out)) - out = self.dilated_encoder_blocks(out) - return [out] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self.out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/es_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/es_pan.py deleted file mode 100644 index bc24877..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/es_pan.py +++ /dev/null @@ -1,212 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
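The deleted `DilatedEncoder` (YOLOF's single-level neck) follows a fixed recipe: a 1x1 lateral conv shrinks C5's channels, a 3x3 conv refines it, and four residual bottlenecks with dilations 2, 4, 6 and 8 widen the receptive field without adding pyramid levels. A shape-level walk-through with the defaults above (BatchNorm layers omitted for brevity):

```python
import paddle
import paddle.nn as nn

def bottleneck(ch, mid, dilation):
    # 1x1 reduce -> dilated 3x3 -> 1x1 restore, as in the Bottleneck above
    return nn.Sequential(
        nn.Conv2D(ch, mid, 1), nn.ReLU(),
        nn.Conv2D(mid, mid, 3, padding=dilation, dilation=dilation), nn.ReLU(),
        nn.Conv2D(mid, ch, 1), nn.ReLU())

c5 = paddle.rand([1, 2048, 20, 20])
x = nn.Conv2D(2048, 512, 1)(c5)           # lateral projection
x = nn.Conv2D(512, 512, 3, padding=1)(x)  # fpn conv
for d in (2, 4, 6, 8):                    # block_dilations
    x = x + bottleneck(512, 128, d)(x)    # residual keeps the spatial size
print(x.shape)                            # [1, 512, 20, 20]
```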
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable - -from ..shape_spec import ShapeSpec -from ..backbones.esnet import SEModule -from .csp_pan import ConvBNLayer, Channel_T, DPModule - -__all__ = ['ESPAN'] - - -class ES_Block(nn.Layer): - def __init__(self, - in_channels, - mid_channels, - out_channels, - kernel_size=5, - stride=1, - act='leaky_relu'): - super(ES_Block, self).__init__() - self._residual = ConvBNLayer( - in_channel=in_channels, - out_channel=out_channels, - kernel_size=1, - stride=1, - groups=1, - act=act) - self._conv_pw = ConvBNLayer( - in_channel=in_channels, - out_channel=mid_channels // 2, - kernel_size=1, - stride=1, - groups=1, - act=act) - self._conv_dw = ConvBNLayer( - in_channel=mid_channels // 2, - out_channel=mid_channels // 2, - kernel_size=kernel_size, - stride=stride, - groups=mid_channels // 2, - act=None) - self._se = SEModule(mid_channels) - - self._conv_linear = ConvBNLayer( - in_channel=mid_channels, - out_channel=out_channels, - kernel_size=1, - stride=1, - groups=1, - act=act) - - self._out_conv = ConvBNLayer( - in_channel=out_channels * 2, - out_channel=out_channels, - kernel_size=1, - stride=1, - groups=1, - act=act) - - def forward(self, inputs): - x1 = self._residual(inputs) - x2 = self._conv_pw(inputs) - x3 = self._conv_dw(x2) - x3 = paddle.concat([x2, x3], axis=1) - x3 = self._se(x3) - x3 = self._conv_linear(x3) - out = paddle.concat([x1, x3], axis=1) - out = self._out_conv(out) - return out - - -@register -@serializable -class ESPAN(nn.Layer): - """Path Aggregation Network with ES module. - - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - kernel_size (int): The conv2d kernel size of this Module. - num_features (int): Number of output features of CSPPAN module. - num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. 
Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=5, - num_features=3, - use_depthwise=True, - act='hard_swish', - spatial_scales=[0.125, 0.0625, 0.03125]): - super(ESPAN, self).__init__() - self.conv_t = Channel_T(in_channels, out_channels, act=act) - in_channels = [out_channels] * len(spatial_scales) - self.in_channels = in_channels - self.out_channels = out_channels - self.spatial_scales = spatial_scales - self.num_features = num_features - conv_func = DPModule if use_depthwise else ConvBNLayer - - if self.num_features == 4: - self.first_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.second_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.spatial_scales.append(self.spatial_scales[-1] / 2) - - # build top-down blocks - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.top_down_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.top_down_blocks.append( - ES_Block( - in_channels[idx - 1] * 2, - in_channels[idx - 1], - in_channels[idx - 1], - kernel_size=kernel_size, - stride=1, - act=act)) - - # build bottom-up blocks - self.downsamples = nn.LayerList() - self.bottom_up_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsamples.append( - conv_func( - in_channels[idx], - in_channels[idx], - kernel_size=kernel_size, - stride=2, - act=act)) - self.bottom_up_blocks.append( - ES_Block( - in_channels[idx] * 2, - in_channels[idx + 1], - in_channels[idx + 1], - kernel_size=kernel_size, - stride=1, - act=act)) - - def forward(self, inputs): - """ - Args: - inputs (tuple[Tensor]): input features. - - Returns: - tuple[Tensor]: CSPPAN features. - """ - assert len(inputs) == len(self.in_channels) - inputs = self.conv_t(inputs) - - # top-down path - inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] - - upsample_feat = self.upsample(feat_heigh) - - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat([upsample_feat, feat_low], 1)) - inner_outs.insert(0, inner_out) - - # bottom-up path - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx](paddle.concat( - [downsample_feat, feat_height], 1)) - outs.append(out) - - top_features = None - if self.num_features == 4: - top_features = self.first_top_conv(inputs[-1]) - top_features = top_features + self.second_top_conv(outs[-1]) - outs.append(top_features) - - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channels, stride=1. / s) - for s in self.spatial_scales - ] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/fpn.py deleted file mode 100644 index d08ca41..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/fpn.py +++ /dev/null @@ -1,231 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import XavierUniform - -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import ConvNormLayer -from ..shape_spec import ShapeSpec - -__all__ = ['FPN'] - - -@register -@serializable -class FPN(nn.Layer): - """ - Feature Pyramid Network, see https://arxiv.org/abs/1612.03144 - - Args: - in_channels (list[int]): input channels of each level which can be - derived from the output shape of backbone by from_config - out_channel (int): output channel of each level - spatial_scales (list[float]): the spatial scales between input feature - maps and original input image which can be derived from the output - shape of backbone by from_config - has_extra_convs (bool): whether to add extra conv to the last level. - default False - extra_stage (int): the number of extra stages added to the last level. - default 1 - use_c5 (bool): Whether to use c5 as the input of extra stage, - otherwise p5 is used. default True - norm_type (string|None): The normalization type in FPN module. If - norm_type is None, norm will not be used after conv and if - norm_type is string, bn, gn, sync_bn are available. default None - norm_decay (float): weight decay for normalization layer weights. - default 0. - freeze_norm (bool): whether to freeze normalization layer. - default False - relu_before_extra_convs (bool): whether to add relu before extra convs. - default False - - """ - - def __init__(self, - in_channels, - out_channel, - spatial_scales=[0.25, 0.125, 0.0625, 0.03125], - has_extra_convs=False, - extra_stage=1, - use_c5=True, - norm_type=None, - norm_decay=0., - freeze_norm=False, - relu_before_extra_convs=True): - super(FPN, self).__init__() - self.out_channel = out_channel - for s in range(extra_stage): - spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
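[editor's note] The core of the FPN being deleted here is small enough to sketch without the ppdet plumbing: a 1x1 lateral conv per level, nearest-neighbor upsampling of the coarser map, elementwise addition, then a 3x3 conv to smooth aliasing. A minimal Paddle sketch of that fusion (TinyFPN and its toy channel counts are illustrative, not the class in this diff):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class TinyFPN(nn.Layer):
    # Toy two-level FPN: lateral + top-down fusion only.
    def __init__(self, c_low=512, c_high=1024, c_out=256):
        super().__init__()
        self.lat_low = nn.Conv2D(c_low, c_out, 1)    # lateral 1x1 on C4
        self.lat_high = nn.Conv2D(c_high, c_out, 1)  # lateral 1x1 on C5
        self.smooth = nn.Conv2D(c_out, c_out, 3, padding=1)

    def forward(self, c4, c5):
        p5 = self.lat_high(c5)
        p4 = self.lat_low(c4) + F.interpolate(p5, scale_factor=2., mode='nearest')
        return self.smooth(p4), p5

fpn = TinyFPN()
c4, c5 = paddle.randn([1, 512, 40, 40]), paddle.randn([1, 1024, 20, 20])
p4, p5 = fpn(c4, c5)
print(p4.shape, p5.shape)  # [1, 256, 40, 40] [1, 256, 20, 20]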
- self.spatial_scales = spatial_scales - self.has_extra_convs = has_extra_convs - self.extra_stage = extra_stage - self.use_c5 = use_c5 - self.relu_before_extra_convs = relu_before_extra_convs - self.norm_type = norm_type - self.norm_decay = norm_decay - self.freeze_norm = freeze_norm - - self.lateral_convs = [] - self.fpn_convs = [] - fan = out_channel * 3 * 3 - - # stage index 0,1,2,3 stands for res2,res3,res4,res5 on ResNet Backbone - # 0 <= st_stage < ed_stage <= 3 - st_stage = 4 - len(in_channels) - ed_stage = st_stage + len(in_channels) - 1 - for i in range(st_stage, ed_stage + 1): - if i == 3: - lateral_name = 'fpn_inner_res5_sum' - else: - lateral_name = 'fpn_inner_res{}_sum_lateral'.format(i + 2) - in_c = in_channels[i - st_stage] - if self.norm_type is not None: - lateral = self.add_sublayer( - lateral_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=1, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=in_c))) - else: - lateral = self.add_sublayer( - lateral_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=in_c)))) - self.lateral_convs.append(lateral) - - fpn_name = 'fpn_res{}_sum'.format(i + 2) - if self.norm_type is not None: - fpn_conv = self.add_sublayer( - fpn_name, - ConvNormLayer( - ch_in=out_channel, - ch_out=out_channel, - filter_size=3, - stride=1, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - fpn_conv = self.add_sublayer( - fpn_name, - nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(fpn_conv) - - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - if self.has_extra_convs: - for i in range(self.extra_stage): - lvl = ed_stage + 1 + i - if i == 0 and self.use_c5: - in_c = in_channels[-1] - else: - in_c = out_channel - extra_fpn_name = 'fpn_{}'.format(lvl + 2) - if self.norm_type is not None: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - ConvNormLayer( - ch_in=in_c, - ch_out=out_channel, - filter_size=3, - stride=2, - norm_type=self.norm_type, - norm_decay=self.norm_decay, - freeze_norm=self.freeze_norm, - initializer=XavierUniform(fan_out=fan))) - else: - extra_fpn_conv = self.add_sublayer( - extra_fpn_name, - nn.Conv2D( - in_channels=in_c, - out_channels=out_channel, - kernel_size=3, - stride=2, - padding=1, - weight_attr=ParamAttr( - initializer=XavierUniform(fan_out=fan)))) - self.fpn_convs.append(extra_fpn_conv) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'spatial_scales': [1.0 / i.stride for i in input_shape], - } - - def forward(self, body_feats): - laterals = [] - num_levels = len(body_feats) - for i in range(num_levels): - laterals.append(self.lateral_convs[i](body_feats[i])) - - for i in range(1, num_levels): - lvl = num_levels - i - upsample = F.interpolate( - laterals[lvl], - scale_factor=2., - mode='nearest', ) - laterals[lvl - 1] += upsample - - fpn_output = [] - for lvl in range(num_levels): - fpn_output.append(self.fpn_convs[lvl](laterals[lvl])) - - if self.extra_stage > 0: - # use max pool to get more levels on top of outputs (Faster R-CNN, Mask R-CNN) - if not self.has_extra_convs: - assert self.extra_stage == 1, 
'extra_stage should be 1 if FPN has not extra convs' - fpn_output.append(F.max_pool2d(fpn_output[-1], 1, stride=2)) - # add extra conv levels for RetinaNet(use_c5)/FCOS(use_p5) - else: - if self.use_c5: - extra_source = body_feats[-1] - else: - extra_source = fpn_output[-1] - fpn_output.append(self.fpn_convs[num_levels](extra_source)) - - for i in range(1, self.extra_stage): - if self.relu_before_extra_convs: - fpn_output.append(self.fpn_convs[num_levels + i](F.relu( - fpn_output[-1]))) - else: - fpn_output.append(self.fpn_convs[num_levels + i]( - fpn_output[-1])) - return fpn_output - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. / s) - for s in self.spatial_scales - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/hrfpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/hrfpn.py deleted file mode 100644 index 5c45c99..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/hrfpn.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn.functional as F -import paddle.nn as nn -from ppdet.core.workspace import register -from ..shape_spec import ShapeSpec - -__all__ = ['HRFPN'] - - -@register -class HRFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone - out_channel (int): number of output feature channels - share_conv (bool): whether to share conv for different layers' reduction - extra_stage (int): add extra stage for returning HRFPN fpn_feats - spatial_scales (list): feature map scaling factor - """ - - def __init__(self, - in_channels=[18, 36, 72, 144], - out_channel=256, - share_conv=False, - extra_stage=1, - spatial_scales=[1. / 4, 1. / 8, 1. / 16, 1. / 32], - use_bias=False): - super(HRFPN, self).__init__() - in_channel = sum(in_channels) - self.in_channel = in_channel - self.out_channel = out_channel - self.share_conv = share_conv - for i in range(extra_stage): - spatial_scales = spatial_scales + [spatial_scales[-1] / 2.] 
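[editor's note] HRFPN inverts the usual FPN recipe: instead of lateral top-down sums, it bilinearly upsamples every HRNet branch to the finest resolution, concatenates them, reduces channels with a single 1x1 conv, then rebuilds the coarser levels by average pooling. The upsample-concat-pool skeleton can be sanity-checked in isolation (a sketch with toy shapes; the 1x1 reduction conv is elided):

import paddle
import paddle.nn.functional as F

# Four HRNet branches at strides 4/8/16/32 (toy channel counts).
feats = [paddle.randn([1, c, 64 // 2**i, 64 // 2**i])
         for i, c in enumerate([18, 36, 72, 144])]

# 1) upsample everything to the finest branch and concatenate
up = [feats[0]] + [F.interpolate(f, scale_factor=2**i, mode='bilinear')
                   for i, f in enumerate(feats) if i > 0]
x = paddle.concat(up, axis=1)  # channels = 18 + 36 + 72 + 144 = 270

# 2) HRFPN would apply a 1x1 conv here to reduce 270 -> out_channel (omitted)
# 3) regenerate a pyramid by average pooling the fused map
pyramid = [x] + [F.avg_pool2d(x, kernel_size=2**i, stride=2**i)
                 for i in range(1, 4)]
print([p.shape for p in pyramid])  # strides 4/8/16/32 again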
- self.spatial_scales = spatial_scales - self.num_out = len(self.spatial_scales) - self.use_bias = use_bias - bias_attr = False if use_bias is False else None - - self.reduction = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=1, - bias_attr=bias_attr) - - if share_conv: - self.fpn_conv = nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - bias_attr=bias_attr) - else: - self.fpn_conv = [] - for i in range(self.num_out): - conv_name = "fpn_conv_" + str(i) - conv = self.add_sublayer( - conv_name, - nn.Conv2D( - in_channels=out_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - bias_attr=bias_attr)) - self.fpn_conv.append(conv) - - def forward(self, body_feats): - num_backbone_stages = len(body_feats) - - outs = [] - outs.append(body_feats[0]) - - # resize - for i in range(1, num_backbone_stages): - resized = F.interpolate( - body_feats[i], scale_factor=2**i, mode='bilinear') - outs.append(resized) - - # concat - out = paddle.concat(outs, axis=1) - assert out.shape[ - 1] == self.in_channel, 'in_channel should be {}, be received {}'.format( - out.shape[1], self.in_channel) - - # reduction - out = self.reduction(out) - - # conv - outs = [out] - for i in range(1, self.num_out): - outs.append(F.avg_pool2d(out, kernel_size=2**i, stride=2**i)) - outputs = [] - - for i in range(self.num_out): - conv_func = self.fpn_conv if self.share_conv else self.fpn_conv[i] - conv = conv_func(outs[i]) - outputs.append(conv) - - fpn_feats = [outputs[k] for k in range(self.num_out)] - return fpn_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'spatial_scales': [1.0 / i.stride for i in input_shape], - } - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channel, stride=1. / s) - for s in self.spatial_scales - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/lc_pan.py b/pdfdet/models/Paddle/ppdet/modeling/necks/lc_pan.py deleted file mode 100644 index 0c59c8a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/lc_pan.py +++ /dev/null @@ -1,168 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register, serializable - -from ..shape_spec import ShapeSpec -from ..backbones.lcnet import DepthwiseSeparable -from .csp_pan import ConvBNLayer, Channel_T, DPModule - -__all__ = ['LCPAN'] - - -@register -@serializable -class LCPAN(nn.Layer): - """Path Aggregation Network with LCNet module. - Args: - in_channels (List[int]): Number of input channels per scale. - out_channels (int): Number of output channels (used at each scale) - kernel_size (int): The conv2d kernel size of this Module. - num_features (int): Number of output features of CSPPAN module. 
- num_csp_blocks (int): Number of bottlenecks in CSPLayer. Default: 1 - use_depthwise (bool): Whether to depthwise separable convolution in - blocks. Default: True - """ - - def __init__(self, - in_channels, - out_channels, - kernel_size=5, - num_features=3, - use_depthwise=True, - act='hard_swish', - spatial_scales=[0.125, 0.0625, 0.03125]): - super(LCPAN, self).__init__() - self.conv_t = Channel_T(in_channels, out_channels, act=act) - in_channels = [out_channels] * len(spatial_scales) - self.in_channels = in_channels - self.out_channels = out_channels - self.spatial_scales = spatial_scales - self.num_features = num_features - conv_func = DPModule if use_depthwise else ConvBNLayer - - NET_CONFIG = { - #k, in_c, out_c, stride, use_se - "block1": [ - [kernel_size, out_channels * 2, out_channels * 2, 1, False], - [kernel_size, out_channels * 2, out_channels, 1, False], - ], - "block2": [ - [kernel_size, out_channels * 2, out_channels * 2, 1, False], - [kernel_size, out_channels * 2, out_channels, 1, False], - ] - } - - if self.num_features == 4: - self.first_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.second_top_conv = conv_func( - in_channels[0], in_channels[0], kernel_size, stride=2, act=act) - self.spatial_scales.append(self.spatial_scales[-1] / 2) - - # build top-down blocks - self.upsample = nn.Upsample(scale_factor=2, mode='nearest') - self.top_down_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.top_down_blocks.append( - nn.Sequential(* [ - DepthwiseSeparable( - num_channels=in_c, - num_filters=out_c, - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ - "block1"]) - ])) - - # build bottom-up blocks - self.downsamples = nn.LayerList() - self.bottom_up_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsamples.append( - conv_func( - in_channels[idx], - in_channels[idx], - kernel_size=kernel_size, - stride=2, - act=act)) - self.bottom_up_blocks.append( - nn.Sequential(* [ - DepthwiseSeparable( - num_channels=in_c, - num_filters=out_c, - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG[ - "block2"]) - ])) - - def forward(self, inputs): - """ - Args: - inputs (tuple[Tensor]): input features. - Returns: - tuple[Tensor]: CSPPAN features. - """ - assert len(inputs) == len(self.in_channels) - inputs = self.conv_t(inputs) - - # top-down path - inner_outs = [inputs[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = inputs[idx - 1] - - upsample_feat = self.upsample(feat_heigh) - - inner_out = self.top_down_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat([upsample_feat, feat_low], 1)) - inner_outs.insert(0, inner_out) - - # bottom-up path - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsamples[idx](feat_low) - out = self.bottom_up_blocks[idx](paddle.concat( - [downsample_feat, feat_height], 1)) - outs.append(out) - - top_features = None - if self.num_features == 4: - top_features = self.first_top_conv(inputs[-1]) - top_features = top_features + self.second_top_conv(outs[-1]) - outs.append(top_features) - - return tuple(outs) - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.out_channels, stride=1. 
/ s) - for s in self.spatial_scales - ] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/ttf_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/ttf_fpn.py deleted file mode 100644 index 60cc69f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/ttf_fpn.py +++ /dev/null @@ -1,242 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.nn.initializer import Constant, Uniform, Normal, XavierUniform -from ppdet.core.workspace import register, serializable -from paddle.regularizer import L2Decay -from ppdet.modeling.layers import DeformableConvV2, ConvNormLayer, LiteConv -import math -from ppdet.modeling.ops import batch_norm -from ..shape_spec import ShapeSpec - -__all__ = ['TTFFPN'] - - -class Upsample(nn.Layer): - def __init__(self, ch_in, ch_out, norm_type='bn'): - super(Upsample, self).__init__() - fan_in = ch_in * 3 * 3 - stdv = 1. / math.sqrt(fan_in) - self.dcn = DeformableConvV2( - ch_in, - ch_out, - kernel_size=3, - weight_attr=ParamAttr(initializer=Uniform(-stdv, stdv)), - bias_attr=ParamAttr( - initializer=Constant(0), - regularizer=L2Decay(0.), - learning_rate=2.), - lr_scale=2., - regularizer=L2Decay(0.)) - - self.bn = batch_norm( - ch_out, norm_type=norm_type, initializer=Constant(1.)) - - def forward(self, feat): - dcn = self.dcn(feat) - bn = self.bn(dcn) - relu = F.relu(bn) - out = F.interpolate(relu, scale_factor=2., mode='bilinear') - return out - - -class DeConv(nn.Layer): - def __init__(self, ch_in, ch_out, norm_type='bn'): - super(DeConv, self).__init__() - self.deconv = nn.Sequential() - conv1 = ConvNormLayer( - ch_in=ch_in, - ch_out=ch_out, - stride=1, - filter_size=1, - norm_type=norm_type, - initializer=XavierUniform()) - conv2 = nn.Conv2DTranspose( - in_channels=ch_out, - out_channels=ch_out, - kernel_size=4, - padding=1, - stride=2, - groups=ch_out, - weight_attr=ParamAttr(initializer=XavierUniform()), - bias_attr=False) - bn = batch_norm(ch_out, norm_type=norm_type, norm_decay=0.) 
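[editor's note] ESPAN and LCPAN share the same two-pass wiring; only the fusion block differs (ES_Block vs. stacked DepthwiseSeparable). The top-down pass prepends fused maps to inner_outs, then the bottom-up pass appends downsampled fusions to outs. The index arithmetic is the fiddly part, so here it is with stand-in fusion functions (pure Python; fuse/up/down are placeholders for the real blocks):

# Stand-ins: in ppdet, fuse(a, b) is concat + ES_Block / DepthwiseSeparable,
# up is nn.Upsample(scale_factor=2), down is a stride-2 conv.
fuse = lambda a, b: f'fuse({a},{b})'
up = lambda x: f'up({x})'
down = lambda x: f'down({x})'

levels = ['P3', 'P4', 'P5']  # after Channel_T, all levels share one width

# top-down: coarse to fine, results inserted at the front
inner = [levels[-1]]
for idx in range(len(levels) - 1, 0, -1):
    inner.insert(0, fuse(up(inner[0]), levels[idx - 1]))

# bottom-up: fine to coarse, results appended at the back
outs = [inner[0]]
for idx in range(len(levels) - 1):
    outs.append(fuse(down(outs[-1]), inner[idx + 1]))

print(outs)  # three fused maps, finest first — mirrors ESPAN/LCPAN forward()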
- conv3 = ConvNormLayer( - ch_in=ch_out, - ch_out=ch_out, - stride=1, - filter_size=1, - norm_type=norm_type, - initializer=XavierUniform()) - - self.deconv.add_sublayer('conv1', conv1) - self.deconv.add_sublayer('relu6_1', nn.ReLU6()) - self.deconv.add_sublayer('conv2', conv2) - self.deconv.add_sublayer('bn', bn) - self.deconv.add_sublayer('relu6_2', nn.ReLU6()) - self.deconv.add_sublayer('conv3', conv3) - self.deconv.add_sublayer('relu6_3', nn.ReLU6()) - - def forward(self, inputs): - return self.deconv(inputs) - - -class LiteUpsample(nn.Layer): - def __init__(self, ch_in, ch_out, norm_type='bn'): - super(LiteUpsample, self).__init__() - self.deconv = DeConv(ch_in, ch_out, norm_type=norm_type) - self.conv = LiteConv(ch_in, ch_out, norm_type=norm_type) - - def forward(self, inputs): - deconv_up = self.deconv(inputs) - conv = self.conv(inputs) - interp_up = F.interpolate(conv, scale_factor=2., mode='bilinear') - return deconv_up + interp_up - - -class ShortCut(nn.Layer): - def __init__(self, - layer_num, - ch_in, - ch_out, - norm_type='bn', - lite_neck=False, - name=None): - super(ShortCut, self).__init__() - shortcut_conv = nn.Sequential() - for i in range(layer_num): - fan_out = 3 * 3 * ch_out - std = math.sqrt(2. / fan_out) - in_channels = ch_in if i == 0 else ch_out - shortcut_name = name + '.conv.{}'.format(i) - if lite_neck: - shortcut_conv.add_sublayer( - shortcut_name, - LiteConv( - in_channels=in_channels, - out_channels=ch_out, - with_act=i < layer_num - 1, - norm_type=norm_type)) - else: - shortcut_conv.add_sublayer( - shortcut_name, - nn.Conv2D( - in_channels=in_channels, - out_channels=ch_out, - kernel_size=3, - padding=1, - weight_attr=ParamAttr(initializer=Normal(0, std)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - if i < layer_num - 1: - shortcut_conv.add_sublayer(shortcut_name + '.act', - nn.ReLU()) - self.shortcut = self.add_sublayer('shortcut', shortcut_conv) - - def forward(self, feat): - out = self.shortcut(feat) - return out - - -@register -@serializable -class TTFFPN(nn.Layer): - """ - Args: - in_channels (list): number of input feature channels from backbone. - [128,256,512,1024] by default, means the channels of DarkNet53 - backbone return_idx [1,2,3,4]. - planes (list): the number of output feature channels of FPN. - [256, 128, 64] by default - shortcut_num (list): the number of convolution layers in each shortcut. - [3,2,1] by default, means DarkNet53 backbone return_idx_1 has 3 convs - in its shortcut, return_idx_2 has 2 convs and return_idx_3 has 1 conv. - norm_type (string): norm type, 'sync_bn', 'bn', 'gn' are optional. - bn by default - lite_neck (bool): whether to use lite conv in TTFNet FPN, - False by default - fusion_method (string): the method to fusion upsample and lateral layer. - 'add' and 'concat' are optional, add by default - """ - - __shared__ = ['norm_type'] - - def __init__(self, - in_channels, - planes=[256, 128, 64], - shortcut_num=[3, 2, 1], - norm_type='bn', - lite_neck=False, - fusion_method='add'): - super(TTFFPN, self).__init__() - self.planes = planes - self.shortcut_num = shortcut_num[::-1] - self.shortcut_len = len(shortcut_num) - self.ch_in = in_channels[::-1] - self.fusion_method = fusion_method - - self.upsample_list = [] - self.shortcut_list = [] - self.upper_list = [] - for i, out_c in enumerate(self.planes): - in_c = self.ch_in[i] if i == 0 else self.upper_list[-1] - upsample_module = LiteUpsample if lite_neck else Upsample - upsample = self.add_sublayer( - 'upsample.' 
+ str(i), - upsample_module( - in_c, out_c, norm_type=norm_type)) - self.upsample_list.append(upsample) - if i < self.shortcut_len: - shortcut = self.add_sublayer( - 'shortcut.' + str(i), - ShortCut( - self.shortcut_num[i], - self.ch_in[i + 1], - out_c, - norm_type=norm_type, - lite_neck=lite_neck, - name='shortcut.' + str(i))) - self.shortcut_list.append(shortcut) - if self.fusion_method == 'add': - upper_c = out_c - elif self.fusion_method == 'concat': - upper_c = out_c * 2 - else: - raise ValueError('Illegal fusion method. Expected add or\ - concat, but received {}'.format(self.fusion_method)) - self.upper_list.append(upper_c) - - def forward(self, inputs): - feat = inputs[-1] - for i, out_c in enumerate(self.planes): - feat = self.upsample_list[i](feat) - if i < self.shortcut_len: - shortcut = self.shortcut_list[i](inputs[-i - 2]) - if self.fusion_method == 'add': - feat = feat + shortcut - else: - feat = paddle.concat([feat, shortcut], axis=1) - return feat - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=self.upper_list[-1], )] diff --git a/pdfdet/models/Paddle/ppdet/modeling/necks/yolo_fpn.py b/pdfdet/models/Paddle/ppdet/modeling/necks/yolo_fpn.py deleted file mode 100644 index 79f4cea..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/necks/yolo_fpn.py +++ /dev/null @@ -1,1099 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.layers import DropBlock -from ppdet.modeling.ops import get_act_fn -from ..backbones.darknet import ConvBNLayer -from ..shape_spec import ShapeSpec -from ..backbones.csp_darknet import BaseConv, DWConv, CSPLayer - -__all__ = ['YOLOv3FPN', 'PPYOLOFPN', 'PPYOLOTinyFPN', 'PPYOLOPAN', 'YOLOCSPPAN'] - - -def add_coord(x, data_format): - b = paddle.shape(x)[0] - if data_format == 'NCHW': - h, w = x.shape[2], x.shape[3] - else: - h, w = x.shape[1], x.shape[2] - - gx = paddle.cast(paddle.arange(w) / ((w - 1.) * 2.0) - 1., x.dtype) - gy = paddle.cast(paddle.arange(h) / ((h - 1.) 
* 2.0) - 1., x.dtype) - - if data_format == 'NCHW': - gx = gx.reshape([1, 1, 1, w]).expand([b, 1, h, w]) - gy = gy.reshape([1, 1, h, 1]).expand([b, 1, h, w]) - else: - gx = gx.reshape([1, 1, w, 1]).expand([b, h, w, 1]) - gy = gy.reshape([1, h, 1, 1]).expand([b, h, w, 1]) - - gx.stop_gradient = True - gy.stop_gradient = True - return gx, gy - - -class YoloDetBlock(nn.Layer): - def __init__(self, - ch_in, - channel, - norm_type, - freeze_norm=False, - name='', - data_format='NCHW'): - """ - YOLODetBlock layer for yolov3, see https://arxiv.org/abs/1804.02767 - - Args: - ch_in (int): input channel - channel (int): base channel - norm_type (str): batch norm type - freeze_norm (bool): whether to freeze norm, default False - name (str): layer name - data_format (str): data format, NCHW or NHWC - """ - super(YoloDetBlock, self).__init__() - self.ch_in = ch_in - self.channel = channel - assert channel % 2 == 0, \ - "channel {} cannot be divided by 2".format(channel) - conv_def = [ - ['conv0', ch_in, channel, 1, '.0.0'], - ['conv1', channel, channel * 2, 3, '.0.1'], - ['conv2', channel * 2, channel, 1, '.1.0'], - ['conv3', channel, channel * 2, 3, '.1.1'], - ['route', channel * 2, channel, 1, '.2'], - ] - - self.conv_module = nn.Sequential() - for idx, (conv_name, ch_in, ch_out, filter_size, - post_name) in enumerate(conv_def): - self.conv_module.add_sublayer( - conv_name, - ConvBNLayer( - ch_in=ch_in, - ch_out=ch_out, - filter_size=filter_size, - padding=(filter_size - 1) // 2, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name + post_name)) - - self.tip = ConvBNLayer( - ch_in=channel, - ch_out=channel * 2, - filter_size=3, - padding=1, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name + '.tip') - - def forward(self, inputs): - route = self.conv_module(inputs) - tip = self.tip(route) - return route, tip - - -class SPP(nn.Layer): - def __init__(self, - ch_in, - ch_out, - k, - pool_size, - norm_type='bn', - freeze_norm=False, - name='', - act='leaky', - data_format='NCHW'): - """ - SPP layer, which consist of four pooling layer follwed by conv layer - - Args: - ch_in (int): input channel of conv layer - ch_out (int): output channel of conv layer - k (int): kernel size of conv layer - norm_type (str): batch norm type - freeze_norm (bool): whether to freeze norm, default False - name (str): layer name - act (str): activation function - data_format (str): data format, NCHW or NHWC - """ - super(SPP, self).__init__() - self.pool = [] - self.data_format = data_format - for size in pool_size: - pool = self.add_sublayer( - '{}.pool1'.format(name), - nn.MaxPool2D( - kernel_size=size, - stride=1, - padding=size // 2, - data_format=data_format, - ceil_mode=False)) - self.pool.append(pool) - self.conv = ConvBNLayer( - ch_in, - ch_out, - k, - padding=k // 2, - norm_type=norm_type, - freeze_norm=freeze_norm, - name=name, - act=act, - data_format=data_format) - - def forward(self, x): - outs = [x] - for pool in self.pool: - outs.append(pool(x)) - if self.data_format == "NCHW": - y = paddle.concat(outs, axis=1) - else: - y = paddle.concat(outs, axis=-1) - - y = self.conv(y) - return y - - -class CoordConv(nn.Layer): - def __init__(self, - ch_in, - ch_out, - filter_size, - padding, - norm_type, - freeze_norm=False, - name='', - data_format='NCHW'): - """ - CoordConv layer, see https://arxiv.org/abs/1807.03247 - - Args: - ch_in (int): input channel - ch_out (int): output channel - filter_size (int): filter size, default 3 - padding (int): 
padding size, default 0 - norm_type (str): batch norm type, default bn - name (str): layer name - data_format (str): data format, NCHW or NHWC - - """ - super(CoordConv, self).__init__() - self.conv = ConvBNLayer( - ch_in + 2, - ch_out, - filter_size=filter_size, - padding=padding, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name) - self.data_format = data_format - - def forward(self, x): - gx, gy = add_coord(x, self.data_format) - if self.data_format == 'NCHW': - y = paddle.concat([x, gx, gy], axis=1) - else: - y = paddle.concat([x, gx, gy], axis=-1) - y = self.conv(y) - return y - - -class PPYOLODetBlock(nn.Layer): - def __init__(self, cfg, name, data_format='NCHW'): - """ - PPYOLODetBlock layer - - Args: - cfg (list): layer configs for this block - name (str): block name - data_format (str): data format, NCHW or NHWC - """ - super(PPYOLODetBlock, self).__init__() - self.conv_module = nn.Sequential() - for idx, (conv_name, layer, args, kwargs) in enumerate(cfg[:-1]): - kwargs.update( - name='{}.{}'.format(name, conv_name), data_format=data_format) - self.conv_module.add_sublayer(conv_name, layer(*args, **kwargs)) - - conv_name, layer, args, kwargs = cfg[-1] - kwargs.update( - name='{}.{}'.format(name, conv_name), data_format=data_format) - self.tip = layer(*args, **kwargs) - - def forward(self, inputs): - route = self.conv_module(inputs) - tip = self.tip(route) - return route, tip - - -class PPYOLOTinyDetBlock(nn.Layer): - def __init__(self, - ch_in, - ch_out, - name, - drop_block=False, - block_size=3, - keep_prob=0.9, - data_format='NCHW'): - """ - PPYOLO Tiny DetBlock layer - Args: - ch_in (list): input channel number - ch_out (list): output channel number - name (str): block name - drop_block: whether user DropBlock - block_size: drop block size - keep_prob: probability to keep block in DropBlock - data_format (str): data format, NCHW or NHWC - """ - super(PPYOLOTinyDetBlock, self).__init__() - self.drop_block_ = drop_block - self.conv_module = nn.Sequential() - - cfgs = [ - # name, in channels, out channels, filter_size, - # stride, padding, groups - ['.0', ch_in, ch_out, 1, 1, 0, 1], - ['.1', ch_out, ch_out, 5, 1, 2, ch_out], - ['.2', ch_out, ch_out, 1, 1, 0, 1], - ['.route', ch_out, ch_out, 5, 1, 2, ch_out], - ] - for cfg in cfgs: - conv_name, conv_ch_in, conv_ch_out, filter_size, stride, padding, \ - groups = cfg - self.conv_module.add_sublayer( - name + conv_name, - ConvBNLayer( - ch_in=conv_ch_in, - ch_out=conv_ch_out, - filter_size=filter_size, - stride=stride, - padding=padding, - groups=groups, - name=name + conv_name)) - - self.tip = ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out, - filter_size=1, - stride=1, - padding=0, - groups=1, - name=name + conv_name) - - if self.drop_block_: - self.drop_block = DropBlock( - block_size=block_size, - keep_prob=keep_prob, - data_format=data_format, - name=name + '.dropblock') - - def forward(self, inputs): - if self.drop_block_: - inputs = self.drop_block(inputs) - route = self.conv_module(inputs) - tip = self.tip(route) - return route, tip - - -class PPYOLODetBlockCSP(nn.Layer): - def __init__(self, - cfg, - ch_in, - ch_out, - act, - norm_type, - name, - data_format='NCHW'): - """ - PPYOLODetBlockCSP layer - - Args: - cfg (list): layer configs for this block - ch_in (int): input channel - ch_out (int): output channel - act (str): default mish - name (str): block name - data_format (str): data format, NCHW or NHWC - """ - super(PPYOLODetBlockCSP, self).__init__() - self.data_format = data_format - 
self.conv1 = ConvBNLayer( - ch_in, - ch_out, - 1, - padding=0, - act=act, - norm_type=norm_type, - name=name + '.left', - data_format=data_format) - self.conv2 = ConvBNLayer( - ch_in, - ch_out, - 1, - padding=0, - act=act, - norm_type=norm_type, - name=name + '.right', - data_format=data_format) - self.conv3 = ConvBNLayer( - ch_out * 2, - ch_out * 2, - 1, - padding=0, - act=act, - norm_type=norm_type, - name=name, - data_format=data_format) - self.conv_module = nn.Sequential() - for idx, (layer_name, layer, args, kwargs) in enumerate(cfg): - kwargs.update(name=name + layer_name, data_format=data_format) - self.conv_module.add_sublayer(layer_name, layer(*args, **kwargs)) - - def forward(self, inputs): - conv_left = self.conv1(inputs) - conv_right = self.conv2(inputs) - conv_left = self.conv_module(conv_left) - if self.data_format == 'NCHW': - conv = paddle.concat([conv_left, conv_right], axis=1) - else: - conv = paddle.concat([conv_left, conv_right], axis=-1) - - conv = self.conv3(conv) - return conv, conv - - -@register -@serializable -class YOLOv3FPN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[256, 512, 1024], - norm_type='bn', - freeze_norm=False, - data_format='NCHW'): - """ - YOLOv3FPN layer - - Args: - in_channels (list): input channels for fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - - """ - super(YOLOv3FPN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_blocks = len(in_channels) - - self._out_channels = [] - self.yolo_blocks = [] - self.routes = [] - self.data_format = data_format - for i in range(self.num_blocks): - name = 'yolo_block.{}'.format(i) - in_channel = in_channels[-i - 1] - if i > 0: - in_channel += 512 // (2**i) - yolo_block = self.add_sublayer( - name, - YoloDetBlock( - in_channel, - channel=512 // (2**i), - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.yolo_blocks.append(yolo_block) - # tip layer output channel doubled - self._out_channels.append(1024 // (2**i)) - - if i < self.num_blocks - 1: - name = 'yolo_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=512 // (2**i), - ch_out=256 // (2**i), - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.routes.append(route) - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - yolo_feats = [] - - # add embedding features output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.yolo_blocks[i](block) - yolo_feats.append(tip) - - if for_mot: - # add embedding features output - emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - if for_mot: - return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} - else: - return yolo_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class 
PPYOLOFPN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[512, 1024, 2048], - norm_type='bn', - freeze_norm=False, - data_format='NCHW', - coord_conv=False, - conv_block_num=2, - drop_block=False, - block_size=3, - keep_prob=0.9, - spp=False): - """ - PPYOLOFPN layer - - Args: - in_channels (list): input channels for fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - coord_conv (bool): whether use CoordConv or not - conv_block_num (int): conv block num of each pan block - drop_block (bool): whether use DropBlock or not - block_size (int): block size of DropBlock - keep_prob (float): keep probability of DropBlock - spp (bool): whether use spp or not - - """ - super(PPYOLOFPN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_blocks = len(in_channels) - # parse kwargs - self.coord_conv = coord_conv - self.drop_block = drop_block - self.block_size = block_size - self.keep_prob = keep_prob - self.spp = spp - self.conv_block_num = conv_block_num - self.data_format = data_format - if self.coord_conv: - ConvLayer = CoordConv - else: - ConvLayer = ConvBNLayer - - if self.drop_block: - dropblock_cfg = [[ - 'dropblock', DropBlock, [self.block_size, self.keep_prob], - dict() - ]] - else: - dropblock_cfg = [] - - self._out_channels = [] - self.yolo_blocks = [] - self.routes = [] - for i, ch_in in enumerate(self.in_channels[::-1]): - if i > 0: - ch_in += 512 // (2**i) - channel = 64 * (2**self.num_blocks) // (2**i) - base_cfg = [] - c_in, c_out = ch_in, channel - for j in range(self.conv_block_num): - base_cfg += [ - [ - 'conv{}'.format(2 * j), ConvLayer, [c_in, c_out, 1], - dict( - padding=0, - norm_type=norm_type, - freeze_norm=freeze_norm) - ], - [ - 'conv{}'.format(2 * j + 1), ConvBNLayer, - [c_out, c_out * 2, 3], dict( - padding=1, - norm_type=norm_type, - freeze_norm=freeze_norm) - ], - ] - c_in, c_out = c_out * 2, c_out - - base_cfg += [[ - 'route', ConvLayer, [c_in, c_out, 1], dict( - padding=0, norm_type=norm_type, freeze_norm=freeze_norm) - ], [ - 'tip', ConvLayer, [c_out, c_out * 2, 3], dict( - padding=1, norm_type=norm_type, freeze_norm=freeze_norm) - ]] - - if self.conv_block_num == 2: - if i == 0: - if self.spp: - spp_cfg = [[ - 'spp', SPP, [channel * 4, channel, 1], dict( - pool_size=[5, 9, 13], - norm_type=norm_type, - freeze_norm=freeze_norm) - ]] - else: - spp_cfg = [] - cfg = base_cfg[0:3] + spp_cfg + base_cfg[ - 3:4] + dropblock_cfg + base_cfg[4:6] - else: - cfg = base_cfg[0:2] + dropblock_cfg + base_cfg[2:6] - elif self.conv_block_num == 0: - if self.spp and i == 0: - spp_cfg = [[ - 'spp', SPP, [c_in * 4, c_in, 1], dict( - pool_size=[5, 9, 13], - norm_type=norm_type, - freeze_norm=freeze_norm) - ]] - else: - spp_cfg = [] - cfg = spp_cfg + dropblock_cfg + base_cfg - name = 'yolo_block.{}'.format(i) - yolo_block = self.add_sublayer(name, PPYOLODetBlock(cfg, name)) - self.yolo_blocks.append(yolo_block) - self._out_channels.append(channel * 2) - if i < self.num_blocks - 1: - name = 'yolo_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=channel, - ch_out=256 // (2**i), - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - freeze_norm=freeze_norm, - data_format=data_format, - name=name)) - self.routes.append(route) - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - yolo_feats = [] - - # add embedding features 
output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.yolo_blocks[i](block) - yolo_feats.append(tip) - - if for_mot: - # add embedding features output - emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - if for_mot: - return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} - else: - return yolo_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class PPYOLOTinyFPN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[80, 56, 34], - detection_block_channels=[160, 128, 96], - norm_type='bn', - data_format='NCHW', - **kwargs): - """ - PPYOLO Tiny FPN layer - Args: - in_channels (list): input channels for fpn - detection_block_channels (list): channels in fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - kwargs: extra key-value pairs, such as parameter of DropBlock and spp - """ - super(PPYOLOTinyFPN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels[::-1] - assert len(detection_block_channels - ) > 0, "detection_block_channelslength should > 0" - self.detection_block_channels = detection_block_channels - self.data_format = data_format - self.num_blocks = len(in_channels) - # parse kwargs - self.drop_block = kwargs.get('drop_block', False) - self.block_size = kwargs.get('block_size', 3) - self.keep_prob = kwargs.get('keep_prob', 0.9) - - self.spp_ = kwargs.get('spp', False) - if self.spp_: - self.spp = SPP(self.in_channels[0] * 4, - self.in_channels[0], - k=1, - pool_size=[5, 9, 13], - norm_type=norm_type, - name='spp') - - self._out_channels = [] - self.yolo_blocks = [] - self.routes = [] - for i, ( - ch_in, ch_out - ) in enumerate(zip(self.in_channels, self.detection_block_channels)): - name = 'yolo_block.{}'.format(i) - if i > 0: - ch_in += self.detection_block_channels[i - 1] - yolo_block = self.add_sublayer( - name, - PPYOLOTinyDetBlock( - ch_in, - ch_out, - name, - drop_block=self.drop_block, - block_size=self.block_size, - keep_prob=self.keep_prob)) - self.yolo_blocks.append(yolo_block) - self._out_channels.append(ch_out) - - if i < self.num_blocks - 1: - name = 'yolo_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=ch_out, - ch_out=ch_out, - filter_size=1, - stride=1, - padding=0, - norm_type=norm_type, - data_format=data_format, - name=name)) - self.routes.append(route) - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - yolo_feats = [] - - # add embedding features output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i == 0 and self.spp_: - block = self.spp(block) - - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.yolo_blocks[i](block) - yolo_feats.append(tip) - - if for_mot: - # add embedding features output - 
emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - if for_mot: - return {'yolo_feats': yolo_feats, 'emb_feats': emb_feats} - else: - return yolo_feats - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class PPYOLOPAN(nn.Layer): - __shared__ = ['norm_type', 'data_format'] - - def __init__(self, - in_channels=[512, 1024, 2048], - norm_type='bn', - data_format='NCHW', - act='mish', - conv_block_num=3, - drop_block=False, - block_size=3, - keep_prob=0.9, - spp=False): - """ - PPYOLOPAN layer with SPP, DropBlock and CSP connection. - - Args: - in_channels (list): input channels for fpn - norm_type (str): batch norm type, default bn - data_format (str): data format, NCHW or NHWC - act (str): activation function, default mish - conv_block_num (int): conv block num of each pan block - drop_block (bool): whether use DropBlock or not - block_size (int): block size of DropBlock - keep_prob (float): keep probability of DropBlock - spp (bool): whether use spp or not - - """ - super(PPYOLOPAN, self).__init__() - assert len(in_channels) > 0, "in_channels length should > 0" - self.in_channels = in_channels - self.num_blocks = len(in_channels) - # parse kwargs - self.drop_block = drop_block - self.block_size = block_size - self.keep_prob = keep_prob - self.spp = spp - self.conv_block_num = conv_block_num - self.data_format = data_format - if self.drop_block: - dropblock_cfg = [[ - 'dropblock', DropBlock, [self.block_size, self.keep_prob], - dict() - ]] - else: - dropblock_cfg = [] - - # fpn - self.fpn_blocks = [] - self.fpn_routes = [] - fpn_channels = [] - for i, ch_in in enumerate(self.in_channels[::-1]): - if i > 0: - ch_in += 512 // (2**(i - 1)) - channel = 512 // (2**i) - base_cfg = [] - for j in range(self.conv_block_num): - base_cfg += [ - # name, layer, args - [ - '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], - dict( - padding=0, act=act, norm_type=norm_type) - ], - [ - '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], - dict( - padding=1, act=act, norm_type=norm_type) - ] - ] - - if i == 0 and self.spp: - base_cfg[3] = [ - 'spp', SPP, [channel * 4, channel, 1], dict( - pool_size=[5, 9, 13], act=act, norm_type=norm_type) - ] - - cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] - name = 'fpn.{}'.format(i) - fpn_block = self.add_sublayer( - name, - PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, - data_format)) - self.fpn_blocks.append(fpn_block) - fpn_channels.append(channel * 2) - if i < self.num_blocks - 1: - name = 'fpn_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=channel * 2, - ch_out=channel, - filter_size=1, - stride=1, - padding=0, - act=act, - norm_type=norm_type, - data_format=data_format, - name=name)) - self.fpn_routes.append(route) - # pan - self.pan_blocks = [] - self.pan_routes = [] - self._out_channels = [512 // (2**(self.num_blocks - 2)), ] - for i in reversed(range(self.num_blocks - 1)): - name = 'pan_transition.{}'.format(i) - route = self.add_sublayer( - name, - ConvBNLayer( - ch_in=fpn_channels[i + 1], - ch_out=fpn_channels[i + 1], - filter_size=3, - stride=2, - padding=1, - act=act, - norm_type=norm_type, - data_format=data_format, - name=name)) - self.pan_routes = [route, ] + self.pan_routes - 
base_cfg = [] - ch_in = fpn_channels[i] + fpn_channels[i + 1] - channel = 512 // (2**i) - for j in range(self.conv_block_num): - base_cfg += [ - # name, layer, args - [ - '{}.0'.format(j), ConvBNLayer, [channel, channel, 1], - dict( - padding=0, act=act, norm_type=norm_type) - ], - [ - '{}.1'.format(j), ConvBNLayer, [channel, channel, 3], - dict( - padding=1, act=act, norm_type=norm_type) - ] - ] - - cfg = base_cfg[:4] + dropblock_cfg + base_cfg[4:] - name = 'pan.{}'.format(i) - pan_block = self.add_sublayer( - name, - PPYOLODetBlockCSP(cfg, ch_in, channel, act, norm_type, name, - data_format)) - - self.pan_blocks = [pan_block, ] + self.pan_blocks - self._out_channels.append(channel * 2) - - self._out_channels = self._out_channels[::-1] - - def forward(self, blocks, for_mot=False): - assert len(blocks) == self.num_blocks - blocks = blocks[::-1] - fpn_feats = [] - - # add embedding features output for multi-object tracking model - if for_mot: - emb_feats = [] - - for i, block in enumerate(blocks): - if i > 0: - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - route, tip = self.fpn_blocks[i](block) - fpn_feats.append(tip) - - if for_mot: - # add embedding features output - emb_feats.append(route) - - if i < self.num_blocks - 1: - route = self.fpn_routes[i](route) - route = F.interpolate( - route, scale_factor=2., data_format=self.data_format) - - pan_feats = [fpn_feats[-1], ] - route = fpn_feats[self.num_blocks - 1] - for i in reversed(range(self.num_blocks - 1)): - block = fpn_feats[i] - route = self.pan_routes[i](route) - if self.data_format == 'NCHW': - block = paddle.concat([route, block], axis=1) - else: - block = paddle.concat([route, block], axis=-1) - - route, tip = self.pan_blocks[i](block) - pan_feats.append(tip) - - if for_mot: - return {'yolo_feats': pan_feats[::-1], 'emb_feats': emb_feats} - else: - return pan_feats[::-1] - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] - - -@register -@serializable -class YOLOCSPPAN(nn.Layer): - """ - YOLO CSP-PAN, used in YOLOv5 and YOLOX. 
- """ - __shared__ = ['depth_mult', 'data_format', 'act', 'trt'] - - def __init__(self, - depth_mult=1.0, - in_channels=[256, 512, 1024], - depthwise=False, - data_format='NCHW', - act='silu', - trt=False): - super(YOLOCSPPAN, self).__init__() - self.in_channels = in_channels - self._out_channels = in_channels - Conv = DWConv if depthwise else BaseConv - - self.data_format = data_format - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - self.upsample = nn.Upsample(scale_factor=2, mode="nearest") - - # top-down fpn - self.lateral_convs = nn.LayerList() - self.fpn_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append( - BaseConv( - int(in_channels[idx]), - int(in_channels[idx - 1]), - 1, - 1, - act=act)) - self.fpn_blocks.append( - CSPLayer( - int(in_channels[idx - 1] * 2), - int(in_channels[idx - 1]), - round(3 * depth_mult), - shortcut=False, - depthwise=depthwise, - act=act)) - - # bottom-up pan - self.downsample_convs = nn.LayerList() - self.pan_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsample_convs.append( - Conv( - int(in_channels[idx]), - int(in_channels[idx]), - 3, - stride=2, - act=act)) - self.pan_blocks.append( - CSPLayer( - int(in_channels[idx] * 2), - int(in_channels[idx + 1]), - round(3 * depth_mult), - shortcut=False, - depthwise=depthwise, - act=act)) - - def forward(self, feats, for_mot=False): - assert len(feats) == len(self.in_channels) - - # top-down fpn - inner_outs = [feats[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = feats[idx - 1] - feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( - feat_heigh) - inner_outs[0] = feat_heigh - - upsample_feat = F.interpolate( - feat_heigh, - scale_factor=2., - mode="nearest", - data_format=self.data_format) - inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat( - [upsample_feat, feat_low], axis=1)) - inner_outs.insert(0, inner_out) - - # bottom-up pan - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsample_convs[idx](feat_low) - out = self.pan_blocks[idx](paddle.concat( - [downsample_feat, feat_height], axis=1)) - outs.append(out) - - return outs - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_channels': [i.channels for i in input_shape], } - - @property - def out_shape(self): - return [ShapeSpec(channels=c) for c in self._out_channels] diff --git a/pdfdet/models/Paddle/ppdet/modeling/ops.py b/pdfdet/models/Paddle/ppdet/modeling/ops.py deleted file mode 100644 index d9a1192..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ops.py +++ /dev/null @@ -1,1114 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import paddle -import paddle.nn.functional as F -import paddle.nn as nn -from paddle import ParamAttr -from paddle.regularizer import L2Decay -try: - import paddle._legacy_C_ops as C_ops -except ImportError: - import paddle._C_ops as C_ops - -from paddle import in_dynamic_mode -from paddle.common_ops_import import Variable, LayerHelper, check_variable_and_dtype, check_type, check_dtype - -__all__ = [ - 'prior_box', 'generate_proposals', 'box_coder', 'multiclass_nms', - 'distribute_fpn_proposals', 'matrix_nms', 'batch_norm', 'mish', 'silu', - 'swish', 'identity', 'anchor_generator' ] - - -def identity(x): - return x - - -def mish(x): - return F.mish(x) if hasattr(F, 'mish') else x * F.tanh(F.softplus(x)) - - -def silu(x): - return F.silu(x) - - -def swish(x): - return x * F.sigmoid(x) - - -TRT_ACT_SPEC = {'swish': swish, 'silu': swish} - -ACT_SPEC = {'mish': mish, 'silu': silu} - - -def get_act_fn(act=None, trt=False): - assert act is None or isinstance(act, ( - str, dict)), 'name of activation should be str, dict or None' - if not act: - return identity - - if isinstance(act, dict): - name = act['name'] - act.pop('name') - kwargs = act - else: - name = act - kwargs = dict() - - if trt and name in TRT_ACT_SPEC: - fn = TRT_ACT_SPEC[name] - elif name in ACT_SPEC: - fn = ACT_SPEC[name] - else: - fn = getattr(F, name) - - return lambda x: fn(x, **kwargs) - - -def batch_norm(ch, - norm_type='bn', - norm_decay=0., - freeze_norm=False, - initializer=None, - data_format='NCHW'): - - norm_lr = 0. if freeze_norm else 1. - weight_attr = ParamAttr( - initializer=initializer, - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - bias_attr = ParamAttr( - learning_rate=norm_lr, - regularizer=L2Decay(norm_decay), - trainable=False if freeze_norm else True) - - if norm_type in ['sync_bn', 'bn']: - norm_layer = nn.BatchNorm2D( - ch, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) - - norm_params = norm_layer.parameters() - if freeze_norm: - for param in norm_params: - param.stop_gradient = True - - return norm_layer - - -@paddle.jit.not_to_static -def anchor_generator(input, - anchor_sizes=None, - aspect_ratios=None, - variance=[0.1, 0.1, 0.2, 0.2], - stride=None, - offset=0.5): - """ - **Anchor generator operator** - Generate anchors for the Faster RCNN algorithm. - Each position of the input produces N anchors, N = - size(anchor_sizes) * size(aspect_ratios). The order of generated anchors - is first the aspect_ratios loop, then the anchor_sizes loop. - Args: - input(Variable): 4-D Tensor with shape [N,C,H,W]. The input feature map. - anchor_sizes(float32|list|tuple, optional): The anchor sizes of generated - anchors, given in absolute pixels e.g. [64., 128., 256., 512.]. - For instance, the anchor size of 64 means the area of this anchor - equals 64**2. None by default. - aspect_ratios(float32|list|tuple, optional): The height / width ratios - of generated anchors, e.g. [0.5, 1.0, 2.0]. None by default. - variance(list|tuple, optional): The variances to be used in box - regression deltas. The data type is float32, [0.1, 0.1, 0.2, 0.2] by - default. - stride(list|tuple, optional): The anchors stride across width and height. - The data type is float32. e.g. [16.0, 16.0]. None by default. - offset(float32, optional): Prior boxes center offset. 0.5 by default. - Returns: - Tuple: - Anchors(Variable): The output anchors with a layout of [H, W, num_anchors, 4].
- H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each anchor is in (xmin, ymin, xmax, ymax) format and unnormalized. - - Variances(Variable): The expanded variances of anchors - with a layout of [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_anchors is the box count of each position. - Each variance is in (xcenter, ycenter, w, h) format. - Examples: - .. code-block:: python - import paddle.fluid as fluid - conv1 = fluid.data(name='conv1', shape=[None, 48, 16, 16], dtype='float32') - anchor, var = fluid.layers.anchor_generator( - input=conv1, - anchor_sizes=[64, 128, 256, 512], - aspect_ratios=[0.5, 1.0, 2.0], - variance=[0.1, 0.1, 0.2, 0.2], - stride=[16.0, 16.0], - offset=0.5) - """ - - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(anchor_sizes): - anchor_sizes = [anchor_sizes] - if not _is_list_or_tuple_(aspect_ratios): - aspect_ratios = [aspect_ratios] - if not (_is_list_or_tuple_(stride) and len(stride) == 2): - raise ValueError('stride should be a list or tuple ', - 'with length 2, (stride_width, stride_height).') - - anchor_sizes = list(map(float, anchor_sizes)) - aspect_ratios = list(map(float, aspect_ratios)) - stride = list(map(float, stride)) - - if in_dynamic_mode(): - attrs = ('anchor_sizes', anchor_sizes, 'aspect_ratios', aspect_ratios, - 'variances', variance, 'stride', stride, 'offset', offset) - anchor, var = C_ops.anchor_generator(input, *attrs) - return anchor, var - - helper = LayerHelper("anchor_generator", **locals()) - dtype = helper.input_dtype() - attrs = { - 'anchor_sizes': anchor_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'stride': stride, - 'offset': offset - } - - anchor = helper.create_variable_for_type_inference(dtype) - var = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="anchor_generator", - inputs={"Input": input}, - outputs={"Anchors": anchor, - "Variances": var}, - attrs=attrs, ) - anchor.stop_gradient = True - var.stop_gradient = True - return anchor, var - - -@paddle.jit.not_to_static -def distribute_fpn_proposals(fpn_rois, - min_level, - max_level, - refer_level, - refer_scale, - pixel_offset=False, - rois_num=None, - name=None): - r""" - - **This op only takes LoDTensor as input.** In Feature Pyramid Networks - (FPN) models, it is needed to distribute all proposals into different FPN - levels, with respect to the scale of the proposals, the referring scale and the - referring level. Besides, to restore the order of proposals, we return an - array which indicates the original index of rois in current proposals. - To compute the FPN level for each roi, the formula is given as follows: - - .. math:: - - roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} - - level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level) - - where BBoxArea is a function to compute the area of each roi. - Args: - - fpn_rois(Variable): 2-D Tensor with shape [N, 4] and data type is - float32 or float64. The input fpn_rois. - min_level(int32): The lowest level of FPN layer where the proposals come - from. - max_level(int32): The highest level of FPN layer where the proposals - come from. - refer_level(int32): The referring level of FPN layer with specified scale. - refer_scale(int32): The referring scale of FPN layer with specified level. - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. - The shape is [B] and data type is int32.
B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element - is the output RoIs' number of each image on the corresponding level - and the shape is [B]. None by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - - Returns: - Tuple: - - multi_rois(List) : A list of 2-D LoDTensor with shape [M, 4] - and data type of float32 and float64. The length is - max_level-min_level+1. The proposals in each FPN level. - - restore_ind(Variable): A 2-D Tensor with shape [N, 1], N is - the number of total rois. The data type is int32. It is - used to restore the order of fpn_rois. - - rois_num_per_level(List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape - is [B] and data type of int32. B is the number of images - - - Examples: - .. code-block:: python - - import paddle - from ppdet.modeling import ops - paddle.enable_static() - fpn_rois = paddle.static.data( - name='data', shape=[None, 4], dtype='float32', lod_level=1) - multi_rois, restore_ind = ops.distribute_fpn_proposals( - fpn_rois=fpn_rois, - min_level=2, - max_level=5, - refer_level=4, - refer_scale=224) - """ - num_lvl = max_level - min_level + 1 - - if in_dynamic_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', - refer_level, 'refer_scale', refer_scale, 'pixel_offset', - pixel_offset) - multi_rois, restore_ind, rois_num_per_level = C_ops.distribute_fpn_proposals( - fpn_rois, rois_num, num_lvl, num_lvl, *attrs) - - return multi_rois, restore_ind, rois_num_per_level - - else: - check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], - 'distribute_fpn_proposals') - helper = LayerHelper('distribute_fpn_proposals', **locals()) - dtype = helper.input_dtype('fpn_rois') - multi_rois = [ - helper.create_variable_for_type_inference(dtype) - for i in range(num_lvl) - ] - - restore_ind = helper.create_variable_for_type_inference(dtype='int32') - - inputs = {'FpnRois': fpn_rois} - outputs = { - 'MultiFpnRois': multi_rois, - 'RestoreIndex': restore_ind, - } - - if rois_num is not None: - inputs['RoisNum'] = rois_num - rois_num_per_level = [ - helper.create_variable_for_type_inference(dtype='int32') - for i in range(num_lvl) - ] - outputs['MultiLevelRoIsNum'] = rois_num_per_level - else: - rois_num_per_level = None - - helper.append_op( - type='distribute_fpn_proposals', - inputs=inputs, - outputs=outputs, - attrs={ - 'min_level': min_level, - 'max_level': max_level, - 'refer_level': refer_level, - 'refer_scale': refer_scale, - 'pixel_offset': pixel_offset - }) - return multi_rois, restore_ind, rois_num_per_level - - -@paddle.jit.not_to_static -def prior_box(input, - image, - min_sizes, - max_sizes=None, - aspect_ratios=[1.], - variance=[0.1, 0.1, 0.2, 0.2], - flip=False, - clip=False, - steps=[0.0, 0.0], - offset=0.5, - min_max_aspect_ratios_order=False, - name=None): - """ - - This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. - Each position of the input produce N prior boxes, N is determined by - the count of min_sizes, max_sizes and aspect_ratios, The size of the - box is in range(min_size, max_size) interval, which is generated in - sequence according to the aspect_ratios. - - Parameters: - input(Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. 
- image(Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, - the data type should be float32 or float64. - min_sizes(list|tuple|float): the min sizes of generated prior boxes. - max_sizes(list|tuple|None): the max sizes of generated prior boxes. - Default: None. - aspect_ratios(list|tuple|float): the aspect ratios of generated - prior boxes. Default: [1.]. - variance(list|tuple): the variances to be encoded in prior boxes. - Default:[0.1, 0.1, 0.2, 0.2]. - flip(bool): Whether to flip aspect ratios. Default:False. - clip(bool): Whether to clip out-of-boundary boxes. Default: False. - steps(list|tuple): Prior boxes step across width and height. If - steps[0] equals 0.0 or steps[1] equals 0.0, the prior boxes step across - height or width of the input will be automatically calculated. - Default: [0., 0.] - offset(float): Prior boxes center offset. Default: 0.5 - min_max_aspect_ratios_order(bool): If set to True, the output prior box is - in order of [min, max, aspect_ratios], which is consistent with - Caffe. Please note, this order affects the weights order of - the following convolution layer and does not affect the final - detection results. Default: False. - name(str, optional): The default value is None. Normally there is no need for - the user to set this property. For more information, please refer to :ref:`api_guide_Name` - - Returns: - Tuple: A tuple with two Variables (boxes, variances) - - boxes(Tensor): the output prior boxes of PriorBox. - 4-D tensor, the layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total box count of each position of input. - - variances(Tensor): the expanded variances of PriorBox. - 4-D tensor, the layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total box count of each position of input - - Examples: - ..
code-block:: python - - import paddle - from ppdet.modeling import ops - - paddle.enable_static() - input = paddle.static.data(name="input", shape=[None,3,6,9]) - image = paddle.static.data(name="image", shape=[None,3,9,12]) - box, var = ops.prior_box( - input=input, - image=image, - min_sizes=[100.], - clip=True, - flip=True) - """ - helper = LayerHelper("prior_box", **locals()) - dtype = helper.input_dtype() - check_variable_and_dtype( - input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box') - - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(min_sizes): - min_sizes = [min_sizes] - if not _is_list_or_tuple_(aspect_ratios): - aspect_ratios = [aspect_ratios] - if not (_is_list_or_tuple_(steps) and len(steps) == 2): - raise ValueError('steps should be a list or tuple ', - 'with length 2, (step_width, step_height).') - - min_sizes = list(map(float, min_sizes)) - aspect_ratios = list(map(float, aspect_ratios)) - steps = list(map(float, steps)) - - cur_max_sizes = None - if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: - if not _is_list_or_tuple_(max_sizes): - max_sizes = [max_sizes] - cur_max_sizes = max_sizes - - if in_dynamic_mode(): - attrs = ('min_sizes', min_sizes, 'aspect_ratios', aspect_ratios, - 'variances', variance, 'flip', flip, 'clip', clip, 'step_w', - steps[0], 'step_h', steps[1], 'offset', offset, - 'min_max_aspect_ratios_order', min_max_aspect_ratios_order) - if cur_max_sizes is not None: - attrs += ('max_sizes', cur_max_sizes) - box, var = C_ops.prior_box(input, image, *attrs) - return box, var - else: - attrs = { - 'min_sizes': min_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': steps[0], - 'step_h': steps[1], - 'offset': offset, - 'min_max_aspect_ratios_order': min_max_aspect_ratios_order - } - - if cur_max_sizes is not None: - attrs['max_sizes'] = cur_max_sizes - - box = helper.create_variable_for_type_inference(dtype) - var = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="prior_box", - inputs={"Input": input, - "Image": image}, - outputs={"Boxes": box, - "Variances": var}, - attrs=attrs, ) - box.stop_gradient = True - var.stop_gradient = True - return box, var - - -@paddle.jit.not_to_static -def multiclass_nms(bboxes, - scores, - score_threshold, - nms_top_k, - keep_top_k, - nms_threshold=0.3, - normalized=True, - nms_eta=1., - background_label=-1, - return_index=False, - return_rois_num=True, - rois_num=None, - name=None): - """ - This operator performs multi-class non-maximum suppression (NMS) on - boxes and scores. - In the NMS step, this operator greedily selects a subset of detection bounding - boxes that have high scores larger than score_threshold, if this - threshold is provided, then selects the largest nms_top_k confidence scores if nms_top_k - is larger than -1. Then this operator prunes away boxes that have high IOU - (intersection over union) overlap with already selected boxes by adaptive - threshold NMS based on parameters of nms_threshold and nms_eta. - After the NMS step, at most keep_top_k number of total bboxes are to be kept - per image if keep_top_k is larger than -1. - Args: - bboxes (Tensor): Two types of bboxes are supported: - 1. (Tensor) A 3-D Tensor with shape - [N, M, 4 or 8, 16, 24, 32] represents the - predicted locations of M bounding bboxes, - N is the batch size.
Each bounding box has four - coordinate values and the layout is - [xmin, ymin, xmax, ymax], when the box size equals 4. - 2. (LoDTensor) A 3-D Tensor with shape [M, C, 4] - M is the number of bounding boxes, C is the - class number - scores (Tensor): Two types of scores are supported: - 1. (Tensor) A 3-D Tensor with shape [N, C, M] - represents the predicted confidence predictions. - N is the batch size, C is the class number, M is - the number of bounding boxes. For each category there - are in total M scores corresponding to the M bounding - boxes. Please note, M is equal to the 2nd dimension - of BBoxes. - 2. (LoDTensor) A 2-D LoDTensor with shape [M, C]. - M is the number of bboxes, C is the class number. - In this case, input BBoxes should be the second - case with shape [M, C, 4]. - background_label (int): The index of background label, the background - label will be ignored. If set to -1, then all - categories will be considered. Default: 0 - score_threshold (float): Threshold to filter out bounding boxes with - low confidence score. If not provided, - consider all boxes. - nms_top_k (int): Maximum number of detections to be kept according to - the confidences after filtering detections based - on score_threshold. - nms_threshold (float): The threshold to be used in NMS. Default: 0.3 - nms_eta (float): The parameter for adaptive NMS. Default: 1.0 - keep_top_k (int): Number of total bboxes to be kept per image after the NMS - step. -1 means keeping all bboxes after the NMS step. - normalized (bool): Whether detections are normalized. Default: True - return_index(bool): Whether to return the selected index. Default: False - rois_num(Tensor): 1-D Tensor contains the number of RoIs in each image. - The shape is [B] and data type is int32. B is the number of images. - If it is not None then return a list of 1-D Tensor. Each element - is the output RoIs' number of each image on the corresponding level - and the shape is [B]. None by default. - name(str): Name of the multiclass nms op. Default: None. - Returns: - A tuple with two Variables: (Out, Index) if return_index is True, - otherwise, a tuple with one Variable(Out) is returned. - Out: A 2-D LoDTensor with shape [No, 6] represents the detections. - Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] - or A 2-D LoDTensor with shape [No, 10] represents the detections. - Each row has 10 values: [label, confidence, x1, y1, x2, y2, x3, y3, - x4, y4]. No is the total number of detections. - If no results are detected for all images, all elements in LoD will be - 0, and the output tensor is empty (None). - Index: Only returned when return_index is True. A 2-D LoDTensor with - shape [No, 1] represents the selected index, whose type is integer. - The index is the absolute index across batches. No is the same number - as Out. If the index is used to gather other attributes such as age, - one needs to reshape the input(N, M, 1) to (N * M, 1) first, where - N is the batch size and M is the number of boxes. - Examples: - ..
code-block:: python - - import paddle - from ppdet.modeling import ops - boxes = paddle.static.data(name='bboxes', shape=[81, 4], - dtype='float32', lod_level=1) - scores = paddle.static.data(name='scores', shape=[81], - dtype='float32', lod_level=1) - out, index = ops.multiclass_nms(bboxes=boxes, - scores=scores, - background_label=0, - score_threshold=0.5, - nms_top_k=400, - nms_threshold=0.3, - keep_top_k=200, - normalized=False, - return_index=True) - """ - helper = LayerHelper('multiclass_nms3', **locals()) - - if in_dynamic_mode(): - attrs = ('background_label', background_label, 'score_threshold', - score_threshold, 'nms_top_k', nms_top_k, 'nms_threshold', - nms_threshold, 'keep_top_k', keep_top_k, 'nms_eta', nms_eta, - 'normalized', normalized) - output, index, nms_rois_num = C_ops.multiclass_nms3(bboxes, scores, - rois_num, *attrs) - if not return_index: - index = None - return output, nms_rois_num, index - - else: - output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) - index = helper.create_variable_for_type_inference(dtype='int32') - - inputs = {'BBoxes': bboxes, 'Scores': scores} - outputs = {'Out': output, 'Index': index} - - if rois_num is not None: - inputs['RoisNum'] = rois_num - - if return_rois_num: - nms_rois_num = helper.create_variable_for_type_inference( - dtype='int32') - outputs['NmsRoisNum'] = nms_rois_num - - helper.append_op( - type="multiclass_nms3", - inputs=inputs, - attrs={ - 'background_label': background_label, - 'score_threshold': score_threshold, - 'nms_top_k': nms_top_k, - 'nms_threshold': nms_threshold, - 'keep_top_k': keep_top_k, - 'nms_eta': nms_eta, - 'normalized': normalized - }, - outputs=outputs) - output.stop_gradient = True - index.stop_gradient = True - if not return_index: - index = None - if not return_rois_num: - nms_rois_num = None - - return output, nms_rois_num, index - - -@paddle.jit.not_to_static -def matrix_nms(bboxes, - scores, - score_threshold, - post_threshold, - nms_top_k, - keep_top_k, - use_gaussian=False, - gaussian_sigma=2., - background_label=0, - normalized=True, - return_index=False, - return_rois_num=True, - name=None): - """ - **Matrix NMS** - This operator performs matrix non-maximum suppression (Matrix NMS). - It first selects a subset of candidate bounding boxes that have higher scores - than score_threshold (if provided), then the top k candidates are selected if - nms_top_k is larger than -1. Scores of the remaining candidates are then - decayed according to the Matrix NMS scheme. - After the NMS step, at most keep_top_k number of total bboxes are to be kept - per image if keep_top_k is larger than -1. - Args: - bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the - predicted locations of M bounding bboxes, - N is the batch size. Each bounding box has four - coordinate values and the layout is - [xmin, ymin, xmax, ymax], when the box size equals 4. - The data type is float32 or float64. - scores (Tensor): A 3-D Tensor with shape [N, C, M] - represents the predicted confidence predictions. - N is the batch size, C is the class number, M is - the number of bounding boxes. For each category there - are in total M scores corresponding to the M bounding - boxes. Please note, M is equal to the 2nd dimension - of BBoxes. The data type is float32 or float64. - score_threshold (float): Threshold to filter out bounding boxes with - low confidence score. - post_threshold (float): Threshold to filter out bounding boxes with - low confidence score AFTER decaying.
- nms_top_k (int): Maximum number of detections to be kept according to - the confidences after filtering detections based - on score_threshold. - keep_top_k (int): Number of total bboxes to be kept per image after the NMS - step. -1 means keeping all bboxes after the NMS step. - use_gaussian (bool): Use Gaussian as the decay function. Default: False - gaussian_sigma (float): Sigma for the Gaussian decay function. Default: 2.0 - background_label (int): The index of background label, the background - label will be ignored. If set to -1, then all - categories will be considered. Default: 0 - normalized (bool): Whether detections are normalized. Default: True - return_index(bool): Whether to return the selected index. Default: False - return_rois_num(bool): whether to return rois_num. Default: True - name(str): Name of the matrix nms op. Default: None. - Returns: - A tuple with three Tensors: (Out, Index, RoisNum) if return_index is True, - otherwise, a tuple with two Tensors (Out, RoisNum) is returned. - Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - detection results. - Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] - (After version 1.3, when no boxes are detected, the lod is changed - from {0} to {1}) - Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - selected indices, which are absolute values across batches. - rois_num (Tensor): A 1-D Tensor with shape [N] containing - the number of detected boxes in each image. - Examples: - .. code-block:: python - import paddle - from ppdet.modeling import ops - boxes = paddle.static.data(name='bboxes', shape=[None,81, 4], - dtype='float32', lod_level=1) - scores = paddle.static.data(name='scores', shape=[None,81], - dtype='float32', lod_level=1) - out = ops.matrix_nms(bboxes=boxes, scores=scores, background_label=0, - score_threshold=0.5, post_threshold=0.1, - nms_top_k=400, keep_top_k=200, normalized=False) - """ - check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], - 'matrix_nms') - check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], - 'matrix_nms') - check_type(score_threshold, 'score_threshold', float, 'matrix_nms') - check_type(post_threshold, 'post_threshold', float, 'matrix_nms') - check_type(nms_top_k, 'nms_top_k', int, 'matrix_nms') - check_type(keep_top_k, 'keep_top_k', int, 'matrix_nms') - check_type(normalized, 'normalized', bool, 'matrix_nms') - check_type(use_gaussian, 'use_gaussian', bool, 'matrix_nms') - check_type(gaussian_sigma, 'gaussian_sigma', float, 'matrix_nms') - check_type(background_label, 'background_label', int, 'matrix_nms') - - if in_dynamic_mode(): - attrs = ('background_label', background_label, 'score_threshold', - score_threshold, 'post_threshold', post_threshold, 'nms_top_k', - nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', - use_gaussian, 'keep_top_k', keep_top_k, 'normalized', - normalized) - out, index, rois_num = C_ops.matrix_nms(bboxes, scores, *attrs) - if not return_index: - index = None - if not return_rois_num: - rois_num = None - return out, rois_num, index - else: - helper = LayerHelper('matrix_nms', **locals()) - output = helper.create_variable_for_type_inference(dtype=bboxes.dtype) - index = helper.create_variable_for_type_inference(dtype='int32') - outputs = {'Out': output, 'Index': index} - if return_rois_num: - rois_num = helper.create_variable_for_type_inference(dtype='int32') - outputs['RoisNum'] = rois_num - - helper.append_op( - type="matrix_nms", - inputs={'BBoxes': bboxes, - 'Scores': scores}, - attrs={
'background_label': background_label, - 'score_threshold': score_threshold, - 'post_threshold': post_threshold, - 'nms_top_k': nms_top_k, - 'gaussian_sigma': gaussian_sigma, - 'use_gaussian': use_gaussian, - 'keep_top_k': keep_top_k, - 'normalized': normalized - }, - outputs=outputs) - output.stop_gradient = True - - if not return_index: - index = None - if not return_rois_num: - rois_num = None - return output, rois_num, index - - -@paddle.jit.not_to_static -def box_coder(prior_box, - prior_box_var, - target_box, - code_type="encode_center_size", - box_normalized=True, - axis=0, - name=None): - r""" - **Box Coder Layer** - Encode/Decode the target bounding box with the priorbox information. - - The encoding schema is described below: - .. math:: - ox = (tx - px) / pw / pxv - oy = (ty - py) / ph / pyv - ow = \log(|tw / pw|) / pwv - oh = \log(|th / ph|) / phv - The decoding schema is described below: - - .. math:: - - ox = (pw * pxv * tx + px) - tw / 2 - oy = (ph * pyv * ty + py) - th / 2 - ow = \exp(pwv * tw) * pw + tw / 2 - oh = \exp(phv * th) * ph + th / 2 - where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, - width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote - the priorbox's (anchor) center coordinates, width and height. `pxv`, - `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, - `ow`, `oh` denote the encoded/decoded coordinates, width and height. - During Box Decoding, two modes for broadcast are supported. Say the target - box has shape [N, M, 4], and the shape of the prior box can be [N, 4] or - [M, 4]. Then the prior box will broadcast to the target box along the - assigned axis. - - Args: - prior_box(Tensor): Box list prior_box is a 2-D Tensor with shape - [M, 4] which holds M boxes and data type is float32 or float64. Each box - is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the - left top coordinate of the anchor box, if the input is image feature - map, they are close to the origin of the coordinate system. - [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var(List|Tensor|None): prior_box_var supports three types - of input. One is a Tensor with shape [M, 4] which holds M groups and - data type is float32 or float64. The second is a list consisting of - 4 elements shared by all boxes and data type is float32 or float64. - The third is None, which is not involved in the calculation. - target_box(Tensor): This input can be a 2-D LoDTensor with shape - [N, 4] when code_type is 'encode_center_size'. This input also can - be a 3-D Tensor with shape [N, M, 4] when code_type is - 'decode_center_size'. Each box is represented as - [xmin, ymin, xmax, ymax]. The data type is float32 or float64. - code_type(str): The code type used with the target box. It can be - `encode_center_size` or `decode_center_size`. `encode_center_size` - by default. - box_normalized(bool): Whether to treat the priorbox as a normalized box. - Set true by default. - axis(int): Which axis in PriorBox to broadcast for box decode, - for example, if axis is 0 and TargetBox has shape [N, M, 4] and - PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] - for decoding. It is only valid when code type is - `decode_center_size`. Set 0 by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually there is no need to set name and it is - None by default.
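A worked instance of the encoding schema above (pure NumPy; the numeric values are illustrative):

.. code-block:: python

    import numpy as np
    # prior (anchor) box: center (px, py), size (pw, ph), and variances
    px, py, pw, ph = 50.0, 50.0, 20.0, 20.0
    pxv, pyv, pwv, phv = 0.1, 0.1, 0.2, 0.2
    # target box: center (tx, ty), size (tw, th)
    tx, ty, tw, th = 55.0, 45.0, 24.0, 18.0
    ox = (tx - px) / pw / pxv         # 2.5
    oy = (ty - py) / ph / pyv         # -2.5
    ow = np.log(abs(tw / pw)) / pwv   # ~0.91
    oh = np.log(abs(th / ph)) / phv   # ~-0.53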
- - Returns: - Tensor: - output_box(Tensor): When code_type is 'encode_center_size', the - output tensor of box_coder_op with shape [N, M, 4] representing the - result of N target boxes encoded with M Prior boxes and variances. - When code_type is 'decode_center_size', N represents the batch size - and M represents the number of decoded boxes. - - Examples: - - .. code-block:: python - - import paddle - from ppdet.modeling import ops - paddle.enable_static() - # For encode - prior_box_encode = paddle.static.data(name='prior_box_encode', - shape=[512, 4], - dtype='float32') - target_box_encode = paddle.static.data(name='target_box_encode', - shape=[81, 4], - dtype='float32') - output_encode = ops.box_coder(prior_box=prior_box_encode, - prior_box_var=[0.1,0.1,0.2,0.2], - target_box=target_box_encode, - code_type="encode_center_size") - # For decode - prior_box_decode = paddle.static.data(name='prior_box_decode', - shape=[512, 4], - dtype='float32') - target_box_decode = paddle.static.data(name='target_box_decode', - shape=[512, 81, 4], - dtype='float32') - output_decode = ops.box_coder(prior_box=prior_box_decode, - prior_box_var=[0.1,0.1,0.2,0.2], - target_box=target_box_decode, - code_type="decode_center_size", - box_normalized=False, - axis=1) - """ - check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], - 'box_coder') - check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], - 'box_coder') - - if in_dynamic_mode(): - if isinstance(prior_box_var, Variable): - output_box = C_ops.box_coder( - prior_box, prior_box_var, target_box, "code_type", code_type, - "box_normalized", box_normalized, "axis", axis) - - elif isinstance(prior_box_var, list): - output_box = C_ops.box_coder( - prior_box, None, target_box, "code_type", code_type, - "box_normalized", box_normalized, "axis", axis, "variance", - prior_box_var) - else: - raise TypeError( - "Input variance of box_coder must be Variable or list") - return output_box - else: - helper = LayerHelper("box_coder", **locals()) - - output_box = helper.create_variable_for_type_inference( - dtype=prior_box.dtype) - - inputs = {"PriorBox": prior_box, "TargetBox": target_box} - attrs = { - "code_type": code_type, - "box_normalized": box_normalized, - "axis": axis - } - if isinstance(prior_box_var, Variable): - inputs['PriorBoxVar'] = prior_box_var - elif isinstance(prior_box_var, list): - attrs['variance'] = prior_box_var - else: - raise TypeError( - "Input variance of box_coder must be Variable or list") - helper.append_op( - type="box_coder", - inputs=inputs, - attrs=attrs, - outputs={"OutputBox": output_box}) - return output_box - - -@paddle.jit.not_to_static -def generate_proposals(scores, - bbox_deltas, - im_shape, - anchors, - variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0, - pixel_offset=False, - return_rois_num=False, - name=None): - """ - **Generate proposals for Faster-RCNN** - This operation proposes RoIs according to each box's probability - of being a foreground object; - the boxes can be calculated from anchors. The bbox_deltas and objectness - scores are the outputs of the RPN. Final proposals - can be used to train the detection net. - For generating proposals, this operation performs the following steps - (see the schematic sketch after the list): - 1. Transposes and resizes scores and bbox_deltas to the sizes - (H*W*A, 1) and (H*W*A, 4) - 2. Calculate box locations as proposal candidates. - 3. Clip boxes to the image. - 4. Remove predicted boxes with small area. - 5. Apply NMS to get final proposals as output.
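A schematic single-image sketch of these steps (pure NumPy; the helper name and the simplified offset-style decoding in step 2 are illustrative, not the op's real box transform, and the NMS call itself is elided):

.. code-block:: python

    import numpy as np

    def sketch_generate_proposals(scores, deltas, anchors, im_h, im_w,
                                  pre_nms_top_n=6000, min_size=0.1):
        # 1. flatten per-anchor scores/deltas to (H*W*A,) and (H*W*A, 4)
        scores = scores.reshape(-1)
        deltas = deltas.reshape(-1, 4)
        order = scores.argsort()[::-1][:pre_nms_top_n]
        # 2. decode proposal candidates from anchors (simplified)
        boxes = anchors.reshape(-1, 4)[order] + deltas[order]
        # 3. clip to the image
        boxes[:, 0::2] = boxes[:, 0::2].clip(0, im_w)
        boxes[:, 1::2] = boxes[:, 1::2].clip(0, im_h)
        # 4. drop boxes that became too small
        keep = ((boxes[:, 2] - boxes[:, 0]) >= min_size) & \
               ((boxes[:, 3] - boxes[:, 1]) >= min_size)
        # 5. NMS would run here to produce the final proposals
        return boxes[keep], scores[order][keep]

    rois, probs = sketch_generate_proposals(
        np.random.rand(4, 5, 5), np.random.rand(4 * 5 * 5, 4),
        np.random.rand(4 * 5 * 5, 4) * 50, im_h=64, im_w=64)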
- Args: - scores(Tensor): A 4-D Tensor with shape [N, A, H, W] represents - the probability for each box to be an object. - N is batch size, A is number of anchors, H and W are height and - width of the feature map. The data type must be float32. - bbox_deltas(Tensor): A 4-D Tensor with shape [N, 4*A, H, W] - represents the difference between predicted box location and - anchor location. The data type must be float32. - im_shape(Tensor): A 2-D Tensor with shape [N, 2] represents H, W, the - origin image size or input size. The data type can be float32 or - float64. - anchors(Tensor): A 4-D Tensor represents the anchors with a layout - of [H, W, A, 4]. H and W are height and width of the feature map, - num_anchors is the box count of each position. Each anchor is - in (xmin, ymin, xmax, ymax) format and unnormalized. The data type must be float32. - variances(Tensor): A 4-D Tensor. The expanded variances of anchors with a layout of - [H, W, num_priors, 4]. Each variance is in - (xcenter, ycenter, w, h) format. The data type must be float32. - pre_nms_top_n(float): Number of total bboxes to be kept per - image before NMS. The data type must be float32. `6000` by default. - post_nms_top_n(float): Number of total bboxes to be kept per - image after NMS. The data type must be float32. `1000` by default. - nms_thresh(float): Threshold in NMS. The data type must be float32. `0.5` by default. - min_size(float): Remove predicted boxes with either height or - width < min_size. The data type must be float32. `0.1` by default. - eta(float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, - `adaptive_threshold = adaptive_threshold * eta` in each iteration. - return_rois_num(bool): When set to True, it will return a 1D Tensor with shape [N, ] that contains the RoI - number of each image in the batch, where N is the number of images. For example, the tensor values [4, 5] mean that - the first image has 4 RoIs and the second image has 5 RoIs. It is only used in RCNN models. - 'False' by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually there is no need to set name and it is - None by default. - - Returns: - tuple: - A tuple with format ``(rpn_rois, rpn_roi_probs)``. - - **rpn_rois**: The generated RoIs. 2-D Tensor with shape ``[N, 4]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``. - - **rpn_roi_probs**: The scores of generated RoIs. 2-D Tensor with shape ``[N, 1]`` where ``N`` is the number of RoIs. The data type is the same as ``scores``. - - Examples: - .. code-block:: python - - import paddle - from ppdet.modeling import ops - paddle.enable_static() - scores = paddle.static.data(name='scores', shape=[None, 4, 5, 5], dtype='float32') - bbox_deltas = paddle.static.data(name='bbox_deltas', shape=[None, 16, 5, 5], dtype='float32') - im_shape = paddle.static.data(name='im_shape', shape=[None, 2], dtype='float32') - anchors = paddle.static.data(name='anchors', shape=[None, 5, 4, 4], dtype='float32') - variances = paddle.static.data(name='variances', shape=[None, 5, 10, 4], dtype='float32') - rois, roi_probs = ops.generate_proposals(scores, bbox_deltas, - im_shape, anchors, variances) - """ - if in_dynamic_mode(): - assert return_rois_num, "return_rois_num should be True in dygraph mode."
- attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, - 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, - 'pixel_offset', pixel_offset) - rpn_rois, rpn_roi_probs, rpn_rois_num = C_ops.generate_proposals_v2( - scores, bbox_deltas, im_shape, anchors, variances, *attrs) - if not return_rois_num: - rpn_rois_num = None - return rpn_rois, rpn_roi_probs, rpn_rois_num - - else: - helper = LayerHelper('generate_proposals_v2', **locals()) - - check_variable_and_dtype(scores, 'scores', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(im_shape, 'im_shape', ['float32', 'float64'], - 'generate_proposals_v2') - check_variable_and_dtype(anchors, 'anchors', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(variances, 'variances', ['float32'], - 'generate_proposals_v2') - - rpn_rois = helper.create_variable_for_type_inference( - dtype=bbox_deltas.dtype) - rpn_roi_probs = helper.create_variable_for_type_inference( - dtype=scores.dtype) - outputs = { - 'RpnRois': rpn_rois, - 'RpnRoiProbs': rpn_roi_probs, - } - if return_rois_num: - rpn_rois_num = helper.create_variable_for_type_inference( - dtype='int32') - rpn_rois_num.stop_gradient = True - outputs['RpnRoisNum'] = rpn_rois_num - - helper.append_op( - type="generate_proposals_v2", - inputs={ - 'Scores': scores, - 'BboxDeltas': bbox_deltas, - 'ImShape': im_shape, - 'Anchors': anchors, - 'Variances': variances - }, - attrs={ - 'pre_nms_topN': pre_nms_top_n, - 'post_nms_topN': post_nms_top_n, - 'nms_thresh': nms_thresh, - 'min_size': min_size, - 'eta': eta, - 'pixel_offset': pixel_offset - }, - outputs=outputs) - rpn_rois.stop_gradient = True - rpn_roi_probs.stop_gradient = True - if not return_rois_num: - rpn_rois_num = None - - return rpn_rois, rpn_roi_probs, rpn_rois_num - - -def sigmoid_cross_entropy_with_logits(input, - label, - ignore_index=-100, - normalize=False): - output = F.binary_cross_entropy_with_logits(input, label, reduction='none') - mask_tensor = paddle.cast(label != ignore_index, 'float32') - output = paddle.multiply(output, mask_tensor) - if normalize: - sum_valid_mask = paddle.sum(mask_tensor) - output = output / sum_valid_mask - return output - - -def smooth_l1(input, label, inside_weight=None, outside_weight=None, - sigma=None): - input_new = paddle.multiply(input, inside_weight) - label_new = paddle.multiply(label, inside_weight) - delta = 1 / (sigma * sigma) - out = F.smooth_l1_loss(input_new, label_new, reduction='none', delta=delta) - out = paddle.multiply(out, outside_weight) - out = out / delta - out = paddle.reshape(out, shape=[out.shape[0], -1]) - out = paddle.sum(out, axis=1) - return out - - -def channel_shuffle(x, groups): - batch_size, num_channels, height, width = x.shape[0:4] - assert num_channels % groups == 0, 'num_channels should be divisible by groups' - channels_per_group = num_channels // groups - x = paddle.reshape( - x=x, shape=[batch_size, groups, channels_per_group, height, width]) - x = paddle.transpose(x=x, perm=[0, 2, 1, 3, 4]) - x = paddle.reshape(x=x, shape=[batch_size, num_channels, height, width]) - return x - - -def get_static_shape(tensor): - shape = paddle.shape(tensor) - shape.stop_gradient = True - return shape diff --git a/pdfdet/models/Paddle/ppdet/modeling/post_process.py b/pdfdet/models/Paddle/ppdet/modeling/post_process.py deleted file mode 100644 index efde830..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/post_process.py +++ 
/dev/null @@ -1,801 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register -from ppdet.modeling.bbox_utils import nonempty_bbox -from .transformers import bbox_cxcywh_to_xyxy -try: - from collections.abc import Sequence -except Exception: - from collections import Sequence - -__all__ = [ - 'BBoxPostProcess', 'MaskPostProcess', 'JDEBBoxPostProcess', - 'CenterNetPostProcess', 'DETRPostProcess', 'SparsePostProcess', - 'DETRBBoxSemiPostProcess' -] - - -@register -class BBoxPostProcess(object): - __shared__ = ['num_classes', 'export_onnx', 'export_eb'] - __inject__ = ['decode', 'nms'] - - def __init__(self, - num_classes=80, - decode=None, - nms=None, - export_onnx=False, - export_eb=False): - super(BBoxPostProcess, self).__init__() - self.num_classes = num_classes - self.decode = decode - self.nms = nms - self.export_onnx = export_onnx - self.export_eb = export_eb - - def __call__(self, head_out, rois, im_shape, scale_factor): - """ - Decode the bbox and do NMS if needed. - - Args: - head_out (tuple): bbox_pred and cls_prob of bbox_head output. - rois (tuple): roi and rois_num of rpn_head output. - im_shape (Tensor): The shape of the input image. - scale_factor (Tensor): The scale factor of the input image. - export_onnx (bool): whether export model to onnx - Returns: - bbox_pred (Tensor): The output prediction with shape [N, 6], including - labels, scores and bboxes. The size of bboxes are corresponding - to the input image, the bboxes may be used in other branch. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [1], and is N. - """ - if self.nms is not None: - bboxes, score = self.decode(head_out, rois, im_shape, scale_factor) - bbox_pred, bbox_num, before_nms_indexes = self.nms(bboxes, score, - self.num_classes) - - else: - bbox_pred, bbox_num = self.decode(head_out, rois, im_shape, - scale_factor) - - if self.export_onnx: - # add fake box after postprocess when exporting onnx - fake_bboxes = paddle.to_tensor( - np.array( - [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) - - bbox_pred = paddle.concat([bbox_pred, fake_bboxes]) - bbox_num = bbox_num + 1 - - if self.nms is not None: - return bbox_pred, bbox_num, before_nms_indexes - else: - return bbox_pred, bbox_num - - def get_pred(self, bboxes, bbox_num, im_shape, scale_factor): - """ - Rescale, clip and filter the bbox from the output of NMS to - get final prediction. - - Notes: - Currently only support bs = 1. - - Args: - bboxes (Tensor): The output bboxes with shape [N, 6] after decode - and NMS, including labels, scores and bboxes. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [1], and is N. - im_shape (Tensor): The shape of the input image. - scale_factor (Tensor): The scale factor of the input image. 
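The rescale-and-clip core of this method, sketched in NumPy (values are illustrative; each bboxes row holds [label, score, x1, y1, x2, y2] and scale_factor holds [scale_y, scale_x]):

.. code-block:: python

    import numpy as np
    bboxes = np.array([[0., 0.9, 20., 40., 220., 340.]], dtype='float32')
    im_shape, scale_factor = np.array([400., 600.]), np.array([2.0, 2.0])
    origin_hw = np.floor(im_shape / scale_factor + 0.5)   # [200., 300.]
    xyxy = bboxes[:, 2:] / scale_factor[[1, 0, 1, 0]]     # back to origin scale
    xyxy[:, 0::2] = xyxy[:, 0::2].clip(0, origin_hw[1])   # clip x to width
    xyxy[:, 1::2] = xyxy[:, 1::2].clip(0, origin_hw[0])   # clip y to height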
- Returns: - pred_result (Tensor): The final prediction results with shape [N, 6] - including labels, scores and bboxes. - """ - if self.export_eb: - # enable rcnn models for edgeboard hw to skip the following postprocess. - return bboxes, bboxes, bbox_num - - if not self.export_onnx: - bboxes_list = [] - bbox_num_list = [] - id_start = 0 - fake_bboxes = paddle.to_tensor( - np.array( - [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32')) - fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - - # add fake bbox when output is empty for each batch - for i in range(bbox_num.shape[0]): - if bbox_num[i] == 0: - bboxes_i = fake_bboxes - bbox_num_i = fake_bbox_num - else: - bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] - bbox_num_i = bbox_num[i:i + 1] - id_start += bbox_num[i:i + 1] - bboxes_list.append(bboxes_i) - bbox_num_list.append(bbox_num_i) - bboxes = paddle.concat(bboxes_list) - bbox_num = paddle.concat(bbox_num_list) - - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - - if not self.export_onnx: - origin_shape_list = [] - scale_factor_list = [] - # scale_factor: scale_y, scale_x - for i in range(bbox_num.shape[0]): - expand_shape = paddle.expand(origin_shape[i:i + 1, :], - [bbox_num[i:i + 1], 2]) - scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2] - scale = paddle.concat([scale_x, scale_y, scale_x, scale_y]) - expand_scale = paddle.expand(scale, [bbox_num[i:i + 1], 4]) - origin_shape_list.append(expand_shape) - scale_factor_list.append(expand_scale) - - self.origin_shape_list = paddle.concat(origin_shape_list) - scale_factor_list = paddle.concat(scale_factor_list) - - else: - # simplify the computation for bs=1 when exporting onnx - scale_y, scale_x = scale_factor[0][0], scale_factor[0][1] - scale = paddle.concat( - [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0) - self.origin_shape_list = paddle.expand(origin_shape, - [bbox_num[0:1], 2]) - scale_factor_list = paddle.expand(scale, [bbox_num[0:1], 4]) - - # bboxes: [N, 6], label, score, bbox - pred_label = bboxes[:, 0:1] - pred_score = bboxes[:, 1:2] - pred_bbox = bboxes[:, 2:] - # rescale bbox to original image - scaled_bbox = pred_bbox / scale_factor_list - origin_h = self.origin_shape_list[:, 0] - origin_w = self.origin_shape_list[:, 1] - zeros = paddle.zeros_like(origin_h) - # clip bbox to [0, original_size] - x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros) - y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros) - x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros) - y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros) - pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1) - # filter empty bbox - keep_mask = nonempty_bbox(pred_bbox, return_mask=True) - keep_mask = paddle.unsqueeze(keep_mask, [1]) - pred_label = paddle.where(keep_mask, pred_label, - paddle.ones_like(pred_label) * -1) - pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1) - return bboxes, pred_result, bbox_num - - def get_origin_shape(self, ): - return self.origin_shape_list - - -@register -class MaskPostProcess(object): - __shared__ = ['export_onnx', 'assign_on_cpu'] - """ - refer to: - https://github.com/facebookresearch/detectron2/layers/mask_ops.py - - Get Mask output according to the output from model - """ - - def __init__(self, - binary_thresh=0.5, - export_onnx=False, - assign_on_cpu=False): - super(MaskPostProcess, self).__init__() - self.binary_thresh = binary_thresh - self.export_onnx = export_onnx - 
self.assign_on_cpu = assign_on_cpu - - def __call__(self, mask_out, bboxes, bbox_num, origin_shape): - """ - Decode the mask_out and paste the mask to the origin image. - - Args: - mask_out (Tensor): mask_head output with shape [N, 28, 28]. - bbox_pred (Tensor): The output bboxes with shape [N, 6] after decode - and NMS, including labels, scores and bboxes. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [1], and is N. - origin_shape (Tensor): The origin shape of the input image, the tensor - shape is [N, 2], and each row is [h, w]. - Returns: - pred_result (Tensor): The final prediction mask results with shape - [N, h, w] in binary mask style. - """ - num_mask = mask_out.shape[0] - origin_shape = paddle.cast(origin_shape, 'int32') - device = paddle.device.get_device() - - if self.export_onnx: - h, w = origin_shape[0][0], origin_shape[0][1] - mask_onnx = paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], h, w, - self.assign_on_cpu) - mask_onnx = mask_onnx >= self.binary_thresh - pred_result = paddle.cast(mask_onnx, 'int32') - - else: - max_h = paddle.max(origin_shape[:, 0]) - max_w = paddle.max(origin_shape[:, 1]) - pred_result = paddle.zeros( - [num_mask, max_h, max_w], dtype='int32') - 1 - - id_start = 0 - for i in range(paddle.shape(bbox_num)[0]): - bboxes_i = bboxes[id_start:id_start + bbox_num[i], :] - mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :] - im_h = origin_shape[i, 0] - im_w = origin_shape[i, 1] - pred_mask = paste_mask(mask_out_i[:, None, :, :], - bboxes_i[:, 2:], im_h, im_w, - self.assign_on_cpu) - pred_mask = paddle.cast(pred_mask >= self.binary_thresh, - 'int32') - pred_result[id_start:id_start + bbox_num[i], :im_h, : - im_w] = pred_mask - id_start += bbox_num[i] - if self.assign_on_cpu: - paddle.set_device(device) - - return pred_result - - -@register -class JDEBBoxPostProcess(nn.Layer): - __shared__ = ['num_classes'] - __inject__ = ['decode', 'nms'] - - def __init__(self, num_classes=1, decode=None, nms=None, return_idx=True): - super(JDEBBoxPostProcess, self).__init__() - self.num_classes = num_classes - self.decode = decode - self.nms = nms - self.return_idx = return_idx - - self.fake_bbox_pred = paddle.to_tensor( - np.array( - [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32')) - self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32')) - self.fake_nms_keep_idx = paddle.to_tensor( - np.array( - [[0]], dtype='int32')) - - self.fake_yolo_boxes_out = paddle.to_tensor( - np.array( - [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32')) - self.fake_yolo_scores_out = paddle.to_tensor( - np.array( - [[[0.0]]], dtype='float32')) - self.fake_boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64')) - - def forward(self, head_out, anchors): - """ - Decode the bbox and do NMS for JDE model. - - Args: - head_out (list): Bbox_pred and cls_prob of bbox_head output. - anchors (list): Anchors of JDE model. - - Returns: - boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'. - bbox_pred (Tensor): The output is the prediction with shape [N, 6] - including labels, scores and bboxes. - bbox_num (Tensor): The number of prediction of each batch with shape [N]. - nms_keep_idx (Tensor): The index of kept bboxes after NMS. 
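Note: when the decoder finds no boxes, the method below substitutes fixed fake outputs so every returned tensor keeps a stable shape. A minimal sketch of that padding pattern (the helper name is illustrative, NumPy stands in for paddle tensors):

.. code-block:: python

    import numpy as np

    def pad_empty(bbox_pred):
        # fall back to one fake row [label, score, x1, y1, x2, y2]
        fake_bbox = np.array([[-1., 0., 0., 0., 0., 0.]], dtype='float32')
        if bbox_pred.shape[0] == 0:
            return fake_bbox, np.array([1], dtype='int32')
        return bbox_pred, np.array([bbox_pred.shape[0]], dtype='int32')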
- """ - boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors) - - if len(boxes_idx) == 0: - boxes_idx = self.fake_boxes_idx - yolo_boxes_out = self.fake_yolo_boxes_out - yolo_scores_out = self.fake_yolo_scores_out - else: - yolo_boxes = paddle.gather_nd(yolo_boxes_scores, boxes_idx) - # TODO: only support bs=1 now - yolo_boxes_out = paddle.reshape( - yolo_boxes[:, :4], shape=[1, len(boxes_idx), 4]) - yolo_scores_out = paddle.reshape( - yolo_boxes[:, 4:5], shape=[1, 1, len(boxes_idx)]) - boxes_idx = boxes_idx[:, 1:] - - if self.return_idx: - bbox_pred, bbox_num, nms_keep_idx = self.nms( - yolo_boxes_out, yolo_scores_out, self.num_classes) - if bbox_pred.shape[0] == 0: - bbox_pred = self.fake_bbox_pred - bbox_num = self.fake_bbox_num - nms_keep_idx = self.fake_nms_keep_idx - return boxes_idx, bbox_pred, bbox_num, nms_keep_idx - else: - bbox_pred, bbox_num, _ = self.nms(yolo_boxes_out, yolo_scores_out, - self.num_classes) - if bbox_pred.shape[0] == 0: - bbox_pred = self.fake_bbox_pred - bbox_num = self.fake_bbox_num - return _, bbox_pred, bbox_num, _ - - -@register -class CenterNetPostProcess(object): - """ - Postprocess the model outputs to get final prediction: - 1. Do NMS for heatmap to get top `max_per_img` bboxes. - 2. Decode bboxes using center offset and box size. - 3. Rescale decoded bboxes reference to the origin image shape. - Args: - max_per_img(int): the maximum number of predicted objects in a image, - 500 by default. - down_ratio(int): the down ratio from images to heatmap, 4 by default. - regress_ltrb (bool): whether to regress left/top/right/bottom or - width/height for a box, true by default. - """ - __shared__ = ['down_ratio'] - - def __init__(self, max_per_img=500, down_ratio=4, regress_ltrb=True): - super(CenterNetPostProcess, self).__init__() - self.max_per_img = max_per_img - self.down_ratio = down_ratio - self.regress_ltrb = regress_ltrb - # _simple_nms() _topk() are same as TTFBox in ppdet/modeling/layers.py - - def _simple_nms(self, heat, kernel=3): - """ Use maxpool to filter the max score, get local peaks. """ - pad = (kernel - 1) // 2 - hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad) - keep = paddle.cast(hmax == heat, 'float32') - return heat * keep - - def _topk(self, scores): - """ Select top k scores and decode to get xy coordinates. 
""" - k = self.max_per_img - shape_fm = paddle.shape(scores) - shape_fm.stop_gradient = True - cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3] - # batch size is 1 - scores_r = paddle.reshape(scores, [cat, -1]) - topk_scores, topk_inds = paddle.topk(scores_r, k) - topk_ys = topk_inds // width - topk_xs = topk_inds % width - - topk_score_r = paddle.reshape(topk_scores, [-1]) - topk_score, topk_ind = paddle.topk(topk_score_r, k) - k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64') - topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32') - - topk_inds = paddle.reshape(topk_inds, [-1]) - topk_ys = paddle.reshape(topk_ys, [-1, 1]) - topk_xs = paddle.reshape(topk_xs, [-1, 1]) - topk_inds = paddle.gather(topk_inds, topk_ind) - topk_ys = paddle.gather(topk_ys, topk_ind) - topk_xs = paddle.gather(topk_xs, topk_ind) - return topk_score, topk_inds, topk_clses, topk_ys, topk_xs - - def __call__(self, hm, wh, reg, im_shape, scale_factor): - # 1.get clses and scores, note that hm had been done sigmoid - heat = self._simple_nms(hm) - scores, inds, topk_clses, ys, xs = self._topk(heat) - clses = topk_clses.unsqueeze(1) - scores = scores.unsqueeze(1) - - # 2.get bboxes, note only support batch_size=1 now - reg_t = paddle.transpose(reg, [0, 2, 3, 1]) - reg = paddle.reshape(reg_t, [-1, reg_t.shape[-1]]) - reg = paddle.gather(reg, inds) - xs = paddle.cast(xs, 'float32') - ys = paddle.cast(ys, 'float32') - xs = xs + reg[:, 0:1] - ys = ys + reg[:, 1:2] - wh_t = paddle.transpose(wh, [0, 2, 3, 1]) - wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]]) - wh = paddle.gather(wh, inds) - if self.regress_ltrb: - x1 = xs - wh[:, 0:1] - y1 = ys - wh[:, 1:2] - x2 = xs + wh[:, 2:3] - y2 = ys + wh[:, 3:4] - else: - x1 = xs - wh[:, 0:1] / 2 - y1 = ys - wh[:, 1:2] / 2 - x2 = xs + wh[:, 0:1] / 2 - y2 = ys + wh[:, 1:2] / 2 - n, c, feat_h, feat_w = paddle.shape(hm) - padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2 - padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2 - x1 = x1 * self.down_ratio - y1 = y1 * self.down_ratio - x2 = x2 * self.down_ratio - y2 = y2 * self.down_ratio - x1 = x1 - padw - y1 = y1 - padh - x2 = x2 - padw - y2 = y2 - padh - bboxes = paddle.concat([x1, y1, x2, y2], axis=1) - scale_y = scale_factor[:, 0:1] - scale_x = scale_factor[:, 1:2] - scale_expand = paddle.concat( - [scale_x, scale_y, scale_x, scale_y], axis=1) - boxes_shape = bboxes.shape[:] - scale_expand = paddle.expand(scale_expand, shape=boxes_shape) - bboxes = paddle.divide(bboxes, scale_expand) - - results = paddle.concat([clses, scores, bboxes], axis=1) - return results, paddle.shape(results)[0:1], inds, topk_clses, ys, xs - - -@register -class DETRPostProcess(object): - __shared__ = ['num_classes', 'use_focal_loss', 'with_mask'] - __inject__ = [] - - def __init__(self, - num_classes=80, - num_top_queries=100, - dual_queries=False, - dual_groups=0, - use_focal_loss=False, - with_mask=False, - mask_threshold=0.5, - use_avg_mask_score=False, - bbox_decode_type='origin'): - super(DETRPostProcess, self).__init__() - assert bbox_decode_type in ['origin', 'pad'] - - self.num_classes = num_classes - self.num_top_queries = num_top_queries - self.dual_queries = dual_queries - self.dual_groups = dual_groups - self.use_focal_loss = use_focal_loss - self.with_mask = with_mask - self.mask_threshold = mask_threshold - self.use_avg_mask_score = use_avg_mask_score - self.bbox_decode_type = bbox_decode_type - - def _mask_postprocess(self, mask_pred, score_pred, index): - mask_score = F.sigmoid(paddle.gather_nd(mask_pred, 
index)) - mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype) - if self.use_avg_mask_score: - avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / ( - mask_pred.sum([-2, -1]) + 1e-6) - score_pred *= avg_mask_score - - return mask_pred[0].astype('int32'), score_pred - - def __call__(self, head_out, im_shape, scale_factor, pad_shape): - """ - Decode the bbox and mask. - - Args: - head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. - im_shape (Tensor): The shape of the input image without padding. - scale_factor (Tensor): The scale factor of the input image. - pad_shape (Tensor): The shape of the input image with padding. - Returns: - bbox_pred (Tensor): The output prediction with shape [N, 6], including - labels, scores and bboxes. The size of bboxes are corresponding - to the input image, the bboxes may be used in other branch. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [bs], and is N. - """ - bboxes, logits, masks = head_out - if self.dual_queries: - num_queries = logits.shape[1] - logits, bboxes = logits[:, :int(num_queries // (self.dual_groups + 1)), :], \ - bboxes[:, :int(num_queries // (self.dual_groups + 1)), :] - - bbox_pred = bbox_cxcywh_to_xyxy(bboxes) - # calculate the original shape of the image - origin_shape = paddle.floor(im_shape / scale_factor + 0.5) - img_h, img_w = paddle.split(origin_shape, 2, axis=-1) - if self.bbox_decode_type == 'pad': - # calculate the shape of the image with padding - out_shape = pad_shape / im_shape * origin_shape - out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1) - elif self.bbox_decode_type == 'origin': - out_shape = origin_shape.flip(1).tile([1, 2]).unsqueeze(1) - else: - raise Exception( - f'Wrong `bbox_decode_type`: {self.bbox_decode_type}.') - bbox_pred *= out_shape - - scores = F.sigmoid(logits) if self.use_focal_loss else F.softmax( - logits)[:, :, :-1] - - if not self.use_focal_loss: - scores, labels = scores.max(-1), scores.argmax(-1) - if scores.shape[1] > self.num_top_queries: - scores, index = paddle.topk( - scores, self.num_top_queries, axis=-1) - batch_ind = paddle.arange( - end=scores.shape[0]).unsqueeze(-1).tile( - [1, self.num_top_queries]) - index = paddle.stack([batch_ind, index], axis=-1) - labels = paddle.gather_nd(labels, index) - bbox_pred = paddle.gather_nd(bbox_pred, index) - else: - scores, index = paddle.topk( - scores.flatten(1), self.num_top_queries, axis=-1) - labels = index % self.num_classes - index = index // self.num_classes - batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( - [1, self.num_top_queries]) - index = paddle.stack([batch_ind, index], axis=-1) - bbox_pred = paddle.gather_nd(bbox_pred, index) - - mask_pred = None - if self.with_mask: - assert masks is not None - masks = F.interpolate( - masks, scale_factor=4, mode="bilinear", align_corners=False) - # TODO: Support prediction with bs>1. - # remove padding for input image - h, w = im_shape.astype('int32')[0] - masks = masks[..., :h, :w] - # get pred_mask in the original resolution. 
- img_h = img_h[0].astype('int32') - img_w = img_w[0].astype('int32') - masks = F.interpolate( - masks, - size=(img_h, img_w), - mode="bilinear", - align_corners=False) - mask_pred, scores = self._mask_postprocess(masks, scores, index) - - bbox_pred = paddle.concat( - [ - labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1), - bbox_pred - ], - axis=-1) - bbox_num = paddle.to_tensor( - self.num_top_queries, dtype='int32').tile([bbox_pred.shape[0]]) - bbox_pred = bbox_pred.reshape([-1, 6]) - return bbox_pred, bbox_num, mask_pred - - -@register -class SparsePostProcess(object): - __shared__ = ['num_classes', 'assign_on_cpu'] - - def __init__(self, - num_proposals, - num_classes=80, - binary_thresh=0.5, - assign_on_cpu=False): - super(SparsePostProcess, self).__init__() - self.num_classes = num_classes - self.num_proposals = num_proposals - self.binary_thresh = binary_thresh - self.assign_on_cpu = assign_on_cpu - - def __call__(self, scores, bboxes, scale_factor, ori_shape, masks=None): - assert len(scores) == len(bboxes) == \ - len(ori_shape) == len(scale_factor) - device = paddle.device.get_device() - batch_size = len(ori_shape) - - scores = F.sigmoid(scores) - has_mask = masks is not None - if has_mask: - masks = F.sigmoid(masks) - masks = masks.reshape([batch_size, -1, *masks.shape[1:]]) - - bbox_pred = [] - mask_pred = [] if has_mask else None - bbox_num = paddle.zeros([batch_size], dtype='int32') - for i in range(batch_size): - score = scores[i] - bbox = bboxes[i] - score, indices = score.flatten(0, 1).topk( - self.num_proposals, sorted=False) - label = indices % self.num_classes - if has_mask: - mask = masks[i] - mask = mask.flatten(0, 1)[indices] - - H, W = ori_shape[i][0], ori_shape[i][1] - bbox = bbox[paddle.cast(indices / self.num_classes, indices.dtype)] - bbox /= scale_factor[i] - bbox[:, 0::2] = paddle.clip(bbox[:, 0::2], 0, W) - bbox[:, 1::2] = paddle.clip(bbox[:, 1::2], 0, H) - - keep = ((bbox[:, 2] - bbox[:, 0]).numpy() > 1.) & \ - ((bbox[:, 3] - bbox[:, 1]).numpy() > 1.) - if keep.sum() == 0: - bbox = paddle.zeros([1, 6], dtype='float32') - if has_mask: - mask = paddle.zeros([1, H, W], dtype='uint8') - else: - label = paddle.to_tensor(label.numpy()[keep]).astype( - 'float32').unsqueeze(-1) - score = paddle.to_tensor(score.numpy()[keep]).astype( - 'float32').unsqueeze(-1) - bbox = paddle.to_tensor(bbox.numpy()[keep]).astype('float32') - if has_mask: - mask = paddle.to_tensor(mask.numpy()[keep]).astype( - 'float32').unsqueeze(1) - mask = paste_mask(mask, bbox, H, W, self.assign_on_cpu) - mask = paddle.cast(mask >= self.binary_thresh, 'uint8') - bbox = paddle.concat([label, score, bbox], axis=-1) - - bbox_num[i] = bbox.shape[0] - bbox_pred.append(bbox) - if has_mask: - mask_pred.append(mask) - - bbox_pred = paddle.concat(bbox_pred) - mask_pred = paddle.concat(mask_pred) if has_mask else None - - if self.assign_on_cpu: - paddle.set_device(device) - - if has_mask: - return bbox_pred, bbox_num, mask_pred - else: - return bbox_pred, bbox_num - - -def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False): - """ - Paste the mask prediction to the original image. 
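The per-image selection in `SparsePostProcess` above follows the same flatten-and-topk idea, with the extra step of clipping the gathered boxes to the original image. A hedged NumPy sketch: `keep`, the image size, and the random inputs are invented for the demo, and integer division stands in for the `indices / num_classes` cast in the original.

```python
import numpy as np

# Toy per-image selection in the style of SparsePostProcess.__call__.
num_proposals, num_classes, keep = 5, 4, 3   # 'keep' is invented for the demo
rng = np.random.default_rng(1)
score = rng.random((num_proposals, num_classes))
bbox = rng.random((num_proposals, 4)) * 100  # xyxy boxes, arbitrary scale

flat = score.reshape(-1)
indices = np.argsort(flat)[::-1][:keep]
label = indices % num_classes
bbox_sel = bbox[indices // num_classes]      # one proposal can serve several classes
H, W = 80, 120                               # hypothetical original image size
bbox_sel[:, 0::2] = bbox_sel[:, 0::2].clip(0, W)  # clip x coordinates
bbox_sel[:, 1::2] = bbox_sel[:, 1::2].clip(0, H)  # clip y coordinates
print(label, flat[indices], bbox_sel)
```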
- """ - x0_int, y0_int = 0, 0 - x1_int, y1_int = im_w, im_h - x0, y0, x1, y1 = paddle.split(boxes, 4, axis=1) - N = masks.shape[0] - img_y = paddle.arange(y0_int, y1_int) + 0.5 - img_x = paddle.arange(x0_int, x1_int) + 0.5 - - img_y = (img_y - y0) / (y1 - y0) * 2 - 1 - img_x = (img_x - x0) / (x1 - x0) * 2 - 1 - # img_x, img_y have shapes (N, w), (N, h) - - if assign_on_cpu: - paddle.set_device('cpu') - gx = img_x[:, None, :].expand( - [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) - gy = img_y[:, :, None].expand( - [N, paddle.shape(img_y)[1], paddle.shape(img_x)[1]]) - grid = paddle.stack([gx, gy], axis=3) - img_masks = F.grid_sample(masks, grid, align_corners=False) - return img_masks[:, 0] - - -def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'): - final_boxes = [] - for c in range(num_classes): - idxs = bboxs[:, 0] == c - if np.count_nonzero(idxs) == 0: continue - r = nms(bboxs[idxs, 1:], match_threshold, match_metric) - final_boxes.append(np.concatenate([np.full((r.shape[0], 1), c), r], 1)) - return final_boxes - - -def nms(dets, match_threshold=0.6, match_metric='iou'): - """ Apply NMS to avoid detecting too many overlapping bounding boxes. - Args: - dets: shape [N, 5], [score, x1, y1, x2, y2] - match_metric: 'iou' or 'ios' - match_threshold: overlap thresh for match metric. - """ - if dets.shape[0] == 0: - return dets[[], :] - scores = dets[:, 0] - x1 = dets[:, 1] - y1 = dets[:, 2] - x2 = dets[:, 3] - y2 = dets[:, 4] - areas = (x2 - x1 + 1) * (y2 - y1 + 1) - order = scores.argsort()[::-1] - - ndets = dets.shape[0] - suppressed = np.zeros((ndets), dtype=np.int32) - - for _i in range(ndets): - i = order[_i] - if suppressed[i] == 1: - continue - ix1 = x1[i] - iy1 = y1[i] - ix2 = x2[i] - iy2 = y2[i] - iarea = areas[i] - for _j in range(_i + 1, ndets): - j = order[_j] - if suppressed[j] == 1: - continue - xx1 = max(ix1, x1[j]) - yy1 = max(iy1, y1[j]) - xx2 = min(ix2, x2[j]) - yy2 = min(iy2, y2[j]) - w = max(0.0, xx2 - xx1 + 1) - h = max(0.0, yy2 - yy1 + 1) - inter = w * h - if match_metric == 'iou': - union = iarea + areas[j] - inter - match_value = inter / union - elif match_metric == 'ios': - smaller = min(iarea, areas[j]) - match_value = inter / smaller - else: - raise ValueError() - if match_value >= match_threshold: - suppressed[j] = 1 - keep = np.where(suppressed == 0)[0] - dets = dets[keep, :] - return dets - - -@register -class DETRBBoxSemiPostProcess(object): - __shared__ = ['num_classes', 'use_focal_loss'] - __inject__ = [] - - def __init__(self, - num_classes=80, - num_top_queries=100, - use_focal_loss=False): - super(DETRBBoxSemiPostProcess, self).__init__() - self.num_classes = num_classes - self.num_top_queries = num_top_queries - self.use_focal_loss = use_focal_loss - - def __call__(self, head_out): - """ - Decode the bbox. - Args: - head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output. - im_shape (Tensor): The shape of the input image. - scale_factor (Tensor): The scale factor of the input image. - Returns: - bbox_pred (Tensor): The output prediction with shape [N, 6], including - labels, scores and bboxes. The size of bboxes are corresponding - to the input image, the bboxes may be used in other branch. - bbox_num (Tensor): The number of prediction boxes of each batch with - shape [bs], and is N. 
- """ - bboxes, logits, masks = head_out - bbox_pred = bboxes - - scores = F.softmax(logits, axis=2) - - import copy - soft_scores = copy.deepcopy(scores) - scores, index = paddle.topk(scores.max(-1), 300, axis=-1) - - batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile( - [1, 300]) - index = paddle.stack([batch_ind, index], axis=-1) - labels = paddle.gather_nd(soft_scores.argmax(-1), index).astype('int32') - score_class = paddle.gather_nd(soft_scores, index) - bbox_pred = paddle.gather_nd(bbox_pred, index) - bbox_pred = paddle.concat( - [ - labels.unsqueeze(-1).astype('float32'), score_class, - scores.unsqueeze(-1), bbox_pred - ], - axis=-1) - bbox_num = paddle.to_tensor( - bbox_pred.shape[1], dtype='int32').tile([bbox_pred.shape[0]]) - bbox_pred = bbox_pred.reshape([-1, bbox_pred.shape[-1]]) - return bbox_pred, bbox_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/__init__.py deleted file mode 100644 index f3ad199..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from . import rpn_head -from . import embedding_rpn_head - -from .rpn_head import * -from .embedding_rpn_head import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/anchor_generator.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/anchor_generator.py deleted file mode 100644 index 9a8e24e..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/anchor_generator.py +++ /dev/null @@ -1,266 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# The code is based on -# https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/anchor_generator.py - -import math - -import paddle -import paddle.nn as nn -import numpy as np - -from ppdet.core.workspace import register - -__all__ = ['AnchorGenerator', 'RetinaAnchorGenerator', 'S2ANetAnchorGenerator'] - - -@register -class AnchorGenerator(nn.Layer): - """ - Generate anchors according to the feature maps - - Args: - anchor_sizes (list[float] | list[list[float]]): The anchor sizes at - each feature point. list[float] means all feature levels share the - same sizes. list[list[float]] means the anchor sizes for - each level. The sizes stand for the scale of input size. - aspect_ratios (list[float] | list[list[float]]): The aspect ratios at - each feature point. list[float] means all feature levels share the - same ratios. list[list[float]] means the aspect ratios for - each level. - strides (list[float]): The strides of feature maps which generate - anchors - offset (float): The offset of the coordinate of anchors, default 0. 
- - """ - - def __init__(self, - anchor_sizes=[32, 64, 128, 256, 512], - aspect_ratios=[0.5, 1.0, 2.0], - strides=[16.0], - variance=[1.0, 1.0, 1.0, 1.0], - offset=0.): - super(AnchorGenerator, self).__init__() - self.anchor_sizes = anchor_sizes - self.aspect_ratios = aspect_ratios - self.strides = strides - self.variance = variance - self.cell_anchors = self._calculate_anchors(len(strides)) - self.offset = offset - - def _broadcast_params(self, params, num_features): - if not isinstance(params[0], (list, tuple)): # list[float] - return [params] * num_features - if len(params) == 1: - return list(params) * num_features - return params - - def generate_cell_anchors(self, sizes, aspect_ratios): - anchors = [] - for size in sizes: - area = size**2.0 - for aspect_ratio in aspect_ratios: - w = math.sqrt(area / aspect_ratio) - h = aspect_ratio * w - x0, y0, x1, y1 = -w / 2.0, -h / 2.0, w / 2.0, h / 2.0 - anchors.append([x0, y0, x1, y1]) - return paddle.to_tensor(anchors, dtype='float32') - - def _calculate_anchors(self, num_features): - sizes = self._broadcast_params(self.anchor_sizes, num_features) - aspect_ratios = self._broadcast_params(self.aspect_ratios, num_features) - cell_anchors = [ - self.generate_cell_anchors(s, a) - for s, a in zip(sizes, aspect_ratios) - ] - [ - self.register_buffer( - t.name, t, persistable=False) for t in cell_anchors - ] - return cell_anchors - - def _create_grid_offsets(self, size, stride, offset): - grid_height, grid_width = size[0], size[1] - shifts_x = paddle.arange( - offset * stride, grid_width * stride, step=stride, dtype='float32') - shifts_y = paddle.arange( - offset * stride, grid_height * stride, step=stride, dtype='float32') - shift_y, shift_x = paddle.meshgrid(shifts_y, shifts_x) - shift_x = paddle.reshape(shift_x, [-1]) - shift_y = paddle.reshape(shift_y, [-1]) - return shift_x, shift_y - - def _grid_anchors(self, grid_sizes): - anchors = [] - for size, stride, base_anchors in zip(grid_sizes, self.strides, - self.cell_anchors): - shift_x, shift_y = self._create_grid_offsets(size, stride, - self.offset) - shifts = paddle.stack((shift_x, shift_y, shift_x, shift_y), axis=1) - shifts = paddle.reshape(shifts, [-1, 1, 4]) - base_anchors = paddle.reshape(base_anchors, [1, -1, 4]) - - anchors.append(paddle.reshape(shifts + base_anchors, [-1, 4])) - - return anchors - - def forward(self, input): - grid_sizes = [paddle.shape(feature_map)[-2:] for feature_map in input] - anchors_over_all_feature_maps = self._grid_anchors(grid_sizes) - return anchors_over_all_feature_maps - - @property - def num_anchors(self): - """ - Returns: - int: number of anchors at every pixel - location, on that feature map. - For example, if at every pixel we use anchors of 3 aspect - ratios and 5 sizes, the number of anchors is 15. - For FPN models, `num_anchors` on every feature map is the same. 
- """ - return len(self.cell_anchors[0]) - - -@register -class RetinaAnchorGenerator(AnchorGenerator): - def __init__(self, - octave_base_scale=4, - scales_per_octave=3, - aspect_ratios=[0.5, 1.0, 2.0], - strides=[8.0, 16.0, 32.0, 64.0, 128.0], - variance=[1.0, 1.0, 1.0, 1.0], - offset=0.0): - anchor_sizes = [] - for s in strides: - anchor_sizes.append([ - s * octave_base_scale * 2**(i/scales_per_octave) \ - for i in range(scales_per_octave)]) - super(RetinaAnchorGenerator, self).__init__( - anchor_sizes=anchor_sizes, - aspect_ratios=aspect_ratios, - strides=strides, - variance=variance, - offset=offset) - - -@register -class S2ANetAnchorGenerator(nn.Layer): - """ - AnchorGenerator by paddle - """ - - def __init__(self, base_size, scales, ratios, scale_major=True, ctr=None): - super(S2ANetAnchorGenerator, self).__init__() - self.base_size = base_size - self.scales = paddle.to_tensor(scales) - self.ratios = paddle.to_tensor(ratios) - self.scale_major = scale_major - self.ctr = ctr - self.base_anchors = self.gen_base_anchors() - - @property - def num_base_anchors(self): - return self.base_anchors.shape[0] - - def gen_base_anchors(self): - w = self.base_size - h = self.base_size - if self.ctr is None: - x_ctr = 0.5 * (w - 1) - y_ctr = 0.5 * (h - 1) - else: - x_ctr, y_ctr = self.ctr - - h_ratios = paddle.sqrt(self.ratios) - w_ratios = 1 / h_ratios - if self.scale_major: - ws = (w * w_ratios[:] * self.scales[:]).reshape([-1]) - hs = (h * h_ratios[:] * self.scales[:]).reshape([-1]) - else: - ws = (w * self.scales[:] * w_ratios[:]).reshape([-1]) - hs = (h * self.scales[:] * h_ratios[:]).reshape([-1]) - - base_anchors = paddle.stack( - [ - x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), - x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1) - ], - axis=-1) - base_anchors = paddle.round(base_anchors) - return base_anchors - - def _meshgrid(self, x, y, row_major=True): - yy, xx = paddle.meshgrid(y, x) - yy = yy.reshape([-1]) - xx = xx.reshape([-1]) - if row_major: - return xx, yy - else: - return yy, xx - - def forward(self, featmap_size, stride=16): - # featmap_size*stride project it to original area - - feat_h = featmap_size[0] - feat_w = featmap_size[1] - shift_x = paddle.arange(0, feat_w, 1, 'int32') * stride - shift_y = paddle.arange(0, feat_h, 1, 'int32') * stride - shift_xx, shift_yy = self._meshgrid(shift_x, shift_y) - shifts = paddle.stack([shift_xx, shift_yy, shift_xx, shift_yy], axis=-1) - - all_anchors = self.base_anchors[:, :] + shifts[:, :] - all_anchors = all_anchors.cast(paddle.float32).reshape( - [feat_h * feat_w, 4]) - all_anchors = self.rect2rbox(all_anchors) - return all_anchors - - def valid_flags(self, featmap_size, valid_size): - feat_h, feat_w = featmap_size - valid_h, valid_w = valid_size - assert valid_h <= feat_h and valid_w <= feat_w - valid_x = paddle.zeros([feat_w], dtype='int32') - valid_y = paddle.zeros([feat_h], dtype='int32') - valid_x[:valid_w] = 1 - valid_y[:valid_h] = 1 - valid_xx, valid_yy = self._meshgrid(valid_x, valid_y) - valid = valid_xx & valid_yy - valid = paddle.reshape(valid, [-1, 1]) - valid = paddle.expand(valid, [-1, self.num_base_anchors]).reshape([-1]) - return valid - - def rect2rbox(self, bboxes): - """ - :param bboxes: shape (L, 4) (xmin, ymin, xmax, ymax) - :return: dbboxes: shape (L, 5) (x_ctr, y_ctr, w, h, angle) - """ - x1, y1, x2, y2 = paddle.split(bboxes, 4, axis=-1) - - x_ctr = (x1 + x2) / 2.0 - y_ctr = (y1 + y2) / 2.0 - edges1 = paddle.abs(x2 - x1) - edges2 = paddle.abs(y2 - y1) - - rbox_w = paddle.maximum(edges1, edges2) - rbox_h = 
paddle.minimum(edges1, edges2) - - # set angle - inds = edges1 < edges2 - inds = paddle.cast(inds, paddle.float32) - rboxes_angle = inds * np.pi / 2.0 - - rboxes = paddle.concat( - (x_ctr, y_ctr, rbox_w, rbox_h, rboxes_angle), axis=-1) - return rboxes diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/embedding_rpn_head.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/embedding_rpn_head.py deleted file mode 100644 index 2917498..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/embedding_rpn_head.py +++ /dev/null @@ -1,63 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# This code is referenced from: https://github.com/open-mmlab/mmdetection - -import paddle -from paddle import nn - -from ppdet.core.workspace import register - -__all__ = ['EmbeddingRPNHead'] - - -@register -class EmbeddingRPNHead(nn.Layer): - __shared__ = ['proposal_embedding_dim'] - - def __init__(self, num_proposals, proposal_embedding_dim=256): - super(EmbeddingRPNHead, self).__init__() - - self.num_proposals = num_proposals - self.proposal_embedding_dim = proposal_embedding_dim - - self._init_layers() - self._init_weights() - - def _init_layers(self): - self.init_proposal_bboxes = nn.Embedding(self.num_proposals, 4) - self.init_proposal_features = nn.Embedding(self.num_proposals, - self.proposal_embedding_dim) - - def _init_weights(self): - init_bboxes = paddle.empty_like(self.init_proposal_bboxes.weight) - init_bboxes[:, :2] = 0.5 - init_bboxes[:, 2:] = 1.0 - self.init_proposal_bboxes.weight.set_value(init_bboxes) - - @staticmethod - def bbox_cxcywh_to_xyxy(x): - cxcy, wh = paddle.split(x, 2, axis=-1) - return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) - - def forward(self, img_whwh): - proposal_bboxes = self.init_proposal_bboxes.weight.clone() - proposal_bboxes = self.bbox_cxcywh_to_xyxy(proposal_bboxes) - proposal_bboxes = proposal_bboxes.unsqueeze(0) * img_whwh.unsqueeze(1) - - proposal_features = self.init_proposal_features.weight.clone() - proposal_features = proposal_features.unsqueeze(0).tile( - [img_whwh.shape[0], 1, 1]) - - return proposal_bboxes, proposal_features diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/proposal_generator.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/proposal_generator.py deleted file mode 100644 index b87a72c..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/proposal_generator.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
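A short sketch of what `EmbeddingRPNHead.forward()` above computes at initialization time, assuming `img_whwh` holds per-image `[w, h, w, h]`: the learned proposals start as whole-image boxes.

```python
import numpy as np

# EmbeddingRPNHead's initial proposals: (cx, cy, w, h) = (0.5, 0.5, 1, 1),
# i.e. the whole image, converted to xyxy and scaled per image.
proposal_cxcywh = np.array([[0.5, 0.5, 1.0, 1.0]])  # _init_weights() default
cxcy, wh = proposal_cxcywh[:, :2], proposal_cxcywh[:, 2:]
proposal_xyxy = np.concatenate([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1)

img_whwh = np.array([640.0, 480.0, 640.0, 480.0])   # assumed [w, h, w, h] layout
print(proposal_xyxy * img_whwh)                     # -> [[0. 0. 640. 480.]]
```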
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle - -from ppdet.core.workspace import register, serializable -from .. import ops - - -@register -@serializable -class ProposalGenerator(object): - """ - Proposal generation module - - For more details, please refer to the document of generate_proposals - in ppdet/modeling/ops.py - - Args: - pre_nms_top_n (int): Number of total bboxes to be kept per - image before NMS. default 6000 - post_nms_top_n (int): Number of total bboxes to be kept per - image after NMS. default 1000 - nms_thresh (float): Threshold in NMS. default 0.5 - min_size (float): Remove predicted boxes with either height or - width < min_size. default 0.1 - eta (float): Apply in adaptive NMS, if adaptive `threshold > 0.5`, - `adaptive_threshold = adaptive_threshold * eta` in each iteration. - default 1. - topk_after_collect (bool): whether to adopt topk after batch - collection. If topk_after_collect is true, box filter will not be - used after NMS at each image in proposal generation. default false - """ - - def __init__(self, - pre_nms_top_n=12000, - post_nms_top_n=2000, - nms_thresh=.5, - min_size=.1, - eta=1., - topk_after_collect=False): - super(ProposalGenerator, self).__init__() - self.pre_nms_top_n = pre_nms_top_n - self.post_nms_top_n = post_nms_top_n - self.nms_thresh = nms_thresh - self.min_size = min_size - self.eta = eta - self.topk_after_collect = topk_after_collect - - def __call__(self, scores, bbox_deltas, anchors, im_shape): - - top_n = self.pre_nms_top_n if self.topk_after_collect else self.post_nms_top_n - variances = paddle.ones_like(anchors) - if hasattr(paddle.vision.ops, "generate_proposals"): - generate_proposals = getattr(paddle.vision.ops, - "generate_proposals") - else: - generate_proposals = ops.generate_proposals - rpn_rois, rpn_rois_prob, rpn_rois_num = generate_proposals( - scores, - bbox_deltas, - im_shape, - anchors, - variances, - pre_nms_top_n=self.pre_nms_top_n, - post_nms_top_n=top_n, - nms_thresh=self.nms_thresh, - min_size=self.min_size, - eta=self.eta, - return_rois_num=True) - - return rpn_rois, rpn_rois_prob, rpn_rois_num, self.post_nms_top_n diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/rpn_head.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/rpn_head.py deleted file mode 100644 index 7c56d8d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/rpn_head.py +++ /dev/null @@ -1,313 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
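The version-compatibility idiom in `ProposalGenerator.__call__` is worth a note: it probes for the fused `paddle.vision.ops.generate_proposals` and falls back to the local `ppdet` op when the installed Paddle lacks it. A self-contained sketch of the same pattern, using `SimpleNamespace` stand-ins rather than real Paddle modules:

```python
from types import SimpleNamespace

# Prefer the framework-provided op when present, else use the local one.
def resolve(vision_ops, local_ops):
    if hasattr(vision_ops, "generate_proposals"):
        return vision_ops.generate_proposals
    return local_ops.generate_proposals

new_paddle = SimpleNamespace(generate_proposals=lambda *a, **k: "fused op")
old_paddle = SimpleNamespace()                  # no fused op available
local = SimpleNamespace(generate_proposals=lambda *a, **k: "ppdet fallback")

print(resolve(new_paddle, local)())  # -> fused op
print(resolve(old_paddle, local)())  # -> ppdet fallback
```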
- -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal - -from ppdet.core.workspace import register -from .anchor_generator import AnchorGenerator -from .target_layer import RPNTargetAssign -from .proposal_generator import ProposalGenerator -from ..cls_utils import _get_class_default_kwargs - - -class RPNFeat(nn.Layer): - """ - Feature extraction in RPN head - - Args: - in_channel (int): Input channel - out_channel (int): Output channel - """ - - def __init__(self, in_channel=1024, out_channel=1024): - super(RPNFeat, self).__init__() - # rpn feat is shared with each level - self.rpn_conv = nn.Conv2D( - in_channels=in_channel, - out_channels=out_channel, - kernel_size=3, - padding=1, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0., std=0.01))) - self.rpn_conv.skip_quant = True - - def forward(self, feats): - rpn_feats = [] - for feat in feats: - rpn_feats.append(F.relu(self.rpn_conv(feat))) - return rpn_feats - - -@register -class RPNHead(nn.Layer): - """ - Region Proposal Network - - Args: - anchor_generator (dict): configure of anchor generation - rpn_target_assign (dict): configure of rpn targets assignment - train_proposal (dict): configure of proposals generation - at the stage of training - test_proposal (dict): configure of proposals generation - at the stage of prediction - in_channel (int): channel of input feature maps which can be - derived by from_config - """ - __shared__ = ['export_onnx'] - __inject__ = ['loss_rpn_bbox'] - - def __init__(self, - anchor_generator=_get_class_default_kwargs(AnchorGenerator), - rpn_target_assign=_get_class_default_kwargs(RPNTargetAssign), - train_proposal=_get_class_default_kwargs(ProposalGenerator, - 12000, 2000), - test_proposal=_get_class_default_kwargs(ProposalGenerator), - in_channel=1024, - export_onnx=False, - loss_rpn_bbox=None): - super(RPNHead, self).__init__() - self.anchor_generator = anchor_generator - self.rpn_target_assign = rpn_target_assign - self.train_proposal = train_proposal - self.test_proposal = test_proposal - self.export_onnx = export_onnx - if isinstance(anchor_generator, dict): - self.anchor_generator = AnchorGenerator(**anchor_generator) - if isinstance(rpn_target_assign, dict): - self.rpn_target_assign = RPNTargetAssign(**rpn_target_assign) - if isinstance(train_proposal, dict): - self.train_proposal = ProposalGenerator(**train_proposal) - if isinstance(test_proposal, dict): - self.test_proposal = ProposalGenerator(**test_proposal) - self.loss_rpn_bbox = loss_rpn_bbox - - num_anchors = self.anchor_generator.num_anchors - self.rpn_feat = RPNFeat(in_channel, in_channel) - # rpn head is shared with each level - # rpn roi classification scores - self.rpn_rois_score = nn.Conv2D( - in_channels=in_channel, - out_channels=num_anchors, - kernel_size=1, - padding=0, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0., std=0.01))) - self.rpn_rois_score.skip_quant = True - - # rpn roi bbox regression deltas - self.rpn_rois_delta = nn.Conv2D( - in_channels=in_channel, - out_channels=4 * num_anchors, - kernel_size=1, - padding=0, - weight_attr=paddle.ParamAttr(initializer=Normal( - mean=0., std=0.01))) - self.rpn_rois_delta.skip_quant = True - - @classmethod - def from_config(cls, cfg, input_shape): - # FPN share same rpn head - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channel': input_shape.channels} - - def forward(self, feats, inputs): - rpn_feats = self.rpn_feat(feats) - scores = [] - deltas = [] - - 
for rpn_feat in rpn_feats: - rrs = self.rpn_rois_score(rpn_feat) - rrd = self.rpn_rois_delta(rpn_feat) - scores.append(rrs) - deltas.append(rrd) - - anchors = self.anchor_generator(rpn_feats) - - rois, rois_num = self._gen_proposal(scores, deltas, anchors, inputs) - if self.training: - loss = self.get_loss(scores, deltas, anchors, inputs) - return rois, rois_num, loss - else: - return rois, rois_num, None - - def _gen_proposal(self, scores, bbox_deltas, anchors, inputs): - """ - scores (list[Tensor]): Multi-level scores prediction - bbox_deltas (list[Tensor]): Multi-level deltas prediction - anchors (list[Tensor]): Multi-level anchors - inputs (dict): ground truth info - """ - prop_gen = self.train_proposal if self.training else self.test_proposal - im_shape = inputs['im_shape'] - - # Collect multi-level proposals for each batch - # Get 'topk' of them as final output - - if self.export_onnx: - # bs = 1 when exporting onnx - onnx_rpn_rois_list = [] - onnx_rpn_prob_list = [] - onnx_rpn_rois_num_list = [] - - for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, - anchors): - onnx_rpn_rois, onnx_rpn_rois_prob, onnx_rpn_rois_num, onnx_post_nms_top_n = prop_gen( - scores=rpn_score[0:1], - bbox_deltas=rpn_delta[0:1], - anchors=anchor, - im_shape=im_shape[0:1]) - onnx_rpn_rois_list.append(onnx_rpn_rois) - onnx_rpn_prob_list.append(onnx_rpn_rois_prob) - onnx_rpn_rois_num_list.append(onnx_rpn_rois_num) - - onnx_rpn_rois = paddle.concat(onnx_rpn_rois_list) - onnx_rpn_prob = paddle.concat(onnx_rpn_prob_list).flatten() - - onnx_top_n = paddle.to_tensor(onnx_post_nms_top_n).cast('int32') - onnx_num_rois = paddle.shape(onnx_rpn_prob)[0].cast('int32') - k = paddle.minimum(onnx_top_n, onnx_num_rois) - onnx_topk_prob, onnx_topk_inds = paddle.topk(onnx_rpn_prob, k) - onnx_topk_rois = paddle.gather(onnx_rpn_rois, onnx_topk_inds) - # TODO(wangguanzhong): Now bs_rois_collect in export_onnx is moved outside conditional branch - # due to problems in dy2static of paddle. Will fix it when updating paddle framework. - # bs_rois_collect = [onnx_topk_rois] - # bs_rois_num_collect = paddle.shape(onnx_topk_rois)[0] - - else: - bs_rois_collect = [] - bs_rois_num_collect = [] - - batch_size = paddle.slice(paddle.shape(im_shape), [0], [0], [1]) - - # Generate proposals for each level and each batch. - # Discard batch-computing to avoid sorting bbox cross different batches. 
- for i in range(batch_size): - rpn_rois_list = [] - rpn_prob_list = [] - rpn_rois_num_list = [] - - for rpn_score, rpn_delta, anchor in zip(scores, bbox_deltas, - anchors): - rpn_rois, rpn_rois_prob, rpn_rois_num, post_nms_top_n = prop_gen( - scores=rpn_score[i:i + 1], - bbox_deltas=rpn_delta[i:i + 1], - anchors=anchor, - im_shape=im_shape[i:i + 1]) - rpn_rois_list.append(rpn_rois) - rpn_prob_list.append(rpn_rois_prob) - rpn_rois_num_list.append(rpn_rois_num) - - if len(scores) > 1: - rpn_rois = paddle.concat(rpn_rois_list) - rpn_prob = paddle.concat(rpn_prob_list).flatten() - - num_rois = paddle.shape(rpn_prob)[0].cast('int32') - if num_rois > post_nms_top_n: - topk_prob, topk_inds = paddle.topk(rpn_prob, - post_nms_top_n) - topk_rois = paddle.gather(rpn_rois, topk_inds) - else: - topk_rois = rpn_rois - topk_prob = rpn_prob - else: - topk_rois = rpn_rois_list[0] - topk_prob = rpn_prob_list[0].flatten() - - bs_rois_collect.append(topk_rois) - bs_rois_num_collect.append(paddle.shape(topk_rois)[0:1]) - - bs_rois_num_collect = paddle.concat(bs_rois_num_collect) - - if self.export_onnx: - output_rois = [onnx_topk_rois] - output_rois_num = paddle.shape(onnx_topk_rois)[0] - else: - output_rois = bs_rois_collect - output_rois_num = bs_rois_num_collect - - return output_rois, output_rois_num - - def get_loss(self, pred_scores, pred_deltas, anchors, inputs): - """ - pred_scores (list[Tensor]): Multi-level scores prediction - pred_deltas (list[Tensor]): Multi-level deltas prediction - anchors (list[Tensor]): Multi-level anchors - inputs (dict): ground truth info, including im, gt_bbox, gt_score - """ - anchors = [paddle.reshape(a, shape=(-1, 4)) for a in anchors] - anchors = paddle.concat(anchors) - - scores = [ - paddle.reshape( - paddle.transpose( - v, perm=[0, 2, 3, 1]), - shape=(v.shape[0], -1, 1)) for v in pred_scores - ] - scores = paddle.concat(scores, axis=1) - - deltas = [ - paddle.reshape( - paddle.transpose( - v, perm=[0, 2, 3, 1]), - shape=(v.shape[0], -1, 4)) for v in pred_deltas - ] - deltas = paddle.concat(deltas, axis=1) - - score_tgt, bbox_tgt, loc_tgt, norm = self.rpn_target_assign(inputs, - anchors) - - scores = paddle.reshape(x=scores, shape=(-1, )) - deltas = paddle.reshape(x=deltas, shape=(-1, 4)) - - score_tgt = paddle.concat(score_tgt) - score_tgt.stop_gradient = True - - pos_mask = score_tgt == 1 - pos_ind = paddle.nonzero(pos_mask) - - valid_mask = score_tgt >= 0 - valid_ind = paddle.nonzero(valid_mask) - - # cls loss - if valid_ind.shape[0] == 0: - loss_rpn_cls = paddle.zeros([1], dtype='float32') - else: - score_pred = paddle.gather(scores, valid_ind) - score_label = paddle.gather(score_tgt, valid_ind).cast('float32') - score_label.stop_gradient = True - loss_rpn_cls = F.binary_cross_entropy_with_logits( - logit=score_pred, label=score_label, reduction="sum") - - # reg loss - if pos_ind.shape[0] == 0: - loss_rpn_reg = paddle.zeros([1], dtype='float32') - else: - loc_pred = paddle.gather(deltas, pos_ind) - loc_tgt = paddle.concat(loc_tgt) - loc_tgt = paddle.gather(loc_tgt, pos_ind) - loc_tgt.stop_gradient = True - - if self.loss_rpn_bbox is None: - loss_rpn_reg = paddle.abs(loc_pred - loc_tgt).sum() - else: - loss_rpn_reg = self.loss_rpn_bbox(loc_pred, loc_tgt).sum() - - return { - 'loss_rpn_cls': loss_rpn_cls / norm, - 'loss_rpn_reg': loss_rpn_reg / norm - } diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target.py deleted file mode 100644 index 041b2c7..0000000 --- 
a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target.py +++ /dev/null @@ -1,678 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import paddle -from ..bbox_utils import bbox2delta, bbox_overlaps - - -def rpn_anchor_target(anchors, - gt_boxes, - rpn_batch_size_per_im, - rpn_positive_overlap, - rpn_negative_overlap, - rpn_fg_fraction, - use_random=True, - batch_size=1, - ignore_thresh=-1, - is_crowd=None, - weights=[1., 1., 1., 1.], - assign_on_cpu=False): - tgt_labels = [] - tgt_bboxes = [] - tgt_deltas = [] - for i in range(batch_size): - gt_bbox = gt_boxes[i] - is_crowd_i = is_crowd[i] if is_crowd else None - # Step1: match anchor and gt_bbox - matches, match_labels = label_box( - anchors, gt_bbox, rpn_positive_overlap, rpn_negative_overlap, True, - ignore_thresh, is_crowd_i, assign_on_cpu) - # Step2: sample anchor - fg_inds, bg_inds = subsample_labels(match_labels, rpn_batch_size_per_im, - rpn_fg_fraction, 0, use_random) - # Fill with the ignore label (-1), then set positive and negative labels - labels = paddle.full(match_labels.shape, -1, dtype='int32') - if bg_inds.shape[0] > 0: - labels = paddle.scatter(labels, bg_inds, paddle.zeros_like(bg_inds)) - if fg_inds.shape[0] > 0: - labels = paddle.scatter(labels, fg_inds, paddle.ones_like(fg_inds)) - # Step3: make output - if gt_bbox.shape[0] == 0: - matched_gt_boxes = paddle.zeros([matches.shape[0], 4]) - tgt_delta = paddle.zeros([matches.shape[0], 4]) - else: - matched_gt_boxes = paddle.gather(gt_bbox, matches) - tgt_delta = bbox2delta(anchors, matched_gt_boxes, weights) - matched_gt_boxes.stop_gradient = True - tgt_delta.stop_gradient = True - labels.stop_gradient = True - tgt_labels.append(labels) - tgt_bboxes.append(matched_gt_boxes) - tgt_deltas.append(tgt_delta) - - return tgt_labels, tgt_bboxes, tgt_deltas - - -def label_box(anchors, - gt_boxes, - positive_overlap, - negative_overlap, - allow_low_quality, - ignore_thresh, - is_crowd=None, - assign_on_cpu=False): - if assign_on_cpu: - device = paddle.device.get_device() - paddle.set_device("cpu") - iou = bbox_overlaps(gt_boxes, anchors) - paddle.set_device(device) - - else: - iou = bbox_overlaps(gt_boxes, anchors) - n_gt = gt_boxes.shape[0] - if n_gt == 0 or is_crowd is None: - n_gt_crowd = 0 - else: - n_gt_crowd = paddle.nonzero(is_crowd).shape[0] - if iou.shape[0] == 0 or n_gt_crowd == n_gt: - # No truth, assign everything to background - default_matches = paddle.full((iou.shape[1], ), 0, dtype='int64') - default_match_labels = paddle.full((iou.shape[1], ), 0, dtype='int32') - return default_matches, default_match_labels - # if ignore_thresh > 0, remove anchor if it is closed to - # one of the crowded ground-truth - if n_gt_crowd > 0: - N_a = anchors.shape[0] - ones = paddle.ones([N_a]) - mask = is_crowd * ones - - if ignore_thresh > 0: - crowd_iou = iou * mask - valid = (paddle.sum((crowd_iou > ignore_thresh).cast('int32'), - axis=0) > 0).cast('float32') - iou = iou * (1 - valid) 
- valid - - # ignore the iou between anchor and crowded ground-truth - iou = iou * (1 - mask) - mask - - matched_vals, matches = paddle.topk(iou, k=1, axis=0) - match_labels = paddle.full(matches.shape, -1, dtype='int32') - # set ignored anchor with iou = -1 - neg_cond = paddle.logical_and(matched_vals > -1, - matched_vals < negative_overlap) - match_labels = paddle.where(neg_cond, - paddle.zeros_like(match_labels), match_labels) - match_labels = paddle.where(matched_vals >= positive_overlap, - paddle.ones_like(match_labels), match_labels) - if allow_low_quality: - highest_quality_foreach_gt = iou.max(axis=1, keepdim=True) - pred_inds_with_highest_quality = paddle.logical_and( - iou > 0, iou == highest_quality_foreach_gt).cast('int32').sum( - 0, keepdim=True) - match_labels = paddle.where(pred_inds_with_highest_quality > 0, - paddle.ones_like(match_labels), - match_labels) - - matches = matches.flatten() - match_labels = match_labels.flatten() - - return matches, match_labels - - -def subsample_labels(labels, - num_samples, - fg_fraction, - bg_label=0, - use_random=True): - positive = paddle.nonzero( - paddle.logical_and(labels != -1, labels != bg_label)) - negative = paddle.nonzero(labels == bg_label) - - fg_num = int(num_samples * fg_fraction) - fg_num = min(positive.numel(), fg_num) - bg_num = num_samples - fg_num - bg_num = min(negative.numel(), bg_num) - if fg_num == 0 and bg_num == 0: - fg_inds = paddle.zeros([0], dtype='int32') - bg_inds = paddle.zeros([0], dtype='int32') - return fg_inds, bg_inds - - # randomly select positive and negative examples - - negative = negative.cast('int32').flatten() - bg_perm = paddle.randperm(negative.numel(), dtype='int32') - bg_perm = paddle.slice(bg_perm, axes=[0], starts=[0], ends=[bg_num]) - if use_random: - bg_inds = paddle.gather(negative, bg_perm) - else: - bg_inds = paddle.slice(negative, axes=[0], starts=[0], ends=[bg_num]) - if fg_num == 0: - fg_inds = paddle.zeros([0], dtype='int32') - return fg_inds, bg_inds - - positive = positive.cast('int32').flatten() - fg_perm = paddle.randperm(positive.numel(), dtype='int32') - fg_perm = paddle.slice(fg_perm, axes=[0], starts=[0], ends=[fg_num]) - if use_random: - fg_inds = paddle.gather(positive, fg_perm) - else: - fg_inds = paddle.slice(positive, axes=[0], starts=[0], ends=[fg_num]) - - return fg_inds, bg_inds - - -def generate_proposal_target(rpn_rois, - gt_classes, - gt_boxes, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh, - num_classes, - ignore_thresh=-1., - is_crowd=None, - use_random=True, - is_cascade=False, - cascade_iou=0.5, - assign_on_cpu=False, - add_gt_as_proposals=True): - - rois_with_gt = [] - tgt_labels = [] - tgt_bboxes = [] - tgt_gt_inds = [] - new_rois_num = [] - - # In cascade rcnn, the threshold for foreground and background - # is used from cascade_iou - fg_thresh = cascade_iou if is_cascade else fg_thresh - bg_thresh = cascade_iou if is_cascade else bg_thresh - for i, rpn_roi in enumerate(rpn_rois): - gt_bbox = gt_boxes[i] - is_crowd_i = is_crowd[i] if is_crowd else None - gt_class = paddle.squeeze(gt_classes[i], axis=-1) - - # Concat RoIs and gt boxes except cascade rcnn or none gt - if add_gt_as_proposals and gt_bbox.shape[0] > 0: - bbox = paddle.concat([rpn_roi, gt_bbox]) - else: - bbox = rpn_roi - - # Step1: label bbox - matches, match_labels = label_box(bbox, gt_bbox, fg_thresh, bg_thresh, - False, ignore_thresh, is_crowd_i, - assign_on_cpu) - # Step2: sample bbox - sampled_inds, sampled_gt_classes = sample_bbox( - matches, match_labels, gt_class, 
batch_size_per_im, fg_fraction, - num_classes, use_random, is_cascade) - - # Step3: make output - rois_per_image = bbox if is_cascade else paddle.gather(bbox, - sampled_inds) - sampled_gt_ind = matches if is_cascade else paddle.gather(matches, - sampled_inds) - if gt_bbox.shape[0] > 0: - sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) - else: - num = rois_per_image.shape[0] - sampled_bbox = paddle.zeros([num, 4], dtype='float32') - - rois_per_image.stop_gradient = True - sampled_gt_ind.stop_gradient = True - sampled_bbox.stop_gradient = True - tgt_labels.append(sampled_gt_classes) - tgt_bboxes.append(sampled_bbox) - rois_with_gt.append(rois_per_image) - tgt_gt_inds.append(sampled_gt_ind) - new_rois_num.append(paddle.shape(sampled_inds)[0:1]) - new_rois_num = paddle.concat(new_rois_num) - return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num - - -def sample_bbox(matches, - match_labels, - gt_classes, - batch_size_per_im, - fg_fraction, - num_classes, - use_random=True, - is_cascade=False): - - n_gt = gt_classes.shape[0] - if n_gt == 0: - # No truth, assign everything to background - gt_classes = paddle.ones(matches.shape, dtype='int32') * num_classes - #return matches, match_labels + num_classes - else: - gt_classes = paddle.gather(gt_classes, matches) - gt_classes = paddle.where(match_labels == 0, - paddle.ones_like(gt_classes) * num_classes, - gt_classes) - gt_classes = paddle.where(match_labels == -1, - paddle.ones_like(gt_classes) * -1, gt_classes) - if is_cascade: - index = paddle.arange(matches.shape[0]) - return index, gt_classes - rois_per_image = int(batch_size_per_im) - - fg_inds, bg_inds = subsample_labels(gt_classes, rois_per_image, fg_fraction, - num_classes, use_random) - if fg_inds.shape[0] == 0 and bg_inds.shape[0] == 0: - # fake output labeled with -1 when all boxes are neither - # foreground nor background - sampled_inds = paddle.zeros([1], dtype='int32') - else: - sampled_inds = paddle.concat([fg_inds, bg_inds]) - sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) - return sampled_inds, sampled_gt_classes - - -def polygons_to_mask(polygons, height, width): - """ - Convert the polygons to mask format - - Args: - polygons (list[ndarray]): each array has shape (Nx2,) - height (int): mask height - width (int): mask width - Returns: - ndarray: a bool mask of shape (height, width) - """ - import pycocotools.mask as mask_util - assert len(polygons) > 0, "COCOAPI does not support empty polygons" - rles = mask_util.frPyObjects(polygons, height, width) - rle = mask_util.merge(rles) - return mask_util.decode(rle).astype(np.bool_) - - -def rasterize_polygons_within_box(poly, box, resolution): - w, h = box[2] - box[0], box[3] - box[1] - polygons = [np.asarray(p, dtype=np.float64) for p in poly] - for p in polygons: - p[0::2] = p[0::2] - box[0] - p[1::2] = p[1::2] - box[1] - - ratio_h = resolution / max(h, 0.1) - ratio_w = resolution / max(w, 0.1) - - if ratio_h == ratio_w: - for p in polygons: - p *= ratio_h - else: - for p in polygons: - p[0::2] *= ratio_w - p[1::2] *= ratio_h - - # 3. 
Rasterize the polygons with coco api - mask = polygons_to_mask(polygons, resolution, resolution) - mask = paddle.to_tensor(mask, dtype='int32') - return mask - - -def generate_mask_target(gt_segms, rois, labels_int32, sampled_gt_inds, - num_classes, resolution): - mask_rois = [] - mask_rois_num = [] - tgt_masks = [] - tgt_classes = [] - mask_index = [] - tgt_weights = [] - for k in range(len(rois)): - labels_per_im = labels_int32[k] - # select rois labeled with foreground - fg_inds = paddle.nonzero( - paddle.logical_and(labels_per_im != -1, labels_per_im != - num_classes)) - has_fg = True - # generate fake roi if foreground is empty - if fg_inds.numel() == 0: - has_fg = False - fg_inds = paddle.ones([1, 1], dtype='int64') - inds_per_im = sampled_gt_inds[k] - inds_per_im = paddle.gather(inds_per_im, fg_inds) - - rois_per_im = rois[k] - fg_rois = paddle.gather(rois_per_im, fg_inds) - # Copy the foreground roi to cpu - # to generate mask target with ground-truth - boxes = fg_rois.numpy() - gt_segms_per_im = gt_segms[k] - - new_segm = [] - inds_per_im = inds_per_im.numpy() - if len(gt_segms_per_im) > 0: - for i in inds_per_im: - new_segm.append(gt_segms_per_im[i]) - fg_inds_new = fg_inds.reshape([-1]).numpy() - results = [] - if len(gt_segms_per_im) > 0: - for j in range(fg_inds_new.shape[0]): - results.append( - rasterize_polygons_within_box(new_segm[j], boxes[j], - resolution)) - else: - results.append(paddle.ones([resolution, resolution], dtype='int32')) - - fg_classes = paddle.gather(labels_per_im, fg_inds) - weight = paddle.ones([fg_rois.shape[0]], dtype='float32') - if not has_fg: - # now all sampled classes are background - # which will cause error in loss calculation, - # make fake classes with weight of 0. - fg_classes = paddle.zeros([1], dtype='int32') - weight = weight - 1 - tgt_mask = paddle.stack(results) - tgt_mask.stop_gradient = True - fg_rois.stop_gradient = True - - mask_index.append(fg_inds) - mask_rois.append(fg_rois) - mask_rois_num.append(paddle.shape(fg_rois)[0:1]) - tgt_classes.append(fg_classes) - tgt_masks.append(tgt_mask) - tgt_weights.append(weight) - - mask_index = paddle.concat(mask_index) - mask_rois_num = paddle.concat(mask_rois_num) - tgt_classes = paddle.concat(tgt_classes, axis=0) - tgt_masks = paddle.concat(tgt_masks, axis=0) - tgt_weights = paddle.concat(tgt_weights, axis=0) - - return mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights - - -def libra_sample_pos(max_overlaps, max_classes, pos_inds, num_expected): - if len(pos_inds) <= num_expected: - return pos_inds - else: - unique_gt_inds = np.unique(max_classes[pos_inds]) - num_gts = len(unique_gt_inds) - num_per_gt = int(round(num_expected / float(num_gts)) + 1) - - sampled_inds = [] - for i in unique_gt_inds: - inds = np.nonzero(max_classes == i)[0] - before_len = len(inds) - inds = list(set(inds) & set(pos_inds)) - after_len = len(inds) - if len(inds) > num_per_gt: - inds = np.random.choice(inds, size=num_per_gt, replace=False) - sampled_inds.extend(list(inds)) # combine as a new sampler - if len(sampled_inds) < num_expected: - num_extra = num_expected - len(sampled_inds) - extra_inds = np.array(list(set(pos_inds) - set(sampled_inds))) - assert len(sampled_inds) + len(extra_inds) == len(pos_inds), \ - "sum of sampled_inds({}) and extra_inds({}) length must be equal with pos_inds({})!".format( - len(sampled_inds), len(extra_inds), len(pos_inds)) - if len(extra_inds) > num_extra: - extra_inds = np.random.choice( - extra_inds, size=num_extra, replace=False) - 
sampled_inds.extend(extra_inds.tolist()) - elif len(sampled_inds) > num_expected: - sampled_inds = np.random.choice( - sampled_inds, size=num_expected, replace=False) - return paddle.to_tensor(sampled_inds) - - -def libra_sample_via_interval(max_overlaps, full_set, num_expected, floor_thr, - num_bins, bg_thresh): - max_iou = max_overlaps.max() - iou_interval = (max_iou - floor_thr) / num_bins - per_num_expected = int(num_expected / num_bins) - - sampled_inds = [] - for i in range(num_bins): - start_iou = floor_thr + i * iou_interval - end_iou = floor_thr + (i + 1) * iou_interval - - tmp_set = set( - np.where( - np.logical_and(max_overlaps >= start_iou, max_overlaps < - end_iou))[0]) - tmp_inds = list(tmp_set & full_set) - - if len(tmp_inds) > per_num_expected: - tmp_sampled_set = np.random.choice( - tmp_inds, size=per_num_expected, replace=False) - else: - tmp_sampled_set = np.array(tmp_inds, dtype=np.int32) - sampled_inds.append(tmp_sampled_set) - - sampled_inds = np.concatenate(sampled_inds) - if len(sampled_inds) < num_expected: - num_extra = num_expected - len(sampled_inds) - extra_inds = np.array(list(full_set - set(sampled_inds))) - assert len(sampled_inds) + len(extra_inds) == len(full_set), \ - "sum of sampled_inds({}) and extra_inds({}) length must be equal with full_set({})!".format( - len(sampled_inds), len(extra_inds), len(full_set)) - - if len(extra_inds) > num_extra: - extra_inds = np.random.choice(extra_inds, num_extra, replace=False) - sampled_inds = np.concatenate([sampled_inds, extra_inds]) - - return sampled_inds - - -def libra_sample_neg(max_overlaps, - max_classes, - neg_inds, - num_expected, - floor_thr=-1, - floor_fraction=0, - num_bins=3, - bg_thresh=0.5): - if len(neg_inds) <= num_expected: - return neg_inds - else: - # balance sampling for negative samples - neg_set = set(neg_inds.tolist()) - if floor_thr > 0: - floor_set = set( - np.where( - np.logical_and(max_overlaps >= 0, max_overlaps < floor_thr)) - [0]) - iou_sampling_set = set(np.where(max_overlaps >= floor_thr)[0]) - elif floor_thr == 0: - floor_set = set(np.where(max_overlaps == 0)[0]) - iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) - else: - floor_set = set() - iou_sampling_set = set(np.where(max_overlaps > floor_thr)[0]) - floor_thr = 0 - - floor_neg_inds = list(floor_set & neg_set) - iou_sampling_neg_inds = list(iou_sampling_set & neg_set) - - num_expected_iou_sampling = int(num_expected * (1 - floor_fraction)) - if len(iou_sampling_neg_inds) > num_expected_iou_sampling: - if num_bins >= 2: - iou_sampled_inds = libra_sample_via_interval( - max_overlaps, - set(iou_sampling_neg_inds), num_expected_iou_sampling, - floor_thr, num_bins, bg_thresh) - else: - iou_sampled_inds = np.random.choice( - iou_sampling_neg_inds, - size=num_expected_iou_sampling, - replace=False) - else: - iou_sampled_inds = np.array(iou_sampling_neg_inds, dtype=np.int32) - num_expected_floor = num_expected - len(iou_sampled_inds) - if len(floor_neg_inds) > num_expected_floor: - sampled_floor_inds = np.random.choice( - floor_neg_inds, size=num_expected_floor, replace=False) - else: - sampled_floor_inds = np.array(floor_neg_inds, dtype=np.int32) - sampled_inds = np.concatenate((sampled_floor_inds, iou_sampled_inds)) - if len(sampled_inds) < num_expected: - num_extra = num_expected - len(sampled_inds) - extra_inds = np.array(list(neg_set - set(sampled_inds))) - if len(extra_inds) > num_extra: - extra_inds = np.random.choice( - extra_inds, size=num_extra, replace=False) - sampled_inds = np.concatenate((sampled_inds, 
extra_inds)) - return paddle.to_tensor(sampled_inds) - - -def libra_label_box(anchors, gt_boxes, gt_classes, positive_overlap, - negative_overlap, num_classes): - # TODO: use paddle API to speed up - gt_classes = gt_classes.numpy() - gt_overlaps = np.zeros((anchors.shape[0], num_classes)) - matches = np.zeros((anchors.shape[0]), dtype=np.int32) - if len(gt_boxes) > 0: - proposal_to_gt_overlaps = bbox_overlaps(anchors, gt_boxes).numpy() - overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) - overlaps_max = proposal_to_gt_overlaps.max(axis=1) - # Boxes which with non-zero overlap with gt boxes - overlapped_boxes_ind = np.where(overlaps_max > 0)[0] - overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ - overlapped_boxes_ind]] - - for idx in range(len(overlapped_boxes_ind)): - gt_overlaps[overlapped_boxes_ind[idx], overlapped_boxes_gt_classes[ - idx]] = overlaps_max[overlapped_boxes_ind[idx]] - matches[overlapped_boxes_ind[idx]] = overlaps_argmax[ - overlapped_boxes_ind[idx]] - - gt_overlaps = paddle.to_tensor(gt_overlaps) - matches = paddle.to_tensor(matches) - - matched_vals = paddle.max(gt_overlaps, axis=1) - match_labels = paddle.full(matches.shape, -1, dtype='int32') - match_labels = paddle.where(matched_vals < negative_overlap, - paddle.zeros_like(match_labels), match_labels) - match_labels = paddle.where(matched_vals >= positive_overlap, - paddle.ones_like(match_labels), match_labels) - - return matches, match_labels, matched_vals - - -def libra_sample_bbox(matches, - match_labels, - matched_vals, - gt_classes, - batch_size_per_im, - num_classes, - fg_fraction, - fg_thresh, - bg_thresh, - num_bins, - use_random=True, - is_cascade_rcnn=False): - rois_per_image = int(batch_size_per_im) - fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) - bg_rois_per_im = rois_per_image - fg_rois_per_im - - if is_cascade_rcnn: - fg_inds = paddle.nonzero(matched_vals >= fg_thresh) - bg_inds = paddle.nonzero(matched_vals < bg_thresh) - else: - matched_vals_np = matched_vals.numpy() - match_labels_np = match_labels.numpy() - - # sample fg - fg_inds = paddle.nonzero(matched_vals >= fg_thresh).flatten() - fg_nums = int(np.minimum(fg_rois_per_im, fg_inds.shape[0])) - if (fg_inds.shape[0] > fg_nums) and use_random: - fg_inds = libra_sample_pos(matched_vals_np, match_labels_np, - fg_inds.numpy(), fg_rois_per_im) - fg_inds = fg_inds[:fg_nums] - - # sample bg - bg_inds = paddle.nonzero(matched_vals < bg_thresh).flatten() - bg_nums = int(np.minimum(rois_per_image - fg_nums, bg_inds.shape[0])) - if (bg_inds.shape[0] > bg_nums) and use_random: - bg_inds = libra_sample_neg( - matched_vals_np, - match_labels_np, - bg_inds.numpy(), - bg_rois_per_im, - num_bins=num_bins, - bg_thresh=bg_thresh) - bg_inds = bg_inds[:bg_nums] - - sampled_inds = paddle.concat([fg_inds, bg_inds]) - - gt_classes = paddle.gather(gt_classes, matches) - gt_classes = paddle.where(match_labels == 0, - paddle.ones_like(gt_classes) * num_classes, - gt_classes) - gt_classes = paddle.where(match_labels == -1, - paddle.ones_like(gt_classes) * -1, gt_classes) - sampled_gt_classes = paddle.gather(gt_classes, sampled_inds) - - return sampled_inds, sampled_gt_classes - - -def libra_generate_proposal_target(rpn_rois, - gt_classes, - gt_boxes, - batch_size_per_im, - fg_fraction, - fg_thresh, - bg_thresh, - num_classes, - use_random=True, - is_cascade_rcnn=False, - max_overlaps=None, - num_bins=3): - - rois_with_gt = [] - tgt_labels = [] - tgt_bboxes = [] - sampled_max_overlaps = [] - tgt_gt_inds = [] - new_rois_num = [] - - for i, rpn_roi 
in enumerate(rpn_rois): - max_overlap = max_overlaps[i] if is_cascade_rcnn else None - gt_bbox = gt_boxes[i] - gt_class = paddle.squeeze(gt_classes[i], axis=-1) - if is_cascade_rcnn: - rpn_roi = filter_roi(rpn_roi, max_overlap) - bbox = paddle.concat([rpn_roi, gt_bbox]) - - # Step1: label bbox - matches, match_labels, matched_vals = libra_label_box( - bbox, gt_bbox, gt_class, fg_thresh, bg_thresh, num_classes) - - # Step2: sample bbox - sampled_inds, sampled_gt_classes = libra_sample_bbox( - matches, match_labels, matched_vals, gt_class, batch_size_per_im, - num_classes, fg_fraction, fg_thresh, bg_thresh, num_bins, - use_random, is_cascade_rcnn) - - # Step3: make output - rois_per_image = paddle.gather(bbox, sampled_inds) - sampled_gt_ind = paddle.gather(matches, sampled_inds) - sampled_bbox = paddle.gather(gt_bbox, sampled_gt_ind) - sampled_overlap = paddle.gather(matched_vals, sampled_inds) - - rois_per_image.stop_gradient = True - sampled_gt_ind.stop_gradient = True - sampled_bbox.stop_gradient = True - sampled_overlap.stop_gradient = True - - tgt_labels.append(sampled_gt_classes) - tgt_bboxes.append(sampled_bbox) - rois_with_gt.append(rois_per_image) - sampled_max_overlaps.append(sampled_overlap) - tgt_gt_inds.append(sampled_gt_ind) - new_rois_num.append(paddle.shape(sampled_inds)[0:1]) - new_rois_num = paddle.concat(new_rois_num) - # rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num - return rois_with_gt, tgt_labels, tgt_bboxes, tgt_gt_inds, new_rois_num diff --git a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target_layer.py b/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target_layer.py deleted file mode 100644 index c010c81..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/proposal_generator/target_layer.py +++ /dev/null @@ -1,481 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import sys -import paddle -from ppdet.core.workspace import register, serializable - -from .target import rpn_anchor_target, generate_proposal_target, generate_mask_target, libra_generate_proposal_target -import numpy as np - - -@register -@serializable -class RPNTargetAssign(object): - __shared__ = ['assign_on_cpu'] - """ - RPN targets assignment module - - The assignment consists of three steps: - 1. Match anchor and ground-truth box, label the anchor with foreground - or background sample - 2. Sample anchors to keep the properly ratio between foreground and - background - 3. Generate the targets for classification and regression branch - - - Args: - batch_size_per_im (int): Total number of RPN samples per image. - default 256 - fg_fraction (float): Fraction of anchors that is labeled - foreground, default 0.5 - positive_overlap (float): Minimum overlap required between an anchor - and ground-truth box for the (anchor, gt box) pair to be - a foreground sample. 
default 0.7 - negative_overlap (float): Maximum overlap allowed between an anchor - and ground-truth box for the (anchor, gt box) pair to be - a background sample. default 0.3 - ignore_thresh(float): Threshold for ignoring the is_crowd ground-truth - if the value is larger than zero. - use_random (bool): Use random sampling to choose foreground and - background boxes, default true. - assign_on_cpu (bool): In case the number of gt box is too large, - compute IoU on CPU, default false. - """ - - def __init__(self, - batch_size_per_im=256, - fg_fraction=0.5, - positive_overlap=0.7, - negative_overlap=0.3, - ignore_thresh=-1., - use_random=True, - assign_on_cpu=False): - super(RPNTargetAssign, self).__init__() - self.batch_size_per_im = batch_size_per_im - self.fg_fraction = fg_fraction - self.positive_overlap = positive_overlap - self.negative_overlap = negative_overlap - self.ignore_thresh = ignore_thresh - self.use_random = use_random - self.assign_on_cpu = assign_on_cpu - - def __call__(self, inputs, anchors): - """ - inputs: ground-truth instances. - anchor_box (Tensor): [num_anchors, 4], num_anchors are all anchors in all feature maps. - """ - gt_boxes = inputs['gt_bbox'] - is_crowd = inputs.get('is_crowd', None) - batch_size = len(gt_boxes) - tgt_labels, tgt_bboxes, tgt_deltas = rpn_anchor_target( - anchors, - gt_boxes, - self.batch_size_per_im, - self.positive_overlap, - self.negative_overlap, - self.fg_fraction, - self.use_random, - batch_size, - self.ignore_thresh, - is_crowd, - assign_on_cpu=self.assign_on_cpu) - norm = self.batch_size_per_im * batch_size - - return tgt_labels, tgt_bboxes, tgt_deltas, norm - - -@register -class BBoxAssigner(object): - __shared__ = ['num_classes', 'assign_on_cpu'] - """ - RCNN targets assignment module - - The assignment consists of three steps: - 1. Match RoIs and ground-truth box, label the RoIs with foreground - or background sample - 2. Sample anchors to keep the properly ratio between foreground and - background - 3. Generate the targets for classification and regression branch - - Args: - batch_size_per_im (int): Total number of RoIs per image. - default 512 - fg_fraction (float): Fraction of RoIs that is labeled - foreground, default 0.25 - fg_thresh (float): Minimum overlap required between a RoI - and ground-truth box for the (roi, gt box) pair to be - a foreground sample. default 0.5 - bg_thresh (float): Maximum overlap allowed between a RoI - and ground-truth box for the (roi, gt box) pair to be - a background sample. default 0.5 - ignore_thresh(float): Threshold for ignoring the is_crowd ground-truth - if the value is larger than zero. - use_random (bool): Use random sampling to choose foreground and - background boxes, default true - cascade_iou (list[iou]): The list of overlap to select foreground and - background of each stage, which is only used In Cascade RCNN. - num_classes (int): The number of class. - assign_on_cpu (bool): In case the number of gt box is too large, - compute IoU on CPU, default false. 
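Back-of-envelope arithmetic for the RoI sampling that `BBoxAssigner` configures (it delegates to `subsample_labels()`, shown earlier in `target.py`): the defaults cap foreground at `fg_fraction * batch_size_per_im` per image and fill the rest of the budget with background. The per-image counts below are invented for the demo.

```python
batch_size_per_im, fg_fraction = 512, 0.25
available_fg, available_bg = 37, 5000  # hypothetical per-image RoI counts

fg_num = min(available_fg, int(batch_size_per_im * fg_fraction))  # <= 128
bg_num = min(available_bg, batch_size_per_im - fg_num)
print(fg_num, bg_num)  # -> 37 475: scarce foreground is topped up with background
```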
- """ - - def __init__(self, - batch_size_per_im=512, - fg_fraction=.25, - fg_thresh=.5, - bg_thresh=.5, - ignore_thresh=-1., - use_random=True, - cascade_iou=[0.5, 0.6, 0.7], - num_classes=80, - assign_on_cpu=False): - super(BBoxAssigner, self).__init__() - self.batch_size_per_im = batch_size_per_im - self.fg_fraction = fg_fraction - self.fg_thresh = fg_thresh - self.bg_thresh = bg_thresh - self.ignore_thresh = ignore_thresh - self.use_random = use_random - self.cascade_iou = cascade_iou - self.num_classes = num_classes - self.assign_on_cpu = assign_on_cpu - - def __call__(self, - rpn_rois, - rpn_rois_num, - inputs, - stage=0, - is_cascade=False, - add_gt_as_proposals=True): - gt_classes = inputs['gt_class'] - gt_boxes = inputs['gt_bbox'] - is_crowd = inputs.get('is_crowd', None) - # rois, tgt_labels, tgt_bboxes, tgt_gt_inds - # new_rois_num - outs = generate_proposal_target( - rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, - self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, - self.ignore_thresh, is_crowd, self.use_random, is_cascade, - self.cascade_iou[stage], self.assign_on_cpu, add_gt_as_proposals) - rois = outs[0] - rois_num = outs[-1] - # tgt_labels, tgt_bboxes, tgt_gt_inds - targets = outs[1:4] - return rois, rois_num, targets - - -@register -class BBoxLibraAssigner(object): - __shared__ = ['num_classes'] - """ - Libra-RCNN targets assignment module - - The assignment consists of three steps: - 1. Match RoIs and ground-truth box, label the RoIs with foreground - or background sample - 2. Sample anchors to keep the properly ratio between foreground and - background - 3. Generate the targets for classification and regression branch - - Args: - batch_size_per_im (int): Total number of RoIs per image. - default 512 - fg_fraction (float): Fraction of RoIs that is labeled - foreground, default 0.25 - fg_thresh (float): Minimum overlap required between a RoI - and ground-truth box for the (roi, gt box) pair to be - a foreground sample. default 0.5 - bg_thresh (float): Maximum overlap allowed between a RoI - and ground-truth box for the (roi, gt box) pair to be - a background sample. default 0.5 - use_random (bool): Use random sampling to choose foreground and - background boxes, default true - cascade_iou (list[iou]): The list of overlap to select foreground and - background of each stage, which is only used In Cascade RCNN. - num_classes (int): The number of class. - num_bins (int): The number of libra_sample. 
- """ - - def __init__(self, - batch_size_per_im=512, - fg_fraction=.25, - fg_thresh=.5, - bg_thresh=.5, - use_random=True, - cascade_iou=[0.5, 0.6, 0.7], - num_classes=80, - num_bins=3): - super(BBoxLibraAssigner, self).__init__() - self.batch_size_per_im = batch_size_per_im - self.fg_fraction = fg_fraction - self.fg_thresh = fg_thresh - self.bg_thresh = bg_thresh - self.use_random = use_random - self.cascade_iou = cascade_iou - self.num_classes = num_classes - self.num_bins = num_bins - - def __call__(self, - rpn_rois, - rpn_rois_num, - inputs, - stage=0, - is_cascade=False): - gt_classes = inputs['gt_class'] - gt_boxes = inputs['gt_bbox'] - # rois, tgt_labels, tgt_bboxes, tgt_gt_inds - outs = libra_generate_proposal_target( - rpn_rois, gt_classes, gt_boxes, self.batch_size_per_im, - self.fg_fraction, self.fg_thresh, self.bg_thresh, self.num_classes, - self.use_random, is_cascade, self.cascade_iou[stage], self.num_bins) - rois = outs[0] - rois_num = outs[-1] - # tgt_labels, tgt_bboxes, tgt_gt_inds - targets = outs[1:4] - return rois, rois_num, targets - - -@register -@serializable -class MaskAssigner(object): - __shared__ = ['num_classes', 'mask_resolution'] - """ - Mask targets assignment module - - The assignment consists of three steps: - 1. Select RoIs labels with foreground. - 2. Encode the RoIs and corresponding gt polygons to generate - mask target - - Args: - num_classes (int): The number of class - mask_resolution (int): The resolution of mask target, default 14 - """ - - def __init__(self, num_classes=80, mask_resolution=14): - super(MaskAssigner, self).__init__() - self.num_classes = num_classes - self.mask_resolution = mask_resolution - - def __call__(self, rois, tgt_labels, tgt_gt_inds, inputs): - gt_segms = inputs['gt_poly'] - - outs = generate_mask_target(gt_segms, rois, tgt_labels, tgt_gt_inds, - self.num_classes, self.mask_resolution) - - # mask_rois, mask_rois_num, tgt_classes, tgt_masks, mask_index, tgt_weights - return outs - - -@register -class RBoxAssigner(object): - """ - assigner of rbox - Args: - pos_iou_thr (float): threshold of pos samples - neg_iou_thr (float): threshold of neg samples - min_iou_thr (float): the min threshold of samples - ignore_iof_thr (int): the ignored threshold - """ - - def __init__(self, - pos_iou_thr=0.5, - neg_iou_thr=0.4, - min_iou_thr=0.0, - ignore_iof_thr=-2): - super(RBoxAssigner, self).__init__() - - self.pos_iou_thr = pos_iou_thr - self.neg_iou_thr = neg_iou_thr - self.min_iou_thr = min_iou_thr - self.ignore_iof_thr = ignore_iof_thr - - def anchor_valid(self, anchors): - """ - - Args: - anchor: M x 4 - - Returns: - - """ - if anchors.ndim == 3: - anchors = anchors.reshape(-1, anchors.shape[-1]) - assert anchors.ndim == 2 - anchor_num = anchors.shape[0] - anchor_valid = np.ones((anchor_num), np.int32) - anchor_inds = np.arange(anchor_num) - return anchor_inds - - def rbox2delta(self, - proposals, - gt, - means=[0, 0, 0, 0, 0], - stds=[1, 1, 1, 1, 1]): - """ - Args: - proposals: tensor [N, 5] - gt: gt [N, 5] - means: means [5] - stds: stds [5] - Returns: - - """ - proposals = proposals.astype(np.float64) - - PI = np.pi - - gt_widths = gt[..., 2] - gt_heights = gt[..., 3] - gt_angle = gt[..., 4] - - proposals_widths = proposals[..., 2] - proposals_heights = proposals[..., 3] - proposals_angle = proposals[..., 4] - - coord = gt[..., 0:2] - proposals[..., 0:2] - dx = (np.cos(proposals[..., 4]) * coord[..., 0] + - np.sin(proposals[..., 4]) * coord[..., 1]) / proposals_widths - dy = (-np.sin(proposals[..., 4]) * coord[..., 0] + - 
np.cos(proposals[..., 4]) * coord[..., 1]) / proposals_heights - dw = np.log(gt_widths / proposals_widths) - dh = np.log(gt_heights / proposals_heights) - da = (gt_angle - proposals_angle) - - da = (da + PI / 4) % PI - PI / 4 - da /= PI - - deltas = np.stack([dx, dy, dw, dh, da], axis=-1) - means = np.array(means, dtype=deltas.dtype) - stds = np.array(stds, dtype=deltas.dtype) - deltas = (deltas - means) / stds - deltas = deltas.astype(np.float32) - return deltas - - def assign_anchor(self, - anchors, - gt_bboxes, - gt_labels, - pos_iou_thr, - neg_iou_thr, - min_iou_thr=0.0, - ignore_iof_thr=-2): - assert anchors.shape[1] == 4 or anchors.shape[1] == 5 - assert gt_bboxes.shape[1] == 4 or gt_bboxes.shape[1] == 5 - anchors_xc_yc = anchors - gt_bboxes_xc_yc = gt_bboxes - - # calc rbox iou - anchors_xc_yc = anchors_xc_yc.astype(np.float32) - gt_bboxes_xc_yc = gt_bboxes_xc_yc.astype(np.float32) - anchors_xc_yc = paddle.to_tensor(anchors_xc_yc) - gt_bboxes_xc_yc = paddle.to_tensor(gt_bboxes_xc_yc) - - try: - from ext_op import rbox_iou - except Exception as e: - print("import custom_ops error, try install ext_op " \ - "following ppdet/ext_op/README.md", e) - sys.stdout.flush() - sys.exit(-1) - - iou = rbox_iou(gt_bboxes_xc_yc, anchors_xc_yc) - iou = iou.numpy() - iou = iou.T - - # every gt's anchor's index - gt_bbox_anchor_inds = iou.argmax(axis=0) - gt_bbox_anchor_iou = iou[gt_bbox_anchor_inds, np.arange(iou.shape[1])] - gt_bbox_anchor_iou_inds = np.where(iou == gt_bbox_anchor_iou)[0] - - # every anchor's gt bbox's index - anchor_gt_bbox_inds = iou.argmax(axis=1) - anchor_gt_bbox_iou = iou[np.arange(iou.shape[0]), anchor_gt_bbox_inds] - - # (1) set labels=-2 as default - labels = np.ones((iou.shape[0], ), dtype=np.int32) * ignore_iof_thr - - # (2) assign ignore - labels[anchor_gt_bbox_iou < min_iou_thr] = ignore_iof_thr - - # (3) assign neg_ids -1 - assign_neg_ids1 = anchor_gt_bbox_iou >= min_iou_thr - assign_neg_ids2 = anchor_gt_bbox_iou < neg_iou_thr - assign_neg_ids = np.logical_and(assign_neg_ids1, assign_neg_ids2) - labels[assign_neg_ids] = -1 - - # anchor_gt_bbox_iou_inds - # (4) assign max_iou as pos_ids >=0 - anchor_gt_bbox_iou_inds = anchor_gt_bbox_inds[gt_bbox_anchor_iou_inds] - # gt_bbox_anchor_iou_inds = np.logical_and(gt_bbox_anchor_iou_inds, anchor_gt_bbox_iou >= min_iou_thr) - labels[gt_bbox_anchor_iou_inds] = gt_labels[anchor_gt_bbox_iou_inds] - - # (5) assign >= pos_iou_thr as pos_ids - iou_pos_iou_thr_ids = anchor_gt_bbox_iou >= pos_iou_thr - iou_pos_iou_thr_ids_box_inds = anchor_gt_bbox_inds[iou_pos_iou_thr_ids] - labels[iou_pos_iou_thr_ids] = gt_labels[iou_pos_iou_thr_ids_box_inds] - return anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels - - def __call__(self, anchors, gt_bboxes, gt_labels, is_crowd): - - assert anchors.ndim == 2 - assert anchors.shape[1] == 5 - assert gt_bboxes.ndim == 2 - assert gt_bboxes.shape[1] == 5 - - pos_iou_thr = self.pos_iou_thr - neg_iou_thr = self.neg_iou_thr - min_iou_thr = self.min_iou_thr - ignore_iof_thr = self.ignore_iof_thr - - anchor_num = anchors.shape[0] - - gt_bboxes = gt_bboxes - is_crowd_slice = is_crowd - not_crowd_inds = np.where(is_crowd_slice == 0) - - # Step1: match anchor and gt_bbox - anchor_gt_bbox_inds, anchor_gt_bbox_iou, labels = self.assign_anchor( - anchors, gt_bboxes, - gt_labels.reshape(-1), pos_iou_thr, neg_iou_thr, min_iou_thr, - ignore_iof_thr) - - # Step2: sample anchor - pos_inds = np.where(labels >= 0)[0] - neg_inds = np.where(labels == -1)[0] - - # Step3: make output - anchors_num = anchors.shape[0] - bbox_targets 
= np.zeros_like(anchors) - bbox_weights = np.zeros_like(anchors) - bbox_gt_bboxes = np.zeros_like(anchors) - pos_labels = np.zeros(anchors_num, dtype=np.int32) - pos_labels_weights = np.zeros(anchors_num, dtype=np.float32) - - pos_sampled_anchors = anchors[pos_inds] - pos_sampled_gt_boxes = gt_bboxes[anchor_gt_bbox_inds[pos_inds]] - if len(pos_inds) > 0: - pos_bbox_targets = self.rbox2delta(pos_sampled_anchors, - pos_sampled_gt_boxes) - bbox_targets[pos_inds, :] = pos_bbox_targets - bbox_gt_bboxes[pos_inds, :] = pos_sampled_gt_boxes - bbox_weights[pos_inds, :] = 1.0 - - pos_labels[pos_inds] = labels[pos_inds] - pos_labels_weights[pos_inds] = 1.0 - - if len(neg_inds) > 0: - pos_labels_weights[neg_inds] = 1.0 - return (pos_labels, pos_labels_weights, bbox_targets, bbox_weights, - bbox_gt_bboxes, pos_inds, neg_inds) diff --git a/pdfdet/models/Paddle/ppdet/modeling/rbox_utils.py b/pdfdet/models/Paddle/ppdet/modeling/rbox_utils.py deleted file mode 100644 index a5f19a2..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/rbox_utils.py +++ /dev/null @@ -1,295 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -import paddle -import numpy as np -import cv2 - - -def norm_angle(angle, range=[-np.pi / 4, np.pi]): - return (angle - range[0]) % range[1] + range[0] - - -# rbox function implemented using numpy -def poly2rbox_le135_np(poly): - """convert poly to rbox [-pi / 4, 3 * pi / 4] - - Args: - poly: [x1, y1, x2, y2, x3, y3, x4, y4] - - Returns: - rbox: [cx, cy, w, h, angle] - """ - poly = np.array(poly[:8], dtype=np.float32) - - pt1 = (poly[0], poly[1]) - pt2 = (poly[2], poly[3]) - pt3 = (poly[4], poly[5]) - pt4 = (poly[6], poly[7]) - - edge1 = np.sqrt((pt1[0] - pt2[0]) * (pt1[0] - pt2[0]) + (pt1[1] - pt2[1]) * - (pt1[1] - pt2[1])) - edge2 = np.sqrt((pt2[0] - pt3[0]) * (pt2[0] - pt3[0]) + (pt2[1] - pt3[1]) * - (pt2[1] - pt3[1])) - - width = max(edge1, edge2) - height = min(edge1, edge2) - - rbox_angle = 0 - if edge1 > edge2: - rbox_angle = np.arctan2(float(pt2[1] - pt1[1]), float(pt2[0] - pt1[0])) - elif edge2 >= edge1: - rbox_angle = np.arctan2(float(pt4[1] - pt1[1]), float(pt4[0] - pt1[0])) - - rbox_angle = norm_angle(rbox_angle) - - x_ctr = float(pt1[0] + pt3[0]) / 2 - y_ctr = float(pt1[1] + pt3[1]) / 2 - return [x_ctr, y_ctr, width, height, rbox_angle] - - -def poly2rbox_oc_np(poly): - """convert poly to rbox (0, pi / 2] - - Args: - poly: [x1, y1, x2, y2, x3, y3, x4, y4] - - Returns: - rbox: [cx, cy, w, h, angle] - """ - points = np.array(poly, dtype=np.float32).reshape((-1, 2)) - (cx, cy), (w, h), angle = cv2.minAreaRect(points) - # using the new OpenCV Rotated BBox definition since 4.5.1 - # if angle < 0, opencv is older than 4.5.1, angle is in [-90, 0) - if angle < 0: - angle += 90 - w, h = h, w - - # convert angle to [0, 90) - if angle == -0.0: - angle = 0.0 - if angle == 90.0: - angle = 0.0 - w, h = h, w - - angle = angle / 180 * np.pi - return [cx, cy, w, h, angle] - - -def poly2rbox_np(polys, rbox_type='oc'): 
- """ - polys: [x0,y0,x1,y1,x2,y2,x3,y3] - to - rboxes: [x_ctr,y_ctr,w,h,angle] - """ - assert rbox_type in ['oc', 'le135'], 'only oc or le135 is supported now' - poly2rbox_fn = poly2rbox_oc_np if rbox_type == 'oc' else poly2rbox_le135_np - rboxes = [] - for poly in polys: - x, y, w, h, angle = poly2rbox_fn(poly) - rbox = np.array([x, y, w, h, angle], dtype=np.float32) - rboxes.append(rbox) - - return np.array(rboxes) - - -def cal_line_length(point1, point2): - return math.sqrt( - math.pow(point1[0] - point2[0], 2) + math.pow(point1[1] - point2[1], 2)) - - -def get_best_begin_point_single(coordinate): - x1, y1, x2, y2, x3, y3, x4, y4 = coordinate - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - xmax = max(x1, x2, x3, x4) - ymax = max(y1, y2, y3, y4) - combinate = [[[x1, y1], [x2, y2], [x3, y3], [x4, y4]], - [[x4, y4], [x1, y1], [x2, y2], [x3, y3]], - [[x3, y3], [x4, y4], [x1, y1], [x2, y2]], - [[x2, y2], [x3, y3], [x4, y4], [x1, y1]]] - dst_coordinate = [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]] - force = 100000000.0 - force_flag = 0 - for i in range(4): - temp_force = cal_line_length(combinate[i][0], dst_coordinate[0]) \ - + cal_line_length(combinate[i][1], dst_coordinate[1]) \ - + cal_line_length(combinate[i][2], dst_coordinate[2]) \ - + cal_line_length(combinate[i][3], dst_coordinate[3]) - if temp_force < force: - force = temp_force - force_flag = i - if force_flag != 0: - pass - return np.array(combinate[force_flag]).reshape(8) - - -def rbox2poly_np(rboxes): - """ - rboxes:[x_ctr,y_ctr,w,h,angle] - to - poly:[x0,y0,x1,y1,x2,y2,x3,y3] - """ - polys = [] - for i in range(len(rboxes)): - x_ctr, y_ctr, width, height, angle = rboxes[i][:5] - tl_x, tl_y, br_x, br_y = -width / 2, -height / 2, width / 2, height / 2 - rect = np.array([[tl_x, br_x, br_x, tl_x], [tl_y, tl_y, br_y, br_y]]) - R = np.array([[np.cos(angle), -np.sin(angle)], - [np.sin(angle), np.cos(angle)]]) - poly = R.dot(rect) - x0, x1, x2, x3 = poly[0, :4] + x_ctr - y0, y1, y2, y3 = poly[1, :4] + y_ctr - poly = np.array([x0, y0, x1, y1, x2, y2, x3, y3], dtype=np.float32) - poly = get_best_begin_point_single(poly) - polys.append(poly) - polys = np.array(polys) - return polys - - -# rbox function implemented using paddle -def box2corners(box): - """convert box coordinate to corners - Args: - box (Tensor): (B, N, 5) with (x, y, w, h, alpha) angle is in [0, 90) - Returns: - corners (Tensor): (B, N, 4, 2) with (x1, y1, x2, y2, x3, y3, x4, y4) - """ - B = box.shape[0] - x, y, w, h, alpha = paddle.split(box, 5, axis=-1) - x4 = paddle.to_tensor( - [0.5, 0.5, -0.5, -0.5], dtype=paddle.float32).reshape( - (1, 1, 4)) # (1,1,4) - x4 = x4 * w # (B, N, 4) - y4 = paddle.to_tensor( - [-0.5, 0.5, 0.5, -0.5], dtype=paddle.float32).reshape((1, 1, 4)) - y4 = y4 * h # (B, N, 4) - corners = paddle.stack([x4, y4], axis=-1) # (B, N, 4, 2) - sin = paddle.sin(alpha) - cos = paddle.cos(alpha) - row1 = paddle.concat([cos, sin], axis=-1) - row2 = paddle.concat([-sin, cos], axis=-1) # (B, N, 2) - rot_T = paddle.stack([row1, row2], axis=-2) # (B, N, 2, 2) - rotated = paddle.bmm(corners.reshape([-1, 4, 2]), rot_T.reshape([-1, 2, 2])) - rotated = rotated.reshape([B, -1, 4, 2]) # (B*N, 4, 2) -> (B, N, 4, 2) - rotated[..., 0] += x - rotated[..., 1] += y - return rotated - - -def paddle_gather(x, dim, index): - index_shape = index.shape - index_flatten = index.flatten() - if dim < 0: - dim = len(x.shape) + dim - nd_index = [] - for k in range(len(x.shape)): - if k == dim: - nd_index.append(index_flatten) - else: - reshape_shape = [1] * 
len(x.shape) - reshape_shape[k] = x.shape[k] - x_arange = paddle.arange(x.shape[k], dtype=index.dtype) - x_arange = x_arange.reshape(reshape_shape) - dim_index = paddle.expand(x_arange, index_shape).flatten() - nd_index.append(dim_index) - ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64") - paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape) - return paddle_out - - -def check_points_in_polys(points, polys): - """Check whether point is in rotated boxes - Args: - points (tensor): (1, L, 2) anchor points - polys (tensor): [B, N, 4, 2] gt_polys - eps (float): default 1e-9 - Returns: - is_in_polys (tensor): (B, N, L) - """ - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - # [B, N, 4, 2] -> [B, N, 1, 2] - a, b, c, d = polys.split(4, axis=2) - ab = b - a - ad = d - a - # [B, N, L, 2] - ap = points - a - # [B, N, 1] - norm_ab = paddle.sum(ab * ab, axis=-1) - # [B, N, 1] - norm_ad = paddle.sum(ad * ad, axis=-1) - # [B, N, L] dot product - ap_dot_ab = paddle.sum(ap * ab, axis=-1) - # [B, N, L] dot product - ap_dot_ad = paddle.sum(ap * ad, axis=-1) - # [B, N, L] = |A|*|B|*cos(theta) - is_in_polys = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & ( - ap_dot_ad >= 0) & (ap_dot_ad <= norm_ad) - return is_in_polys - - -def check_points_in_rotated_boxes(points, boxes): - """Check whether point is in rotated boxes - - Args: - points (tensor): (1, L, 2) anchor points - boxes (tensor): [B, N, 5] gt_bboxes - eps (float): default 1e-9 - - Returns: - is_in_box (tensor): (B, N, L) - - """ - # [B, N, 5] -> [B, N, 4, 2] - corners = box2corners(boxes) - # [1, L, 2] -> [1, 1, L, 2] - points = points.unsqueeze(0) - # [B, N, 4, 2] -> [B, N, 1, 2] - a, b, c, d = corners.split(4, axis=2) - ab = b - a - ad = d - a - # [B, N, L, 2] - ap = points - a - # [B, N, L] - norm_ab = paddle.sum(ab * ab, axis=-1) - # [B, N, L] - norm_ad = paddle.sum(ad * ad, axis=-1) - # [B, N, L] dot product - ap_dot_ab = paddle.sum(ap * ab, axis=-1) - # [B, N, L] dot product - ap_dot_ad = paddle.sum(ap * ad, axis=-1) - # [B, N, L] = |A|*|B|*cos(theta) - is_in_box = (ap_dot_ab >= 0) & (ap_dot_ab <= norm_ab) & (ap_dot_ad >= 0) & ( - ap_dot_ad <= norm_ad) - return is_in_box - - -def rotated_iou_similarity(box1, box2, eps=1e-9, func=''): - """Calculate iou of box1 and box2 - - Args: - box1 (Tensor): box with the shape [N, M1, 5] - box2 (Tensor): box with the shape [N, M2, 5] - - Return: - iou (Tensor): iou between box1 and box2 with the shape [N, M1, M2] - """ - from ext_op import rbox_iou - rotated_ious = [] - for b1, b2 in zip(box1, box2): - rotated_ious.append(rbox_iou(b1, b2)) - - return paddle.stack(rotated_ious, axis=0) diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/reid/__init__.py deleted file mode 100644 index 3c176d7..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . 
import jde_embedding_head -from . import fairmot_embedding_head -from . import resnet -from . import pyramidal_embedding -from . import pplcnet_embedding -from . import resnet_embedding - -from .fairmot_embedding_head import * -from .jde_embedding_head import * -from .resnet import * -from .pyramidal_embedding import * -from .pplcnet_embedding import * -from .resnet_embedding import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/fairmot_embedding_head.py b/pdfdet/models/Paddle/ppdet/modeling/reid/fairmot_embedding_head.py deleted file mode 100644 index 98ca257..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/fairmot_embedding_head.py +++ /dev/null @@ -1,224 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import KaimingUniform, Uniform -from ppdet.core.workspace import register -from ppdet.modeling.heads.centernet_head import ConvLayer - -__all__ = ['FairMOTEmbeddingHead'] - - -@register -class FairMOTEmbeddingHead(nn.Layer): - __shared__ = ['num_classes'] - """ - Args: - in_channels (int): the channel number of input to FairMOTEmbeddingHead. - ch_head (int): the channel of features before being fed into the embedding, 256 by default. - ch_emb (int): the channel of the embedding feature, 128 by default. - num_identities_dict (dict): the number of identities of each category, - supports single class and multi-class, {0: 14455} as default.
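At inference time the head below turns the dense reid feature map into one embedding per detection: L2-normalize along channels, flatten the spatial grid, then gather rows at the detected center indices. A condensed sketch of that gather (shapes and indices are made up for illustration):

import paddle
import paddle.nn.functional as F

bs, ch_emb, h, w = 1, 128, 4, 4
reid_feat = paddle.randn([bs, ch_emb, h, w])
bbox_inds = paddle.to_tensor([3, 9])          # flat h*w indices of two detections

emb = F.normalize(reid_feat)                  # L2-normalize along the channel axis
emb = paddle.transpose(emb, [0, 2, 3, 1])     # [bs, h, w, ch_emb]
emb = paddle.reshape(emb, [-1, ch_emb])       # [bs*h*w, ch_emb]
pred_embs = paddle.gather(emb, bbox_inds)     # one 128-d vector per detection
print(pred_embs.shape)                        # [2, 128]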
- """ - - def __init__(self, - in_channels, - ch_head=256, - ch_emb=128, - num_classes=1, - num_identities_dict={0: 14455}): - super(FairMOTEmbeddingHead, self).__init__() - assert num_classes >= 1 - self.num_classes = num_classes - self.ch_emb = ch_emb - self.num_identities_dict = num_identities_dict - self.reid = nn.Sequential( - ConvLayer( - in_channels, ch_head, kernel_size=3, padding=1, bias=True), - nn.ReLU(), - ConvLayer( - ch_head, ch_emb, kernel_size=1, stride=1, padding=0, bias=True)) - param_attr = paddle.ParamAttr(initializer=KaimingUniform()) - bound = 1 / math.sqrt(ch_emb) - bias_attr = paddle.ParamAttr(initializer=Uniform(-bound, bound)) - self.reid_loss = nn.CrossEntropyLoss(ignore_index=-1, reduction='sum') - - if num_classes == 1: - nID = self.num_identities_dict[0] # single class - self.classifier = nn.Linear( - ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) - # When num_identities(nID) is 1, emb_scale is set as 1 - self.emb_scale = math.sqrt(2) * math.log(nID - 1) if nID > 1 else 1 - else: - self.classifiers = dict() - self.emb_scale_dict = dict() - for cls_id, nID in self.num_identities_dict.items(): - self.classifiers[str(cls_id)] = nn.Linear( - ch_emb, nID, weight_attr=param_attr, bias_attr=bias_attr) - # When num_identities(nID) is 1, emb_scale is set as 1 - self.emb_scale_dict[str(cls_id)] = math.sqrt(2) * math.log( - nID - 1) if nID > 1 else 1 - - @classmethod - def from_config(cls, cfg, input_shape): - if isinstance(input_shape, (list, tuple)): - input_shape = input_shape[0] - return {'in_channels': input_shape.channels} - - def process_by_class(self, bboxes, embedding, bbox_inds, topk_clses): - pred_dets, pred_embs = [], [] - for cls_id in range(self.num_classes): - inds_masks = topk_clses == cls_id - inds_masks = paddle.cast(inds_masks, 'float32') - - pos_num = inds_masks.sum().numpy() - if pos_num == 0: - continue - - cls_inds_mask = inds_masks > 0 - - bbox_mask = paddle.nonzero(cls_inds_mask) - cls_bboxes = paddle.gather_nd(bboxes, bbox_mask) - pred_dets.append(cls_bboxes) - - cls_inds = paddle.masked_select(bbox_inds, cls_inds_mask) - cls_inds = cls_inds.unsqueeze(-1) - cls_embedding = paddle.gather_nd(embedding, cls_inds) - pred_embs.append(cls_embedding) - - return paddle.concat(pred_dets), paddle.concat(pred_embs) - - def forward(self, - neck_feat, - inputs, - bboxes=None, - bbox_inds=None, - topk_clses=None): - reid_feat = self.reid(neck_feat) - if self.training: - if self.num_classes == 1: - loss = self.get_loss(reid_feat, inputs) - else: - loss = self.get_mc_loss(reid_feat, inputs) - return loss - else: - assert bboxes is not None and bbox_inds is not None - reid_feat = F.normalize(reid_feat) - embedding = paddle.transpose(reid_feat, [0, 2, 3, 1]) - embedding = paddle.reshape(embedding, [-1, self.ch_emb]) - # embedding shape: [bs * h * w, ch_emb] - - if self.num_classes == 1: - pred_dets = bboxes - pred_embs = paddle.gather(embedding, bbox_inds) - else: - pred_dets, pred_embs = self.process_by_class( - bboxes, embedding, bbox_inds, topk_clses) - return pred_dets, pred_embs - - def get_loss(self, feat, inputs): - index = inputs['index'] - mask = inputs['index_mask'] - target = inputs['reid'] - target = paddle.masked_select(target, mask > 0) - target = paddle.unsqueeze(target, 1) - - feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) - feat_n, feat_h, feat_w, feat_c = feat.shape - feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) - index = paddle.unsqueeze(index, 2) - batch_inds = list() - for i in range(feat_n): - batch_ind = paddle.full( 
- shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - feat = paddle.gather_nd(feat, index=index) - - mask = paddle.unsqueeze(mask, axis=2) - mask = paddle.expand_as(mask, feat) - mask.stop_gradient = True - feat = paddle.masked_select(feat, mask > 0) - feat = paddle.reshape(feat, shape=[-1, feat_c]) - feat = F.normalize(feat) - feat = self.emb_scale * feat - logit = self.classifier(feat) - target.stop_gradient = True - loss = self.reid_loss(logit, target) - valid = (target != self.reid_loss.ignore_index) - valid.stop_gradient = True - count = paddle.sum((paddle.cast(valid, dtype=np.int32))) - count.stop_gradient = True - if count > 0: - loss = loss / count - - return loss - - def get_mc_loss(self, feat, inputs): - # feat.shape = [bs, ch_emb, h, w] - assert 'cls_id_map' in inputs and 'cls_tr_ids' in inputs - index = inputs['index'] - mask = inputs['index_mask'] - cls_id_map = inputs['cls_id_map'] # [bs, h, w] - cls_tr_ids = inputs['cls_tr_ids'] # [bs, num_classes, h, w] - - feat = paddle.transpose(feat, perm=[0, 2, 3, 1]) - feat_n, feat_h, feat_w, feat_c = feat.shape - feat = paddle.reshape(feat, shape=[feat_n, -1, feat_c]) - - index = paddle.unsqueeze(index, 2) - batch_inds = list() - for i in range(feat_n): - batch_ind = paddle.full( - shape=[1, index.shape[1], 1], fill_value=i, dtype='int64') - batch_inds.append(batch_ind) - batch_inds = paddle.concat(batch_inds, axis=0) - index = paddle.concat(x=[batch_inds, index], axis=2) - feat = paddle.gather_nd(feat, index=index) - - mask = paddle.unsqueeze(mask, axis=2) - mask = paddle.expand_as(mask, feat) - mask.stop_gradient = True - feat = paddle.masked_select(feat, mask > 0) - feat = paddle.reshape(feat, shape=[-1, feat_c]) - - reid_losses = 0 - for cls_id, id_num in self.num_identities_dict.items(): - # target - cur_cls_tr_ids = paddle.reshape( - cls_tr_ids[:, cls_id, :, :], shape=[feat_n, -1]) # [bs, h*w] - cls_id_target = paddle.gather_nd(cur_cls_tr_ids, index=index) - mask = inputs['index_mask'] - cls_id_target = paddle.masked_select(cls_id_target, mask > 0) - cls_id_target.stop_gradient = True - - # feat - cls_id_feat = self.emb_scale_dict[str(cls_id)] * F.normalize(feat) - cls_id_pred = self.classifiers[str(cls_id)](cls_id_feat) - - loss = self.reid_loss(cls_id_pred, cls_id_target) - valid = (cls_id_target != self.reid_loss.ignore_index) - valid.stop_gradient = True - count = paddle.sum((paddle.cast(valid, dtype=np.int32))) - count.stop_gradient = True - if count > 0: - loss = loss / count - reid_losses += loss - - return reid_losses diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/jde_embedding_head.py b/pdfdet/models/Paddle/ppdet/modeling/reid/jde_embedding_head.py deleted file mode 100644 index 1d1e60f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/jde_embedding_head.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay -from ppdet.core.workspace import register -from paddle.nn.initializer import Normal, Constant - -__all__ = ['JDEEmbeddingHead'] - - -class LossParam(nn.Layer): - def __init__(self, init_value=0., use_uncertainy=True): - super(LossParam, self).__init__() - self.loss_param = self.create_parameter( - shape=[1], - attr=ParamAttr(initializer=Constant(value=init_value)), - dtype="float32") - - def forward(self, inputs): - out = paddle.exp(-self.loss_param) * inputs + self.loss_param - return out * 0.5 - - -@register -class JDEEmbeddingHead(nn.Layer): - __shared__ = ['num_classes'] - __inject__ = ['emb_loss', 'jde_loss'] - """ - JDEEmbeddingHead - Args: - num_classes (int): Number of classes. Only single-class tracking is supported. - num_identities (int): Number of identities. - anchor_levels (int): Number of anchor levels, same as FPN levels. - anchor_scales (int): Number of anchor scales on each FPN level. - embedding_dim (int): Embedding dimension. Default: 512. - emb_loss (object): Instance of 'JDEEmbeddingLoss' - jde_loss (object): Instance of 'JDELoss' - """ - - def __init__( - self, - num_classes=1, - num_identities=14455, # dataset.num_identities_dict[0] - anchor_levels=3, - anchor_scales=4, - embedding_dim=512, - emb_loss='JDEEmbeddingLoss', - jde_loss='JDELoss'): - super(JDEEmbeddingHead, self).__init__() - self.num_classes = num_classes - self.num_identities = num_identities - self.anchor_levels = anchor_levels - self.anchor_scales = anchor_scales - self.embedding_dim = embedding_dim - self.emb_loss = emb_loss - self.jde_loss = jde_loss - - self.emb_scale = math.sqrt(2) * math.log( - self.num_identities - 1) if self.num_identities > 1 else 1 - - self.identify_outputs = [] - self.loss_params_cls = [] - self.loss_params_reg = [] - self.loss_params_ide = [] - for i in range(self.anchor_levels): - name = 'identify_output.{}'.format(i) - identify_output = self.add_sublayer( - name, - nn.Conv2D( - in_channels=64 * (2**self.anchor_levels) // (2**i), - out_channels=self.embedding_dim, - kernel_size=3, - stride=1, - padding=1, - bias_attr=ParamAttr(regularizer=L2Decay(0.)))) - self.identify_outputs.append(identify_output) - - loss_p_cls = self.add_sublayer('cls.{}'.format(i), LossParam(-4.15)) - self.loss_params_cls.append(loss_p_cls) - loss_p_reg = self.add_sublayer('reg.{}'.format(i), LossParam(-4.85)) - self.loss_params_reg.append(loss_p_reg) - loss_p_ide = self.add_sublayer('ide.{}'.format(i), LossParam(-2.3)) - self.loss_params_ide.append(loss_p_ide) - - self.classifier = self.add_sublayer( - 'classifier', - nn.Linear( - self.embedding_dim, - self.num_identities, - weight_attr=ParamAttr( - learning_rate=1., initializer=Normal( - mean=0.0, std=0.01)), - bias_attr=ParamAttr( - learning_rate=2., regularizer=L2Decay(0.)))) - - def forward(self, - identify_feats, - targets, - loss_confs=None, - loss_boxes=None, - bboxes=None, - boxes_idx=None, - nms_keep_idx=None): - assert self.num_classes == 1, 'JDE only supports single-class MOT.'
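The LossParam layer defined above is the learnable task-uncertainty weighting of Kendall et al. (CVPR 2018): each branch loss L is rescaled to 0.5 * (exp(-s) * L + s) with a trainable log-variance s, which is how the per-level classification, regression, and identification losses are balanced. The formula in isolation:

import paddle

s = paddle.to_tensor(0.0, stop_gradient=False)   # trainable log-variance
loss = paddle.to_tensor(2.0)                     # some branch loss
weighted = 0.5 * (paddle.exp(-s) * loss + s)     # what LossParam.forward computes
weighted.backward()
# Prints 1.0 and -0.5; for a large loss the gradient increases s,
# automatically downweighting that noisy task over training.
print(float(weighted), float(s.grad))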
- assert len(identify_feats) == self.anchor_levels - ide_outs = [] - for feat, ide_head in zip(identify_feats, self.identify_outputs): - ide_outs.append(ide_head(feat)) - - if self.training: - assert len(loss_confs) == len(loss_boxes) == self.anchor_levels - loss_ides = self.emb_loss(ide_outs, targets, self.emb_scale, - self.classifier) - jde_losses = self.jde_loss( - loss_confs, loss_boxes, loss_ides, self.loss_params_cls, - self.loss_params_reg, self.loss_params_ide, targets) - return jde_losses - else: - assert bboxes is not None - assert boxes_idx is not None - assert nms_keep_idx is not None - - emb_outs = self.get_emb_outs(ide_outs) - emb_valid = paddle.gather_nd(emb_outs, boxes_idx) - pred_embs = paddle.gather_nd(emb_valid, nms_keep_idx) - - input_shape = targets['image'].shape[2:] - # input_shape: [h, w], before data transforms, set in model config - im_shape = targets['im_shape'][0].numpy() - # im_shape: [new_h, new_w], after data transforms - scale_factor = targets['scale_factor'][0].numpy() - bboxes[:, 2:] = self.scale_coords(bboxes[:, 2:], input_shape, - im_shape, scale_factor) - # cls_ids, scores, tlwhs - pred_dets = bboxes - return pred_dets, pred_embs - - def scale_coords(self, coords, input_shape, im_shape, scale_factor): - ratio = scale_factor[0] - pad_w = (input_shape[1] - int(im_shape[1])) / 2 - pad_h = (input_shape[0] - int(im_shape[0])) / 2 - coords = paddle.cast(coords, 'float32') - coords[:, 0::2] -= pad_w - coords[:, 1::2] -= pad_h - coords[:, 0:4] /= ratio - coords[:, :4] = paddle.clip( - coords[:, :4], min=0, max=coords[:, :4].max()) - return coords.round() - - def get_emb_and_gt_outs(self, ide_outs, targets): - emb_and_gts = [] - for i, p_ide in enumerate(ide_outs): - t_conf = targets['tconf{}'.format(i)] - t_ide = targets['tide{}'.format(i)] - - p_ide = p_ide.transpose((0, 2, 3, 1)) - p_ide_flatten = paddle.reshape(p_ide, [-1, self.embedding_dim]) - - mask = t_conf > 0 - mask = paddle.cast(mask, dtype="int64") - emb_mask = mask.max(1).flatten() - emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() - if len(emb_mask_inds) > 0: - t_ide_flatten = paddle.reshape(t_ide.max(1), [-1, 1]) - tids = paddle.gather(t_ide_flatten, emb_mask_inds) - - embedding = paddle.gather(p_ide_flatten, emb_mask_inds) - embedding = self.emb_scale * F.normalize(embedding) - emb_and_gt = paddle.concat([embedding, tids], axis=1) - emb_and_gts.append(emb_and_gt) - - if len(emb_and_gts) > 0: - return paddle.concat(emb_and_gts, axis=0) - else: - return paddle.zeros((1, self.embedding_dim + 1)) - - def get_emb_outs(self, ide_outs): - emb_outs = [] - for i, p_ide in enumerate(ide_outs): - p_ide = p_ide.transpose((0, 2, 3, 1)) - - p_ide_repeat = paddle.tile(p_ide, [self.anchor_scales, 1, 1, 1]) - embedding = F.normalize(p_ide_repeat, axis=-1) - emb = paddle.reshape(embedding, [-1, self.embedding_dim]) - emb_outs.append(emb) - - if len(emb_outs) > 0: - return paddle.concat(emb_outs, axis=0) - else: - return paddle.zeros((1, self.embedding_dim)) diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/pplcnet_embedding.py b/pdfdet/models/Paddle/ppdet/modeling/reid/pplcnet_embedding.py deleted file mode 100644 index d360f89..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/pplcnet_embedding.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant -from paddle import ParamAttr -from paddle.nn import AdaptiveAvgPool2D, BatchNorm2D, Conv2D, Linear -from paddle.regularizer import L2Decay -from paddle.nn.initializer import KaimingNormal, XavierNormal -from ppdet.core.workspace import register - -__all__ = ['PPLCNetEmbedding'] - - -# Each element(list) represents a depthwise block, which is composed of k, in_c, out_c, s, use_se. -# k: kernel_size -# in_c: input channel number in depthwise block -# out_c: output channel number in depthwise block -# s: stride in depthwise block -# use_se: whether to use SE block - -NET_CONFIG = { - "blocks2": - #k, in_c, out_c, s, use_se - [[3, 16, 32, 1, False]], - "blocks3": [[3, 32, 64, 2, False], [3, 64, 64, 1, False]], - "blocks4": [[3, 64, 128, 2, False], [3, 128, 128, 1, False]], - "blocks5": [[3, 128, 256, 2, False], [5, 256, 256, 1, False], - [5, 256, 256, 1, False], [5, 256, 256, 1, False], - [5, 256, 256, 1, False], [5, 256, 256, 1, False]], - "blocks6": [[5, 256, 512, 2, True], [5, 512, 512, 1, True]] -} - - -def make_divisible(v, divisor=8, min_value=None): - if min_value is None: - min_value = divisor - new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) - if new_v < 0.9 * v: - new_v += divisor - return new_v - - -class ConvBNLayer(nn.Layer): - def __init__(self, - num_channels, - filter_size, - num_filters, - stride, - num_groups=1): - super().__init__() - - self.conv = Conv2D( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - groups=num_groups, - weight_attr=ParamAttr(initializer=KaimingNormal()), - bias_attr=False) - - self.bn = BatchNorm2D( - num_filters, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.hardswish = nn.Hardswish() - - def forward(self, x): - x = self.conv(x) - x = self.bn(x) - x = self.hardswish(x) - return x - - -class DepthwiseSeparable(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - dw_size=3, - use_se=False): - super().__init__() - self.use_se = use_se - self.dw_conv = ConvBNLayer( - num_channels=num_channels, - num_filters=num_channels, - filter_size=dw_size, - stride=stride, - num_groups=num_channels) - if use_se: - self.se = SEModule(num_channels) - self.pw_conv = ConvBNLayer( - num_channels=num_channels, - filter_size=1, - num_filters=num_filters, - stride=1) - - def forward(self, x): - x = self.dw_conv(x) - if self.use_se: - x = self.se(x) - x = self.pw_conv(x) - return x - - -class SEModule(nn.Layer): - def __init__(self, channel, reduction=4): - super().__init__() - self.avg_pool = AdaptiveAvgPool2D(1) - self.conv1 = Conv2D( - in_channels=channel, - out_channels=channel // reduction, - kernel_size=1, - stride=1, - padding=0) - self.relu = nn.ReLU() - self.conv2 = Conv2D( - in_channels=channel // reduction, - out_channels=channel, - 
kernel_size=1, - stride=1, - padding=0) - self.hardsigmoid = nn.Hardsigmoid() - - def forward(self, x): - identity = x - x = self.avg_pool(x) - x = self.conv1(x) - x = self.relu(x) - x = self.conv2(x) - x = self.hardsigmoid(x) - x = paddle.multiply(x=identity, y=x) - return x - - -class PPLCNet(nn.Layer): - """ - PP-LCNet, see https://arxiv.org/abs/2109.15099. - This code is different from PPLCNet in ppdet/modeling/backbones/lcnet.py - or in PaddleClas, because the output is the flatten feature of last_conv. - - Args: - scale (float): Scale ratio of channels. - class_expand (int): Number of channels of conv feature. - """ - - def __init__(self, scale=1.0, class_expand=1280): - super(PPLCNet, self).__init__() - self.scale = scale - self.class_expand = class_expand - - self.conv1 = ConvBNLayer( - num_channels=3, - filter_size=3, - num_filters=make_divisible(16 * scale), - stride=2) - - self.blocks2 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks2"]) - ]) - - self.blocks3 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks3"]) - ]) - - self.blocks4 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks4"]) - ]) - - self.blocks5 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks5"]) - ]) - - self.blocks6 = nn.Sequential(*[ - DepthwiseSeparable( - num_channels=make_divisible(in_c * scale), - num_filters=make_divisible(out_c * scale), - dw_size=k, - stride=s, - use_se=se) - for i, (k, in_c, out_c, s, se) in enumerate(NET_CONFIG["blocks6"]) - ]) - - self.avg_pool = AdaptiveAvgPool2D(1) - self.last_conv = Conv2D( - in_channels=make_divisible(NET_CONFIG["blocks6"][-1][2] * scale), - out_channels=self.class_expand, - kernel_size=1, - stride=1, - padding=0, - bias_attr=False) - self.hardswish = nn.Hardswish() - self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) - - def forward(self, x): - x = self.conv1(x) - - x = self.blocks2(x) - x = self.blocks3(x) - x = self.blocks4(x) - x = self.blocks5(x) - x = self.blocks6(x) - - x = self.avg_pool(x) - x = self.last_conv(x) - x = self.hardswish(x) - x = self.flatten(x) - return x - - -class FC(nn.Layer): - def __init__(self, input_ch, output_ch): - super(FC, self).__init__() - weight_attr = ParamAttr(initializer=XavierNormal()) - self.fc = paddle.nn.Linear(input_ch, output_ch, weight_attr=weight_attr) - - def forward(self, x): - out = self.fc(x) - return out - - -@register -class PPLCNetEmbedding(nn.Layer): - """ - PPLCNet Embedding - - Args: - input_ch (int): Number of channels of input conv feature. - output_ch (int): Number of channels of output conv feature. 
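PPLCNetEmbedding below defaults to scale=2.5, and the FC neck's input_ch=1280 equals class_expand, the fixed output width of last_conv. Every trunk width is snapped by the make_divisible helper defined earlier in this file; a quick standalone check of how the defaults line up (make_divisible reproduced verbatim, printed values computed by hand):

def make_divisible(v, divisor=8, min_value=None):
    # Snap v to a multiple of `divisor`, never dropping below 90% of v.
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v

print(make_divisible(16 * 2.5))   # 40: stem width at scale=2.5
print(make_divisible(512 * 2.5))  # 1280: channels entering last_conv at scale=2.5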
- """ - def __init__(self, scale=2.5, input_ch=1280, output_ch=512): - super(PPLCNetEmbedding, self).__init__() - self.backbone = PPLCNet(scale=scale) - self.neck = FC(input_ch, output_ch) - - def forward(self, x): - feat = self.backbone(x) - feat_out = self.neck(feat) - return feat_out diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/pyramidal_embedding.py b/pdfdet/models/Paddle/ppdet/modeling/reid/pyramidal_embedding.py deleted file mode 100644 index 6b2a76d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/pyramidal_embedding.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal, Constant -from paddle import ParamAttr -from .resnet import ResNet50, ResNet101 -from ppdet.core.workspace import register - -__all__ = ['PCBPyramid'] - - -@register -class PCBPyramid(nn.Layer): - """ - PCB (Part-based Convolutional Baseline), see https://arxiv.org/abs/1711.09349, - Pyramidal Person Re-IDentification, see https://arxiv.org/abs/1810.12193 - - Args: - input_ch (int): Number of channels of the input feature. - num_stripes (int): Number of sub-parts. - used_levels (tuple): Whether the level is used, 1 means used. - num_classes (int): Number of classes for identities, default 751 in - Market-1501 dataset. - last_conv_stride (int): Stride of the last conv. - last_conv_dilation (int): Dilation of the last conv. - num_conv_out_channels (int): Number of channels of conv feature. 
- """ - - def __init__(self, - input_ch=2048, - model_name='ResNet101', - num_stripes=6, - used_levels=(1, 1, 1, 1, 1, 1), - num_classes=751, - last_conv_stride=1, - last_conv_dilation=1, - num_conv_out_channels=128): - super(PCBPyramid, self).__init__() - self.num_stripes = num_stripes - self.used_levels = used_levels - self.num_classes = num_classes - - self.num_in_each_level = [i for i in range(self.num_stripes, 0, -1)] - self.num_branches = sum(self.num_in_each_level) - - assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) - self.base = eval(model_name)( - lr_mult=0.1, - last_conv_stride=last_conv_stride, - last_conv_dilation=last_conv_dilation) - self.dropout_layer = nn.Dropout(p=0.2) - self.pyramid_conv_list0, self.pyramid_fc_list0 = self.basic_branch( - num_conv_out_channels, input_ch) - - def basic_branch(self, num_conv_out_channels, input_ch): - # the level indexes are defined from fine to coarse, - # the branch will contain one more part than that of its previous level - # the sliding step is set to 1 - pyramid_conv_list = nn.LayerList() - pyramid_fc_list = nn.LayerList() - - idx_levels = 0 - for idx_branches in range(self.num_branches): - if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): - idx_levels += 1 - - pyramid_conv_list.append( - nn.Sequential( - nn.Conv2D(input_ch, num_conv_out_channels, 1), - nn.BatchNorm2D(num_conv_out_channels), nn.ReLU())) - - idx_levels = 0 - for idx_branches in range(self.num_branches): - if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): - idx_levels += 1 - - fc = nn.Linear( - in_features=num_conv_out_channels, - out_features=self.num_classes, - weight_attr=ParamAttr(initializer=Normal( - mean=0., std=0.001)), - bias_attr=ParamAttr(initializer=Constant(value=0.))) - pyramid_fc_list.append(fc) - return pyramid_conv_list, pyramid_fc_list - - def pyramid_forward(self, feat): - each_stripe_size = int(feat.shape[2] / self.num_stripes) - - feat_list, logits_list = [], [] - idx_levels = 0 - used_branches = 0 - for idx_branches in range(self.num_branches): - if idx_branches >= sum(self.num_in_each_level[0:idx_levels + 1]): - idx_levels += 1 - idx_in_each_level = idx_branches - sum(self.num_in_each_level[ - 0:idx_levels]) - stripe_size_in_each_level = each_stripe_size * (idx_levels + 1) - start = idx_in_each_level * each_stripe_size - end = start + stripe_size_in_each_level - - k = feat.shape[-1] - local_feat_avgpool = F.avg_pool2d( - feat[:, :, start:end, :], - kernel_size=(stripe_size_in_each_level, k)) - local_feat_maxpool = F.max_pool2d( - feat[:, :, start:end, :], - kernel_size=(stripe_size_in_each_level, k)) - local_feat = local_feat_avgpool + local_feat_maxpool - - local_feat = self.pyramid_conv_list0[used_branches](local_feat) - local_feat = paddle.reshape( - local_feat, shape=[local_feat.shape[0], -1]) - feat_list.append(local_feat) - - local_logits = self.pyramid_fc_list0[used_branches]( - self.dropout_layer(local_feat)) - logits_list.append(local_logits) - - used_branches += 1 - - return feat_list, logits_list - - def forward(self, x): - feat = self.base(x) - assert feat.shape[2] % self.num_stripes == 0 - feat_list, logits_list = self.pyramid_forward(feat) - feat_out = paddle.concat(feat_list, axis=-1) - return feat_out diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet.py b/pdfdet/models/Paddle/ppdet/modeling/reid/resnet.py deleted file mode 100644 index 2e2a855..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet.py +++ /dev/null @@ -1,312 +0,0 @@ -# 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import math -import paddle -from paddle import ParamAttr -import paddle.nn as nn -import paddle.nn.functional as F -from paddle.nn.initializer import Normal - -__all__ = ["ResNet18", "ResNet34", "ResNet50", "ResNet101", "ResNet152"] - - -class ConvBNLayer(nn.Layer): - def __init__(self, - num_channels, - num_filters, - filter_size, - stride=1, - dilation=1, - groups=1, - act=None, - lr_mult=1.0, - name=None, - data_format="NCHW"): - super(ConvBNLayer, self).__init__() - conv_stdv = filter_size * filter_size * num_filters - self._conv = nn.Conv2D( - in_channels=num_channels, - out_channels=num_filters, - kernel_size=filter_size, - stride=stride, - padding=(filter_size - 1) // 2, - dilation=dilation, - groups=groups, - weight_attr=ParamAttr( - learning_rate=lr_mult, - initializer=Normal(0, math.sqrt(2. / conv_stdv))), - bias_attr=False, - data_format=data_format) - - self._batch_norm = nn.BatchNorm2D(num_filters) - self.act = act - - def forward(self, inputs): - y = self._conv(inputs) - y = self._batch_norm(y) - if self.act: - y = getattr(F, self.act)(y) - return y - - -class BottleneckBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - shortcut=True, - name=None, - lr_mult=1.0, - dilation=1, - data_format="NCHW"): - super(BottleneckBlock, self).__init__() - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - dilation=dilation, - act="relu", - lr_mult=lr_mult, - name=name + "_branch2a", - data_format=data_format) - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - dilation=dilation, - stride=stride, - act="relu", - lr_mult=lr_mult, - name=name + "_branch2b", - data_format=data_format) - self.conv2 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters * 4, - filter_size=1, - dilation=dilation, - act=None, - lr_mult=lr_mult, - name=name + "_branch2c", - data_format=data_format) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters * 4, - filter_size=1, - dilation=dilation, - stride=stride, - lr_mult=lr_mult, - name=name + "_branch1", - data_format=data_format) - self.shortcut = shortcut - self._num_channels_out = num_filters * 4 - - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) - conv2 = self.conv2(conv1) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - y = paddle.add(x=short, y=conv2) - y = F.relu(y) - return y - - -class BasicBlock(nn.Layer): - def __init__(self, - num_channels, - num_filters, - stride, - shortcut=True, - name=None, - data_format="NCHW"): - super(BasicBlock, self).__init__() - self.stride = stride - self.conv0 = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=3, -
stride=stride, - act="relu", - name=name + "_branch2a", - data_format=data_format) - self.conv1 = ConvBNLayer( - num_channels=num_filters, - num_filters=num_filters, - filter_size=3, - act=None, - name=name + "_branch2b", - data_format=data_format) - if not shortcut: - self.short = ConvBNLayer( - num_channels=num_channels, - num_filters=num_filters, - filter_size=1, - stride=stride, - name=name + "_branch1", - data_format=data_format) - self.shortcut = shortcut - - def forward(self, inputs): - y = self.conv0(inputs) - conv1 = self.conv1(y) - if self.shortcut: - short = inputs - else: - short = self.short(inputs) - y = paddle.add(x=short, y=conv1) - y = F.relu(y) - return y - - -class ResNet(nn.Layer): - def __init__(self, - layers=50, - lr_mult=1.0, - last_conv_stride=2, - last_conv_dilation=1): - super(ResNet, self).__init__() - self.layers = layers - self.data_format = "NCHW" - self.input_image_channel = 3 - supported_layers = [18, 34, 50, 101, 152] - assert layers in supported_layers, \ - "supported layers are {} but input layer is {}".format( - supported_layers, layers) - if layers == 18: - depth = [2, 2, 2, 2] - elif layers == 34 or layers == 50: - depth = [3, 4, 6, 3] - elif layers == 101: - depth = [3, 4, 23, 3] - elif layers == 152: - depth = [3, 8, 36, 3] - num_channels = [64, 256, 512, - 1024] if layers >= 50 else [64, 64, 128, 256] - num_filters = [64, 128, 256, 512] - self.conv = ConvBNLayer( - num_channels=self.input_image_channel, - num_filters=64, - filter_size=7, - stride=2, - act="relu", - lr_mult=lr_mult, - name="conv1", - data_format=self.data_format) - self.pool2d_max = nn.MaxPool2D( - kernel_size=3, stride=2, padding=1, data_format=self.data_format) - self.block_list = [] - if layers >= 50: - for block in range(len(depth)): - shortcut = False - for i in range(depth[block]): - if layers in [101, 152] and block == 2: - if i == 0: - conv_name = "res" + str(block + 2) + "a" - else: - conv_name = "res" + str(block + 2) + "b" + str(i) - else: - conv_name = "res" + str(block + 2) + chr(97 + i) - if i != 0 or block == 0: - stride = 1 - elif block == len(depth) - 1: - stride = last_conv_stride - else: - stride = 2 - bottleneck_block = self.add_sublayer( - conv_name, - BottleneckBlock( - num_channels=num_channels[block] - if i == 0 else num_filters[block] * 4, - num_filters=num_filters[block], - stride=stride, - shortcut=shortcut, - name=conv_name, - lr_mult=lr_mult, - dilation=last_conv_dilation - if block == len(depth) - 1 else 1, - data_format=self.data_format)) - self.block_list.append(bottleneck_block) - shortcut = True - else: - for block in range(len(depth)): - shortcut = False - for i in range(depth[block]): - conv_name = "res" + str(block + 2) + chr(97 + i) - basic_block = self.add_sublayer( - conv_name, - BasicBlock( - num_channels=num_channels[block] - if i == 0 else num_filters[block], - num_filters=num_filters[block], - stride=2 if i == 0 and block != 0 else 1, - shortcut=shortcut, - name=conv_name, - data_format=self.data_format)) - self.block_list.append(basic_block) - shortcut = True - - def forward(self, inputs): - y = self.conv(inputs) - y = self.pool2d_max(y) - for block in self.block_list: - y = block(y) - return y - - -def ResNet18(**args): - model = ResNet(layers=18, **args) - return model - - -def ResNet34(**args): - model = ResNet(layers=34, **args) - return model - - -def ResNet50(pretrained=None, **args): - model = ResNet(layers=50, **args) - if pretrained is not None: - if not (os.path.isdir(pretrained) or - os.path.exists(pretrained + '.pdparams')): - 
raise ValueError("Model pretrain path {} does not " - "exists.".format(pretrained)) - param_state_dict = paddle.load(pretrained + '.pdparams') - model.set_dict(param_state_dict) - return model - - -def ResNet101(pretrained=None, **args): - model = ResNet(layers=101, **args) - if pretrained is not None: - if not (os.path.isdir(pretrained) or - os.path.exists(pretrained + '.pdparams')): - raise ValueError("Model pretrain path {} does not " - "exists.".format(pretrained)) - param_state_dict = paddle.load(pretrained + '.pdparams') - model.set_dict(param_state_dict) - return model - - -def ResNet152(**args): - model = ResNet(layers=152, **args) - return model diff --git a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet_embedding.py b/pdfdet/models/Paddle/ppdet/modeling/reid/resnet_embedding.py deleted file mode 100644 index 28c11eb..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/reid/resnet_embedding.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import paddle -import paddle.nn.functional as F -from paddle import nn -from .resnet import ResNet50, ResNet101 -from ppdet.core.workspace import register - -__all__ = ['ResNetEmbedding'] - - -@register -class ResNetEmbedding(nn.Layer): - in_planes = 2048 - def __init__(self, model_name='ResNet50', last_stride=1): - super(ResNetEmbedding, self).__init__() - assert model_name in ['ResNet50', 'ResNet101'], "Unsupported ReID arch: {}".format(model_name) - self.base = eval(model_name)(last_conv_stride=last_stride) - self.gap = nn.AdaptiveAvgPool2D(output_size=1) - self.flatten = nn.Flatten(start_axis=1, stop_axis=-1) - self.bn = nn.BatchNorm1D(self.in_planes, bias_attr=False) - - def forward(self, x): - base_out = self.base(x) - global_feat = self.gap(base_out) - global_feat = self.flatten(global_feat) - global_feat = self.bn(global_feat) - return global_feat diff --git a/pdfdet/models/Paddle/ppdet/modeling/shape_spec.py b/pdfdet/models/Paddle/ppdet/modeling/shape_spec.py deleted file mode 100644 index 81601fd..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/shape_spec.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# The code is based on: -# https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/shape_spec.py - -from collections import namedtuple - - -class ShapeSpec( - namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])): - def __new__(cls, channels=None, height=None, width=None, stride=None): - return super(ShapeSpec, cls).__new__(cls, channels, height, width, - stride) diff --git a/pdfdet/models/Paddle/ppdet/modeling/ssod/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/ssod/__init__.py deleted file mode 100644 index e758857..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ssod/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import utils -from . import losses - -from .utils import * -from .losses import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/ssod/losses.py b/pdfdet/models/Paddle/ppdet/modeling/ssod/losses.py deleted file mode 100644 index e4c5038..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ssod/losses.py +++ /dev/null @@ -1,236 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
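The `ShapeSpec` removed above is how backbones and necks advertise their output shapes to the `from_config` hooks that appear later in this diff (e.g. `[i.channels for i in input_shape]`). A minimal sketch of that handshake, with illustrative values:
```
from collections import namedtuple

# mirrors the deleted ShapeSpec: a namedtuple whose fields may be None
ShapeSpec = namedtuple("_ShapeSpec", ["channels", "height", "width", "stride"])

# illustrative FPN-style outputs; from_config reads only `channels`
input_shape = [
    ShapeSpec(512, None, None, 8),
    ShapeSpec(1024, None, None, 16),
    ShapeSpec(2048, None, None, 32),
]
in_feats_channel = [i.channels for i in input_shape]  # [512, 1024, 2048]
```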
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ppdet.modeling.losses.iou_loss import GIoULoss -from .utils import QFLv2 - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'SSODFCOSLoss', - 'SSODPPYOLOELoss', -] - - -@register -class SSODFCOSLoss(nn.Layer): - def __init__(self, loss_weight=1.0): - super(SSODFCOSLoss, self).__init__() - self.loss_weight = loss_weight - - def forward(self, student_head_outs, teacher_head_outs, train_cfg): - # for semi-det distill - student_logits, student_deltas, student_quality = student_head_outs - teacher_logits, teacher_deltas, teacher_quality = teacher_head_outs - nc = student_logits[0].shape[1] - - student_logits = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, nc]) - for _ in student_logits - ], - axis=0) - teacher_logits = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, nc]) - for _ in teacher_logits - ], - axis=0) - - student_deltas = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 4]) - for _ in student_deltas - ], - axis=0) - teacher_deltas = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 4]) - for _ in teacher_deltas - ], - axis=0) - - student_quality = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 1]) - for _ in student_quality - ], - axis=0) - teacher_quality = paddle.concat( - [ - _.transpose([0, 2, 3, 1]).reshape([-1, 1]) - for _ in teacher_quality - ], - axis=0) - - ratio = train_cfg.get('ratio', 0.01) - with paddle.no_grad(): - # Region Selection - count_num = int(teacher_logits.shape[0] * ratio) - teacher_probs = F.sigmoid(teacher_logits) - max_vals = paddle.max(teacher_probs, 1) - sorted_vals, sorted_inds = paddle.topk(max_vals, - teacher_logits.shape[0]) - mask = paddle.zeros_like(max_vals) - mask[sorted_inds[:count_num]] = 1. 
- fg_num = sorted_vals[:count_num].sum() - b_mask = mask > 0 - - # distill_loss_cls - loss_logits = QFLv2( - F.sigmoid(student_logits), - teacher_probs, - weight=mask, - reduction="sum") / fg_num - - # distill_loss_box - inputs = paddle.concat( - (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), - axis=-1) - targets = paddle.concat( - (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), - axis=-1) - iou_loss = GIoULoss(reduction='mean') - loss_deltas = iou_loss(inputs, targets) - - # distill_loss_quality - loss_quality = F.binary_cross_entropy( - F.sigmoid(student_quality[b_mask]), - F.sigmoid(teacher_quality[b_mask]), - reduction='mean') - - return { - "distill_loss_cls": loss_logits, - "distill_loss_box": loss_deltas, - "distill_loss_quality": loss_quality, - "fg_sum": fg_num, - } - - -@register -class SSODPPYOLOELoss(nn.Layer): - def __init__(self, loss_weight=1.0): - super(SSODPPYOLOELoss, self).__init__() - self.loss_weight = loss_weight - - def forward(self, student_head_outs, teacher_head_outs, train_cfg): - # for semi-det distill - # student_probs: already sigmoid - student_probs, student_deltas, student_dfl = student_head_outs - teacher_probs, teacher_deltas, teacher_dfl = teacher_head_outs - bs, l, nc = student_probs.shape[:] # bs, l, num_classes - bs, l, _, reg_ch = student_dfl.shape[:] # bs, l, 4, reg_ch - student_probs = student_probs.reshape([-1, nc]) - teacher_probs = teacher_probs.reshape([-1, nc]) - student_deltas = student_deltas.reshape([-1, 4]) - teacher_deltas = teacher_deltas.reshape([-1, 4]) - student_dfl = student_dfl.reshape([-1, 4, reg_ch]) - teacher_dfl = teacher_dfl.reshape([-1, 4, reg_ch]) - - ratio = train_cfg.get('ratio', 0.01) - - # for contrast loss - curr_iter = train_cfg['curr_iter'] - st_iter = train_cfg['st_iter'] - if curr_iter == st_iter + 1: - # start semi-det training - self.queue_ptr = 0 - self.queue_size = int(bs * l * ratio) - self.queue_feats = paddle.zeros([self.queue_size, nc]) - self.queue_probs = paddle.zeros([self.queue_size, nc]) - contrast_loss_cfg = train_cfg['contrast_loss'] - temperature = contrast_loss_cfg.get('temperature', 0.2) - alpha = contrast_loss_cfg.get('alpha', 0.9) - smooth_iter = contrast_loss_cfg.get('smooth_iter', 100) + st_iter - - with paddle.no_grad(): - # Region Selection - count_num = int(teacher_probs.shape[0] * ratio) - max_vals = paddle.max(teacher_probs, 1) - sorted_vals, sorted_inds = paddle.topk(max_vals, - teacher_probs.shape[0]) - mask = paddle.zeros_like(max_vals) - mask[sorted_inds[:count_num]] = 1. - fg_num = sorted_vals[:count_num].sum() - b_mask = mask > 0. 
- - # for contrast loss - probs = teacher_probs[b_mask].detach() - if curr_iter > smooth_iter: # memory-smoothing - A = paddle.exp( - paddle.mm(teacher_probs[b_mask], self.queue_probs.t()) / - temperature) - A = A / A.sum(1, keepdim=True) - probs = alpha * probs + (1 - alpha) * paddle.mm( - A, self.queue_probs) - n = student_probs[b_mask].shape[0] - # update memory bank - self.queue_feats[self.queue_ptr:self.queue_ptr + - n, :] = teacher_probs[b_mask].detach() - self.queue_probs[self.queue_ptr:self.queue_ptr + - n, :] = teacher_probs[b_mask].detach() - self.queue_ptr = (self.queue_ptr + n) % self.queue_size - - # embedding similarity - sim = paddle.exp( - paddle.mm(student_probs[b_mask], teacher_probs[b_mask].t()) / 0.2) - sim_probs = sim / sim.sum(1, keepdim=True) - # pseudo-label graph with self-loop - Q = paddle.mm(probs, probs.t()) - Q.fill_diagonal_(1) - pos_mask = (Q >= 0.5).astype('float32') - Q = Q * pos_mask - Q = Q / Q.sum(1, keepdim=True) - # contrastive loss - loss_contrast = -(paddle.log(sim_probs + 1e-7) * Q).sum(1) - loss_contrast = loss_contrast.mean() - - # distill_loss_cls - loss_cls = QFLv2( - student_probs, teacher_probs, weight=mask, reduction="sum") / fg_num - - # distill_loss_iou - inputs = paddle.concat( - (-student_deltas[b_mask][..., :2], student_deltas[b_mask][..., 2:]), - -1) - targets = paddle.concat( - (-teacher_deltas[b_mask][..., :2], teacher_deltas[b_mask][..., 2:]), - -1) - iou_loss = GIoULoss(reduction='mean') - loss_iou = iou_loss(inputs, targets) - - # distill_loss_dfl - loss_dfl = F.cross_entropy( - student_dfl[b_mask].reshape([-1, reg_ch]), - teacher_dfl[b_mask].reshape([-1, reg_ch]), - soft_label=True, - reduction='mean') - - return { - "distill_loss_cls": loss_cls, - "distill_loss_iou": loss_iou, - "distill_loss_dfl": loss_dfl, - "distill_loss_contrast": loss_contrast, - "fg_sum": fg_num, - } diff --git a/pdfdet/models/Paddle/ppdet/modeling/ssod/utils.py b/pdfdet/models/Paddle/ppdet/modeling/ssod/utils.py deleted file mode 100644 index 6c9e86f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/ssod/utils.py +++ /dev/null @@ -1,104 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
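Both SSOD distillation losses deleted above gate their terms with the same region-selection step: keep only the top `ratio` fraction of locations, ranked by the teacher's peak class score. A standalone sketch of that masking logic, with illustrative shapes:
```
import paddle

teacher_probs = paddle.rand([1000, 80])  # illustrative: 1000 locations, 80 classes
ratio = 0.01

count_num = int(teacher_probs.shape[0] * ratio)
max_vals = paddle.max(teacher_probs, 1)        # teacher's peak score per location
sorted_vals, sorted_inds = paddle.topk(max_vals, teacher_probs.shape[0])
mask = paddle.zeros_like(max_vals)
mask[sorted_inds[:count_num]] = 1.             # soft weight fed to QFLv2
fg_num = sorted_vals[:count_num].sum()         # normalizer for the cls term
b_mask = mask > 0                              # selector for box/quality terms
```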
- -import paddle -import paddle.nn.functional as F - - -def align_weak_strong_shape(data_weak, data_strong): - max_shape_x = max(data_strong['image'].shape[2], - data_weak['image'].shape[2]) - max_shape_y = max(data_strong['image'].shape[3], - data_weak['image'].shape[3]) - - scale_x_s = max_shape_x / data_strong['image'].shape[2] - scale_y_s = max_shape_y / data_strong['image'].shape[3] - scale_x_w = max_shape_x / data_weak['image'].shape[2] - scale_y_w = max_shape_y / data_weak['image'].shape[3] - target_size = [max_shape_x, max_shape_y] - - if scale_x_s != 1 or scale_y_s != 1: - data_strong['image'] = F.interpolate( - data_strong['image'], - size=target_size, - mode='bilinear', - align_corners=False) - if 'gt_bbox' in data_strong: - gt_bboxes = data_strong['gt_bbox'].numpy() - for i in range(len(gt_bboxes)): - if len(gt_bboxes[i]) > 0: - gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_s - gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_s - data_strong['gt_bbox'] = paddle.to_tensor(gt_bboxes) - - if scale_x_w != 1 or scale_y_w != 1: - data_weak['image'] = F.interpolate( - data_weak['image'], - size=target_size, - mode='bilinear', - align_corners=False) - if 'gt_bbox' in data_weak: - gt_bboxes = data_weak['gt_bbox'].numpy() - for i in range(len(gt_bboxes)): - if len(gt_bboxes[i]) > 0: - gt_bboxes[i][:, 0::2] = gt_bboxes[i][:, 0::2] * scale_x_w - gt_bboxes[i][:, 1::2] = gt_bboxes[i][:, 1::2] * scale_y_w - data_weak['gt_bbox'] = paddle.to_tensor(gt_bboxes) - return data_weak, data_strong - - -def QFLv2(pred_sigmoid, - teacher_sigmoid, - weight=None, - beta=2.0, - reduction='mean'): - pt = pred_sigmoid - zerolabel = paddle.zeros_like(pt) - loss = F.binary_cross_entropy( - pred_sigmoid, zerolabel, reduction='none') * pt.pow(beta) - pos = weight > 0 - - pt = teacher_sigmoid[pos] - pred_sigmoid[pos] - loss[pos] = F.binary_cross_entropy( - pred_sigmoid[pos], teacher_sigmoid[pos], - reduction='none') * pt.pow(beta) - - valid = weight >= 0 - if reduction == "mean": - loss = loss[valid].mean() - elif reduction == "sum": - loss = loss[valid].sum() - return loss - - -def filter_invalid(bbox, label=None, score=None, thr=0.0, min_size=0): - if score.numel() > 0: - soft_score = score.max(-1) - valid = soft_score >= thr - bbox = bbox[valid] - - if label is not None: - label = label[valid] - score = score[valid] - if min_size is not None and bbox.shape[0] > 0: - bw = bbox[:, 2] - bh = bbox[:, 3] - valid = (bw > min_size) & (bh > min_size) - bbox = bbox[valid] - - if label is not None: - label = label[valid] - score = score[valid] - - return bbox, label, score diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/__init__.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/__init__.py deleted file mode 100644 index 33a1240..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/__init__.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import detr_transformer -from . 
import utils -from . import matchers -from . import position_encoding -from . import deformable_transformer -from . import dino_transformer -from . import group_detr_transformer -from . import mask_dino_transformer -from . import rtdetr_transformer -from . import hybrid_encoder - -from .detr_transformer import * -from .utils import * -from .matchers import * -from .position_encoding import * -from .deformable_transformer import * -from .dino_transformer import * -from .petr_transformer import * -from .group_detr_transformer import * -from .mask_dino_transformer import * -from .rtdetr_transformer import * -from .hybrid_encoder import * diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/deformable_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/deformable_transformer.py deleted file mode 100644 index 97a9314..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/deformable_transformer.py +++ /dev/null @@ -1,646 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from .position_encoding import PositionEmbedding -from .utils import _get_clones, get_valid_ratio -from ..initializer import linear_init_, constant_, xavier_uniform_, normal_ - -__all__ = ['DeformableTransformer'] - - -class MSDeformableAttention(nn.Layer): - def __init__(self, - embed_dim=256, - num_heads=8, - num_levels=4, - num_points=4, - lr_mult=0.1): - """ - Multi-Scale Deformable Attention Module - """ - super(MSDeformableAttention, self).__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.num_levels = num_levels - self.num_points = num_points - self.total_points = num_heads * num_levels * num_points - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - self.sampling_offsets = nn.Linear( - embed_dim, - self.total_points * 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - self.attention_weights = nn.Linear(embed_dim, self.total_points) - self.value_proj = nn.Linear(embed_dim, embed_dim) - self.output_proj = nn.Linear(embed_dim, embed_dim) - try: - # use cuda op - from deformable_detr_ops import ms_deformable_attn - except: - # use paddle func - from .utils import deformable_attention_core_func as ms_deformable_attn - self.ms_deformable_attn_core = ms_deformable_attn - - self._reset_parameters() - - def _reset_parameters(self): - # sampling_offsets - constant_(self.sampling_offsets.weight) - thetas = 
paddle.arange( - self.num_heads, - dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) - grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) - grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) - grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( - [1, self.num_levels, self.num_points, 1]) - scaling = paddle.arange( - 1, self.num_points + 1, - dtype=paddle.float32).reshape([1, 1, -1, 1]) - grid_init *= scaling - self.sampling_offsets.bias.set_value(grid_init.flatten()) - # attention_weights - constant_(self.attention_weights.weight) - constant_(self.attention_weights.bias) - # proj - xavier_uniform_(self.value_proj.weight) - constant_(self.value_proj.bias) - xavier_uniform_(self.output_proj.weight) - constant_(self.output_proj.bias) - - def forward(self, - query, - reference_points, - value, - value_spatial_shapes, - value_level_start_index, - value_mask=None): - """ - Args: - query (Tensor): [bs, query_length, C] - reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area - value (Tensor): [bs, value_length, C] - value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] - value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, Len_q = query.shape[:2] - Len_v = value.shape[1] - assert int(value_spatial_shapes.prod(1).sum()) == Len_v - - value = self.value_proj(value) - if value_mask is not None: - value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - value *= value_mask - value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) - - sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) - sampling_locations = reference_points.reshape([ - bs, Len_q, 1, self.num_levels, 1, 2 - ]) + sampling_offsets / offset_normalizer - elif reference_points.shape[-1] == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) - else: - raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) - - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - output = self.output_proj(output) - - return output - - -class DeformableTransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_points=4, - lr_mult=0.1, - weight_attr=None, - bias_attr=None): - super(DeformableTransformerEncoderLayer, self).__init__() - # self attention - self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, lr_mult) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.activation = getattr(F, activation) - self.dropout2 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - self.dropout3 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, src): - src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) - src = src + self.dropout3(src2) - src = self.norm2(src) - return src - - def forward(self, - src, - reference_points, - spatial_shapes, - level_start_index, - src_mask=None, - query_pos_embed=None): - # self attention - src2 = self.self_attn( - self.with_pos_embed(src, query_pos_embed), reference_points, src, - spatial_shapes, level_start_index, src_mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - # ffn - src = self.forward_ffn(src) - - return src - - -class DeformableTransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers): - super(DeformableTransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): - valid_ratios = valid_ratios.unsqueeze(1) - reference_points = [] - for i, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = paddle.meshgrid( - paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) - ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * - H) - ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * - W) - reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) - reference_points = paddle.concat(reference_points, 1).unsqueeze(2) - reference_points = reference_points * valid_ratios - return reference_points - - def forward(self, - feat, - spatial_shapes, - level_start_index, - feat_mask=None, - query_pos_embed=None, - valid_ratios=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [feat.shape[0], spatial_shapes.shape[0], 2]) - reference_points = self.get_reference_points(spatial_shapes, - valid_ratios) - for layer in self.layers: - feat = layer(feat, reference_points, spatial_shapes, - level_start_index, feat_mask, query_pos_embed) - - return feat - - -class DeformableTransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - n_levels=4, - n_points=4, - lr_mult=0.1, - weight_attr=None, - 
bias_attr=None): - super(DeformableTransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # cross attention - self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, lr_mult) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - return tgt - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - tgt2 = self.self_attn(q, k, value=tgt) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # ffn - tgt = self.forward_ffn(tgt) - - return tgt - - -class DeformableTransformerDecoder(nn.Layer): - def __init__(self, decoder_layer, num_layers, return_intermediate=False): - super(DeformableTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.return_intermediate = return_intermediate - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - memory_mask=None, - query_pos_embed=None): - output = tgt - intermediate = [] - for lid, layer in enumerate(self.layers): - output = layer(output, reference_points, memory, - memory_spatial_shapes, memory_level_start_index, - memory_mask, query_pos_embed) - - if self.return_intermediate: - intermediate.append(output) - - if self.return_intermediate: - return paddle.stack(intermediate) - - return output.unsqueeze(0) - - -@register -class DeformableTransformer(nn.Layer): - __shared__ = ['hidden_dim'] - - def __init__(self, - num_queries=300, - position_embed_type='sine', - return_intermediate_dec=True, - in_feats_channel=[512, 1024, 2048], - num_feature_levels=4, - num_encoder_points=4, - num_decoder_points=4, - hidden_dim=256, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0.1, - activation="relu", - lr_mult=0.1, - pe_temperature=10000, - pe_offset=-0.5): - super(DeformableTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
- assert len(in_feats_channel) <= num_feature_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_feature_levels = num_feature_levels - - encoder_layer = DeformableTransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - num_feature_levels, num_encoder_points, lr_mult) - self.encoder = DeformableTransformerEncoder(encoder_layer, - num_encoder_layers) - - decoder_layer = DeformableTransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - num_feature_levels, num_decoder_points) - self.decoder = DeformableTransformerDecoder( - decoder_layer, num_decoder_layers, return_intermediate_dec) - - self.level_embed = nn.Embedding(num_feature_levels, hidden_dim) - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) - - self.reference_points = nn.Linear( - hidden_dim, - 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - self.input_proj = nn.LayerList() - for in_channels in in_feats_channel: - self.input_proj.append( - nn.Sequential( - nn.Conv2D( - in_channels, hidden_dim, kernel_size=1), - nn.GroupNorm(32, hidden_dim))) - in_channels = in_feats_channel[-1] - for _ in range(num_feature_levels - len(in_feats_channel)): - self.input_proj.append( - nn.Sequential( - nn.Conv2D( - in_channels, - hidden_dim, - kernel_size=3, - stride=2, - padding=1), - nn.GroupNorm(32, hidden_dim))) - in_channels = hidden_dim - - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset, - eps=1e-4) - - self._reset_parameters() - - def _reset_parameters(self): - normal_(self.level_embed.weight) - normal_(self.tgt_embed.weight) - normal_(self.query_pos_embed.weight) - xavier_uniform_(self.reference_points.weight) - constant_(self.reference_points.bias) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - constant_(l[0].bias) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_feats_channel': [i.channels for i in input_shape], } - - def forward(self, src_feats, src_mask=None, *args, **kwargs): - srcs = [] - for i in range(len(src_feats)): - srcs.append(self.input_proj[i](src_feats[i])) - if self.num_feature_levels > len(srcs): - len_srcs = len(srcs) - for i in range(len_srcs, self.num_feature_levels): - if i == len_srcs: - srcs.append(self.input_proj[i](src_feats[-1])) - else: - srcs.append(self.input_proj[i](srcs[-1])) - src_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for level, src in enumerate(srcs): - src_shape = paddle.shape(src) - bs = src_shape[0:1] - h = src_shape[2:3] - w = src_shape[3:4] - spatial_shapes.append(paddle.concat([h, w])) - src = src.flatten(2).transpose([0, 2, 1]) - src_flatten.append(src) - if src_mask is not None: - mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[level] - lvl_pos_embed_flatten.append(lvl_pos_embed) - mask = mask.flatten(1) - mask_flatten.append(mask) - src_flatten = paddle.concat(src_flatten, 1) - mask_flatten = None if src_mask is None else paddle.concat(mask_flatten, - 1) - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [l, 2] - 
spatial_shapes = paddle.to_tensor(
-            paddle.stack(spatial_shapes).astype('int64'))
-        # [l], start index of each level
-        level_start_index = paddle.concat([
-            paddle.zeros(
-                [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1]
-        ])
-        # [b, l, 2]
-        valid_ratios = paddle.stack(valid_ratios, 1)
-
-        # encoder
-        memory = self.encoder(src_flatten, spatial_shapes, level_start_index,
-                              mask_flatten, lvl_pos_embed_flatten, valid_ratios)
-
-        # prepare input for decoder
-        bs, _, c = memory.shape
-        query_embed = self.query_pos_embed.weight.unsqueeze(0).tile([bs, 1, 1])
-        tgt = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
-        reference_points = F.sigmoid(self.reference_points(query_embed))
-        reference_points_input = reference_points.unsqueeze(
-            2) * valid_ratios.unsqueeze(1)
-
-        # decoder
-        hs = self.decoder(tgt, reference_points_input, memory, spatial_shapes,
-                          level_start_index, mask_flatten, query_embed)
-
-        return (hs, memory, reference_points)
-
-
-class QRDeformableTransformerDecoder(DeformableTransformerDecoder):
-    def __init__(self, decoder_layer, num_layers,
-                 start_q=None, end_q=None, return_intermediate=False):
-        super(QRDeformableTransformerDecoder, self).__init__(
-            decoder_layer, num_layers, return_intermediate=return_intermediate)
-        self.start_q = start_q
-        self.end_q = end_q
-
-    def forward(self,
-                tgt,
-                reference_points,
-                memory,
-                memory_spatial_shapes,
-                memory_level_start_index,
-                memory_mask=None,
-                query_pos_embed=None):
-
-        if not self.training:
-            return super(QRDeformableTransformerDecoder, self).forward(
-                tgt, reference_points,
-                memory, memory_spatial_shapes,
-                memory_level_start_index,
-                memory_mask=memory_mask,
-                query_pos_embed=query_pos_embed)
-
-        batchsize = tgt.shape[0]
-        query_list_reserve = [tgt]
-        intermediate = []
-        for lid, layer in enumerate(self.layers):
-
-            start_q = self.start_q[lid]
-            end_q = self.end_q[lid]
-            query_list = query_list_reserve.copy()[start_q:end_q]
-
-            # prepare for parallel process
-            output = paddle.concat(query_list, axis=0)
-            fakesetsize = int(output.shape[0] / batchsize)
-            reference_points_tiled = reference_points.tile([fakesetsize, 1, 1, 1])
-
-            memory_tiled = memory.tile([fakesetsize, 1, 1])
-            query_pos_embed_tiled = query_pos_embed.tile([fakesetsize, 1, 1])
-            memory_mask_tiled = memory_mask.tile([fakesetsize, 1])
-
-            output = layer(output, reference_points_tiled, memory_tiled,
-                           memory_spatial_shapes, memory_level_start_index,
-                           memory_mask_tiled, query_pos_embed_tiled)
-
-            for i in range(fakesetsize):
-                query_list_reserve.append(output[batchsize*i:batchsize*(i+1)])
-
-            if self.return_intermediate:
-                for i in range(fakesetsize):
-                    intermediate.append(output[batchsize*i:batchsize*(i+1)])
-
-        if self.return_intermediate:
-            return paddle.stack(intermediate)
-
-        return output.unsqueeze(0)
-
-
-@register
-class QRDeformableTransformer(DeformableTransformer):
-
-    def __init__(self,
-                 num_queries=300,
-                 position_embed_type='sine',
-                 return_intermediate_dec=True,
-                 in_feats_channel=[512, 1024, 2048],
-                 num_feature_levels=4,
-                 num_encoder_points=4,
-                 num_decoder_points=4,
-                 hidden_dim=256,
-                 nhead=8,
-                 num_encoder_layers=6,
-                 num_decoder_layers=6,
-                 dim_feedforward=1024,
-                 dropout=0.1,
-                 activation="relu",
-                 lr_mult=0.1,
-                 pe_temperature=10000,
-                 pe_offset=-0.5,
-                 start_q=None,
-                 end_q=None):
-        super(QRDeformableTransformer, self).__init__(
-            num_queries=num_queries,
-            position_embed_type=position_embed_type,
-            return_intermediate_dec=return_intermediate_dec,
-            in_feats_channel=in_feats_channel,
-            num_feature_levels=num_feature_levels,
num_encoder_points=num_encoder_points, - num_decoder_points=num_decoder_points, - hidden_dim=hidden_dim, - nhead=nhead, - num_encoder_layers=num_encoder_layers, - num_decoder_layers=num_decoder_layers, - dim_feedforward=dim_feedforward, - dropout=dropout, - activation=activation, - lr_mult=lr_mult, - pe_temperature=pe_temperature, - pe_offset=pe_offset) - - decoder_layer = DeformableTransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - num_feature_levels, num_decoder_points) - self.decoder = QRDeformableTransformerDecoder( - decoder_layer, num_decoder_layers, start_q, end_q, return_intermediate_dec) diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/detr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/detr_transformer.py deleted file mode 100644 index efeb320..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/detr_transformer.py +++ /dev/null @@ -1,359 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention, _convert_attention_mask -from .position_encoding import PositionEmbedding -from .utils import _get_clones -from ..initializer import linear_init_, conv_init_, xavier_uniform_, normal_ - -__all__ = ['DETRTransformer'] - - -class TransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerEncoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = 
self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -class TransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - - def forward(self, src, src_mask=None, pos_embed=None): - output = src - for layer in self.layers: - output = layer(output, src_mask=src_mask, pos_embed=pos_embed) - - if self.norm is not None: - output = self.norm(output) - - return output - - -class TransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerDecoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - pos_embed=None, - query_pos_embed=None): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - q = k = self.with_pos_embed(tgt, query_pos_embed) - tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm2(tgt) - q = self.with_pos_embed(tgt, query_pos_embed) - k = self.with_pos_embed(memory, pos_embed) - tgt = self.cross_attn(q, k, value=memory, attn_mask=memory_mask) - tgt = residual + self.dropout2(tgt) - if not self.normalize_before: - tgt = self.norm2(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm3(tgt) - tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = residual + self.dropout3(tgt) - if not self.normalize_before: - tgt = self.norm3(tgt) - return tgt - - -class TransformerDecoder(nn.Layer): - def __init__(self, - decoder_layer, - num_layers, - norm=None, - return_intermediate=False): - super(TransformerDecoder, self).__init__() - self.layers = 
_get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.return_intermediate = return_intermediate - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - pos_embed=None, - query_pos_embed=None): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - output = tgt - intermediate = [] - for layer in self.layers: - output = layer( - output, - memory, - tgt_mask=tgt_mask, - memory_mask=memory_mask, - pos_embed=pos_embed, - query_pos_embed=query_pos_embed) - if self.return_intermediate: - intermediate.append(self.norm(output)) - - if self.norm is not None: - output = self.norm(output) - - if self.return_intermediate: - return paddle.stack(intermediate) - - return output.unsqueeze(0) - - -@register -class DETRTransformer(nn.Layer): - __shared__ = ['hidden_dim'] - - def __init__(self, - num_queries=100, - position_embed_type='sine', - return_intermediate_dec=True, - backbone_num_channels=2048, - hidden_dim=256, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - pe_temperature=10000, - pe_offset=0., - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(DETRTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'],\ - f'ValueError: position_embed_type not supported {position_embed_type}!' - self.hidden_dim = hidden_dim - self.nhead = nhead - - encoder_layer = TransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - encoder_norm = nn.LayerNorm(hidden_dim) if normalize_before else None - self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, - encoder_norm) - - decoder_layer = TransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - decoder_norm = nn.LayerNorm(hidden_dim) - self.decoder = TransformerDecoder( - decoder_layer, - num_decoder_layers, - decoder_norm, - return_intermediate=return_intermediate_dec) - - self.input_proj = nn.Conv2D( - backbone_num_channels, hidden_dim, kernel_size=1) - self.query_pos_embed = nn.Embedding(num_queries, hidden_dim) - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - - self._reset_parameters() - - def _reset_parameters(self): - for p in self.parameters(): - if p.dim() > 1: - xavier_uniform_(p) - conv_init_(self.input_proj) - normal_(self.query_pos_embed.weight) - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'backbone_num_channels': [i.channels for i in input_shape][-1], - } - - def _convert_attention_mask(self, mask): - return (mask - 1.0) * 1e9 - - def forward(self, src, src_mask=None, *args, **kwargs): - r""" - Applies a Transformer model on the inputs. - - Parameters: - src (List(Tensor)): Backbone feature maps with shape [[bs, c, h, w]]. - src_mask (Tensor, optional): A tensor used in multi-head attention - to prevents attention to some unwanted positions, usually the - paddings or the subsequent positions. It is a tensor with shape - [bs, H, W]`. When the data type is bool, the unwanted positions - have `False` values and the others have `True` values. When the - data type is int, the unwanted positions have 0 values and the - others have 1 values. 
When the data type is float, the unwanted
-                positions have `-INF` values and the others have 0 values.
-                It can be None when no positions need to be masked.
-                Default None.
-
-        Returns:
-            output (Tensor): [num_levels, batch_size, num_queries, hidden_dim]
-            memory (Tensor): [batch_size, hidden_dim, h, w]
-        """
-        # use last level feature map
-        src_proj = self.input_proj(src[-1])
-        bs, c, h, w = paddle.shape(src_proj)
-        # flatten [B, C, H, W] to [B, HxW, C]
-        src_flatten = src_proj.flatten(2).transpose([0, 2, 1])
-        if src_mask is not None:
-            src_mask = F.interpolate(src_mask.unsqueeze(0), size=(h, w))[0]
-        else:
-            src_mask = paddle.ones([bs, h, w])
-        pos_embed = self.position_embedding(src_mask).flatten(1, 2)
-
-        if self.training:
-            src_mask = self._convert_attention_mask(src_mask)
-            src_mask = src_mask.reshape([bs, 1, 1, h * w])
-        else:
-            src_mask = None
-
-        memory = self.encoder(
-            src_flatten, src_mask=src_mask, pos_embed=pos_embed)
-
-        query_pos_embed = self.query_pos_embed.weight.unsqueeze(0).tile(
-            [bs, 1, 1])
-        tgt = paddle.zeros_like(query_pos_embed)
-        output = self.decoder(
-            tgt,
-            memory,
-            memory_mask=src_mask,
-            pos_embed=pos_embed,
-            query_pos_embed=query_pos_embed)
-
-        if self.training:
-            src_mask = src_mask.reshape([bs, 1, 1, h, w])
-        else:
-            src_mask = None
-
-        return (output, memory.transpose([0, 2, 1]).reshape([bs, c, h, w]),
-                src_proj, src_mask)
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/dino_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/dino_transformer.py
deleted file mode 100644
index d08a0ad..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/dino_transformer.py
+++ /dev/null
@@ -1,528 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR)
-# Copyright (c) 2020 SenseTime. All Rights Reserved.
-# Modified from detrex (https://github.com/IDEA-Research/detrex)
-# Copyright 2022 The IDEA Authors. All rights reserved.
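The `_convert_attention_mask` helper deleted above turns a {0, 1} validity mask into an additive bias via `(mask - 1.0) * 1e9`: valid positions contribute 0 and padded positions a large negative value that vanishes after softmax. A tiny sketch:
```
import paddle
import paddle.nn.functional as F

mask = paddle.to_tensor([[1., 1., 0.]])  # 1 = keep, 0 = padding
attn_bias = (mask - 1.0) * 1e9           # [[0., 0., -1e9]]
scores = paddle.zeros([1, 3]) + attn_bias
print(F.softmax(scores))                 # padding position gets ~0 weight
```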
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from .position_encoding import PositionEmbedding -from ..heads.detr_head import MLP -from .deformable_transformer import (MSDeformableAttention, - DeformableTransformerEncoderLayer, - DeformableTransformerEncoder) -from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, - bias_init_with_prob) -from .utils import (_get_clones, get_valid_ratio, - get_contrastive_denoising_training_group, - get_sine_pos_embed, inverse_sigmoid) - -__all__ = ['DINOTransformer'] - - -class DINOTransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - lr_mult=1.0, - weight_attr=None, - bias_attr=None): - super(DINOTransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # cross attention - self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, lr_mult) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, weight_attr=weight_attr, bias_attr=bias_attr) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - if attn_mask is not None: - attn_mask = paddle.where( - attn_mask.astype('bool'), - paddle.zeros(attn_mask.shape, tgt.dtype), - paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # ffn - tgt2 = self.forward_ffn(tgt) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - - return tgt - - -class DINOTransformerDecoder(nn.Layer): - def __init__(self, - hidden_dim, - decoder_layer, - num_layers, - weight_attr=None, - bias_attr=None): - super(DINOTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - self.norm = nn.LayerNorm( - hidden_dim, 
weight_attr=weight_attr, bias_attr=bias_attr) - - def forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - query_pos_head, - valid_ratios=None, - attn_mask=None, - memory_mask=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [memory.shape[0], memory_spatial_shapes.shape[0], 2]) - - output = tgt - intermediate = [] - inter_bboxes = [] - ref_points = F.sigmoid(ref_points_unact) - for i, layer in enumerate(self.layers): - reference_points_input = ref_points.detach().unsqueeze( - 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) - query_pos_embed = get_sine_pos_embed( - reference_points_input[..., 0, :], self.hidden_dim // 2) - query_pos_embed = query_pos_head(query_pos_embed) - - output = layer(output, reference_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - - ref_points = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - ref_points.detach())) - - intermediate.append(self.norm(output)) - inter_bboxes.append(ref_points) - - return paddle.stack(intermediate), paddle.stack(inter_bboxes) - - -@register -class DINOTransformer(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=900, - position_embed_type='sine', - in_feats_channel=[512, 1024, 2048], - num_levels=4, - num_encoder_points=4, - num_decoder_points=4, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - lr_mult=1.0, - pe_temperature=10000, - pe_offset=-0.5, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=True, - eps=1e-2): - super(DINOTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
- assert len(in_feats_channel) <= num_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - - weight_attr = ParamAttr(regularizer=L2Decay(0.0)) - bias_attr = ParamAttr(regularizer=L2Decay(0.0)) - # backbone feature projection - self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) - - # Transformer module - encoder_layer = DeformableTransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_encoder_points, lr_mult, weight_attr, bias_attr) - self.encoder = DeformableTransformerEncoder(encoder_layer, - num_encoder_layers) - decoder_layer = DINOTransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_decoder_points, lr_mult, weight_attr, bias_attr) - self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers, weight_attr, - bias_attr) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # position embedding - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - self.level_embed = nn.Embedding(num_levels, hidden_dim) - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(2 * hidden_dim, - hidden_dim, - hidden_dim, - num_layers=2) - - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) - self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - # decoder head - self.dec_score_head = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - linear_init_(self.enc_score_head) - constant_(self.enc_score_head.bias, bias_cls) - constant_(self.enc_bbox_head.layers[-1].weight) - constant_(self.enc_bbox_head.layers[-1].bias) - for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): - linear_init_(cls_) - constant_(cls_.bias, bias_cls) - constant_(reg_.layers[-1].weight) - constant_(reg_.layers[-1].bias) - - linear_init_(self.enc_output[0]) - xavier_uniform_(self.enc_output[0].weight) - normal_(self.level_embed.weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - constant_(l[0].bias) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_feats_channel': [i.channels for i in input_shape], } - - def _build_input_proj_layer(self, - in_feats_channel, - weight_attr=None, - bias_attr=None): - self.input_proj = nn.LayerList() - 
for in_channels in in_feats_channel: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, self.hidden_dim, kernel_size=1)), ( - 'norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = in_feats_channel[-1] - for _ in range(self.num_levels - len(in_feats_channel)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1)), ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats, pad_mask=None): - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - - # get encoder inputs - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for i, feat in enumerate(proj_feats): - bs, _, h, w = paddle.shape(feat) - spatial_shapes.append(paddle.stack([h, w])) - # [b,c,h,w] -> [b,h*w,c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - if pad_mask is not None: - mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - # [b, h*w, c] - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[i] - lvl_pos_embed_flatten.append(lvl_pos_embed) - if pad_mask is not None: - # [b, h*w] - mask_flatten.append(mask.flatten(1)) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - # [b, l] - mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, - 1) - # [b, l, c] - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [num_levels, 2] - spatial_shapes = paddle.to_tensor( - paddle.stack(spatial_shapes).astype('int64')) - # [l] start index of each level - level_start_index = paddle.concat([ - paddle.zeros( - [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] - ]) - # [b, num_levels, 2] - valid_ratios = paddle.stack(valid_ratios, 1) - return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, valid_ratios) - - def forward(self, feats, pad_mask=None, gt_meta=None): - # input projection and embedding - (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, - valid_ratios) = self._get_encoder_input(feats, pad_mask) - - # encoder - memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, - mask_flatten, lvl_pos_embed_flatten, valid_ratios) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input( - memory, spatial_shapes, mask_flatten, denoising_class, - denoising_bbox_unact) - - # decoder - inter_feats, inter_bboxes = self.decoder( - target, init_ref_points_unact, 
memory, spatial_shapes, - level_start_index, self.dec_bbox_head, self.query_pos_head, - valid_ratios, attn_mask, mask_flatten) - out_bboxes = [] - out_logits = [] - for i in range(self.num_decoder_layers): - out_logits.append(self.dec_score_head[i](inter_feats[i])) - if i == 0: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - init_ref_points_unact)) - else: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - inverse_sigmoid(inter_bboxes[i - 1]))) - out_bboxes = paddle.stack(out_bboxes) - out_logits = paddle.stack(out_logits) - - return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) - - def _get_encoder_output_anchors(self, - memory, - spatial_shapes, - memory_mask=None, - grid_size=0.05): - output_anchors = [] - idx = 0 - for lvl, (h, w) in enumerate(spatial_shapes): - if memory_mask is not None: - mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) - valid_H = paddle.sum(mask_[:, :, 0], 1) - valid_W = paddle.sum(mask_[:, 0, :], 1) - else: - valid_H, valid_W = h, w - - grid_y, grid_x = paddle.meshgrid( - paddle.arange(end=h), paddle.arange(end=w)) - grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) - - valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( - [-1, 1, 1, 2]).astype(grid_xy.dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - output_anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - idx += h * w - - output_anchors = paddle.concat(output_anchors, 1) - valid_mask = ((output_anchors > self.eps) * - (output_anchors < 1 - self.eps)).all(-1, keepdim=True) - output_anchors = paddle.log(output_anchors / (1 - output_anchors)) - if memory_mask is not None: - valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 - output_anchors = paddle.where(valid_mask, output_anchors, - paddle.to_tensor(float("inf"))) - - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - output_memory = self.enc_output(memory) - return output_memory, output_anchors - - def _get_decoder_input(self, - memory, - spatial_shapes, - memory_mask=None, - denoising_class=None, - denoising_bbox_unact=None): - bs, _, _ = memory.shape - # prepare input for decoder - output_memory, output_anchors = self._get_encoder_output_anchors( - memory, spatial_shapes, memory_mask) - enc_outputs_class = self.enc_score_head(output_memory) - enc_outputs_coord_unact = self.enc_bbox_head( - output_memory) + output_anchors - - _, topk_ind = paddle.topk( - enc_outputs_class.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, - topk_ind) # unsigmoided. 
-        enc_topk_bboxes = F.sigmoid(reference_points_unact)
-        if denoising_bbox_unact is not None:
-            reference_points_unact = paddle.concat(
-                [denoising_bbox_unact, reference_points_unact], 1)
-        enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind)
-
-        # extract region features
-        if self.learnt_init_query:
-            target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1])
-        else:
-            target = paddle.gather_nd(output_memory, topk_ind).detach()
-        if denoising_class is not None:
-            target = paddle.concat([denoising_class, target], 1)
-
-        return target, reference_points_unact.detach(
-        ), enc_topk_bboxes, enc_topk_logits
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/README.md b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/README.md
deleted file mode 100644
index 290926d..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/README.md
+++ /dev/null
@@ -1,85 +0,0 @@
-# Compiling the multi-scale deformable attention custom OP
-This custom OP is implemented following the [custom external operator](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/custom_op/new_cpp_op_cn.html) guide.
-
-## 1. Requirements
-- Paddle >= 2.3.2
-- gcc 8.2
-
-## 2. Installation
-Build and install the OP from this directory:
-```
-cd PaddleDetection/ppdet/modeling/transformers/ext_op/
-python setup_ms_deformable_attn_op.py install
-```
-
-Once compiled, the OP is ready to use; a usage example of `ms_deformable_attn` follows:
-```
-# import the custom op
-from deformable_detr_ops import ms_deformable_attn
-
-# build fake input tensors
-bs, n_heads, c = 2, 8, 8
-query_length, n_levels, n_points = 2, 2, 2
-spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64)
-level_start_index = paddle.concat((paddle.to_tensor(
-    [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1]))
-value_length = sum([(H * W).item() for H, W in spatial_shapes])
-
-def get_test_tensors(channels):
-    value = paddle.rand(
-        [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01
-    sampling_locations = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points, 2],
-        dtype=paddle.float32)
-    attention_weights = paddle.rand(
-        [bs, query_length, n_heads, n_levels, n_points],
-        dtype=paddle.float32) + 1e-5
-    attention_weights /= attention_weights.sum(-1, keepdim=True).sum(
-        -2, keepdim=True)
-    return [value, sampling_locations, attention_weights]
-
-value, sampling_locations, attention_weights = get_test_tensors(c)
-
-output = ms_deformable_attn(value,
-                            spatial_shapes,
-                            level_start_index,
-                            sampling_locations,
-                            attention_weights)
-```
-
-## 3. Unit tests
-Run the unit tests to verify that the custom operator works correctly, for example:
-```
-python test_ms_deformable_attn_op.py
-```
-On success, the output looks like this:
-```
-*True check_forward_equal_with_paddle_float: max_abs_err 6.98e-10 max_rel_err 2.03e-07
-*tensor1 True check_gradient_numerical(D=30)
-*tensor2 True check_gradient_numerical(D=30)
-*tensor3 True check_gradient_numerical(D=30)
-*tensor1 True check_gradient_numerical(D=32)
-*tensor2 True check_gradient_numerical(D=32)
-*tensor3 True check_gradient_numerical(D=32)
-*tensor1 True check_gradient_numerical(D=64)
-*tensor2 True check_gradient_numerical(D=64)
-*tensor3 True check_gradient_numerical(D=64)
-*tensor1 True check_gradient_numerical(D=71)
-*tensor2 True check_gradient_numerical(D=71)
-*tensor3 True check_gradient_numerical(D=71)
-*tensor1 True check_gradient_numerical(D=128)
-*tensor2 True check_gradient_numerical(D=128)
-*tensor3 True check_gradient_numerical(D=128)
-*tensor1 True check_gradient_numerical(D=1024)
-*tensor2 True check_gradient_numerical(D=1024)
-*tensor3 True check_gradient_numerical(D=1024)
-*tensor1 True check_gradient_numerical(D=1025)
-*tensor2 True check_gradient_numerical(D=1025)
-*tensor3 True check_gradient_numerical(D=1025)
-*tensor1 True check_gradient_numerical(D=2048)
-*tensor2 True check_gradient_numerical(D=2048)
-*tensor3 True check_gradient_numerical(D=2048)
-*tensor1 True check_gradient_numerical(D=3096)
-*tensor2 True check_gradient_numerical(D=3096)
-*tensor3 True check_gradient_numerical(D=3096)
-```
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
deleted file mode 100644
index d1758ad..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#include <vector>
-
-// declare GPU implementation
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights);
-
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out);
-
-//// CPU not implemented
-
-std::vector<std::vector<int64_t>>
-MSDeformableAttnInferShape(std::vector<int64_t> value_shape,
-                           std::vector<int64_t> value_spatial_shapes_shape,
-                           std::vector<int64_t> value_level_start_index_shape,
-                           std::vector<int64_t> sampling_locations_shape,
-                           std::vector<int64_t> attention_weights_shape) {
-  return {{value_shape[0], sampling_locations_shape[1],
-           value_shape[2] * value_shape[3]}};
-}
-
-std::vector<paddle::DataType>
-MSDeformableAttnInferDtype(paddle::DataType value_dtype,
-                           paddle::DataType value_spatial_shapes_dtype,
-                           paddle::DataType value_level_start_index_dtype,
-                           paddle::DataType sampling_locations_dtype,
-                           paddle::DataType attention_weights_dtype) {
-  return {value_dtype};
-}
-
-PD_BUILD_OP(ms_deformable_attn)
-    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
-             "AttentionWeights"})
-    .Outputs({"Out"})
-    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDAForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(MSDeformableAttnInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(MSDeformableAttnInferDtype));
-
-PD_BUILD_GRAD_OP(ms_deformable_attn)
-    .Inputs({"Value", "SpatialShapes", "LevelIndex", "SamplingLocations",
-             "AttentionWeights", paddle::Grad("Out")})
-    .Outputs({paddle::Grad("Value"), paddle::Grad("SpatialShapes"),
-              paddle::Grad("LevelIndex"), paddle::Grad("SamplingLocations"),
-              paddle::Grad("AttentionWeights")})
-    .SetKernelFn(PD_KERNEL(MSDeformableAttnCUDABackward));
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
deleted file mode 100644
index d5a8d16..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/ms_deformable_attn_op.cu
+++ /dev/null
@@ -1,1073 +0,0 @@
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/extension.h"
-
-#define CUDA_KERNEL_LOOP(i, n)                                                \
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n);                \
-       i += blockDim.x * gridDim.x)
-
-const int CUDA_NUM_THREADS = 1024;
-inline int GET_BLOCKS(const int N, const int num_threads) {
-  return (N + num_threads - 1) / num_threads;
-}
-
-// forward bilinear
-template <typename data_t>
-__device__ data_t deformable_attn_bilinear_forward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-  }
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  return val;
-}
-
-// forward kernel
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_forward(
-    const int n, const data_t *data_value, const int64_t *data_spatial_shapes,
-    const int64_t *data_level_start_index, const data_t *data_sampling_loc,
-    const data_t *data_attn_weight, const int batch_size,
-    const int value_length, const int num_heads, const int channels,
-    const int num_levels, const int query_length, const int num_points,
-    data_t *output_data_ptr) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    data_t *data_ptr = output_data_ptr + index;
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-    data_t col = 0;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const data_t *data_value_ptr = data_value + (data_value_ptr_init_offset +
-                                                   level_start_id * qid_stride);
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          col += deformable_attn_bilinear_forward(
-                     data_value_ptr, spatial_h, spatial_w, num_heads, channels,
-                     h_im, w_im, m_col, c_col) *
-                 weight;
-        }
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-      }
-    }
-    *data_ptr = col;
-  }
-}
-
-#define CHECK_INPUT_GPU(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
-// forward
-std::vector<paddle::Tensor>
-MSDeformableAttnCUDAForward(const paddle::Tensor &value,
-                            const paddle::Tensor &value_spatial_shapes,
-                            const paddle::Tensor &value_level_start_index,
-                            const paddle::Tensor &sampling_locations,
-                            const paddle::Tensor &attention_weights) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto output = paddle::full({batch_size, query_length, num_heads * channels},
-                             0, value.dtype(), paddle::GPUPlace());
-
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  deformable_attn_cuda_kernel_forward<float>
-      <<<GET_BLOCKS(num_kernels, CUDA_NUM_THREADS), CUDA_NUM_THREADS, 0,
-         value.stream()>>>(num_kernels, value.data<float>(),
-                           value_spatial_shapes.data<int64_t>(),
-                           value_level_start_index.data<int64_t>(),
-                           sampling_locations.data<float>(),
-                           attention_weights.data<float>(), batch_size,
-                           value_length, num_heads, channels, num_levels,
-                           query_length, num_points, output.data<float>());
-  return {output};
-}
-
-// backward bilinear
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  *grad_attn_weight = top_grad * val;
-  *grad_sampling_loc = width * grad_w_weight * top_grad_value;
-  *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
-}
-
-template <typename data_t>
-__device__ void deformable_attn_bilinear_backward_gm(
-    const data_t *&bottom_data, const int &height, const int &width,
-    const int &nheads, const int &channels, const data_t &h, const data_t &w,
-    const int &m, const int &c, const data_t &top_grad,
-    const data_t &attn_weight, data_t *&grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int h_low = floor(h);
-  const int w_low = floor(w);
-  const int h_high = h_low + 1;
-  const int w_high = w_low + 1;
-
-  const data_t lh = h - h_low;
-  const data_t lw = w - w_low;
-  const data_t hh = 1 - lh, hw = 1 - lw;
-
-  const int w_stride = nheads * channels;
-  const int h_stride = width * w_stride;
-  const int h_low_ptr_offset = h_low * h_stride;
-  const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
-  const int w_low_ptr_offset = w_low * w_stride;
-  const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
-  const int base_ptr = m * channels + c;
-
-  const data_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
-  const data_t top_grad_value = top_grad * attn_weight;
-  data_t grad_h_weight = 0, grad_w_weight = 0;
-
-  data_t v1 = 0;
-  if (h_low >= 0 && w_low >= 0) {
-    const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
-    v1 = bottom_data[ptr1];
-    grad_h_weight -= hw * v1;
-    grad_w_weight -= hh * v1;
-    atomicAdd(grad_value + ptr1, w1 * top_grad_value);
-  }
-  data_t v2 = 0;
-  if (h_low >= 0 && w_high <= width - 1) {
-    const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
-    v2 = bottom_data[ptr2];
-    grad_h_weight -= lw * v2;
-    grad_w_weight += hh * v2;
-    atomicAdd(grad_value + ptr2, w2 * top_grad_value);
-  }
-  data_t v3 = 0;
-  if (h_high <= height - 1 && w_low >= 0) {
-    const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
-    v3 = bottom_data[ptr3];
-    grad_h_weight += hw * v3;
-    grad_w_weight -= lh * v3;
-    atomicAdd(grad_value + ptr3, w3 * top_grad_value);
-  }
-  data_t v4 = 0;
-  if (h_high <= height - 1 && w_high <= width - 1) {
-    const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
-    v4 = bottom_data[ptr4];
-    grad_h_weight += lw * v4;
-    grad_w_weight += lh * v4;
-    atomicAdd(grad_value + ptr4, w4 * top_grad_value);
-  }
-
-  const data_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
-  atomicAdd(grad_attn_weight, top_grad * val);
-  atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
-  atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
-}
-
-// backward kernels
-// channels > 1024
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
-          atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
-          atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_gm(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward_gm(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              grad_sampling_loc, grad_attn_weight);
-        }
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// channels <= 1024
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockSize; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t, unsigned int blockSize>
-__global__ void
-deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    __shared__ data_t cache_grad_sampling_loc[blockSize * 2];
-    __shared__ data_t cache_grad_attn_weight[blockSize];
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockSize / 2; s > 0; s >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v1(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-        if (tid == 0) {
-          data_t _grad_w = cache_grad_sampling_loc[0],
-                 _grad_h = cache_grad_sampling_loc[1],
-                 _grad_a = cache_grad_attn_weight[0];
-          int sid = 2;
-          for (unsigned int tid = 1; tid < blockDim.x; ++tid) {
-            _grad_w += cache_grad_sampling_loc[sid];
-            _grad_h += cache_grad_sampling_loc[sid + 1];
-            _grad_a += cache_grad_attn_weight[tid];
-            sid += 2;
-          }
-
-          *grad_sampling_loc = _grad_w;
-          *(grad_sampling_loc + 1) = _grad_h;
-          *grad_attn_weight = _grad_a;
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-template <typename data_t>
-__global__ void deformable_attn_cuda_kernel_backward_shm_reduce_v2(
-    const int n, const data_t *grad_col, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  CUDA_KERNEL_LOOP(index, n) {
-    extern __shared__ int _s[];
-    data_t *cache_grad_sampling_loc = (data_t *)_s;
-    data_t *cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
-    unsigned int tid = threadIdx.x;
-    int _temp = index;
-    const int c_col = _temp % channels;
-    _temp /= channels;
-    const int sampling_index = _temp;
-    const int m_col = _temp % num_heads;
-    _temp /= num_heads;
-    const int q_col = _temp % query_length;
-    _temp /= query_length;
-    const int b_col = _temp;
-
-    const data_t top_grad = grad_col[index];
-
-    int data_weight_ptr = sampling_index * num_levels * num_points;
-    int data_loc_w_ptr = data_weight_ptr << 1;
-    const int grad_sampling_ptr = data_weight_ptr;
-    grad_sampling_loc += grad_sampling_ptr << 1;
-    grad_attn_weight += grad_sampling_ptr;
-    const int grad_weight_stride = 1;
-    const int grad_loc_stride = 2;
-    const int qid_stride = num_heads * channels;
-    const int data_value_ptr_init_offset = b_col * value_length * qid_stride;
-
-    for (int l_col = 0; l_col < num_levels; ++l_col) {
-      const int level_start_id = data_level_start_index[l_col];
-      const int spatial_h_ptr = l_col << 1;
-      const int spatial_h = data_spatial_shapes[spatial_h_ptr];
-      const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
-      const int value_ptr_offset =
-          data_value_ptr_init_offset + level_start_id * qid_stride;
-      const data_t *data_value_ptr = data_value + value_ptr_offset;
-      data_t *grad_value_ptr = grad_value + value_ptr_offset;
-
-      for (int p_col = 0; p_col < num_points; ++p_col) {
-        const data_t loc_w = data_sampling_loc[data_loc_w_ptr];
-        const data_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
-        const data_t weight = data_attn_weight[data_weight_ptr];
-
-        const data_t h_im = loc_h * spatial_h - 0.5;
-        const data_t w_im = loc_w * spatial_w - 0.5;
-        *(cache_grad_sampling_loc + (threadIdx.x << 1)) = 0;
-        *(cache_grad_sampling_loc + ((threadIdx.x << 1) + 1)) = 0;
-        *(cache_grad_attn_weight + threadIdx.x) = 0;
-        if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) {
-          deformable_attn_bilinear_backward(
-              data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im,
-              w_im, m_col, c_col, top_grad, weight, grad_value_ptr,
-              cache_grad_sampling_loc + (threadIdx.x << 1),
-              cache_grad_attn_weight + threadIdx.x);
-        }
-
-        __syncthreads();
-
-        for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0;
-             s >>= 1, spre >>= 1) {
-          if (tid < s) {
-            const unsigned int xid1 = tid << 1;
-            const unsigned int xid2 = (tid + s) << 1;
-            cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
-            cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
-            cache_grad_sampling_loc[xid1 + 1] +=
-                cache_grad_sampling_loc[xid2 + 1];
-            if (tid + (s << 1) < spre) {
-              cache_grad_attn_weight[tid] +=
-                  cache_grad_attn_weight[tid + (s << 1)];
-              cache_grad_sampling_loc[xid1] +=
-                  cache_grad_sampling_loc[xid2 + (s << 1)];
-              cache_grad_sampling_loc[xid1 + 1] +=
-                  cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
-            }
-          }
-          __syncthreads();
-        }
-
-        if (tid == 0) {
-          *grad_sampling_loc = cache_grad_sampling_loc[0];
-          *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
-          *grad_attn_weight = cache_grad_attn_weight[0];
-        }
-        __syncthreads();
-
-        data_weight_ptr += 1;
-        data_loc_w_ptr += 2;
-        grad_attn_weight += grad_weight_stride;
-        grad_sampling_loc += grad_loc_stride;
-      }
-    }
-  }
-}
-
-// backward branch
-template <typename data_t>
-void deformable_attn_cuda_backward(
-    cudaStream_t stream, const data_t *grad_out, const data_t *data_value,
-    const int64_t *data_spatial_shapes, const int64_t *data_level_start_index,
-    const data_t *data_sampling_loc, const data_t *data_attn_weight,
-    const int batch_size, const int value_length, const int num_heads,
-    const int channels, const int num_levels, const int query_length,
-    const int num_points, data_t *grad_value, data_t *grad_sampling_loc,
-    data_t *grad_attn_weight) {
-  const int num_threads =
-      (channels > CUDA_NUM_THREADS) ? CUDA_NUM_THREADS : channels;
-  const int num_kernels = batch_size * query_length * num_heads * channels;
-  const int num_actual_kernels =
-      batch_size * query_length * num_heads * channels;
-  if (channels > 1024) {
-    if ((channels & 1023) == 0) {
-      deformable_attn_cuda_kernel_backward_shm_reduce_v2_multi_blocks<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-             num_threads * 3 * sizeof(data_t), stream>>>(
-              num_kernels, grad_out, data_value, data_spatial_shapes,
-              data_level_start_index, data_sampling_loc, data_attn_weight,
-              batch_size, value_length, num_heads, channels, num_levels,
-              query_length, num_points, grad_value, grad_sampling_loc,
-              grad_attn_weight);
-    } else {
-      deformable_attn_cuda_kernel_backward_gm<data_t>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-    }
-  } else {
-    switch (channels) {
-    case 1:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 1>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 2:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 2>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 4:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 4>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 8:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 8>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 16:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 16>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 32:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v1<data_t, 32>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 64:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 64>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 128:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 128>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 256:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 256>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 512:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 512>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    case 1024:
-      deformable_attn_cuda_kernel_backward_shm_blocksize_aware_reduce_v2<data_t, 1024>
-          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads, 0,
-             stream>>>(num_kernels, grad_out, data_value, data_spatial_shapes,
-                       data_level_start_index, data_sampling_loc,
-                       data_attn_weight, batch_size, value_length, num_heads,
-                       channels, num_levels, query_length, num_points,
-                       grad_value, grad_sampling_loc, grad_attn_weight);
-      break;
-    default:
-      if (channels < 64) {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v1<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      } else {
-        deformable_attn_cuda_kernel_backward_shm_reduce_v2<data_t>
-            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
-               num_threads * 3 * sizeof(data_t), stream>>>(
-                num_kernels, grad_out, data_value, data_spatial_shapes,
-                data_level_start_index, data_sampling_loc, data_attn_weight,
-                batch_size, value_length, num_heads, channels, num_levels,
-                query_length, num_points, grad_value, grad_sampling_loc,
-                grad_attn_weight);
-      }
-    }
-  }
-}
-
-// backward
-std::vector<paddle::Tensor> MSDeformableAttnCUDABackward(
-    const paddle::Tensor &value, const paddle::Tensor &value_spatial_shapes,
-    const paddle::Tensor &value_level_start_index,
-    const paddle::Tensor &sampling_locations,
-    const paddle::Tensor &attention_weights, const paddle::Tensor &grad_out) {
-
-  CHECK_INPUT_GPU(value);
-  CHECK_INPUT_GPU(value_spatial_shapes);
-  CHECK_INPUT_GPU(value_level_start_index);
-  CHECK_INPUT_GPU(sampling_locations);
-  CHECK_INPUT_GPU(attention_weights);
-  CHECK_INPUT_GPU(grad_out);
-
-  const int batch_size = value.shape()[0];
-  const int value_length = value.shape()[1];
-  const int num_heads = value.shape()[2];
-  const int channels = value.shape()[3];
-
-  const int num_levels = value_spatial_shapes.shape()[0];
-  const int query_length = sampling_locations.shape()[1];
-  const int num_points = sampling_locations.shape()[4];
-
-  auto grad_value =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_spatial_shapes =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_level_start_index =
-      paddle::full(value.shape(), 0, value.dtype(), paddle::GPUPlace());
-  auto grad_sampling_locations =
-      paddle::full(sampling_locations.shape(), 0, sampling_locations.dtype(),
-                   paddle::GPUPlace());
-  auto grad_attention_weights =
-      paddle::full(attention_weights.shape(), 0, attention_weights.dtype(),
-                   paddle::GPUPlace());
-
-  deformable_attn_cuda_backward<float>(
-      value.stream(), grad_out.data<float>(), value.data<float>(),
-      value_spatial_shapes.data<int64_t>(),
-      value_level_start_index.data<int64_t>(), sampling_locations.data<float>(),
-      attention_weights.data<float>(), batch_size, value_length, num_heads,
-      channels, num_levels, query_length, num_points, grad_value.data<float>(),
-      grad_sampling_locations.data<float>(),
-      grad_attention_weights.data<float>());
-
-  return {grad_value, grad_spatial_shapes, grad_level_start_index,
-          grad_sampling_locations, grad_attention_weights};
-}
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
deleted file mode 100644
index 7c3c386..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/setup_ms_deformable_attn_op.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from paddle.utils.cpp_extension import CUDAExtension, setup
-
-if __name__ == "__main__":
-    setup(
-        name='deformable_detr_ops',
-        ext_modules=CUDAExtension(
-            sources=['ms_deformable_attn_op.cc', 'ms_deformable_attn_op.cu']))
diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
deleted file mode 100644
index 94a0573..0000000
--- a/pdfdet/models/Paddle/ppdet/modeling/transformers/ext_op/test_ms_deformable_attn_op.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
- -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import os -import sys -import random -import numpy as np -import paddle -# add python path of PaddleDetection to sys.path -parent_path = os.path.abspath(os.path.join(__file__, *(['..'] * 5))) -if parent_path not in sys.path: - sys.path.append(parent_path) - -from ppdet.modeling.transformers.utils import deformable_attention_core_func -ms_deform_attn_core_paddle = deformable_attention_core_func - -try: - gpu_index = int(sys.argv[1]) -except: - gpu_index = 0 -print(f'Use gpu {gpu_index} to test...') -paddle.set_device(f'gpu:{gpu_index}') - -try: - from deformable_detr_ops import ms_deformable_attn -except Exception as e: - print('import deformable_detr_ops error', e) - sys.exit(-1) - -paddle.seed(1) -random.seed(1) -np.random.seed(1) - -bs, n_heads, c = 2, 8, 8 -query_length, n_levels, n_points = 2, 2, 2 -spatial_shapes = paddle.to_tensor([(6, 4), (3, 2)], dtype=paddle.int64) -level_start_index = paddle.concat((paddle.to_tensor( - [0], dtype=paddle.int64), spatial_shapes.prod(1).cumsum(0)[:-1])) -value_length = sum([(H * W).item() for H, W in spatial_shapes]) - - -def get_test_tensors(channels): - value = paddle.rand( - [bs, value_length, n_heads, channels], dtype=paddle.float32) * 0.01 - sampling_locations = paddle.rand( - [bs, query_length, n_heads, n_levels, n_points, 2], - dtype=paddle.float32) - attention_weights = paddle.rand( - [bs, query_length, n_heads, n_levels, n_points], - dtype=paddle.float32) + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum( - -2, keepdim=True) - - return [value, sampling_locations, attention_weights] - - -@paddle.no_grad() -def check_forward_equal_with_paddle_float(): - value, sampling_locations, attention_weights = get_test_tensors(c) - - output_paddle = ms_deform_attn_core_paddle( - value, spatial_shapes, level_start_index, sampling_locations, - attention_weights).detach().cpu() - output_cuda = ms_deformable_attn(value, spatial_shapes, level_start_index, - sampling_locations, - attention_weights).detach().cpu() - fwdok = paddle.allclose( - output_cuda, output_paddle, rtol=1e-2, atol=1e-3).item() - max_abs_err = (output_cuda - output_paddle).abs().max().item() - max_rel_err = ( - (output_cuda - output_paddle).abs() / output_paddle.abs()).max().item() - - print( - f'*{fwdok} check_forward_equal_with_paddle_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}' - ) - - -def check_gradient_numerical(channels=4): - value_paddle, sampling_locations_paddle, attention_weights_paddle = get_test_tensors( - channels) - value_paddle.stop_gradient = False - sampling_locations_paddle.stop_gradient = False - attention_weights_paddle.stop_gradient = False - - value_cuda = value_paddle.detach().clone() - sampling_locations_cuda = sampling_locations_paddle.detach().clone() - attention_weights_cuda = attention_weights_paddle.detach().clone() - value_cuda.stop_gradient = False - sampling_locations_cuda.stop_gradient = False - attention_weights_cuda.stop_gradient = False - - output_paddle = ms_deform_attn_core_paddle( - value_paddle, spatial_shapes, level_start_index, - sampling_locations_paddle, attention_weights_paddle) - output_paddle.sum().backward() - - output_cuda = ms_deformable_attn(value_cuda, spatial_shapes, - level_start_index, sampling_locations_cuda, - attention_weights_cuda) - output_cuda.sum().backward() - - res = paddle.allclose( - value_paddle.grad, value_cuda.grad, rtol=1e-2, atol=1e-3).item() - 
print(f'*tensor1 {res} check_gradient_numerical(D={channels})') - - res = paddle.allclose( - sampling_locations_paddle.grad, - sampling_locations_cuda.grad, - rtol=1e-2, - atol=1e-3).item() - print(f'*tensor2 {res} check_gradient_numerical(D={channels})') - - res = paddle.allclose( - attention_weights_paddle.grad, - attention_weights_cuda.grad, - rtol=1e-2, - atol=1e-3).item() - print(f'*tensor3 {res} check_gradient_numerical(D={channels})') - - -if __name__ == '__main__': - check_forward_equal_with_paddle_float() - - for channels in [30, 32, 64, 71, 128, 1024, 1025, 2048, 3096]: - check_gradient_numerical(channels) diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/group_detr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/group_detr_transformer.py deleted file mode 100644 index 31ec617..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/group_detr_transformer.py +++ /dev/null @@ -1,857 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. 
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from .position_encoding import PositionEmbedding -from ..heads.detr_head import MLP -from .deformable_transformer import MSDeformableAttention -from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, - bias_init_with_prob) -from .utils import (_get_clones, get_valid_ratio, - get_contrastive_denoising_training_group, - get_sine_pos_embed, inverse_sigmoid) - -__all__ = ['GroupDINOTransformer'] - - -class DINOTransformerEncoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - weight_attr=None, - bias_attr=None): - super(DINOTransformerEncoderLayer, self).__init__() - # self attention - self.self_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, 1.0) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, - bias_attr) - self.activation = getattr(F, activation) - self.dropout2 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, - bias_attr) - self.dropout3 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, src): - src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) - src = src + self.dropout3(src2) - src = self.norm2(src) - return src - - def forward(self, - src, - reference_points, - spatial_shapes, - level_start_index, - src_mask=None, - query_pos_embed=None): - # self attention - src2 = self.self_attn( - self.with_pos_embed(src, query_pos_embed), reference_points, src, - spatial_shapes, level_start_index, src_mask) - src = src + self.dropout1(src2) - src = self.norm1(src) - # ffn - src = self.forward_ffn(src) - - return src - - -class DINOTransformerEncoder(nn.Layer): - def __init__(self, encoder_layer, num_layers): - super(DINOTransformerEncoder, self).__init__() - self.layers = _get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - - @staticmethod - def get_reference_points(spatial_shapes, valid_ratios, offset=0.5): - valid_ratios = valid_ratios.unsqueeze(1) - reference_points = [] - for i, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = paddle.meshgrid( - paddle.arange(end=H) + offset, paddle.arange(end=W) + offset) - ref_y = ref_y.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 1] * - H) - ref_x = ref_x.flatten().unsqueeze(0) / (valid_ratios[:, :, i, 0] * - W) - reference_points.append(paddle.stack((ref_x, ref_y), axis=-1)) - reference_points = paddle.concat(reference_points, 1).unsqueeze(2) - reference_points = reference_points * valid_ratios - return reference_points - - def forward(self, - feat, - spatial_shapes, - 
level_start_index, - feat_mask=None, - query_pos_embed=None, - valid_ratios=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [feat.shape[0], spatial_shapes.shape[0], 2]) - reference_points = self.get_reference_points(spatial_shapes, - valid_ratios) - for layer in self.layers: - feat = layer(feat, reference_points, spatial_shapes, - level_start_index, feat_mask, query_pos_embed) - - return feat - - -class DINOTransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - dual_queries=False, - dual_groups=0, - weight_attr=None, - bias_attr=None): - super(DINOTransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # cross attention - self.cross_attn = MSDeformableAttention(d_model, n_head, n_levels, - n_points, 1.0) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, - bias_attr) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, - bias_attr) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # for dual groups - self.dual_queries = dual_queries - self.dual_groups = dual_groups - self.n_head = n_head - - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - if self.dual_queries: - dual_groups = self.dual_groups - bs, num_queries, n_model = paddle.shape(q) - q = paddle.concat(q.split(dual_groups + 1, axis=1), axis=0) - k = paddle.concat(k.split(dual_groups + 1, axis=1), axis=0) - tgt = paddle.concat(tgt.split(dual_groups + 1, axis=1), axis=0) - - g_num_queries = num_queries // (dual_groups + 1) - if attn_mask is None or attn_mask[0] is None: - attn_mask = None - else: - # [(dual_groups + 1), g_num_queries, g_num_queries] - attn_mask = paddle.concat( - [sa_mask.unsqueeze(0) for sa_mask in attn_mask], axis=0) - # [1, (dual_groups + 1), 1, g_num_queries, g_num_queries] - # --> [bs, (dual_groups + 1), nhead, g_num_queries, g_num_queries] - # --> [bs * (dual_groups + 1), nhead, g_num_queries, g_num_queries] - attn_mask = attn_mask.unsqueeze(0).unsqueeze(2).tile( - [bs, 1, self.n_head, 1, 1]) - attn_mask = attn_mask.reshape([ - bs * (dual_groups + 1), self.n_head, g_num_queries, - g_num_queries - ]) - - if attn_mask is not None: - attn_mask = attn_mask.astype('bool') - - tgt2 = self.self_attn(q, k, value=tgt, 
attn_mask=attn_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm2(tgt) - - # trace back - if self.dual_queries: - tgt = paddle.concat(tgt.split(dual_groups + 1, axis=0), axis=1) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm1(tgt) - - # ffn - tgt2 = self.forward_ffn(tgt) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - - return tgt - - -class DINOTransformerDecoder(nn.Layer): - def __init__(self, - hidden_dim, - decoder_layer, - num_layers, - return_intermediate=True): - super(DINOTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - self.return_intermediate = return_intermediate - - self.norm = nn.LayerNorm( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - query_pos_head, - valid_ratios=None, - attn_mask=None, - memory_mask=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [memory.shape[0], memory_spatial_shapes.shape[0], 2]) - - output = tgt - intermediate = [] - inter_ref_bboxes = [] - for i, layer in enumerate(self.layers): - reference_points_input = reference_points.unsqueeze( - 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) - query_pos_embed = get_sine_pos_embed( - reference_points_input[..., 0, :], self.hidden_dim // 2) - query_pos_embed = query_pos_head(query_pos_embed) - - output = layer(output, reference_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - reference_points)) - - if self.return_intermediate: - intermediate.append(self.norm(output)) - inter_ref_bboxes.append(inter_ref_bbox) - - reference_points = inter_ref_bbox.detach() - - if self.return_intermediate: - return paddle.stack(intermediate), paddle.stack(inter_ref_bboxes) - - return output, reference_points - - -@register -class GroupDINOTransformer(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=900, - position_embed_type='sine', - return_intermediate_dec=True, - backbone_feat_channels=[512, 1024, 2048], - num_levels=4, - num_encoder_points=4, - num_decoder_points=4, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - pe_temperature=10000, - pe_offset=-0.5, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=True, - use_input_proj=True, - dual_queries=False, - dual_groups=0, - eps=1e-2): - super(GroupDINOTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
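Each decoder layer above refines its boxes in logit space: the bbox head predicts an unbounded delta, which is added to inverse_sigmoid of the current reference points and squashed back through a sigmoid, so the refined box always stays in [0, 1]. One refinement step in isolation, assuming an inverse_sigmoid equivalent to the one imported from .utils:

    import paddle
    import paddle.nn.functional as F

    def inverse_sigmoid(x, eps=1e-5):
        # Clip first so the log stays finite at the box boundaries.
        x = x.clip(min=eps, max=1 - eps)
        return paddle.log(x / (1 - x))

    ref = paddle.to_tensor([[0.25, 0.50, 0.10, 0.20]])     # cxcywh in [0, 1]
    delta = paddle.to_tensor([[0.30, -0.20, 0.00, 0.10]])  # unbounded head output
    new_ref = F.sigmoid(delta + inverse_sigmoid(ref))      # refined, still in [0, 1]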
- assert len(backbone_feat_channels) <= num_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - self.use_input_proj = use_input_proj - - if use_input_proj: - # backbone feature projection - self._build_input_proj_layer(backbone_feat_channels) - - # Transformer module - encoder_layer = DINOTransformerEncoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_encoder_points) - self.encoder = DINOTransformerEncoder(encoder_layer, num_encoder_layers) - decoder_layer = DINOTransformerDecoderLayer( - hidden_dim, - nhead, - dim_feedforward, - dropout, - activation, - num_levels, - num_decoder_points, - dual_queries=dual_queries, - dual_groups=dual_groups) - self.decoder = DINOTransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers, - return_intermediate_dec) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # for dual group - self.dual_queries = dual_queries - self.dual_groups = dual_groups - if self.dual_queries: - self.denoising_class_embed_groups = nn.LayerList([ - nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - for _ in range(self.dual_groups) - ]) - - # position embedding - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - self.level_embed = nn.Embedding(num_levels, hidden_dim) - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - normal_(self.tgt_embed.weight) - if self.dual_queries: - self.tgt_embed_dual = nn.LayerList([ - nn.Embedding(num_queries, hidden_dim) - for _ in range(self.dual_groups) - ]) - for dual_tgt_module in self.tgt_embed_dual: - normal_(dual_tgt_module.weight) - self.query_pos_head = MLP(2 * hidden_dim, - hidden_dim, - hidden_dim, - num_layers=2) - - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - if self.dual_queries: - self.enc_output = _get_clones(self.enc_output, self.dual_groups + 1) - else: - self.enc_output = _get_clones(self.enc_output, 1) - - self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - - if self.dual_queries: - self.enc_bbox_head_dq = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for i in range(self.dual_groups) - ]) - self.enc_score_head_dq = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for i in range(self.dual_groups) - ]) - - # decoder head - self.dec_score_head = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - 
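bias_init_with_prob(0.01) above seeds the classification bias so the head starts out predicting foreground with probability 0.01, the standard focal-loss initialization: b = -log((1 - p) / p). Checking the arithmetic:

    import math

    p = 0.01
    b = -math.log((1 - p) / p)                       # ≈ -4.595
    assert abs(1 / (1 + math.exp(-b)) - p) < 1e-9    # sigmoid(b) == p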
linear_init_(self.enc_score_head) - constant_(self.enc_score_head.bias, bias_cls) - constant_(self.enc_bbox_head.layers[-1].weight) - constant_(self.enc_bbox_head.layers[-1].bias) - for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): - linear_init_(cls_) - constant_(cls_.bias, bias_cls) - constant_(reg_.layers[-1].weight) - constant_(reg_.layers[-1].bias) - - for enc_output in self.enc_output: - linear_init_(enc_output[0]) - xavier_uniform_(enc_output[0].weight) - normal_(self.level_embed.weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - normal_(self.denoising_class_embed.weight) - if self.use_input_proj: - for l in self.input_proj: - xavier_uniform_(l[0].weight) - constant_(l[0].bias) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'backbone_feat_channels': [i.channels for i in input_shape], } - - def _build_input_proj_layer(self, backbone_feat_channels): - self.input_proj = nn.LayerList() - for in_channels in backbone_feat_channels: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, self.hidden_dim, kernel_size=1)), - ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = backbone_feat_channels[-1] - for _ in range(self.num_levels - len(backbone_feat_channels)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1)), ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats, pad_mask=None): - if self.use_input_proj: - # get projection features - proj_feats = [ - self.input_proj[i](feat) for i, feat in enumerate(feats) - ] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - else: - proj_feats = feats - # get encoder inputs - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for i, feat in enumerate(proj_feats): - bs, _, h, w = paddle.shape(feat) - spatial_shapes.append(paddle.concat([h, w])) - # [b,c,h,w] -> [b,h*w,c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - if pad_mask is not None: - mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - # [b, h*w, c] - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[i].reshape( - [1, 1, -1]) - lvl_pos_embed_flatten.append(lvl_pos_embed) - if pad_mask is not None: - # [b, h*w] - mask_flatten.append(mask.flatten(1)) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - # [b, l] - mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, - 1) - # [b, l, c] - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [num_levels, 2] - spatial_shapes = paddle.to_tensor( - paddle.stack(spatial_shapes).astype('int64')) - # [l] start index of each level - level_start_index = paddle.concat([ - paddle.zeros( - [1], dtype='int64'), 
spatial_shapes.prod(1).cumsum(0)[:-1] - ]) - # [b, num_levels, 2] - valid_ratios = paddle.stack(valid_ratios, 1) - return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, valid_ratios) - - def forward(self, feats, pad_mask=None, gt_meta=None): - # input projection and embedding - (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, - valid_ratios) = self._get_encoder_input(feats, pad_mask) - - # encoder - memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, - mask_flatten, lvl_pos_embed_flatten, valid_ratios) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - if self.dual_queries: - denoising_class_groups = [] - denoising_bbox_groups = [] - attn_mask_groups = [] - dn_meta_groups = [] - for g_id in range(self.dual_groups): - denoising_class_gid, denoising_bbox_gid, attn_mask_gid, dn_meta_gid = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed_groups[g_id].weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - denoising_class_groups.append(denoising_class_gid) - denoising_bbox_groups.append(denoising_bbox_gid) - attn_mask_groups.append(attn_mask_gid) - dn_meta_groups.append(dn_meta_gid) - - # combine - denoising_class = [denoising_class] + denoising_class_groups - denoising_bbox = [denoising_bbox] + denoising_bbox_groups - attn_mask = [attn_mask] + attn_mask_groups - dn_meta = [dn_meta] + dn_meta_groups - else: - denoising_class, denoising_bbox, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input( - memory, spatial_shapes, mask_flatten, denoising_class, - denoising_bbox) - - # decoder - inter_feats, inter_ref_bboxes = self.decoder( - target, init_ref_points, memory, spatial_shapes, level_start_index, - self.dec_bbox_head, self.query_pos_head, valid_ratios, attn_mask, - mask_flatten) - # solve hang during distributed training - inter_feats[0] += self.denoising_class_embed.weight[0, 0] * 0. 
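The `* 0.` line above (and its dual-group repetition just below) is the usual fix for the distributed hang the comment mentions: adding a parameter times zero to an output changes nothing numerically, but it keeps that parameter in the autograd graph, so every rank produces a gradient for it and the all-reduce over gradients cannot stall on an unused parameter. The trick in isolation:

    import paddle

    embed = paddle.nn.Embedding(10, 4)
    out = paddle.rand([2, 4])
    # Numerically a no-op, but embed.weight now participates in backward.
    out = out + embed.weight[0, 0] * 0.
    out.sum().backward()
    print(embed.weight.grad.abs().sum())   # all-zero gradient instead of None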
- if self.dual_queries: - for g_id in range(self.dual_groups): - inter_feats[0] += self.denoising_class_embed_groups[ - g_id].weight[0, 0] * 0.0 - - out_bboxes = [] - out_logits = [] - for i in range(self.num_decoder_layers): - out_logits.append(self.dec_score_head[i](inter_feats[i])) - if i == 0: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - inverse_sigmoid(init_ref_points))) - else: - out_bboxes.append( - F.sigmoid(self.dec_bbox_head[i](inter_feats[i]) + - inverse_sigmoid(inter_ref_bboxes[i - 1]))) - - out_bboxes = paddle.stack(out_bboxes) - out_logits = paddle.stack(out_logits) - return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) - - def _get_encoder_output_anchors(self, - memory, - spatial_shapes, - memory_mask=None, - grid_size=0.05): - output_anchors = [] - idx = 0 - for lvl, (h, w) in enumerate(spatial_shapes): - if memory_mask is not None: - mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) - valid_H = paddle.sum(mask_[:, :, 0], 1) - valid_W = paddle.sum(mask_[:, 0, :], 1) - else: - valid_H, valid_W = h, w - - grid_y, grid_x = paddle.meshgrid( - paddle.arange( - end=h, dtype=memory.dtype), - paddle.arange( - end=w, dtype=memory.dtype)) - grid_xy = paddle.stack([grid_x, grid_y], -1) - - valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( - [-1, 1, 1, 2]).astype(grid_xy.dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - output_anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - idx += h * w - - output_anchors = paddle.concat(output_anchors, 1) - valid_mask = ((output_anchors > self.eps) * - (output_anchors < 1 - self.eps)).all(-1, keepdim=True) - output_anchors = paddle.log(output_anchors / (1 - output_anchors)) - if memory_mask is not None: - valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 - output_anchors = paddle.where(valid_mask, output_anchors, - paddle.to_tensor(float("inf"))) - - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - if self.dual_queries: - output_memory = [ - self.enc_output[g_id](memory) - for g_id in range(self.dual_groups + 1) - ] - else: - output_memory = self.enc_output[0](memory) - return output_memory, output_anchors - - def _get_decoder_input(self, - memory, - spatial_shapes, - memory_mask=None, - denoising_class=None, - denoising_bbox=None): - bs, _, _ = memory.shape - # prepare input for decoder - output_memory, output_anchors = self._get_encoder_output_anchors( - memory, spatial_shapes, memory_mask) - if self.dual_queries: - enc_outputs_class = self.enc_score_head(output_memory[0]) - enc_outputs_coord_unact = self.enc_bbox_head(output_memory[ - 0]) + output_anchors - else: - enc_outputs_class = self.enc_score_head(output_memory) - enc_outputs_coord_unact = self.enc_bbox_head( - output_memory) + output_anchors - - _, topk_ind = paddle.topk( - enc_outputs_class.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - topk_coords_unact = paddle.gather_nd(enc_outputs_coord_unact, - topk_ind) # unsigmoided. 
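The top-k proposal selection above is the standard batched gather: paddle.topk gives per-image token indices, which are stacked with a broadcast batch index so gather_nd can pull out the chosen rows. The same pattern at toy scale:

    import paddle

    bs, l, k = 2, 5, 3
    scores = paddle.rand([bs, l])
    feats = paddle.rand([bs, l, 4])

    _, topk_ind = paddle.topk(scores, k, axis=1)              # [bs, k]
    batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
    batch_ind = batch_ind.unsqueeze(-1).tile([1, k])          # [bs, k]
    ind = paddle.stack([batch_ind, topk_ind], axis=-1)        # [bs, k, 2]
    topk_feats = paddle.gather_nd(feats, ind)                 # [bs, k, 4]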
- enc_topk_bboxes = F.sigmoid(topk_coords_unact) - reference_points = enc_topk_bboxes.detach() - enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) - - if self.dual_queries: - enc_topk_logits_groups = [] - enc_topk_bboxes_groups = [] - reference_points_groups = [] - topk_ind_groups = [] - for g_id in range(self.dual_groups): - enc_outputs_class_gid = self.enc_score_head_dq[g_id]( - output_memory[g_id + 1]) - enc_outputs_coord_unact_gid = self.enc_bbox_head_dq[g_id]( - output_memory[g_id + 1]) + output_anchors - _, topk_ind_gid = paddle.topk( - enc_outputs_class_gid.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs, dtype=topk_ind_gid.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind_gid = paddle.stack([batch_ind, topk_ind_gid], axis=-1) - topk_coords_unact_gid = paddle.gather_nd( - enc_outputs_coord_unact_gid, topk_ind_gid) # unsigmoided. - enc_topk_bboxes_gid = F.sigmoid(topk_coords_unact_gid) - reference_points_gid = enc_topk_bboxes_gid.detach() - enc_topk_logits_gid = paddle.gather_nd(enc_outputs_class_gid, - topk_ind_gid) - - # append and combine - topk_ind_groups.append(topk_ind_gid) - enc_topk_logits_groups.append(enc_topk_logits_gid) - enc_topk_bboxes_groups.append(enc_topk_bboxes_gid) - reference_points_groups.append(reference_points_gid) - - enc_topk_bboxes = paddle.concat( - [enc_topk_bboxes] + enc_topk_bboxes_groups, 1) - enc_topk_logits = paddle.concat( - [enc_topk_logits] + enc_topk_logits_groups, 1) - reference_points = paddle.concat( - [reference_points] + reference_points_groups, 1) - topk_ind = paddle.concat([topk_ind] + topk_ind_groups, 1) - - # extract region features - if self.learnt_init_query: - target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) - if self.dual_queries: - target = paddle.concat([target] + [ - self.tgt_embed_dual[g_id].weight.unsqueeze(0).tile( - [bs, 1, 1]) for g_id in range(self.dual_groups) - ], 1) - else: - if self.dual_queries: - target = paddle.gather_nd(output_memory[0], topk_ind) - target_groups = [] - for g_id in range(self.dual_groups): - target_gid = paddle.gather_nd(output_memory[g_id + 1], - topk_ind_groups[g_id]) - target_groups.append(target_gid) - target = paddle.concat([target] + target_groups, 1).detach() - else: - target = paddle.gather_nd(output_memory, topk_ind).detach() - - if denoising_bbox is not None: - if isinstance(denoising_bbox, list) and isinstance( - denoising_class, list) and self.dual_queries: - if denoising_bbox[0] is not None: - reference_points_list = paddle.split( - reference_points, self.dual_groups + 1, axis=1) - reference_points = paddle.concat( - [ - paddle.concat( - [ref, ref_], axis=1) - for ref, ref_ in zip(denoising_bbox, - reference_points_list) - ], - axis=1) - - target_list = paddle.split( - target, self.dual_groups + 1, axis=1) - target = paddle.concat( - [ - paddle.concat( - [tgt, tgt_], axis=1) - for tgt, tgt_ in zip(denoising_class, target_list) - ], - axis=1) - else: - reference_points, target = reference_points, target - else: - reference_points = paddle.concat( - [denoising_bbox, reference_points], 1) - target = paddle.concat([denoising_class, target], 1) - - return target, reference_points, enc_topk_bboxes, enc_topk_logits diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/hybrid_encoder.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/hybrid_encoder.py deleted file mode 100644 index 5694803..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/hybrid_encoder.py 
+++ /dev/null @@ -1,301 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from ppdet.core.workspace import register, serializable -from ppdet.modeling.ops import get_act_fn -from ..shape_spec import ShapeSpec -from ..backbones.csp_darknet import BaseConv -from ..backbones.cspresnet import RepVggBlock -from ppdet.modeling.transformers.detr_transformer import TransformerEncoder -from ..initializer import xavier_uniform_, linear_init_ -from ..layers import MultiHeadAttention -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -__all__ = ['HybridEncoder'] - - -class CSPRepLayer(nn.Layer): - def __init__(self, - in_channels, - out_channels, - num_blocks=3, - expansion=1.0, - bias=False, - act="silu"): - super(CSPRepLayer, self).__init__() - hidden_channels = int(out_channels * expansion) - self.conv1 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.conv2 = BaseConv( - in_channels, hidden_channels, ksize=1, stride=1, bias=bias, act=act) - self.bottlenecks = nn.Sequential(* [ - RepVggBlock( - hidden_channels, hidden_channels, act=act) - for _ in range(num_blocks) - ]) - if hidden_channels != out_channels: - self.conv3 = BaseConv( - hidden_channels, - out_channels, - ksize=1, - stride=1, - bias=bias, - act=act) - else: - self.conv3 = nn.Identity() - - def forward(self, x): - x_1 = self.conv1(x) - x_1 = self.bottlenecks(x_1) - x_2 = self.conv2(x) - return self.conv3(x_1 + x_2) - - -@register -class TransformerLayer(nn.Layer): - def __init__(self, - d_model, - nhead, - dim_feedforward=1024, - dropout=0., - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask) - - src = 
residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -@register -@serializable -class HybridEncoder(nn.Layer): - __shared__ = ['depth_mult', 'act', 'trt', 'eval_size'] - __inject__ = ['encoder_layer'] - - def __init__(self, - in_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - hidden_dim=256, - use_encoder_idx=[2], - num_encoder_layers=1, - encoder_layer='TransformerLayer', - pe_temperature=10000, - expansion=1.0, - depth_mult=1.0, - act='silu', - trt=False, - eval_size=None): - super(HybridEncoder, self).__init__() - self.in_channels = in_channels - self.feat_strides = feat_strides - self.hidden_dim = hidden_dim - self.use_encoder_idx = use_encoder_idx - self.num_encoder_layers = num_encoder_layers - self.pe_temperature = pe_temperature - self.eval_size = eval_size - - # channel projection - self.input_proj = nn.LayerList() - for in_channel in in_channels: - self.input_proj.append( - nn.Sequential( - nn.Conv2D( - in_channel, hidden_dim, kernel_size=1, bias_attr=False), - nn.BatchNorm2D( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))))) - # encoder transformer - self.encoder = nn.LayerList([ - TransformerEncoder(encoder_layer, num_encoder_layers) - for _ in range(len(use_encoder_idx)) - ]) - - act = get_act_fn( - act, trt=trt) if act is None or isinstance(act, - (str, dict)) else act - # top-down fpn - self.lateral_convs = nn.LayerList() - self.fpn_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1, 0, -1): - self.lateral_convs.append( - BaseConv( - hidden_dim, hidden_dim, 1, 1, act=act)) - self.fpn_blocks.append( - CSPRepLayer( - hidden_dim * 2, - hidden_dim, - round(3 * depth_mult), - act=act, - expansion=expansion)) - - # bottom-up pan - self.downsample_convs = nn.LayerList() - self.pan_blocks = nn.LayerList() - for idx in range(len(in_channels) - 1): - self.downsample_convs.append( - BaseConv( - hidden_dim, hidden_dim, 3, stride=2, act=act)) - self.pan_blocks.append( - CSPRepLayer( - hidden_dim * 2, - hidden_dim, - round(3 * depth_mult), - act=act, - expansion=expansion)) - - self._reset_parameters() - - def _reset_parameters(self): - if self.eval_size: - for idx in self.use_encoder_idx: - stride = self.feat_strides[idx] - pos_embed = self.build_2d_sincos_position_embedding( - self.eval_size[1] // stride, self.eval_size[0] // stride, - self.hidden_dim, self.pe_temperature) - setattr(self, f'pos_embed{idx}', pos_embed) - - @staticmethod - def build_2d_sincos_position_embedding(w, - h, - embed_dim=256, - temperature=10000.): - grid_w = paddle.arange(int(w), dtype=paddle.float32) - grid_h = paddle.arange(int(h), dtype=paddle.float32) - grid_w, grid_h = paddle.meshgrid(grid_w, grid_h) - assert embed_dim % 4 == 0, \ - 'Embed dimension must be divisible by 4 for 2D sin-cos position embedding' - pos_dim = embed_dim // 4 - omega = paddle.arange(pos_dim, dtype=paddle.float32) / pos_dim - omega = 1. 
/ (temperature**omega) - - out_w = grid_w.flatten()[..., None] @omega[None] - out_h = grid_h.flatten()[..., None] @omega[None] - - return paddle.concat( - [ - paddle.sin(out_w), paddle.cos(out_w), paddle.sin(out_h), - paddle.cos(out_h) - ], - axis=1)[None, :, :] - - def forward(self, feats, for_mot=False, is_teacher=False): - assert len(feats) == len(self.in_channels) - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - # encoder - if self.num_encoder_layers > 0: - for i, enc_ind in enumerate(self.use_encoder_idx): - h, w = proj_feats[enc_ind].shape[2:] - # flatten [B, C, H, W] to [B, HxW, C] - src_flatten = proj_feats[enc_ind].flatten(2).transpose( - [0, 2, 1]) - if self.training or self.eval_size is None or is_teacher: - pos_embed = self.build_2d_sincos_position_embedding( - w, h, self.hidden_dim, self.pe_temperature) - else: - pos_embed = getattr(self, f'pos_embed{enc_ind}', None) - memory = self.encoder[i](src_flatten, pos_embed=pos_embed) - proj_feats[enc_ind] = memory.transpose([0, 2, 1]).reshape( - [-1, self.hidden_dim, h, w]) - - # top-down fpn - inner_outs = [proj_feats[-1]] - for idx in range(len(self.in_channels) - 1, 0, -1): - feat_heigh = inner_outs[0] - feat_low = proj_feats[idx - 1] - feat_heigh = self.lateral_convs[len(self.in_channels) - 1 - idx]( - feat_heigh) - inner_outs[0] = feat_heigh - - upsample_feat = F.interpolate( - feat_heigh, scale_factor=2., mode="nearest") - inner_out = self.fpn_blocks[len(self.in_channels) - 1 - idx]( - paddle.concat( - [upsample_feat, feat_low], axis=1)) - inner_outs.insert(0, inner_out) - - # bottom-up pan - outs = [inner_outs[0]] - for idx in range(len(self.in_channels) - 1): - feat_low = outs[-1] - feat_height = inner_outs[idx + 1] - downsample_feat = self.downsample_convs[idx](feat_low) - out = self.pan_blocks[idx](paddle.concat( - [downsample_feat, feat_height], axis=1)) - outs.append(out) - - return outs - - @classmethod - def from_config(cls, cfg, input_shape): - return { - 'in_channels': [i.channels for i in input_shape], - 'feat_strides': [i.stride for i in input_shape] - } - - @property - def out_shape(self): - return [ - ShapeSpec( - channels=self.hidden_dim, stride=self.feat_strides[idx]) - for idx in range(len(self.in_channels)) - ] diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/mask_dino_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/mask_dino_transformer.py deleted file mode 100644 index 6b29223..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/mask_dino_transformer.py +++ /dev/null @@ -1,536 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. 
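build_2d_sincos_position_embedding above is HybridEncoder's analytic positional encoding: a quarter of the channels each go to sin/cos of x and sin/cos of y, at frequencies 1/temperature^(i/pos_dim). A numpy re-derivation of the same construction (embed_dim must be divisible by 4):

    import numpy as np

    def sincos_2d(w, h, dim=8, temp=10000.0):
        assert dim % 4 == 0, 'dim must be divisible by 4'
        pos_dim = dim // 4
        omega = 1.0 / temp ** (np.arange(pos_dim, dtype=np.float32) / pos_dim)
        gw, gh = np.meshgrid(np.arange(w, dtype=np.float32),
                             np.arange(h, dtype=np.float32), indexing='ij')
        out_w = gw.reshape(-1, 1) * omega          # [w*h, pos_dim]
        out_h = gh.reshape(-1, 1) * omega
        return np.concatenate([np.sin(out_w), np.cos(out_w),
                               np.sin(out_h), np.cos(out_h)], axis=1)

    pe = sincos_2d(4, 3)    # 12 positions, 8 channels, all values in [-1, 1]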
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from .position_encoding import PositionEmbedding -from ..heads.detr_head import MLP -from .deformable_transformer import (DeformableTransformerEncoderLayer, - DeformableTransformerEncoder) -from .dino_transformer import (DINOTransformerDecoderLayer) -from ..initializer import (linear_init_, constant_, xavier_uniform_, - bias_init_with_prob) -from .utils import (_get_clones, get_valid_ratio, get_denoising_training_group, - get_sine_pos_embed, inverse_sigmoid, mask_to_box_coordinate) - -__all__ = ['MaskDINO'] - - -class ConvGNBlock(nn.Layer): - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - groups=1, - num_groups=32, - bias=False, - act=None): - super(ConvGNBlock, self).__init__() - self.conv = nn.Conv2D( - in_channels, - out_channels, - kernel_size=kernel_size, - stride=stride, - padding=(kernel_size - 1) // 2, - groups=groups, - bias_attr=bias) - self.norm = nn.GroupNorm( - num_groups, - out_channels, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self.act = getattr(F, act) if act is not None else None - - self._init_weights() - - def _init_weights(self): - xavier_uniform_(self.conv.weight) - - def forward(self, x): - x = self.norm(self.conv(x)) - if self.act is not None: - x = self.act(x) - return x - - -class MaskDINOTransformerDecoder(nn.Layer): - def __init__(self, hidden_dim, decoder_layer, num_layers): - super(MaskDINOTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - - def forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - query_pos_head, - dec_norm, - valid_ratios=None, - attn_mask=None, - memory_mask=None): - if valid_ratios is None: - valid_ratios = paddle.ones( - [memory.shape[0], memory_spatial_shapes.shape[0], 2]) - - output = tgt - intermediate = [] - inter_bboxes = [] - ref_points = F.sigmoid(ref_points_unact) - for i, layer in enumerate(self.layers): - reference_points_input = ref_points.detach().unsqueeze( - 2) * valid_ratios.tile([1, 1, 2]).unsqueeze(1) - query_pos_embed = get_sine_pos_embed( - reference_points_input[..., 0, :], self.hidden_dim // 2) - query_pos_embed = query_pos_head(query_pos_embed) - - output = layer(output, reference_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - - ref_points = F.sigmoid( - bbox_head(output) + inverse_sigmoid(ref_points.detach())) - - intermediate.append(dec_norm(output)) - inter_bboxes.append(ref_points) - - return paddle.stack(intermediate), paddle.stack(inter_bboxes) - - -@register -class MaskDINO(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=300, - position_embed_type='sine', - in_feats_channel=[256, 512, 1024, 2048], - num_levels=3, - num_encoder_points=4, - num_decoder_points=4, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=9, - enc_dim_feedforward=1024, - dec_dim_feedforward=2048, - dropout=0., - activation="relu", - lr_mult=1.0, - pe_temperature=10000, - pe_offset=-0.5, - num_denoising=100, - 
label_noise_ratio=0.4, - box_noise_scale=0.4, - learnt_init_query=False, - mask_enhanced=True, - eps=1e-2): - super(MaskDINO, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' - feat0_dim = in_feats_channel.pop(0) - assert len(in_feats_channel) <= num_levels - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - self.mask_enhanced = mask_enhanced - - weight_attr = ParamAttr(regularizer=L2Decay(0.0)) - bias_attr = ParamAttr(regularizer=L2Decay(0.0)) - # backbone feature projection - self._build_input_proj_layer(in_feats_channel, weight_attr, bias_attr) - - # Transformer module - encoder_layer = DeformableTransformerEncoderLayer( - hidden_dim, nhead, enc_dim_feedforward, dropout, activation, - num_levels, num_encoder_points, lr_mult, weight_attr, bias_attr) - self.encoder = DeformableTransformerEncoder(encoder_layer, - num_encoder_layers) - decoder_layer = DINOTransformerDecoderLayer( - hidden_dim, nhead, dec_dim_feedforward, dropout, activation, - num_levels, num_decoder_points, lr_mult, weight_attr, bias_attr) - self.decoder = MaskDINOTransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # position embedding - self.position_embedding = PositionEmbedding( - hidden_dim // 2, - temperature=pe_temperature, - normalize=True if position_embed_type == 'sine' else False, - embed_type=position_embed_type, - offset=pe_offset) - self.level_embed = nn.Embedding( - num_levels, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(2 * hidden_dim, - hidden_dim, - hidden_dim, - num_layers=2) - # mask embedding - self.mask_query_head = MLP(hidden_dim, - hidden_dim, - hidden_dim, - num_layers=3) - - # encoder mask head - self.enc_mask_lateral = ConvGNBlock(feat0_dim, hidden_dim, 1) - self.enc_mask_output = nn.Sequential( - ConvGNBlock( - hidden_dim, hidden_dim, 3, act=activation), - nn.Conv2D(hidden_dim, hidden_dim, 1)) - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr)) - # decoder norm layer - self.dec_norm = nn.LayerNorm( - hidden_dim, weight_attr=weight_attr, bias_attr=bias_attr) - # shared prediction head - self.class_head = nn.Linear(hidden_dim, num_classes) - self.bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - linear_init_(self.class_head) - constant_(self.class_head.bias, bias_cls) - constant_(self.bbox_head.layers[-1].weight) - constant_(self.bbox_head.layers[-1].bias) - - xavier_uniform_(self.enc_mask_output[1].weight) - linear_init_(self.enc_output[0]) - xavier_uniform_(self.enc_output[0].weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - 
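The denoising hyper-parameters above (num_denoising=100, label_noise_ratio=0.4, box_noise_scale=0.4) control DINO-style query denoising: extra queries are built from ground truth whose labels are randomly flipped and whose boxes are jittered, and the decoder is trained to reconstruct the originals. A toy sketch of the two noise operations — illustrative only, not ppdet's get_denoising_training_group:

    import paddle

    labels = paddle.to_tensor([3, 7, 1])
    boxes = paddle.to_tensor([[0.5, 0.5, 0.2, 0.3],
                              [0.3, 0.6, 0.1, 0.1],
                              [0.7, 0.2, 0.4, 0.2]])   # cxcywh

    label_noise_ratio, box_noise_scale, num_classes = 0.4, 0.4, 80
    flip = paddle.rand([3]) < label_noise_ratio * 0.5   # flip a fraction of labels
    noised_labels = paddle.where(flip, paddle.randint(0, num_classes, [3]), labels)
    # Jitter each box by up to half its own width/height, scaled by the noise scale.
    jitter = (paddle.rand(boxes.shape) * 2 - 1) * box_noise_scale
    noised_boxes = (boxes + jitter * boxes[:, 2:].tile([1, 2]) * 0.5).clip(0, 1)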
xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - - @classmethod - def from_config(cls, cfg, input_shape): - return {'in_feats_channel': [i.channels for i in input_shape], } - - def _build_input_proj_layer(self, - in_feats_channel, - weight_attr=None, - bias_attr=None): - self.input_proj = nn.LayerList() - for in_channels in in_feats_channel: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, self.hidden_dim, kernel_size=1)), ( - 'norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = in_feats_channel[-1] - for _ in range(self.num_levels - len(in_feats_channel)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1)), ('norm', nn.GroupNorm( - 32, - self.hidden_dim, - weight_attr=weight_attr, - bias_attr=bias_attr)))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats, pad_mask=None): - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - - # get encoder inputs - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - valid_ratios = [] - for i, feat in enumerate(proj_feats): - bs, _, h, w = paddle.shape(feat) - spatial_shapes.append(paddle.concat([h, w])) - # [b,c,h,w] -> [b,h*w,c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - if pad_mask is not None: - mask = F.interpolate(pad_mask.unsqueeze(0), size=(h, w))[0] - else: - mask = paddle.ones([bs, h, w]) - valid_ratios.append(get_valid_ratio(mask)) - # [b, h*w, c] - pos_embed = self.position_embedding(mask).flatten(1, 2) - lvl_pos_embed = pos_embed + self.level_embed.weight[i] - lvl_pos_embed_flatten.append(lvl_pos_embed) - if pad_mask is not None: - # [b, h*w] - mask_flatten.append(mask.flatten(1)) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - # [b, l] - mask_flatten = None if pad_mask is None else paddle.concat(mask_flatten, - 1) - # [b, l, c] - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - # [num_levels, 2] - spatial_shapes = paddle.to_tensor( - paddle.stack(spatial_shapes).astype('int64')) - # [l] start index of each level - level_start_index = paddle.concat([ - paddle.zeros( - [1], dtype='int64'), spatial_shapes.prod(1).cumsum(0)[:-1] - ]) - # [b, num_levels, 2] - valid_ratios = paddle.stack(valid_ratios, 1) - return (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, valid_ratios) - - def forward(self, feats, pad_mask=None, gt_meta=None): - feat0 = feats.pop(0) - # input projection and embedding - (feat_flatten, spatial_shapes, level_start_index, mask_flatten, - lvl_pos_embed_flatten, - valid_ratios) = self._get_encoder_input(feats, pad_mask) - - # encoder - memory = self.encoder(feat_flatten, spatial_shapes, level_start_index, - mask_flatten, lvl_pos_embed_flatten, valid_ratios) - - mask_feat = self._get_encoder_mask_feature(feat0, memory, - spatial_shapes) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_denoising_training_group(gt_meta, - 
self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points_unact, enc_out, init_out = \ - self._get_decoder_input( - memory, mask_feat, spatial_shapes, mask_flatten, denoising_class, - denoising_bbox_unact) - - # decoder - inter_feats, inter_bboxes = self.decoder( - target, init_ref_points_unact, memory, spatial_shapes, - level_start_index, self.bbox_head, self.query_pos_head, - self.dec_norm, valid_ratios, attn_mask, mask_flatten) - - out_logits = [] - out_bboxes = [] - out_masks = [] - for i in range(self.num_decoder_layers): - if self.training or i == self.num_decoder_layers - 1: - logits_, masks_ = self._get_pred_class_and_mask(inter_feats[i], - mask_feat) - else: - continue - out_logits.append(logits_) - out_masks.append(masks_) - if i == 0: - out_bboxes.append( - F.sigmoid( - self.bbox_head(inter_feats[i]) + init_ref_points_unact)) - else: - out_bboxes.append( - F.sigmoid( - self.bbox_head(inter_feats[i]) + inverse_sigmoid( - inter_bboxes[i - 1]))) - out_bboxes = paddle.stack(out_bboxes) - out_logits = paddle.stack(out_logits) - out_masks = paddle.stack(out_masks) - - return (out_logits, out_bboxes, out_masks, enc_out, init_out, dn_meta) - - def _get_encoder_mask_feature(self, in_feat, memory, spatial_shapes): - memory_feat0 = memory.split( - spatial_shapes.prod(1).split(self.num_levels), axis=1)[0] - h, w = spatial_shapes[0] - memory_feat0 = memory_feat0.reshape( - [0, h, w, self.hidden_dim]).transpose([0, 3, 1, 2]) - out = self.enc_mask_lateral(in_feat) + F.interpolate( - memory_feat0, - scale_factor=2.0, - mode='bilinear', - align_corners=False) - return self.enc_mask_output(out) - - def _get_encoder_output_anchors(self, - memory, - spatial_shapes, - memory_mask=None, - grid_size=0.05): - output_anchors = [] - idx = 0 - for lvl, (h, w) in enumerate(spatial_shapes): - if memory_mask is not None: - mask_ = memory_mask[:, idx:idx + h * w].reshape([-1, h, w]) - valid_H = paddle.sum(mask_[:, :, 0], 1) - valid_W = paddle.sum(mask_[:, 0, :], 1) - else: - valid_H, valid_W = h, w - - grid_y, grid_x = paddle.meshgrid( - paddle.arange(end=h), paddle.arange(end=w)) - grid_xy = paddle.stack([grid_x, grid_y], -1).astype(memory.dtype) - - valid_WH = paddle.stack([valid_W, valid_H], -1).reshape( - [-1, 1, 1, 2]).astype(grid_xy.dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - output_anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - idx += h * w - - output_anchors = paddle.concat(output_anchors, 1) - valid_mask = ((output_anchors > self.eps) * - (output_anchors < 1 - self.eps)).all(-1, keepdim=True) - output_anchors = paddle.log(output_anchors / (1 - output_anchors)) - if memory_mask is not None: - valid_mask = (valid_mask * (memory_mask.unsqueeze(-1) > 0)) > 0 - output_anchors = paddle.where(valid_mask, output_anchors, - paddle.to_tensor(float("inf"))) - - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - output_memory = self.enc_output(memory) - return output_memory, output_anchors - - def _get_decoder_input(self, - memory, - mask_feat, - spatial_shapes, - memory_mask=None, - denoising_class=None, - denoising_bbox_unact=None): - # prepare input for decoder - bs, _, _ = memory.shape - output_memory, output_anchors = self._get_encoder_output_anchors( - memory, spatial_shapes, 
memory_mask) - enc_logits_unact = self.class_head(output_memory) - enc_bboxes_unact = self.bbox_head(output_memory) + output_anchors - - # get topk index - _, topk_ind = paddle.topk( - enc_logits_unact.max(-1), self.num_queries, axis=1) - batch_ind = paddle.arange(end=bs).astype(topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - - # extract content and position query embedding - target = paddle.gather_nd(output_memory, topk_ind) - reference_points_unact = paddle.gather_nd(enc_bboxes_unact, - topk_ind) # unsigmoided. - # get encoder output: {logits, bboxes, masks} - enc_out_logits, enc_out_masks = self._get_pred_class_and_mask(target, - mask_feat) - enc_out_bboxes = F.sigmoid(reference_points_unact) - enc_out = (enc_out_logits, enc_out_bboxes, enc_out_masks) - - # concat denoising query - if self.learnt_init_query: - target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) - else: - target = target.detach() - if denoising_class is not None: - target = paddle.concat([denoising_class, target], 1) - if self.mask_enhanced: - # use mask-enhanced anchor box initialization - reference_points = mask_to_box_coordinate( - enc_out_masks > 0, normalize=True, format="xywh") - reference_points_unact = inverse_sigmoid(reference_points) - if denoising_bbox_unact is not None: - reference_points_unact = paddle.concat( - [denoising_bbox_unact, reference_points_unact], 1) - - # direct prediction from the matching and denoising part in the begining - if self.training and denoising_class is not None: - init_out_logits, init_out_masks = self._get_pred_class_and_mask( - target, mask_feat) - init_out_bboxes = F.sigmoid(reference_points_unact) - init_out = (init_out_logits, init_out_bboxes, init_out_masks) - else: - init_out = None - - return target, reference_points_unact.detach(), enc_out, init_out - - def _get_pred_class_and_mask(self, query_embed, mask_feat): - out_query = self.dec_norm(query_embed) - out_logits = self.class_head(out_query) - mask_query_embed = self.mask_query_head(out_query) - _, _, h, w = paddle.shape(mask_feat) - # [b, q, c] x [b, c, h, w] -> [b, q, h, w] - out_mask = paddle.bmm(mask_query_embed, mask_feat.flatten(2)).reshape( - [0, 0, h, w]) - return out_logits, out_mask diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/matchers.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/matchers.py deleted file mode 100644 index d8f85fc..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/matchers.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from scipy.optimize import linear_sum_assignment - -from ppdet.core.workspace import register, serializable -from ..losses.iou_loss import GIoULoss -from .utils import bbox_cxcywh_to_xyxy - -__all__ = ['HungarianMatcher'] - - -@register -@serializable -class HungarianMatcher(nn.Layer): - __shared__ = ['use_focal_loss', 'with_mask', 'num_sample_points'] - - def __init__(self, - matcher_coeff={ - 'class': 1, - 'bbox': 5, - 'giou': 2, - 'mask': 1, - 'dice': 1 - }, - use_focal_loss=False, - with_mask=False, - num_sample_points=12544, - alpha=0.25, - gamma=2.0): - r""" - Args: - matcher_coeff (dict): The coefficient of hungarian matcher cost. - """ - super(HungarianMatcher, self).__init__() - self.matcher_coeff = matcher_coeff - self.use_focal_loss = use_focal_loss - self.with_mask = with_mask - self.num_sample_points = num_sample_points - self.alpha = alpha - self.gamma = gamma - - self.giou_loss = GIoULoss() - - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None): - r""" - Args: - boxes (Tensor): [b, query, 4] - logits (Tensor): [b, query, num_classes] - gt_bbox (List(Tensor)): list[[n, 4]] - gt_class (List(Tensor)): list[[n, 1]] - masks (Tensor|None): [b, query, h, w] - gt_mask (List(Tensor)): list[[n, H, W]] - - Returns: - A list of size batch_size, containing tuples of (index_i, index_j) where: - - index_i is the indices of the selected predictions (in order) - - index_j is the indices of the corresponding selected targets (in order) - For each batch element, it holds: - len(index_i) = len(index_j) = min(num_queries, num_target_boxes) - """ - bs, num_queries = boxes.shape[:2] - - num_gts = [len(a) for a in gt_class] - if sum(num_gts) == 0: - return [(paddle.to_tensor( - [], dtype=paddle.int64), paddle.to_tensor( - [], dtype=paddle.int64)) for _ in range(bs)] - - # We flatten to compute the cost matrices in a batch - # [batch_size * num_queries, num_classes] - logits = logits.detach() - out_prob = F.sigmoid(logits.flatten( - 0, 1)) if self.use_focal_loss else F.softmax(logits.flatten(0, 1)) - # [batch_size * num_queries, 4] - out_bbox = boxes.detach().flatten(0, 1) - - # Also concat the target labels and boxes - tgt_ids = paddle.concat(gt_class).flatten() - tgt_bbox = paddle.concat(gt_bbox) - - # Compute the classification cost - out_prob = paddle.gather(out_prob, tgt_ids, axis=1) - if self.use_focal_loss: - neg_cost_class = (1 - self.alpha) * (out_prob**self.gamma) * (-( - 1 - out_prob + 1e-8).log()) - pos_cost_class = self.alpha * ( - (1 - out_prob)**self.gamma) * (-(out_prob + 1e-8).log()) - cost_class = pos_cost_class - neg_cost_class - else: - cost_class = -out_prob - - # Compute the L1 cost between boxes - cost_bbox = ( - out_bbox.unsqueeze(1) - tgt_bbox.unsqueeze(0)).abs().sum(-1) - - # Compute the giou cost betwen boxes - giou_loss = self.giou_loss( - bbox_cxcywh_to_xyxy(out_bbox.unsqueeze(1)), - bbox_cxcywh_to_xyxy(tgt_bbox.unsqueeze(0))).squeeze(-1) - cost_giou = giou_loss - 1 - - # Final cost matrix - C = self.matcher_coeff['class'] * cost_class + \ - self.matcher_coeff['bbox'] * cost_bbox + \ - self.matcher_coeff['giou'] * cost_giou - # Compute the mask cost and dice cost - if self.with_mask: - assert (masks is not None and gt_mask is not None, - 'Make sure the input has `mask` and `gt_mask`') - # all masks share the same set of points 
for efficient matching - sample_points = paddle.rand([bs, 1, self.num_sample_points, 2]) - sample_points = 2.0 * sample_points - 1.0 - - out_mask = F.grid_sample( - masks.detach(), sample_points, align_corners=False).squeeze(-2) - out_mask = out_mask.flatten(0, 1) - - tgt_mask = paddle.concat(gt_mask).unsqueeze(1) - sample_points = paddle.concat([ - a.tile([b, 1, 1, 1]) for a, b in zip(sample_points, num_gts) - if b > 0 - ]) - tgt_mask = F.grid_sample( - tgt_mask, sample_points, align_corners=False).squeeze([1, 2]) - - with paddle.amp.auto_cast(enable=False): - # binary cross entropy cost - pos_cost_mask = F.binary_cross_entropy_with_logits( - out_mask, paddle.ones_like(out_mask), reduction='none') - neg_cost_mask = F.binary_cross_entropy_with_logits( - out_mask, paddle.zeros_like(out_mask), reduction='none') - cost_mask = paddle.matmul( - pos_cost_mask, tgt_mask, transpose_y=True) + paddle.matmul( - neg_cost_mask, 1 - tgt_mask, transpose_y=True) - cost_mask /= self.num_sample_points - - # dice cost - out_mask = F.sigmoid(out_mask) - numerator = 2 * paddle.matmul( - out_mask, tgt_mask, transpose_y=True) - denominator = out_mask.sum( - -1, keepdim=True) + tgt_mask.sum(-1).unsqueeze(0) - cost_dice = 1 - (numerator + 1) / (denominator + 1) - - C = C + self.matcher_coeff['mask'] * cost_mask + \ - self.matcher_coeff['dice'] * cost_dice - - C = C.reshape([bs, num_queries, -1]) - C = [a.squeeze(0) for a in C.chunk(bs)] - sizes = [a.shape[0] for a in gt_bbox] - if hasattr(paddle.Tensor, "contiguous"): - indices = [ - linear_sum_assignment(c.split(sizes, -1)[i].contiguous().numpy()) - for i, c in enumerate(C) - ] - else: - indices = [ - linear_sum_assignment(c.split(sizes, -1)[i].numpy()) - for i, c in enumerate(C) - ] - return [(paddle.to_tensor( - i, dtype=paddle.int64), paddle.to_tensor( - j, dtype=paddle.int64)) for i, j in indices] diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/petr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/petr_transformer.py deleted file mode 100644 index 7859b0d..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/petr_transformer.py +++ /dev/null @@ -1,1198 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
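The assignment step at the heart of HungarianMatcher above is scipy's linear_sum_assignment, whose return convention matches the (index_i, index_j) tuples documented in forward(). A minimal sketch, assuming a toy 3x2 cost matrix (the numbers are illustrative only):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    cost = np.array([[0.9, 0.1],
                     [0.4, 0.6],
                     [0.3, 0.7]])  # [num_queries, num_gt]
    row_ind, col_ind = linear_sum_assignment(cost)
    # row_ind -> selected predictions, col_ind -> matched targets;
    # len(row_ind) == min(num_queries, num_gt). Here the minimum-cost
    # pairing is query 0 -> gt 1 and query 2 -> gt 0.
    print(row_ind, col_ind)  # [0 2] [1 0]

The matcher builds this cost matrix from class, L1 box, and GIoU terms (plus mask and dice terms when with_mask is set) before handing it to the solver.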
-""" -this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/utils/transformer.py -""" -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention, _convert_attention_mask -from .utils import _get_clones -from ..initializer import linear_init_, normal_, constant_, xavier_uniform_ - -__all__ = [ - 'PETRTransformer', 'MultiScaleDeformablePoseAttention', - 'PETR_TransformerDecoderLayer', 'PETR_TransformerDecoder', - 'PETR_DeformableDetrTransformerDecoder', - 'PETR_DeformableTransformerDecoder', 'TransformerEncoderLayer', - 'TransformerEncoder', 'MSDeformableAttention' -] - - -def masked_fill(x, mask, value): - y = paddle.full(x.shape, value, x.dtype) - return paddle.where(mask, y, x) - - -def inverse_sigmoid(x, eps=1e-5): - """Inverse function of sigmoid. - - Args: - x (Tensor): The tensor to do the - inverse. - eps (float): EPS avoid numerical - overflow. Defaults 1e-5. - Returns: - Tensor: The x has passed the inverse - function of sigmoid, has same - shape with input. - """ - x = x.clip(min=0, max=1) - x1 = x.clip(min=eps) - x2 = (1 - x).clip(min=eps) - return paddle.log(x1 / x2) - - -@register -class TransformerEncoderLayer(nn.Layer): - __inject__ = ['attn'] - - def __init__(self, - d_model, - attn=None, - nhead=8, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(TransformerEncoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - self.embed_dims = d_model - - if attn is None: - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - else: - self.self_attn = attn - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, src, src_mask=None, pos_embed=None, **kwargs): - residual = src - if self.normalize_before: - src = self.norm1(src) - q = k = self.with_pos_embed(src, pos_embed) - src = self.self_attn(q, k, value=src, attn_mask=src_mask, **kwargs) - - src = residual + self.dropout1(src) - if not self.normalize_before: - src = self.norm1(src) - - residual = src - if self.normalize_before: - src = self.norm2(src) - src = self.linear2(self.dropout(self.activation(self.linear1(src)))) - src = residual + self.dropout2(src) - if not self.normalize_before: - src = self.norm2(src) - return src - - -@register -class TransformerEncoder(nn.Layer): - __inject__ = ['encoder_layer'] - - def __init__(self, encoder_layer, num_layers, norm=None): - super(TransformerEncoder, self).__init__() - self.layers = 
_get_clones(encoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.embed_dims = encoder_layer.embed_dims - - def forward(self, src, src_mask=None, pos_embed=None, **kwargs): - output = src - for layer in self.layers: - output = layer( - output, src_mask=src_mask, pos_embed=pos_embed, **kwargs) - - if self.norm is not None: - output = self.norm(output) - - return output - - -@register -class MSDeformableAttention(nn.Layer): - def __init__(self, - embed_dim=256, - num_heads=8, - num_levels=4, - num_points=4, - lr_mult=0.1): - """ - Multi-Scale Deformable Attention Module - """ - super(MSDeformableAttention, self).__init__() - self.embed_dim = embed_dim - self.num_heads = num_heads - self.num_levels = num_levels - self.num_points = num_points - self.total_points = num_heads * num_levels * num_points - - self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" - - self.sampling_offsets = nn.Linear( - embed_dim, - self.total_points * 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - - self.attention_weights = nn.Linear(embed_dim, self.total_points) - self.value_proj = nn.Linear(embed_dim, embed_dim) - self.output_proj = nn.Linear(embed_dim, embed_dim) - try: - # use cuda op - print("use deformable_detr_ops in ms_deformable_attn") - from deformable_detr_ops import ms_deformable_attn - except: - # use paddle func - from .utils import deformable_attention_core_func as ms_deformable_attn - self.ms_deformable_attn_core = ms_deformable_attn - - self._reset_parameters() - - def _reset_parameters(self): - # sampling_offsets - constant_(self.sampling_offsets.weight) - thetas = paddle.arange( - self.num_heads, - dtype=paddle.float32) * (2.0 * math.pi / self.num_heads) - grid_init = paddle.stack([thetas.cos(), thetas.sin()], -1) - grid_init = grid_init / grid_init.abs().max(-1, keepdim=True) - grid_init = grid_init.reshape([self.num_heads, 1, 1, 2]).tile( - [1, self.num_levels, self.num_points, 1]) - scaling = paddle.arange( - 1, self.num_points + 1, - dtype=paddle.float32).reshape([1, 1, -1, 1]) - grid_init *= scaling - self.sampling_offsets.bias.set_value(grid_init.flatten()) - # attention_weights - constant_(self.attention_weights.weight) - constant_(self.attention_weights.bias) - # proj - xavier_uniform_(self.value_proj.weight) - constant_(self.value_proj.bias) - xavier_uniform_(self.output_proj.weight) - constant_(self.output_proj.bias) - - def forward(self, - query, - key, - value, - reference_points, - value_spatial_shapes, - value_level_start_index, - attn_mask=None, - **kwargs): - """ - Args: - query (Tensor): [bs, query_length, C] - reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area - value (Tensor): [bs, value_length, C] - value_spatial_shapes (Tensor): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - value_level_start_index (Tensor(int64)): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] 
- attn_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, Len_q = query.shape[:2] - Len_v = value.shape[1] - assert int(value_spatial_shapes.prod(1).sum()) == Len_v - - value = self.value_proj(value) - if attn_mask is not None: - attn_mask = attn_mask.astype(value.dtype).unsqueeze(-1) - value *= attn_mask - value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) - - sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = value_spatial_shapes.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) - sampling_locations = reference_points.reshape([ - bs, Len_q, 1, self.num_levels, 1, 2 - ]) + sampling_offsets / offset_normalizer - elif reference_points.shape[-1] == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) - else: - raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". - format(reference_points.shape[-1])) - - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - output = self.output_proj(output) - - return output - - -@register -class MultiScaleDeformablePoseAttention(nn.Layer): - """An attention module used in PETR. `End-to-End Multi-Person - Pose Estimation with Transformers`. - - Args: - embed_dims (int): The embedding dimension of Attention. - Default: 256. - num_heads (int): Parallel attention heads. Default: 8. - num_levels (int): The number of feature map used in - Attention. Default: 4. - num_points (int): The number of sampling points for - each query in each head. Default: 17. - im2col_step (int): The step used in image_to_column. - Default: 64. - dropout (float): A Dropout layer on `inp_residual`. - Default: 0.1. - init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. - Default: None. 
- """ - - def __init__(self, - embed_dims=256, - num_heads=8, - num_levels=4, - num_points=17, - im2col_step=64, - dropout=0.1, - norm_cfg=None, - init_cfg=None, - batch_first=False, - lr_mult=0.1): - super().__init__() - if embed_dims % num_heads != 0: - raise ValueError(f'embed_dims must be divisible by num_heads, ' - f'but got {embed_dims} and {num_heads}') - dim_per_head = embed_dims // num_heads - self.norm_cfg = norm_cfg - self.init_cfg = init_cfg - self.dropout = nn.Dropout(dropout) - self.batch_first = batch_first - - # you'd better set dim_per_head to a power of 2 - # which is more efficient in the CUDA implementation - def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError( - 'invalid input for _is_power_of_2: {} (type: {})'.format( - n, type(n))) - return (n & (n - 1) == 0) and n != 0 - - if not _is_power_of_2(dim_per_head): - warnings.warn("You'd better set embed_dims in " - 'MultiScaleDeformAttention to make ' - 'the dimension of each attention head a power of 2 ' - 'which is more efficient in our CUDA implementation.') - - self.im2col_step = im2col_step - self.embed_dims = embed_dims - self.num_levels = num_levels - self.num_heads = num_heads - self.num_points = num_points - self.sampling_offsets = nn.Linear( - embed_dims, - num_heads * num_levels * num_points * 2, - weight_attr=ParamAttr(learning_rate=lr_mult), - bias_attr=ParamAttr(learning_rate=lr_mult)) - self.attention_weights = nn.Linear(embed_dims, - num_heads * num_levels * num_points) - self.value_proj = nn.Linear(embed_dims, embed_dims) - self.output_proj = nn.Linear(embed_dims, embed_dims) - - try: - # use cuda op - from deformable_detr_ops import ms_deformable_attn - except: - # use paddle func - from .utils import deformable_attention_core_func as ms_deformable_attn - self.ms_deformable_attn_core = ms_deformable_attn - - self.init_weights() - - def init_weights(self): - """Default initialization for Parameters of Module.""" - constant_(self.sampling_offsets.weight) - constant_(self.sampling_offsets.bias) - constant_(self.attention_weights.weight) - constant_(self.attention_weights.bias) - xavier_uniform_(self.value_proj.weight) - constant_(self.value_proj.bias) - xavier_uniform_(self.output_proj.weight) - constant_(self.output_proj.bias) - - def forward(self, - query, - key, - value, - residual=None, - attn_mask=None, - reference_points=None, - value_spatial_shapes=None, - value_level_start_index=None, - **kwargs): - """Forward Function of MultiScaleDeformAttention. - - Args: - query (Tensor): Query of Transformer with shape - (num_query, bs, embed_dims). - key (Tensor): The key tensor with shape (num_key, bs, embed_dims). - value (Tensor): The value tensor with shape - (num_key, bs, embed_dims). - residual (Tensor): The tensor used for addition, with the - same shape as `x`. Default None. If None, `x` will be used. - reference_points (Tensor): The normalized reference points with - shape (bs, num_query, num_levels, K*2), all elements is range - in [0, 1], top-left (0,0), bottom-right (1, 1), including - padding area. - attn_mask (Tensor): ByteTensor for `query`, with - shape [bs, num_key]. - value_spatial_shapes (Tensor): Spatial shape of features in - different level. With shape (num_levels, 2), - last dimension represent (h, w). - value_level_start_index (Tensor): The start index of each level. - A tensor has shape (num_levels) and can be represented - as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. - - Returns: - Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
- """ - - if key is None: - key = query - if value is None: - value = key - - bs, num_query, _ = query.shape - bs, num_key, _ = value.shape - assert (value_spatial_shapes[:, 0].numpy() * - value_spatial_shapes[:, 1].numpy()).sum() == num_key - - value = self.value_proj(value) - if attn_mask is not None: - # value = value.masked_fill(attn_mask[..., None], 0.0) - value *= attn_mask.unsqueeze(-1) - value = value.reshape([bs, num_key, self.num_heads, -1]) - sampling_offsets = self.sampling_offsets(query).reshape([ - bs, num_query, self.num_heads, self.num_levels, self.num_points, 2 - ]) - attention_weights = self.attention_weights(query).reshape( - [bs, num_query, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights, axis=-1) - - attention_weights = attention_weights.reshape( - [bs, num_query, self.num_heads, self.num_levels, self.num_points]) - if reference_points.shape[-1] == self.num_points * 2: - reference_points_reshape = reference_points.reshape( - (bs, num_query, self.num_levels, -1, 2)).unsqueeze(2) - x1 = reference_points[:, :, :, 0::2].min(axis=-1, keepdim=True) - y1 = reference_points[:, :, :, 1::2].min(axis=-1, keepdim=True) - x2 = reference_points[:, :, :, 0::2].max(axis=-1, keepdim=True) - y2 = reference_points[:, :, :, 1::2].max(axis=-1, keepdim=True) - w = paddle.clip(x2 - x1, min=1e-4) - h = paddle.clip(y2 - y1, min=1e-4) - wh = paddle.concat([w, h], axis=-1)[:, :, None, :, None, :] - - sampling_locations = reference_points_reshape \ - + sampling_offsets * wh * 0.5 - else: - raise ValueError( - f'Last dim of reference_points must be' - f' 2K, but get {reference_points.shape[-1]} instead.') - - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - - output = self.output_proj(output) - return output - - -@register -class PETR_TransformerDecoderLayer(nn.Layer): - __inject__ = ['self_attn', 'cross_attn'] - - def __init__(self, - d_model, - nhead=8, - self_attn=None, - cross_attn=None, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False): - super(PETR_TransformerDecoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - if self_attn is None: - self.self_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - else: - self.self_attn = self_attn - if cross_attn is None: - self.cross_attn = MultiHeadAttention(d_model, nhead, attn_dropout) - else: - self.cross_attn = cross_attn - # Implementation of Feedforward model - self.linear1 = nn.Linear(d_model, dim_feedforward) - self.dropout = nn.Dropout(act_dropout, mode="upscale_in_train") - self.linear2 = nn.Linear(dim_feedforward, d_model) - - self.norm1 = nn.LayerNorm(d_model) - self.norm2 = nn.LayerNorm(d_model) - self.norm3 = nn.LayerNorm(d_model) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout3 = nn.Dropout(dropout, mode="upscale_in_train") - self.activation = getattr(F, activation) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - - @staticmethod - def with_pos_embed(tensor, pos_embed): - return tensor if pos_embed is None else tensor + pos_embed - - def forward(self, - tgt, - memory, - tgt_mask=None, - memory_mask=None, - 
pos_embed=None, - query_pos_embed=None, - **kwargs): - tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype) - - residual = tgt - if self.normalize_before: - tgt = self.norm1(tgt) - q = k = self.with_pos_embed(tgt, query_pos_embed) - tgt = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask) - tgt = residual + self.dropout1(tgt) - if not self.normalize_before: - tgt = self.norm1(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm2(tgt) - q = self.with_pos_embed(tgt, query_pos_embed) - key_tmp = tgt - # k = self.with_pos_embed(memory, pos_embed) - tgt = self.cross_attn( - q, key=key_tmp, value=memory, attn_mask=memory_mask, **kwargs) - tgt = residual + self.dropout2(tgt) - if not self.normalize_before: - tgt = self.norm2(tgt) - - residual = tgt - if self.normalize_before: - tgt = self.norm3(tgt) - tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) - tgt = residual + self.dropout3(tgt) - if not self.normalize_before: - tgt = self.norm3(tgt) - return tgt - - -@register -class PETR_TransformerDecoder(nn.Layer): - """Implements the decoder in PETR transformer. - - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - __inject__ = ['decoder_layer'] - - def __init__(self, - decoder_layer, - num_layers, - norm=None, - return_intermediate=False, - num_keypoints=17, - **kwargs): - super(PETR_TransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.norm = norm - self.return_intermediate = return_intermediate - self.num_keypoints = num_keypoints - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - kpt_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - - Args: - query (Tensor): Input query with shape (num_query, bs, embed_dims). - reference_points (Tensor): The reference points of offset, - has shape (bs, num_query, K*2). - valid_ratios (Tensor): The radios of valid points on the feature - map, has shape (bs, num_levels, 2). - kpt_branches: (obj:`nn.LayerList`): Used for refining the - regression results. Only would be passed when `with_box_refine` - is True, otherwise would be passed a `None`. - - Returns: - tuple (Tensor): Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims] and - [num_layers, bs, num_query, K*2]. 
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == self.num_keypoints * 2: - reference_points_input = \ - reference_points[:, :, None] * \ - valid_ratios.tile((1, 1, self.num_keypoints))[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - - if kpt_branches is not None: - tmp = kpt_branches[lid](output) - if reference_points.shape[-1] == self.num_keypoints * 2: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = F.sigmoid(new_reference_points) - else: - raise NotImplementedError - reference_points = new_reference_points.detach() - - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return paddle.stack(intermediate), paddle.stack( - intermediate_reference_points) - - return output, reference_points - - -@register -class PETR_DeformableTransformerDecoder(nn.Layer): - __inject__ = ['decoder_layer'] - - def __init__(self, decoder_layer, num_layers, return_intermediate=False): - super(PETR_DeformableTransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.num_layers = num_layers - self.return_intermediate = return_intermediate - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_mask=None, - query_pos_embed=None): - output = tgt - intermediate = [] - for lid, layer in enumerate(self.layers): - output = layer(output, reference_points, memory, - memory_spatial_shapes, memory_mask, query_pos_embed) - - if self.return_intermediate: - intermediate.append(output) - - if self.return_intermediate: - return paddle.stack(intermediate) - - return output.unsqueeze(0) - - -@register -class PETR_DeformableDetrTransformerDecoder(PETR_DeformableTransformerDecoder): - """Implements the decoder in DETR transformer. - - Args: - return_intermediate (bool): Whether to return intermediate outputs. - coder_norm_cfg (dict): Config of last normalization layer. Default: - `LN`. - """ - - def __init__(self, *args, return_intermediate=False, **kwargs): - - super(PETR_DeformableDetrTransformerDecoder, self).__init__(*args, - **kwargs) - self.return_intermediate = return_intermediate - - def forward(self, - query, - *args, - reference_points=None, - valid_ratios=None, - reg_branches=None, - **kwargs): - """Forward function for `TransformerDecoder`. - - Args: - query (Tensor): Input query with shape - `(num_query, bs, embed_dims)`. - reference_points (Tensor): The reference - points of offset. has shape - (bs, num_query, 4) when as_two_stage, - otherwise has shape ((bs, num_query, 2). - valid_ratios (Tensor): The radios of valid - points on the feature map, has shape - (bs, num_levels, 2) - reg_branch: (obj:`nn.LayerList`): Used for - refining the regression results. Only would - be passed when with_box_refine is True, - otherwise would be passed a `None`. - - Returns: - Tensor: Results with shape [1, num_query, bs, embed_dims] when - return_intermediate is `False`, otherwise it has shape - [num_layers, num_query, bs, embed_dims]. 
- """ - output = query - intermediate = [] - intermediate_reference_points = [] - for lid, layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: - reference_points_input = reference_points[:, :, None] * \ - paddle.concat([valid_ratios, valid_ratios], -1)[:, None] - else: - assert reference_points.shape[-1] == 2 - reference_points_input = reference_points[:, :, None] * \ - valid_ratios[:, None] - output = layer( - output, - *args, - reference_points=reference_points_input, - **kwargs) - - if reg_branches is not None: - tmp = reg_branches[lid](output) - if reference_points.shape[-1] == 4: - new_reference_points = tmp + inverse_sigmoid( - reference_points) - new_reference_points = F.sigmoid(new_reference_points) - else: - assert reference_points.shape[-1] == 2 - new_reference_points = tmp - new_reference_points[..., :2] = tmp[ - ..., :2] + inverse_sigmoid(reference_points) - new_reference_points = F.sigmoid(new_reference_points) - reference_points = new_reference_points.detach() - - if self.return_intermediate: - intermediate.append(output) - intermediate_reference_points.append(reference_points) - - if self.return_intermediate: - return paddle.stack(intermediate), paddle.stack( - intermediate_reference_points) - - return output, reference_points - - -@register -class PETRTransformer(nn.Layer): - """Implements the PETR transformer. - - Args: - as_two_stage (bool): Generate query from encoder features. - Default: False. - num_feature_levels (int): Number of feature maps from FPN: - Default: 4. - two_stage_num_proposals (int): Number of proposals when set - `as_two_stage` as True. Default: 300. - """ - __inject__ = ["encoder", "decoder", "hm_encoder", "refine_decoder"] - - def __init__(self, - encoder="", - decoder="", - hm_encoder="", - refine_decoder="", - as_two_stage=True, - num_feature_levels=4, - two_stage_num_proposals=300, - num_keypoints=17, - **kwargs): - super(PETRTransformer, self).__init__(**kwargs) - self.as_two_stage = as_two_stage - self.num_feature_levels = num_feature_levels - self.two_stage_num_proposals = two_stage_num_proposals - self.num_keypoints = num_keypoints - self.encoder = encoder - self.decoder = decoder - self.embed_dims = self.encoder.embed_dims - self.hm_encoder = hm_encoder - self.refine_decoder = refine_decoder - self.init_layers() - self.init_weights() - - def init_layers(self): - """Initialize layers of the DeformableDetrTransformer.""" - #paddle.create_parameter - self.level_embeds = paddle.create_parameter( - (self.num_feature_levels, self.embed_dims), dtype="float32") - - if self.as_two_stage: - self.enc_output = nn.Linear(self.embed_dims, self.embed_dims) - self.enc_output_norm = nn.LayerNorm(self.embed_dims) - self.refine_query_embedding = nn.Embedding(self.num_keypoints, - self.embed_dims * 2) - else: - self.reference_points = nn.Linear(self.embed_dims, - 2 * self.num_keypoints) - - def init_weights(self): - """Initialize the transformer weights.""" - for p in self.parameters(): - if p.rank() > 1: - xavier_uniform_(p) - if hasattr(p, 'bias') and p.bias is not None: - constant_(p.bais) - for m in self.sublayers(): - if isinstance(m, MSDeformableAttention): - m._reset_parameters() - for m in self.sublayers(): - if isinstance(m, MultiScaleDeformablePoseAttention): - m.init_weights() - if not self.as_two_stage: - xavier_uniform_(self.reference_points.weight) - constant_(self.reference_points.bias) - normal_(self.level_embeds) - normal_(self.refine_query_embedding.weight) - - def gen_encoder_output_proposals(self, memory, 
-        """Generate proposals from encoded memory.
-
-        Args:
-            memory (Tensor): The output of encoder, has shape
-                (bs, num_key, embed_dim). num_key is equal to the number of
-                points on the feature maps from all levels.
-            memory_padding_mask (Tensor): Padding mask for memory,
-                has shape (bs, num_key).
-            spatial_shapes (Tensor): The shape of all feature maps,
-                has shape (num_level, 2).
-
-        Returns:
-            tuple: A tuple of feature map and bbox prediction.
-
-                - output_memory (Tensor): The input of decoder, has shape
-                  (bs, num_key, embed_dim). num_key is equal to the number of
-                  points on the feature maps from all levels.
-                - output_proposals (Tensor): The normalized proposal
-                  after an inverse sigmoid, has shape (bs, num_keys, 4).
-        """
-
-        N, S, C = memory.shape
-        proposals = []
-        _cur = 0
-        for lvl, (H, W) in enumerate(spatial_shapes):
-            mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].reshape(
-                [N, H, W, 1])
-            valid_H = paddle.sum(mask_flatten_[:, :, 0, 0], 1)
-            valid_W = paddle.sum(mask_flatten_[:, 0, :, 0], 1)
-
-            grid_y, grid_x = paddle.meshgrid(
-                paddle.linspace(
-                    0, H - 1, H, dtype="float32"),
-                paddle.linspace(
-                    0, W - 1, W, dtype="float32"))
-            grid = paddle.concat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)],
-                                 -1)
-
-            scale = paddle.concat(
-                [valid_W.unsqueeze(-1),
-                 valid_H.unsqueeze(-1)], 1).reshape([N, 1, 1, 2])
-            grid = (grid.unsqueeze(0).expand((N, -1, -1, -1)) + 0.5) / scale
-            proposal = grid.reshape([N, -1, 2])
-            proposals.append(proposal)
-            _cur += (H * W)
-        output_proposals = paddle.concat(proposals, 1)
-        output_proposals_valid = ((output_proposals > 0.01) &
-                                  (output_proposals < 0.99)).all(
-                                      -1, keepdim=True).astype("bool")
-        output_proposals = paddle.log(output_proposals / (1 - output_proposals))
-        output_proposals = masked_fill(
-            output_proposals, ~memory_padding_mask.astype("bool").unsqueeze(-1),
-            float('inf'))
-        output_proposals = masked_fill(output_proposals,
-                                       ~output_proposals_valid, float('inf'))
-
-        output_memory = memory
-        output_memory = masked_fill(
-            output_memory, ~memory_padding_mask.astype("bool").unsqueeze(-1),
-            float(0))
-        output_memory = masked_fill(output_memory, ~output_proposals_valid,
-                                    float(0))
-        output_memory = self.enc_output_norm(self.enc_output(output_memory))
-        return output_memory, output_proposals
-
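get_reference_points below seeds one reference at every cell center of every level, normalized by the valid portion of the padded feature map. A small numpy sketch for one assumed 2x3 level with a valid ratio of 1:

    import numpy as np

    H, W = 2, 3
    ref_y, ref_x = np.meshgrid(
        np.linspace(0.5, H - 0.5, H),
        np.linspace(0.5, W - 0.5, W), indexing='ij')
    ref = np.stack([ref_x.ravel() / W, ref_y.ravel() / H], -1)
    # x centers: [1/6, 1/2, 5/6]; y centers: [1/4, 3/4] -- all inside (0, 1)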
- """ - reference_points_list = [] - for lvl, (H, W) in enumerate(spatial_shapes): - ref_y, ref_x = paddle.meshgrid( - paddle.linspace( - 0.5, H - 0.5, H, dtype="float32"), - paddle.linspace( - 0.5, W - 0.5, W, dtype="float32")) - ref_y = ref_y.reshape( - (-1, ))[None] / (valid_ratios[:, None, lvl, 1] * H) - ref_x = ref_x.reshape( - (-1, ))[None] / (valid_ratios[:, None, lvl, 0] * W) - ref = paddle.stack((ref_x, ref_y), -1) - reference_points_list.append(ref) - reference_points = paddle.concat(reference_points_list, 1) - reference_points = reference_points[:, :, None] * valid_ratios[:, None] - return reference_points - - def get_valid_ratio(self, mask): - """Get the valid radios of feature maps of all level.""" - _, H, W = mask.shape - valid_H = paddle.sum(mask[:, :, 0].astype('float'), 1) - valid_W = paddle.sum(mask[:, 0, :].astype('float'), 1) - valid_ratio_h = valid_H.astype('float') / H - valid_ratio_w = valid_W.astype('float') / W - valid_ratio = paddle.stack([valid_ratio_w, valid_ratio_h], -1) - return valid_ratio - - def get_proposal_pos_embed(self, - proposals, - num_pos_feats=128, - temperature=10000): - """Get the position embedding of proposal.""" - scale = 2 * math.pi - dim_t = paddle.arange(num_pos_feats, dtype="float32") - dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats) - # N, L, 4 - proposals = F.sigmoid(proposals) * scale - # N, L, 4, 128 - pos = proposals[:, :, :, None] / dim_t - # N, L, 4, 64, 2 - pos = paddle.stack( - (pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), - axis=4).flatten(2) - return pos - - def forward(self, - mlvl_feats, - mlvl_masks, - query_embed, - mlvl_pos_embeds, - kpt_branches=None, - cls_branches=None): - """Forward function for `Transformer`. - - Args: - mlvl_feats (list(Tensor)): Input queries from different level. - Each element has shape [bs, embed_dims, h, w]. - mlvl_masks (list(Tensor)): The key_padding_mask from different - level used for encoder and decoder, each element has shape - [bs, h, w]. - query_embed (Tensor): The query embedding for decoder, - with shape [num_query, c]. - mlvl_pos_embeds (list(Tensor)): The positional encoding - of feats from different level, has the shape - [bs, embed_dims, h, w]. - kpt_branches (obj:`nn.LayerList`): Keypoint Regression heads for - feature maps from each decoder layer. Only would be passed when - `with_box_refine` is Ture. Default to None. - cls_branches (obj:`nn.LayerList`): Classification heads for - feature maps from each decoder layer. Only would be passed when - `as_two_stage` is Ture. Default to None. - - Returns: - tuple[Tensor]: results of decoder containing the following tensor. - - - inter_states: Outputs from decoder. If - `return_intermediate_dec` is True output has shape \ - (num_dec_layers, bs, num_query, embed_dims), else has \ - shape (1, bs, num_query, embed_dims). - - init_reference_out: The initial value of reference \ - points, has shape (bs, num_queries, 4). - - inter_references_out: The internal value of reference \ - points in decoder, has shape \ - (num_dec_layers, bs,num_query, embed_dims) - - enc_outputs_class: The classification score of proposals \ - generated from encoder's feature maps, has shape \ - (batch, h*w, num_classes). \ - Only would be returned when `as_two_stage` is True, \ - otherwise None. - - enc_outputs_kpt_unact: The regression results generated from \ - encoder's feature maps., has shape (batch, h*w, K*2). - Only would be returned when `as_two_stage` is True, \ - otherwise None. 
- """ - assert self.as_two_stage or query_embed is not None - - feat_flatten = [] - mask_flatten = [] - lvl_pos_embed_flatten = [] - spatial_shapes = [] - for lvl, (feat, mask, pos_embed - ) in enumerate(zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): - bs, c, h, w = feat.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - feat = feat.flatten(2).transpose((0, 2, 1)) - mask = mask.flatten(1) - pos_embed = pos_embed.flatten(2).transpose((0, 2, 1)) - lvl_pos_embed = pos_embed + self.level_embeds[lvl].reshape( - [1, 1, -1]) - lvl_pos_embed_flatten.append(lvl_pos_embed) - feat_flatten.append(feat) - mask_flatten.append(mask) - feat_flatten = paddle.concat(feat_flatten, 1) - mask_flatten = paddle.concat(mask_flatten, 1) - lvl_pos_embed_flatten = paddle.concat(lvl_pos_embed_flatten, 1) - spatial_shapes_cumsum = paddle.to_tensor( - np.array(spatial_shapes).prod(1).cumsum(0)) - spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") - level_start_index = paddle.concat((paddle.zeros( - (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) - valid_ratios = paddle.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - reference_points = \ - self.get_reference_points(spatial_shapes, - valid_ratios) - - memory = self.encoder( - src=feat_flatten, - pos_embed=lvl_pos_embed_flatten, - src_mask=mask_flatten, - value_spatial_shapes=spatial_shapes, - reference_points=reference_points, - value_level_start_index=level_start_index, - valid_ratios=valid_ratios) - - bs, _, c = memory.shape - - hm_proto = None - if self.training: - hm_memory = paddle.slice( - memory, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1]) - hm_pos_embed = paddle.slice( - lvl_pos_embed_flatten, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1]) - hm_mask = paddle.slice( - mask_flatten, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1]) - hm_reference_points = paddle.slice( - reference_points, - starts=level_start_index[0], - ends=level_start_index[1], - axes=[1])[:, :, :1, :] - - # official code make a mistake of pos_embed to pose_embed, which disable pos_embed - hm_memory = self.hm_encoder( - src=hm_memory, - pose_embed=hm_pos_embed, - src_mask=hm_mask, - value_spatial_shapes=spatial_shapes[[0]], - reference_points=hm_reference_points, - value_level_start_index=level_start_index[0], - valid_ratios=valid_ratios[:, :1, :]) - hm_memory = hm_memory.reshape((bs, spatial_shapes[0, 0], - spatial_shapes[0, 1], -1)) - hm_proto = (hm_memory, mlvl_masks[0]) - - if self.as_two_stage: - output_memory, output_proposals = \ - self.gen_encoder_output_proposals( - memory, mask_flatten, spatial_shapes) - enc_outputs_class = cls_branches[self.decoder.num_layers]( - output_memory) - enc_outputs_kpt_unact = \ - kpt_branches[self.decoder.num_layers](output_memory) - enc_outputs_kpt_unact[..., 0::2] += output_proposals[..., 0:1] - enc_outputs_kpt_unact[..., 1::2] += output_proposals[..., 1:2] - - topk = self.two_stage_num_proposals - topk_proposals = paddle.topk( - enc_outputs_class[..., 0], topk, axis=1)[1].unsqueeze(-1) - - #paddle.take_along_axis 对应torch.gather - topk_kpts_unact = paddle.take_along_axis(enc_outputs_kpt_unact, - topk_proposals, 1) - topk_kpts_unact = topk_kpts_unact.detach() - - reference_points = F.sigmoid(topk_kpts_unact) - init_reference_out = reference_points - # learnable query and query_pos - query_pos, query = paddle.split( - query_embed, query_embed.shape[1] // c, axis=1) - query_pos = query_pos.unsqueeze(0).expand((bs, -1, 
-1)) - query = query.unsqueeze(0).expand((bs, -1, -1)) - else: - query_pos, query = paddle.split( - query_embed, query_embed.shape[1] // c, axis=1) - query_pos = query_pos.unsqueeze(0).expand((bs, -1, -1)) - query = query.unsqueeze(0).expand((bs, -1, -1)) - reference_points = F.sigmoid(self.reference_points(query_pos)) - init_reference_out = reference_points - - # decoder - inter_states, inter_references = self.decoder( - query=query, - memory=memory, - query_pos_embed=query_pos, - memory_mask=mask_flatten, - reference_points=reference_points, - value_spatial_shapes=spatial_shapes, - value_level_start_index=level_start_index, - valid_ratios=valid_ratios, - kpt_branches=kpt_branches) - - inter_references_out = inter_references - if self.as_two_stage: - return inter_states, init_reference_out, \ - inter_references_out, enc_outputs_class, \ - enc_outputs_kpt_unact, hm_proto, memory - return inter_states, init_reference_out, \ - inter_references_out, None, None, None, None, None, hm_proto - - def forward_refine(self, - mlvl_masks, - memory, - reference_points_pose, - img_inds, - kpt_branches=None, - **kwargs): - mask_flatten = [] - spatial_shapes = [] - for lvl, mask in enumerate(mlvl_masks): - bs, h, w = mask.shape - spatial_shape = (h, w) - spatial_shapes.append(spatial_shape) - mask = mask.flatten(1) - mask_flatten.append(mask) - mask_flatten = paddle.concat(mask_flatten, 1) - spatial_shapes_cumsum = paddle.to_tensor( - np.array( - spatial_shapes, dtype='int64').prod(1).cumsum(0)) - spatial_shapes = paddle.to_tensor(spatial_shapes, dtype="int64") - level_start_index = paddle.concat((paddle.zeros( - (1, ), dtype=spatial_shapes.dtype), spatial_shapes_cumsum[:-1])) - valid_ratios = paddle.stack( - [self.get_valid_ratio(m) for m in mlvl_masks], 1) - - # pose refinement (17 queries corresponding to 17 keypoints) - # learnable query and query_pos - refine_query_embedding = self.refine_query_embedding.weight - query_pos, query = paddle.split(refine_query_embedding, 2, axis=1) - pos_num = reference_points_pose.shape[0] - query_pos = query_pos.unsqueeze(0).expand((pos_num, -1, -1)) - query = query.unsqueeze(0).expand((pos_num, -1, -1)) - reference_points = reference_points_pose.reshape( - (pos_num, reference_points_pose.shape[1] // 2, 2)) - pos_memory = memory[img_inds] - mask_flatten = mask_flatten[img_inds] - valid_ratios = valid_ratios[img_inds] - if img_inds.size == 1: - pos_memory = pos_memory.unsqueeze(0) - mask_flatten = mask_flatten.unsqueeze(0) - valid_ratios = valid_ratios.unsqueeze(0) - inter_states, inter_references = self.refine_decoder( - query=query, - memory=pos_memory, - query_pos_embed=query_pos, - memory_mask=mask_flatten, - reference_points=reference_points, - value_spatial_shapes=spatial_shapes, - value_level_start_index=level_start_index, - valid_ratios=valid_ratios, - reg_branches=kpt_branches, - **kwargs) - # [num_decoder, num_query, bs, embed_dim] - - init_reference_out = reference_points - return inter_states, init_reference_out, inter_references diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/position_encoding.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/position_encoding.py deleted file mode 100644 index a2c3260..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/position_encoding.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register, serializable - - -@register -@serializable -class PositionEmbedding(nn.Layer): - def __init__(self, - num_pos_feats=128, - temperature=10000, - normalize=True, - scale=2 * math.pi, - embed_type='sine', - num_embeddings=50, - offset=0., - eps=1e-6): - super(PositionEmbedding, self).__init__() - assert embed_type in ['sine', 'learned'] - - self.embed_type = embed_type - self.offset = offset - self.eps = eps - if self.embed_type == 'sine': - self.num_pos_feats = num_pos_feats - self.temperature = temperature - self.normalize = normalize - self.scale = scale - elif self.embed_type == 'learned': - self.row_embed = nn.Embedding(num_embeddings, num_pos_feats) - self.col_embed = nn.Embedding(num_embeddings, num_pos_feats) - else: - raise ValueError(f"{self.embed_type} is not supported.") - - def forward(self, mask): - """ - Args: - mask (Tensor): [B, H, W] - Returns: - pos (Tensor): [B, H, W, C] - """ - if self.embed_type == 'sine': - y_embed = mask.cumsum(1) - x_embed = mask.cumsum(2) - if self.normalize: - y_embed = (y_embed + self.offset) / ( - y_embed[:, -1:, :] + self.eps) * self.scale - x_embed = (x_embed + self.offset) / ( - x_embed[:, :, -1:] + self.eps) * self.scale - - dim_t = 2 * (paddle.arange(self.num_pos_feats) // - 2).astype('float32') - dim_t = self.temperature**(dim_t / self.num_pos_feats) - - pos_x = x_embed.unsqueeze(-1) / dim_t - pos_y = y_embed.unsqueeze(-1) / dim_t - pos_x = paddle.stack( - (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), - axis=4).flatten(3) - pos_y = paddle.stack( - (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), - axis=4).flatten(3) - return paddle.concat((pos_y, pos_x), axis=3) - elif self.embed_type == 'learned': - h, w = mask.shape[-2:] - i = paddle.arange(w) - j = paddle.arange(h) - x_emb = self.col_embed(i) - y_emb = self.row_embed(j) - return paddle.concat( - [ - x_emb.unsqueeze(0).tile([h, 1, 1]), - y_emb.unsqueeze(1).tile([1, w, 1]), - ], - axis=-1).unsqueeze(0) - else: - raise ValueError(f"not supported {self.embed_type}") diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/rtdetr_transformer.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/rtdetr_transformer.py deleted file mode 100644 index f3d021f..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/rtdetr_transformer.py +++ /dev/null @@ -1,557 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from Deformable-DETR (https://github.com/fundamentalvision/Deformable-DETR) -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr -from paddle.regularizer import L2Decay - -from ppdet.core.workspace import register -from ..layers import MultiHeadAttention -from ..heads.detr_head import MLP -from .deformable_transformer import MSDeformableAttention -from ..initializer import (linear_init_, constant_, xavier_uniform_, normal_, - bias_init_with_prob) -from .utils import (_get_clones, get_sine_pos_embed, - get_contrastive_denoising_training_group, inverse_sigmoid) - -__all__ = ['RTDETRTransformer'] - - -class PPMSDeformableAttention(MSDeformableAttention): - def forward(self, - query, - reference_points, - value, - value_spatial_shapes, - value_level_start_index, - value_mask=None): - """ - Args: - query (Tensor): [bs, query_length, C] - reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), - bottom-right (1, 1), including padding area - value (Tensor): [bs, value_length, C] - value_spatial_shapes (List): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - value_level_start_index (List): [n_levels], [0, H_0*W_0, H_0*W_0+H_1*W_1, ...] - value_mask (Tensor): [bs, value_length], True for non-padding elements, False for padding elements - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, Len_q = query.shape[:2] - Len_v = value.shape[1] - - value = self.value_proj(value) - if value_mask is not None: - value_mask = value_mask.astype(value.dtype).unsqueeze(-1) - value *= value_mask - value = value.reshape([bs, Len_v, self.num_heads, self.head_dim]) - - sampling_offsets = self.sampling_offsets(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2]) - attention_weights = self.attention_weights(query).reshape( - [bs, Len_q, self.num_heads, self.num_levels * self.num_points]) - attention_weights = F.softmax(attention_weights).reshape( - [bs, Len_q, self.num_heads, self.num_levels, self.num_points]) - - if reference_points.shape[-1] == 2: - offset_normalizer = paddle.to_tensor(value_spatial_shapes) - offset_normalizer = offset_normalizer.flip([1]).reshape( - [1, 1, 1, self.num_levels, 1, 2]) - sampling_locations = reference_points.reshape([ - bs, Len_q, 1, self.num_levels, 1, 2 - ]) + sampling_offsets / offset_normalizer - elif reference_points.shape[-1] == 4: - sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * - 0.5) - else: - raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) - - if not isinstance(query, paddle.Tensor): - from ppdet.modeling.transformers.utils import deformable_attention_core_func - output = deformable_attention_core_func( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - else: - value_spatial_shapes = paddle.to_tensor(value_spatial_shapes) - value_level_start_index = paddle.to_tensor(value_level_start_index) - output = self.ms_deformable_attn_core( - value, value_spatial_shapes, value_level_start_index, - sampling_locations, attention_weights) - output = self.output_proj(output) - - return output - - -class TransformerDecoderLayer(nn.Layer): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4, - weight_attr=None, - bias_attr=None): - super(TransformerDecoderLayer, self).__init__() - - # self attention - self.self_attn = MultiHeadAttention(d_model, n_head, dropout=dropout) - self.dropout1 = nn.Dropout(dropout) - self.norm1 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # cross attention - self.cross_attn = PPMSDeformableAttention(d_model, n_head, n_levels, - n_points, 1.0) - self.dropout2 = nn.Dropout(dropout) - self.norm2 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - - # ffn - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attr, - bias_attr) - self.activation = getattr(F, activation) - self.dropout3 = nn.Dropout(dropout) - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attr, - bias_attr) - self.dropout4 = nn.Dropout(dropout) - self.norm3 = nn.LayerNorm( - d_model, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0))) - self._reset_parameters() - - def _reset_parameters(self): - linear_init_(self.linear1) - linear_init_(self.linear2) - xavier_uniform_(self.linear1.weight) - xavier_uniform_(self.linear2.weight) - - def with_pos_embed(self, tensor, pos): - return tensor if pos is None else tensor + pos - - def forward_ffn(self, tgt): - return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): - # self attention - q = k = self.with_pos_embed(tgt, query_pos_embed) - if attn_mask is not None: - attn_mask = paddle.where( - attn_mask.astype('bool'), - paddle.zeros(attn_mask.shape, tgt.dtype), - paddle.full(attn_mask.shape, float("-inf"), tgt.dtype)) - tgt2 = self.self_attn(q, k, value=tgt, attn_mask=attn_mask) - tgt = tgt + self.dropout1(tgt2) - tgt = self.norm1(tgt) - - # cross attention - tgt2 = self.cross_attn( - self.with_pos_embed(tgt, query_pos_embed), reference_points, memory, - memory_spatial_shapes, memory_level_start_index, memory_mask) - tgt = tgt + self.dropout2(tgt2) - tgt = self.norm2(tgt) - - # ffn - tgt2 = self.forward_ffn(tgt) - tgt = tgt + self.dropout4(tgt2) - tgt = self.norm3(tgt) - - return tgt - - -class TransformerDecoder(nn.Layer): - def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): - super(TransformerDecoder, self).__init__() - self.layers = _get_clones(decoder_layer, num_layers) - self.hidden_dim = hidden_dim - self.num_layers = num_layers - self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx - - def 
forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - score_head, - query_pos_head, - attn_mask=None, - memory_mask=None, - query_pos_head_inv_sig=False): - output = tgt - dec_out_bboxes = [] - dec_out_logits = [] - ref_points_detach = F.sigmoid(ref_points_unact) - for i, layer in enumerate(self.layers): - ref_points_input = ref_points_detach.unsqueeze(2) - if not query_pos_head_inv_sig: - query_pos_embed = query_pos_head(ref_points_detach) - else: - query_pos_embed = query_pos_head( - inverse_sigmoid(ref_points_detach)) - - output = layer(output, ref_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) - - inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - ref_points_detach)) - - if self.training: - dec_out_logits.append(score_head[i](output)) - if i == 0: - dec_out_bboxes.append(inter_ref_bbox) - else: - dec_out_bboxes.append( - F.sigmoid(bbox_head[i](output) + inverse_sigmoid( - ref_points))) - elif i == self.eval_idx: - dec_out_logits.append(score_head[i](output)) - dec_out_bboxes.append(inter_ref_bbox) - break - - ref_points = inter_ref_bbox - ref_points_detach = inter_ref_bbox.detach( - ) if self.training else inter_ref_bbox - - return paddle.stack(dec_out_bboxes), paddle.stack(dec_out_logits) - - -@register -class RTDETRTransformer(nn.Layer): - __shared__ = ['num_classes', 'hidden_dim', 'eval_size'] - - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=300, - position_embed_type='sine', - backbone_feat_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - num_levels=3, - num_decoder_points=4, - nhead=8, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=True, - query_pos_head_inv_sig=False, - eval_size=None, - eval_idx=-1, - eps=1e-2): - super(RTDETRTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' 
- assert len(backbone_feat_channels) <= num_levels - assert len(feat_strides) == len(backbone_feat_channels) - for _ in range(num_levels - len(feat_strides)): - feat_strides.append(feat_strides[-1] * 2) - - self.hidden_dim = hidden_dim - self.nhead = nhead - self.feat_strides = feat_strides - self.num_levels = num_levels - self.num_classes = num_classes - self.num_queries = num_queries - self.eps = eps - self.num_decoder_layers = num_decoder_layers - self.eval_size = eval_size - - # backbone feature projection - self._build_input_proj_layer(backbone_feat_channels) - - # Transformer module - decoder_layer = TransformerDecoderLayer( - hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, - num_decoder_points) - self.decoder = TransformerDecoder(hidden_dim, decoder_layer, - num_decoder_layers, eval_idx) - - # denoising part - self.denoising_class_embed = nn.Embedding( - num_classes, - hidden_dim, - weight_attr=ParamAttr(initializer=nn.initializer.Normal())) - self.num_denoising = num_denoising - self.label_noise_ratio = label_noise_ratio - self.box_noise_scale = box_noise_scale - - # decoder embedding - self.learnt_init_query = learnt_init_query - if learnt_init_query: - self.tgt_embed = nn.Embedding(num_queries, hidden_dim) - self.query_pos_head = MLP(4, 2 * hidden_dim, hidden_dim, num_layers=2) - self.query_pos_head_inv_sig = query_pos_head_inv_sig - - # encoder head - self.enc_output = nn.Sequential( - nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm( - hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))) - self.enc_score_head = nn.Linear(hidden_dim, num_classes) - self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) - - # decoder head - self.dec_score_head = nn.LayerList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.LayerList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) - - self._reset_parameters() - - def _reset_parameters(self): - # class and bbox head init - bias_cls = bias_init_with_prob(0.01) - linear_init_(self.enc_score_head) - constant_(self.enc_score_head.bias, bias_cls) - constant_(self.enc_bbox_head.layers[-1].weight) - constant_(self.enc_bbox_head.layers[-1].bias) - for cls_, reg_ in zip(self.dec_score_head, self.dec_bbox_head): - linear_init_(cls_) - constant_(cls_.bias, bias_cls) - constant_(reg_.layers[-1].weight) - constant_(reg_.layers[-1].bias) - - linear_init_(self.enc_output[0]) - xavier_uniform_(self.enc_output[0].weight) - if self.learnt_init_query: - xavier_uniform_(self.tgt_embed.weight) - xavier_uniform_(self.query_pos_head.layers[0].weight) - xavier_uniform_(self.query_pos_head.layers[1].weight) - for l in self.input_proj: - xavier_uniform_(l[0].weight) - - # init encoder output anchors and valid_mask - if self.eval_size: - self.anchors, self.valid_mask = self._generate_anchors() - - @classmethod - def from_config(cls, cfg, input_shape): - return {'backbone_feat_channels': [i.channels for i in input_shape]} - - def _build_input_proj_layer(self, backbone_feat_channels): - self.input_proj = nn.LayerList() - for in_channels in backbone_feat_channels: - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=1, - bias_attr=False)), ('norm', nn.BatchNorm2D( - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = backbone_feat_channels[-1] - 
for _ in range(self.num_levels - len(backbone_feat_channels)): - self.input_proj.append( - nn.Sequential( - ('conv', nn.Conv2D( - in_channels, - self.hidden_dim, - kernel_size=3, - stride=2, - padding=1, - bias_attr=False)), ('norm', nn.BatchNorm2D( - self.hidden_dim, - weight_attr=ParamAttr(regularizer=L2Decay(0.0)), - bias_attr=ParamAttr(regularizer=L2Decay(0.0)))))) - in_channels = self.hidden_dim - - def _get_encoder_input(self, feats): - # get projection features - proj_feats = [self.input_proj[i](feat) for i, feat in enumerate(feats)] - if self.num_levels > len(proj_feats): - len_srcs = len(proj_feats) - for i in range(len_srcs, self.num_levels): - if i == len_srcs: - proj_feats.append(self.input_proj[i](feats[-1])) - else: - proj_feats.append(self.input_proj[i](proj_feats[-1])) - - # get encoder inputs - feat_flatten = [] - spatial_shapes = [] - level_start_index = [0, ] - for i, feat in enumerate(proj_feats): - _, _, h, w = feat.shape - # [b, c, h, w] -> [b, h*w, c] - feat_flatten.append(feat.flatten(2).transpose([0, 2, 1])) - # [num_levels, 2] - spatial_shapes.append([h, w]) - # [l], start index of each level - level_start_index.append(h * w + level_start_index[-1]) - - # [b, l, c] - feat_flatten = paddle.concat(feat_flatten, 1) - level_start_index.pop() - return (feat_flatten, spatial_shapes, level_start_index) - - def forward(self, feats, pad_mask=None, gt_meta=None, is_teacher=False): - # input projection and embedding - (memory, spatial_shapes, - level_start_index) = self._get_encoder_input(feats) - - # prepare denoising training - if self.training: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(gt_meta, - self.num_classes, - self.num_queries, - self.denoising_class_embed.weight, - self.num_denoising, - self.label_noise_ratio, - self.box_noise_scale) - else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input( - memory, spatial_shapes, denoising_class, denoising_bbox_unact,is_teacher) - - # decoder - out_bboxes, out_logits = self.decoder( - target, - init_ref_points_unact, - memory, - spatial_shapes, - level_start_index, - self.dec_bbox_head, - self.dec_score_head, - self.query_pos_head, - attn_mask=attn_mask, - memory_mask=None, - query_pos_head_inv_sig=self.query_pos_head_inv_sig) - return (out_bboxes, out_logits, enc_topk_bboxes, enc_topk_logits, - dn_meta) - - def _generate_anchors(self, - spatial_shapes=None, - grid_size=0.05, - dtype="float32"): - if spatial_shapes is None: - spatial_shapes = [ - [int(self.eval_size[0] / s), int(self.eval_size[1] / s)] - for s in self.feat_strides - ] - anchors = [] - for lvl, (h, w) in enumerate(spatial_shapes): - grid_y, grid_x = paddle.meshgrid( - paddle.arange( - end=h, dtype=dtype), - paddle.arange( - end=w, dtype=dtype)) - grid_xy = paddle.stack([grid_x, grid_y], -1) - - valid_WH = paddle.to_tensor([h, w]).astype(dtype) - grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = paddle.ones_like(grid_xy) * grid_size * (2.0**lvl) - anchors.append( - paddle.concat([grid_xy, wh], -1).reshape([-1, h * w, 4])) - - anchors = paddle.concat(anchors, 1) - valid_mask = ((anchors > self.eps) * - (anchors < 1 - self.eps)).all(-1, keepdim=True) - anchors = paddle.log(anchors / (1 - anchors)) - anchors = paddle.where(valid_mask, anchors, - paddle.to_tensor(float("inf"))) - return anchors, valid_mask - - def _get_decoder_input(self, - memory, - spatial_shapes, 
- denoising_class=None, - denoising_bbox_unact=None, - is_teacher=False): - bs, _, _ = memory.shape - # prepare input for decoder - if self.training or self.eval_size is None or is_teacher: - anchors, valid_mask = self._generate_anchors(spatial_shapes) - else: - anchors, valid_mask = self.anchors, self.valid_mask - memory = paddle.where(valid_mask, memory, paddle.to_tensor(0.)) - output_memory = self.enc_output(memory) - - enc_outputs_class = self.enc_score_head(output_memory) - enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors - - _, topk_ind = paddle.topk( - enc_outputs_class.max(-1), self.num_queries, axis=1) - # extract region proposal boxes - batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype) - batch_ind = batch_ind.unsqueeze(-1).tile([1, self.num_queries]) - topk_ind = paddle.stack([batch_ind, topk_ind], axis=-1) - - reference_points_unact = paddle.gather_nd(enc_outputs_coord_unact, - topk_ind) # unsigmoided. - enc_topk_bboxes = F.sigmoid(reference_points_unact) - if denoising_bbox_unact is not None: - reference_points_unact = paddle.concat( - [denoising_bbox_unact, reference_points_unact], 1) - if self.training: - reference_points_unact = reference_points_unact.detach() - enc_topk_logits = paddle.gather_nd(enc_outputs_class, topk_ind) - - # extract region features - if self.learnt_init_query: - target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) - else: - target = paddle.gather_nd(output_memory, topk_ind) - if self.training: - target = target.detach() - if denoising_class is not None: - target = paddle.concat([denoising_class, target], 1) - - return target, reference_points_unact, enc_topk_bboxes, enc_topk_logits diff --git a/pdfdet/models/Paddle/ppdet/modeling/transformers/utils.py b/pdfdet/models/Paddle/ppdet/modeling/transformers/utils.py deleted file mode 100644 index a6f211a..0000000 --- a/pdfdet/models/Paddle/ppdet/modeling/transformers/utils.py +++ /dev/null @@ -1,410 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Modified from DETR (https://github.com/facebookresearch/detr) -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -# Modified from detrex (https://github.com/IDEA-Research/detrex) -# Copyright 2022 The IDEA Authors. All rights reserved. 
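
# (editor's note) The _get_decoder_input logic deleted above seeds the decoder
# with the num_queries encoder positions whose best class score is highest,
# gathered per image with batched indices. A small self-contained sketch of
# that top-k gather pattern (all shapes are illustrative):
import paddle

bs, seq_len, num_classes, num_queries = 2, 100, 5, 10
enc_outputs_class = paddle.rand([bs, seq_len, num_classes])

# score each position by its best class, then keep the top num_queries
_, topk_ind = paddle.topk(enc_outputs_class.max(-1), num_queries, axis=1)
batch_ind = paddle.arange(end=bs, dtype=topk_ind.dtype)
batch_ind = batch_ind.unsqueeze(-1).tile([1, num_queries])
gather_ind = paddle.stack([batch_ind, topk_ind], axis=-1)

topk_logits = paddle.gather_nd(enc_outputs_class, gather_ind)
assert topk_logits.shape == [bs, num_queries, num_classes]
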
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import copy -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ..bbox_utils import bbox_overlaps - -__all__ = [ - '_get_clones', 'bbox_overlaps', 'bbox_cxcywh_to_xyxy', - 'bbox_xyxy_to_cxcywh', 'sigmoid_focal_loss', 'inverse_sigmoid', - 'deformable_attention_core_func', 'varifocal_loss_with_logits' -] - - -def _get_clones(module, N): - return nn.LayerList([copy.deepcopy(module) for _ in range(N)]) - - -def bbox_cxcywh_to_xyxy(x): - cxcy, wh = paddle.split(x, 2, axis=-1) - return paddle.concat([cxcy - 0.5 * wh, cxcy + 0.5 * wh], axis=-1) - - -def bbox_xyxy_to_cxcywh(x): - x1, y1, x2, y2 = x.split(4, axis=-1) - return paddle.concat( - [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)], axis=-1) - - -def sigmoid_focal_loss(logit, label, normalizer=1.0, alpha=0.25, gamma=2.0): - prob = F.sigmoid(logit) - ce_loss = F.binary_cross_entropy_with_logits(logit, label, reduction="none") - p_t = prob * label + (1 - prob) * (1 - label) - loss = ce_loss * ((1 - p_t)**gamma) - - if alpha >= 0: - alpha_t = alpha * label + (1 - alpha) * (1 - label) - loss = alpha_t * loss - return loss.mean(1).sum() / normalizer - - -def inverse_sigmoid(x, eps=1e-5): - x = x.clip(min=0., max=1.) - return paddle.log(x.clip(min=eps) / (1 - x).clip(min=eps)) - - -def deformable_attention_core_func(value, value_spatial_shapes, - value_level_start_index, sampling_locations, - attention_weights): - """ - Args: - value (Tensor): [bs, value_length, n_head, c] - value_spatial_shapes (Tensor|List): [n_levels, 2] - value_level_start_index (Tensor|List): [n_levels] - sampling_locations (Tensor): [bs, query_length, n_head, n_levels, n_points, 2] - attention_weights (Tensor): [bs, query_length, n_head, n_levels, n_points] - - Returns: - output (Tensor): [bs, Length_{query}, C] - """ - bs, _, n_head, c = value.shape - _, Len_q, _, n_levels, n_points, _ = sampling_locations.shape - - split_shape = [h * w for h, w in value_spatial_shapes] - value_list = value.split(split_shape, axis=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for level, (h, w) in enumerate(value_spatial_shapes): - # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = value_list[level].flatten(2).transpose( - [0, 2, 1]).reshape([bs * n_head, c, h, w]) - # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = sampling_grids[:, :, :, level].transpose( - [0, 2, 1, 3, 4]).flatten(0, 1) - # N_*M_, D_, Lq_, P_ - sampling_value_l_ = F.grid_sample( - value_l_, - sampling_grid_l_, - mode='bilinear', - padding_mode='zeros', - align_corners=False) - sampling_value_list.append(sampling_value_l_) - # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_*M_, 1, Lq_, L_*P_) - attention_weights = attention_weights.transpose([0, 2, 1, 3, 4]).reshape( - [bs * n_head, 1, Len_q, n_levels * n_points]) - output = (paddle.stack( - sampling_value_list, axis=-2).flatten(-2) * - attention_weights).sum(-1).reshape([bs, n_head * c, Len_q]) - - return output.transpose([0, 2, 1]) - - -def get_valid_ratio(mask): - _, H, W = paddle.shape(mask) - valid_ratio_h = paddle.sum(mask[:, :, 0], 1) / H - valid_ratio_w = paddle.sum(mask[:, 0, :], 1) / W - # [b, 2] - return paddle.stack([valid_ratio_w, valid_ratio_h], -1) - - -def get_denoising_training_group(targets, - num_classes, - num_queries, - class_embed, - num_denoising=100, - 
label_noise_ratio=0.5, - box_noise_scale=1.0): - if num_denoising <= 0: - return None, None, None, None - num_gts = [len(t) for t in targets["gt_class"]] - max_gt_num = max(num_gts) - if max_gt_num == 0: - return None, None, None, None - - num_group = num_denoising // max_gt_num - num_group = 1 if num_group == 0 else num_group - # pad gt to max_num of a batch - bs = len(targets["gt_class"]) - input_query_class = paddle.full( - [bs, max_gt_num], num_classes, dtype='int32') - input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) - pad_gt_mask = paddle.zeros([bs, max_gt_num]) - for i in range(bs): - num_gt = num_gts[i] - if num_gt > 0: - input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) - input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] - pad_gt_mask[i, :num_gt] = 1 - - input_query_class = input_query_class.tile([1, num_group]) - input_query_bbox = input_query_bbox.tile([1, num_group, 1]) - pad_gt_mask = pad_gt_mask.tile([1, num_group]) - - dn_positive_idx = paddle.nonzero(pad_gt_mask)[:, 1] - dn_positive_idx = paddle.split(dn_positive_idx, - [n * num_group for n in num_gts]) - # total denoising queries - num_denoising = int(max_gt_num * num_group) - - if label_noise_ratio > 0: - input_query_class = input_query_class.flatten() - pad_gt_mask = pad_gt_mask.flatten() - # half of bbox prob - mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) - chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) - # randomly put a new one here - new_label = paddle.randint_like( - chosen_idx, 0, num_classes, dtype=input_query_class.dtype) - input_query_class.scatter_(chosen_idx, new_label) - input_query_class.reshape_([bs, num_denoising]) - pad_gt_mask.reshape_([bs, num_denoising]) - - if box_noise_scale > 0: - diff = paddle.concat( - [input_query_bbox[..., 2:] * 0.5, input_query_bbox[..., 2:]], - axis=-1) * box_noise_scale - diff *= (paddle.rand(input_query_bbox.shape) * 2.0 - 1.0) - input_query_bbox += diff - input_query_bbox = inverse_sigmoid(input_query_bbox) - - class_embed = paddle.concat( - [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) - input_query_class = paddle.gather( - class_embed, input_query_class.flatten(), - axis=0).reshape([bs, num_denoising, -1]) - - tgt_size = num_denoising + num_queries - attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 - # match query cannot see the reconstruction - attn_mask[num_denoising:, :num_denoising] = True - # reconstruct cannot see each other - for i in range(num_group): - if i == 0: - attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): - num_denoising] = True - if i == num_group - 1: - attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * - i] = True - else: - attn_mask[max_gt_num * i:max_gt_num * (i + 1), max_gt_num * (i + 1): - num_denoising] = True - attn_mask[max_gt_num * i:max_gt_num * (i + 1), :max_gt_num * - i] = True - attn_mask = ~attn_mask - dn_meta = { - "dn_positive_idx": dn_positive_idx, - "dn_num_group": num_group, - "dn_num_split": [num_denoising, num_queries] - } - - return input_query_class, input_query_bbox, attn_mask, dn_meta - - -def get_contrastive_denoising_training_group(targets, - num_classes, - num_queries, - class_embed, - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0): - if num_denoising <= 0: - return None, None, None, None - num_gts = [len(t) for t in targets["gt_class"]] - max_gt_num = max(num_gts) - if max_gt_num == 0: - return None, None, None, None - - num_group = num_denoising // max_gt_num - num_group = 1 if num_group == 0 else 
num_group - # pad gt to max_num of a batch - bs = len(targets["gt_class"]) - input_query_class = paddle.full( - [bs, max_gt_num], num_classes, dtype='int32') - input_query_bbox = paddle.zeros([bs, max_gt_num, 4]) - pad_gt_mask = paddle.zeros([bs, max_gt_num]) - for i in range(bs): - num_gt = num_gts[i] - if num_gt > 0: - input_query_class[i, :num_gt] = targets["gt_class"][i].squeeze(-1) - input_query_bbox[i, :num_gt] = targets["gt_bbox"][i] - pad_gt_mask[i, :num_gt] = 1 - # each group has positive and negative queries. - input_query_class = input_query_class.tile([1, 2 * num_group]) - input_query_bbox = input_query_bbox.tile([1, 2 * num_group, 1]) - pad_gt_mask = pad_gt_mask.tile([1, 2 * num_group]) - # positive and negative mask - negative_gt_mask = paddle.zeros([bs, max_gt_num * 2, 1]) - negative_gt_mask[:, max_gt_num:] = 1 - negative_gt_mask = negative_gt_mask.tile([1, num_group, 1]) - positive_gt_mask = 1 - negative_gt_mask - # contrastive denoising training positive index - positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask - dn_positive_idx = paddle.nonzero(positive_gt_mask)[:, 1] - dn_positive_idx = paddle.split(dn_positive_idx, - [n * num_group for n in num_gts]) - # total denoising queries - num_denoising = int(max_gt_num * 2 * num_group) - - if label_noise_ratio > 0: - input_query_class = input_query_class.flatten() - pad_gt_mask = pad_gt_mask.flatten() - # half of bbox prob - mask = paddle.rand(input_query_class.shape) < (label_noise_ratio * 0.5) - chosen_idx = paddle.nonzero(mask * pad_gt_mask).squeeze(-1) - # randomly put a new one here - new_label = paddle.randint_like( - chosen_idx, 0, num_classes, dtype=input_query_class.dtype) - input_query_class.scatter_(chosen_idx, new_label) - input_query_class.reshape_([bs, num_denoising]) - pad_gt_mask.reshape_([bs, num_denoising]) - - if box_noise_scale > 0: - known_bbox = bbox_cxcywh_to_xyxy(input_query_bbox) - - diff = paddle.tile(input_query_bbox[..., 2:] * 0.5, - [1, 1, 2]) * box_noise_scale - - rand_sign = paddle.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0 - rand_part = paddle.rand(input_query_bbox.shape) - rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * ( - 1 - negative_gt_mask) - rand_part *= rand_sign - known_bbox += rand_part * diff - known_bbox.clip_(min=0.0, max=1.0) - input_query_bbox = bbox_xyxy_to_cxcywh(known_bbox) - input_query_bbox = inverse_sigmoid(input_query_bbox) - - class_embed = paddle.concat( - [class_embed, paddle.zeros([1, class_embed.shape[-1]])]) - input_query_class = paddle.gather( - class_embed, input_query_class.flatten(), - axis=0).reshape([bs, num_denoising, -1]) - - tgt_size = num_denoising + num_queries - attn_mask = paddle.ones([tgt_size, tgt_size]) < 0 - # match query cannot see the reconstruction - attn_mask[num_denoising:, :num_denoising] = True - # reconstruct cannot see each other - for i in range(num_group): - if i == 0: - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * - 2 * (i + 1):num_denoising] = True - if i == num_group - 1: - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * - i * 2] = True - else: - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), max_gt_num * - 2 * (i + 1):num_denoising] = True - attn_mask[max_gt_num * 2 * i:max_gt_num * 2 * (i + 1), :max_gt_num * - 2 * i] = True - attn_mask = ~attn_mask - dn_meta = { - "dn_positive_idx": dn_positive_idx, - "dn_num_group": num_group, - "dn_num_split": [num_denoising, num_queries] - } - - return input_query_class, input_query_bbox, attn_mask, dn_meta - - -def 
get_sine_pos_embed(pos_tensor, - num_pos_feats=128, - temperature=10000, - exchange_xy=True): - """generate sine position embedding from a position tensor - - Args: - pos_tensor (Tensor): Shape as `(None, n)`. - num_pos_feats (int): projected shape for each float in the tensor. Default: 128 - temperature (int): The temperature used for scaling - the position embedding. Default: 10000. - exchange_xy (bool, optional): exchange pos x and pos y. \ - For example, input tensor is `[x, y]`, the results will # noqa - be `[pos(y), pos(x)]`. Defaults: True. - - Returns: - Tensor: Returned position embedding # noqa - with shape `(None, n * num_pos_feats)`. - """ - scale = 2. * math.pi - dim_t = 2. * paddle.floor_divide( - paddle.arange(num_pos_feats), paddle.to_tensor(2)) - dim_t = scale / temperature**(dim_t / num_pos_feats) - - def sine_func(x): - x *= dim_t - return paddle.stack( - (x[:, :, 0::2].sin(), x[:, :, 1::2].cos()), axis=3).flatten(2) - - pos_res = [sine_func(x) for x in pos_tensor.split(pos_tensor.shape[-1], -1)] - if exchange_xy: - pos_res[0], pos_res[1] = pos_res[1], pos_res[0] - pos_res = paddle.concat(pos_res, axis=2) - return pos_res - - -def mask_to_box_coordinate(mask, - normalize=False, - format="xyxy", - dtype="float32"): - """ - Compute the bounding boxes around the provided mask. - Args: - mask (Tensor:bool): [b, c, h, w] - - Returns: - bbox (Tensor): [b, c, 4] - """ - assert mask.ndim == 4 - assert format in ["xyxy", "xywh"] - if mask.sum() == 0: - return paddle.zeros([mask.shape[0], mask.shape[1], 4], dtype=dtype) - - h, w = mask.shape[-2:] - y, x = paddle.meshgrid( - paddle.arange( - end=h, dtype=dtype), paddle.arange( - end=w, dtype=dtype)) - - x_mask = x * mask - x_max = x_mask.flatten(-2).max(-1) + 1 - x_min = paddle.where(mask, x_mask, - paddle.to_tensor(1e8)).flatten(-2).min(-1) - - y_mask = y * mask - y_max = y_mask.flatten(-2).max(-1) + 1 - y_min = paddle.where(mask, y_mask, - paddle.to_tensor(1e8)).flatten(-2).min(-1) - out_bbox = paddle.stack([x_min, y_min, x_max, y_max], axis=-1) - if normalize: - out_bbox /= paddle.to_tensor([w, h, w, h]).astype(dtype) - - return out_bbox if format == "xyxy" else bbox_xyxy_to_cxcywh(out_bbox) - - -def varifocal_loss_with_logits(pred_logits, - gt_score, - label, - normalizer=1.0, - alpha=0.75, - gamma=2.0): - pred_score = F.sigmoid(pred_logits) - weight = alpha * pred_score.pow(gamma) * (1 - label) + gt_score * label - loss = F.binary_cross_entropy_with_logits( - pred_logits, gt_score, weight=weight, reduction='none') - return loss.mean(1).sum() / normalizer diff --git a/pdfdet/models/Paddle/ppdet/optimizer/__init__.py b/pdfdet/models/Paddle/ppdet/optimizer/__init__.py deleted file mode 100644 index aa690dc..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import optimizer -from . 
import ema - -from .optimizer import * -from .ema import * diff --git a/pdfdet/models/Paddle/ppdet/optimizer/adamw.py b/pdfdet/models/Paddle/ppdet/optimizer/adamw.py deleted file mode 100644 index 12ab619..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/adamw.py +++ /dev/null @@ -1,272 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from paddle.optimizer import AdamW -from functools import partial -import re - -IS_PADDLE_LATER_2_4 = ( - int(paddle.version.major) >= 2 and - int(paddle.version.minor) >= 4) or int(paddle.version.major) == 0 - - -def layerwise_lr_decay(decay_rate, name_dict, n_layers, param): - """ - Args: - decay_rate (float): - The layer-wise decay ratio. - name_dict (dict): - The keys of name_dict is dynamic name of model while the value - of name_dict is static name. - Use model.named_parameters() to get name_dict. - n_layers (int): - Total number of layers in the transformer encoder. - """ - ratio = 1.0 - static_name = name_dict[param.name] - if 'blocks.' in static_name or 'layers.' in static_name: - idx_1 = static_name.find('blocks.') - idx_2 = static_name.find('layers.') - assert any([x >= 0 for x in [idx_1, idx_2]]), '' - idx = idx_1 if idx_1 >= 0 else idx_2 - # idx = re.findall('[blocks|layers]\.(\d+)\.', static_name)[0] - - layer = int(static_name[idx:].split('.')[1]) - ratio = decay_rate**(n_layers - layer) - - elif 'cls_token' in static_name or 'patch_embed' in static_name or 'pos_embed' in static_name: - ratio = decay_rate**(n_layers + 1) - - if IS_PADDLE_LATER_2_4: - return ratio - else: - param.optimize_attr['learning_rate'] *= ratio - - -class AdamWDL(AdamW): - r""" - The AdamWDL optimizer is implemented based on the AdamW Optimization with dynamic lr setting. - Generally it's used for transformer model. - - We use "layerwise_lr_decay" as default dynamic lr setting method of AdamWDL. - “Layer-wise decay” means exponentially decaying the learning rates of individual - layers in a top-down manner. For example, suppose the 24-th layer uses a learning - rate l, and the Layer-wise decay rate is α, then the learning rate of layer m - is lα^(24-m). See more details on: https://arxiv.org/abs/1906.08237. - - .. math:: - & t = t + 1 - - & moment\_1\_out = {\beta}_1 * moment\_1 + (1 - {\beta}_1) * grad - - & moment\_2\_out = {\beta}_2 * moment\_2 + (1 - {\beta}_2) * grad * grad - - & learning\_rate = learning\_rate * \frac{\sqrt{1 - {\beta}_2^t}}{1 - {\beta}_1^t} - - & param\_out = param - learning\_rate * (\frac{moment\_1}{\sqrt{moment\_2} + \epsilon} + \lambda * param) - - Args: - learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. - It can be a float value or a LRScheduler. The default value is 0.001. - beta1 (float, optional): The exponential decay rate for the 1st moment estimates. 
-            It should be a float number or a Tensor with shape [1] and data type as float32.
-            The default value is 0.9.
-        beta2 (float, optional): The exponential decay rate for the 2nd moment estimates.
-            It should be a float number or a Tensor with shape [1] and data type as float32.
-            The default value is 0.999.
-        epsilon (float, optional): A small float value for numerical stability.
-            It should be a float number or a Tensor with shape [1] and data type as float32.
-            The default value is 1e-08.
-        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
-            This parameter is required in dygraph mode. \
-            The default value is None in static mode, in which case all parameters will be updated.
-        weight_decay (float, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01.
-        apply_decay_param_fun (function|None, optional): If it is not None,
-            only tensors that make apply_decay_param_fun(Tensor.name)==True
-            will be updated. It only works when we want to specify tensors.
-            Default: None.
-        grad_clip (GradientClipBase, optional): Gradient clipping strategy, it's an instance of
-            some derived class of ``GradientClipBase`` . There are three clipping strategies
-            ( :ref:`api_fluid_clip_GradientClipByGlobalNorm` , :ref:`api_fluid_clip_GradientClipByNorm` ,
-            :ref:`api_fluid_clip_GradientClipByValue` ). Default None, meaning there is no gradient clipping.
-        lazy_mode (bool, optional): The official Adam algorithm has two moving-average accumulators.
-            The accumulators are updated at every step. Every element of the two moving-averages
-            is updated in both dense mode and sparse mode. If the size of a parameter is very large,
-            the update may be very slow. Lazy mode only updates the elements that have
-            gradients in the current mini-batch, so it will be much faster. But this mode has
-            different semantics from the original Adam algorithm and may lead to different results.
-            The default value is False.
-        multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
-        layerwise_decay (float, optional): The layer-wise decay ratio. Defaults to 1.0.
-        n_layers (int, optional): The total number of encoder layers. Defaults to 12.
-        set_param_lr_fun (function|None, optional): If it's not None, set_param_lr_fun() will set the parameter
-            learning rate before it executes the Adam operator. Defaults to :ref:`layerwise_lr_decay`.
-        name_dict (dict, optional): The keys of name_dict are the dynamic names of the model while the values
-            of name_dict are the static names. Use model.named_parameters() to get name_dict.
-        name (str, optional): Normally there is no need for the user to set this property.
-            For more information, please refer to :ref:`api_guide_Name`.
-            The default value is None.
-
-    Examples:
-        ..
code-block:: python - - import paddle - from paddlenlp.ops.optimizer import AdamWDL - def simple_lr_setting(decay_rate, name_dict, n_layers, param): - ratio = 1.0 - static_name = name_dict[param.name] - if "weight" in static_name: - ratio = decay_rate**0.5 - param.optimize_attr["learning_rate"] *= ratio - - linear = paddle.nn.Linear(10, 10) - - name_dict = dict() - for n, p in linear.named_parameters(): - name_dict[p.name] = n - - inp = paddle.rand([10,10], dtype="float32") - out = linear(inp) - loss = paddle.mean(out) - - adamwdl = AdamWDL( - learning_rate=1e-4, - parameters=linear.parameters(), - set_param_lr_fun=simple_lr_setting, - layerwise_decay=0.8, - name_dict=name_dict) - - loss.backward() - adamwdl.step() - adamwdl.clear_grad() - """ - - def __init__(self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameters=None, - weight_decay=0.01, - apply_decay_param_fun=None, - grad_clip=None, - lazy_mode=False, - multi_precision=False, - layerwise_decay=1.0, - n_layers=12, - set_param_lr_func=None, - name_dict=None, - name=None): - if not isinstance(layerwise_decay, float): - raise TypeError("coeff should be float or Tensor.") - self.layerwise_decay = layerwise_decay - self.n_layers = n_layers - self.set_param_lr_func = partial( - set_param_lr_func, layerwise_decay, name_dict, - n_layers) if set_param_lr_func is not None else set_param_lr_func - - if IS_PADDLE_LATER_2_4: - super(AdamWDL, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - apply_decay_param_fun=apply_decay_param_fun, - weight_decay=weight_decay, - lazy_mode=lazy_mode, - multi_precision=multi_precision, - lr_ratio=self.set_param_lr_func) - else: - super(AdamWDL, self).__init__( - learning_rate=learning_rate, - parameters=parameters, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=grad_clip, - name=name, - apply_decay_param_fun=apply_decay_param_fun, - weight_decay=weight_decay, - lazy_mode=lazy_mode, - multi_precision=multi_precision) - - -def _append_optimize_op(self, block, param_and_grad): - if self.set_param_lr_func is None: - return super(AdamWDL, self)._append_optimize_op(block, param_and_grad) - - self._append_decoupled_weight_decay(block, param_and_grad) - prev_lr = param_and_grad[0].optimize_attr["learning_rate"] - self.set_param_lr_func(param_and_grad[0]) - # excute Adam op - res = super(AdamW, self)._append_optimize_op(block, param_and_grad) - param_and_grad[0].optimize_attr["learning_rate"] = prev_lr - return res - - -if not IS_PADDLE_LATER_2_4: - AdamWDL._append_optimize_op = _append_optimize_op - - -def build_adamwdl(model, - lr=1e-4, - weight_decay=0.05, - betas=(0.9, 0.999), - layer_decay=0.65, - num_layers=None, - filter_bias_and_bn=True, - skip_decay_names=None, - set_param_lr_func='layerwise_lr_decay'): - - if skip_decay_names and filter_bias_and_bn: - decay_dict = { - param.name: not (len(param.shape) == 1 or name.endswith('.bias') or - any([_n in name for _n in skip_decay_names])) - for name, param in model.named_parameters() - } - parameters = [p for p in model.parameters()] - - else: - parameters = model.parameters() - - opt_args = dict( - parameters=parameters, learning_rate=lr, weight_decay=weight_decay) - - if decay_dict is not None: - opt_args['apply_decay_param_fun'] = lambda n: decay_dict[n] - - if isinstance(set_param_lr_func, str): - set_param_lr_func = eval(set_param_lr_func) - opt_args['set_param_lr_func'] = set_param_lr_func - - opt_args['beta1'] = 
betas[0]
-    opt_args['beta2'] = betas[1]
-
-    opt_args['layerwise_decay'] = layer_decay
-    name_dict = {p.name: n for n, p in model.named_parameters()}
-
-    opt_args['name_dict'] = name_dict
-    opt_args['n_layers'] = num_layers
-
-    optimizer = AdamWDL(**opt_args)
-
-    return optimizer
diff --git a/pdfdet/models/Paddle/ppdet/optimizer/ema.py b/pdfdet/models/Paddle/ppdet/optimizer/ema.py
deleted file mode 100644
index 84cc9ac..0000000
--- a/pdfdet/models/Paddle/ppdet/optimizer/ema.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import paddle
-import weakref
-from copy import deepcopy
-
-from .utils import get_bn_running_state_names
-
-__all__ = ['ModelEMA', 'SimpleModelEMA']
-
-
-class ModelEMA(object):
-    """
-    Exponential Weighted Average for Deep Neural Networks
-    Args:
-        model (nn.Layer): The detector model.
-        decay (float): The decay used for updating the EMA parameters.
-            EMA parameters are updated with the formula:
-            `ema_param = decay * ema_param + (1 - decay) * cur_param`.
-            Default is 0.9998.
-        ema_decay_type (str): type in ['threshold', 'normal', 'exponential'],
-            'threshold' as default.
-        cycle_epoch (int): The interval, in epochs, at which ema_param and
-            step are reset. Default is -1, which means never reset. It adds
-            a regularizing effect to the EMA; the value is chosen empirically
-            and is effective when the total number of training epochs is large.
-        ema_black_list (set|list|tuple, optional): The custom EMA black list.
-            Blacklist of weight names that will not participate in the EMA
-            calculation. Default: None.
- """ - - def __init__(self, - model, - decay=0.9998, - ema_decay_type='threshold', - cycle_epoch=-1, - ema_black_list=None, - ema_filter_no_grad=False): - self.step = 0 - self.epoch = 0 - self.decay = decay - self.ema_decay_type = ema_decay_type - self.cycle_epoch = cycle_epoch - self.ema_black_list = self._match_ema_black_list( - model.state_dict().keys(), ema_black_list) - bn_states_names = get_bn_running_state_names(model) - if ema_filter_no_grad: - for n, p in model.named_parameters(): - if p.stop_gradient and n not in bn_states_names: - self.ema_black_list.add(n) - - self.state_dict = dict() - for k, v in model.state_dict().items(): - if k in self.ema_black_list: - self.state_dict[k] = v - else: - self.state_dict[k] = paddle.zeros_like(v, dtype='float32') - - self._model_state = { - k: weakref.ref(p) - for k, p in model.state_dict().items() - } - - def reset(self): - self.step = 0 - self.epoch = 0 - for k, v in self.state_dict.items(): - if k in self.ema_black_list: - self.state_dict[k] = v - else: - self.state_dict[k] = paddle.zeros_like(v) - - def resume(self, state_dict, step=0): - for k, v in state_dict.items(): - if k in self.state_dict: - if self.state_dict[k].dtype == v.dtype: - self.state_dict[k] = v - else: - self.state_dict[k] = v.astype(self.state_dict[k].dtype) - self.step = step - - def update(self, model=None): - if self.ema_decay_type == 'threshold': - decay = min(self.decay, (1 + self.step) / (10 + self.step)) - elif self.ema_decay_type == 'exponential': - decay = self.decay * (1 - math.exp(-(self.step + 1) / 2000)) - else: - decay = self.decay - self._decay = decay - - if model is not None: - model_dict = model.state_dict() - else: - model_dict = {k: p() for k, p in self._model_state.items()} - assert all( - [v is not None for _, v in model_dict.items()]), 'python gc.' - - for k, v in self.state_dict.items(): - if k not in self.ema_black_list: - v = decay * v + (1 - decay) * model_dict[k].astype('float32') - v.stop_gradient = True - self.state_dict[k] = v - self.step += 1 - - def apply(self): - if self.step == 0: - return self.state_dict - state_dict = dict() - model_dict = {k: p() for k, p in self._model_state.items()} - for k, v in self.state_dict.items(): - if k in self.ema_black_list: - v.stop_gradient = True - state_dict[k] = v - else: - if self.ema_decay_type != 'exponential': - v = v / (1 - self._decay**self.step) - v = v.astype(model_dict[k].dtype) - v.stop_gradient = True - state_dict[k] = v - self.epoch += 1 - if self.cycle_epoch > 0 and self.epoch == self.cycle_epoch: - self.reset() - - return state_dict - - def _match_ema_black_list(self, weight_name, ema_black_list=None): - out_list = set() - if ema_black_list: - for name in weight_name: - for key in ema_black_list: - if key in name: - out_list.add(name) - return out_list - - -class SimpleModelEMA(object): - """ - Model Exponential Moving Average from https://github.com/rwightman/pytorch-image-models - Keep a moving average of everything in the model state_dict (parameters and buffers). - This is intended to allow functionality like - https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage - A smoothed version of the weights is necessary for some training schemes to perform well. - This class is sensitive where it is initialized in the sequence of model init, - GPU assignment and distributed training wrappers. - """ - - def __init__(self, model=None, decay=0.9996): - """ - Args: - model (nn.Module): model to apply EMA. - decay (float): ema decay reate. 
- """ - self.model = deepcopy(model) - self.decay = decay - - def update(self, model, decay=None): - if decay is None: - decay = self.decay - - with paddle.no_grad(): - state = {} - msd = model.state_dict() - for k, v in self.model.state_dict().items(): - if paddle.is_floating_point(v): - v *= decay - v += (1.0 - decay) * msd[k].detach() - state[k] = v - self.model.set_state_dict(state) - - def resume(self, state_dict, step=0): - state = {} - msd = state_dict - for k, v in self.model.state_dict().items(): - if paddle.is_floating_point(v): - v = msd[k].detach() - state[k] = v - self.model.set_state_dict(state) - self.step = step diff --git a/pdfdet/models/Paddle/ppdet/optimizer/optimizer.py b/pdfdet/models/Paddle/ppdet/optimizer/optimizer.py deleted file mode 100644 index 3c528fc..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/optimizer.py +++ /dev/null @@ -1,358 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys -import math -import paddle -import paddle.nn as nn - -import paddle.optimizer as optimizer -import paddle.regularizer as regularizer - -from ppdet.core.workspace import register, serializable -import copy - -from .adamw import AdamWDL, build_adamwdl - -__all__ = ['LearningRate', 'OptimizerBuilder'] - -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@serializable -class CosineDecay(object): - """ - Cosine learning rate decay - - Args: - max_epochs (int): max epochs for the training process. - if you commbine cosine decay with warmup, it is recommended that - the max_iters is much larger than the warmup iter - use_warmup (bool): whether to use warmup. Default: True. - min_lr_ratio (float): minimum learning rate ratio. Default: 0. - last_plateau_epochs (int): use minimum learning rate in - the last few epochs. Default: 0. 
- """ - - def __init__(self, - max_epochs=1000, - use_warmup=True, - min_lr_ratio=0., - last_plateau_epochs=0): - self.max_epochs = max_epochs - self.use_warmup = use_warmup - self.min_lr_ratio = min_lr_ratio - self.last_plateau_epochs = last_plateau_epochs - - def __call__(self, - base_lr=None, - boundary=None, - value=None, - step_per_epoch=None): - assert base_lr is not None, "either base LR or values should be provided" - - max_iters = self.max_epochs * int(step_per_epoch) - last_plateau_iters = self.last_plateau_epochs * int(step_per_epoch) - min_lr = base_lr * self.min_lr_ratio - if boundary is not None and value is not None and self.use_warmup: - # use warmup - warmup_iters = len(boundary) - for i in range(int(boundary[-1]), max_iters): - boundary.append(i) - if i < max_iters - last_plateau_iters: - decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( - (i - warmup_iters) * math.pi / - (max_iters - warmup_iters - last_plateau_iters)) + 1) - value.append(decayed_lr) - else: - value.append(min_lr) - return optimizer.lr.PiecewiseDecay(boundary, value) - elif last_plateau_iters > 0: - # not use warmup, but set `last_plateau_epochs` > 0 - boundary = [] - value = [] - for i in range(max_iters): - if i < max_iters - last_plateau_iters: - decayed_lr = min_lr + (base_lr - min_lr) * 0.5 * (math.cos( - i * math.pi / (max_iters - last_plateau_iters)) + 1) - value.append(decayed_lr) - else: - value.append(min_lr) - if i > 0: - boundary.append(i) - return optimizer.lr.PiecewiseDecay(boundary, value) - - return optimizer.lr.CosineAnnealingDecay( - base_lr, T_max=max_iters, eta_min=min_lr) - - -@serializable -class PiecewiseDecay(object): - """ - Multi step learning rate decay - - Args: - gamma (float | list): decay factor - milestones (list): steps at which to decay learning rate - """ - - def __init__(self, - gamma=[0.1, 0.01], - milestones=[8, 11], - values=None, - use_warmup=True): - super(PiecewiseDecay, self).__init__() - if type(gamma) is not list: - self.gamma = [] - for i in range(len(milestones)): - self.gamma.append(gamma / 10**i) - else: - self.gamma = gamma - self.milestones = milestones - self.values = values - self.use_warmup = use_warmup - - def __call__(self, - base_lr=None, - boundary=None, - value=None, - step_per_epoch=None): - if boundary is not None and self.use_warmup: - boundary.extend([int(step_per_epoch) * i for i in self.milestones]) - else: - # do not use LinearWarmup - boundary = [int(step_per_epoch) * i for i in self.milestones] - value = [base_lr] # during step[0, boundary[0]] is base_lr - - # self.values is setted directly in config - if self.values is not None: - assert len(self.milestones) + 1 == len(self.values) - return optimizer.lr.PiecewiseDecay(boundary, self.values) - - # value is computed by self.gamma - value = value if value is not None else [base_lr] - for i in self.gamma: - value.append(base_lr * i) - - return optimizer.lr.PiecewiseDecay(boundary, value) - - -@serializable -class LinearWarmup(object): - """ - Warm up learning rate linearly - - Args: - steps (int): warm up steps - start_factor (float): initial learning rate factor - epochs (int|None): use epochs as warm up steps, the priority - of `epochs` is higher than `steps`. Default: None. - """ - - def __init__(self, steps=500, start_factor=1. 
/ 3, epochs=None, epochs_first=True): - super(LinearWarmup, self).__init__() - self.steps = steps - self.start_factor = start_factor - self.epochs = epochs - self.epochs_first = epochs_first - - def __call__(self, base_lr, step_per_epoch): - boundary = [] - value = [] - if self.epochs_first and self.epochs is not None: - warmup_steps = self.epochs * step_per_epoch - else: - warmup_steps = self.steps - warmup_steps = max(warmup_steps, 1) - for i in range(warmup_steps + 1): - if warmup_steps > 0: - alpha = i / warmup_steps - factor = self.start_factor * (1 - alpha) + alpha - lr = base_lr * factor - value.append(lr) - if i > 0: - boundary.append(i) - return boundary, value - - -@serializable -class ExpWarmup(object): - """ - Warm up learning rate in exponential mode - Args: - steps (int): warm up steps. - epochs (int|None): use epochs as warm up steps, the priority - of `epochs` is higher than `steps`. Default: None. - power (int): Exponential coefficient. Default: 2. - """ - - def __init__(self, steps=1000, epochs=None, power=2): - super(ExpWarmup, self).__init__() - self.steps = steps - self.epochs = epochs - self.power = power - - def __call__(self, base_lr, step_per_epoch): - boundary = [] - value = [] - warmup_steps = self.epochs * step_per_epoch if self.epochs is not None else self.steps - warmup_steps = max(warmup_steps, 1) - for i in range(warmup_steps + 1): - factor = (i / float(warmup_steps))**self.power - value.append(base_lr * factor) - if i > 0: - boundary.append(i) - return boundary, value - - -@register -class LearningRate(object): - """ - Learning Rate configuration - - Args: - base_lr (float): base learning rate - schedulers (list): learning rate schedulers - """ - __category__ = 'optim' - - def __init__(self, - base_lr=0.01, - schedulers=[PiecewiseDecay(), LinearWarmup()]): - super(LearningRate, self).__init__() - self.base_lr = base_lr - self.schedulers = [] - - schedulers = copy.deepcopy(schedulers) - for sched in schedulers: - if isinstance(sched, dict): - # support dict sched instantiate - module = sys.modules[__name__] - type = sched.pop("name") - scheduler = getattr(module, type)(**sched) - self.schedulers.append(scheduler) - else: - self.schedulers.append(sched) - - def __call__(self, step_per_epoch): - assert len(self.schedulers) >= 1 - if not self.schedulers[0].use_warmup: - return self.schedulers[0](base_lr=self.base_lr, - step_per_epoch=step_per_epoch) - - # TODO: split warmup & decay - # warmup - boundary, value = self.schedulers[1](self.base_lr, step_per_epoch) - # decay - decay_lr = self.schedulers[0](self.base_lr, boundary, value, - step_per_epoch) - return decay_lr - - -@register -class OptimizerBuilder(): - """ - Build optimizer handles - Args: - regularizer (object): an `Regularizer` instance - optimizer (object): an `Optimizer` instance - """ - __category__ = 'optim' - - def __init__(self, - clip_grad_by_norm=None, - clip_grad_by_value=None, - regularizer={'type': 'L2', - 'factor': .0001}, - optimizer={'type': 'Momentum', - 'momentum': .9}): - self.clip_grad_by_norm = clip_grad_by_norm - self.clip_grad_by_value = clip_grad_by_value - self.regularizer = regularizer - self.optimizer = optimizer - - def __call__(self, learning_rate, model=None): - if self.clip_grad_by_norm is not None: - grad_clip = nn.ClipGradByGlobalNorm( - clip_norm=self.clip_grad_by_norm) - elif self.clip_grad_by_value is not None: - var = abs(self.clip_grad_by_value) - grad_clip = nn.ClipGradByValue(min=-var, max=var) - else: - grad_clip = None - if self.regularizer and self.regularizer 
!= 'None': - reg_type = self.regularizer['type'] + 'Decay' - reg_factor = self.regularizer['factor'] - regularization = getattr(regularizer, reg_type)(reg_factor) - else: - regularization = None - - optim_args = self.optimizer.copy() - optim_type = optim_args['type'] - del optim_args['type'] - - if optim_type == 'AdamWDL': - return build_adamwdl(model, lr=learning_rate, **optim_args) - - if optim_type != 'AdamW': - optim_args['weight_decay'] = regularization - - op = getattr(optimizer, optim_type) - - if 'param_groups' in optim_args: - assert isinstance(optim_args['param_groups'], list), '' - - param_groups = optim_args.pop('param_groups') - - params, visited = [], [] - for group in param_groups: - assert isinstance(group, - dict) and 'params' in group and isinstance( - group['params'], list), '' - _params = { - n: p - for n, p in model.named_parameters() - if any([k in n - for k in group['params']]) and p.trainable is True - } - _group = group.copy() - _group.update({'params': list(_params.values())}) - - params.append(_group) - visited.extend(list(_params.keys())) - - ext_params = [ - p for n, p in model.named_parameters() - if n not in visited and p.trainable is True - ] - - if len(ext_params) < len(model.parameters()): - params.append({'params': ext_params}) - - elif len(ext_params) > len(model.parameters()): - raise RuntimeError - - else: - _params = model.parameters() - params = [param for param in _params if param.trainable is True] - - return op(learning_rate=learning_rate, - parameters=params, - grad_clip=grad_clip, - **optim_args) diff --git a/pdfdet/models/Paddle/ppdet/optimizer/utils.py b/pdfdet/models/Paddle/ppdet/optimizer/utils.py deleted file mode 100644 index ce2de49..0000000 --- a/pdfdet/models/Paddle/ppdet/optimizer/utils.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn - -from typing import List - - -def get_bn_running_state_names(model: nn.Layer) -> List[str]: - """Get all bn state full names including running mean and variance - """ - names = [] - for n, m in model.named_sublayers(): - if isinstance(m, (nn.BatchNorm2D, nn.SyncBatchNorm)): - assert hasattr(m, '_mean'), f'assert {m} has _mean' - assert hasattr(m, '_variance'), f'assert {m} has _variance' - running_mean = f'{n}._mean' - running_var = f'{n}._variance' - names.extend([running_mean, running_var]) - - return names diff --git a/pdfdet/models/Paddle/ppdet/slim/__init__.py b/pdfdet/models/Paddle/ppdet/slim/__init__.py deleted file mode 100644 index 7129190..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/__init__.py +++ /dev/null @@ -1,110 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from . import distill_loss -from . import distill_model -from . import ofa -from . import prune -from . import quant -from . import unstructured_prune - -from .distill_loss import * -from .distill_model import * -from .ofa import * -from .prune import * -from .quant import * -from .unstructured_prune import * - -import yaml -from ppdet.core.workspace import load_config -from ppdet.utils.checkpoint import load_pretrain_weight - - -def build_slim_model(cfg, slim_cfg, mode='train'): - with open(slim_cfg) as f: - slim_load_cfg = yaml.load(f, Loader=yaml.Loader) - - if mode != 'train' and slim_load_cfg['slim'] == 'Distill': - return cfg - - if slim_load_cfg['slim'] == 'Distill': - if "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "FGD": - model = FGDDistillModel(cfg, slim_cfg) - elif "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "LD": - model = LDDistillModel(cfg, slim_cfg) - elif "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "CWD": - model = CWDDistillModel(cfg, slim_cfg) - elif "slim_method" in slim_load_cfg and slim_load_cfg[ - 'slim_method'] == "PPYOLOEDistill": - model = PPYOLOEDistillModel(cfg, slim_cfg) - else: - # common distillation model - model = DistillModel(cfg, slim_cfg) - cfg['model'] = model - cfg['slim_type'] = cfg.slim - elif slim_load_cfg['slim'] == 'OFA': - load_config(slim_cfg) - model = create(cfg.architecture) - load_pretrain_weight(model, cfg.weights) - slim = create(cfg.slim) - cfg['slim'] = slim - cfg['model'] = slim(model, model.state_dict()) - cfg['slim_type'] = cfg.slim - elif slim_load_cfg['slim'] == 'DistillPrune': - if mode == 'train': - model = DistillModel(cfg, slim_cfg) - pruner = create(cfg.pruner) - pruner(model.student_model) - else: - model = create(cfg.architecture) - weights = cfg.weights - load_config(slim_cfg) - pruner = create(cfg.pruner) - model = pruner(model) - load_pretrain_weight(model, weights) - cfg['model'] = model - cfg['slim_type'] = cfg.slim - elif slim_load_cfg['slim'] == 'PTQ': - model = create(cfg.architecture) - load_config(slim_cfg) - load_pretrain_weight(model, cfg.weights) - slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - cfg['slim'] = slim - cfg['model'] = slim(model) - elif slim_load_cfg['slim'] == 'UnstructuredPruner': - load_config(slim_cfg) - slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - cfg['slim'] = slim - cfg['unstructured_prune'] = True - else: - load_config(slim_cfg) - model = create(cfg.architecture) - if mode == 'train': - load_pretrain_weight(model, cfg.pretrain_weights) - slim = create(cfg.slim) - cfg['slim_type'] = cfg.slim - # TODO: fix quant export model in framework. 
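        # (editor's note) A sketch of the export-time workaround the TODO
        # above refers to, applied by the lines just below: when exporting a
        # QAT model in test mode, activation pre-processing is switched off
        # before the model is wrapped. The starting value 'PACT' is an
        # assumption for illustration only:
        #
        #     quant_config = {'activation_preprocess_type': 'PACT'}
        #     if mode == 'test':
        #         quant_config['activation_preprocess_type'] = None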
- if mode == 'test' and 'QAT' in slim_load_cfg['slim']: - slim.quant_config['activation_preprocess_type'] = None - cfg['model'] = slim(model) - cfg['slim'] = slim - if mode != 'train': - load_pretrain_weight(cfg['model'], cfg.weights) - - return cfg diff --git a/pdfdet/models/Paddle/ppdet/slim/distill_loss.py b/pdfdet/models/Paddle/ppdet/slim/distill_loss.py deleted file mode 100644 index d325a5b..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/distill_loss.py +++ /dev/null @@ -1,919 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import math -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -from paddle import ParamAttr - -from ppdet.core.workspace import register -from ppdet.modeling import ops -from ppdet.modeling.losses.iou_loss import GIoULoss -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'DistillYOLOv3Loss', - 'KnowledgeDistillationKLDivLoss', - 'DistillPPYOLOELoss', - 'FGDFeatureLoss', - 'CWDFeatureLoss', - 'PKDFeatureLoss', - 'MGDFeatureLoss', -] - - -def parameter_init(mode="kaiming", value=0.): - if mode == "kaiming": - weight_attr = paddle.nn.initializer.KaimingUniform() - elif mode == "constant": - weight_attr = paddle.nn.initializer.Constant(value=value) - else: - weight_attr = paddle.nn.initializer.KaimingUniform() - - weight_init = ParamAttr(initializer=weight_attr) - return weight_init - - -def feature_norm(feat): - # Normalize the feature maps to have zero mean and unit variances. 
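    # (editor's note) Concretely, for every channel c this computes, over all
    # N * H * W positions of that channel:
    #     feat[:, c] = (feat[:, c] - mean_c) / (std_c + 1e-6)
    # where the 1e-6 keeps the division stable for near-constant channels.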
- assert len(feat.shape) == 4 - N, C, H, W = feat.shape - feat = feat.transpose([1, 0, 2, 3]).reshape([C, -1]) - mean = feat.mean(axis=-1, keepdim=True) - std = feat.std(axis=-1, keepdim=True) - feat = (feat - mean) / (std + 1e-6) - return feat.reshape([C, N, H, W]).transpose([1, 0, 2, 3]) - - -@register -class DistillYOLOv3Loss(nn.Layer): - def __init__(self, weight=1000): - super(DistillYOLOv3Loss, self).__init__() - self.loss_weight = weight - - def obj_weighted_reg(self, sx, sy, sw, sh, tx, ty, tw, th, tobj): - loss_x = ops.sigmoid_cross_entropy_with_logits(sx, F.sigmoid(tx)) - loss_y = ops.sigmoid_cross_entropy_with_logits(sy, F.sigmoid(ty)) - loss_w = paddle.abs(sw - tw) - loss_h = paddle.abs(sh - th) - loss = paddle.add_n([loss_x, loss_y, loss_w, loss_h]) - weighted_loss = paddle.mean(loss * F.sigmoid(tobj)) - return weighted_loss - - def obj_weighted_cls(self, scls, tcls, tobj): - loss = ops.sigmoid_cross_entropy_with_logits(scls, F.sigmoid(tcls)) - weighted_loss = paddle.mean(paddle.multiply(loss, F.sigmoid(tobj))) - return weighted_loss - - def obj_loss(self, sobj, tobj): - obj_mask = paddle.cast(tobj > 0., dtype="float32") - obj_mask.stop_gradient = True - loss = paddle.mean( - ops.sigmoid_cross_entropy_with_logits(sobj, obj_mask)) - return loss - - def forward(self, teacher_model, student_model): - teacher_distill_pairs = teacher_model.yolo_head.loss.distill_pairs - student_distill_pairs = student_model.yolo_head.loss.distill_pairs - distill_reg_loss, distill_cls_loss, distill_obj_loss = [], [], [] - for s_pair, t_pair in zip(student_distill_pairs, teacher_distill_pairs): - distill_reg_loss.append( - self.obj_weighted_reg(s_pair[0], s_pair[1], s_pair[2], s_pair[ - 3], t_pair[0], t_pair[1], t_pair[2], t_pair[3], t_pair[4])) - distill_cls_loss.append( - self.obj_weighted_cls(s_pair[5], t_pair[5], t_pair[4])) - distill_obj_loss.append(self.obj_loss(s_pair[4], t_pair[4])) - distill_reg_loss = paddle.add_n(distill_reg_loss) - distill_cls_loss = paddle.add_n(distill_cls_loss) - distill_obj_loss = paddle.add_n(distill_obj_loss) - loss = (distill_reg_loss + distill_cls_loss + distill_obj_loss - ) * self.loss_weight - return loss - - -@register -class KnowledgeDistillationKLDivLoss(nn.Layer): - """Loss function for knowledge distilling using KL divergence. - - Args: - reduction (str): Options are `'none'`, `'mean'` and `'sum'`. - loss_weight (float): Loss weight of current loss. - T (int): Temperature for distillation. - """ - - def __init__(self, reduction='mean', loss_weight=1.0, T=10): - super(KnowledgeDistillationKLDivLoss, self).__init__() - assert reduction in ('none', 'mean', 'sum') - assert T >= 1 - self.reduction = reduction - self.loss_weight = loss_weight - self.T = T - - def knowledge_distillation_kl_div_loss(self, - pred, - soft_label, - T, - detach_target=True): - r"""Loss function for knowledge distilling using KL divergence. - - Args: - pred (Tensor): Predicted logits with shape (N, n + 1). - soft_label (Tensor): Target logits with shape (N, N + 1). - T (int): Temperature for distillation. - detach_target (bool): Remove soft_label from automatic differentiation - """ - assert pred.shape == soft_label.shape - target = F.softmax(soft_label / T, axis=1) - if detach_target: - target = target.detach() - - kd_loss = F.kl_div( - F.log_softmax( - pred / T, axis=1), target, reduction='none').mean(1) * (T * T) - - return kd_loss - - def forward(self, - pred, - soft_label, - weight=None, - avg_factor=None, - reduction_override=None): - """Forward function. 
- - Args: - pred (Tensor): Predicted logits with shape (N, n + 1). - soft_label (Tensor): Target logits with shape (N, N + 1). - weight (Tensor, optional): The weight of loss for each - prediction. Defaults to None. - avg_factor (int, optional): Average factor that is used to average - the loss. Defaults to None. - reduction_override (str, optional): The reduction method used to - override the original reduction method of the loss. - Defaults to None. - """ - assert reduction_override in (None, 'none', 'mean', 'sum') - - reduction = (reduction_override - if reduction_override else self.reduction) - - loss_kd_out = self.knowledge_distillation_kl_div_loss( - pred, soft_label, T=self.T) - - if weight is not None: - loss_kd_out = weight * loss_kd_out - - if avg_factor is None: - if reduction == 'none': - loss = loss_kd_out - elif reduction == 'mean': - loss = loss_kd_out.mean() - elif reduction == 'sum': - loss = loss_kd_out.sum() - else: - # if reduction is mean, then average the loss by avg_factor - if reduction == 'mean': - loss = loss_kd_out.sum() / avg_factor - # if reduction is 'none', then do nothing, otherwise raise an error - elif reduction != 'none': - raise ValueError( - 'avg_factor can not be used with reduction="sum"') - - loss_kd = self.loss_weight * loss - return loss_kd - - -@register -class DistillPPYOLOELoss(nn.Layer): - def __init__( - self, - loss_weight={'logits': 4.0, - 'feat': 1.0}, - logits_distill=True, - logits_loss_weight={'class': 1.0, - 'iou': 2.5, - 'dfl': 0.5}, - logits_ld_distill=False, - logits_ld_params={'weight': 20000, - 'T': 10}, - feat_distill=True, - feat_distiller='fgd', - feat_distill_place='neck_feats', - teacher_width_mult=1.0, # L - student_width_mult=0.75, # M - feat_out_channels=[768, 384, 192]): - super(DistillPPYOLOELoss, self).__init__() - self.loss_weight_logits = loss_weight['logits'] - self.loss_weight_feat = loss_weight['feat'] - self.logits_distill = logits_distill - self.logits_ld_distill = logits_ld_distill - self.feat_distill = feat_distill - - if logits_distill and self.loss_weight_logits > 0: - self.bbox_loss_weight = logits_loss_weight['iou'] - self.dfl_loss_weight = logits_loss_weight['dfl'] - self.qfl_loss_weight = logits_loss_weight['class'] - self.loss_bbox = GIoULoss() - - if logits_ld_distill: - self.loss_kd = KnowledgeDistillationKLDivLoss( - loss_weight=logits_ld_params['weight'], T=logits_ld_params['T']) - - if feat_distill and self.loss_weight_feat > 0: - assert feat_distiller in ['cwd', 'fgd', 'pkd', 'mgd', 'mimic'] - assert feat_distill_place in ['backbone_feats', 'neck_feats'] - self.feat_distill_place = feat_distill_place - self.t_channel_list = [ - int(c * teacher_width_mult) for c in feat_out_channels - ] - self.s_channel_list = [ - int(c * student_width_mult) for c in feat_out_channels - ] - self.distill_feat_loss_modules = [] - for i in range(len(feat_out_channels)): - if feat_distiller == 'cwd': - feat_loss_module = CWDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True) - elif feat_distiller == 'fgd': - feat_loss_module = FGDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True, - alpha_fgd=0.00001, - beta_fgd=0.000005, - gamma_fgd=0.00001, - lambda_fgd=0.00000005) - elif feat_distiller == 'pkd': - feat_loss_module = PKDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True, - resize_stu=True) - elif feat_distiller == 'mgd': - 
feat_loss_module = MGDFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True, - loss_func='ssim') - elif feat_distiller == 'mimic': - feat_loss_module = MimicFeatureLoss( - student_channels=self.s_channel_list[i], - teacher_channels=self.t_channel_list[i], - normalize=True) - else: - raise ValueError - self.distill_feat_loss_modules.append(feat_loss_module) - - def quality_focal_loss(self, - pred_logits, - soft_target_logits, - beta=2.0, - use_sigmoid=False, - num_total_pos=None): - if use_sigmoid: - func = F.binary_cross_entropy_with_logits - soft_target = F.sigmoid(soft_target_logits) - pred_sigmoid = F.sigmoid(pred_logits) - preds = pred_logits - else: - func = F.binary_cross_entropy - soft_target = soft_target_logits - pred_sigmoid = pred_logits - preds = pred_sigmoid - - scale_factor = pred_sigmoid - soft_target - loss = func( - preds, soft_target, reduction='none') * scale_factor.abs().pow(beta) - loss = loss.sum(1) - - if num_total_pos is not None: - loss = loss.sum() / num_total_pos - else: - loss = loss.mean() - return loss - - def bbox_loss(self, s_bbox, t_bbox, weight_targets=None): - # [x,y,w,h] - if weight_targets is not None: - loss = paddle.sum(self.loss_bbox(s_bbox, t_bbox) * weight_targets) - avg_factor = weight_targets.sum() - loss = loss / avg_factor - else: - loss = paddle.mean(self.loss_bbox(s_bbox, t_bbox)) - return loss - - def distribution_focal_loss(self, - pred_corners, - target_corners, - weight_targets=None): - target_corners_label = F.softmax(target_corners, axis=-1) - loss_dfl = F.cross_entropy( - pred_corners, - target_corners_label, - soft_label=True, - reduction='none') - loss_dfl = loss_dfl.sum(1) - - if weight_targets is not None: - loss_dfl = loss_dfl * (weight_targets.expand([-1, 4]).reshape([-1])) - loss_dfl = loss_dfl.sum(-1) / weight_targets.sum() - else: - loss_dfl = loss_dfl.mean(-1) - return loss_dfl / 4.0 # 4 direction - - def main_kd(self, mask_positive, pred_scores, soft_cls, num_classes): - num_pos = mask_positive.sum() - if num_pos > 0: - cls_mask = mask_positive.unsqueeze(-1).tile([1, 1, num_classes]) - pred_scores_pos = paddle.masked_select( - pred_scores, cls_mask).reshape([-1, num_classes]) - soft_cls_pos = paddle.masked_select( - soft_cls, cls_mask).reshape([-1, num_classes]) - loss_kd = self.loss_kd( - pred_scores_pos, soft_cls_pos, avg_factor=num_pos) - else: - loss_kd = paddle.zeros([1]) - return loss_kd - - def forward(self, teacher_model, student_model): - teacher_distill_pairs = teacher_model.yolo_head.distill_pairs - student_distill_pairs = student_model.yolo_head.distill_pairs - if self.logits_distill and self.loss_weight_logits > 0: - distill_bbox_loss, distill_dfl_loss, distill_cls_loss = [], [], [] - - distill_cls_loss.append( - self.quality_focal_loss( - student_distill_pairs['pred_cls_scores'].reshape( - (-1, student_distill_pairs['pred_cls_scores'].shape[-1] - )), - teacher_distill_pairs['pred_cls_scores'].detach().reshape( - (-1, teacher_distill_pairs['pred_cls_scores'].shape[-1] - )), - num_total_pos=student_distill_pairs['pos_num'], - use_sigmoid=False)) - - distill_bbox_loss.append( - self.bbox_loss(student_distill_pairs['pred_bboxes_pos'], - teacher_distill_pairs['pred_bboxes_pos'].detach(), - weight_targets=student_distill_pairs['bbox_weight'] - ) if 'pred_bboxes_pos' in student_distill_pairs and \ - 'pred_bboxes_pos' in teacher_distill_pairs and \ - 'bbox_weight' in student_distill_pairs - else paddle.zeros([1])) - - distill_dfl_loss.append( - 
self.distribution_focal_loss( - student_distill_pairs['pred_dist_pos'].reshape((-1, student_distill_pairs['pred_dist_pos'].shape[-1])), - teacher_distill_pairs['pred_dist_pos'].detach().reshape((-1, teacher_distill_pairs['pred_dist_pos'].shape[-1])), \ - weight_targets=student_distill_pairs['bbox_weight'] - ) if 'pred_dist_pos' in student_distill_pairs and \ - 'pred_dist_pos' in teacher_distill_pairs and \ - 'bbox_weight' in student_distill_pairs - else paddle.zeros([1])) - - distill_cls_loss = paddle.add_n(distill_cls_loss) - distill_bbox_loss = paddle.add_n(distill_bbox_loss) - distill_dfl_loss = paddle.add_n(distill_dfl_loss) - logits_loss = distill_bbox_loss * self.bbox_loss_weight + distill_cls_loss * self.qfl_loss_weight + distill_dfl_loss * self.dfl_loss_weight - - if self.logits_ld_distill: - loss_kd = self.main_kd( - student_distill_pairs['mask_positive_select'], - student_distill_pairs['pred_cls_scores'], - teacher_distill_pairs['pred_cls_scores'], - student_model.yolo_head.num_classes, ) - logits_loss += loss_kd - else: - logits_loss = paddle.zeros([1]) - - if self.feat_distill and self.loss_weight_feat > 0: - feat_loss_list = [] - inputs = student_model.inputs - assert 'gt_bbox' in inputs - assert self.feat_distill_place in student_distill_pairs - assert self.feat_distill_place in teacher_distill_pairs - stu_feats = student_distill_pairs[self.feat_distill_place] - tea_feats = teacher_distill_pairs[self.feat_distill_place] - for i, loss_module in enumerate(self.distill_feat_loss_modules): - feat_loss_list.append( - loss_module(stu_feats[i], tea_feats[i], inputs)) - feat_loss = paddle.add_n(feat_loss_list) - else: - feat_loss = paddle.zeros([1]) - - student_model.yolo_head.distill_pairs.clear() - teacher_model.yolo_head.distill_pairs.clear() - return logits_loss * self.loss_weight_logits, feat_loss * self.loss_weight_feat - - -@register -class CWDFeatureLoss(nn.Layer): - def __init__(self, - student_channels, - teacher_channels, - normalize=False, - tau=1.0, - weight=1.0): - super(CWDFeatureLoss, self).__init__() - self.normalize = normalize - self.tau = tau - self.loss_weight = weight - - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0) - else: - self.align = None - - def distill_softmax(self, x, tau): - _, _, w, h = paddle.shape(x) - x = paddle.reshape(x, [-1, w * h]) - x /= tau - return F.softmax(x, axis=1) - - def forward(self, preds_s, preds_t, inputs=None): - assert preds_s.shape[-2:] == preds_t.shape[-2:] - N, C, H, W = preds_s.shape - eps = 1e-5 - if self.align is not None: - preds_s = self.align(preds_s) - - if self.normalize: - preds_s = feature_norm(preds_s) - preds_t = feature_norm(preds_t) - - softmax_pred_s = self.distill_softmax(preds_s, self.tau) - softmax_pred_t = self.distill_softmax(preds_t, self.tau) - - loss = paddle.sum(-softmax_pred_t * paddle.log(eps + softmax_pred_s) + - softmax_pred_t * paddle.log(eps + softmax_pred_t)) - return self.loss_weight * loss / (C * N) - - -@register -class FGDFeatureLoss(nn.Layer): - """ - Focal and Global Knowledge Distillation for Detectors - The code is reference from https://github.com/yzd-v/FGD/blob/master/mmdet/distillation/losses/fgd.py - - Args: - student_channels (int): The number of channels in the student's FPN feature map. Default to 256. - teacher_channels (int): The number of channels in the teacher's FPN feature map. Default to 256. - normalize (bool): Whether to normalize the feature maps. 
- temp (float, optional): The temperature coefficient. Defaults to 0.5. - alpha_fgd (float, optional): The weight of fg_loss. Defaults to 0.001 - beta_fgd (float, optional): The weight of bg_loss. Defaults to 0.0005 - gamma_fgd (float, optional): The weight of mask_loss. Defaults to 0.001 - lambda_fgd (float, optional): The weight of relation_loss. Defaults to 0.000005 - """ - - def __init__(self, - student_channels, - teacher_channels, - normalize=False, - loss_weight=1.0, - temp=0.5, - alpha_fgd=0.001, - beta_fgd=0.0005, - gamma_fgd=0.001, - lambda_fgd=0.000005): - super(FGDFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - self.temp = temp - self.alpha_fgd = alpha_fgd - self.beta_fgd = beta_fgd - self.gamma_fgd = gamma_fgd - self.lambda_fgd = lambda_fgd - kaiming_init = parameter_init("kaiming") - zeros_init = parameter_init("constant", 0.0) - - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=kaiming_init) - student_channels = teacher_channels - else: - self.align = None - - self.conv_mask_s = nn.Conv2D( - student_channels, 1, kernel_size=1, weight_attr=kaiming_init) - self.conv_mask_t = nn.Conv2D( - teacher_channels, 1, kernel_size=1, weight_attr=kaiming_init) - - self.stu_conv_block = nn.Sequential( - nn.Conv2D( - student_channels, - student_channels // 2, - kernel_size=1, - weight_attr=zeros_init), - nn.LayerNorm([student_channels // 2, 1, 1]), - nn.ReLU(), - nn.Conv2D( - student_channels // 2, - student_channels, - kernel_size=1, - weight_attr=zeros_init)) - self.tea_conv_block = nn.Sequential( - nn.Conv2D( - teacher_channels, - teacher_channels // 2, - kernel_size=1, - weight_attr=zeros_init), - nn.LayerNorm([teacher_channels // 2, 1, 1]), - nn.ReLU(), - nn.Conv2D( - teacher_channels // 2, - teacher_channels, - kernel_size=1, - weight_attr=zeros_init)) - - def spatial_channel_attention(self, x, t=0.5): - shape = paddle.shape(x) - N, C, H, W = shape - _f = paddle.abs(x) - spatial_map = paddle.reshape( - paddle.mean( - _f, axis=1, keepdim=True) / t, [N, -1]) - spatial_map = F.softmax(spatial_map, axis=1, dtype="float32") * H * W - spatial_att = paddle.reshape(spatial_map, [N, H, W]) - - channel_map = paddle.mean( - paddle.mean( - _f, axis=2, keepdim=False), axis=2, keepdim=False) - channel_att = F.softmax(channel_map / t, axis=1, dtype="float32") * C - return [spatial_att, channel_att] - - def spatial_pool(self, x, mode="teacher"): - batch, channel, width, height = x.shape - x_copy = x - x_copy = paddle.reshape(x_copy, [batch, channel, height * width]) - x_copy = x_copy.unsqueeze(1) - if mode.lower() == "student": - context_mask = self.conv_mask_s(x) - else: - context_mask = self.conv_mask_t(x) - - context_mask = paddle.reshape(context_mask, [batch, 1, height * width]) - context_mask = F.softmax(context_mask, axis=2) - context_mask = context_mask.unsqueeze(-1) - context = paddle.matmul(x_copy, context_mask) - context = paddle.reshape(context, [batch, channel, 1, 1]) - return context - - def mask_loss(self, stu_channel_att, tea_channel_att, stu_spatial_att, - tea_spatial_att): - def _func(a, b): - return paddle.sum(paddle.abs(a - b)) / len(a) - - mask_loss = _func(stu_channel_att, tea_channel_att) + _func( - stu_spatial_att, tea_spatial_att) - return mask_loss - - def feature_loss(self, stu_feature, tea_feature, mask_fg, mask_bg, - tea_channel_att, tea_spatial_att): - mask_fg = mask_fg.unsqueeze(axis=1) - mask_bg = 
mask_bg.unsqueeze(axis=1) - tea_channel_att = tea_channel_att.unsqueeze(axis=-1).unsqueeze(axis=-1) - tea_spatial_att = tea_spatial_att.unsqueeze(axis=1) - - fea_t = paddle.multiply(tea_feature, paddle.sqrt(tea_spatial_att)) - fea_t = paddle.multiply(fea_t, paddle.sqrt(tea_channel_att)) - fg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_fg)) - bg_fea_t = paddle.multiply(fea_t, paddle.sqrt(mask_bg)) - - fea_s = paddle.multiply(stu_feature, paddle.sqrt(tea_spatial_att)) - fea_s = paddle.multiply(fea_s, paddle.sqrt(tea_channel_att)) - fg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_fg)) - bg_fea_s = paddle.multiply(fea_s, paddle.sqrt(mask_bg)) - - fg_loss = F.mse_loss(fg_fea_s, fg_fea_t, reduction="sum") / len(mask_fg) - bg_loss = F.mse_loss(bg_fea_s, bg_fea_t, reduction="sum") / len(mask_bg) - return fg_loss, bg_loss - - def relation_loss(self, stu_feature, tea_feature): - context_s = self.spatial_pool(stu_feature, "student") - context_t = self.spatial_pool(tea_feature, "teacher") - out_s = stu_feature + self.stu_conv_block(context_s) - out_t = tea_feature + self.tea_conv_block(context_t) - rela_loss = F.mse_loss(out_s, out_t, reduction="sum") / len(out_s) - return rela_loss - - def mask_value(self, mask, xl, xr, yl, yr, value): - mask[xl:xr, yl:yr] = paddle.maximum(mask[xl:xr, yl:yr], value) - return mask - - def forward(self, stu_feature, tea_feature, inputs): - assert stu_feature.shape[-2:] == stu_feature.shape[-2:] - assert "gt_bbox" in inputs.keys() and "im_shape" in inputs.keys() - gt_bboxes = inputs['gt_bbox'] - ins_shape = [ - inputs['im_shape'][i] for i in range(inputs['im_shape'].shape[0]) - ] - index_gt = [] - for i in range(len(gt_bboxes)): - if gt_bboxes[i].size > 2: - index_gt.append(i) - # only distill feature with labeled GTbox - if len(index_gt) != len(gt_bboxes): - index_gt_t = paddle.to_tensor(index_gt) - stu_feature = paddle.index_select(stu_feature, index_gt_t) - tea_feature = paddle.index_select(tea_feature, index_gt_t) - - ins_shape = [ins_shape[c] for c in index_gt] - gt_bboxes = [gt_bboxes[c] for c in index_gt] - assert len(gt_bboxes) == tea_feature.shape[0] - - if self.align is not None: - stu_feature = self.align(stu_feature) - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - tea_spatial_att, tea_channel_att = self.spatial_channel_attention( - tea_feature, self.temp) - stu_spatial_att, stu_channel_att = self.spatial_channel_attention( - stu_feature, self.temp) - - mask_fg = paddle.zeros(tea_spatial_att.shape) - mask_bg = paddle.ones_like(tea_spatial_att) - one_tmp = paddle.ones([*tea_spatial_att.shape[1:]]) - zero_tmp = paddle.zeros([*tea_spatial_att.shape[1:]]) - mask_fg.stop_gradient = True - mask_bg.stop_gradient = True - one_tmp.stop_gradient = True - zero_tmp.stop_gradient = True - - wmin, wmax, hmin, hmax = [], [], [], [] - - if len(gt_bboxes) == 0: - loss = self.relation_loss(stu_feature, tea_feature) - return self.lambda_fgd * loss - - N, _, H, W = stu_feature.shape - for i in range(N): - tmp_box = paddle.ones_like(gt_bboxes[i]) - tmp_box.stop_gradient = True - tmp_box[:, 0] = gt_bboxes[i][:, 0] / ins_shape[i][1] * W - tmp_box[:, 2] = gt_bboxes[i][:, 2] / ins_shape[i][1] * W - tmp_box[:, 1] = gt_bboxes[i][:, 1] / ins_shape[i][0] * H - tmp_box[:, 3] = gt_bboxes[i][:, 3] / ins_shape[i][0] * H - - zero = paddle.zeros_like(tmp_box[:, 0], dtype="int32") - ones = paddle.ones_like(tmp_box[:, 2], dtype="int32") - zero.stop_gradient = True - ones.stop_gradient = True - wmin.append( - 
paddle.cast(paddle.floor(tmp_box[:, 0]), "int32").maximum(zero)) - wmax.append(paddle.cast(paddle.ceil(tmp_box[:, 2]), "int32")) - hmin.append( - paddle.cast(paddle.floor(tmp_box[:, 1]), "int32").maximum(zero)) - hmax.append(paddle.cast(paddle.ceil(tmp_box[:, 3]), "int32")) - - area_recip = 1.0 / ( - hmax[i].reshape([1, -1]) + 1 - hmin[i].reshape([1, -1])) / ( - wmax[i].reshape([1, -1]) + 1 - wmin[i].reshape([1, -1])) - - for j in range(len(gt_bboxes[i])): - if gt_bboxes[i][j].sum() > 0: - mask_fg[i] = self.mask_value( - mask_fg[i], hmin[i][j], hmax[i][j] + 1, wmin[i][j], - wmax[i][j] + 1, area_recip[0][j]) - - mask_bg[i] = paddle.where(mask_fg[i] > zero_tmp, zero_tmp, one_tmp) - - if paddle.sum(mask_bg[i]): - mask_bg[i] /= paddle.sum(mask_bg[i]) - - fg_loss, bg_loss = self.feature_loss(stu_feature, tea_feature, mask_fg, - mask_bg, tea_channel_att, - tea_spatial_att) - mask_loss = self.mask_loss(stu_channel_att, tea_channel_att, - stu_spatial_att, tea_spatial_att) - rela_loss = self.relation_loss(stu_feature, tea_feature) - loss = self.alpha_fgd * fg_loss + self.beta_fgd * bg_loss \ - + self.gamma_fgd * mask_loss + self.lambda_fgd * rela_loss - return loss * self.loss_weight - - -@register -class PKDFeatureLoss(nn.Layer): - """ - PKD: General Distillation Framework for Object Detectors via Pearson Correlation Coefficient. - - Args: - loss_weight (float): Weight of loss. Defaults to 1.0. - resize_stu (bool): If True, we'll down/up sample the features of the - student model to the spatial size of those of the teacher model if - their spatial sizes are different. And vice versa. Defaults to - True. - """ - - def __init__(self, - student_channels=256, - teacher_channels=256, - normalize=True, - loss_weight=1.0, - resize_stu=True): - super(PKDFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - self.resize_stu = resize_stu - - def forward(self, stu_feature, tea_feature, inputs=None): - size_s, size_t = stu_feature.shape[2:], tea_feature.shape[2:] - if size_s[0] != size_t[0]: - if self.resize_stu: - stu_feature = F.interpolate( - stu_feature, size_t, mode='bilinear') - else: - tea_feature = F.interpolate( - tea_feature, size_s, mode='bilinear') - assert stu_feature.shape == tea_feature.shape - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - loss = F.mse_loss(stu_feature, tea_feature) / 2 - return loss * self.loss_weight - - -@register -class MimicFeatureLoss(nn.Layer): - def __init__(self, - student_channels=256, - teacher_channels=256, - normalize=True, - loss_weight=1.0): - super(MimicFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - self.mse_loss = nn.MSELoss() - - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0) - else: - self.align = None - - def forward(self, stu_feature, tea_feature, inputs=None): - if self.align is not None: - stu_feature = self.align(stu_feature) - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - loss = self.mse_loss(stu_feature, tea_feature) - return loss * self.loss_weight - - -@register -class MGDFeatureLoss(nn.Layer): - def __init__(self, - student_channels=256, - teacher_channels=256, - normalize=True, - loss_weight=1.0, - loss_func='mse'): - super(MGDFeatureLoss, self).__init__() - self.normalize = normalize - self.loss_weight = loss_weight - assert loss_func in 
['mse', 'ssim'] - self.loss_func = loss_func - self.mse_loss = nn.MSELoss(reduction='sum') - self.ssim_loss = SSIM(11) - - kaiming_init = parameter_init("kaiming") - if student_channels != teacher_channels: - self.align = nn.Conv2D( - student_channels, - teacher_channels, - kernel_size=1, - stride=1, - padding=0, - weight_attr=kaiming_init, - bias_attr=False) - else: - self.align = None - - self.generation = nn.Sequential( - nn.Conv2D( - teacher_channels, teacher_channels, kernel_size=3, padding=1), - nn.ReLU(), - nn.Conv2D( - teacher_channels, teacher_channels, kernel_size=3, padding=1)) - - def forward(self, stu_feature, tea_feature, inputs=None): - N = stu_feature.shape[0] - if self.align is not None: - stu_feature = self.align(stu_feature) - stu_feature = self.generation(stu_feature) - - if self.normalize: - stu_feature = feature_norm(stu_feature) - tea_feature = feature_norm(tea_feature) - - if self.loss_func == 'mse': - loss = self.mse_loss(stu_feature, tea_feature) / N - elif self.loss_func == 'ssim': - ssim_loss = self.ssim_loss(stu_feature, tea_feature) - loss = paddle.clip((1 - ssim_loss) / 2, 0, 1) - else: - raise ValueError - return loss * self.loss_weight - - -class SSIM(nn.Layer): - def __init__(self, window_size=11, size_average=True): - super(SSIM, self).__init__() - self.window_size = window_size - self.size_average = size_average - self.channel = 1 - self.window = self.create_window(window_size, self.channel) - - def gaussian(self, window_size, sigma): - gauss = paddle.to_tensor([ - math.exp(-(x - window_size // 2)**2 / float(2 * sigma**2)) - for x in range(window_size) - ]) - return gauss / gauss.sum() - - def create_window(self, window_size, channel): - _1D_window = self.gaussian(window_size, 1.5).unsqueeze(1) - _2D_window = _1D_window.mm(_1D_window.t()).unsqueeze(0).unsqueeze(0) - window = _2D_window.expand([channel, 1, window_size, window_size]) - return window - - def _ssim(self, img1, img2, window, window_size, channel, - size_average=True): - mu1 = F.conv2d(img1, window, padding=window_size // 2, groups=channel) - mu2 = F.conv2d(img2, window, padding=window_size // 2, groups=channel) - mu1_sq = mu1.pow(2) - mu2_sq = mu2.pow(2) - mu1_mu2 = mu1 * mu2 - - sigma1_sq = F.conv2d( - img1 * img1, window, padding=window_size // 2, - groups=channel) - mu1_sq - sigma2_sq = F.conv2d( - img2 * img2, window, padding=window_size // 2, - groups=channel) - mu2_sq - sigma12 = F.conv2d( - img1 * img2, window, padding=window_size // 2, - groups=channel) - mu1_mu2 - - C1 = 0.01**2 - C2 = 0.03**2 - ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ( - 1e-12 + (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)) - - if size_average: - return ssim_map.mean() - else: - return ssim_map.mean([1, 2, 3]) - - def forward(self, img1, img2): - channel = img1.shape[1] - if channel == self.channel and self.window.dtype == img1.dtype: - window = self.window - else: - window = self.create_window(self.window_size, channel) - self.window = window - self.channel = channel - - return self._ssim(img1, img2, window, self.window_size, channel, - self.size_average) diff --git a/pdfdet/models/Paddle/ppdet/slim/distill_model.py b/pdfdet/models/Paddle/ppdet/slim/distill_model.py deleted file mode 100644 index 4fa3ccc..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/distill_model.py +++ /dev/null @@ -1,352 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
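
As an aside on the SSIM layer above: stripped of the Gaussian window and per-channel grouping, the structural-similarity index reduces to a ratio built from means, variances, and covariance with the C1/C2 stabilizers. A windowless sketch in plain Paddle, illustrative only and not the deleted layer itself:

    import paddle

    def global_ssim(x, y, c1=0.01 ** 2, c2=0.03 ** 2):
        # SSIM over whole tensors: no sliding window, one global statistic.
        mu_x, mu_y = x.mean(), y.mean()
        var_x = ((x - mu_x) ** 2).mean()
        var_y = ((y - mu_y) ** 2).mean()
        cov_xy = ((x - mu_x) * (y - mu_y)).mean()
        num = (2 * mu_x * mu_y + c1) * (2 * cov_xy + c2)
        den = (mu_x ** 2 + mu_y ** 2 + c1) * (var_x + var_y + c2)
        return num / den

    x = paddle.rand([1, 8, 16, 16])
    print(global_ssim(x, x))  # ~1.0 for identical inputs
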
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn - -from ppdet.core.workspace import register, create, load_config -from ppdet.utils.checkpoint import load_pretrain_weight -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'DistillModel', - 'FGDDistillModel', - 'CWDDistillModel', - 'LDDistillModel', - 'PPYOLOEDistillModel', -] - - -@register -class DistillModel(nn.Layer): - """ - Build common distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. - """ - - def __init__(self, cfg, slim_cfg): - super(DistillModel, self).__init__() - self.arch = cfg.architecture - - self.stu_cfg = cfg - self.student_model = create(self.stu_cfg.architecture) - if 'pretrain_weights' in self.stu_cfg and self.stu_cfg.pretrain_weights: - stu_pretrain = self.stu_cfg.pretrain_weights - else: - stu_pretrain = None - - slim_cfg = load_config(slim_cfg) - self.tea_cfg = slim_cfg - self.teacher_model = create(self.tea_cfg.architecture) - if 'pretrain_weights' in self.tea_cfg and self.tea_cfg.pretrain_weights: - tea_pretrain = self.tea_cfg.pretrain_weights - else: - tea_pretrain = None - self.distill_cfg = slim_cfg - - # load pretrain weights - self.is_inherit = False - if stu_pretrain: - if self.is_inherit and tea_pretrain: - load_pretrain_weight(self.student_model, tea_pretrain) - logger.debug( - "Inheriting! loading teacher weights to student model!") - load_pretrain_weight(self.student_model, stu_pretrain) - logger.info("Student model has loaded pretrain weights!") - if tea_pretrain: - load_pretrain_weight(self.teacher_model, tea_pretrain) - logger.info("Teacher model has loaded pretrain weights!") - - self.teacher_model.eval() - for param in self.teacher_model.parameters(): - param.trainable = False - - self.distill_loss = self.build_loss(self.distill_cfg) - - def build_loss(self, distill_cfg): - if 'distill_loss' in distill_cfg and distill_cfg.distill_loss: - return create(distill_cfg.distill_loss) - else: - return None - - def parameters(self): - return self.student_model.parameters() - - def forward(self, inputs): - if self.training: - student_loss = self.student_model(inputs) - with paddle.no_grad(): - teacher_loss = self.teacher_model(inputs) - - loss = self.distill_loss(self.teacher_model, self.student_model) - student_loss['distill_loss'] = loss - student_loss['teacher_loss'] = teacher_loss['loss'] - student_loss['loss'] += student_loss['distill_loss'] - return student_loss - else: - return self.student_model(inputs) - - -@register -class FGDDistillModel(DistillModel): - """ - Build FGD distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. 
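
DistillModel above captures the standard wrapper: freeze the teacher, run both networks on the same batch, and add a distillation term to the student loss while the optimizer only sees student parameters. A minimal sketch of that pattern with toy modules and a stand-in loss (both hypothetical):

    import paddle
    import paddle.nn as nn
    import paddle.nn.functional as F

    class ToyDistill(nn.Layer):
        def __init__(self, student, teacher):
            super().__init__()
            self.student, self.teacher = student, teacher
            self.teacher.eval()
            for p in self.teacher.parameters():   # freeze teacher weights
                p.trainable = False

        def parameters(self):                     # optimizer sees only the student
            return self.student.parameters()

        def forward(self, x):
            s_out = self.student(x)
            with paddle.no_grad():
                t_out = self.teacher(x)
            return F.mse_loss(s_out, t_out)       # stand-in distillation loss

    student, teacher = nn.Linear(4, 2), nn.Linear(4, 2)
    loss = ToyDistill(student, teacher)(paddle.rand([8, 4]))
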
- """ - - def __init__(self, cfg, slim_cfg): - super(FGDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['RetinaNet', 'PicoDet' - ], 'Unsupported arch: {}'.format(self.arch) - self.is_inherit = True - - def build_loss(self, distill_cfg): - assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name - assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss - loss_func = dict() - name_list = distill_cfg.distill_loss_name - for name in name_list: - loss_func[name] = create(distill_cfg.distill_loss) - return loss_func - - def forward(self, inputs): - if self.training: - s_body_feats = self.student_model.backbone(inputs) - s_neck_feats = self.student_model.neck(s_body_feats) - with paddle.no_grad(): - t_body_feats = self.teacher_model.backbone(inputs) - t_neck_feats = self.teacher_model.neck(t_body_feats) - - loss_dict = {} - for idx, k in enumerate(self.distill_loss): - loss_dict[k] = self.distill_loss[k](s_neck_feats[idx], - t_neck_feats[idx], inputs) - if self.arch == "RetinaNet": - loss = self.student_model.head(s_neck_feats, inputs) - elif self.arch == "PicoDet": - head_outs = self.student_model.head( - s_neck_feats, self.student_model.export_post_process) - loss_gfl = self.student_model.head.get_loss(head_outs, inputs) - total_loss = paddle.add_n(list(loss_gfl.values())) - loss = {} - loss.update(loss_gfl) - loss.update({'loss': total_loss}) - else: - raise ValueError(f"Unsupported model {self.arch}") - - for k in loss_dict: - loss['loss'] += loss_dict[k] - loss[k] = loss_dict[k] - return loss - else: - body_feats = self.student_model.backbone(inputs) - neck_feats = self.student_model.neck(body_feats) - head_outs = self.student_model.head(neck_feats) - if self.arch == "RetinaNet": - bbox, bbox_num = self.student_model.head.post_process( - head_outs, inputs['im_shape'], inputs['scale_factor']) - return {'bbox': bbox, 'bbox_num': bbox_num} - elif self.arch == "PicoDet": - head_outs = self.student_model.head( - neck_feats, self.student_model.export_post_process) - scale_factor = inputs['scale_factor'] - bboxes, bbox_num = self.student_model.head.post_process( - head_outs, - scale_factor, - export_nms=self.student_model.export_nms) - return {'bbox': bboxes, 'bbox_num': bbox_num} - else: - raise ValueError(f"Unsupported model {self.arch}") - - -@register -class CWDDistillModel(DistillModel): - """ - Build CWD distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. 
- """ - - def __init__(self, cfg, slim_cfg): - super(CWDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['GFL', 'RetinaNet'], 'Unsupported arch: {}'.format( - self.arch) - - def build_loss(self, distill_cfg): - assert 'distill_loss_name' in distill_cfg and distill_cfg.distill_loss_name - assert 'distill_loss' in distill_cfg and distill_cfg.distill_loss - loss_func = dict() - name_list = distill_cfg.distill_loss_name - for name in name_list: - loss_func[name] = create(distill_cfg.distill_loss) - return loss_func - - def get_loss_retinanet(self, stu_fea_list, tea_fea_list, inputs): - loss = self.student_model.head(stu_fea_list, inputs) - loss_dict = {} - for idx, k in enumerate(self.distill_loss): - loss_dict[k] = self.distill_loss[k](stu_fea_list[idx], - tea_fea_list[idx]) - - loss['loss'] += loss_dict[k] - loss[k] = loss_dict[k] - return loss - - def get_loss_gfl(self, stu_fea_list, tea_fea_list, inputs): - loss = {} - head_outs = self.student_model.head(stu_fea_list) - loss_gfl = self.student_model.head.get_loss(head_outs, inputs) - loss.update(loss_gfl) - total_loss = paddle.add_n(list(loss.values())) - loss.update({'loss': total_loss}) - - feat_loss = {} - loss_dict = {} - s_cls_feat, t_cls_feat = [], [] - for s_neck_f, t_neck_f in zip(stu_fea_list, tea_fea_list): - conv_cls_feat, _ = self.student_model.head.conv_feat(s_neck_f) - cls_score = self.student_model.head.gfl_head_cls(conv_cls_feat) - t_conv_cls_feat, _ = self.teacher_model.head.conv_feat(t_neck_f) - t_cls_score = self.teacher_model.head.gfl_head_cls(t_conv_cls_feat) - s_cls_feat.append(cls_score) - t_cls_feat.append(t_cls_score) - - for idx, k in enumerate(self.distill_loss): - loss_dict[k] = self.distill_loss[k](s_cls_feat[idx], - t_cls_feat[idx]) - feat_loss[f"neck_f_{idx}"] = self.distill_loss[k](stu_fea_list[idx], - tea_fea_list[idx]) - - for k in feat_loss: - loss['loss'] += feat_loss[k] - loss[k] = feat_loss[k] - - for k in loss_dict: - loss['loss'] += loss_dict[k] - loss[k] = loss_dict[k] - return loss - - def forward(self, inputs): - if self.training: - s_body_feats = self.student_model.backbone(inputs) - s_neck_feats = self.student_model.neck(s_body_feats) - with paddle.no_grad(): - t_body_feats = self.teacher_model.backbone(inputs) - t_neck_feats = self.teacher_model.neck(t_body_feats) - - if self.arch == "RetinaNet": - loss = self.get_loss_retinanet(s_neck_feats, t_neck_feats, - inputs) - elif self.arch == "GFL": - loss = self.get_loss_gfl(s_neck_feats, t_neck_feats, inputs) - else: - raise ValueError(f"unsupported arch {self.arch}") - return loss - else: - body_feats = self.student_model.backbone(inputs) - neck_feats = self.student_model.neck(body_feats) - head_outs = self.student_model.head(neck_feats) - if self.arch == "RetinaNet": - bbox, bbox_num = self.student_model.head.post_process( - head_outs, inputs['im_shape'], inputs['scale_factor']) - return {'bbox': bbox, 'bbox_num': bbox_num} - elif self.arch == "GFL": - bbox_pred, bbox_num = head_outs - output = {'bbox': bbox_pred, 'bbox_num': bbox_num} - return output - else: - raise ValueError(f"unsupported arch {self.arch}") - - -@register -class LDDistillModel(DistillModel): - """ - Build LD distill model. - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. 
- """ - - def __init__(self, cfg, slim_cfg): - super(LDDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['GFL'], 'Unsupported arch: {}'.format(self.arch) - - def forward(self, inputs): - if self.training: - s_body_feats = self.student_model.backbone(inputs) - s_neck_feats = self.student_model.neck(s_body_feats) - s_head_outs = self.student_model.head(s_neck_feats) - with paddle.no_grad(): - t_body_feats = self.teacher_model.backbone(inputs) - t_neck_feats = self.teacher_model.neck(t_body_feats) - t_head_outs = self.teacher_model.head(t_neck_feats) - - soft_label_list = t_head_outs[0] - soft_targets_list = t_head_outs[1] - student_loss = self.student_model.head.get_loss( - s_head_outs, inputs, soft_label_list, soft_targets_list) - total_loss = paddle.add_n(list(student_loss.values())) - student_loss['loss'] = total_loss - return student_loss - else: - return self.student_model(inputs) - - -@register -class PPYOLOEDistillModel(DistillModel): - """ - Build PPYOLOE distill model, only used in PPYOLOE - Args: - cfg: The student config. - slim_cfg: The teacher and distill config. - """ - - def __init__(self, cfg, slim_cfg): - super(PPYOLOEDistillModel, self).__init__(cfg=cfg, slim_cfg=slim_cfg) - assert self.arch in ['PPYOLOE'], 'Unsupported arch: {}'.format( - self.arch) - - def forward(self, inputs, alpha=0.125): - if self.training: - with paddle.no_grad(): - teacher_loss = self.teacher_model(inputs) - if hasattr(self.teacher_model.yolo_head, "assigned_labels"): - self.student_model.yolo_head.assigned_labels, self.student_model.yolo_head.assigned_bboxes, self.student_model.yolo_head.assigned_scores = \ - self.teacher_model.yolo_head.assigned_labels, self.teacher_model.yolo_head.assigned_bboxes, self.teacher_model.yolo_head.assigned_scores - delattr(self.teacher_model.yolo_head, "assigned_labels") - delattr(self.teacher_model.yolo_head, "assigned_bboxes") - delattr(self.teacher_model.yolo_head, "assigned_scores") - student_loss = self.student_model(inputs) - - logits_loss, feat_loss = self.distill_loss(self.teacher_model, - self.student_model) - det_total_loss = student_loss['loss'] - total_loss = alpha * (det_total_loss + logits_loss + feat_loss) - student_loss['loss'] = total_loss - student_loss['det_loss'] = det_total_loss - student_loss['logits_loss'] = logits_loss - student_loss['feat_loss'] = feat_loss - return student_loss - else: - return self.student_model(inputs) diff --git a/pdfdet/models/Paddle/ppdet/slim/ofa.py b/pdfdet/models/Paddle/ppdet/slim/ofa.py deleted file mode 100644 index b75edac..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/ofa.py +++ /dev/null @@ -1,89 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -import paddle.nn as nn -import paddle.nn.functional as F - -from ppdet.core.workspace import load_config, merge_config, create -from ppdet.utils.checkpoint import load_weight, load_pretrain_weight -from ppdet.utils.logger import setup_logger -from ppdet.core.workspace import register, serializable - -from paddle.utils import try_import - -logger = setup_logger(__name__) - - -@register -@serializable -class OFA(object): - def __init__(self, ofa_config): - super(OFA, self).__init__() - self.ofa_config = ofa_config - - def __call__(self, model, param_state_dict): - - paddleslim = try_import('paddleslim') - from paddleslim.nas.ofa import OFA, RunConfig, utils - from paddleslim.nas.ofa.convert_super import Convert, supernet - task = self.ofa_config['task'] - 
expand_ratio = self.ofa_config['expand_ratio'] - - skip_neck = self.ofa_config['skip_neck'] - skip_head = self.ofa_config['skip_head'] - - run_config = self.ofa_config['RunConfig'] - if 'skip_layers' in run_config: - skip_layers = run_config['skip_layers'] - else: - skip_layers = [] - - # supernet config - sp_config = supernet(expand_ratio=expand_ratio) - # convert to supernet - model = Convert(sp_config).convert(model) - - skip_names = [] - if skip_neck: - skip_names.append('neck.') - if skip_head: - skip_names.append('head.') - - for name, sublayer in model.named_sublayers(): - for n in skip_names: - if n in name: - skip_layers.append(name) - - run_config['skip_layers'] = skip_layers - run_config = RunConfig(**run_config) - - # build ofa model - ofa_model = OFA(model, run_config=run_config) - - ofa_model.set_epoch(0) - ofa_model.set_task(task) - - input_spec = [{ - "image": paddle.ones( - shape=[1, 3, 640, 640], dtype='float32'), - "im_shape": paddle.full( - [1, 2], 640, dtype='float32'), - "scale_factor": paddle.ones( - shape=[1, 2], dtype='float32') - }] - - ofa_model._clear_search_space(input_spec=input_spec) - ofa_model._build_ss = True - check_ss = ofa_model._sample_config('expand_ratio', phase=None) - # tokenize the search space - ofa_model.tokenize() - # check token map, search cands and search space - logger.info('Token map is {}'.format(ofa_model.token_map)) - logger.info('Search candidates is {}'.format(ofa_model.search_cands)) - logger.info('The length of search_space is {}, search_space is {}'. - format(len(ofa_model._ofa_layers), ofa_model._ofa_layers)) - # set model state dict into ofa model - utils.set_state_dict(ofa_model.model, param_state_dict) - return ofa_model diff --git a/pdfdet/models/Paddle/ppdet/slim/prune.py b/pdfdet/models/Paddle/ppdet/slim/prune.py deleted file mode 100644 index 28ffb75..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/prune.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
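
In outline, the OFA hook above converts a trained network into a PaddleSlim supernet whose channel widths can shrink, then wraps it for progressive-shrinking search. A minimal sketch of that call sequence on a toy network (requires paddleslim; RunConfig fields beyond skip_layers are left at defaults and may vary by version):

    import paddle.nn as nn
    from paddleslim.nas.ofa import OFA, RunConfig
    from paddleslim.nas.ofa.convert_super import Convert, supernet

    # Toy conv stack standing in for a detector backbone.
    model = nn.Sequential(
        nn.Conv2D(3, 16, 3, padding=1), nn.ReLU(),
        nn.Conv2D(16, 32, 3, padding=1))

    # Make channel counts elastic by these ratios, then wrap as an OFA supernet.
    sp_config = supernet(expand_ratio=[0.25, 0.5, 1.0])
    super_model = Convert(sp_config).convert(model)
    ofa_model = OFA(super_model, run_config=RunConfig(skip_layers=[]))
    ofa_model.set_epoch(0)
    ofa_model.set_task('expand_ratio')
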
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import paddle -from paddle.utils import try_import - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -def print_prune_params(model): - model_dict = model.state_dict() - for key in model_dict.keys(): - weight_name = model_dict[key].name - logger.info('Parameter name: {}, shape: {}'.format( - weight_name, model_dict[key].shape)) - - -@register -@serializable -class Pruner(object): - def __init__(self, - criterion, - pruned_params, - pruned_ratios, - print_params=False): - super(Pruner, self).__init__() - assert criterion in ['l1_norm', 'fpgm'], \ - "unsupported prune criterion: {}".format(criterion) - self.criterion = criterion - self.pruned_params = pruned_params - self.pruned_ratios = pruned_ratios - self.print_params = print_params - - def __call__(self, model): - # FIXME: adapt to network graph when Training and inference are - # inconsistent, now only supports prune inference network graph. - model.eval() - paddleslim = try_import('paddleslim') - from paddleslim.analysis import dygraph_flops as flops - input_spec = [{ - "image": paddle.ones( - shape=[1, 3, 640, 640], dtype='float32'), - "im_shape": paddle.full( - [1, 2], 640, dtype='float32'), - "scale_factor": paddle.ones( - shape=[1, 2], dtype='float32') - }] - if self.print_params: - print_prune_params(model) - - ori_flops = flops(model, input_spec) / (1000**3) - logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) - if self.criterion == 'fpgm': - pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) - elif self.criterion == 'l1_norm': - pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) - - logger.info("pruned params: {}".format(self.pruned_params)) - pruned_ratios = [float(n) for n in self.pruned_ratios] - ratios = {} - for i, param in enumerate(self.pruned_params): - ratios[param] = pruned_ratios[i] - pruner.prune_vars(ratios, [0]) - pruned_flops = flops(model, input_spec) / (1000**3) - logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( - pruned_flops, (ori_flops - pruned_flops) / ori_flops)) - - return model - - -@register -@serializable -class PrunerQAT(object): - def __init__(self, criterion, pruned_params, pruned_ratios, - print_prune_params, quant_config, print_qat_model): - super(PrunerQAT, self).__init__() - assert criterion in ['l1_norm', 'fpgm'], \ - "unsupported prune criterion: {}".format(criterion) - # Pruner hyperparameter - self.criterion = criterion - self.pruned_params = pruned_params - self.pruned_ratios = pruned_ratios - self.print_prune_params = print_prune_params - # QAT hyperparameter - self.quant_config = quant_config - self.print_qat_model = print_qat_model - - def __call__(self, model): - # FIXME: adapt to network graph when Training and inference are - # inconsistent, now only supports prune inference network graph. 
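
The bookkeeping inside Pruner.__call__ is straightforward: pair each parameter name with its ratio, prune along the output-channel axis, and report the relative FLOPs saved. A dependency-free sketch of that arithmetic (parameter names and FLOPs figures hypothetical):

    pruned_params = ['conv2.weights', 'conv3.weights']   # hypothetical names
    pruned_ratios = ['0.3', '0.5']                       # YAML often yields strings

    ratios = {p: float(r) for p, r in zip(pruned_params, pruned_ratios)}
    # e.g. pruner.prune_vars(ratios, [0]) prunes axis 0 (output channels)

    ori_gflops, pruned_gflops = 41.8, 27.6               # e.g. from dygraph_flops
    print('pruned ratio: {:.3f}'.format(
        (ori_gflops - pruned_gflops) / ori_gflops))
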
- model.eval() - paddleslim = try_import('paddleslim') - from paddleslim.analysis import dygraph_flops as flops - input_spec = [{ - "image": paddle.ones( - shape=[1, 3, 640, 640], dtype='float32'), - "im_shape": paddle.full( - [1, 2], 640, dtype='float32'), - "scale_factor": paddle.ones( - shape=[1, 2], dtype='float32') - }] - if self.print_prune_params: - print_prune_params(model) - - ori_flops = flops(model, input_spec) / 1000 - logger.info("FLOPs before pruning: {}GFLOPs".format(ori_flops)) - if self.criterion == 'fpgm': - pruner = paddleslim.dygraph.FPGMFilterPruner(model, input_spec) - elif self.criterion == 'l1_norm': - pruner = paddleslim.dygraph.L1NormFilterPruner(model, input_spec) - - logger.info("pruned params: {}".format(self.pruned_params)) - pruned_ratios = [float(n) for n in self.pruned_ratios] - ratios = {} - for i, param in enumerate(self.pruned_params): - ratios[param] = pruned_ratios[i] - pruner.prune_vars(ratios, [0]) - pruned_flops = flops(model, input_spec) / 1000 - logger.info("FLOPs after pruning: {}GFLOPs; pruned ratio: {}".format( - pruned_flops, (ori_flops - pruned_flops) / ori_flops)) - - self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) - - self.quanter.quantize(model) - - if self.print_qat_model: - logger.info("Quantized model:") - logger.info(model) - - return model - - def save_quantized_model(self, layer, path, input_spec=None, **config): - self.quanter.save_quantized_model( - model=layer, path=path, input_spec=input_spec, **config) diff --git a/pdfdet/models/Paddle/ppdet/slim/quant.py b/pdfdet/models/Paddle/ppdet/slim/quant.py deleted file mode 100644 index 4450819..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/quant.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from paddle.utils import try_import - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class QAT(object): - def __init__(self, quant_config, print_model): - super(QAT, self).__init__() - self.quant_config = quant_config - self.print_model = print_model - - def __call__(self, model): - paddleslim = try_import('paddleslim') - self.quanter = paddleslim.dygraph.quant.QAT(config=self.quant_config) - if self.print_model: - logger.info("Model before quant:") - logger.info(model) - - # For PP-YOLOE, convert model to deploy firstly. 
- for layer in model.sublayers(): - if hasattr(layer, 'convert_to_deploy'): - layer.convert_to_deploy() - - self.quanter.quantize(model) - - if self.print_model: - logger.info("Quantized model:") - logger.info(model) - - return model - - def save_quantized_model(self, layer, path, input_spec=None, **config): - self.quanter.save_quantized_model( - model=layer, path=path, input_spec=input_spec, **config) - - -@register -@serializable -class PTQ(object): - def __init__(self, - ptq_config, - quant_batch_num=10, - output_dir='output_inference', - fuse=True, - fuse_list=None): - super(PTQ, self).__init__() - self.ptq_config = ptq_config - self.quant_batch_num = quant_batch_num - self.output_dir = output_dir - self.fuse = fuse - self.fuse_list = fuse_list - - def __call__(self, model): - paddleslim = try_import('paddleslim') - self.ptq = paddleslim.PTQ(**self.ptq_config) - model.eval() - quant_model = self.ptq.quantize( - model, fuse=self.fuse, fuse_list=self.fuse_list) - - return quant_model - - def save_quantized_model(self, - quant_model, - quantize_model_path, - input_spec=None): - self.ptq.save_quantized_model(quant_model, quantize_model_path, - input_spec) diff --git a/pdfdet/models/Paddle/ppdet/slim/unstructured_prune.py b/pdfdet/models/Paddle/ppdet/slim/unstructured_prune.py deleted file mode 100644 index 1dc876a..0000000 --- a/pdfdet/models/Paddle/ppdet/slim/unstructured_prune.py +++ /dev/null @@ -1,66 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
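
For orientation, the QAT wrapper above delegates to PaddleSlim's dygraph quantizer: build it from a config dict, call quantize() to insert fake-quant ops, fine-tune, then export. A minimal sketch (the config keys are common PaddleSlim options and may need adjusting per version):

    import paddle.nn as nn
    from paddleslim.dygraph.quant import QAT as SlimQAT

    quant_config = {
        'weight_quantize_type': 'channel_wise_abs_max',
        'activation_quantize_type': 'moving_average_abs_max',
    }

    model = nn.Sequential(nn.Conv2D(3, 8, 3), nn.ReLU())
    quanter = SlimQAT(config=quant_config)
    quanter.quantize(model)   # rewrites the model in place with fake-quant ops
    # ... run quantization-aware fine-tuning, then:
    # quanter.save_quantized_model(model, 'output/qat_model', input_spec=[...])
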
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from paddle.utils import try_import - -from ppdet.core.workspace import register, serializable -from ppdet.utils.logger import setup_logger -logger = setup_logger(__name__) - - -@register -@serializable -class UnstructuredPruner(object): - def __init__(self, - stable_epochs, - pruning_epochs, - tunning_epochs, - pruning_steps, - ratio, - initial_ratio, - prune_params_type=None): - self.stable_epochs = stable_epochs - self.pruning_epochs = pruning_epochs - self.tunning_epochs = tunning_epochs - self.ratio = ratio - self.prune_params_type = prune_params_type - self.initial_ratio = initial_ratio - self.pruning_steps = pruning_steps - - def __call__(self, model, steps_per_epoch, skip_params_func=None): - paddleslim = try_import('paddleslim') - from paddleslim import GMPUnstructuredPruner - configs = { - 'pruning_strategy': 'gmp', - 'stable_iterations': self.stable_epochs * steps_per_epoch, - 'pruning_iterations': self.pruning_epochs * steps_per_epoch, - 'tunning_iterations': self.tunning_epochs * steps_per_epoch, - 'resume_iteration': 0, - 'pruning_steps': self.pruning_steps, - 'initial_ratio': self.initial_ratio, - } - - pruner = GMPUnstructuredPruner( - model, - ratio=self.ratio, - skip_params_func=skip_params_func, - prune_params_type=self.prune_params_type, - local_sparsity=True, - configs=configs) - - return pruner diff --git a/pdfdet/models/Paddle/ppdet/utils/__init__.py b/pdfdet/models/Paddle/ppdet/utils/__init__.py deleted file mode 100644 index d0c32e2..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
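
The GMP configuration above is essentially unit conversion: the YAML speaks in epochs, the pruner in iterations, so each phase length is multiplied by steps_per_epoch. A sketch of that arithmetic with hypothetical settings:

    steps_per_epoch = 500                    # hypothetical
    stable_epochs, pruning_epochs, tunning_epochs = 1, 8, 3

    configs = {
        'pruning_strategy': 'gmp',
        'stable_iterations': stable_epochs * steps_per_epoch,    # train dense
        'pruning_iterations': pruning_epochs * steps_per_epoch,  # ramp sparsity
        'tunning_iterations': tunning_epochs * steps_per_epoch,  # fixed-mask tune
        'resume_iteration': 0,
        'pruning_steps': 100,      # how many times the ratio is raised
        'initial_ratio': 0.15,     # starting sparsity on the ramp
    }
    print(configs['stable_iterations'], configs['pruning_iterations'])
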
diff --git a/pdfdet/models/Paddle/ppdet/utils/cam_utils.py b/pdfdet/models/Paddle/ppdet/utils/cam_utils.py deleted file mode 100644 index d2f7a47..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/cam_utils.py +++ /dev/null @@ -1,343 +0,0 @@ -import numpy as np -import cv2 -import os -import sys -import glob -from ppdet.utils.logger import setup_logger -import copy -logger = setup_logger('ppdet_cam') - -import paddle -from ppdet.engine import Trainer - - -def get_test_images(infer_dir, infer_img): - """ - Get image path list in TEST mode - """ - assert infer_img is not None or infer_dir is not None, \ - "--infer_img or --infer_dir should be set" - assert infer_img is None or os.path.isfile(infer_img), \ - "{} is not a file".format(infer_img) - assert infer_dir is None or os.path.isdir(infer_dir), \ - "{} is not a directory".format(infer_dir) - - # infer_img has a higher priority - if infer_img and os.path.isfile(infer_img): - return [infer_img] - - images = set() - infer_dir = os.path.abspath(infer_dir) - assert os.path.isdir(infer_dir), \ - "infer_dir {} is not a directory".format(infer_dir) - exts = ['jpg', 'jpeg', 'png', 'bmp'] - exts += [ext.upper() for ext in exts] - for ext in exts: - images.update(glob.glob('{}/*.{}'.format(infer_dir, ext))) - images = list(images) - - assert len(images) > 0, "no image found in {}".format(infer_dir) - logger.info("Found {} inference images in total.".format(len(images))) - - return images - - -def compute_ious(boxes1, boxes2): - """[Compute pairwise IOU matrix for given two sets of boxes] - - Args: - boxes1 ([numpy ndarray with shape N,4]): [representing bounding boxes with format (xmin,ymin,xmax,ymax)] - boxes2 ([numpy ndarray with shape M,4]): [representing bounding boxes with format (xmin,ymin,xmax,ymax)] - Returns: - pairwise IOU maxtrix with shape (N,M),where the value at ith row jth column hold the iou between ith - box and jth box from box1 and box2 respectively. - """ - lu = np.maximum( - boxes1[:, None, :2], boxes2[:, :2] - ) # lu with shape N,M,2 ; boxes1[:,None,:2] with shape (N,1,2) boxes2 with shape(M,2) - rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:]) # rd same to lu - intersection_wh = np.maximum(0.0, rd - lu) - intersection_area = intersection_wh[:, :, - 0] * intersection_wh[:, :, - 1] # with shape (N,M) - boxes1_wh = np.maximum(0.0, boxes1[:, 2:] - boxes1[:, :2]) - boxes1_area = boxes1_wh[:, 0] * boxes1_wh[:, 1] # with shape (N,) - boxes2_wh = np.maximum(0.0, boxes2[:, 2:] - boxes2[:, :2]) - boxes2_area = boxes2_wh[:, 0] * boxes2_wh[:, 1] # with shape (M,) - union_area = np.maximum( - boxes1_area[:, None] + boxes2_area - intersection_area, - 1e-8) # with shape (N,M) - ious = np.clip(intersection_area / union_area, 0.0, 1.0) - return ious - - -def grad_cam(feat, grad): - """ - - Args: - feat: CxHxW - grad: CxHxW - - Returns: - cam: HxW - """ - exp = (feat * grad.mean((1, 2), keepdims=True)).mean(axis=0) - exp = np.maximum(-exp, 0) - return exp - - -def resize_cam(explanation, resize_shape) -> np.ndarray: - """ - - Args: - explanation: (width, height) - resize_shape: (width, height) - - Returns: - - """ - assert len(explanation.shape) == 2, f"{explanation.shape}. " \ - f"Currently support 2D explanation results for visualization. " \ - "Reduce higher dimensions to 2D for visualization." 
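
compute_ious above gets its (N, M) pairwise matrix from NumPy broadcasting: inserting an axis with boxes1[:, None, :2] lines every box in the first set up against every box in the second. A self-contained check of the same trick:

    import numpy as np

    boxes1 = np.array([[0., 0., 10., 10.]])          # one box
    boxes2 = np.array([[0., 0., 10., 10.],
                       [5., 5., 15., 15.]])          # identical + half-overlapping

    lu = np.maximum(boxes1[:, None, :2], boxes2[:, :2])   # (N, M, 2) upper-left
    rd = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:])   # (N, M, 2) lower-right
    wh = np.maximum(0.0, rd - lu)
    inter = wh[..., 0] * wh[..., 1]
    area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
    area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
    ious = inter / (area1[:, None] + area2 - inter)
    print(ious)   # [[1.0, 25/175 ~= 0.143]]
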
- - explanation = (explanation - explanation.min()) / ( - explanation.max() - explanation.min()) - - explanation = cv2.resize(explanation, resize_shape) - explanation = np.uint8(255 * explanation) - explanation = cv2.applyColorMap(explanation, cv2.COLORMAP_JET) - explanation = cv2.cvtColor(explanation, cv2.COLOR_BGR2RGB) - - return explanation - - -class BBoxCAM: - def __init__(self, FLAGS, cfg): - self.FLAGS = FLAGS - self.cfg = cfg - # build model - self.trainer = self.build_trainer(cfg) - # num_class - self.num_class = cfg.num_classes - # set hook for extraction of featuremaps and grads - self.set_hook(cfg) - self.nms_idx_need_divid_numclass_arch = ['FasterRCNN', 'MaskRCNN', 'CascadeRCNN'] - """ - In these networks, the bbox array shape before nms contain num_class, - the nms_keep_idx of the bbox need to divide the num_class; - """ - - # cam image output_dir - try: - os.makedirs(FLAGS.cam_out) - except: - print('Path already exists.') - pass - - def build_trainer(self, cfg): - # build trainer - trainer = Trainer(cfg, mode='test') - # load weights - trainer.load_weights(cfg.weights) - - # set for get extra_data before nms - trainer.model.use_extra_data=True - # set for record the bbox index before nms - if cfg.architecture in ['FasterRCNN', 'MaskRCNN']: - trainer.model.bbox_post_process.nms.return_index = True - elif cfg.architecture in ['YOLOv3', 'PPYOLOE', 'PPYOLOEWithAuxHead']: - if trainer.model.post_process is not None: - # anchor based YOLOs: YOLOv3,PP-YOLO - trainer.model.post_process.nms.return_index = True - else: - # anchor free YOLOs: PP-YOLOE, PP-YOLOE+ - trainer.model.yolo_head.nms.return_index = True - elif cfg.architecture=='BlazeFace' or cfg.architecture=='SSD': - trainer.model.post_process.nms.return_index = True - elif cfg.architecture=='RetinaNet': - trainer.model.head.nms.return_index = True - else: - print( - cfg.architecture+' is not supported for cam temporarily!' - ) - sys.exit() - # Todo: Unify the head/post_process name in each model - - return trainer - - def set_hook(self, cfg): - # set hook for extraction of featuremaps and grads - self.target_feats = {} - self.target_layer_name = cfg.target_feature_layer_name - # such as trainer.model.backbone, trainer.model.bbox_head.roi_extractor - - def hook(layer, input, output): - self.target_feats[layer._layer_name_for_hook] = output - - try: - exec('self.trainer.'+self.target_layer_name+'._layer_name_for_hook = self.target_layer_name') - # self.trainer.target_layer_name._layer_name_for_hook = self.target_layer_name - exec('self.trainer.'+self.target_layer_name+'.register_forward_post_hook(hook)') - # self.trainer.target_layer_name.register_forward_post_hook(hook) - except: - print("Error! " - "The target_layer_name--"+self.target_layer_name+" is not in model! 
" - "Please check the spelling and " - "the network's architecture!") - sys.exit() - - def get_bboxes(self): - # get inference images - images = get_test_images(self.FLAGS.infer_dir, self.FLAGS.infer_img) - - # inference - result = self.trainer.predict( - images, - draw_threshold=self.FLAGS.draw_threshold, - output_dir=self.FLAGS.output_dir, - save_results=self.FLAGS.save_results, - visualize=False)[0] - return result - - def get_bboxes_cams(self): - # Get the bboxes prediction(after nms result) of the input - inference_result = self.get_bboxes() - - # read input image - # Todo: Support folder multi-images process - from PIL import Image - img = np.array(Image.open(self.cfg.infer_img)) - - # data for calaulate bbox grad_cam - extra_data = inference_result['extra_data'] - """ - Example of Faster_RCNN based architecture: - extra_data: {'scores': tensor with shape [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] - 'nms_keep_idx': tensor with shape [num_of_bboxes_after_nms, 1], for example: [300, 1] - } - Example of YOLOv3 based architecture: - extra_data: {'scores': tensor with shape [1, num_classes, num_of_yolo_bboxes_before_nms], #for example: [1, 80, 8400] - 'nms_keep_idx': tensor with shape [num_of_yolo_bboxes_after_nms, 1], # for example: [300, 1] - } - """ - - # array index of the predicted bbox before nms - if self.cfg.architecture in self.nms_idx_need_divid_numclass_arch: - # some network's bbox array shape before nms may be like [num_of_bboxes_before_nms, num_classes, 4], - # we need to divide num_classes to get the before_nms_index; - # currently, only include the rcnn architectures (fasterrcnn, maskrcnn, cascadercnn); - before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy( - ) // self.num_class # num_class - else : - before_nms_indexes = extra_data['nms_keep_idx'].cpu().numpy() - - # Calculate and visualize the heatmap of per predict bbox - for index, target_bbox in enumerate(inference_result['bbox']): - # target_bbox: [cls, score, x1, y1, x2, y2] - # filter bboxes with low predicted scores - if target_bbox[1] < self.FLAGS.draw_threshold: - continue - - target_bbox_before_nms = int(before_nms_indexes[index]) - - if len(extra_data['scores'].shape)==2: - score_out = extra_data['scores'][target_bbox_before_nms] - else: - score_out = extra_data['scores'][0, :, target_bbox_before_nms] - """ - There are two kinds array shape of bbox score output : - 1) [num_of_bboxes_before_nms, num_classes], for example: [1000, 80] - 2) [num_of_image, num_classes, num_of_yolo_bboxes_before_nms], for example: [1, 80, 1000] - """ - - - # construct one_hot label and do backward to get the gradients - predicted_label = paddle.argmax(score_out) - label_onehot = paddle.nn.functional.one_hot( - predicted_label, num_classes=len(score_out)) - label_onehot = label_onehot.squeeze() - target = paddle.sum(score_out * label_onehot) - target.backward(retain_graph=True) - - - if 'backbone' in self.target_layer_name or \ - 'neck' in self.target_layer_name: # backbone/neck level feature - if isinstance(self.target_feats[self.target_layer_name], list): - # when the featuremap contains of multiple scales, - # take the featuremap of the last scale - # Todo: fuse the cam result from multisclae featuremaps - if self.target_feats[self.target_layer_name][ - -1].shape[-1]==1: - """ - if the last level featuremap is 1x1 size, - we take the second last one - """ - cam_grad = self.target_feats[self.target_layer_name][ - -2].grad.squeeze().cpu().numpy() - cam_feat = self.target_feats[self.target_layer_name][ - 
-2].squeeze().cpu().numpy() - else: - cam_grad = self.target_feats[self.target_layer_name][ - -1].grad.squeeze().cpu().numpy() - cam_feat = self.target_feats[self.target_layer_name][ - -1].squeeze().cpu().numpy() - else: - cam_grad = self.target_feats[ - self.target_layer_name].grad.squeeze().cpu().numpy() - cam_feat = self.target_feats[ - self.target_layer_name].squeeze().cpu().numpy() - else: # roi level feature - cam_grad = self.target_feats[ - self.target_layer_name].grad.squeeze().cpu().numpy()[target_bbox_before_nms] - cam_feat = self.target_feats[ - self.target_layer_name].squeeze().cpu().numpy()[target_bbox_before_nms] - - # grad_cam: - exp = grad_cam(cam_feat, cam_grad) - - if 'backbone' in self.target_layer_name or \ - 'neck' in self.target_layer_name: - """ - when use backbone/neck featuremap, - we first do the cam on whole image, - and then set the area outside the predic bbox to 0 - """ - # reshape the cam image to the input image size - resized_exp = resize_cam(exp, (img.shape[1], img.shape[0])) - mask = np.zeros((img.shape[0], img.shape[1], 3)) - mask[int(target_bbox[3]):int(target_bbox[5]), int(target_bbox[2]): - int(target_bbox[4]), :] = 1 - resized_exp = resized_exp * mask - # add the bbox cam back to the input image - overlay_vis = np.uint8(resized_exp * 0.4 + img * 0.6) - elif 'roi' in self.target_layer_name: - # get the bbox part of the image - bbox_img = copy.deepcopy(img[int(target_bbox[3]):int(target_bbox[5]), - int(target_bbox[2]):int(target_bbox[4]), :]) - # reshape the cam image to the bbox size - resized_exp = resize_cam(exp, (bbox_img.shape[1], bbox_img.shape[0])) - # add the bbox cam back to the bbox image - bbox_overlay_vis = np.uint8(resized_exp * 0.4 + bbox_img * 0.6) - # put the bbox_cam image to the original image - overlay_vis = copy.deepcopy(img) - overlay_vis[int(target_bbox[3]):int(target_bbox[5]), - int(target_bbox[2]):int(target_bbox[4]), :] = bbox_overlay_vis - else: - print( - 'Only supported cam for backbone/neck feature and roi feature, the others are not supported temporarily!' - ) - sys.exit() - - # put the bbox rectangle on image - cv2.rectangle( - overlay_vis, (int(target_bbox[2]), int(target_bbox[3])), - (int(target_bbox[4]), int(target_bbox[5])), (0, 0, 255), 2) - - # save visualization result - cam_image = Image.fromarray(overlay_vis) - cam_image.save(self.FLAGS.cam_out + '/' + str(index) + '.jpg') - - # clear gradients after each bbox grad_cam - target.clear_gradient() - for n, v in self.trainer.model.named_sublayers(): - v.clear_gradients() diff --git a/pdfdet/models/Paddle/ppdet/utils/check.py b/pdfdet/models/Paddle/ppdet/utils/check.py deleted file mode 100644 index 7690ade..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/check.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
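
The grad_cam and resize_cam helpers used in get_bboxes_cams above are defined elsewhere in the tree and are not part of this diff. As a rough sketch of what they compute, assuming the standard Grad-CAM recipe (channel weights from globally average-pooled gradients, a ReLU'd weighted sum, then min-max normalization) — the actual ppdet implementation may differ:

import cv2
import numpy as np

def grad_cam(feat, grad):
    # feat/grad: [C, H, W] arrays taken from target_feats as above
    weights = grad.mean(axis=(1, 2), keepdims=True)    # GAP of gradients -> [C, 1, 1]
    cam = np.maximum((weights * feat).sum(axis=0), 0)  # weighted sum + ReLU -> [H, W]
    return (cam - cam.min()) / (cam.max() - cam.min() + 1e-8)

def resize_cam(cam, target_size):
    # scale to the requested (width, height) and colorize for overlaying
    cam = cv2.resize(cam, target_size)
    return cv2.applyColorMap(np.uint8(255 * cam), cv2.COLORMAP_JET)
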
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import sys - -import paddle -import six -import paddle.version as paddle_version - -from .logger import setup_logger -logger = setup_logger(__name__) - -__all__ = [ - 'check_gpu', 'check_npu', 'check_xpu', 'check_mlu', 'check_version', - 'check_config' -] - - -def check_mlu(use_mlu): - """ - Log error and exit when set use_mlu=true in paddlepaddle - cpu/gpu/xpu/npu version. - """ - err = "Config use_mlu cannot be set as true while you are " \ - "using paddlepaddle cpu/gpu/xpu/npu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-mlu to run model on MLU \n" \ - "\t2. Set use_mlu as false in config file to run " \ - "model on CPU/GPU/XPU/NPU" - - try: - if use_mlu and not paddle.is_compiled_with_mlu(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_npu(use_npu): - """ - Log error and exit when set use_npu=true in paddlepaddle - version without paddle-custom-npu installed. - """ - err = "Config use_npu cannot be set as true while you are " \ - "using paddlepaddle version without paddle-custom-npu " \ - "installed! \nPlease try: \n" \ - "\t1. Install paddle-custom-npu to run model on NPU \n" \ - "\t2. Set use_npu as false in config file to run " \ - "model on other devices supported." - - try: - if use_npu and not 'npu' in paddle.device.get_all_custom_device_type(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_xpu(use_xpu): - """ - Log error and exit when set use_xpu=true in paddlepaddle - cpu/gpu/npu version. - """ - err = "Config use_xpu cannot be set as true while you are " \ - "using paddlepaddle cpu/gpu/npu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-xpu to run model on XPU \n" \ - "\t2. Set use_xpu as false in config file to run " \ - "model on CPU/GPU/NPU" - - try: - if use_xpu and not paddle.is_compiled_with_xpu(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_gpu(use_gpu): - """ - Log error and exit when set use_gpu=true in paddlepaddle - cpu version. - """ - err = "Config use_gpu cannot be set as true while you are " \ - "using paddlepaddle cpu version ! \nPlease try: \n" \ - "\t1. Install paddlepaddle-gpu to run model on GPU \n" \ - "\t2. Set use_gpu as false in config file to run " \ - "model on CPU" - - try: - if use_gpu and not paddle.is_compiled_with_cuda(): - logger.error(err) - sys.exit(1) - except Exception as e: - pass - - -def check_version(version='2.2'): - """ - Log error and exit when the installed version of paddlepaddle is - not satisfied. - """ - err = "PaddlePaddle version {} or higher is required, " \ - "or a suitable develop version is satisfied as well. \n" \ - "Please make sure the version is good with your code.".format(version) - - version_installed = [ - paddle_version.major, paddle_version.minor, paddle_version.patch, - paddle_version.rc - ] - - if version_installed == ['0', '0', '0', '0']: - return - - version_split = version.split('.') - - length = min(len(version_installed), len(version_split)) - for i in six.moves.range(length): - if version_installed[i] > version_split[i]: - return - if version_installed[i] < version_split[i]: - raise Exception(err) - - -def check_config(cfg): - """ - Check the correctness of the configuration file. Log error and exit - when Config is not compliant. - """ - err = "'{}' not specified in config file. Please set it in config file." 
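
check_version above walks the dotted version components left to right, returning as soon as an installed component is strictly greater than the required one and raising when one falls short. One subtlety: it compares the components as strings, which mis-orders multi-digit parts ('10' < '9' lexicographically); a standalone integer-based sketch of the same walk avoids that (names hypothetical):

def version_satisfied(installed, required):
    # compare dotted versions component-wise, left to right
    a = [int(x) for x in installed.split('.')]
    b = [int(x) for x in required.split('.')]
    for i in range(min(len(a), len(b))):
        if a[i] > b[i]:
            return True
        if a[i] < b[i]:
            return False
    return True  # an equal prefix counts as satisfied

assert version_satisfied('2.4.1', '2.2')
assert not version_satisfied('2.1.0', '2.2')
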
- check_list = ['architecture', 'num_classes'] - try: - for var in check_list: - if not var in cfg: - logger.error(err.format(var)) - sys.exit(1) - except Exception as e: - pass - - if 'log_iter' not in cfg: - cfg.log_iter = 20 - - return cfg diff --git a/pdfdet/models/Paddle/ppdet/utils/checkpoint.py b/pdfdet/models/Paddle/ppdet/utils/checkpoint.py deleted file mode 100644 index 8672c98..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/checkpoint.py +++ /dev/null @@ -1,377 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import numpy as np -import paddle -import paddle.nn as nn -from .download import get_weights_path - -from .logger import setup_logger -logger = setup_logger(__name__) - - -def is_url(path): - """ - Whether path is URL. - Args: - path (string): URL string or not. - """ - return path.startswith('http://') \ - or path.startswith('https://') \ - or path.startswith('ppdet://') - - -def _strip_postfix(path): - path, ext = os.path.splitext(path) - assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ - "Unknown postfix {} from weights".format(ext) - return path - - -def load_weight(model, weight, optimizer=None, ema=None, exchange=True): - if is_url(weight): - weight = get_weights_path(weight) - - path = _strip_postfix(weight) - pdparam_path = path + '.pdparams' - if not os.path.exists(pdparam_path): - raise ValueError("Model pretrain path {} does not " - "exists.".format(pdparam_path)) - - if ema is not None and os.path.exists(path + '.pdema'): - if exchange: - # Exchange model and ema_model to load - logger.info('Exchange model and ema_model to load:') - ema_state_dict = paddle.load(pdparam_path) - logger.info('Loading ema_model weights from {}'.format(path + - '.pdparams')) - param_state_dict = paddle.load(path + '.pdema') - logger.info('Loading model weights from {}'.format(path + '.pdema')) - else: - ema_state_dict = paddle.load(path + '.pdema') - logger.info('Loading ema_model weights from {}'.format(path + - '.pdema')) - param_state_dict = paddle.load(pdparam_path) - logger.info('Loading model weights from {}'.format(path + - '.pdparams')) - else: - ema_state_dict = None - param_state_dict = paddle.load(pdparam_path) - - if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'): - print('Loading pretrain weights for Teacher-Student framework.') - print('Loading pretrain weights for Student model.') - student_model_dict = model.modelStudent.state_dict() - student_param_state_dict = match_state_dict( - student_model_dict, param_state_dict, mode='student') - model.modelStudent.set_dict(student_param_state_dict) - print('Loading pretrain weights for Teacher model.') - teacher_model_dict = model.modelTeacher.state_dict() - - teacher_param_state_dict = match_state_dict( - teacher_model_dict, param_state_dict, mode='teacher') - 
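
load_weight delegates the key alignment to match_state_dict, defined below, which scores every (model key, weight key) pair and keeps the longest suffix match. In miniature, using the toy keys from its own docstring (purely illustrative):

import numpy as np

model_keys = ['backbone.res2.res2a.branch2a.conv.weight']
weight_keys = ['res2.res2a.branch2a.conv.weight', 'branch2a.conv.weight']

def match(a, b):
    return a == b or a.endswith('.' + b)

scores = np.zeros([len(model_keys), len(weight_keys)])
for i, m in enumerate(model_keys):
    for j, w in enumerate(weight_keys):
        if match(m, w):
            scores[i, j] = len(w)  # prefer the longest matching weight key
best = scores.argmax(1)  # -> [0]: the longer 'res2...' key wins
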
model.modelTeacher.set_dict(teacher_param_state_dict) - - else: - model_dict = model.state_dict() - model_weight = {} - incorrect_keys = 0 - for key in model_dict.keys(): - if key in param_state_dict.keys(): - model_weight[key] = param_state_dict[key] - else: - logger.info('Unmatched key: {}'.format(key)) - incorrect_keys += 1 - assert incorrect_keys == 0, "Load weight {} incorrectly, \ - {} keys unmatched, please check again.".format(weight, - incorrect_keys) - logger.info('Finish resuming model weights: {}'.format(pdparam_path)) - model.set_dict(model_weight) - - last_epoch = 0 - if optimizer is not None and os.path.exists(path + '.pdopt'): - optim_state_dict = paddle.load(path + '.pdopt') - # to solve resume bug, will it be fixed in paddle 2.0 - for key in optimizer.state_dict().keys(): - if not key in optim_state_dict.keys(): - optim_state_dict[key] = optimizer.state_dict()[key] - if 'last_epoch' in optim_state_dict: - last_epoch = optim_state_dict.pop('last_epoch') - optimizer.set_state_dict(optim_state_dict) - - if ema_state_dict is not None: - ema.resume(ema_state_dict, - optim_state_dict['LR_Scheduler']['last_epoch']) - elif ema_state_dict is not None: - ema.resume(ema_state_dict) - return last_epoch - - -def match_state_dict(model_state_dict, weight_state_dict, mode='default'): - """ - Match between the model state dict and pretrained weight state dict. - Return the matched state dict. - - The method supposes that all the names in pretrained weight state dict are - subclass of the names in models`, if the prefix 'backbone.' in pretrained weight - keys is stripped. And we could get the candidates for each model key. Then we - select the name with the longest matched size as the final match result. For - example, the model state dict has the name of - 'backbone.res2.res2a.branch2a.conv.weight' and the pretrained weight as - name of 'res2.res2a.branch2a.conv.weight' and 'branch2a.conv.weight'. We - match the 'res2.res2a.branch2a.conv.weight' to the model key. - """ - - model_keys = sorted(model_state_dict.keys()) - weight_keys = sorted(weight_state_dict.keys()) - - def teacher_match(a, b): - # skip student params - if b.startswith('modelStudent'): - return False - return a == b or a.endswith("." + b) or b.endswith("." + a) - - def student_match(a, b): - # skip teacher params - if b.startswith('modelTeacher'): - return False - return a == b or a.endswith("." + b) or b.endswith("." + a) - - def match(a, b): - if b.startswith('backbone.res5'): - b = b[9:] - return a == b or a.endswith("." 
+ b)
-
-    if mode == 'student':
-        match_op = student_match
-    elif mode == 'teacher':
-        match_op = teacher_match
-    else:
-        match_op = match
-
-    match_matrix = np.zeros([len(model_keys), len(weight_keys)])
-    for i, m_k in enumerate(model_keys):
-        for j, w_k in enumerate(weight_keys):
-            if match_op(m_k, w_k):
-                match_matrix[i, j] = len(w_k)
-    max_id = match_matrix.argmax(1)
-    max_len = match_matrix.max(1)
-    max_id[max_len == 0] = -1
-    load_id = set(max_id)
-    load_id.discard(-1)
-    not_load_weight_name = []
-    if weight_keys[0].startswith('modelStudent') or weight_keys[0].startswith(
-            'modelTeacher'):
-        for match_idx in range(len(max_id)):
-            if max_id[match_idx] == -1:
-                not_load_weight_name.append(model_keys[match_idx])
-        if len(not_load_weight_name) > 0:
-            logger.info('{} in model is not matched with pretrained weights, '
-                        'and it will be trained from scratch'.format(
-                            not_load_weight_name))
-
-    else:
-        for idx in range(len(weight_keys)):
-            if idx not in load_id:
-                not_load_weight_name.append(weight_keys[idx])
-
-        if len(not_load_weight_name) > 0:
-            logger.info('{} in pretrained weight is not used in the model, '
-                        'and it will not be loaded'.format(
-                            not_load_weight_name))
-    matched_keys = {}
-    result_state_dict = {}
-    for model_id, weight_id in enumerate(max_id):
-        if weight_id == -1:
-            continue
-        model_key = model_keys[model_id]
-        weight_key = weight_keys[weight_id]
-        weight_value = weight_state_dict[weight_key]
-        model_value_shape = list(model_state_dict[model_key].shape)
-
-        if list(weight_value.shape) != model_value_shape:
-            logger.info(
-                'The shape {} in pretrained weight {} is unmatched with '
-                'the shape {} in model {}. And the weight {} will not be '
-                'loaded'.format(weight_value.shape, weight_key,
-                                model_value_shape, model_key, weight_key))
-            continue
-
-        assert model_key not in result_state_dict
-        result_state_dict[model_key] = weight_value
-        if weight_key in matched_keys:
-            raise ValueError('Ambiguous weight {} loaded, it matches at least '
-                             '{} and {} in the model'.format(
-                                 weight_key, model_key,
-                                 matched_keys[weight_key]))
-        matched_keys[weight_key] = model_key
-    return result_state_dict
-
-
-def load_pretrain_weight(model, pretrain_weight, ARSL_eval=False):
-    if is_url(pretrain_weight):
-        pretrain_weight = get_weights_path(pretrain_weight)
-
-    path = _strip_postfix(pretrain_weight)
-    if not (os.path.isdir(path) or os.path.isfile(path) or
-            os.path.exists(path + '.pdparams')):
-        raise ValueError("Model pretrain path `{}` does not exist. "
-                         "If you don't want to load a pretrained model, "
-                         "please delete the `pretrain_weights` field in "
-                         "the config file.".format(path))
-    teacher_student_flag = False
-    if not ARSL_eval:
-        if hasattr(model, 'modelTeacher') and hasattr(model, 'modelStudent'):
-            print('Loading pretrain weights for Teacher-Student framework.')
-            print(
-                'Assume the Teacher model has the same structure as the Student model.'
- ) - model_dict = model.modelStudent.state_dict() - teacher_student_flag = True - else: - model_dict = model.state_dict() - - weights_path = path + '.pdparams' - param_state_dict = paddle.load(weights_path) - param_state_dict = match_state_dict(model_dict, param_state_dict) - for k, v in param_state_dict.items(): - if isinstance(v, np.ndarray): - v = paddle.to_tensor(v) - if model_dict[k].dtype != v.dtype: - param_state_dict[k] = v.astype(model_dict[k].dtype) - - if teacher_student_flag: - model.modelStudent.set_dict(param_state_dict) - model.modelTeacher.set_dict(param_state_dict) - else: - model.set_dict(param_state_dict) - logger.info('Finish loading model weights: {}'.format(weights_path)) - - else: - weights_path = path + '.pdparams' - param_state_dict = paddle.load(weights_path) - student_model_dict = model.modelStudent.state_dict() - student_param_state_dict = match_state_dict( - student_model_dict, param_state_dict, mode='student') - model.modelStudent.set_dict(student_param_state_dict) - print('Loading pretrain weights for Teacher model.') - teacher_model_dict = model.modelTeacher.state_dict() - - teacher_param_state_dict = match_state_dict( - teacher_model_dict, param_state_dict, mode='teacher') - model.modelTeacher.set_dict(teacher_param_state_dict) - logger.info('Finish loading model weights: {}'.format(weights_path)) - - -def save_model(model, - optimizer, - save_dir, - save_name, - last_epoch, - ema_model=None): - """ - save model into disk. - - Args: - model (dict): the model state_dict to save parameters. - optimizer (paddle.optimizer.Optimizer): the Optimizer instance to - save optimizer states. - save_dir (str): the directory to be saved. - save_name (str): the path to be saved. - last_epoch (int): the epoch index. - ema_model (dict|None): the ema_model state_dict to save parameters. - """ - if paddle.distributed.get_rank() != 0: - return - - save_dir = os.path.normpath(save_dir) - if not os.path.exists(save_dir): - os.makedirs(save_dir) - - if save_name == "best_model": - best_model_path = os.path.join(save_dir, 'best_model') - if not os.path.exists(best_model_path): - os.makedirs(best_model_path) - - save_path = os.path.join(save_dir, save_name) - # save model - if isinstance(model, nn.Layer): - paddle.save(model.state_dict(), save_path + ".pdparams") - best_model = model.state_dict() - else: - assert isinstance(model, - dict), 'model is not a instance of nn.layer or dict' - if ema_model is None: - paddle.save(model, save_path + ".pdparams") - best_model = model - else: - assert isinstance(ema_model, - dict), ("ema_model is not a instance of dict, " - "please call model.state_dict() to get.") - # Exchange model and ema_model to save - paddle.save(ema_model, save_path + ".pdparams") - paddle.save(model, save_path + ".pdema") - best_model = ema_model - - if save_name == 'best_model': - best_model_path = os.path.join(best_model_path, 'model') - paddle.save(best_model, best_model_path + ".pdparams") - # save optimizer - state_dict = optimizer.state_dict() - state_dict['last_epoch'] = last_epoch - paddle.save(state_dict, save_path + ".pdopt") - logger.info("Save checkpoint: {}".format(save_dir)) - - -def save_semi_model(teacher_model, student_model, optimizer, save_dir, - save_name, last_epoch, last_iter): - """ - save teacher and student model into disk. - Args: - teacher_model (dict): the teacher_model state_dict to save parameters. - student_model (dict): the student_model state_dict to save parameters. 
- optimizer (paddle.optimizer.Optimizer): the Optimizer instance to
-            save optimizer states.
-        save_dir (str): the directory to be saved.
-        save_name (str): the path to be saved.
-        last_epoch (int): the epoch index.
-        last_iter (int): the iter index.
-    """
-    if paddle.distributed.get_rank() != 0:
-        return
-    assert isinstance(teacher_model, dict), (
-        "teacher_model is not an instance of dict, "
-        "please call teacher_model.state_dict() to get.")
-    assert isinstance(student_model, dict), (
-        "student_model is not an instance of dict, "
-        "please call student_model.state_dict() to get.")
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-    save_path = os.path.join(save_dir, save_name)
-    # save model
-    paddle.save(teacher_model, save_path + str(last_epoch) + "epoch_t.pdparams")
-    paddle.save(student_model, save_path + str(last_epoch) + "epoch_s.pdparams")
-
-    # save optimizer
-    state_dict = optimizer.state_dict()
-    state_dict['last_epoch'] = last_epoch
-    state_dict['last_iter'] = last_iter
-    paddle.save(state_dict, save_path + str(last_epoch) + "epoch.pdopt")
-    logger.info("Save checkpoint: {}".format(save_dir))
diff --git a/pdfdet/models/Paddle/ppdet/utils/cli.py b/pdfdet/models/Paddle/ppdet/utils/cli.py
deleted file mode 100644
index 2c5acc0..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/cli.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from argparse import ArgumentParser, RawDescriptionHelpFormatter
-
-import yaml
-import re
-from ppdet.core.workspace import get_registered_modules, dump_value
-
-__all__ = ['ColorTTY', 'ArgsParser']
-
-
-class ColorTTY(object):
-    def __init__(self):
-        super(ColorTTY, self).__init__()
-        self.colors = ['red', 'green', 'yellow', 'blue', 'magenta', 'cyan']
-
-    def __getattr__(self, attr):
-        if attr in self.colors:
-            color = self.colors.index(attr) + 31
-
-            def color_message(message):
-                return "\033[{}m{}\033[0m".format(color, message)
-
-            setattr(self, attr, color_message)
-            return color_message
-
-    def bold(self, message):
-        return self.with_code('01', message)
-
-    def with_code(self, code, message):
-        return "\033[{}m{}\033[0m".format(code, message)
-
-
-class ArgsParser(ArgumentParser):
-    def __init__(self):
-        super(ArgsParser, self).__init__(
-            formatter_class=RawDescriptionHelpFormatter)
-        self.add_argument("-c", "--config", help="configuration file to use")
-        self.add_argument(
-            "-o", "--opt", nargs='*', help="set configuration options")
-
-    def parse_args(self, argv=None):
-        args = super(ArgsParser, self).parse_args(argv)
-        assert args.config is not None, \
-            "Please specify --config=configure_file_path."
-        args.opt = self._parse_opt(args.opt)
-        return args
-
-    def _parse_opt(self, opts):
-        config = {}
-        if not opts:
-            return config
-        for s in opts:
-            s = s.strip()
-            k, v = s.split('=', 1)
-            if '.' not in k:
-                config[k] = yaml.load(v, Loader=yaml.Loader)
-            else:
-                keys = k.split('.')
-                if keys[0] not in config:
-                    config[keys[0]] = {}
-                cur = config[keys[0]]
-                for idx, key in enumerate(keys[1:]):
-                    if idx == len(keys) - 2:
-                        cur[key] = yaml.load(v, Loader=yaml.Loader)
-                    else:
-                        cur[key] = {}
-                        cur = cur[key]
-        return config
-
-
-def merge_args(config, args, exclude_args=['config', 'opt', 'slim_config']):
-    for k, v in vars(args).items():
-        if k not in exclude_args:
-            config[k] = v
-    return config
-
-
-def print_total_cfg(config):
-    modules = get_registered_modules()
-    color_tty = ColorTTY()
-    green = '___{}___'.format(color_tty.colors.index('green') + 31)
-
-    styled = {}
-    for key in config.keys():
-        if not config[key]:  # empty schema
-            continue
-
-        if key not in modules and not hasattr(config[key], '__dict__'):
-            styled[key] = config[key]
-            continue
-        elif key in modules:
-            module = modules[key]
-        else:
-            type_name = type(config[key]).__name__
-            if type_name in modules:
-                module = modules[type_name].copy()
-                module.update({
-                    k: v
-                    for k, v in config[key].__dict__.items()
-                    if k in module.schema
-                })
-                key += " ({})".format(type_name)
-        default = module.find_default_keys()
-        missing = module.find_missing_keys()
-        mismatch = module.find_mismatch_keys()
-        extra = module.find_extra_keys()
-        dep_missing = []
-        for dep in module.inject:
-            if isinstance(module[dep], str) and module[dep] != '':
-                if module[dep] not in modules:  # not a valid module
-                    dep_missing.append(dep)
-                else:
-                    dep_mod = modules[module[dep]]
-                    # empty dict but mandatory
-                    if not dep_mod and dep_mod.mandatory():
-                        dep_missing.append(dep)
-        override = list(
-            set(module.keys()) - set(default) - set(extra) - set(dep_missing))
-        replacement = {}
-        for name in set(override + default + extra + mismatch + missing):
-            new_name = name
-            if name in missing:
-                value = "<missing>"
-            else:
-                value = module[name]
-
-            if name in extra:
-                value = dump_value(value) + " <extraneous>"
-            elif name in mismatch:
-                value = dump_value(value) + " <type mismatch>"
-            elif name in dep_missing:
-                value = dump_value(value) + " <module config missing>"
-            elif name in override and value != '':
-                mark = green
-                new_name = mark + name
-            replacement[new_name] = value
-        styled[key] = replacement
-    buffer = yaml.dump(styled, default_flow_style=False, default_style='')
-    buffer = (re.sub(r"<missing>", r"\033[31m<missing>\033[0m", buffer))
-    buffer = (re.sub(r"<extraneous>", r"\033[33m<extraneous>\033[0m", buffer))
-    buffer = (re.sub(r"<type mismatch>", r"\033[31m<type mismatch>\033[0m", buffer))
-    buffer = (re.sub(r"<module config missing>",
-                     r"\033[31m<module config missing>\033[0m", buffer))
-    buffer = re.sub(r"___(\d+)___(.*?):", r"\033[\1m\2\033[0m:", buffer)
-    print(buffer)
diff --git a/pdfdet/models/Paddle/ppdet/utils/colormap.py b/pdfdet/models/Paddle/ppdet/utils/colormap.py
deleted file mode 100644
index 67c68dc..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/colormap.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
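
For reference, _parse_opt above expands dotted -o overrides into nested dicts, with the values typed by YAML. A quick sketch of the resulting behavior (the config path is a placeholder):

parser = ArgsParser()
args = parser.parse_args(
    ['-c', 'config.yml', '-o', 'use_gpu=false', 'TrainReader.batch_size=2'])
# YAML-typed scalars and nested keys:
# args.opt == {'use_gpu': False, 'TrainReader': {'batch_size': 2}}
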
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import numpy as np - - -def colormap(rgb=False): - """ - Get colormap - - The code of this function is copied from https://github.com/facebookresearch/Detectron/blob/main/detectron/utils/colormap.py - """ - color_list = np.array([ - 0.000, 0.447, 0.741, 0.850, 0.325, 0.098, 0.929, 0.694, 0.125, 0.494, - 0.184, 0.556, 0.466, 0.674, 0.188, 0.301, 0.745, 0.933, 0.635, 0.078, - 0.184, 0.300, 0.300, 0.300, 0.600, 0.600, 0.600, 1.000, 0.000, 0.000, - 1.000, 0.500, 0.000, 0.749, 0.749, 0.000, 0.000, 1.000, 0.000, 0.000, - 0.000, 1.000, 0.667, 0.000, 1.000, 0.333, 0.333, 0.000, 0.333, 0.667, - 0.000, 0.333, 1.000, 0.000, 0.667, 0.333, 0.000, 0.667, 0.667, 0.000, - 0.667, 1.000, 0.000, 1.000, 0.333, 0.000, 1.000, 0.667, 0.000, 1.000, - 1.000, 0.000, 0.000, 0.333, 0.500, 0.000, 0.667, 0.500, 0.000, 1.000, - 0.500, 0.333, 0.000, 0.500, 0.333, 0.333, 0.500, 0.333, 0.667, 0.500, - 0.333, 1.000, 0.500, 0.667, 0.000, 0.500, 0.667, 0.333, 0.500, 0.667, - 0.667, 0.500, 0.667, 1.000, 0.500, 1.000, 0.000, 0.500, 1.000, 0.333, - 0.500, 1.000, 0.667, 0.500, 1.000, 1.000, 0.500, 0.000, 0.333, 1.000, - 0.000, 0.667, 1.000, 0.000, 1.000, 1.000, 0.333, 0.000, 1.000, 0.333, - 0.333, 1.000, 0.333, 0.667, 1.000, 0.333, 1.000, 1.000, 0.667, 0.000, - 1.000, 0.667, 0.333, 1.000, 0.667, 0.667, 1.000, 0.667, 1.000, 1.000, - 1.000, 0.000, 1.000, 1.000, 0.333, 1.000, 1.000, 0.667, 1.000, 0.167, - 0.000, 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, - 0.000, 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, - 0.000, 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, - 0.833, 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.167, 0.000, 0.000, - 0.333, 0.000, 0.000, 0.500, 0.000, 0.000, 0.667, 0.000, 0.000, 0.833, - 0.000, 0.000, 1.000, 0.000, 0.000, 0.000, 0.143, 0.143, 0.143, 0.286, - 0.286, 0.286, 0.429, 0.429, 0.429, 0.571, 0.571, 0.571, 0.714, 0.714, - 0.714, 0.857, 0.857, 0.857, 1.000, 1.000, 1.000 - ]).astype(np.float32) - color_list = color_list.reshape((-1, 3)) * 255 - if not rgb: - color_list = color_list[:, ::-1] - return color_list.astype('int32') diff --git a/pdfdet/models/Paddle/ppdet/utils/compact.py b/pdfdet/models/Paddle/ppdet/utils/compact.py deleted file mode 100644 index b2f803b..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/compact.py +++ /dev/null @@ -1,11 +0,0 @@ -import PIL - -def imagedraw_textsize_c(draw, text, font=None): - if int(PIL.__version__.split('.')[0]) < 10: - tw, th = draw.textsize(text, font=font) - else: - left, top, right, bottom = draw.textbbox((0, 0), text, font=font) - tw, th = right - left, bottom - top - - return tw, th - diff --git a/pdfdet/models/Paddle/ppdet/utils/download.py b/pdfdet/models/Paddle/ppdet/utils/download.py deleted file mode 100644 index a7909b8..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/download.py +++ /dev/null @@ -1,560 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
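
The Pillow shim imagedraw_textsize_c above exists because ImageDraw.textsize was removed in Pillow 10; textbbox is its replacement, and the shim measures text the same way on both sides of that version boundary. A minimal usage sketch:

from PIL import Image, ImageDraw

draw = ImageDraw.Draw(Image.new('RGB', (200, 50)))
tw, th = imagedraw_textsize_c(draw, 'hello')  # works on Pillow <10 and >=10
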
-# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import os.path as osp -import sys -import yaml -import time -import shutil -import requests -import tqdm -import hashlib -import base64 -import binascii -import tarfile -import zipfile -import errno - -from paddle.utils.download import _get_unique_endpoints -from ppdet.core.workspace import BASE_KEY -from .logger import setup_logger -from .voc_utils import create_list - -logger = setup_logger(__name__) - -__all__ = [ - 'get_weights_path', 'get_dataset_path', 'get_config_path', - 'download_dataset', 'create_voc_list' -] - -WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights") -DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset") -CONFIGS_HOME = osp.expanduser("~/.cache/paddle/configs") - -# dict of {dataset_name: (download_info, sub_dirs)} -# download info: [(url, md5sum)] -DATASETS = { - 'coco': ([ - ( - 'http://images.cocodataset.org/zips/train2017.zip', - 'cced6f7f71b7629ddf16f17bbcfab6b2', ), - ( - 'http://images.cocodataset.org/zips/val2017.zip', - '442b8da7639aecaf257c1dceb8ba8c80', ), - ( - 'http://images.cocodataset.org/annotations/annotations_trainval2017.zip', - 'f4bbac642086de4f52a3fdda2de5fa2c', ), - ], ["annotations", "train2017", "val2017"]), - 'voc': ([ - ( - 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar', - '6cd6e144f989b92b3379bac3b3de84fd', ), - ( - 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar', - 'c52e279531787c972589f7e41ab4ae64', ), - ( - 'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar', - 'b6e924de25625d8de591ea690078ad9f', ), - ( - 'https://paddledet.bj.bcebos.com/data/label_list.txt', - '5ae5d62183cfb6f6d3ac109359d06a1b', ), - ], ["VOCdevkit/VOC2012", "VOCdevkit/VOC2007"]), - 'wider_face': ([ - ( - 'https://dataset.bj.bcebos.com/wider_face/WIDER_train.zip', - '3fedf70df600953d25982bcd13d91ba2', ), - ( - 'https://dataset.bj.bcebos.com/wider_face/WIDER_val.zip', - 'dfa7d7e790efa35df3788964cf0bbaea', ), - ( - 'https://dataset.bj.bcebos.com/wider_face/wider_face_split.zip', - 'a4a898d6193db4b9ef3260a68bad0dc7', ), - ], ["WIDER_train", "WIDER_val", "wider_face_split"]), - 'fruit': ([( - 'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit.tar', - 'baa8806617a54ccf3685fa7153388ae6', ), ], - ['Annotations', 'JPEGImages']), - 'roadsign_voc': ([( - 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_voc.tar', - '8d629c0f880dd8b48de9aeff44bf1f3e', ), ], ['annotations', 'images']), - 'roadsign_coco': ([( - 'https://paddlemodels.bj.bcebos.com/object_detection/roadsign_coco.tar', - '49ce5a9b5ad0d6266163cd01de4b018e', ), ], ['annotations', 'images']), - 'spine_coco': ([( - 'https://paddledet.bj.bcebos.com/data/spine.tar', - '8a3a353c2c54a2284ad7d2780b65f6a6', ), ], ['annotations', 'images']), - 'coco_ce': ([( - 'https://paddledet.bj.bcebos.com/data/coco_ce.tar', - 'eadd1b79bc2f069f2744b1dd4e0c0329', ), ], []), - 'culane': ([('https://bj.bcebos.com/v1/paddledet/data/culane.tar', None, ), ], []) -} - -DOWNLOAD_DATASETS_LIST = DATASETS.keys() - -DOWNLOAD_RETRY_LIMIT = 3 - -PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX = 'https://paddledet.bj.bcebos.com/' - - -# When running unit tests, there could be multiple processes that -# trying to create DATA_HOME directory simultaneously, so we cannot -# use a if condition to check for the existence of the directory; -# 
instead, we use the filesystem as the synchronization mechanism by -# catching returned errors. -def must_mkdirs(path): - try: - os.makedirs(path) - except OSError as exc: - if exc.errno != errno.EEXIST: - raise - pass - - -def parse_url(url): - url = url.replace("ppdet://", PPDET_WEIGHTS_DOWNLOAD_URL_PREFIX) - return url - - -def get_weights_path(url): - """Get weights path from WEIGHTS_HOME, if not exists, - download it from url. - """ - url = parse_url(url) - path, _ = get_path(url, WEIGHTS_HOME) - return path - - -def get_config_path(url): - """Get weights path from CONFIGS_HOME, if not exists, - download it from url. - """ - url = parse_url(url) - path = map_path(url, CONFIGS_HOME, path_depth=2) - if os.path.isfile(path): - return path - - # config file not found, try download - # 1. clear configs directory - if osp.isdir(CONFIGS_HOME): - shutil.rmtree(CONFIGS_HOME) - - # 2. get url - try: - from ppdet import __version__ as version - except ImportError: - version = None - - cfg_url = "ppdet://configs/{}/configs.tar".format(version) \ - if version else "ppdet://configs/configs.tar" - cfg_url = parse_url(cfg_url) - - # 3. download and decompress - cfg_fullname = _download_dist(cfg_url, osp.dirname(CONFIGS_HOME)) - _decompress_dist(cfg_fullname) - - # 4. check config file existing - if os.path.isfile(path): - return path - else: - logger.error("Get config {} failed after download, please contact us on " \ - "https://github.com/PaddlePaddle/PaddleDetection/issues".format(path)) - sys.exit(1) - - -def get_dataset_path(path, annotation, image_dir): - """ - If path exists, return path. - Otherwise, get dataset path from DATASET_HOME, if not exists, - download it. - """ - if _dataset_exists(path, annotation, image_dir): - return path - - data_name = os.path.split(path.strip().lower())[-1] - if data_name not in DOWNLOAD_DATASETS_LIST: - raise ValueError( - "Dataset {} is not valid for reason above, please check again.". - format(osp.realpath(path))) - else: - logger.warning( - "Dataset {} is not valid for reason above, try searching {} or " - "downloading dataset...".format(osp.realpath(path), DATASET_HOME)) - - for name, dataset in DATASETS.items(): - if data_name == name: - logger.debug("Parse dataset_dir {} as dataset " - "{}".format(path, name)) - data_dir = osp.join(DATASET_HOME, name) - - if name == "spine_coco": - if _dataset_exists(data_dir, annotation, image_dir): - return data_dir - - # For voc, only check dir VOCdevkit/VOC2012, VOCdevkit/VOC2007 - if name in ['voc', 'fruit', 'roadsign_voc']: - exists = True - for sub_dir in dataset[1]: - check_dir = osp.join(data_dir, sub_dir) - if osp.exists(check_dir): - logger.info("Found {}".format(check_dir)) - else: - exists = False - if exists: - return data_dir - - # voc exist is checked above, voc is not exist here - check_exist = name != 'voc' and name != 'fruit' and name != 'roadsign_voc' - for url, md5sum in dataset[0]: - get_path(url, data_dir, md5sum, check_exist) - - # voc should create list after download - if name == 'voc': - create_voc_list(data_dir) - return data_dir - - raise ValueError("Dataset automaticly downloading Error.") - - -def create_voc_list(data_dir, devkit_subdir='VOCdevkit'): - logger.debug("Create voc file list...") - devkit_dir = osp.join(data_dir, devkit_subdir) - years = ['2007', '2012'] - - # NOTE: since using auto download VOC - # dataset, VOC default label list should be used, - # do not generate label_list.txt here. 
For default - # label, see ../data/source/voc.py - create_list(devkit_dir, years, data_dir) - logger.debug("Create voc file list finished") - - -def map_path(url, root_dir, path_depth=1): - # parse path after download to decompress under root_dir - assert path_depth > 0, "path_depth should be a positive integer" - dirname = url - for _ in range(path_depth): - dirname = osp.dirname(dirname) - fpath = osp.relpath(url, dirname) - - zip_formats = ['.zip', '.tar', '.gz'] - for zip_format in zip_formats: - fpath = fpath.replace(zip_format, '') - return osp.join(root_dir, fpath) - - -def get_path(url, root_dir, md5sum=None, check_exist=True): - """ Download from given url to root_dir. - if file or directory specified by url is exists under - root_dir, return the path directly, otherwise download - from url and decompress it, return the path. - - url (str): download url - root_dir (str): root dir for downloading, it should be - WEIGHTS_HOME or DATASET_HOME - md5sum (str): md5 sum of download package - """ - # parse path after download to decompress under root_dir - fullpath = map_path(url, root_dir) - - # For same zip file, decompressed directory name different - # from zip file name, rename by following map - decompress_name_map = { - "VOCtrainval_11-May-2012": "VOCdevkit/VOC2012", - "VOCtrainval_06-Nov-2007": "VOCdevkit/VOC2007", - "VOCtest_06-Nov-2007": "VOCdevkit/VOC2007", - "annotations_trainval": "annotations" - } - for k, v in decompress_name_map.items(): - if fullpath.find(k) >= 0: - fullpath = osp.join(osp.split(fullpath)[0], v) - - if osp.exists(fullpath) and check_exist: - if not osp.isfile(fullpath) or \ - _check_exist_file_md5(fullpath, md5sum, url): - logger.debug("Found {}".format(fullpath)) - return fullpath, True - else: - os.remove(fullpath) - - fullname = _download_dist(url, root_dir, md5sum) - - # new weights format which postfix is 'pdparams' not - # need to decompress - if osp.splitext(fullname)[-1] not in ['.pdparams', '.yml', '.ttf']: - _decompress_dist(fullname) - - return fullpath, False - - -def download_dataset(path, dataset=None): - if dataset not in DATASETS.keys(): - logger.error("Unknown dataset {}, it should be " - "{}".format(dataset, DATASETS.keys())) - return - dataset_info = DATASETS[dataset][0] - for info in dataset_info: - get_path(info[0], path, info[1], False) - logger.debug("Download dataset {} finished.".format(dataset)) - - -def _dataset_exists(path, annotation, image_dir): - """ - Check if user define dataset exists - """ - if not osp.exists(path): - logger.warning("Config dataset_dir {} is not exits, " - "dataset config is not valid".format(path)) - return False - - if annotation: - annotation_path = osp.join(path, annotation) - if not osp.isfile(annotation_path): - logger.warning("Config annotation {} is not a " - "file, dataset config is not " - "valid".format(annotation_path)) - return False - if image_dir: - image_path = osp.join(path, image_dir) - if not osp.isdir(image_path): - logger.warning("Config image_dir {} is not a " - "directory, dataset config is not " - "valid".format(image_path)) - return False - return True - - -def _download(url, path, md5sum=None): - """ - Download from url, save to path. 
- - url (str): download url - path (str): download to given path - """ - must_mkdirs(path) - - fname = osp.split(url)[-1] - fullname = osp.join(path, fname) - retry_cnt = 0 - - while not (osp.exists(fullname) and _check_exist_file_md5(fullname, md5sum, - url)): - if retry_cnt < DOWNLOAD_RETRY_LIMIT: - retry_cnt += 1 - else: - raise RuntimeError("Download from {} failed. " - "Retry limit reached".format(url)) - - logger.info("Downloading {} from {}".format(fname, url)) - - # NOTE: windows path join may incur \, which is invalid in url - if sys.platform == "win32": - url = url.replace('\\', '/') - - req = requests.get(url, stream=True) - if req.status_code != 200: - raise RuntimeError("Downloading from {} failed with code " - "{}!".format(url, req.status_code)) - - # For protecting download interupted, download to - # tmp_fullname firstly, move tmp_fullname to fullname - # after download finished - tmp_fullname = fullname + "_tmp" - total_size = req.headers.get('content-length') - with open(tmp_fullname, 'wb') as f: - if total_size: - for chunk in tqdm.tqdm( - req.iter_content(chunk_size=1024), - total=(int(total_size) + 1023) // 1024, - unit='KB'): - f.write(chunk) - else: - for chunk in req.iter_content(chunk_size=1024): - if chunk: - f.write(chunk) - shutil.move(tmp_fullname, fullname) - return fullname - - -def _download_dist(url, path, md5sum=None): - env = os.environ - if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env: - # Mainly used to solve the problem of downloading data from - # different machines in the case of multiple machines. - # Different nodes will download data, and the same node - # will only download data once. - # Reference https://github.com/PaddlePaddle/PaddleClas/blob/develop/ppcls/utils/download.py#L108 - rank_id_curr_node = int(os.environ.get("PADDLE_RANK_IN_NODE", 0)) - num_trainers = int(env['PADDLE_TRAINERS_NUM']) - if num_trainers <= 1: - return _download(url, path, md5sum) - else: - fname = osp.split(url)[-1] - fullname = osp.join(path, fname) - lock_path = fullname + '.download.lock' - - must_mkdirs(path) - - if not osp.exists(fullname): - with open(lock_path, 'w'): # touch - os.utime(lock_path, None) - if rank_id_curr_node == 0: - _download(url, path, md5sum) - os.remove(lock_path) - else: - while os.path.exists(lock_path): - time.sleep(0.5) - return fullname - else: - return _download(url, path, md5sum) - - -def _check_exist_file_md5(filename, md5sum, url): - # if md5sum is None, and file to check is weights file, - # read md5um from url and check, else check md5sum directly - return _md5check_from_url(filename, url) if md5sum is None \ - and filename.endswith('pdparams') \ - else _md5check(filename, md5sum) - - -def _md5check_from_url(filename, url): - # For weights in bcebos URLs, MD5 value is contained - # in request header as 'content_md5' - req = requests.get(url, stream=True) - content_md5 = req.headers.get('content-md5') - req.close() - if not content_md5 or _md5check( - filename, - binascii.hexlify(base64.b64decode(content_md5.strip('"'))).decode( - )): - return True - else: - return False - - -def _md5check(fullname, md5sum=None): - if md5sum is None: - return True - - logger.debug("File {} md5 checking...".format(fullname)) - md5 = hashlib.md5() - with open(fullname, 'rb') as f: - for chunk in iter(lambda: f.read(4096), b""): - md5.update(chunk) - calc_md5sum = md5.hexdigest() - - if calc_md5sum != md5sum: - logger.warning("File {} md5 check failed, {}(calc) != " - "{}(base)".format(fullname, calc_md5sum, md5sum)) - return False - 
return True
-
-
-def _decompress(fname):
-    """
-    Decompress zip and tar files.
-    """
-    logger.info("Decompressing {}...".format(fname))
-
-    # To protect against interrupted decompression, decompress
-    # into the fpath_tmp directory first; if decompression
-    # succeeds, move the decompressed files to fpath, then delete
-    # fpath_tmp and remove the downloaded compressed file.
-    fpath = osp.split(fname)[0]
-    fpath_tmp = osp.join(fpath, 'tmp')
-    if osp.isdir(fpath_tmp):
-        shutil.rmtree(fpath_tmp)
-    os.makedirs(fpath_tmp)
-
-    if fname.find('tar') >= 0:
-        with tarfile.open(fname) as tf:
-            tf.extractall(path=fpath_tmp)
-    elif fname.find('zip') >= 0:
-        with zipfile.ZipFile(fname) as zf:
-            zf.extractall(path=fpath_tmp)
-    elif fname.find('.txt') >= 0:
-        return
-    else:
-        raise TypeError("Unsupported compressed file type {}".format(fname))
-
-    for f in os.listdir(fpath_tmp):
-        src_dir = osp.join(fpath_tmp, f)
-        dst_dir = osp.join(fpath, f)
-        _move_and_merge_tree(src_dir, dst_dir)
-
-    shutil.rmtree(fpath_tmp)
-    os.remove(fname)
-
-
-def _decompress_dist(fname):
-    env = os.environ
-    if 'PADDLE_TRAINERS_NUM' in env and 'PADDLE_TRAINER_ID' in env:
-        trainer_id = int(env['PADDLE_TRAINER_ID'])
-        num_trainers = int(env['PADDLE_TRAINERS_NUM'])
-        if num_trainers <= 1:
-            _decompress(fname)
-        else:
-            lock_path = fname + '.decompress.lock'
-            from paddle.distributed import ParallelEnv
-            unique_endpoints = _get_unique_endpoints(ParallelEnv()
-                                                     .trainer_endpoints[:])
-            # NOTE(dkp): _decompress_dist is always performed after
-            # _download_dist; in _download_dist, sub-trainers wait for the
-            # download lock file to be released by sleeping. If decompression
-            # is very fast and finishes within the sleeping gap (e.g. on tiny
-            # datasets such as coco_ce or spine_coco), the main trainer may
-            # finish decompressing and release the lock file early, so we
-            # only create the lock file in the main trainer and have all
-            # sub-trainers wait 1s for the main trainer to create it; since
-            # 1s is twice the sleeping gap, this waiting time keeps the whole
-            # trainer pipeline in order
-            # **change this if you have a more elegant method**
-            if ParallelEnv().current_endpoint in unique_endpoints:
-                with open(lock_path, 'w'):  # touch
-                    os.utime(lock_path, None)
-                _decompress(fname)
-                os.remove(lock_path)
-            else:
-                time.sleep(1)
-                while os.path.exists(lock_path):
-                    time.sleep(0.5)
-    else:
-        _decompress(fname)
-
-
-def _move_and_merge_tree(src, dst):
-    """
-    Move the src directory to dst; if dst already exists,
-    merge src into dst.
-    """
-    if not osp.exists(dst):
-        shutil.move(src, dst)
-    elif osp.isfile(src):
-        shutil.move(src, dst)
-    else:
-        for fp in os.listdir(src):
-            src_fp = osp.join(src, fp)
-            dst_fp = osp.join(dst, fp)
-            if osp.isdir(src_fp):
-                if osp.isdir(dst_fp):
-                    _move_and_merge_tree(src_fp, dst_fp)
-                else:
-                    shutil.move(src_fp, dst_fp)
-            elif osp.isfile(src_fp) and \
-                    not osp.isfile(dst_fp):
-                shutil.move(src_fp, dst_fp)
diff --git a/pdfdet/models/Paddle/ppdet/utils/fuse_utils.py b/pdfdet/models/Paddle/ppdet/utils/fuse_utils.py
deleted file mode 100644
index 647fa99..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/fuse_utils.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import copy -import paddle -import paddle.nn as nn - -__all__ = ['fuse_conv_bn'] - - -def fuse_conv_bn(model): - is_train = False - if model.training: - model.eval() - is_train = True - fuse_list = [] - tmp_pair = [None, None] - for name, layer in model.named_sublayers(): - if isinstance(layer, nn.Conv2D): - tmp_pair[0] = name - if isinstance(layer, nn.BatchNorm2D): - tmp_pair[1] = name - - if tmp_pair[0] and tmp_pair[1] and len(tmp_pair) == 2: - fuse_list.append(tmp_pair) - tmp_pair = [None, None] - model = fuse_layers(model, fuse_list) - if is_train: - model.train() - return model - - -def find_parent_layer_and_sub_name(model, name): - """ - Given the model and the name of a layer, find the parent layer and - the sub_name of the layer. - For example, if name is 'block_1/convbn_1/conv_1', the parent layer is - 'block_1/convbn_1' and the sub_name is `conv_1`. - Args: - model(paddle.nn.Layer): the model to be quantized. - name(string): the name of a layer - - Returns: - parent_layer, subname - """ - assert isinstance(model, nn.Layer), \ - "The model must be the instance of paddle.nn.Layer." - assert len(name) > 0, "The input (name) should not be empty." - - last_idx = 0 - idx = 0 - parent_layer = model - while idx < len(name): - if name[idx] == '.': - sub_name = name[last_idx:idx] - if hasattr(parent_layer, sub_name): - parent_layer = getattr(parent_layer, sub_name) - last_idx = idx + 1 - idx += 1 - sub_name = name[last_idx:idx] - return parent_layer, sub_name - - -class Identity(nn.Layer): - '''a layer to replace bn or relu layers''' - - def __init__(self, *args, **kwargs): - super(Identity, self).__init__() - - def forward(self, input): - return input - - -def fuse_layers(model, layers_to_fuse, inplace=False): - ''' - fuse layers in layers_to_fuse - - Args: - model(nn.Layer): The model to be fused. - layers_to_fuse(list): The layers' names to be fused. For - example,"fuse_list = [["conv1", "bn1"], ["conv2", "bn2"]]". - A TypeError would be raised if "fuse" was set as - True but "fuse_list" was None. - Default: None. - inplace(bool): Whether apply fusing to the input model. - Default: False. - - Return - fused_model(paddle.nn.Layer): The fused model. 
- ''' - if not inplace: - model = copy.deepcopy(model) - for layers_list in layers_to_fuse: - layer_list = [] - for layer_name in layers_list: - parent_layer, sub_name = find_parent_layer_and_sub_name(model, - layer_name) - layer_list.append(getattr(parent_layer, sub_name)) - new_layers = _fuse_func(layer_list) - for i, item in enumerate(layers_list): - parent_layer, sub_name = find_parent_layer_and_sub_name(model, item) - setattr(parent_layer, sub_name, new_layers[i]) - return model - - -def _fuse_func(layer_list): - '''choose the fuser method and fuse layers''' - types = tuple(type(m) for m in layer_list) - fusion_method = types_to_fusion_method.get(types, None) - new_layers = [None] * len(layer_list) - fused_layer = fusion_method(*layer_list) - for handle_id, pre_hook_fn in layer_list[0]._forward_pre_hooks.items(): - fused_layer.register_forward_pre_hook(pre_hook_fn) - del layer_list[0]._forward_pre_hooks[handle_id] - for handle_id, hook_fn in layer_list[-1]._forward_post_hooks.items(): - fused_layer.register_forward_post_hook(hook_fn) - del layer_list[-1]._forward_post_hooks[handle_id] - new_layers[0] = fused_layer - for i in range(1, len(layer_list)): - identity = Identity() - identity.training = layer_list[0].training - new_layers[i] = identity - return new_layers - - -def _fuse_conv_bn(conv, bn): - '''fuse conv and bn for train or eval''' - assert(conv.training == bn.training),\ - "Conv and BN both must be in the same mode (train or eval)." - if conv.training: - assert bn._num_features == conv._out_channels, 'Output channel of Conv2d must match num_features of BatchNorm2d' - raise NotImplementedError - else: - return _fuse_conv_bn_eval(conv, bn) - - -def _fuse_conv_bn_eval(conv, bn): - '''fuse conv and bn for eval''' - assert (not (conv.training or bn.training)), "Fusion only for eval!" - fused_conv = copy.deepcopy(conv) - - fused_weight, fused_bias = _fuse_conv_bn_weights( - fused_conv.weight, fused_conv.bias, bn._mean, bn._variance, bn._epsilon, - bn.weight, bn.bias) - fused_conv.weight.set_value(fused_weight) - if fused_conv.bias is None: - fused_conv.bias = paddle.create_parameter( - shape=[fused_conv._out_channels], is_bias=True, dtype=bn.bias.dtype) - fused_conv.bias.set_value(fused_bias) - return fused_conv - - -def _fuse_conv_bn_weights(conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b): - '''fuse weights and bias of conv and bn''' - if conv_b is None: - conv_b = paddle.zeros_like(bn_rm) - if bn_w is None: - bn_w = paddle.ones_like(bn_rm) - if bn_b is None: - bn_b = paddle.zeros_like(bn_rm) - bn_var_rsqrt = paddle.rsqrt(bn_rv + bn_eps) - conv_w = conv_w * \ - (bn_w * bn_var_rsqrt).reshape([-1] + [1] * (len(conv_w.shape) - 1)) - conv_b = (conv_b - bn_rm) * bn_var_rsqrt * bn_w + bn_b - return conv_w, conv_b - - -types_to_fusion_method = {(nn.Conv2D, nn.BatchNorm2D): _fuse_conv_bn, } diff --git a/pdfdet/models/Paddle/ppdet/utils/logger.py b/pdfdet/models/Paddle/ppdet/utils/logger.py deleted file mode 100644 index 51e2962..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/logger.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
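
The weight folding in _fuse_conv_bn_weights above implements the usual conv+BN fusion identity: w' = w * gamma / sqrt(var + eps) and b' = (b - mean) * gamma / sqrt(var + eps) + beta, so that the fused conv reproduces bn(conv(x)) in eval mode. A small numeric check of that identity (shapes arbitrary; assumes a Paddle runtime and the _fuse_conv_bn_weights helper deleted above):

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

conv = nn.Conv2D(3, 8, 3, bias_attr=False)
bn = nn.BatchNorm2D(8)
conv.eval(); bn.eval()

x = paddle.randn([1, 3, 16, 16])
y_ref = bn(conv(x))

w, b = _fuse_conv_bn_weights(conv.weight, conv.bias, bn._mean, bn._variance,
                             bn._epsilon, bn.weight, bn.bias)
y_fused = F.conv2d(x, w, b)
print(float((y_ref - y_fused).abs().max()))  # should be ~1e-6
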
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import os -import sys - -import paddle.distributed as dist - -__all__ = ['setup_logger'] - -logger_initialized = [] - - -def setup_logger(name="ppdet", output=None): - """ - Initialize logger and set its verbosity level to INFO. - Args: - output (str): a file name or a directory to save log. If None, will not save log file. - If ends with ".txt" or ".log", assumed to be a file name. - Otherwise, logs will be saved to `output/log.txt`. - name (str): the root module name of this logger - - Returns: - logging.Logger: a logger - """ - logger = logging.getLogger(name) - if name in logger_initialized: - return logger - - logger.setLevel(logging.INFO) - logger.propagate = False - - formatter = logging.Formatter( - "[%(asctime)s] %(name)s %(levelname)s: %(message)s", - datefmt="%m/%d %H:%M:%S") - # stdout logging: master only - local_rank = dist.get_rank() - if local_rank == 0: - ch = logging.StreamHandler(stream=sys.stdout) - ch.setLevel(logging.DEBUG) - ch.setFormatter(formatter) - logger.addHandler(ch) - - # file logging: all workers - if output is not None: - if output.endswith(".txt") or output.endswith(".log"): - filename = output - else: - filename = os.path.join(output, "log.txt") - if local_rank > 0: - filename = filename + ".rank{}".format(local_rank) - os.makedirs(os.path.dirname(filename)) - fh = logging.FileHandler(filename, mode='a') - fh.setLevel(logging.DEBUG) - fh.setFormatter(logging.Formatter()) - logger.addHandler(fh) - logger_initialized.append(name) - return logger diff --git a/pdfdet/models/Paddle/ppdet/utils/profiler.py b/pdfdet/models/Paddle/ppdet/utils/profiler.py deleted file mode 100644 index 28ac467..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/profiler.py +++ /dev/null @@ -1,129 +0,0 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import paddle -import paddle.profiler as profiler - -# A global variable to record the number of calling times for profiler -# functions. It is used to specify the tracing range of training steps. -_profiler_step_id = 0 - -# A global variable to avoid parsing from string every time. -_profiler_options = None -_prof = None - -class ProfilerOptions(object): - ''' - Use a string to initialize a ProfilerOptions. - The string should be in the format: "key1=value1;key2=value;key3=value3". 
For example:
-    "profile_path=model.profile"
-    "batch_range=[50, 60]; profile_path=model.profile"
-    "batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
-
-    ProfilerOptions supports the following key-value pairs:
-    batch_range      - an integer list, e.g. [100, 110].
-    state            - a string, the optional values are 'CPU', 'GPU' or 'All'.
-    sorted_key       - a string, the optional values are 'calls', 'total',
-                       'max', 'min' or 'ave'.
-    tracer_option    - a string, the optional values are 'Default', 'OpDetail',
-                       'AllOpDetail'.
-    profile_path     - a string, the path to save the serialized profile data,
-                       which can be used to generate a timeline.
-    exit_on_finished - a boolean.
-    '''
-
-    def __init__(self, options_str):
-        assert isinstance(options_str, str)
-
-        self._options = {
-            'batch_range': [10, 20],
-            'state': 'All',
-            'sorted_key': 'total',
-            'tracer_option': 'Default',
-            'profile_path': '/tmp/profile',
-            'exit_on_finished': True,
-            'timer_only': True
-        }
-        self._parse_from_string(options_str)
-
-    def _parse_from_string(self, options_str):
-        for kv in options_str.replace(' ', '').split(';'):
-            key, value = kv.split('=')
-            if key == 'batch_range':
-                value_list = value.replace('[', '').replace(']', '').split(',')
-                value_list = list(map(int, value_list))
-                if len(value_list) >= 2 and value_list[0] >= 0 and value_list[
-                        1] > value_list[0]:
-                    self._options[key] = value_list
-            elif key == 'exit_on_finished':
-                self._options[key] = value.lower() in ("yes", "true", "t", "1")
-            elif key in [
-                    'state', 'sorted_key', 'tracer_option', 'profile_path'
-            ]:
-                self._options[key] = value
-            elif key == 'timer_only':
-                self._options[key] = value
-
-    def __getitem__(self, name):
-        if self._options.get(name, None) is None:
-            raise ValueError(
-                "ProfilerOptions does not have an option named %s." % name)
-        return self._options[name]
-
-
-def add_profiler_step(options_str=None):
-    '''
-    Enable operator-level timing using PaddlePaddle's profiler.
-    The profiler uses an independent variable to count the profiler steps;
-    one call of this function is treated as one profiler step.
-    Args:
-        profiler_options - a string used to initialize the ProfilerOptions.
-                           Default is None, and the profiler is disabled.
-    '''
-    if options_str is None:
-        return
-
-    global _prof
-    global _profiler_step_id
-    global _profiler_options
-
-    if _profiler_options is None:
-        _profiler_options = ProfilerOptions(options_str)
-    # profile : https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan
-    # timer_only = True: only the model's throughput and time overhead are displayed
-    # timer_only = False: calling summary can print a statistical form that presents performance data from different perspectives.
- # timer_only = False the output Timeline information can be found in the profiler_log directory - if _prof is None: - _timer_only = str(_profiler_options['timer_only']) == str(True) - _prof = profiler.Profiler( - scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]), - on_trace_ready = profiler.export_chrome_tracing('./profiler_log'), - timer_only = _timer_only) - _prof.start() - else: - _prof.step() - - if _profiler_step_id == _profiler_options['batch_range'][1]: - _prof.stop() - _prof.summary( - op_detail=True, - thread_sep=False, - time_unit='ms') - _prof = None - if _profiler_options['exit_on_finished']: - sys.exit(0) - - _profiler_step_id += 1 diff --git a/pdfdet/models/Paddle/ppdet/utils/stats.py b/pdfdet/models/Paddle/ppdet/utils/stats.py deleted file mode 100644 index c070e65..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/stats.py +++ /dev/null @@ -1,94 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -import numpy as np - -__all__ = ['SmoothedValue', 'TrainingStats'] - - -class SmoothedValue(object): - """Track a series of values and provide access to smoothed values over a - window or the global series average. - """ - - def __init__(self, window_size=20, fmt=None): - if fmt is None: - fmt = "{median:.4f} ({avg:.4f})" - self.deque = collections.deque(maxlen=window_size) - self.fmt = fmt - self.total = 0. 
- self.count = 0 - - def update(self, value, n=1): - self.deque.append(value) - self.count += n - self.total += value * n - - @property - def median(self): - return np.median(self.deque) - - @property - def avg(self): - return np.mean(self.deque) - - @property - def max(self): - return np.max(self.deque) - - @property - def value(self): - return self.deque[-1] - - @property - def global_avg(self): - return self.total / self.count - - def __str__(self): - return self.fmt.format( - median=self.median, avg=self.avg, max=self.max, value=self.value) - - -class TrainingStats(object): - def __init__(self, window_size, delimiter=' '): - self.meters = None - self.window_size = window_size - self.delimiter = delimiter - - def update(self, stats): - if self.meters is None: - self.meters = { - k: SmoothedValue(self.window_size) - for k in stats.keys() - } - for k, v in self.meters.items(): - v.update(float(stats[k])) - - def get(self, extras=None): - stats = collections.OrderedDict() - if extras: - for k, v in extras.items(): - stats[k] = v - for k, v in self.meters.items(): - stats[k] = format(v.median, '.6f') - - return stats - - def log(self, extras=None): - d = self.get(extras) - strs = [] - for k, v in d.items(): - strs.append("{}: {}".format(k, str(v))) - return self.delimiter.join(strs) diff --git a/pdfdet/models/Paddle/ppdet/utils/visualizer.py b/pdfdet/models/Paddle/ppdet/utils/visualizer.py deleted file mode 100644 index e29a189..0000000 --- a/pdfdet/models/Paddle/ppdet/utils/visualizer.py +++ /dev/null @@ -1,465 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
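For reference, this is how the `SmoothedValue`/`TrainingStats` pair removed above was typically driven from a training loop; a minimal sketch with invented metric values, using the import path as it existed before this deletion:

from ppdet.utils.stats import TrainingStats

stats = TrainingStats(window_size=20)
for step in range(100):
    # any dict of scalar metrics works; these values are invented
    stats.update({'loss': 2.0 / (step + 1), 'lr': 0.01})

# get() reports the windowed median of each metric formatted to 6 decimal
# places; log() flattens that into 'key: value' pairs, extras first.
print(stats.log(extras={'epoch': 1}))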
- -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import os -import numpy as np -from PIL import Image, ImageDraw, ImageFont -import cv2 -import math - -from .colormap import colormap -from ppdet.utils.logger import setup_logger -from ppdet.utils.compact import imagedraw_textsize_c -from ppdet.utils.download import get_path -logger = setup_logger(__name__) - -__all__ = ['visualize_results'] - - -def visualize_results(image, - bbox_res, - mask_res, - segm_res, - keypoint_res, - pose3d_res, - im_id, - catid2name, - threshold=0.5): - """ - Visualize bbox and mask results - """ - if bbox_res is not None: - image = draw_bbox(image, im_id, catid2name, bbox_res, threshold) - if mask_res is not None: - image = draw_mask(image, im_id, mask_res, threshold) - if segm_res is not None: - image = draw_segm(image, im_id, catid2name, segm_res, threshold) - if keypoint_res is not None: - image = draw_pose(image, keypoint_res, threshold) - if pose3d_res is not None: - pose3d = np.array(pose3d_res[0]['pose3d']) * 1000 - image = draw_pose3d(image, pose3d, visual_thread=threshold) - return image - - -def draw_mask(image, im_id, segms, threshold, alpha=0.7): - """ - Draw mask on image - """ - mask_color_id = 0 - w_ratio = .4 - color_list = colormap(rgb=True) - img_array = np.array(image).astype('float32') - for dt in np.array(segms): - if im_id != dt['image_id']: - continue - segm, score = dt['segmentation'], dt['score'] - if score < threshold: - continue - import pycocotools.mask as mask_util - mask = mask_util.decode(segm) * 255 - color_mask = color_list[mask_color_id % len(color_list), 0:3] - mask_color_id += 1 - for c in range(3): - color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255 - idx = np.nonzero(mask) - img_array[idx[0], idx[1], :] *= 1.0 - alpha - img_array[idx[0], idx[1], :] += alpha * color_mask - return Image.fromarray(img_array.astype('uint8')) - - -def draw_bbox(image, im_id, catid2name, bboxes, threshold): - """ - Draw bbox on image - """ - font_url = "https://paddledet.bj.bcebos.com/simfang.ttf" - font_path , _ = get_path(font_url, "~/.cache/paddle/") - font_size = 18 - font = ImageFont.truetype(font_path, font_size, encoding="utf-8") - - draw = ImageDraw.Draw(image) - - catid2color = {} - color_list = colormap(rgb=True)[:40] - for dt in np.array(bboxes): - if im_id != dt['image_id']: - continue - catid, bbox, score = dt['category_id'], dt['bbox'], dt['score'] - if score < threshold: - continue - - if catid not in catid2color: - idx = np.random.randint(len(color_list)) - catid2color[catid] = color_list[idx] - color = tuple(catid2color[catid]) - - # draw bbox - if len(bbox) == 4: - # draw bbox - xmin, ymin, w, h = bbox - xmax = xmin + w - ymax = ymin + h - draw.line( - [(xmin, ymin), (xmin, ymax), (xmax, ymax), (xmax, ymin), - (xmin, ymin)], - width=2, - fill=color) - elif len(bbox) == 8: - x1, y1, x2, y2, x3, y3, x4, y4 = bbox - draw.line( - [(x1, y1), (x2, y2), (x3, y3), (x4, y4), (x1, y1)], - width=2, - fill=color) - xmin = min(x1, x2, x3, x4) - ymin = min(y1, y2, y3, y4) - else: - logger.error('the shape of bbox must be [M, 4] or [M, 8]!') - - # draw label - text = "{} {:.2f}".format(catid2name[catid], score) - tw, th = imagedraw_textsize_c(draw, text, font=font) - draw.rectangle( - [(xmin + 1, ymin - th), (xmin + tw + 1, ymin)], fill=color) - draw.text((xmin + 1, ymin - th), text, fill=(255, 255, 255), font=font) - - return image - - -def save_result(save_path, results, 
catid2name, threshold):
-    """
-    save result as txt
-    """
-    img_id = int(results["im_id"])
-    with open(save_path, 'w') as f:
-        if "bbox_res" in results:
-            for dt in results["bbox_res"]:
-                catid, bbox, score = dt['category_id'], dt['bbox'], dt['score']
-                if score < threshold:
-                    continue
-                # each bbox result as a line
-                # for rbox: classname score x1 y1 x2 y2 x3 y3 x4 y4
-                # for bbox: classname score x1 y1 w h
-                bbox_pred = '{} {} '.format(catid2name[catid],
-                                            score) + ' '.join(
-                                                [str(e) for e in bbox])
-                f.write(bbox_pred + '\n')
-        elif "keypoint_res" in results:
-            for dt in results["keypoint_res"]:
-                kpts = dt['keypoints']
-                scores = dt['score']
-                keypoint_pred = [img_id, scores, kpts]
-                print(keypoint_pred, file=f)
-        else:
-            print("No valid results found, skipping txt save")
-
-
-def draw_segm(image,
-              im_id,
-              catid2name,
-              segms,
-              threshold,
-              alpha=0.7,
-              draw_box=True):
-    """
-    Draw segmentation on image
-    """
-    mask_color_id = 0
-    w_ratio = .4
-    color_list = colormap(rgb=True)
-    img_array = np.array(image).astype('float32')
-    for dt in np.array(segms):
-        if im_id != dt['image_id']:
-            continue
-        segm, score, catid = dt['segmentation'], dt['score'], dt['category_id']
-        if score < threshold:
-            continue
-        import pycocotools.mask as mask_util
-        mask = mask_util.decode(segm) * 255
-        color_mask = color_list[mask_color_id % len(color_list), 0:3]
-        mask_color_id += 1
-        for c in range(3):
-            color_mask[c] = color_mask[c] * (1 - w_ratio) + w_ratio * 255
-        idx = np.nonzero(mask)
-        img_array[idx[0], idx[1], :] *= 1.0 - alpha
-        img_array[idx[0], idx[1], :] += alpha * color_mask
-
-        if not draw_box:
-            from scipy import ndimage  # needed for center_of_mass
-            center_y, center_x = ndimage.center_of_mass(mask)
-            label_text = "{}".format(catid2name[catid])
-            vis_pos = (max(int(center_x) - 10, 0), int(center_y))
-            cv2.putText(img_array, label_text, vis_pos,
-                        cv2.FONT_HERSHEY_COMPLEX, 0.3, (255, 255, 255))
-        else:
-            mask = mask_util.decode(segm) * 255
-            sum_x = np.sum(mask, axis=0)
-            x = np.where(sum_x > 0.5)[0]
-            sum_y = np.sum(mask, axis=1)
-            y = np.where(sum_y > 0.5)[0]
-            x0, x1, y0, y1 = x[0], x[-1], y[0], y[-1]
-            cv2.rectangle(img_array, (x0, y0), (x1, y1),
-                          tuple(color_mask.astype('int32').tolist()), 1)
-            bbox_text = '%s %.2f' % (catid2name[catid], score)
-            t_size = cv2.getTextSize(bbox_text, 0, 0.3, thickness=1)[0]
-            cv2.rectangle(img_array, (x0, y0), (x0 + t_size[0],
-                                                y0 - t_size[1] - 3),
-                          tuple(color_mask.astype('int32').tolist()), -1)
-            cv2.putText(
-                img_array,
-                bbox_text, (x0, y0 - 2),
-                cv2.FONT_HERSHEY_SIMPLEX,
-                0.3, (0, 0, 0),
-                1,
-                lineType=cv2.LINE_AA)
-
-    return Image.fromarray(img_array.astype('uint8'))
-
-
-def draw_pose(image,
-              results,
-              visual_thread=0.6,
-              save_name='pose.jpg',
-              save_dir='output',
-              returnimg=False,
-              ids=None):
-    try:
-        import matplotlib.pyplot as plt
-        import matplotlib
-        plt.switch_backend('agg')
-    except Exception as e:
-        logger.error('Matplotlib not found, please install matplotlib. '
- 'for example: `pip install matplotlib`.') - raise e - - skeletons = np.array([item['keypoints'] for item in results]) - kpt_nums = 17 - if len(skeletons) > 0: - kpt_nums = int(skeletons.shape[1] / 3) - skeletons = skeletons.reshape(-1, kpt_nums, 3) - if kpt_nums == 17: #plot coco keypoint - EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6), (5, 7), (6, 8), - (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), - (13, 15), (14, 16), (11, 12)] - else: #plot mpii keypoint - EDGES = [(0, 1), (1, 2), (3, 4), (4, 5), (2, 6), (3, 6), (6, 7), (7, 8), - (8, 9), (10, 11), (11, 12), (13, 14), (14, 15), (8, 12), - (8, 13)] - NUM_EDGES = len(EDGES) - - colors = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0], \ - [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255], \ - [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]] - cmap = matplotlib.cm.get_cmap('hsv') - plt.figure() - - img = np.array(image).astype('float32') - - color_set = results['colors'] if 'colors' in results else None - - if 'bbox' in results and ids is None: - bboxs = results['bbox'] - for j, rect in enumerate(bboxs): - xmin, ymin, xmax, ymax = rect - color = colors[0] if color_set is None else colors[color_set[j] % - len(colors)] - cv2.rectangle(img, (xmin, ymin), (xmax, ymax), color, 1) - - canvas = img.copy() - for i in range(kpt_nums): - for j in range(len(skeletons)): - if skeletons[j][i, 2] < visual_thread: - continue - if ids is None: - color = colors[i] if color_set is None else colors[color_set[j] - % - len(colors)] - else: - color = get_color(ids[j]) - - cv2.circle( - canvas, - tuple(skeletons[j][i, 0:2].astype('int32')), - 2, - color, - thickness=-1) - - to_plot = cv2.addWeighted(img, 0.3, canvas, 0.7, 0) - fig = matplotlib.pyplot.gcf() - - stickwidth = 2 - - for i in range(NUM_EDGES): - for j in range(len(skeletons)): - edge = EDGES[i] - if skeletons[j][edge[0], 2] < visual_thread or skeletons[j][edge[ - 1], 2] < visual_thread: - continue - - cur_canvas = canvas.copy() - X = [skeletons[j][edge[0], 1], skeletons[j][edge[1], 1]] - Y = [skeletons[j][edge[0], 0], skeletons[j][edge[1], 0]] - mX = np.mean(X) - mY = np.mean(Y) - length = ((X[0] - X[1])**2 + (Y[0] - Y[1])**2)**0.5 - angle = math.degrees(math.atan2(X[0] - X[1], Y[0] - Y[1])) - polygon = cv2.ellipse2Poly((int(mY), int(mX)), - (int(length / 2), stickwidth), - int(angle), 0, 360, 1) - if ids is None: - color = colors[i] if color_set is None else colors[color_set[j] - % - len(colors)] - else: - color = get_color(ids[j]) - cv2.fillConvexPoly(cur_canvas, polygon, color) - canvas = cv2.addWeighted(canvas, 0.4, cur_canvas, 0.6, 0) - image = Image.fromarray(canvas.astype('uint8')) - plt.close() - return image - - -def draw_pose3d(image, - pose3d, - pose2d=None, - visual_thread=0.6, - save_name='pose3d.jpg', - returnimg=True): - try: - import matplotlib.pyplot as plt - import matplotlib - plt.switch_backend('agg') - except Exception as e: - logger.error('Matplotlib not found, please install matplotlib.' 
-                     ' For example: `pip install matplotlib`.')
-        raise e
-
-    if pose3d.shape[0] == 24:
-        joints_connectivity_dict = [
-            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 14, 1],
-            [3, 14, 1], [14, 16, 1], [15, 16, 1], [15, 12, 1], [6, 7, 0],
-            [7, 8, 0], [11, 10, 1], [10, 9, 1], [8, 12, 0], [9, 12, 1],
-            [12, 19, 1], [19, 18, 1], [19, 20, 0], [19, 21, 1], [22, 20, 0],
-            [23, 21, 1]
-        ]
-    elif pose3d.shape[0] == 14:
-        joints_connectivity_dict = [
-            [0, 1, 0], [1, 2, 0], [5, 4, 1], [4, 3, 1], [2, 3, 0], [2, 12, 0],
-            [3, 12, 1], [6, 7, 0], [7, 8, 0], [11, 10, 1], [10, 9, 1],
-            [8, 12, 0], [9, 12, 1], [12, 13, 1]
-        ]
-    else:
-        print(
-            "undefined joint count {}; cannot visualize because the joint connectivity is unknown".
-            format(pose3d.shape[0]))
-        return
-
-    def draw3Dpose(pose3d,
-                   ax,
-                   lcolor="#3498db",
-                   rcolor="#e74c3c",
-                   add_labels=False):
-        # pose3d = orthographic_projection(pose3d, cam)
-        for i in joints_connectivity_dict:
-            x, y, z = [
-                np.array([pose3d[i[0], j], pose3d[i[1], j]]) for j in range(3)
-            ]
-            ax.plot(-x, -z, -y, lw=2, c=lcolor if i[2] else rcolor)
-
-        RADIUS = 1000
-        center_xy = 2 if pose3d.shape[0] == 14 else 14
-        x, y, z = pose3d[center_xy, 0], pose3d[center_xy, 1], pose3d[center_xy,
-                                                                     2]
-        ax.set_xlim3d([-RADIUS + x, RADIUS + x])
-        ax.set_ylim3d([-RADIUS + y, RADIUS + y])
-        ax.set_zlim3d([-RADIUS + z, RADIUS + z])
-
-        ax.set_xlabel("x")
-        ax.set_ylabel("y")
-        ax.set_zlabel("z")
-
-    def draw2Dpose(pose2d,
-                   ax,
-                   lcolor="#3498db",
-                   rcolor="#e74c3c",
-                   add_labels=False):
-        for i in joints_connectivity_dict:
-            if pose2d[i[0], 2] and pose2d[i[1], 2]:
-                x, y = [
-                    np.array([pose2d[i[0], j], pose2d[i[1], j]])
-                    for j in range(2)
-                ]
-                ax.plot(x, y, 0, lw=2, c=lcolor if i[2] else rcolor)
-
-    def draw_img_pose(pose3d,
-                      pose2d=None,
-                      frame=None,
-                      figsize=(12, 12),
-                      savepath=None):
-        fig = plt.figure(figsize=figsize, dpi=80)
-        # fig.clear()
-        fig.tight_layout()
-
-        ax = fig.add_subplot(221)
-        if frame is not None:
-            ax.imshow(frame, interpolation='nearest')
-        if pose2d is not None:
-            draw2Dpose(pose2d, ax)
-
-        ax = fig.add_subplot(222, projection='3d')
-        ax.view_init(45, 45)
-        draw3Dpose(pose3d, ax)
-        ax = fig.add_subplot(223, projection='3d')
-        ax.view_init(0, 0)
-        draw3Dpose(pose3d, ax)
-        ax = fig.add_subplot(224, projection='3d')
-        ax.view_init(0, 90)
-        draw3Dpose(pose3d, ax)
-
-        if savepath is not None:
-            plt.savefig(savepath)
-            plt.close()
-        else:
-            return fig
-
-    def fig2data(fig):
-        """
-        fig = plt.figure()
-        image = fig2data(fig)
-        @brief Convert a Matplotlib figure to a 3D numpy array with RGBA channels and return it
-        @param fig a matplotlib figure
-        @return a numpy 3D array of RGBA values
-        """
-        # draw the renderer
-        fig.canvas.draw()
-
-        # Get the RGBA buffer from the figure
-        w, h = fig.canvas.get_width_height()
-        buf = np.frombuffer(fig.canvas.tostring_argb(), dtype=np.uint8)
-        buf.shape = (w, h, 4)
-
-        # canvas.tostring_argb gives a pixmap in ARGB format. Roll the alpha channel to convert it to RGBA
-        buf = np.roll(buf, 3, axis=2)
-        image = Image.frombytes("RGBA", (w, h), buf.tobytes())
-        return image.convert("RGB")
-
-    fig = draw_img_pose(pose3d, pose2d, frame=image)
-    data = fig2data(fig)
-    if returnimg is False:
-        data.save(save_name)
-    else:
-        return data
diff --git a/pdfdet/models/Paddle/ppdet/utils/voc_utils.py b/pdfdet/models/Paddle/ppdet/utils/voc_utils.py
deleted file mode 100644
index cd6d9f9..0000000
--- a/pdfdet/models/Paddle/ppdet/utils/voc_utils.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors.
All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import os -import os.path as osp -import re -import random - -__all__ = ['create_list'] - - -def create_list(devkit_dir, years, output_dir): - """ - create following list: - 1. trainval.txt - 2. test.txt - """ - trainval_list = [] - test_list = [] - for year in years: - trainval, test = _walk_voc_dir(devkit_dir, year, output_dir) - trainval_list.extend(trainval) - test_list.extend(test) - - random.shuffle(trainval_list) - with open(osp.join(output_dir, 'trainval.txt'), 'w') as ftrainval: - for item in trainval_list: - ftrainval.write(item[0] + ' ' + item[1] + '\n') - - with open(osp.join(output_dir, 'test.txt'), 'w') as fval: - ct = 0 - for item in test_list: - ct += 1 - fval.write(item[0] + ' ' + item[1] + '\n') - - -def _get_voc_dir(devkit_dir, year, type): - return osp.join(devkit_dir, 'VOC' + year, type) - - -def _walk_voc_dir(devkit_dir, year, output_dir): - filelist_dir = _get_voc_dir(devkit_dir, year, 'ImageSets/Main') - annotation_dir = _get_voc_dir(devkit_dir, year, 'Annotations') - img_dir = _get_voc_dir(devkit_dir, year, 'JPEGImages') - trainval_list = [] - test_list = [] - added = set() - - for _, _, files in os.walk(filelist_dir): - for fname in files: - img_ann_list = [] - if re.match(r'[a-z]+_trainval\.txt', fname): - img_ann_list = trainval_list - elif re.match(r'[a-z]+_test\.txt', fname): - img_ann_list = test_list - else: - continue - fpath = osp.join(filelist_dir, fname) - for line in open(fpath): - name_prefix = line.strip().split()[0] - if name_prefix in added: - continue - added.add(name_prefix) - ann_path = osp.join( - osp.relpath(annotation_dir, output_dir), - name_prefix + '.xml') - img_path = osp.join( - osp.relpath(img_dir, output_dir), name_prefix + '.jpg') - img_ann_list.append((img_path, ann_path)) - - return trainval_list, test_list diff --git a/requirements.txt b/requirements.txt index 8fe0fc1..ac390ec 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,9 @@ +cnstd paddlepaddle==2.6.0 +paddledet==2.6.0 +PyMuPDF==1.23.26 tqdm opencv-python -PyMuPDF==1.23.26 pyyaml requests six @@ -9,5 +11,4 @@ scipy scikit-learn pycocotools pandas -shapely -cnstd \ No newline at end of file +shapely \ No newline at end of file
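A closing note on the `requirements.txt` hunk: pinning `paddledet==2.6.0` next to `paddlepaddle==2.6.0` is consistent with deleting the vendored `ppdet` tree throughout this diff — the package is presumably meant to come from PyPI from now on. A minimal smoke test of that assumption, after `pip install -r requirements.txt`:

# Confirm `ppdet` now resolves to the installed paddledet wheel rather than
# the deleted vendored copy under pdfdet/models/Paddle/ppdet.
import ppdet
print(ppdet.__file__)  # expected: .../site-packages/ppdet/__init__.py

# Exercise a couple of paddledet's public entry points.
from ppdet.engine import Trainer
from ppdet.core.workspace import load_config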